<br>

# Milestone1_4. Exploratory Data Analysis

Identify and engineer influential features.

## Team #30

In [1]:
import pandas as pd
import numpy as np
import altair as alt
from vega_datasets import data
import geopandas as gpd
import json
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from scipy.stats import ttest_ind
from scipy.stats import f_oneway
from scipy.stats import mannwhitneyu
from scipy import linalg
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd



In [2]:
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

# Lift Altair's max-rows limit on chart data; kept as the last expression
# so the registry it returns is still shown as the cell output
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [3]:
# Yelp colour palette, taken from:
#   https://www.behance.net/gallery/26422079/Yelp-Rebrand-Concept
#   https://www.flerlagetwins.com/2021/06/datafam-colors-color-palette.html
#
# Alternative palette (disabled):
# colors = ['#D84465','#B04C75','#B4ACA6','#FF6F4C','#F15060','#B04C75','#8F648C','#746CAF','#96B6E5','#3Ad4A4']
#
# NOTE(review): '#B04C75' appears twice below (2nd and last entry) - confirm
# the repeat is intended before using all eight colours in one chart.
colors = [
    '#D84465',
    '#B04C75',
    '#3369dd',
    '#B4ACA6',
    '#0097a7',
    '#FF6F4C',
    '#F15060',
    '#B04C75',
]

<br>

# Context

Our analysis of Yelp restaurant data focuses on four prominent cuisines: Chinese, Japanese, Italian, and Mexican. These cuisines are among the top cuisines in the U.S. that are not of American origin.

These cuisines have gained widespread popularity and acceptance in the United States and are often considered staples of international cuisine. Each brings a unique set of flavors, cooking techniques, and cultural influences, contributing to the diversity of the American culinary landscape.

We compare the ratings of each of the four cuisines and aim to understand the factors that contribute to the disparities in ratings across these culinary categories.

<br>

<br>

## 4.1. Check Data Coverage

#### Insights

The Yelp data is evidently not randomly sampled at the zip code level, whereas our secondary data source (demographics) is structured at the zip code level. Therefore, our analysis is likely biased due to these data limitations, and our findings may not be applicable to the entire country.


In [4]:
# Load the restaurants-only Yelp table from data/master3.csv and
# print its (rows, columns) shape as a quick sanity check
master3 = pd.read_csv('data/master3.csv')
print(master3.shape)

(49369, 51)


In [5]:
# Load the four-cuisine subset from data/master4.csv, then print its
# (rows, columns) shape and the dtype pandas inferred for each column
master4 = pd.read_csv('data/master4.csv')
print(master4.shape)
print(master4.dtypes)

(12005, 64)
business_id     object
name            object
city            object
state           object
zip_code       float64
                ...   
gluten_free      int64
fast_food        int64
breakfast        int64
nightlife        int64
ctgy_count       int64
Length: 64, dtype: object


In [6]:
# Store zip codes as integers in both tables (master4 loads zip_code as
# float64), matching the integer `zip` column of the zip/county table
for frame in (master3, master4):
    frame['zip_code'] = frame['zip_code'].astype(int)

In [7]:
# Number of distinct zip codes covered by each Yelp subset
restaurant_zip_count = master3['zip_code'].nunique()
cuisine_zip_count = master4['zip_code'].nunique()

print(f'The subset of Yelp academic data, focusing solely on restaurants, includes {restaurant_zip_count} unique zip codes.')
print(f'The subset of Yelp academic data, focusing solely on four cuisines, includes {cuisine_zip_count} unique zip codes.')

The subset of Yelp academic data, focusing solely on restaurants, includes 772 unique zip codes.
The subset of Yelp academic data, focusing solely on four cuisines, includes 688 unique zip codes.


In [8]:
# zip county mapping: https://simplemaps.com/data/us-zips

# Load the zip-code reference table (one `zip` column plus county, state,
# location and population attributes, as listed by the dtypes below)
zip_county = pd.read_csv('data/uszips.csv')

# Inspect the inferred column types; `zip` and `county_fips` are the join keys used later
zip_county.dtypes

zip                   int64
lat                 float64
lng                 float64
city                 object
state_id             object
state_name           object
zcta                   bool
parent_zcta         float64
population          float64
density             float64
county_fips           int64
county_name          object
county_weights       object
county_names_all     object
county_fips_all      object
imprecise              bool
military               bool
timezone             object
dtype: object

In [9]:
# Distinct zip codes present in the zip/county reference table
us_zip_count = zip_county['zip'].nunique()
print(f'There are {us_zip_count} unique zip codes in the U.S.')

There are 33788 unique zip codes in the U.S.


In [10]:
# Share of U.S. zip codes (per the reference table) that have at least one
# restaurant in the Yelp data, as a percentage rounded to one decimal place
yelp_zip_total = master3['zip_code'].nunique()
reference_zip_total = zip_county['zip'].nunique()
pcnt = round(yelp_zip_total / reference_zip_total * 100, 1)

print(
    'The Yelp Academic data is probably a limited subset of actual data, \n'
    f'given that restaurant information is available for only {pcnt}% of U.S. zip codes.'
)


The Yelp Academic data is probably a limited subset of actual data, 
given that restaurant information is available for only 2.3% of U.S. zip codes.


In [11]:
# Yelp academic data only covers a few states:
# restaurants per state in the restaurants-only table, largest first
restaurants_per_state = master3.groupby('state')['business_id'].count()
restaurants_per_state.sort_values(ascending=False)

state
PA    12443
FL     8698
TN     4317
MO     4205
IN     4135
LA     3610
NJ     3306
AZ     2648
NV     1641
ID     1296
CA     1129
IL      981
DE      957
CO        1
MT        1
NC        1
Name: business_id, dtype: int64

In [12]:
# Coverage check: how many zip codes of the four-cuisine subset also exist
# in the zip/county reference table? Prints, in order: zip codes in the
# subset, zip codes in the reference table, zip codes present in both.
a = master4['zip_code'].unique()
b = zip_county['zip'].unique()
for size in (len(a), len(b), len(set(a) & set(b))):
    print(size)

688
33788
688


In [13]:
# Zip code -> county FIPS lookup, built once and shared by both merges
zip_to_county = (zip_county[['zip', 'county_fips']]
                 .rename(columns={'zip': 'zip_code'}))

# Attach county_fips to every restaurant. The left join keeps restaurants whose
# zip code is missing from the lookup (their county_fips becomes NaN).
# validate='many_to_one' raises pandas.errors.MergeError if a zip code occurs
# more than once in the lookup, instead of silently duplicating restaurant rows
# and inflating the per-county counts computed from df1.
df1 = master3.merge(zip_to_county, how='left', on='zip_code',
                    validate='many_to_one')
df2 = master4.merge(zip_to_county, how='left', on='zip_code',
                    validate='many_to_one')

In [14]:
# One row per distinct (state, county name, county FIPS) combination
county_index = zip_county[['state_name', 'county_name', 'county_fips']].drop_duplicates()

# Restaurants per county, counted from the county-tagged restaurant table
county_restaurant_counts = (
    df1.groupby('county_fips')['business_id']
    .count()
    .reset_index()
    .rename(columns={'business_id': 'restaurant_count'})
)

# Left join keeps every county; counties with no match get NaN restaurant_count
restaurant_count_byCounty = county_index.merge(
    county_restaurant_counts, how='left', on='county_fips'
)

In [15]:
# Missing values per column: counties with no matching restaurants in the
# left merge above have a NaN restaurant_count
restaurant_count_byCounty.isna().sum()

state_name             0
county_name            0
county_fips            0
restaurant_count    3143
dtype: int64

In [16]:
# Counties with no Yelp restaurants have a NaN count after the left merge;
# treat them as 0. Only restaurant_count is filled, so a gap in a name column
# would never be overwritten with 0, and the frame is rebound rather than
# mutated in place so the cell is safe to re-run.
restaurant_count_byCounty = restaurant_count_byCounty.fillna({'restaurant_count': 0})

In [17]:
# County outlines from the vega_datasets US 10m topojson
counties = alt.topo_feature(data.us_10m.url, 'counties')

# Per-county values joined onto each shape by matching the shape's `id`
# against `county_fips`
county_values = alt.LookupData(
    restaurant_count_byCounty,
    'county_fips',
    ['restaurant_count', 'county_name', 'state_name'],
)

# Chart title; adjust fontSize as needed
coverage_title = {'text': 'Yelp Academic Data Coverage', 'fontSize': 18}

# Choropleth of restaurant counts per county on an Albers USA projection
coverage_map = (
    alt.Chart(counties)
    .mark_geoshape()
    .encode(
        color='restaurant_count:Q',
        tooltip=['state_name:N', 'county_name:N', 'restaurant_count:Q'],
    )
    .transform_lookup(lookup='id', from_=county_values)
    .project(type='albersUsa')
    .properties(title=coverage_title, width=500, height=300)
)
coverage_map

In [18]:
# Restaurant count by state for subset of Yelp data (four cuisines), largest first
cuisine_restaurants_per_state = master4.groupby('state')['business_id'].count()
cuisine_restaurants_per_state.sort_values(ascending=False)

state
PA    3012
FL    1907
MO    1016
TN     989
IN     976
NJ     972
AZ     887
LA     640
NV     469
CA     333
ID     303
IL     262
DE     239
Name: business_id, dtype: int64

<br>

## 4.2. Variables

#### Explanation

We categorized features into numerical and categorical, which facilitates the application of different methods in exploring the relationship between restaurant stars and numerical versus categorical features.


In [19]:
# List every column of the four-cuisine table before grouping them by type
master4.columns

Index(['business_id', 'name', 'city', 'state', 'zip_code', 'latitude',
       'longitude', 'stars', 'review_count', 'is_open', 'categories',
       'RestaurantsDelivery', 'OutdoorSeating', 'BusinessAcceptsCreditCards',
       'BikeParking', 'RestaurantsTakeOut', 'Alcohol', 'Caters',
       'RestaurantsReservations', 'GoodForKids', 'RestaurantsGoodForGroups',
       'HasTV', 'NoiseLevel', 'RestaurantsPriceRange', 'expensive',
       'free_WiFi', 'attire_dressy', 'noise_loud', 'median_household_income',
       'population', 'household_cnt', 'median_age',
       'population_hispanic_latino', 'population_white', 'population_asian',
       'bachelors_pcnt', 'education_pcnt', 'restaurant_count',
       'population_perRestaurant', 'household_perRestaurant',
       'hispanic_latino_pcnt', 'white_pcnt', 'asian_pcnt',
       'useful_review_count', 'funny_review_count', 'cool_review_count',
       'review_sentiment_score', 'avg_tip_compliment', 'tip_sentiment_score',
       'tip_count', 'has_tip'

In [20]:
# Continuous / count features: demographics, review and tip statistics
numerical_vars = ['median_household_income','household_cnt', 
                  'median_age', 'bachelors_pcnt', 'education_pcnt', 
                  'hispanic_latino_pcnt', 'white_pcnt', 'asian_pcnt',
                  'population_perRestaurant', 'household_perRestaurant', 
                  'review_count', 'review_sentiment_score', 'useful_review_count', 'funny_review_count', 'cool_review_count', 
                  'avg_tip_compliment', 'tip_sentiment_score', 'tip_count','ctgy_count']

# Features treated as two-level categorical flags
categorical_vars_binary = ['RestaurantsDelivery', 'OutdoorSeating', 'BusinessAcceptsCreditCards',
                    'BikeParking', 'RestaurantsTakeOut', 'Alcohol', 'Caters',
                    'RestaurantsReservations', 'GoodForKids',
                    'RestaurantsGoodForGroups', 'HasTV', 
                    'free_WiFi', 'noise_loud', 'attire_dressy','expensive',
                    'plant_based', 'seafood', 'gluten_free', 'fast_food', 'breakfast', 'nightlife']

# Multi-level categorical features
categorical_vars_nominal = ['cuisine']


# Columns deliberately left out of the feature analysis
nonUsed_vars = ['business_id', 'name', 'city', 'state', 'zip_code', 'latitude',
                'longitude', 'stars','is_open', 'categories', 
                'NoiseLevel',  'RestaurantsPriceRange', 
                'restaurant_count', 'population',  'has_tip', 
                'Chinese', 'Japanese','Italian', 'Mexican', 'MECE_check',
                'population_hispanic_latino', 'population_white', 'population_asian']


# The four groups must partition master4's columns exactly. Comparing only the
# total count would let a misspelled or duplicated name paired with a missing
# column slip through, so check names as well as counts.
all_grouped_vars = (numerical_vars + categorical_vars_binary
                    + categorical_vars_nominal + nonUsed_vars)

assert len(all_grouped_vars) == len(set(all_grouped_vars)), \
    'a column is listed in more than one variable group'
assert set(all_grouped_vars) == set(master4.columns), \
    f'variable groups and master4 columns differ: {sorted(set(all_grouped_vars) ^ set(master4.columns))}'
assert (len(numerical_vars) + len(categorical_vars_binary) 
        + len(categorical_vars_nominal) + len(nonUsed_vars) 
        == master4.shape[1])

In [21]:
# Distinct values per binary feature; a feature with a single distinct value
# is constant across all rows of master4
master4.loc[:, categorical_vars_binary].nunique()

RestaurantsDelivery           2
OutdoorSeating                2
BusinessAcceptsCreditCards    2
BikeParking                   2
RestaurantsTakeOut            2
Alcohol                       2
Caters                        2
RestaurantsReservations       2
GoodForKids                   2
RestaurantsGoodForGroups      2
HasTV                         2
free_WiFi                     2
noise_loud                    2
attire_dressy                 2
expensive                     1
plant_based                   2
seafood                       2
gluten_free                   2
fast_food                     2
breakfast                     2
nightlife                     2
dtype: int64

In [22]:
# Drop two flags from the binary feature list. 'expensive' has only one distinct
# value in master4 (see the nunique output above), so it cannot discriminate.
# NOTE(review): the reason for dropping 'attire_dressy' is not shown here - confirm.
#
# Filtering in place (instead of list.remove) keeps the cell idempotent:
# re-running it no longer raises ValueError once the names are already gone.
dropped_binary_vars = ('attire_dressy', 'expensive')
categorical_vars_binary[:] = [var for var in categorical_vars_binary
                              if var not in dropped_binary_vars]

In [23]:
# Record the two dropped binary flags as unused columns. The membership guard
# keeps the cell idempotent: re-running it does not append duplicates, which
# would otherwise make the column-count check below fail.
for dropped_var in ('attire_dressy', 'expensive'):
    if dropped_var not in nonUsed_vars:
        nonUsed_vars.append(dropped_var)

# Every master4 column must still belong to exactly one group
assert (len(numerical_vars) + len(categorical_vars_binary) 
        + len(categorical_vars_nominal) + len(nonUsed_vars) 
        == master4.shape[1])