In [1]:
import pandas as pd

## Extract JSONs into DataFrames

In [2]:
# Establish paths to JSON files
biz = "Data/yelp_academic_dataset_business.json"
reviews = "Data/yelp_academic_dataset_review.json"

In [3]:
# Read Business data into Pandas
biz_df = pd.read_json(biz, lines=True)

#biz_df.head()

## Review & Scrub Data

In [4]:
#Assess Data Types... if needed
biz_df.dtypes

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object

In [5]:
#Assess what cities to pull in: Las Vegas, Toronto, Phoenix, Charlotte. Not including Scottsdale as it's close to Phoenix
biz_df['city'].value_counts().head()

Las Vegas     31631
Toronto       20366
Phoenix       20171
Charlotte     10422
Scottsdale     9342
Name: city, dtype: int64

In [6]:
#Remove closed businesses, column is_open
OpenBiz = biz_df[biz_df['is_open']==1]

#OpenBiz.head()

In [7]:
#Check if "0" were removed
OpenBiz['is_open'].value_counts().head(10)


1    168903
Name: is_open, dtype: int64

In [8]:
#Drop "is_open" as it is no longer needed
biz_df2 = OpenBiz.drop('is_open', axis=1)

#biz_df2.head()

In [9]:
#Split categories (strings) in to their own row
Categories = biz_df2.assign(categories = biz_df2.categories.str.split(',')).explode('categories')

# Categories.head()

In [10]:
#Check category counts, "Restaurants" = 30776 & " Restaurants" = 13189

Categories.categories.value_counts()

 Restaurants                 30776
 Shopping                    21555
 Food                        18475
 Home Services               14886
Restaurants                  13189
                             ...  
Coffeeshops                      1
Neuropathologists                1
Product Design                   1
Music Production Services        1
 Toxicologists                   1
Name: categories, Length: 2505, dtype: int64

In [11]:
#Update " Restaurants" to just "Restaurants"
Categories['categories'] = Categories['categories'].replace(
    {' Restaurants': 'Restaurants'})

In [12]:
#Check category counts... checks out

Categories.categories.value_counts()

Restaurants                43965
 Shopping                  21555
 Food                      18475
 Home Services             14886
 Health & Medical          12758
                           ...  
 Sikh Temples                  1
Bar Crawl                      1
Surf Schools                   1
Tubing                         1
Misting System Services        1
Name: categories, Length: 2504, dtype: int64

In [13]:
#Create CSV for the front end ppl to review as scrub continues
#Categories.to_csv("Data/Yelp_InitialScrub.csv")

In [14]:
# Establish paths to CSV path
biz2 = "Data/Yelp_InitialScrub.csv"

In [15]:
bizDF = pd.read_csv(biz2)
#bizDF.head()

In [16]:
Attributes = bizDF.assign(attributes = bizDF.attributes.str.split(',')).explode('attributes')
#Attributes.head()

In [17]:
#Attributes.dtypes... attributes is an object

In [18]:
Attributes['attributes'].astype(str)

0         {'BusinessAcceptsCreditCards': 'True'
0                         'BikeParking': 'True'
0                        'GoodForKids': 'False'
0          'BusinessParking': "{'garage': False
0                               'street': False
                          ...                  
715200                                      nan
715201                                      nan
715202                                      nan
715203                                      nan
715204                                      nan
Name: attributes, Length: 9034994, dtype: object

In [19]:
#Pull all attributes that contain info we want: Restaurants NEED CUISINE INFO
Attributes[Attributes['attributes'].str.contains('RestaurantsTakeOut', case=True, na=False)].attributes.value_counts()

 'RestaurantsTakeOut': 'True'      159535
 'RestaurantsTakeOut': 'False'      19005
{'RestaurantsTakeOut': 'True'       15317
 'RestaurantsTakeOut': 'True'}      11584
 'RestaurantsTakeOut': 'False'}      3015
{'RestaurantsTakeOut': 'False'       1840
 'RestaurantsTakeOut': 'None'         384
{'RestaurantsTakeOut': 'True'}        169
 'RestaurantsTakeOut': 'None'}        142
{'RestaurantsTakeOut': 'False'}        79
{'RestaurantsTakeOut': 'None'          42
Name: attributes, dtype: int64

In [20]:
#Only keep items that contain food restaurants, i.e. RestaurantsTakeOut

biz_df3 = Attributes[Attributes['attributes'].str.contains("'RestaurantsTakeOut': 'True'", na=False)]

#biz_df3.head()

In [21]:
# Check that attributes drill down worked
sorted(biz_df3['attributes'].unique())

[" 'RestaurantsTakeOut': 'True'",
 " 'RestaurantsTakeOut': 'True'}",
 "{'RestaurantsTakeOut': 'True'",
 "{'RestaurantsTakeOut': 'True'}"]

In [22]:
# Check for repeats/spelling
#sorted(biz_df3['city'].unique())

In [23]:
#Update Las Vegas names to be uniform. MANY spelling errors in data

biz_df3.loc[:,'city'] = biz_df3.loc[:,'city'].replace(
    ['C Las Vegas', 'Henderson and Las Vegas','Lake Las Vegas','Las  Vegas','LAS VEGAS AP','Las Vegas East'\
     ,'Las Vegas Nevada','Las Vegas Nv','Las Vegas,','Las Vegas, Nevada','Las Vegas, NV','Las Vegass'\
     ,'N E Las Vegas','N Las Vegas','N W Las Vegas','N. Las Vegas','N.Las Vegas','North Las Vegas','South Las Vegas'\
     ,'West Las Vegass','Las vegas','LAS VEGAS'] , 'Las Vegas')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [24]:
# Check after updates
#sorted(biz_df3['city'].unique())

In [25]:
#Update Charlotte names to be uniform. MANY spelling errors in data

biz_df3.loc[:,'city'] = biz_df3.loc[:,'city'].replace(
    ['North Charlotte', 'South Charlotte','Charotte'] , 'Charlotte')

In [26]:
# Check after updates
#sorted(biz_df3['city'].unique())

In [27]:
#Update Toronto names to be uniform. MANY spelling errors in data

biz_df3.loc[:,'city'] = biz_df3.loc[:,'city'].replace(
    ['Downtown Toronto', 'North Toronto','TORONTO - DANFORTH (OT)','TORONTO - FRONT ST (OT)','Toronto Division'\
     ,'Toronto-Etobicoke','Toronto-North York','Toronto-West','West Toronto','Tornto'] , 'Toronto')

In [28]:
# Check after updates
#sorted(biz_df3['city'].unique())

In [29]:
biz_df3.loc[:,'city'] = biz_df3.loc[:,'city'].replace(
    ['Metro Phoenix', 'North Phoenix','PHOENIX AP','Phoenix AZ','Phoenix Valley','Phoenix,','Phoenix,AZ'\
     ,'Phoneix','Phonenix,','Phoniex','Phx'],'Phoenix')

In [30]:
# Check after updates
#sorted(biz_df3['city'].unique())

In [31]:
# ONLY keep items in City: Charlotte, Las Vegas, Phoenix, Toronto

biz_df4 = biz_df3[biz_df3['city'].str.contains('Charlotte|Las Vegas|Phoenix|Toronto', regex=True)]

#biz_df4.head()

In [32]:
# Check after updates
sorted(biz_df4['city'].unique())

['Charlotte', 'Las Vegas', 'Phoenix', 'Toronto']

In [33]:
#Create CSV for the front end ppl to review as scrub continues V2
#biz_df4.to_csv("Data/Yelp_InitialScrubV2.csv")

In [34]:
#Categories to filter through
#sorted(biz_df4['categories'].unique())

In [35]:
biz_df4.categories.value_counts()

 Restaurants              8688
 Food                     4741
Restaurants               3673
 Fast Food                1754
Food                      1577
                          ... 
 Medical Centers             1
 Baby Gear & Furniture       1
 Martial Arts                1
Themed Cafes                 1
 Wine Tasting Classes        1
Name: categories, Length: 753, dtype: int64

In [36]:
#Update " Restaurants" to just "Restaurants"
biz_df4.loc[:,'categories'] = biz_df4.loc[:,'categories'].replace(
    [' Food'] , 'Food')

In [37]:
biz_df4.loc[:,'categories'] = biz_df4.loc[:,'categories'].replace(
    [' Restaurants'], 'Restaurants')

In [38]:
biz_df4.categories.value_counts().head(25)

Restaurants                   12361
Food                           6318
 Fast Food                     1754
 Nightlife                     1482
 Bars                          1478
 Coffee & Tea                  1469
 Sandwiches                    1376
 Breakfast & Brunch            1284
 American (Traditional)        1186
 Burgers                       1122
 Mexican                       1081
 Pizza                          945
 Specialty Food                 767
 American (New)                 750
 Desserts                       689
 Chinese                        658
 Italian                        651
 Event Planning & Services      648
 Salad                          644
 Bakeries                       591
 Cafes                          583
 Chicken Wings                  581
Fast Food                       532
Coffee & Tea                    527
 Seafood                        512
Name: categories, dtype: int64

In [39]:
#Remove Restaurants & Food as these are typically double qualifiers to cuisines

biz_df5 = biz_df4[~biz_df4['categories'].str.contains('Restaurants|Food| Nightlife| Bars|Bars| Coffee & Tea| Sandwiches|\
    | Breakfast & Brunch| Specialty Food| Desserts| Event Planning & Services| Bakeries| Cafes|Cafes| Chicken Wings|\
    |Coffee & Tea| Salad| Caterers| Grocery|Sandwiches| Ice Cream & Frozen Yogurt|Pizza| Wine & Spirits| Delis|\
    | Shopping| Vegetarian| Tacos| Beer| Burgers| Convenience Store|Nightlife| Vegan|Breakfast & Brunch| Diners|Bars|\
    | Gluten-Free| Gas Stations| Soup|Desserts| Automotive|Bakeries| Chicken Shop| Arts & Entertainment|Grocery|Cafes|\
    | Bubble Tea|Chicken Wings| Noodles|Ice Cream & Frozen Yogurt|Salad| Donuts| Health Markets| Lounges| Hot Dogs|\
    | Drugstores| Venues & Event Spaces| Buffets| Bagels|Event Planning & Services| Tea Rooms|Delis| Breweries|\
    |Caterers| Ramen| Custom Cakes| Meat Shops| Street Vendors| Cupcakes| Flowers & Gifts| Chocolatiers & Shops|\
    |Diners| Local Flavor|Beer|Shopping|Donuts| Fish & Chips|Convenience Store| Shaved Ice| Poke|Burgers| Dim Sum|\
    |Bubble Tea|Vegetarian| Hotels & Travel| Patisserie/Cake Shop| Music Venue|Gas Station| Creperies| Fashion|\
    | Farmers Market|Vegan| Karaoke| Casinos| Coffee Roasteries| Department Stores|Soup| Seafood Markets|\
    | Fruits & Veggies| Organic Stores|Hot Dogs|Gluten-Free| Acai Bowls|Noodles| Cheesesteaks|Automotive| Active Life|\
    | Hotels| Wraps| Candy Stores| Butcher|Chicken Shop| Cheese Shops| Service Stations| Kebab|Bagels| Gelato|\
    | Health & Medical|Tea Rooms| Waffles| Falafel| Party & Event Planning|Buffets| Pasta Shops|Local Flavor|\
    | Modern European|Health Markets| Beauty & Spas| Poutineries|Tacos| Personal Chefs| Florists| Gift Shops|Ramen|\
    | New Mexican Cuisine|Seafood Markets| Japanese Curry|New Mexican Cuisine| Traditional Chinese Medicine|\
    |Japanese Curry',na=False)]

In [40]:
biz_df5.categories.value_counts().head(50)

 American (Traditional)    1186
 Mexican                   1081
 American (New)             750
 Chinese                    658
 Italian                    651
 Seafood                    512
 Japanese                   480
Mexican                     443
 Asian Fusion               382
American (Traditional)      308
 Pubs                       288
 Mediterranean              288
 Barbeque                   258
Chinese                     251
 Canadian (New)             223
 Thai                       223
 Middle Eastern             220
Italian                     209
 Indian                     198
Japanese                    198
 Steakhouses                186
 Tex-Mex                    185
 Vietnamese                 169
 Greek                      165
 Korean                     161
American (New)              159
 Halal                      139
 Southern                   139
 Latin American             135
Seafood                     117
Thai                        113
 Gastrop

In [41]:
#Edit spaces out of names to make unique

biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' American (New)'], 'American (New)')

In [42]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Asian Fusion'], 'Asian Fusion')

In [43]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' American (Traditional)'], 'American (Traditional)')

In [44]:
biz_df5.categories.value_counts().head(10)

American (Traditional)    1494
 Mexican                  1081
American (New)             909
 Chinese                   658
 Italian                   651
 Seafood                   512
Asian Fusion               480
 Japanese                  480
Mexican                    443
 Mediterranean             288
Name: categories, dtype: int64

In [45]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Barbeque'], 'Barbeque')

In [46]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Cajun/Creole'], 'Cajun/Creole')

In [47]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Canadian (New)'], 'Canadian (New)')

In [48]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Caribbean'], 'Caribbean')

In [49]:
biz_df5.categories.value_counts().head(10)

American (Traditional)    1494
 Mexican                  1081
American (New)             909
 Chinese                   658
 Italian                   651
 Seafood                   512
Asian Fusion               480
 Japanese                  480
Mexican                    443
Barbeque                   353
Name: categories, dtype: int64

In [50]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Chinese'], 'Chinese')

In [51]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' French'], 'French')

In [52]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Greek'], 'Greek')

In [53]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Halal'], 'Halal')

In [54]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Hawaiian'], 'Hawaiian')

In [55]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Indian'], 'Indian')

In [56]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Italian'], 'Italian')

In [57]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Japanese'], 'Japanese')

In [58]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Korean'], 'Korean')

In [59]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Latin American'], 'Latin American')

In [60]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Mexican'], 'Mexican')

In [61]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Mediterranean'], 'Mediterranean')

In [62]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Middle Eastern'], 'Middle Eastern')

In [63]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Pubs'], 'Pubs')

In [64]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    ['Pubs'], 'Gastropubs')

In [65]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Gastropubs'], 'Gastropubs')

In [66]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Seafood'], 'Seafood')

In [67]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Southern'], 'Southern')

In [68]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Steakhouses'], 'Steakhouses')

In [69]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Tapas/Small Plates'], 'Tapas/Small Plates')

In [70]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Tex-Mex'], 'Tex-Mex')

In [71]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Thai'], 'Thai')

In [72]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Filipino'], 'Filipino')

In [73]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Pakistani'], 'Pakistani')

In [74]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Taiwanese'], 'Taiwanese')

In [75]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Portuguese'], 'Portuguese')

In [76]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Persian/Iranian'], 'Persian/Iranian')

In [77]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Salvadoran'], 'Salvadoran')

In [78]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Kosher'], 'Kosher')

In [79]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' African'], 'African')

In [80]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Spanish'], 'Spanish')

In [81]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Cantonese'], 'Cantonese')

In [82]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' British'], 'British')

In [83]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Irish'], 'Irish')

In [84]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Turkish'], 'Turkish')

In [85]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Ethiopian'], 'Ethiopian')

In [86]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Vietnamese'], 'Vietnamese')

In [87]:
biz_df5.categories.value_counts().head(40)

Mexican                   1524
American (Traditional)    1494
Chinese                    909
American (New)             909
Italian                    860
Japanese                   678
Seafood                    629
Asian Fusion               480
Gastropubs                 474
Mediterranean              390
Barbeque                   353
Thai                       336
Indian                     286
Canadian (New)             277
Middle Eastern             276
Vietnamese                 249
Steakhouses                241
Korean                     222
Greek                      219
Tex-Mex                    215
Latin American             174
Southern                   170
Halal                      166
Caribbean                  158
Hawaiian                   129
Tapas/Small Plates         119
French                      88
Pakistani                   88
Filipino                    83
Cajun/Creole                74
Taiwanese                   57
Portuguese                  57
Persian/

In [88]:
# ONLY keep top 40 categories over all 4 cities

FinalDF = biz_df5[biz_df5['categories'].str.contains('Mexican|American (Traditional)|Chinese|American (New)|Italian|\
    |Japanese|Seafood|Asian Fusion|Gastropubs|Mediterranean|Barbeque|Thai|Indian|Canadian (New)|Middle Eastern|\
    |Vietnamese|Steakhouses|Korean|Greek|Tex-Mex|Latin American|Southern|Halal|Caribbean|Hawaiian|Tapas/Small Plates\
    |Pakistani|French|Filipino|Cajun/Creole|Portuguese|Taiwanese|Persian/Iranian|Salvadoran|Kosher|Cantonese|African\
    |Spanish|Ethiopian|British', regex=True)]

FinalDF.head()


  return func(self, *args, **kwargs)


Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,attributes,categories,hours
249,73,0QjROMVW9ACKjhSEfHqNCQ,Mi Mi Restaurant,688 Gerrard Street E,Toronto,ON,M4M 1Y3,43.666376,-79.348773,4.0,116,{'RestaurantsTakeOut': 'True',Vietnamese,"{'Monday': '11:0-22:0', 'Tuesday': '11:0-22:0'..."
531,161,cicPsia8Wj-DNRkmLbD_xg,The Keg Steakhouse + Bar,2201 Yonge Street,Toronto,ON,M4S 2B2,43.705842,-79.397841,3.5,91,'RestaurantsTakeOut': 'True',Steakhouses,"{'Monday': '11:30-0:0', 'Tuesday': '11:30-0:0'..."
534,161,cicPsia8Wj-DNRkmLbD_xg,The Keg Steakhouse + Bar,2201 Yonge Street,Toronto,ON,M4S 2B2,43.705842,-79.397841,3.5,91,'RestaurantsTakeOut': 'True',Seafood,"{'Monday': '11:30-0:0', 'Tuesday': '11:30-0:0'..."
777,238,AN0bWhisCf6LN9eHZ7DQ3w,Los Olivos Ristorante,3759 E Desert Inn Rd,Las Vegas,NV,89121,36.129178,-115.092483,5.0,222,'RestaurantsTakeOut': 'True',Italian,"{'Monday': '0:0-0:0', 'Tuesday': '16:0-21:0', ..."
806,246,AtD6B83S4Mbmq0t7iDnUVA,Veggie House,"5115 Spring Mountain Rd, Ste 203",Las Vegas,NV,89146,36.125569,-115.210911,4.5,1142,'RestaurantsTakeOut': 'True',Japanese,"{'Monday': '11:30-21:30', 'Tuesday': '11:30-21..."


In [108]:
FinalDF.categories.value_counts().head(45)

Mexican            1524
Chinese             909
Italian             860
Japanese            678
Seafood             629
Asian Fusion        480
Gastropubs          474
Mediterranean       390
Barbeque            353
Thai                336
Indian              286
Middle Eastern      276
Vietnamese          249
Steakhouses         241
Korean              222
Greek               219
Tex-Mex             215
Latin American      174
Southern            170
Halal               166
Caribbean           158
Hawaiian            129
French               88
Pakistani            88
Filipino             83
Cajun/Creole         74
Portuguese           57
Taiwanese            57
Persian/Iranian      49
Salvadoran           43
Cantonese            41
Kosher               41
Spanish              38
Ethiopian            37
British              32
Name: categories, dtype: int64

In [106]:
# Check after updates, cities
sorted(FinalDF['city'].unique())

['Charlotte', 'Las Vegas', 'Phoenix', 'Toronto']

In [107]:
# Check after updates, cuisines
sorted(FinalDF['categories'].unique())

['Asian Fusion',
 'Barbeque',
 'British',
 'Cajun/Creole',
 'Cantonese',
 'Caribbean',
 'Chinese',
 'Ethiopian',
 'Filipino',
 'French',
 'Gastropubs',
 'Greek',
 'Halal',
 'Hawaiian',
 'Indian',
 'Italian',
 'Japanese',
 'Korean',
 'Kosher',
 'Latin American',
 'Mediterranean',
 'Mexican',
 'Middle Eastern',
 'Pakistani',
 'Persian/Iranian',
 'Portuguese',
 'Salvadoran',
 'Seafood',
 'Southern',
 'Spanish',
 'Steakhouses',
 'Taiwanese',
 'Tex-Mex',
 'Thai',
 'Vietnamese']

In [90]:
# #Redo cuisines we decided on in Saturday class
# biz_df4['categories'] = biz_df4['categories'].replace(
#     {' Thai': 'Thai'})

In [91]:
#ONLY keep cuisines
#biz_df4 = biz_df3[biz_df3['city'].str.contains('Charlotte|Las Vegas|Phoenix|Toronto', regex=True)]

In [92]:
# Resys = biz_df4.groupby(['categories'])

# Resys.count().head()

In [93]:
# Read Review data into Pandas
#review_df = pd.read_json(reviews)


### Transform Player DataFrames
* Reduce Player DFs to only the 9 columns we want to evaluate
* Update column headers to coorespond with schema 
* Add 'Season' column to DataFrames


In [94]:
# # 2018 Player Data reduction
# # Extract "Player", "Tm", "Pos", "G", "FG", "FG%", "FT", "FT%" and "PTS"
# reduced_player18_df = player_2018_df.loc[:, ["Player", "Tm", "Pos", "G", "FG", "FG%", "FT", "FT%", "PTS"]]
# reduced_player18_df.head(10)

In [95]:
# # 2019 Player Data reduction
# # Extract "Player", "Tm", "Pos", "G", "FG", "FG%", "FT", "FT%" and "PTS"
# reduced_player19_df = player_2019_df.loc[:, ["Player", "Tm", "Pos", "G", "FG", "FG%", "FT", "FT%", "PTS"]]
# reduced_player19_df.head(10)

In [96]:
# #Header updates, Player Data:
# revised_player18_df = reduced_player18_df.rename(columns={'Tm': 'team_name', 'Pos': 'Position',	'G': 'Games', 'FG%': 'FG_pct', 'FT%': 'FT_pct', 'PTS': 'Total_Pts'})

# revised_player19_df = reduced_player19_df.rename(columns={'Tm': 'team_name', 'Pos': 'Position',	'G': 'Games', 'FG%': 'FG_pct', 'FT%': 'FT_pct', 'PTS': 'Total_Pts', })

In [97]:
# # Add 'Season' column
# revised_player18_df.insert(0, "Season", 2018, True)

# revised_player19_df.insert(0, "Season", 2019, True)

### Transform Game DataFrames
* Remove the Box Score column from the Game Data
* Update the Game Data DFs so team names match the 3 letter accrynyms on the Player Data DFs
* Update column headers to coorespond with schema 
* Add 'Season' column 

In [98]:
# # 2018 Game Data reduction
# # Extract "Date", "Visitor/Neutral", "PTS", "Visitor/Neutral", "PTS.1"
# reduced_game18_df = wnba_2018_df.loc[:, ["Date", "Visitor/Neutral", "PTS", "Home/Neutral", "PTS.1"]]
# reduced_game18_df.head(10)

In [99]:
# # 2019 Game Data reduction
# # Extract "Date", "Visitor/Neutral", "PTS", "Visitor/Neutral", "PTS.1"
# reduced_game19_df = wnba_2019_df.loc[:, ["Date", "Visitor/Neutral", "PTS", "Home/Neutral", "PTS.1"]]
# reduced_game19_df.head(10)

In [100]:
# #Replace each full team name with 3-letter accrynym (i.e. DallasWings --> DAL)

# replacements = {
#     "Dallas Wings": "DAL",
#     "Chicago Sky": "CHI",
#     "New York Liberty": "NYL",
#     "Las Vegas Aces": "LVA",
#     "Atlanta Dream": "ATL",
#     "Los Angeles Sparks": "LAS",
#     "Phoenix Mercury": "PHO",
#     "Seattle Storm": "SEA",
#     "Indiana Fever": "IND",
#     "Washington Mystics": "WAS",
#     "Minnesota Lynx": "MIN",
#     "Connecticut Sun": "CON",
# }
# reduced_game18_df["Visitor/Neutral"].replace(replacements, inplace=True)

# reduced_game18_df["Home/Neutral"].replace(replacements, inplace=True)

# reduced_game19_df["Visitor/Neutral"].replace(replacements, inplace=True)

# reduced_game19_df["Home/Neutral"].replace(replacements, inplace=True)

In [101]:
# # Headers updates, Game Data:
# revised_game18_df = reduced_game18_df.rename(columns={'Visitor/Neutral': 'away_team', 'PTS': 'away_team_pts', 'Home/Neutral': 'home_team', 'PTS.1': 'home_team_pts'})

# revised_game19_df = reduced_game19_df.rename(columns={'Visitor/Neutral': 'away_team', 'PTS': 'away_team_pts', 'Home/Neutral': 'home_team', 'PTS.1': 'home_team_pts'})

In [102]:
# # Add 'Season' Column to Game Data
# revised_game18_df.insert(0, "Season", 2018, True)

# revised_game19_df.insert(0, "Season", 2019, True)

### Connect to local database

In [103]:
# connection_string = "postgres:postgres@localhost:5432/WNBA"
# engine = create_engine(f'postgresql://{connection_string}')

In [104]:
# # Confirm tables
# engine.table_names()

### Load DataFrames into database

In [105]:
# revised_game18_df.to_sql(name='Game_Data_18', con=engine, if_exists='append', index=True)

# revised_game19_df.to_sql(name='Game_Data_19', con=engine, if_exists='append', index=True)

# revised_player18_df.to_sql(name='Player_Data_18', con=engine, if_exists='append', index=True)

# revised_player19_df.to_sql(name='Player_Data_19', con=engine, if_exists='append', index=True)
