In [1]:
import pandas as pd

## Extract JSONs into DataFrames

In [2]:
# Establish paths to JSON files
biz = "yelp_academic_dataset_business.json"
reviews = "yelp_academic_dataset_review.json"
biz

'yelp_academic_dataset_business.json'

In [3]:
# Read Business data into Pandas
biz_df = pd.read_json(biz, lines=True)

#biz_df.head()

## Review & Scrub Data

In [4]:
#Assess Data Types... if needed
biz_df.dtypes

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object

In [5]:
#Assess what cities to pull in: Las Vegas, Toronto, Phoenix, Charlotte. Not including Scottsdale as it's close to Phoenix
biz_df['city'].value_counts().head()

Las Vegas     31631
Toronto       20366
Phoenix       20171
Charlotte     10422
Scottsdale     9342
Name: city, dtype: int64

In [6]:
#Remove closed businesses, column is_open
OpenBiz = biz_df[biz_df['is_open']==1]

#OpenBiz.head()

In [7]:
#Check if "0" were removed
OpenBiz['is_open'].value_counts().head(10)


1    168903
Name: is_open, dtype: int64

In [8]:
#Drop "is_open" as it is no longer needed
biz_df2 = OpenBiz.drop('is_open', axis=1)

#biz_df2.head()

In [9]:
#Split categories (strings) in to their own row
Categories = biz_df2.assign(categories = biz_df2.categories.str.split(',')).explode('categories')

# Categories.head()

In [10]:
#Check category counts, "Restaurants" = 30776 & " Restaurants" = 13189

Categories.categories.value_counts()

 Restaurants                 30776
 Shopping                    21555
 Food                        18475
 Home Services               14886
Restaurants                  13189
                             ...  
 Geneticists                     1
Music Production Services        1
Municipality                     1
 Ceremonial Clothing             1
 Halfway Houses                  1
Name: categories, Length: 2505, dtype: int64

In [11]:
#Update " Restaurants" to just "Restaurants"
Categories['categories'] = Categories['categories'].replace(
    {' Restaurants': 'Restaurants'})

In [12]:
#Check category counts... checks out

Categories.categories.value_counts()

Restaurants                  43965
 Shopping                    21555
 Food                        18475
 Home Services               14886
 Health & Medical            12758
                             ...  
 Geneticists                     1
Music Production Services        1
Municipality                     1
 Ceremonial Clothing             1
 Halfway Houses                  1
Name: categories, Length: 2504, dtype: int64

In [13]:
#Create CSV for the front end ppl to review as scrub continues
#Categories.to_csv("Data/Yelp_InitialScrub.csv")

In [14]:
# Establish paths to CSV path
biz2 = "Yelp_InitialScrub.csv"

In [15]:
bizDF = pd.read_csv(biz2)
#bizDF.head()

FileNotFoundError: [Errno 2] File Data/Yelp_InitialScrub.csv does not exist: 'Data/Yelp_InitialScrub.csv'

In [None]:
Attributes = bizDF.assign(attributes = bizDF.attributes.str.split(',')).explode('attributes')
#Attributes.head()

In [None]:
#Attributes.dtypes... attributes is an object

In [None]:
Attributes['attributes'].astype(str)

In [None]:
#Pull all attributes that contain info we want: Restaurants NEED CUISINE INFO
Attributes[Attributes['attributes'].str.contains('RestaurantsTakeOut', case=True, na=False)].attributes.value_counts()

In [None]:
#Only keep items that contain food restaurants, i.e. RestaurantsTakeOut

biz_df3 = Attributes[Attributes['attributes'].str.contains("'RestaurantsTakeOut': 'True'", na=False)]

#biz_df3.head()

In [None]:
# Check that attributes drill down worked
sorted(biz_df3['attributes'].unique())

In [None]:
# Check for repeats/spelling
#sorted(biz_df3['city'].unique())

In [None]:
#Update Las Vegas names to be uniform. MANY spelling errors in data

biz_df3.loc[:,'city'] = biz_df3.loc[:,'city'].replace(
    ['C Las Vegas', 'Henderson and Las Vegas','Lake Las Vegas','Las  Vegas','LAS VEGAS AP','Las Vegas East'\
     ,'Las Vegas Nevada','Las Vegas Nv','Las Vegas,','Las Vegas, Nevada','Las Vegas, NV','Las Vegass'\
     ,'N E Las Vegas','N Las Vegas','N W Las Vegas','N. Las Vegas','N.Las Vegas','North Las Vegas','South Las Vegas'\
     ,'West Las Vegass','Las vegas','LAS VEGAS'] , 'Las Vegas')


In [None]:
# Check after updates
#sorted(biz_df3['city'].unique())

In [None]:
#Update Charlotte names to be uniform. MANY spelling errors in data

biz_df3.loc[:,'city'] = biz_df3.loc[:,'city'].replace(
    ['North Charlotte', 'South Charlotte','Charotte'] , 'Charlotte')

In [None]:
# Check after updates
#sorted(biz_df3['city'].unique())

In [None]:
#Update Toronto names to be uniform. MANY spelling errors in data

biz_df3.loc[:,'city'] = biz_df3.loc[:,'city'].replace(
    ['Downtown Toronto', 'North Toronto','TORONTO - DANFORTH (OT)','TORONTO - FRONT ST (OT)','Toronto Division'\
     ,'Toronto-Etobicoke','Toronto-North York','Toronto-West','West Toronto','Tornto'] , 'Toronto')

In [None]:
# Check after updates
#sorted(biz_df3['city'].unique())

In [None]:
biz_df3.loc[:,'city'] = biz_df3.loc[:,'city'].replace(
    ['Metro Phoenix', 'North Phoenix','PHOENIX AP','Phoenix AZ','Phoenix Valley','Phoenix,','Phoenix,AZ'\
     ,'Phoneix','Phonenix,','Phoniex','Phx'],'Phoenix')

In [None]:
# Check after updates
#sorted(biz_df3['city'].unique())

In [None]:
# ONLY keep items in City: Charlotte, Las Vegas, Phoenix, Toronto

biz_df4 = biz_df3[biz_df3['city'].str.contains('Charlotte|Las Vegas|Phoenix|Toronto', regex=True)]

#biz_df4.head()

In [None]:
# Check after updates
sorted(biz_df4['city'].unique())

In [None]:
#Create CSV for the front end ppl to review as scrub continues V2
#biz_df4.to_csv("Data/Yelp_InitialScrubV2.csv")

In [None]:
#Categories to filter through
#sorted(biz_df4['categories'].unique())

In [None]:
biz_df4.categories.value_counts()

In [None]:
#Update " Restaurants" to just "Restaurants"
biz_df4.loc[:,'categories'] = biz_df4.loc[:,'categories'].replace(
    [' Food'] , 'Food')

In [None]:
biz_df4.loc[:,'categories'] = biz_df4.loc[:,'categories'].replace(
    [' Restaurants'], 'Restaurants')

In [None]:
biz_df4.categories.value_counts().head(25)

In [None]:
#Remove Restaurants & Food as these are typically double qualifiers to cuisines

biz_df5 = biz_df4[~biz_df4['categories'].str.contains('Restaurants|Food| Nightlife| Bars|Bars| Coffee & Tea| Sandwiches|\
    | Breakfast & Brunch| Specialty Food| Desserts| Event Planning & Services| Bakeries| Cafes|Cafes| Chicken Wings|\
    |Coffee & Tea| Salad| Caterers| Grocery|Sandwiches| Ice Cream & Frozen Yogurt|Pizza| Wine & Spirits| Delis|\
    | Shopping| Vegetarian| Tacos| Beer| Burgers| Convenience Store|Nightlife| Vegan|Breakfast & Brunch| Diners|Bars|\
    | Gluten-Free| Gas Stations| Soup|Desserts| Automotive|Bakeries| Chicken Shop| Arts & Entertainment|Grocery|Cafes|\
    | Bubble Tea|Chicken Wings| Noodles|Ice Cream & Frozen Yogurt|Salad| Donuts| Health Markets| Lounges| Hot Dogs|\
    | Drugstores| Venues & Event Spaces| Buffets| Bagels|Event Planning & Services| Tea Rooms|Delis| Breweries|\
    |Caterers| Ramen| Custom Cakes| Meat Shops| Street Vendors| Cupcakes| Flowers & Gifts| Chocolatiers & Shops|\
    |Diners| Local Flavor|Beer|Shopping|Donuts| Fish & Chips|Convenience Store| Shaved Ice| Poke|Burgers| Dim Sum|\
    |Bubble Tea|Vegetarian| Hotels & Travel| Patisserie/Cake Shop| Music Venue|Gas Station| Creperies| Fashion|\
    | Farmers Market|Vegan| Karaoke| Casinos| Coffee Roasteries| Department Stores|Soup| Seafood Markets|\
    | Fruits & Veggies| Organic Stores|Hot Dogs|Gluten-Free| Acai Bowls|Noodles| Cheesesteaks|Automotive| Active Life|\
    | Hotels| Wraps| Candy Stores| Butcher|Chicken Shop| Cheese Shops| Service Stations| Kebab|Bagels| Gelato|\
    | Health & Medical|Tea Rooms| Waffles| Falafel| Party & Event Planning|Buffets| Pasta Shops|Local Flavor|\
    | Modern European|Health Markets| Beauty & Spas| Poutineries|Tacos| Personal Chefs| Florists| Gift Shops|Ramen|\
    | New Mexican Cuisine|Seafood Markets| Japanese Curry|New Mexican Cuisine| Traditional Chinese Medicine|\
    |Japanese Curry',na=False)]

In [None]:
biz_df5.categories.value_counts().head(50)

In [None]:
#Edit spaces out of names to make unique

biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' American (New)'], 'American (New)')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Asian Fusion'], 'Asian Fusion')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' American (Traditional)'], 'American (Traditional)')

In [None]:
biz_df5.categories.value_counts().head(10)

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Barbeque'], 'Barbeque')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Cajun/Creole'], 'Cajun/Creole')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Canadian (New)'], 'Canadian (New)')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Caribbean'], 'Caribbean')

In [None]:
biz_df5.categories.value_counts().head(10)

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Chinese'], 'Chinese')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' French'], 'French')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Greek'], 'Greek')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Halal'], 'Halal')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Hawaiian'], 'Hawaiian')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Indian'], 'Indian')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Italian'], 'Italian')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Japanese'], 'Japanese')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Korean'], 'Korean')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Latin American'], 'Latin American')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Mexican'], 'Mexican')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Mediterranean'], 'Mediterranean')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Middle Eastern'], 'Middle Eastern')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Pubs'], 'Pubs')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    ['Pubs'], 'Gastropubs')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Gastropubs'], 'Gastropubs')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Seafood'], 'Seafood')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Southern'], 'Southern')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Steakhouses'], 'Steakhouses')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Tapas/Small Plates'], 'Tapas/Small Plates')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Tex-Mex'], 'Tex-Mex')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Thai'], 'Thai')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Filipino'], 'Filipino')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Pakistani'], 'Pakistani')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Taiwanese'], 'Taiwanese')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Portuguese'], 'Portuguese')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Persian/Iranian'], 'Persian/Iranian')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Salvadoran'], 'Salvadoran')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Kosher'], 'Kosher')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' African'], 'African')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Spanish'], 'Spanish')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Cantonese'], 'Cantonese')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' British'], 'British')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Irish'], 'Irish')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Turkish'], 'Turkish')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Ethiopian'], 'Ethiopian')

In [None]:
biz_df5.loc[:,'categories'] = biz_df5.loc[:,'categories'].replace(
    [' Vietnamese'], 'Vietnamese')

In [None]:
biz_df5.categories.value_counts().head(40)

In [None]:
# ONLY keep top 40 categories over all 4 cities

FinalDF = biz_df5[biz_df5['categories'].str.contains('Mexican|American (Traditional)|Chinese|American (New)|Italian|\
    |Japanese|Seafood|Asian Fusion|Gastropubs|Mediterranean|Barbeque|Thai|Indian|Canadian (New)|Middle Eastern|\
    |Vietnamese|Steakhouses|Korean|Greek|Tex-Mex|Latin American|Southern|Halal|Caribbean|Hawaiian|Tapas/Small Plates\
    |Pakistani|French|Filipino|Cajun/Creole|Portuguese|Taiwanese|Persian/Iranian|Salvadoran|Kosher|Cantonese|African\
    |Spanish|Ethiopian|British', regex=True)]

FinalDF.head()


In [None]:
FinalDF.categories.value_counts().head(45)

In [None]:
# Check after updates, cities
sorted(FinalDF['city'].unique())

In [None]:
# Check after updates, cuisines
sorted(FinalDF['categories'].unique())

In [None]:
#Create FINAL CSV for the front end ppl to review as scrub continues V3
#FinalDF.to_csv("Data/Yelp_FinalScrub.csv")

# #Mongo Connection (Eline)

In [None]:
import pymongo
from pymongo import MongoClient

In [None]:
client = MongoClient('localhost', 27017)

In [None]:
db = client['YelpRestaurant']

In [None]:
db

In [None]:
FinalDF

In [None]:
import json
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)
db = client.YelpRestaurant
collection = db.YelpRestaurant
collection.update({}, db, upsert=True)
# data = json.loads(YelpRestaurant.to_json(orient='records'))
# collection.insert_many(data)
cursor = list(collection.find({}))
for document in cursor:
    print(document)

In [None]:
biz = 

### Transform Player DataFrames
* Reduce Player DFs to only the 9 columns we want to evaluate
* Update column headers to coorespond with schema 
* Add 'Season' column to DataFrames


### Transform Game DataFrames
* Remove the Box Score column from the Game Data
* Update the Game Data DFs so team names match the 3 letter accrynyms on the Player Data DFs
* Update column headers to coorespond with schema 
* Add 'Season' column 

### Connect to local database

In [None]:
# connection_string = "postgres:postgres@localhost:5432/WNBA"
# engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# # Confirm tables
# engine.table_names()

### Load DataFrames into database

In [None]:
# revised_game18_df.to_sql(name='Game_Data_18', con=engine, if_exists='append', index=True)

# revised_game19_df.to_sql(name='Game_Data_19', con=engine, if_exists='append', index=True)

# revised_player18_df.to_sql(name='Player_Data_18', con=engine, if_exists='append', index=True)

# revised_player19_df.to_sql(name='Player_Data_19', con=engine, if_exists='append', index=True)
