In [1]:
import pandas as pd

## Extract JSONs into DataFrames

In [2]:
# Establish paths to JSON files
biz = "Data/yelp_academic_dataset_business.json"
reviews = "Data/yelp_academic_dataset_review.json"

In [3]:
# Read Business data into Pandas
biz_df = pd.read_json(biz, lines=True)

#biz_df.head()

# Review & Scrub Data

In [5]:
#Assess Data TYpes
biz_df.dtypes

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
review_count      int64
is_open           int64
attributes       object
categories       object
hours            object
dtype: object

In [30]:
#Assess what cities to pull in: Las Vegas, Toronto, Phoenix, Charlotte. Not including Scottsdale as it's close to Phoenix
biz_df['city'].value_counts().head()

Las Vegas     31631
Toronto       20366
Phoenix       20171
Charlotte     10422
Scottsdale     9342
Name: city, dtype: int64

In [7]:
#Remove closed businesses, column is_open
OpenBiz = biz_df[biz_df['is_open']==1]

#OpenBiz.head()

In [10]:
#Check if "0" were removed
OpenBiz['is_open'].value_counts().head(10)

1    168903
Name: is_open, dtype: int64

In [29]:
#Drop "is_open" as it is no longer needed
biz_df2 = OpenBiz.drop('is_open', axis=1)

#biz_df2.head()

In [31]:
#Split categories (strings) in to their own row
Categories = biz_df2.assign(categories = biz_df2.categories.str.split(',')).explode('categories')

#Categories.head()

In [32]:
#Why are there two different "Restaurant" categories????

Categories.categories.value_counts().head()

 Restaurants      30776
 Shopping         21555
 Food             18475
 Home Services    14886
Restaurants       13189
Name: categories, dtype: int64

In [34]:
Categories[Categories['categories'].str.contains('Restaurants', case=True, na=False)].categories.value_counts()

 Restaurants           30776
Restaurants            13189
 Pop-Up Restaurants       12
Pop-Up Restaurants         4
Name: categories, dtype: int64

In [None]:
#Update "Restaurants"(2) to uniquely be "Restaurant" as the category

In [None]:
#Remove all items that are not "Restaurant"

In [35]:
# Read Review data into Pandas
#review_df = pd.read_json(reviews)


### Transform Player DataFrames
* Reduce Player DFs to only the 9 columns we want to evaluate
* Update column headers to coorespond with schema 
* Add 'Season' column to DataFrames


In [None]:
# 2018 Player Data reduction
# Extract "Player", "Tm", "Pos", "G", "FG", "FG%", "FT", "FT%" and "PTS"
reduced_player18_df = player_2018_df.loc[:, ["Player", "Tm", "Pos", "G", "FG", "FG%", "FT", "FT%", "PTS"]]
reduced_player18_df.head(10)

In [None]:
# 2019 Player Data reduction
# Extract "Player", "Tm", "Pos", "G", "FG", "FG%", "FT", "FT%" and "PTS"
reduced_player19_df = player_2019_df.loc[:, ["Player", "Tm", "Pos", "G", "FG", "FG%", "FT", "FT%", "PTS"]]
reduced_player19_df.head(10)

In [None]:
#Header updates, Player Data:
revised_player18_df = reduced_player18_df.rename(columns={'Tm': 'team_name', 'Pos': 'Position',	'G': 'Games', 'FG%': 'FG_pct', 'FT%': 'FT_pct', 'PTS': 'Total_Pts'})

revised_player19_df = reduced_player19_df.rename(columns={'Tm': 'team_name', 'Pos': 'Position',	'G': 'Games', 'FG%': 'FG_pct', 'FT%': 'FT_pct', 'PTS': 'Total_Pts', })

In [None]:
# Add 'Season' column
revised_player18_df.insert(0, "Season", 2018, True)

revised_player19_df.insert(0, "Season", 2019, True)

### Transform Game DataFrames
* Remove the Box Score column from the Game Data
* Update the Game Data DFs so team names match the 3 letter accrynyms on the Player Data DFs
* Update column headers to coorespond with schema 
* Add 'Season' column 

In [None]:
# 2018 Game Data reduction
# Extract "Date", "Visitor/Neutral", "PTS", "Visitor/Neutral", "PTS.1"
reduced_game18_df = wnba_2018_df.loc[:, ["Date", "Visitor/Neutral", "PTS", "Home/Neutral", "PTS.1"]]
reduced_game18_df.head(10)

In [None]:
# 2019 Game Data reduction
# Extract "Date", "Visitor/Neutral", "PTS", "Visitor/Neutral", "PTS.1"
reduced_game19_df = wnba_2019_df.loc[:, ["Date", "Visitor/Neutral", "PTS", "Home/Neutral", "PTS.1"]]
reduced_game19_df.head(10)

In [None]:
#Replace each full team name with 3-letter accrynym (i.e. DallasWings --> DAL)

replacements = {
    "Dallas Wings": "DAL",
    "Chicago Sky": "CHI",
    "New York Liberty": "NYL",
    "Las Vegas Aces": "LVA",
    "Atlanta Dream": "ATL",
    "Los Angeles Sparks": "LAS",
    "Phoenix Mercury": "PHO",
    "Seattle Storm": "SEA",
    "Indiana Fever": "IND",
    "Washington Mystics": "WAS",
    "Minnesota Lynx": "MIN",
    "Connecticut Sun": "CON",
}
reduced_game18_df["Visitor/Neutral"].replace(replacements, inplace=True)

reduced_game18_df["Home/Neutral"].replace(replacements, inplace=True)

reduced_game19_df["Visitor/Neutral"].replace(replacements, inplace=True)

reduced_game19_df["Home/Neutral"].replace(replacements, inplace=True)

In [None]:
# Headers updates, Game Data:
revised_game18_df = reduced_game18_df.rename(columns={'Visitor/Neutral': 'away_team', 'PTS': 'away_team_pts', 'Home/Neutral': 'home_team', 'PTS.1': 'home_team_pts'})

revised_game19_df = reduced_game19_df.rename(columns={'Visitor/Neutral': 'away_team', 'PTS': 'away_team_pts', 'Home/Neutral': 'home_team', 'PTS.1': 'home_team_pts'})

In [None]:
# Add 'Season' Column to Game Data
revised_game18_df.insert(0, "Season", 2018, True)

revised_game19_df.insert(0, "Season", 2019, True)

### Connect to local database

In [None]:
connection_string = "postgres:postgres@localhost:5432/WNBA"
engine = create_engine(f'postgresql://{connection_string}')

In [None]:
# Confirm tables
engine.table_names()

### Load DataFrames into database

In [None]:
revised_game18_df.to_sql(name='Game_Data_18', con=engine, if_exists='append', index=True)

revised_game19_df.to_sql(name='Game_Data_19', con=engine, if_exists='append', index=True)

revised_player18_df.to_sql(name='Player_Data_18', con=engine, if_exists='append', index=True)

revised_player19_df.to_sql(name='Player_Data_19', con=engine, if_exists='append', index=True)
