In [1]:
from pathlib import Path
import sys
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from data.paths.parquet_paths import BUSINESS

In [2]:
# Read the business table from the project's parquet location and print a
# schema / null-count overview of the loaded frame.
business_df = pd.read_parquet(path=str(BUSINESS))
business_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   10000 non-null  object 
 1   name          10000 non-null  object 
 2   address       10000 non-null  object 
 3   attributes    8590 non-null   object 
 4   city          10000 non-null  object 
 5   state         10000 non-null  object 
 6   postal_code   10000 non-null  object 
 7   latitude      10000 non-null  float32
 8   longitude     10000 non-null  float32
 9   stars         10000 non-null  float32
 10  review_count  10000 non-null  int32  
 11  is_open       10000 non-null  int32  
 12  categories    10000 non-null  object 
 13  hours         7716 non-null   object 
dtypes: float32(3), int32(2), object(9)
memory usage: 898.6+ KB


In [3]:
# Number of missing values in each column (column-wise sum of the null mask).
business_df.isna().sum(axis=0)

business_id        0
name               0
address            0
attributes      1410
city               0
state              0
postal_code        0
latitude           0
longitude          0
stars              0
review_count       0
is_open            0
categories         0
hours           2284
dtype: int64

In [4]:
# business_id becomes the row index below, so it has to identify rows uniquely.
# A bare `is_unique` expression that is not the last line of a cell is neither
# displayed nor enforced, hence the explicit assertion.
assert business_df['business_id'].is_unique, "business_id contains duplicate values"
business_df = business_df.set_index('business_id')

In [5]:
# Summary statistics for the rating and review-count columns.
business_df.loc[:, ['stars', 'review_count']].describe()

Unnamed: 0,stars,review_count
count,10000.0,10000.0
mean,3.55045,35.0676
std,1.01852,103.118055
min,1.0,3.0
25%,3.0,4.0
50%,3.5,9.0
75%,4.5,26.0
max,5.0,4138.0


In [6]:
# Peek at one randomly chosen row to see what the nested columns look like.
business_df.sample(n=1)

Unnamed: 0_level_0,name,address,attributes,city,state,postal_code,latitude,longitude,stars,review_count,is_open,categories,hours
business_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
jqlxOzP3SDGKvPIK-9CwBA,Panda Express,4984 S Power Rd,"{'AcceptsInsurance': None, 'AgesAllowed': None...",Mesa,AZ,85212,33.325409,-111.687141,1.0,3,0,"[Chinese, Restaurants]",


In [7]:
def append_nested_columns(dataframe: pd.DataFrame, col_name: str) -> pd.DataFrame:
    """
    Flattens a column of nested json (dicts) and appends the resulting columns
    to the target dataframe. The nested column itself is kept; drop it in the
    caller if it is no longer needed.

    :param dataframe: target pandas dataframe
    :param col_name: name of column containing json
    :returns: new dataframe with the same rows and index as ``dataframe`` plus
        one column per flattened key; rows whose nested value is missing get
        NaN in every appended column
    """
    # Missing entries (None / NaN / pd.NA) become empty dicts so that
    # json_normalize still emits exactly one row per input row.
    records = [
        {} if (value is None or value is pd.NA
               or (isinstance(value, float) and value != value)) else value
        for value in dataframe[col_name]
    ]
    nested = pd.json_normalize(records)
    # json_normalize returns a fresh RangeIndex. concat(axis=1) aligns on index
    # labels, so without this the two frames would be outer-joined on disjoint
    # labels whenever ``dataframe`` has a non-default index (e.g. business_id),
    # doubling the row count and leaving half-empty rows.
    nested.index = dataframe.index
    return pd.concat([dataframe, nested], axis=1)

In [8]:
# Expand the nested 'attributes' dicts into top-level columns, then discard
# the original nested column.
business_df = append_nested_columns(business_df, 'attributes').drop(columns='attributes')

In [9]:
# Keep only the two values that mean WiFi is available; every other value
# (including missing ones) is collapsed into 'no'.
has_wifi = ['paid', 'free']
business_df['WiFi'] = business_df['WiFi'].where(business_df['WiFi'].isin(has_wifi), 'no')
business_df['WiFi'].unique()

array(['no', 'free', 'paid'], dtype=object)

In [10]:
# Persist business_df with IPython's %store magic so it can be restored in
# another kernel/notebook via `%store -r business_df`.
%store business_df

Stored 'business_df' (DataFrame)


## Export business data to JSON

In [13]:
# Write the flattened business table to ../json-data/business.json as a JSON
# array with one object per business.
path = Path.cwd().joinpath('..', 'json-data', 'business.json').resolve()
# Make sure the output directory exists so the export does not fail on a
# fresh checkout.
path.parent.mkdir(parents=True, exist_ok=True)
# orient="records" never serialises the index (recent pandas releases raise a
# ValueError when index=True is combined with it), so business_id, which an
# earlier cell moved into the index, is turned back into a regular column first.
business_df.reset_index().to_json(path_or_buf=path, orient="records")