<h1>Ingestion of Businesses and Categories</h1>

Our Goal in this Notebook: 

1. Ingest the Pennsylvania (PA) business metadata from the Google Local Data dataset.
2. Write this metadata, and the per-business category data, out as CSV files.

In [1]:
import pandas as pd 
import numpy as np 
import json 
import os


In [3]:
# Location of the raw metadata file; the ingestion functions below read it one JSON object per line.
raw_data_path = '../data/google/meta-Pennsylvania.json'

## Ingest Categories

In [5]:
def process_category_data(raw_data_path):
    """Build a long-format (gmap_id, category) table from a JSON-lines file.

    Every non-blank line of the file is parsed as one JSON object. A record
    with several categories contributes one row per category; a record whose
    ``category`` field is missing, null or empty contributes no rows.

    Parameters
    ----------
    raw_data_path : str or path-like
        Path to the JSON-lines metadata file.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``gmap_id`` and ``category``, one row per
        (record, category) pair, in file order.

    Raises
    ------
    KeyError
        If a record that has categories lacks a ``gmap_id``.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    data_list = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(raw_data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            # Skip empty / null records, as process_business_data does.
            if not data:
                continue
            # .get() so a record without a 'category' key is skipped
            # instead of aborting the whole ingestion with a KeyError.
            categories = data.get('category')
            # Records with no categories produce no rows at all.
            if not categories:
                continue
            gmap_id = data['gmap_id']
            data_list.extend([gmap_id, category] for category in categories)
    return pd.DataFrame(data_list, columns=['gmap_id', 'category'])

In [6]:
# Build the long-format (gmap_id, category) table from the raw metadata file.
df_categories = process_category_data(raw_data_path)

In [7]:
# Preview the first rows of the category table.
df_categories.head()

Unnamed: 0,gmap_id,category
0,0x89c46d5e4554eae1:0xa2f8b211524ca29a,Pizza delivery
1,0x89c48c790b767ec7:0x1943c20a42196a68,HVAC contractor
2,0x89c48c790b767ec7:0x1943c20a42196a68,Air conditioning contractor
3,0x89c48c790b767ec7:0x1943c20a42196a68,Air conditioning repair service
4,0x89c48c790b767ec7:0x1943c20a42196a68,Air duct cleaning service


In [8]:
# Summary statistics (count / unique / top / freq) for both columns.
df_categories.describe()

Unnamed: 0,gmap_id,category
count,428166,428166
unique,188950,3484
top,0x89c6b24cc647a23f:0x6ddc895d0ab4520f,Restaurant
freq,20,11801


In [9]:
# Persist the (gmap_id, category) pairs as CSV; index=False leaves the DataFrame index out of the file.
df_categories.to_csv('../data/google/pa_categories.csv', index=False)

## Ingest Businesses

In [4]:
def process_business_data(raw_data_path):
    """Build a one-row-per-record business table from a JSON-lines file.

    Every non-blank line of the file is parsed as one JSON object; empty or
    null records are skipped. Optional fields that are missing from a record
    are treated like null instead of aborting the ingestion.

    Falsy values of ``description``, ``avg_rating`` and ``price`` (null,
    empty string, 0) are normalised to ``None``; a falsy or missing
    ``num_of_reviews`` becomes ``0``.

    Parameters
    ----------
    raw_data_path : str or path-like
        Path to the JSON-lines metadata file.

    Returns
    -------
    pandas.DataFrame
        Columns ``gmap_id``, ``address``, ``description``, ``latitude``,
        ``longitude``, ``avg_rating``, ``price`` and ``num_reviews``, one row
        per non-empty record, in file order.

    Raises
    ------
    KeyError
        If a non-empty record lacks a ``gmap_id``.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    columns = ['gmap_id', 'address', 'description', 'latitude', 'longitude', 'avg_rating', 'price', 'num_reviews']
    data_list = []
    # Explicit encoding so parsing does not depend on the platform default.
    with open(raw_data_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data = json.loads(line)
            # Skip empty / null records.
            if not data:
                continue

            data_list.append([
                data['gmap_id'],                  # primary key: stay strict, fail loudly if absent
                data.get('address'),              # string or None
                data.get('description') or None,  # string or None
                data.get('latitude'),             # float or None
                data.get('longitude'),            # float or None
                data.get('avg_rating') or None,   # float or None
                data.get('price') or None,        # string or None
                data.get('num_of_reviews') or 0,  # missing / null counts as zero reviews
            ])
    return pd.DataFrame(data_list, columns=columns)

In [5]:
# Build the one-row-per-record business table from the same raw metadata file.
df_businesses = process_business_data(raw_data_path)

In [7]:
# Persist the business table as CSV; index=False leaves the DataFrame index out of the file.
df_businesses.to_csv('../data/google/pa_businesses.csv', index=False)