<a href="https://colab.research.google.com/github/AdvisorChuanChuan/CIS550-JuiceFinder/blob/final-submission/datasets/preprocessing/yelp_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Yelp Dataset Preprocessing and Ingestion into MySQL Database

## How to get the data
1. Download the Yelp Dataset from https://www.yelp.com/dataset.
2. Unzip the downloaded dataset locally on your computer.
3. Upload the `yelp_academic_dataset_business.json` file to the Colab notebook.

In [None]:
import pandas as pd

## Load Data

In [None]:
# Location of the uploaded business file (see "How to get the data" above).
business_json_path = '/content/yelp_academic_dataset_business.json'
# lines=True: the file is read as one JSON object per line.
df_full = pd.read_json(business_json_path, lines=True)

In [None]:
# Preview the first rows of the raw business table.
df_full.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,CA,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,MO,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,AZ,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,PA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,PA,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [None]:
# Summary statistics for the numeric columns of the raw business table.
df_full.describe()

Unnamed: 0,latitude,longitude,stars,review_count,is_open
count,150346.0,150346.0,150346.0,150346.0,150346.0
mean,36.67115,-89.357339,3.596724,44.866561,0.79615
std,5.872759,14.918502,0.974421,121.120136,0.40286
min,27.555127,-120.095137,1.0,5.0,0.0
25%,32.187293,-90.35781,3.0,8.0,1.0
50%,38.777413,-86.121179,3.5,15.0,1.0
75%,39.954036,-75.421542,4.5,37.0,1.0
max,53.679197,-73.200457,5.0,7568.0,1.0


## Preprocessing/Cleaning

### Remove businesses that are closed

In this dataset, there is a column `is_open`: 1 indicates the business is open and 0 indicates the business is closed.

In [None]:
# Keep only the rows whose `is_open` flag equals 1 (business is open).
df_clean = df_full.loc[df_full['is_open'].eq(1)]

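Now all rows have `is_open = 1`, so the column no longer carries any information and we remove it. (Note that `hours` can still be missing for some open businesses; no filtering on `hours` is done here.)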



In [None]:
# Every remaining row has is_open == 1, so the column is dropped.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
df_clean = df_clean.drop(columns=['is_open'], errors='ignore')

### Filter businesses by category

Get a sense of the categories

In [None]:
# Produce one row per (business, category): split the ', '-separated
# category string into a list, then explode the list into separate rows.
df_explode = (
    df_clean
    .assign(categories=lambda frame: frame['categories'].str.split(', '))
    .explode('categories')
)

In [None]:
# Count how many rows carry each individual category label.
df_explode['categories'].value_counts()

Restaurants               34987
Food                      20419
Shopping                  20186
Home Services             13322
Beauty & Spas             12263
                          ...  
Guamanian                     1
Cheese Tasting Classes        1
Bike Repair                   1
Tonkatsu                      1
Trade Fairs                   1
Name: categories, Length: 1302, dtype: int64

Filter the data: we only care about restaurants and food. So we only keep businesses within these two categories.

In [None]:
# Keep rows whose category string matches 'Restaurants' or 'Food'.
# This is a case-insensitive regex search over the whole string, so any label
# that merely contains one of these words matches as well; rows without a
# category string (NaN) are excluded via na=False.
df_food = df_clean.loc[
    df_clean['categories'].str.contains(pat='Restaurants|Food', case=False, na=False)
]

In [None]:
# Column dtypes and non-null counts of the filtered table.
df_food.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44582 entries, 3 to 150339
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   business_id   44582 non-null  object 
 1   name          44582 non-null  object 
 2   address       44582 non-null  object 
 3   city          44582 non-null  object 
 4   state         44582 non-null  object 
 5   postal_code   44582 non-null  object 
 6   latitude      44582 non-null  float64
 7   longitude     44582 non-null  float64
 8   stars         44582 non-null  float64
 9   review_count  44582 non-null  int64  
 10  attributes    43877 non-null  object 
 11  categories    44582 non-null  object 
 12  hours         39864 non-null  object 
dtypes: float64(3), int64(1), object(9)
memory usage: 4.8+ MB


In [None]:
# Summary statistics for the numeric columns of the filtered table.
df_food.describe()

Unnamed: 0,latitude,longitude,stars,review_count
count,44582.0,44582.0,44582.0,44582.0
mean,37.002406,-88.153724,3.548506,88.924745
std,6.092015,14.01782,0.888338,199.810367
min,27.564457,-120.083748,1.0,5.0
25%,32.193279,-90.253887,3.0,13.0
50%,39.483271,-86.051765,3.5,32.0
75%,39.960806,-75.378043,4.0,88.0
max,53.679197,-74.661348,5.0,7568.0


## Export to CSV

In [None]:
# Write the filtered table to yelp.csv, omitting the DataFrame index.
df_food.to_csv("yelp.csv", index=False)

# Use Python to populate location data

In [None]:
!pip install pymysql
import json
import os

import pymysql
from sqlalchemy import create_engine

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pymysql
  Downloading PyMySQL-1.0.3-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.7/43.7 KB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymysql
Successfully installed pymysql-1.0.3


In [None]:
def _to_insert_row(record):
    """Return one business row as a tuple of INSERT parameters.

    The longitude/latitude pair is rendered as a WKT 'POINT(lon lat)' string,
    and the `attributes` and `hours` values are serialized with json.dumps.
    """
    point_wkt = f'POINT({record["longitude"]} {record["latitude"]})'
    return (
        record['business_id'],
        record['name'],
        record['address'],
        record['city'],
        record['state'],
        record['postal_code'],
        point_wkt,
        record['stars'],
        record['review_count'],
        json.dumps(record['attributes']),
        record['categories'],
        json.dumps(record['hours']),
    )


# One parameter tuple per business, in DataFrame row order.
data = df_food.apply(_to_insert_row, axis=1).tolist()

In [None]:
# Sanity check: print every record whose postal code (tuple index 5)
# contains at least one character outside the ASCII digits 0-9.
for record in data:
    if any(not ('0' <= ch <= '9') for ch in record[5]):
        print(record)

('JX4tUpd09YFchLBuI43lGw', 'Naked Cyber Cafe & Espresso Bar', '10303  108 Street NW', 'Edmonton', 'AB', 'T5J 1L7', 'POINT(-113.506589 53.5446819)', 4.0, 12, '{"OutdoorSeating": "False", "BusinessParking": "{\'garage\': False, \'street\': True, \'validated\': True, \'lot\': True, \'valet\': False}", "WiFi": "u\'free\'", "RestaurantsPriceRange2": "2", "HasTV": "False", "Alcohol": "u\'none\'", "RestaurantsTakeOut": "True", "BikeParking": "True", "Smoking": "u\'no\'", "Music": "{\'dj\': False, \'background_music\': False, \'no_music\': False, \'jukebox\': False, \'live\': True, \'video\': False, \'karaoke\': False}", "Caters": "False", "DogsAllowed": "False", "CoatCheck": "False", "WheelchairAccessible": "True"}', 'Arts & Entertainment, Music Venues, Internet Service Providers, Nightlife, Food, Coffee & Tea, Jazz & Blues, Professional Services, Internet Cafes', '{"Monday": "11:0-1:0", "Tuesday": "11:0-1:0", "Wednesday": "11:0-1:0", "Thursday": "11:0-1:0", "Friday": "11:0-1:0", "Saturday": 

In [None]:
# Parameterized INSERT into the Restaurants table; each %s is filled in by the
# database driver from one tuple of `data`. The seventh parameter is a WKT
# point string that is parsed with ST_GeomFromText and then tagged with
# SRID 4326 via ST_SRID.
# NOTE(review): the WKT is built as 'POINT(longitude latitude)'; MySQL 8
# interprets SRID 4326 coordinates in latitude-longitude order by default --
# confirm the stored axis order matches what the queries expect.
insert_data_sql = '''
INSERT INTO Restaurants (business_id, name, address, city, state, postal_code, location, stars, review_count, attributes, categories, hours)
VALUES (%s, %s, %s, %s, %s, %s, ST_SRID(ST_GeomFromText(%s), 4326), %s, %s, %s, %s, %s)
'''

The real database credentials are omitted from this notebook for safety.

In [None]:
# Connection settings are read from environment variables so that no
# credentials are stored in the notebook: MYSQL_HOST, MYSQL_USER,
# MYSQL_PASSWORD, MYSQL_DATABASE and, optionally, MYSQL_PORT (default 3306).
# A missing required variable raises KeyError before any connection is attempted.
con = pymysql.connect(
    host=os.environ['MYSQL_HOST'],
    port=int(os.environ.get('MYSQL_PORT', '3306')),
    user=os.environ['MYSQL_USER'],
    password=os.environ['MYSQL_PASSWORD'],  # `passwd` is a deprecated alias
    database=os.environ['MYSQL_DATABASE'],  # `db` is a deprecated alias
)
# With PyMySQL 1.x the connection context manager closes the connection on exit.
with con:
    try:
        # The cursor context manager closes the cursor once the batch is sent.
        with con.cursor() as cur:
            cur.executemany(insert_data_sql, data)
        con.commit()
    except Exception:
        # Leave the table untouched if any row fails, then surface the error.
        con.rollback()
        raise