# Yelp Data Merging

In [1]:
# Copied and pasted from my imports from project 3, 
# delete if we dont need(Aerika)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB, BernoulliNB

from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import ToktokTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords 

import regex as re

from scipy.stats import ttest_ind

In [2]:
file_paths = []
for file in os.listdir('../data'):
    if 'businesses2019' in file:
        file_paths.append('../data/'+file)

In [3]:
master_df = {
    'id': [],
    'latitude': [],
    'longitude': [],
    'price': [],
    'review_count': [],
    'rating': [],
    'zip_code': [],
    'city': [],
    'alias': [],
    'category': [],
}

master_df = pd.DataFrame(master_df)

In [4]:
for path in file_paths:
    
    master_df = pd.concat([master_df, pd.read_csv(path)])

master_df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,alias,categories,category,city,id,latitude,longitude,price,rating,review_count,zip_code
0,chichen-itza-restaurant-los-angeles-3,,"['mexican', 'sandwiches', 'soup']",Los Angeles,vC_6J_nGyf4J8xt-Vu6Shw,34.01744,-118.2783,$$,4.5,1190.0,90007.0
1,,,['childrensmuseums'],,,,,,,,
2,,,['museums'],,,,,,,,
3,figueroa-philly-cheese-steak-los-angeles-2,,"['cheesesteaks', 'sandwiches', 'breakfast_brun...",Los Angeles,vfHJzF0ShYtwmotXE-0PiA,34.014196,-118.282417,$$,4.5,1076.0,90037.0
4,dirt-dog-los-angeles-4,,"['hotdog', 'beerbar']",Los Angeles,0z23Jk7U_MpvtqKINPL2fA,34.028292,-118.275208,$,4.5,1900.0,90007.0


In [5]:
# Seeing shape of dataframe BEFORE dropping duplicates 

master_df.shape

(127508, 11)

In [6]:
# Dropping duplicates

master_df = master_df.sort_values(['alias',
                       'id', 'latitude', 'longitude', 'review_count'],
                      ascending = False).drop_duplicates(subset = ['id'],
                                                         keep ='first')

In [7]:
# Shape of df AFTER dropping duplicates

master_df.shape

(29164, 11)

In [8]:
# Resetting master_df index

master_df.reset_index(inplace=True)
master_df.drop(columns='index', inplace=True)

# NaNs in master_df

In [9]:
master_df.isna().sum()

alias               1
categories      24450
category         4715
city                1
id                  1
latitude           17
longitude          17
price               1
rating              1
review_count        1
zip_code          125
dtype: int64

In [10]:
master_df_copy = master_df.copy()

In [11]:
master_df_copy.rename(columns={'category': 'categories_'}, inplace=True)

In [12]:
master_df_copy.drop(columns='categories', inplace=True)

In [13]:
master_df_copy.rename(columns={'categories_': 'categories'}, inplace=True)

In [14]:
master_df = master_df.combine_first(master_df_copy)

In [15]:
master_df.drop(columns='category', inplace=True)

In [16]:
master_df.isna().sum()

alias             1
categories        1
city              1
id                1
latitude         17
longitude        17
price             1
rating            1
review_count      1
zip_code        125
dtype: int64

In [17]:
# Dropping rows w/ prices that aren't $, $$, $$$, or $$$$.

data_drop_df = master_df.loc[(master_df['price'] != '$') & (master_df['price'] != '$$') & (master_df['price'] != '$$$') & (master_df['price'] != '$$$$')]
drop_indexes = list(data_drop_df.index)

master_df.drop(index=drop_indexes, inplace = True)
master_df.shape


(28992, 10)

In [18]:
# Businesses that don't have latitude/zipcodes = Food Trucks, Caterers --> dropped

food_trucks = master_df.loc[(master_df['latitude'].isna()) | (master_df['zip_code'].isna())].index.tolist()

master_df.drop(index = food_trucks, inplace = True)
master_df.shape


(28874, 10)

In [19]:
master_df.isna().sum()

alias           0
categories      0
city            0
id              0
latitude        0
longitude       0
price           0
rating          0
review_count    0
zip_code        0
dtype: int64

# Replacing \\$ in price to numbers

In [20]:
master_df['price'] = master_df['price'].map({'$$$$':4, '$$$':3, '$$':2,'$':1})

In [21]:
master_df.head(2)

Unnamed: 0,alias,categories,city,id,latitude,longitude,price,rating,review_count,zip_code
0,重庆特色小面-chongqing-special-noodles-san-gabriel-3,"['noodles', 'chinese']",San Gabriel,mm6l24khOfXhKqn4pKVojA,34.10307,-118.09184,1,4.0,204.0,91776
1,重庆小面-best-noodle-house-rosemead-3,"['noodles', 'szechuan']",Rosemead,DgXxBgUEvARu45x7RSgYRw,34.081182,-118.066226,1,4.0,141.0,91770


# Dropping Rows 
- convert zip_codes from float -> int -> str
- zipcodes that don't start with '9'

In [22]:
master_df = master_df.reset_index(drop=True)

In [23]:
master_df['zip_code'] = master_df['zip_code'].astype(int).astype(str)

In [24]:
notin_la = []
for i in range(len(master_df['zip_code'])):
    if master_df['zip_code'][i].startswith('9') == False:
        notin_la.append(master_df['zip_code'].index[i])

In [25]:
notin_la

[958,
 1738,
 3040,
 4562,
 6685,
 8107,
 9653,
 9942,
 9947,
 9958,
 9962,
 9968,
 9972,
 9978,
 9980,
 9989,
 9991,
 10003,
 10012,
 10034,
 10056,
 10057,
 10074,
 10078,
 10080,
 10085,
 10087,
 10088,
 10089,
 10104,
 10106,
 10109,
 10117,
 14818,
 15037,
 15282,
 16596,
 18768,
 24158,
 25502,
 25643,
 27140]

In [26]:
master_df.shape

(28874, 10)

In [27]:
master_df = master_df.drop(index=notin_la).reset_index(drop=True)

In [28]:
master_df.shape

(28832, 10)

# Removing rows that aren't around the same latitude/longitude

In [29]:
master_df.describe()

Unnamed: 0,latitude,longitude,price,rating,review_count
count,28832.0,28832.0,28832.0,28832.0,28832.0
mean,34.051109,-118.241874,1.480751,3.58494,241.149695
std,0.242592,0.385936,0.57738,0.812564,419.532416
min,33.167953,-149.429066,1.0,0.0,0.0
25%,33.949684,-118.384977,1.0,3.0,37.0
50%,34.052069,-118.263453,1.0,3.5,107.0
75%,34.132193,-118.122784,2.0,4.0,279.0
max,61.581385,-84.414552,4.0,5.0,16662.0


In [30]:
master_df = master_df.drop(index=master_df.loc[(master_df['longitude'] > -117) | (master_df['latitude'] > 35)].index).reset_index(drop=True)

In [31]:
master_df.shape

(28823, 10)

# Feature Engineering

## Save the cleaned data to a CSV

In [32]:
# Don't uncomment this unless you actually want to create a new csv

master_df.to_csv('../data/master_jerry.csv')