# Yelp Data Merging

In [26]:
# Copied and pasted from my imports from project 3, 
# delete if we dont need(Aerika)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction import stop_words

from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.linear_model import LogisticRegression

from sklearn.naive_bayes import MultinomialNB, ComplementNB, GaussianNB, BernoulliNB

from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, auc

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import ToktokTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords 

import regex as re

from scipy.stats import ttest_ind

In [27]:
file_paths = []
for file in os.listdir('../data'):
    if 'businesses2019' in file:
        file_paths.append('../data/'+file)

In [28]:
master_df = {
    'id': [],
    'latitude': [],
    'longitude': [],
    'price': [],
    'review_count': [],
    'rating': [],
    'zip_code': [],
    'city': [],
    'alias': [],
    'category': [],
}

master_df = pd.DataFrame(master_df)

In [29]:
for path in file_paths:
    
    master_df = pd.concat([master_df, pd.read_csv(path)])

master_df.head()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,alias,category,city,id,latitude,longitude,price,rating,review_count,zip_code
0,chichen-itza-restaurant-los-angeles-3,"['mexican', 'sandwiches', 'soup']",Los Angeles,vC_6J_nGyf4J8xt-Vu6Shw,34.01744,-118.2783,$$,4.5,1190.0,90007.0
1,,['childrensmuseums'],,,,,,,,
2,,['museums'],,,,,,,,
3,figueroa-philly-cheese-steak-los-angeles-2,"['cheesesteaks', 'sandwiches', 'breakfast_brun...",Los Angeles,vfHJzF0ShYtwmotXE-0PiA,34.014196,-118.282417,$$,4.5,1076.0,90037.0
4,dirt-dog-los-angeles-4,"['hotdog', 'beerbar']",Los Angeles,0z23Jk7U_MpvtqKINPL2fA,34.028292,-118.275208,$,4.5,1900.0,90007.0


In [30]:
# Seeing shape of dataframe BEFORE dropping duplicates 

master_df.shape

(73995, 10)

In [31]:
# Dropping duplicates

master_df = master_df.sort_values(['alias',
                       'id', 'latitude', 'longitude', 'review_count'],
                      ascending = False).drop_duplicates(subset = ['id', 'alias', 'zip_code'],
                                                         keep ='first')

In [32]:
# Shape of df AFTER dropping duplicates

master_df.shape

(10077, 10)

In [33]:
# Resetting master_df index

master_df.reset_index(inplace=True)
master_df.drop(columns='index', inplace=True)

# NaNs in master_df

In [34]:
master_df.isna().sum()

alias            1
category         1
city             1
id               1
latitude         3
longitude        3
price            1
rating           1
review_count     1
zip_code        43
dtype: int64

In [35]:
# Dropping rows w/ prices that aren't $, $$, $$$, or $$$$.

data_drop_df = master_df.loc[(master_df['price'] != '$') & (master_df['price'] != '$$') & (master_df['price'] != '$$$') & (master_df['price'] != '$$$$')]
drop_indexes = list(data_drop_df.index)

master_df.drop(index=drop_indexes, inplace = True)
master_df.shape


(9910, 10)

In [36]:
# Businesses that don't have latitude/zipcodes = Food Trucks, Caterers --> dropped

food_trucks = master_df.loc[(master_df['latitude'].isna()) | (master_df['zip_code'].isna())].index.tolist()

master_df.drop(index = food_trucks, inplace = True)
master_df.shape


(9872, 10)

In [37]:
master_df.isna().sum()

alias           0
category        0
city            0
id              0
latitude        0
longitude       0
price           0
rating          0
review_count    0
zip_code        0
dtype: int64

# Replacing \\$ in price to numbers

In [38]:
master_df['price'] = master_df['price'].map({'$$$$':4, '$$$':3, '$$':2,'$':1})

In [39]:
master_df.head(2)

Unnamed: 0,alias,category,city,id,latitude,longitude,price,rating,review_count,zip_code
0,いざかや-おせん-izakaya-osen-los-angeles,"['sushi', 'izakaya', 'seafood']",Los Angeles,us0WnDOySVXXXwCqs0AaCw,34.083192,-118.273245,2,4.5,651.0,90026.0
1,ô-banh-mi-los-angeles,"['vietnamese', 'sandwiches', 'coffee']",Los Angeles,XRbyfQZ7nWwJCOjRC8EW-g,34.09845,-118.2751,1,4.0,111.0,90027.0


In [41]:
master_df.loc[master_df['latitude']>]

Unnamed: 0,alias,category,city,id,latitude,longitude,price,rating,review_count,zip_code
172,yalla-mediterranean-culver-city-2,"['mediterranean', 'greek']",Culver City,gM5cq233dtS5CalGqe72mQ,34.023215,-118.394458,2,4.0,349.0,93453.0
723,tortas-ahogadas-los-primos-santa-ana,"['mexican', 'foodtrucks']",Santa Ana,PH3sasg6P3PzBqNbXh8XuQ,33.74901,-117.89935,1,4.5,94.0,92703.0
901,the-tackle-box-food-truck-westwood,"['foodtrucks', 'seafood', 'sandwiches']",Westwood,K6dRXEPvjIAj78iLX8AREQ,40.294842,-121.05352,2,3.5,47.0,96137.0
1163,the-feeding-frenzy-tustin-2,"['foodtrucks', 'desserts', 'catering']",Tustin,MaMcWQp1PjAgW_bXjViSVg,33.73095,-117.81275,1,5.0,7.0,92780.0
1298,the-cake-factory-and-more-fontana,"['desserts', 'bakeries', 'customcakes']",Fontana,geGRPNvHqt-O7YD-EFJE9w,34.15697,-117.48216,2,4.5,45.0,92336.0
2240,starbucks-marina-del-rey-10,['coffee'],Marina Del Rey,jKWO_ML-Kws1JdYpOlmTWw,33.984363,-118.443489,2,3.0,14.0,92092.0
2705,seventy7-west-westwood,"['cocktailbars', 'lounges', 'venues']",Westwood,Adp9Ba4dBaKasi1bfpZz-A,34.06191,-118.44764,2,4.5,19.0,96137.0
2816,salt-n-pepper-truck-costa-mesa,"['foodtrucks', 'steak', 'hotdog']",Costa Mesa,lNnmA8gz8_5GPCgnBqsUpg,33.677521,-117.92112,3,3.5,66.0,92626.0
3572,phenomnom-truck-huntington-beach-3,"['foodtrucks', 'southern']",Huntington Beach,xSoIMortNzykYHzWGyMOtw,33.709942,-117.990471,2,4.0,85.0,92647.0
4437,mess-hall-canteen-garden-grove-7,"['foodtrucks', 'sandwiches', 'salad']",Garden Grove,JiYslHHmxDRNtjDg_49fmQ,33.7885,-117.97395,2,4.0,178.0,92841.0


# Dropping Rows 
- convert zip_codes from float -> int -> str
- zipcodes that don't start with '9'

In [15]:
master_df = master_df.reset_index(drop=True)

In [16]:
master_df['zip_code'] = master_df['zip_code'].astype(int).astype(str)

In [17]:
notin_la = []
for i in range(len(master_df['zip_code'])):
    if master_df['zip_code'][i].startswith('9') == False:
        notin_la.append(master_df['zip_code'].index[i])

In [18]:
notin_la

[2211,
 2753,
 3269,
 3361,
 3362,
 3367,
 3386,
 3387,
 3400,
 3401,
 3404,
 5155,
 5687,
 8726]

In [19]:
master_df.shape

(9872, 10)

In [20]:
master_df = master_df.drop(index=notin_la).reset_index(drop=True)

# Removing rows that aren't around the same latitude/longitude

In [53]:
master_df = master_df.drop(index=master_df.loc[(master_df['longitude'] > -117) | (master_df['latitude'] > 35)].index).reset_index(drop=True)

In [54]:
master_df.shape

(9860, 10)

## Save the cleaned data to a CSV

In [56]:
# Don't uncomment this unless you actually want to create a new csv

# master_df.to_csv('../data/master.csv')