### Imports

In [26]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Reading in the data

In [27]:
# Load the Yelp restaurant review export and preview the first few rows.
reviews_csv = './yelp_dataset/restaurant_reviews.csv'
df = pd.read_csv(reviews_csv)
df.head()


Unnamed: 0.1,Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,user_id,review_stars,text,date
0,0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",2K62MJ4CJ19L8Tp5pRfjfQ,3.0,My girlfriend and I went for dinner at Emerald...,2017-01-27 21:54:30
1,1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",A0kENtCCoVT3m7T35zb2Vg,3.0,We've always been there on a Sunday so we were...,2013-06-24 23:11:30
2,2,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",SuOLY03LW5ZcnynKhbTydA,3.0,"***No automatic doors, not baby friendly!*** I...",2016-01-04 12:59:22
3,3,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",lymyUak6KNcNKoDbK87MiQ,1.0,"Horrible service,\nI went there tonight with m...",2014-05-09 02:38:43
4,4,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",6vU0I5XgCv9OQHZ76rV6qw,4.0,One of the gauges of a good Chinese restaurant...,2011-03-21 14:39:55


### Preliminary EDA & Cleaning

In [28]:
# Drop the leftover "Unnamed: ..." columns that came along with the CSV.
# The match is case-insensitive on the column name. Reassigning instead of using
# inplace=True keeps the step explicit and avoids mutating the frame object in place.
unnamed_cols = df.columns[df.columns.str.contains('unnamed', case=False)]
df = df.drop(columns=unnamed_cols)

In [29]:
# Confirm the unnamed columns are gone by previewing the first two rows.
df.head(2)

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,user_id,review_stars,text,date
0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",2K62MJ4CJ19L8Tp5pRfjfQ,3.0,My girlfriend and I went for dinner at Emerald...,2017-01-27 21:54:30
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,30 Eglinton Avenue W,Mississauga,ON,L5R 3E7,43.605499,-79.652289,2.5,"{'RestaurantsReservations': 'True', 'GoodForMe...","Specialty Food, Restaurants, Dim Sum, Imported...",A0kENtCCoVT3m7T35zb2Vg,3.0,We've always been there on a Sunday so we were...,2013-06-24 23:11:30


In [30]:
# Remind ourselves of the frame's dimensions as (rows, columns).
df.shape

(3566764, 15)

In [31]:
# List the column names that remain in the frame.
df.columns

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'attributes', 'categories', 'user_id',
       'review_stars', 'text', 'date'],
      dtype='object')

In [32]:
# Double-check the dtype pandas inferred for each column.
df.dtypes

business_id      object
name             object
address          object
city             object
state            object
postal_code      object
latitude        float64
longitude       float64
stars           float64
attributes       object
categories       object
user_id          object
review_stars    float64
text             object
date             object
dtype: object

In [33]:
# Further investigation of user_id: look at a single value (index label 1).
# The id is a mix of letters and numbers, so the 'object' dtype is correct.
df['user_id'][1]

'A0kENtCCoVT3m7T35zb2Vg'

In [34]:
# Why is date an object? Look at a single value (index label 1): it is a timestamp
# stored as a string, and read_csv was not asked to parse dates.
# Not sure the date is needed, but keeping it for now.
df['date'][1]

'2013-06-24 23:11:30'

In [35]:
# Count rows per exact category string. The same categories listed in a different
# order (e.g. "Mexican, Restaurants" vs "Restaurants, Mexican") are counted separately.
df['categories'].value_counts()

Mexican, Restaurants                                                        46609
Restaurants, Mexican                                                        43207
Chinese, Restaurants                                                        27349
Pizza, Restaurants                                                          26664
Restaurants, Pizza                                                          22855
                                                                            ...  
Caterers, Restaurants, Soul Food, Event Planning & Services                     3
Restaurants, Coffee & Tea, Sandwiches, Food, Cafes                              3
Mediterranean, International Grocery, Restaurants, Delis, Food, Grocery         3
Restaurants, Thai, Vietnamese, Chinese                                          3
Chicken Wings, Restaurants, Pizza, Food, Food Delivery Services, Italian        3
Name: categories, Length: 23636, dtype: int64

In [36]:
# Count missing values in each column of the full dataset.
df.isnull().sum()

business_id         0
name                1
address          5968
city                1
state               1
postal_code       972
latitude            1
longitude           1
stars               1
attributes      15069
categories          1
user_id             1
review_stars        1
text                2
date                2
dtype: int64

In [37]:
# I want to cut the data down by focusing on one state, so list the state codes present.
df['state'].unique()

array(['ON', 'NC', 'AZ', 'OH', 'PA', 'NV', 'QC', 'AB', 'WI', 'IL', 'NY',
       'SC', 'NM', 'VA', 'NE', 'CA', 'WA', 'XWY', 'CON', 'BC', 'VT', nan,
       'XGM', 'AR', 'AL'], dtype=object)

In [38]:
# Count rows per state code to see how much data each state has.
df['state'].value_counts()

NV     1175316
AZ     1013354
ON      474983
NC      236604
OH      198821
PA      176969
QC      121365
WI       79190
AB       54216
IL       23675
SC       12105
NY          95
NM          14
VA           9
XWY          8
NE           8
AR           7
VT           5
CA           4
WA           3
CON          3
BC           3
XGM          3
AL           3
Name: state, dtype: int64

In [39]:
# Keep only the rows whose state code is 'PA'.
is_pa = df['state'].eq('PA')
df = df.loc[is_pa]

In [40]:
# Dimensions (rows, columns) after keeping only the 'PA' rows.
df.shape

(176969, 15)

In [41]:
# Thinking of breaking it down even further. What's the best city to use?
# Count rows per city within the current subset.
df['city'].value_counts()

Pittsburgh     138950
Monroeville      3733
Homestead        2307
Wexford          2177
Bridgeville      2158
                ...  
Sutersville         4
Aliquippa           3
Plum Boro           3
Port Vue            3
Midway              3
Name: city, Length: 152, dtype: int64

In [42]:
# Keep only the rows whose city is 'Pittsburgh'.
is_pittsburgh = df['city'].eq('Pittsburgh')
df = df.loc[is_pittsburgh]

In [43]:
# Dimensions (rows, columns) after keeping only the 'Pittsburgh' rows.
df.shape

(138950, 15)

In [44]:
# Now let's check the missing-value counts per column again on the filtered subset.
df.isnull().sum()

business_id       0
name              0
address         226
city              0
state             0
postal_code      38
latitude          0
longitude         0
stars             0
attributes      763
categories        0
user_id           0
review_stars      0
text              0
date              0
dtype: int64

### Below I used a mask to explore nulls further. I'm deciding which information I need to keep for the final dataset.

In [45]:
# Build a boolean row mask that is True wherever a row has at least one missing value,
# then keep only those rows. This makes exploring the null portion of the data easier.
# any(axis=1) reduces across columns in one vectorised call, replacing a Python-level
# loop that OR-ed each column's null flags together.
mask = df.isnull().any(axis=1)
dfnulls = df[mask]

In [46]:
# Show the rows whose attributes value is missing.
dfnulls[dfnulls['attributes'].isnull()]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,user_id,review_stars,text,date
17809,JGXkldESnG69YXlSNVAgJQ,Eggheads,1162 Brookline Blvd,Pittsburgh,PA,15226,40.391934,-80.013482,2.0,,"Restaurants, American (Traditional), Nightlife...",Pm1GpPl-sv_pqqIi6rIdLA,2.0,Last call at 12:15 with customers in the place...,2017-02-22 05:23:35
17810,JGXkldESnG69YXlSNVAgJQ,Eggheads,1162 Brookline Blvd,Pittsburgh,PA,15226,40.391934,-80.013482,2.0,,"Restaurants, American (Traditional), Nightlife...",rCWrxuRC8_pfagpchtHp6A,3.0,Jr's has very recently been rechristened as Eg...,2012-10-22 03:08:39
17811,JGXkldESnG69YXlSNVAgJQ,Eggheads,1162 Brookline Blvd,Pittsburgh,PA,15226,40.391934,-80.013482,2.0,,"Restaurants, American (Traditional), Nightlife...",9V-MChk4H7Y-OM5YqGbjYw,1.0,crack bar!!!!!! closed at 1 am on a saturday.....,2016-10-09 06:22:50
78977,pFUFVB0cPY2wL0Gr4vL3GA,Gaetano's Restaurant,1617 Banksville Rd,Pittsburgh,PA,15216,40.409814,-80.031374,2.0,,"Italian, Restaurants, American (New)",Iq_-UfNcdi7xi9oVjGfwnw,2.0,I was there for a buffer for a very small even...,2017-03-10 02:13:22
78978,pFUFVB0cPY2wL0Gr4vL3GA,Gaetano's Restaurant,1617 Banksville Rd,Pittsburgh,PA,15216,40.409814,-80.031374,2.0,,"Italian, Restaurants, American (New)",HQaColeFEM5v1kFt2AeNdQ,1.0,A group of us attended a murder mystery dinner...,2016-03-16 13:37:47
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3479732,oJ2ubid01aI4n1cdIwteYw,Tuscan Cafe,6 Smithfield St,Pittsburgh,PA,15222,40.436865,-80.000512,4.0,,"Restaurants, Cafes, Italian",jZ1ziwykb5EEla5y4D5Ozg,3.0,The first thing you'll notice at this place is...,2010-12-03 17:58:21
3479733,oJ2ubid01aI4n1cdIwteYw,Tuscan Cafe,6 Smithfield St,Pittsburgh,PA,15222,40.436865,-80.000512,4.0,,"Restaurants, Cafes, Italian",Xa2wTiBzvL4NgZUcq3oASg,5.0,This is definitely a unique little hidden gem ...,2018-04-03 15:18:38
3501952,WAagBTVAylf-LXc3PNoRZQ,Flipt,777 Casino Dr,Pittsburgh,PA,15212,40.447699,-80.022577,3.0,,"Burgers, Beer, Wine & Spirits, Food, Restauran...",Egji_XwjS4dykCjk8qoZWw,3.0,It can be difficult giving a fair review to an...,2018-10-22 15:29:03
3501953,WAagBTVAylf-LXc3PNoRZQ,Flipt,777 Casino Dr,Pittsburgh,PA,15212,40.447699,-80.022577,3.0,,"Burgers, Beer, Wine & Spirits, Food, Restauran...",6BnvEluqfazShd1tg9kqGg,4.0,We visited the casino before a Steelers game w...,2018-10-29 16:37:51


In [47]:
# Look for rows with missing review text. The null count for text in this subset is 0,
# so no rows are expected here. A row with missing text but a rating present would
# still be useful.
dfnulls[dfnulls['text'].isnull()]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,user_id,review_stars,text,date


In [48]:
# Look for rows with a missing date. The date is not necessary to make a
# recommendation, but the column will be kept for now.
dfnulls[dfnulls['date'].isnull()]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,user_id,review_stars,text,date


In [49]:
# Show the rows with a missing postal code. Latitude/longitude are still present for
# them, and the postal code is not needed for this project, so these rows will be kept.
dfnulls[dfnulls['postal_code'].isnull()]

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,attributes,categories,user_id,review_stars,text,date
263741,usCGyfoTxOYOqygtI9orUA,Leena's Food Truck,,Pittsburgh,PA,,40.440625,-79.995886,4.0,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Food, Food Trucks, Mediterranean",ZS60rrKqtI63UX_nW_gqUQ,3.0,Love that they are keeping Leenas going. Oakl...,2016-06-26 07:03:47
263742,usCGyfoTxOYOqygtI9orUA,Leena's Food Truck,,Pittsburgh,PA,,40.440625,-79.995886,4.0,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Food, Food Trucks, Mediterranean",O9iMk00m9HnGPCLl0s7YIA,5.0,Got to try them at the Pittsburgh VegFest 2016...,2016-07-31 04:04:29
263743,usCGyfoTxOYOqygtI9orUA,Leena's Food Truck,,Pittsburgh,PA,,40.440625,-79.995886,4.0,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Food, Food Trucks, Mediterranean",6Ki3bAL0wx9ymbdJqbSWMA,2.0,So this food truck is a spin off of the amazin...,2016-06-19 13:52:12
263744,usCGyfoTxOYOqygtI9orUA,Leena's Food Truck,,Pittsburgh,PA,,40.440625,-79.995886,4.0,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Food, Food Trucks, Mediterranean",nWRdZzWzsqH1Us7r1LM-Pg,5.0,The best falafel and hummus in Pittsburgh. Per...,2016-09-18 13:42:36
263745,usCGyfoTxOYOqygtI9orUA,Leena's Food Truck,,Pittsburgh,PA,,40.440625,-79.995886,4.0,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Food, Food Trucks, Mediterranean",RGFJ1gdhPi8qfXYwzv9JZQ,5.0,Great gyros. The falafel and babaganoush is ho...,2018-10-05 23:22:00
263746,usCGyfoTxOYOqygtI9orUA,Leena's Food Truck,,Pittsburgh,PA,,40.440625,-79.995886,4.0,"{'RestaurantsAttire': ""'casual'"", 'GoodForKids...","Restaurants, Food, Food Trucks, Mediterranean",gofnmwo5ptoEnQI47zCBYg,4.0,Their food truck was parked in front of soldie...,2016-11-09 18:07:56
961123,7waMQSqPcbSSAInazaQPxg,Wise County Biscuits,,Pittsburgh,PA,,40.440625,-79.995886,5.0,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Sandwiches, Food Trucks, Restaurants, Breakfas...",4m9NXICYBC5i9t4aTt-I6w,5.0,I stopped by here earlier today with my wife. ...,2018-04-30 00:53:26
961124,7waMQSqPcbSSAInazaQPxg,Wise County Biscuits,,Pittsburgh,PA,,40.440625,-79.995886,5.0,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Sandwiches, Food Trucks, Restaurants, Breakfas...",uCDg1MNiTo2-v-TxlmAQew,5.0,These guys know what they're doing. Everyone ...,2017-11-19 13:23:57
961125,7waMQSqPcbSSAInazaQPxg,Wise County Biscuits,,Pittsburgh,PA,,40.440625,-79.995886,5.0,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Sandwiches, Food Trucks, Restaurants, Breakfas...",VzjWvXlKfM_QYV-SPS25lw,5.0,"I mean, who doesn't love biscuits? But when yo...",2018-05-04 01:45:52
961126,7waMQSqPcbSSAInazaQPxg,Wise County Biscuits,,Pittsburgh,PA,,40.440625,-79.995886,5.0,"{'BusinessAcceptsCreditCards': 'True', 'Restau...","Sandwiches, Food Trucks, Restaurants, Breakfas...",s1ECXwxkoPyrtaYvsMz6Ng,5.0,Have to give all the credit on finding this pl...,2018-08-09 16:36:28


In [51]:
 # Export of the filtered subset is currently disabled (commented out).
 # NOTE(review): to_csv writes the index by default; consider passing index=False when
 # re-enabling, otherwise the file gains an extra "Unnamed: 0" column when read back.
 # df.to_csv('./yelp_dataset/Pittsburgh_restaurant_reviews.csv')