## Data Cleaning

### FSQ POI

In [711]:
# Import modules
import pandas as pd

In [712]:
# Load dataset 
fsq_poi = pd.read_csv('fsq_poi.csv')
fsq_poi.head()

Unnamed: 0,distance,name,price,rating,geocodes.main.latitude,geocodes.main.longitude,geocodes.roof.latitude,geocodes.roof.longitude,req_cat
0,127,Lollipops Gelato,1.0,8.6,40.893585,-73.843692,40.893585,-73.843692,13065
1,797,Ripe Kitchen and Bar,2.0,8.4,40.898196,-73.838821,40.898196,-73.838821,13065
2,821,Ali's Roti Shop,1.0,8.1,40.89395,-73.856803,40.89395,-73.856803,13065
3,983,Jimbo's Hamburger Palace,1.0,8.0,40.891853,-73.858478,40.891853,-73.858478,13065
4,454,Cooler Runnings Jamaican Restaurant,2.0,6.4,40.898173,-73.850254,40.898173,-73.850254,13065


In [713]:
fsq_poi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   distance                 7000 non-null   int64  
 1   name                     7000 non-null   object 
 2   price                    6255 non-null   float64
 3   rating                   4677 non-null   float64
 4   geocodes.main.latitude   7000 non-null   float64
 5   geocodes.main.longitude  7000 non-null   float64
 6   geocodes.roof.latitude   6419 non-null   float64
 7   geocodes.roof.longitude  6419 non-null   float64
 8   req_cat                  7000 non-null   int64  
dtypes: float64(6), int64(2), object(1)
memory usage: 492.3+ KB


In [714]:
# Drop irrelevant or repetitive columns
fsq_poi.drop(columns=['geocodes.roof.latitude', 'geocodes.roof.longitude'], inplace=True)
fsq_poi.head()


Unnamed: 0,distance,name,price,rating,geocodes.main.latitude,geocodes.main.longitude,req_cat
0,127,Lollipops Gelato,1.0,8.6,40.893585,-73.843692,13065
1,797,Ripe Kitchen and Bar,2.0,8.4,40.898196,-73.838821,13065
2,821,Ali's Roti Shop,1.0,8.1,40.89395,-73.856803,13065
3,983,Jimbo's Hamburger Palace,1.0,8.0,40.891853,-73.858478,13065
4,454,Cooler Runnings Jamaican Restaurant,2.0,6.4,40.898173,-73.850254,13065


In [715]:
# Rename columns for better interpretability 
fsq_poi.columns = ['Distance (m)','Name','Price','Rating','Latitude','Longitude','Category']
print(fsq_poi.shape)
fsq_poi.head()

(7000, 7)


Unnamed: 0,Distance (m),Name,Price,Rating,Latitude,Longitude,Category
0,127,Lollipops Gelato,1.0,8.6,40.893585,-73.843692,13065
1,797,Ripe Kitchen and Bar,2.0,8.4,40.898196,-73.838821,13065
2,821,Ali's Roti Shop,1.0,8.1,40.89395,-73.856803,13065
3,983,Jimbo's Hamburger Palace,1.0,8.0,40.891853,-73.858478,13065
4,454,Cooler Runnings Jamaican Restaurant,2.0,6.4,40.898173,-73.850254,13065


In [716]:
# Check for unique values

def unique_values(df):
    columns = df.columns
    for col in columns:
        print('Column Name:', col)
        print('# of Unique Values:', (len(df[col].unique())))
        print('Unique Values:', df[col].unique())
        print('==============================================================')
    
unique_values(fsq_poi)

Column Name: Distance (m)
# of Unique Values: 1004
Unique Values: [ 127  797  821 ... 1001 1033 1230]
Column Name: Name
# of Unique Values: 4535
Unique Values: ['Lollipops Gelato' 'Ripe Kitchen and Bar' "Ali's Roti Shop" ...
 'Flavors Corner' 'Ooi Sushi and Bar' 'Kum Fung Chinese Restaurant']
Column Name: Price
# of Unique Values: 5
Unique Values: [ 1.  2. nan  3.  4.]
Column Name: Rating
# of Unique Values: 52
Unique Values: [8.6 8.4 8.1 8.  6.4 6.7 6.6 5.8 nan 5.7 8.2 7.9 7.6 7.3 6.8 5.9 6.3 6.1
 8.3 7.5 7.7 7.2 7.  6.9 6.  6.5 5.6 8.9 8.7 7.8 7.1 6.2 5.3 8.5 9.2 9.
 5.4 7.4 8.8 5.2 5.5 5.  4.7 9.1 9.3 9.4 9.5 9.6 5.1 4.8 4.9 4.5]
Column Name: Latitude
# of Unique Values: 4982
Unique Values: [40.893585 40.898196 40.89395  ... 40.755323 40.755997 40.621817]
Column Name: Longitude
# of Unique Values: 4989
Unique Values: [-73.843692 -73.838821 -73.856803 ... -73.945402 -73.942414 -74.07237 ]
Column Name: Category
# of Unique Values: 1
Unique Values: [13065]


In [717]:
# Check for null values
fsq_poi.isnull().sum()

Distance (m)       0
Name               0
Price            745
Rating          2323
Latitude           0
Longitude          0
Category           0
dtype: int64

There are null values in price and rating. However it has low impact, as it simply signifies that some restuarants have no price and rating information, which does not effect our objective.

In [718]:
# Check for duplicate rows
fsq_poi[fsq_poi.duplicated()]

Unnamed: 0,Distance (m),Name,Price,Rating,Latitude,Longitude,Category
5637,500,El Pollo II Restaurant,1.0,7.2,40.59068,-74.191997,13065


In [719]:
# 1 duplicate row found, remove from DataFrame
fsq_poi = fsq_poi.drop([5637])

# Confirm removal
fsq_poi[fsq_poi.duplicated()]

Unnamed: 0,Distance (m),Name,Price,Rating,Latitude,Longitude,Category


In [720]:
# See variance in numeric feautures
fsq_poi.describe()

Unnamed: 0,Distance (m),Price,Rating,Latitude,Longitude,Category
count,6999.0,6254.0,4676.0,6999.0,6999.0,6999.0
mean,578.36248,1.597058,7.808618,40.711354,-73.937832,13065.0
std,261.53718,0.705532,1.001289,0.093952,0.112958,0.0
min,3.0,1.0,4.5,40.507106,-74.251372,13065.0
25%,375.5,1.0,7.1,40.631761,-73.992029,13065.0
50%,585.0,1.0,8.0,40.71188,-73.928118,13065.0
75%,790.0,2.0,8.7,40.773461,-73.857604,13065.0
max,4095.0,4.0,9.6,40.912863,-73.698338,13065.0


In [721]:
# Preview cleaned dataset
print(fsq_poi.shape)
fsq_poi.head()


(6999, 7)


Unnamed: 0,Distance (m),Name,Price,Rating,Latitude,Longitude,Category
0,127,Lollipops Gelato,1.0,8.6,40.893585,-73.843692,13065
1,797,Ripe Kitchen and Bar,2.0,8.4,40.898196,-73.838821,13065
2,821,Ali's Roti Shop,1.0,8.1,40.89395,-73.856803,13065
3,983,Jimbo's Hamburger Palace,1.0,8.0,40.891853,-73.858478,13065
4,454,Cooler Runnings Jamaican Restaurant,2.0,6.4,40.898173,-73.850254,13065


In [722]:
# Export cleaned dataset to csv
fsq_poi.to_csv('fsq_poi_clean.csv')

### Yelp POI

In [723]:
# Load dataset
yelp_poi = pd.read_csv('yelp_poi.csv')
yelp_poi.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,transactions,...,coordinates.longitude,location.address1,location.address2,location.address3,location.city,location.zip_code,location.country,location.state,location.display_address,req_cat
0,fLJw_HGD01RTQ_WqlJf6Ew,ripe-kitchen-and-bar-mount-vernon,Ripe Kitchen & Bar,https://s3-media3.fl.yelpcdn.com/bphoto/NCpeGV...,False,https://www.yelp.com/biz/ripe-kitchen-and-bar-...,287,"[{'alias': 'caribbean', 'title': 'Caribbean'}]",4.0,['delivery'],...,-73.838821,151 W Sandford Blvd,,,Mount Vernon,10550,US,NY,"['151 W Sandford Blvd', 'Mount Vernon, NY 10550']",restaurants
1,iJci5KN7H39D3JXHiNBDKw,h-i-m-ital-health-food-market-bronx,H.I.M Ital Health Food Market,https://s3-media4.fl.yelpcdn.com/bphoto/Tze_Nm...,False,https://www.yelp.com/biz/h-i-m-ital-health-foo...,47,"[{'alias': 'healthmarkets', 'title': 'Health M...",4.5,['delivery'],...,-73.854665,4374B White Plains Rd,,,Bronx,10466,US,NY,"['4374B White Plains Rd', 'Bronx, NY 10466']",restaurants
2,IcEafi60F4pyEHaM4lR4Wg,alis-roti-shop-bronx,Ali's Roti Shop,https://s3-media2.fl.yelpcdn.com/bphoto/6SQMma...,False,https://www.yelp.com/biz/alis-roti-shop-bronx?...,104,"[{'alias': 'trinidadian', 'title': 'Trinidadia...",4.0,[],...,-73.85684,4220 White Plains Rd,,,Bronx,10466,US,NY,"['4220 White Plains Rd', 'Bronx, NY 10466']",restaurants
3,h5dm87qaqHeTBGcqTi9hWg,paulas-soul-cafe-bronx,Paula's Soul Cafe,https://s3-media4.fl.yelpcdn.com/bphoto/yfGmup...,False,https://www.yelp.com/biz/paulas-soul-cafe-bron...,202,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",3.0,"['delivery', 'pickup']",...,-73.85568,746 E 233rd St,,,Bronx,10466,US,NY,"['746 E 233rd St', 'Bronx, NY 10466']",restaurants
4,_XMhEsFSFUQsuKae7zzFZg,jerk-house-bronx,Jerk House,https://s3-media4.fl.yelpcdn.com/bphoto/Jx4v64...,False,https://www.yelp.com/biz/jerk-house-bronx?adju...,62,"[{'alias': 'caribbean', 'title': 'Caribbean'}]",3.5,"['delivery', 'pickup']",...,-73.85648,4246 White Plains,,,Bronx,10466,US,NY,"['4246 White Plains', 'Bronx, NY 10466']",restaurants


In [724]:
yelp_poi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7115 entries, 0 to 7114
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        7115 non-null   object 
 1   alias                     7115 non-null   object 
 2   name                      7115 non-null   object 
 3   image_url                 7058 non-null   object 
 4   is_closed                 7115 non-null   bool   
 5   url                       7115 non-null   object 
 6   review_count              7115 non-null   int64  
 7   categories                7115 non-null   object 
 8   rating                    7115 non-null   float64
 9   transactions              7115 non-null   object 
 10  price                     6116 non-null   object 
 11  phone                     6924 non-null   float64
 12  display_phone             6924 non-null   object 
 13  distance                  7115 non-null   float64
 14  coordina

In [725]:
# yelp_poi.iloc[:,10:]
yelp_poi.columns

Index(['id', 'alias', 'name', 'image_url', 'is_closed', 'url', 'review_count',
       'categories', 'rating', 'transactions', 'price', 'phone',
       'display_phone', 'distance', 'coordinates.latitude',
       'coordinates.longitude', 'location.address1', 'location.address2',
       'location.address3', 'location.city', 'location.zip_code',
       'location.country', 'location.state', 'location.display_address',
       'req_cat'],
      dtype='object')

In [726]:
# Drop irrelevant or repetitive columns
yelp_poi.drop(columns=['id','alias','image_url','is_closed','url','categories','transactions','phone',
                        'display_phone','location.address2','location.address3',
                        'location.city','location.zip_code','location.country',
                        'location.state','location.display_address'],
                         inplace=True)
yelp_poi.head()

Unnamed: 0,name,review_count,rating,price,distance,coordinates.latitude,coordinates.longitude,location.address1,req_cat
0,Ripe Kitchen & Bar,287,4.0,$$,805.077589,40.898214,-73.838821,151 W Sandford Blvd,restaurants
1,H.I.M Ital Health Food Market,47,4.5,$,715.467929,40.897633,-73.854665,4374B White Plains Rd,restaurants
2,Ali's Roti Shop,104,4.0,$,809.642747,40.894,-73.85684,4220 White Plains Rd,restaurants
3,Paula's Soul Cafe,202,3.0,$$,746.891106,40.89269,-73.85568,746 E 233rd St,restaurants
4,Jerk House,62,3.5,$$,771.525854,40.89464,-73.85648,4246 White Plains,restaurants


In [727]:
# Rename columns for better interpretability 
yelp_poi.columns = ['Name','Reviews','Rating','Price',
                    'Distance (m)','Latitude','Longitude','Address','Category']
                    
print(yelp_poi.shape)
yelp_poi.head()

(7115, 9)


Unnamed: 0,Name,Reviews,Rating,Price,Distance (m),Latitude,Longitude,Address,Category
0,Ripe Kitchen & Bar,287,4.0,$$,805.077589,40.898214,-73.838821,151 W Sandford Blvd,restaurants
1,H.I.M Ital Health Food Market,47,4.5,$,715.467929,40.897633,-73.854665,4374B White Plains Rd,restaurants
2,Ali's Roti Shop,104,4.0,$,809.642747,40.894,-73.85684,4220 White Plains Rd,restaurants
3,Paula's Soul Cafe,202,3.0,$$,746.891106,40.89269,-73.85568,746 E 233rd St,restaurants
4,Jerk House,62,3.5,$$,771.525854,40.89464,-73.85648,4246 White Plains,restaurants


In [728]:
# Check for unique values
unique_values(yelp_poi)

Column Name: Name
# of Unique Values: 4163
Unique Values: ['Ripe Kitchen & Bar' 'H.I.M Ital Health Food Market' "Ali's Roti Shop"
 ... 'The Local Bar and Cafe' 'Kum Fung Chinese Restaurant' 'Amendment 18']
Column Name: Reviews
# of Unique Values: 1072
Unique Values: [ 287   47  104 ...  784  397 1028]
Column Name: Rating
# of Unique Values: 9
Unique Values: [4.  4.5 3.  3.5 5.  2.5 2.  1.5 1. ]
Column Name: Price
# of Unique Values: 5
Unique Values: ['$$' '$' '$$$' nan '$$$$']
Column Name: Distance (m)
# of Unique Values: 7048
Unique Values: [ 805.0775886   715.46792863  809.64274698 ...  825.92920937 1049.84712452
 1076.21508396]
Column Name: Latitude
# of Unique Values: 4402
Unique Values: [40.898214   40.897633   40.894      ... 40.7495622  40.621825
 40.62530162]
Column Name: Longitude
# of Unique Values: 4400
Unique Values: [-73.838821   -73.854665   -73.85684    ... -73.9477008  -74.07234
 -74.07454516]
Column Name: Address
# of Unique Values: 4266
Unique Values: ['151 W Sandford

In [729]:
# Check for nulls
yelp_poi.isnull().sum()

Name              0
Reviews           0
Rating            0
Price           999
Distance (m)      0
Latitude          0
Longitude         0
Address         166
Category          0
dtype: int64

There are null values in price and address. However it has low impact, as it simply signifies that some restuarants have no price and address information, which does not effect our objective.

Additionally, we have the coordinates which will be the main features used for city mapping.

In [730]:
# Check for duplicates
yelp_poi[yelp_poi.duplicated()]

# No duplicates

Unnamed: 0,Name,Reviews,Rating,Price,Distance (m),Latitude,Longitude,Address,Category


In [731]:
# See variance in numeric feautures
yelp_poi.describe()

Unnamed: 0,Reviews,Rating,Distance (m),Latitude,Longitude
count,7115.0,7115.0,7115.0,7115.0,7115.0
mean,396.188897,3.916936,1571.646996,40.713105,-73.942424
std,863.333845,0.630598,4323.548896,0.093239,0.113619
min,1.0,1.0,16.13817,40.205016,-74.448224
25%,39.0,3.5,420.978899,40.634212,-73.99444
50%,127.0,4.0,681.84268,40.716641,-73.93961
75%,368.0,4.5,936.679688,40.771113,-73.865726
max,14361.0,5.0,77147.675762,40.91594,-73.100648


In [732]:
# Preview cleaned dataset
print(yelp_poi.shape)
yelp_poi.head()

(7115, 9)


Unnamed: 0,Name,Reviews,Rating,Price,Distance (m),Latitude,Longitude,Address,Category
0,Ripe Kitchen & Bar,287,4.0,$$,805.077589,40.898214,-73.838821,151 W Sandford Blvd,restaurants
1,H.I.M Ital Health Food Market,47,4.5,$,715.467929,40.897633,-73.854665,4374B White Plains Rd,restaurants
2,Ali's Roti Shop,104,4.0,$,809.642747,40.894,-73.85684,4220 White Plains Rd,restaurants
3,Paula's Soul Cafe,202,3.0,$$,746.891106,40.89269,-73.85568,746 E 233rd St,restaurants
4,Jerk House,62,3.5,$$,771.525854,40.89464,-73.85648,4246 White Plains,restaurants


In [733]:
# Export cleaned dataset to csv
yelp_poi.to_csv('yelp_poi_cleaned.csv')

### Google Places POI

In [734]:
gp_poi = pd.read_csv('google_places_poi.csv')
gp_poi.head()

Unnamed: 0,business_status,icon,icon_background_color,icon_mask_base_uri,name,photos,place_id,price_level,rating,reference,...,geometry.location.lng,geometry.viewport.northeast.lat,geometry.viewport.northeast.lng,geometry.viewport.southwest.lat,geometry.viewport.southwest.lng,opening_hours.open_now,plus_code.compound_code,plus_code.global_code,permanently_closed,req_cat
0,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Ripe Kitchen & Bar,"[{'height': 2471, 'html_attributions': ['<a hr...",ChIJ9aOdYiyNwokRS49czrNTQo4,2.0,4.3,ChIJ9aOdYiyNwokRS49czrNTQo4,...,-73.838855,40.899498,-73.837452,40.8968,-73.84015,True,"V5X6+7F Mount Vernon, NY, USA",87G8V5X6+7F,,restaurant
1,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,New China Garden,"[{'height': 4032, 'html_attributions': ['<a hr...",ChIJcVDAYNfywokRPUaKrm0UBpg,1.0,3.9,ChIJcVDAYNfywokRPUaKrm0UBpg,...,-73.853364,40.899306,-73.8521,40.896608,-73.854798,True,"V4XW+5M The Bronx, NY, USA",87G8V4XW+5M,,restaurant
2,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Dunkin',"[{'height': 3000, 'html_attributions': ['<a hr...",ChIJ--58aynzwokRqSvaRDKwX1M,1.0,3.8,ChIJ--58aynzwokRqSvaRDKwX1M,...,-73.849089,40.891936,-73.847664,40.889239,-73.850362,True,"V5R2+59 New York, NY, USA",87G8V5R2+59,,restaurant
3,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Subway,"[{'height': 766, 'html_attributions': ['<a hre...",ChIJAy-ibCnzwokRdmkKGd1Yzhw,1.0,3.6,ChIJAy-ibCnzwokRdmkKGd1Yzhw,...,-73.849152,40.891828,-73.847659,40.88913,-73.850357,True,"V5R2+58 New York, NY, USA",87G8V5R2+58,,restaurant
4,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Popeyes Louisiana Kitchen,"[{'height': 1192, 'html_attributions': ['<a hr...",ChIJXVacFSvzwokRa3VpQDuwRDE,1.0,3.8,ChIJXVacFSvzwokRa3VpQDuwRDE,...,-73.843383,40.890762,-73.842012,40.888064,-73.84471,True,"V5Q4+QJ New York, NY, USA",87G8V5Q4+QJ,,restaurant


In [735]:
gp_poi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5619 entries, 0 to 5618
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   business_status                  5619 non-null   object 
 1   icon                             5619 non-null   object 
 2   icon_background_color            5619 non-null   object 
 3   icon_mask_base_uri               5619 non-null   object 
 4   name                             5619 non-null   object 
 5   photos                           5442 non-null   object 
 6   place_id                         5619 non-null   object 
 7   price_level                      4786 non-null   float64
 8   rating                           5522 non-null   float64
 9   reference                        5619 non-null   object 
 10  scope                            5619 non-null   object 
 11  types                            5619 non-null   object 
 12  user_ratings_total  

In [736]:
# gp_poi.iloc[0:5,10:]
gp_poi.columns

Index(['business_status', 'icon', 'icon_background_color',
       'icon_mask_base_uri', 'name', 'photos', 'place_id', 'price_level',
       'rating', 'reference', 'scope', 'types', 'user_ratings_total',
       'vicinity', 'geometry.location.lat', 'geometry.location.lng',
       'geometry.viewport.northeast.lat', 'geometry.viewport.northeast.lng',
       'geometry.viewport.southwest.lat', 'geometry.viewport.southwest.lng',
       'opening_hours.open_now', 'plus_code.compound_code',
       'plus_code.global_code', 'permanently_closed', 'req_cat'],
      dtype='object')

In [737]:
# Drop irrelevant or repetitive columns
gp_poi.drop(columns=['business_status','icon','icon_background_color','icon_mask_base_uri',
                    'photos','place_id','reference','scope','types','geometry.viewport.northeast.lat',
                    'geometry.viewport.northeast.lng','geometry.viewport.southwest.lat',
                    'geometry.viewport.southwest.lng','opening_hours.open_now',
                    'plus_code.compound_code','plus_code.global_code','permanently_closed'], 
                    inplace=True)

gp_poi.head()

Unnamed: 0,name,price_level,rating,user_ratings_total,vicinity,geometry.location.lat,geometry.location.lng,req_cat
0,Ripe Kitchen & Bar,2.0,4.3,714.0,"151 West Sandford Boulevard, Mount Vernon",40.898209,-73.838855,restaurant
1,New China Garden,1.0,3.9,85.0,"724 Nereid Avenue, The Bronx",40.897919,-73.853364,restaurant
2,Dunkin',1.0,3.8,280.0,"980 East 233rd Street, The Bronx",40.890459,-73.849089,restaurant
3,Subway,1.0,3.6,121.0,"980 East 233rd Street, The Bronx",40.890468,-73.849152,restaurant
4,Popeyes Louisiana Kitchen,1.0,3.8,467.0,"1201 East 233rd Street, The Bronx",40.889492,-73.843383,restaurant


In [738]:
# Rename columns for better interpretability
gp_poi.columns = ['Name','Price','Rating','Reviews','Address','Latitude','Longitude','Category']
print(gp_poi.shape)
gp_poi.head()

(5619, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Ripe Kitchen & Bar,2.0,4.3,714.0,"151 West Sandford Boulevard, Mount Vernon",40.898209,-73.838855,restaurant
1,New China Garden,1.0,3.9,85.0,"724 Nereid Avenue, The Bronx",40.897919,-73.853364,restaurant
2,Dunkin',1.0,3.8,280.0,"980 East 233rd Street, The Bronx",40.890459,-73.849089,restaurant
3,Subway,1.0,3.6,121.0,"980 East 233rd Street, The Bronx",40.890468,-73.849152,restaurant
4,Popeyes Louisiana Kitchen,1.0,3.8,467.0,"1201 East 233rd Street, The Bronx",40.889492,-73.843383,restaurant


In [739]:
# Check for unique values
unique_values(gp_poi)

Column Name: Name
# of Unique Values: 3078
Unique Values: ['Ripe Kitchen & Bar' 'New China Garden' "Dunkin'" ... 'Exquisito'
 'Greek Panorama' 'Kum Fung']
Column Name: Price
# of Unique Values: 5
Unique Values: [ 2.  1. nan  3.  4.]
Column Name: Rating
# of Unique Values: 35
Unique Values: [4.3 3.9 3.8 3.6 3.3 4.2 3.7 4.1 2.2 1.  4.  4.5 5.  4.6 2.9 4.4 4.7 3.
 3.4 2.4 3.2 2.5 3.5 2.7 3.1 2.8 4.9 nan 2.  2.6 2.1 4.8 2.3 1.9 1.8]
Column Name: Reviews
# of Unique Values: 1434
Unique Values: [ 714.   85.  280. ... 1007. 1291. 1732.]
Column Name: Address
# of Unique Values: 3883
Unique Values: ['151 West Sandford Boulevard, Mount Vernon'
 '724 Nereid Avenue, The Bronx' '980 East 233rd Street, The Bronx' ...
 '24 -11 41st Avenue, Long Island City' '455 Main Street, New York'
 '838 Bay Street, Staten Island']
Column Name: Latitude
# of Unique Values: 4000
Unique Values: [40.8982087 40.897919  40.8904588 ... 40.7535229 40.759305  40.6218795]
Column Name: Longitude
# of Unique Values: 3996
Uni

In [740]:
# Check for nulls
gp_poi.isnull().sum()

Name           0
Price        833
Rating        97
Reviews       97
Address        0
Latitude       0
Longitude      0
Category       0
dtype: int64

There are null values in price, rating, and reviews. However it has low impact, as it simply signifies that some restuarants have no price, rating, or review information, which does not effect our objective.

In [741]:
# Check for duplicates
gp_poi = gp_poi[gp_poi.duplicated()]
len(gp_poi)

1578

In [742]:
# Duplicate rows found, remove from DataFrame
gp_poi = gp_poi.drop_duplicates()

# Confirm removal
gp_poi[gp_poi.duplicated()]

Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category


In [743]:
# See variance in numeric feautures
gp_poi.describe()

Unnamed: 0,Price,Rating,Reviews,Latitude,Longitude
count,1116.0,1246.0,1246.0,1260.0,1260.0
mean,1.660394,4.083226,740.94061,40.713778,-73.951417
std,0.671181,0.439148,1260.821341,0.094919,0.111364
min,1.0,1.0,1.0,40.522106,-74.238666
25%,1.0,3.9,148.0,40.626667,-74.000674
50%,2.0,4.2,360.5,40.713171,-73.944073
75%,2.0,4.4,847.0,40.786754,-73.869725
max,4.0,5.0,19357.0,40.894788,-73.707534


In [744]:
# Preview cleaned dataset
print(gp_poi.shape)
gp_poi.head()

(1260, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
54,Prime 33 Banquet hall #1,2.0,4.3,150.0,"3323 Merritt Avenue, The Bronx",40.881788,-73.826755,restaurant
80,An Beal Bocht Cafe,2.0,4.6,534.0,"445 West 238th Street, The Bronx",40.887375,-73.904953,restaurant
82,Goodfellas Riverdale 🍕,2.0,3.8,171.0,"3661 Waldo Avenue, The Bronx",40.887026,-73.904387,restaurant
84,Corner Cafe & Bakery,1.0,4.4,164.0,"3718 Riverdale Avenue, The Bronx",40.887266,-73.90692,restaurant
85,Salvatore's,2.0,4.4,656.0,"3738 Riverdale Avenue, The Bronx",40.887631,-73.906955,restaurant


A pizza slice!

The names can be kept as is, even with an emoji, as it will not affect our modelling, as the name column will not be used.

In [745]:
# Export cleaned dataset to csv
gp_poi.to_csv('google_places_poi_clean.csv')