## Data Cleaning

### FSQ POI

In [1]:
# Import modules
import pandas as pd

In [2]:
# Load dataset 
fsq_poi = pd.read_csv('fsq_poi.csv')
fsq_poi.head()

Unnamed: 0,distance,name,price,rating,geocodes.main.latitude,geocodes.main.longitude,geocodes.roof.latitude,geocodes.roof.longitude,req_cat
0,127,Lollipops Gelato,1.0,8.6,40.893585,-73.843692,40.893585,-73.843692,13065
1,797,Ripe Kitchen and Bar,2.0,8.4,40.898196,-73.838821,40.898196,-73.838821,13065
2,821,Ali's Roti Shop,1.0,8.1,40.89395,-73.856803,40.89395,-73.856803,13065
3,983,Jimbo's Hamburger Palace,1.0,8.0,40.891853,-73.858478,40.891853,-73.858478,13065
4,454,Cooler Runnings Jamaican Restaurant,2.0,6.4,40.898173,-73.850254,40.898173,-73.850254,13065


In [3]:
fsq_poi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7000 entries, 0 to 6999
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   distance                 7000 non-null   int64  
 1   name                     7000 non-null   object 
 2   price                    6255 non-null   float64
 3   rating                   4677 non-null   float64
 4   geocodes.main.latitude   7000 non-null   float64
 5   geocodes.main.longitude  7000 non-null   float64
 6   geocodes.roof.latitude   6419 non-null   float64
 7   geocodes.roof.longitude  6419 non-null   float64
 8   req_cat                  7000 non-null   int64  
dtypes: float64(6), int64(2), object(1)
memory usage: 492.3+ KB


In [4]:
# Drop irrelevant or repetitive columns
fsq_poi.drop(columns=['geocodes.roof.latitude', 'geocodes.roof.longitude','geocodes.main.latitude','geocodes.main.longitude'], inplace=True)
fsq_poi.head()


Unnamed: 0,distance,name,price,rating,req_cat
0,127,Lollipops Gelato,1.0,8.6,13065
1,797,Ripe Kitchen and Bar,2.0,8.4,13065
2,821,Ali's Roti Shop,1.0,8.1,13065
3,983,Jimbo's Hamburger Palace,1.0,8.0,13065
4,454,Cooler Runnings Jamaican Restaurant,2.0,6.4,13065


In [5]:
# removing brackets and quotations from neighbourhood info
fsq_poi['location.neighborhood'] = fsq_poi['location.neighborhood'].str.strip('[\'')
fsq_poi['location.neighborhood'] = fsq_poi['location.neighborhood'].str.strip(']\'')

KeyError: 'location.neighborhood'

In [None]:
# Rename columns for better interpretability 
fsq_poi.columns = ['Distance (m)','Name','Price','Rating','Borough','Neighborhood','Zipcode','Latitude','Longitude','Category']
print(fsq_poi.shape)
fsq_poi.head()

(7001, 10)


Unnamed: 0,Distance (m),Name,Price,Rating,Borough,Neighborhood,Zipcode,Latitude,Longitude,Category
0,127.0,Lollipops Gelato,1.0,8.6,Bronx,Edenwald,10466.0,40.894705,-73.847201,restaurant
1,797.0,Ripe Kitchen and Bar,2.0,8.4,Mount Vernon,Wakefield,10550.0,40.894705,-73.847201,restaurant
2,821.0,Ali's Roti Shop,1.0,8.1,Bronx,Wakefield,10466.0,40.894705,-73.847201,restaurant
3,983.0,Jimbo's Hamburger Palace,1.0,8.0,Bronx,Williambridge,10466.0,40.894705,-73.847201,restaurant
4,454.0,Cooler Runnings Jamaican Restaurant,2.0,6.4,Bronx,Wakefield,10466.0,40.894705,-73.847201,restaurant


In [None]:
# Check for unique values

def unique_values(df):
    columns = df.columns
    for col in columns:
        print('Column Name:', col)
        print('# of Unique Values:', (len(df[col].unique())))
        print('Unique Values:', df[col].unique())
        print('==============================================================')
    
unique_values(fsq_poi)

Column Name: Distance (m)
# of Unique Values: 1003
Unique Values: [ 127.  797.  821. ... 1001. 1033. 1230.]
Column Name: Name
# of Unique Values: 4535
Unique Values: ['Lollipops Gelato' 'Ripe Kitchen and Bar' "Ali's Roti Shop" ...
 'Wholesome Factory' 'Flavors Corner' 'Ooi Sushi and Bar']
Column Name: Price
# of Unique Values: 5
Unique Values: [ 1.  2. nan  3.  4.]
Column Name: Rating
# of Unique Values: 53
Unique Values: [8.6 8.4 8.1 8.  6.4 6.7 6.6 5.8 nan 5.7 7.9 7.6 7.3 6.8 6.3 5.9 6.1 8.3
 8.2 7.5 7.7 7.  7.2 6.9 6.  6.5 5.6 8.8 8.9 8.7 7.8 7.1 6.2 5.3 8.5 9.2
 9.  5.4 7.4 4.6 5.2 9.1 5.5 5.  4.7 9.3 9.4 9.5 9.6 4.8 5.1 4.9 4.5]
Column Name: Borough
# of Unique Values: 77
Unique Values: ['Bronx' 'Mount Vernon' 'New York' 'Pelham' 'Pelham Manor' 'Yonkers'
 'City Island' 'Long Island City' 'Queens' 'Parkchester' 'Riverdale'
 'Brooklyn' 'Sunset Park' 'Brighton Beach' 'Gowanus' 'Coney Island'
 'Bedford-Stuyvesant' 'Ozone Park' 'Woodhaven' 'Harlem' 'Roosevelt Island'
 'Astoria' 'Woodsi

In [None]:
# Check for null values
fsq_poi.isnull().sum()

Distance (m)       2
Name               2
Price            732
Rating          2316
Borough            2
Neighborhood     780
Zipcode            5
Latitude           0
Longitude          0
Category           0
dtype: int64

There are null values in price and rating. However it has low impact, as it simply signifies that some restuarants have no price and rating information, which does not effect our objective.

Some neighbourhoods are empty, but we can cross-reference with our other data when we join our dataframes later.

In [None]:
# Check for duplicate rows
fsq_poi[fsq_poi.duplicated()]

Unnamed: 0,Distance (m),Name,Price,Rating,Borough,Neighborhood,Zipcode,Latitude,Longitude,Category


In [None]:
# no duplicates found

Unnamed: 0,Distance (m),Name,Price,Rating,Latitude,Longitude,Category


In [None]:
# See variance in numeric feautures
fsq_poi.describe()

Unnamed: 0,Distance (m),Price,Rating,Zipcode,Latitude,Longitude
count,6999.0,6269.0,4685.0,6996.0,7001.0,7001.0
mean,578.058723,1.594353,7.803308,10793.412379,40.71117,-73.937763
std,258.267312,0.704104,1.004244,537.147679,0.094263,0.112911
min,3.0,1.0,4.5,7202.0,40.505334,-74.246569
25%,376.5,1.0,7.1,10308.0,40.632546,-73.994279
50%,585.0,1.0,8.0,11004.0,40.711243,-73.930102
75%,790.0,2.0,8.7,11235.0,40.773529,-73.857446
max,4095.0,4.0,9.6,11697.0,40.908543,-73.708847


In [None]:
# Preview cleaned dataset
print(fsq_poi.shape)
fsq_poi.head()


(7001, 10)


Unnamed: 0,Distance (m),Name,Price,Rating,Borough,Neighborhood,Zipcode,Latitude,Longitude,Category
0,127.0,Lollipops Gelato,1.0,8.6,Bronx,Edenwald,10466.0,40.894705,-73.847201,restaurant
1,797.0,Ripe Kitchen and Bar,2.0,8.4,Mount Vernon,Wakefield,10550.0,40.894705,-73.847201,restaurant
2,821.0,Ali's Roti Shop,1.0,8.1,Bronx,Wakefield,10466.0,40.894705,-73.847201,restaurant
3,983.0,Jimbo's Hamburger Palace,1.0,8.0,Bronx,Williambridge,10466.0,40.894705,-73.847201,restaurant
4,454.0,Cooler Runnings Jamaican Restaurant,2.0,6.4,Bronx,Wakefield,10466.0,40.894705,-73.847201,restaurant


In [None]:
# Export cleaned dataset to csv
fsq_poi.to_csv('fsq_poi_clean.csv')

### Yelp POI

In [None]:
# Load dataset
yelp_poi = pd.read_csv('yelp_poi.csv')
yelp_poi.head()

Unnamed: 0,id,alias,name,image_url,is_closed,url,review_count,categories,rating,transactions,...,coordinates.longitude,location.address1,location.address2,location.address3,location.city,location.zip_code,location.country,location.state,location.display_address,req_cat
0,fLJw_HGD01RTQ_WqlJf6Ew,ripe-kitchen-and-bar-mount-vernon,Ripe Kitchen & Bar,https://s3-media3.fl.yelpcdn.com/bphoto/NCpeGV...,False,https://www.yelp.com/biz/ripe-kitchen-and-bar-...,287,"[{'alias': 'caribbean', 'title': 'Caribbean'}]",4.0,['delivery'],...,-73.838821,151 W Sandford Blvd,,,Mount Vernon,10550,US,NY,"['151 W Sandford Blvd', 'Mount Vernon, NY 10550']",restaurants
1,iJci5KN7H39D3JXHiNBDKw,h-i-m-ital-health-food-market-bronx,H.I.M Ital Health Food Market,https://s3-media4.fl.yelpcdn.com/bphoto/Tze_Nm...,False,https://www.yelp.com/biz/h-i-m-ital-health-foo...,47,"[{'alias': 'healthmarkets', 'title': 'Health M...",4.5,['delivery'],...,-73.854665,4374B White Plains Rd,,,Bronx,10466,US,NY,"['4374B White Plains Rd', 'Bronx, NY 10466']",restaurants
2,IcEafi60F4pyEHaM4lR4Wg,alis-roti-shop-bronx,Ali's Roti Shop,https://s3-media2.fl.yelpcdn.com/bphoto/6SQMma...,False,https://www.yelp.com/biz/alis-roti-shop-bronx?...,104,"[{'alias': 'trinidadian', 'title': 'Trinidadia...",4.0,[],...,-73.85684,4220 White Plains Rd,,,Bronx,10466,US,NY,"['4220 White Plains Rd', 'Bronx, NY 10466']",restaurants
3,h5dm87qaqHeTBGcqTi9hWg,paulas-soul-cafe-bronx,Paula's Soul Cafe,https://s3-media4.fl.yelpcdn.com/bphoto/yfGmup...,False,https://www.yelp.com/biz/paulas-soul-cafe-bron...,202,"[{'alias': 'seafood', 'title': 'Seafood'}, {'a...",3.0,"['delivery', 'pickup']",...,-73.85568,746 E 233rd St,,,Bronx,10466,US,NY,"['746 E 233rd St', 'Bronx, NY 10466']",restaurants
4,_XMhEsFSFUQsuKae7zzFZg,jerk-house-bronx,Jerk House,https://s3-media4.fl.yelpcdn.com/bphoto/Jx4v64...,False,https://www.yelp.com/biz/jerk-house-bronx?adju...,62,"[{'alias': 'caribbean', 'title': 'Caribbean'}]",3.5,"['delivery', 'pickup']",...,-73.85648,4246 White Plains,,,Bronx,10466,US,NY,"['4246 White Plains', 'Bronx, NY 10466']",restaurants


In [None]:
yelp_poi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7115 entries, 0 to 7114
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   id                        7115 non-null   object 
 1   alias                     7115 non-null   object 
 2   name                      7115 non-null   object 
 3   image_url                 7058 non-null   object 
 4   is_closed                 7115 non-null   bool   
 5   url                       7115 non-null   object 
 6   review_count              7115 non-null   int64  
 7   categories                7115 non-null   object 
 8   rating                    7115 non-null   float64
 9   transactions              7115 non-null   object 
 10  price                     6116 non-null   object 
 11  phone                     6924 non-null   float64
 12  display_phone             6924 non-null   object 
 13  distance                  7115 non-null   float64
 14  coordina

In [None]:
# yelp_poi.iloc[:,10:]
yelp_poi.columns

Index(['id', 'alias', 'name', 'image_url', 'is_closed', 'url', 'review_count',
       'categories', 'rating', 'transactions', 'price', 'phone',
       'display_phone', 'distance', 'coordinates.latitude',
       'coordinates.longitude', 'location.address1', 'location.address2',
       'location.address3', 'location.city', 'location.zip_code',
       'location.country', 'location.state', 'location.display_address',
       'req_cat'],
      dtype='object')

In [None]:
# Drop irrelevant or repetitive columns
yelp_poi.drop(columns=['id','alias','image_url','is_closed','url','categories','transactions','phone',
                        'display_phone','location.address2','location.address3',
                        'location.country','location.state','location.display_address'],
                         inplace=True)
yelp_poi.head()

Unnamed: 0,name,review_count,rating,price,distance,coordinates.latitude,coordinates.longitude,location.address1,location.city,location.zip_code,req_cat
0,Ripe Kitchen & Bar,287,4.0,$$,805.077589,40.898214,-73.838821,151 W Sandford Blvd,Mount Vernon,10550,restaurants
1,H.I.M Ital Health Food Market,47,4.5,$,715.467929,40.897633,-73.854665,4374B White Plains Rd,Bronx,10466,restaurants
2,Ali's Roti Shop,104,4.0,$,809.642747,40.894,-73.85684,4220 White Plains Rd,Bronx,10466,restaurants
3,Paula's Soul Cafe,202,3.0,$$,746.891106,40.89269,-73.85568,746 E 233rd St,Bronx,10466,restaurants
4,Jerk House,62,3.5,$$,771.525854,40.89464,-73.85648,4246 White Plains,Bronx,10466,restaurants


In [None]:
# Rename columns for better interpretability 
yelp_poi.columns = ['Name','Reviews','Rating','Price',
                    'Distance (m)','Latitude','Longitude','Address','Borough','Zipcode','Category']
yelp_poi['Category'] = 'restaurant'
                    
print(yelp_poi.shape)
yelp_poi.head()

(7115, 11)


Unnamed: 0,Name,Reviews,Rating,Price,Distance (m),Latitude,Longitude,Address,Borough,Zipcode,Category
0,Ripe Kitchen & Bar,287,4.0,$$,805.077589,40.898214,-73.838821,151 W Sandford Blvd,Mount Vernon,10550,restaurant
1,H.I.M Ital Health Food Market,47,4.5,$,715.467929,40.897633,-73.854665,4374B White Plains Rd,Bronx,10466,restaurant
2,Ali's Roti Shop,104,4.0,$,809.642747,40.894,-73.85684,4220 White Plains Rd,Bronx,10466,restaurant
3,Paula's Soul Cafe,202,3.0,$$,746.891106,40.89269,-73.85568,746 E 233rd St,Bronx,10466,restaurant
4,Jerk House,62,3.5,$$,771.525854,40.89464,-73.85648,4246 White Plains,Bronx,10466,restaurant


In [None]:
# Check for unique values
unique_values(yelp_poi)

Column Name: Name
# of Unique Values: 4163
Unique Values: ['Ripe Kitchen & Bar' 'H.I.M Ital Health Food Market' "Ali's Roti Shop"
 ... 'The Local Bar and Cafe' 'Kum Fung Chinese Restaurant' 'Amendment 18']
Column Name: Reviews
# of Unique Values: 1072
Unique Values: [ 287   47  104 ...  784  397 1028]
Column Name: Rating
# of Unique Values: 9
Unique Values: [4.  4.5 3.  3.5 5.  2.5 2.  1.5 1. ]
Column Name: Price
# of Unique Values: 5
Unique Values: ['$$' '$' '$$$' nan '$$$$']
Column Name: Distance (m)
# of Unique Values: 7048
Unique Values: [ 805.0775886   715.46792863  809.64274698 ...  825.92920937 1049.84712452
 1076.21508396]
Column Name: Latitude
# of Unique Values: 4402
Unique Values: [40.898214   40.897633   40.894      ... 40.7495622  40.621825
 40.62530162]
Column Name: Longitude
# of Unique Values: 4400
Unique Values: [-73.838821   -73.854665   -73.85684    ... -73.9477008  -74.07234
 -74.07454516]
Column Name: Address
# of Unique Values: 4266
Unique Values: ['151 W Sandford

In [None]:
# Check for nulls
yelp_poi.isnull().sum()

Name              0
Reviews           0
Rating            0
Price           999
Distance (m)      0
Latitude          0
Longitude         0
Address         166
Borough           0
Zipcode           0
Category          0
dtype: int64

There are null values in price and address. However it has low impact, as it simply signifies that some restuarants have no price and address information, which does not effect our objective.

Additionally, we have the coordinates which will be the main features used for city mapping.

In [None]:
# Check for duplicates
yelp_poi[yelp_poi.duplicated()]

# No duplicates

Unnamed: 0,Name,Reviews,Rating,Price,Distance (m),Latitude,Longitude,Address,Borough,Zipcode,Category


In [None]:
# See variance in numeric feautures
yelp_poi.describe()

Unnamed: 0,Reviews,Rating,Distance (m),Latitude,Longitude,Zipcode
count,7115.0,7115.0,7115.0,7115.0,7115.0,7115.0
mean,396.188897,3.916936,1571.646996,40.713105,-73.942424,10754.917639
std,863.333845,0.630598,4323.548896,0.093239,0.113619,558.516439
min,1.0,1.0,16.13817,40.205016,-74.448224,7047.0
25%,39.0,3.5,420.978899,40.634212,-73.99444,10305.0
50%,127.0,4.0,681.84268,40.716641,-73.93961,10473.0
75%,368.0,4.5,936.679688,40.771113,-73.865726,11231.0
max,14361.0,5.0,77147.675762,40.91594,-73.100648,12234.0


In [None]:
# Preview cleaned dataset
print(yelp_poi.shape)
yelp_poi.head()

(7115, 11)


Unnamed: 0,Name,Reviews,Rating,Price,Distance (m),Latitude,Longitude,Address,Borough,Zipcode,Category
0,Ripe Kitchen & Bar,287,4.0,$$,805.077589,40.898214,-73.838821,151 W Sandford Blvd,Mount Vernon,10550,restaurant
1,H.I.M Ital Health Food Market,47,4.5,$,715.467929,40.897633,-73.854665,4374B White Plains Rd,Bronx,10466,restaurant
2,Ali's Roti Shop,104,4.0,$,809.642747,40.894,-73.85684,4220 White Plains Rd,Bronx,10466,restaurant
3,Paula's Soul Cafe,202,3.0,$$,746.891106,40.89269,-73.85568,746 E 233rd St,Bronx,10466,restaurant
4,Jerk House,62,3.5,$$,771.525854,40.89464,-73.85648,4246 White Plains,Bronx,10466,restaurant


In [None]:
# Export cleaned dataset to csv
yelp_poi.to_csv('yelp_poi_cleaned.csv')

### Google Places - Restaurants

In [None]:
gp_food = pd.read_csv('google_places_poi.csv')
gp_food.head()

Unnamed: 0,business_status,icon,icon_background_color,icon_mask_base_uri,name,photos,place_id,price_level,rating,reference,...,geometry.location.lng,geometry.viewport.northeast.lat,geometry.viewport.northeast.lng,geometry.viewport.southwest.lat,geometry.viewport.southwest.lng,opening_hours.open_now,plus_code.compound_code,plus_code.global_code,permanently_closed,req_cat
0,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Ripe Kitchen & Bar,"[{'height': 2471, 'html_attributions': ['<a hr...",ChIJ9aOdYiyNwokRS49czrNTQo4,2.0,4.3,ChIJ9aOdYiyNwokRS49czrNTQo4,...,-73.838855,40.899498,-73.837452,40.8968,-73.84015,True,"V5X6+7F Mount Vernon, NY, USA",87G8V5X6+7F,,restaurant
1,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,New China Garden,"[{'height': 4032, 'html_attributions': ['<a hr...",ChIJcVDAYNfywokRPUaKrm0UBpg,1.0,3.9,ChIJcVDAYNfywokRPUaKrm0UBpg,...,-73.853364,40.899306,-73.8521,40.896608,-73.854798,True,"V4XW+5M The Bronx, NY, USA",87G8V4XW+5M,,restaurant
2,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Dunkin',"[{'height': 3000, 'html_attributions': ['<a hr...",ChIJ--58aynzwokRqSvaRDKwX1M,1.0,3.8,ChIJ--58aynzwokRqSvaRDKwX1M,...,-73.849089,40.891936,-73.847664,40.889239,-73.850362,True,"V5R2+59 New York, NY, USA",87G8V5R2+59,,restaurant
3,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Subway,"[{'height': 766, 'html_attributions': ['<a hre...",ChIJAy-ibCnzwokRdmkKGd1Yzhw,1.0,3.6,ChIJAy-ibCnzwokRdmkKGd1Yzhw,...,-73.849152,40.891828,-73.847659,40.88913,-73.850357,True,"V5R2+58 New York, NY, USA",87G8V5R2+58,,restaurant
4,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Popeyes Louisiana Kitchen,"[{'height': 1192, 'html_attributions': ['<a hr...",ChIJXVacFSvzwokRa3VpQDuwRDE,1.0,3.8,ChIJXVacFSvzwokRa3VpQDuwRDE,...,-73.843383,40.890762,-73.842012,40.888064,-73.84471,True,"V5Q4+QJ New York, NY, USA",87G8V5Q4+QJ,,restaurant


In [None]:
gp_food.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5619 entries, 0 to 5618
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   business_status                  5619 non-null   object 
 1   icon                             5619 non-null   object 
 2   icon_background_color            5619 non-null   object 
 3   icon_mask_base_uri               5619 non-null   object 
 4   name                             5619 non-null   object 
 5   photos                           5442 non-null   object 
 6   place_id                         5619 non-null   object 
 7   price_level                      4786 non-null   float64
 8   rating                           5522 non-null   float64
 9   reference                        5619 non-null   object 
 10  scope                            5619 non-null   object 
 11  types                            5619 non-null   object 
 12  user_ratings_total  

In [None]:
# gp_food.iloc[0:5,10:]
gp_food.columns

Index(['business_status', 'icon', 'icon_background_color',
       'icon_mask_base_uri', 'name', 'photos', 'place_id', 'price_level',
       'rating', 'reference', 'scope', 'types', 'user_ratings_total',
       'vicinity', 'geometry.location.lat', 'geometry.location.lng',
       'geometry.viewport.northeast.lat', 'geometry.viewport.northeast.lng',
       'geometry.viewport.southwest.lat', 'geometry.viewport.southwest.lng',
       'opening_hours.open_now', 'plus_code.compound_code',
       'plus_code.global_code', 'permanently_closed', 'req_cat'],
      dtype='object')

In [None]:
# Drop irrelevant or repetitive columns
gp_food.drop(columns=['business_status','icon','icon_background_color','icon_mask_base_uri',
                    'photos','place_id','reference','scope','types','geometry.viewport.northeast.lat',
                    'geometry.viewport.northeast.lng','geometry.viewport.southwest.lat',
                    'geometry.viewport.southwest.lng','opening_hours.open_now',
                    'plus_code.compound_code','plus_code.global_code','permanently_closed'], 
                    inplace=True)

gp_food.head()

Unnamed: 0,name,price_level,rating,user_ratings_total,vicinity,geometry.location.lat,geometry.location.lng,req_cat
0,Ripe Kitchen & Bar,2.0,4.3,714.0,"151 West Sandford Boulevard, Mount Vernon",40.898209,-73.838855,restaurant
1,New China Garden,1.0,3.9,85.0,"724 Nereid Avenue, The Bronx",40.897919,-73.853364,restaurant
2,Dunkin',1.0,3.8,280.0,"980 East 233rd Street, The Bronx",40.890459,-73.849089,restaurant
3,Subway,1.0,3.6,121.0,"980 East 233rd Street, The Bronx",40.890468,-73.849152,restaurant
4,Popeyes Louisiana Kitchen,1.0,3.8,467.0,"1201 East 233rd Street, The Bronx",40.889492,-73.843383,restaurant


In [None]:
# Rename columns for better interpretability
gp_food.columns = ['Name','Price','Rating','Reviews','Address','Latitude','Longitude','Category']
print(gp_food.shape)
gp_food.head()

(5619, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Ripe Kitchen & Bar,2.0,4.3,714.0,"151 West Sandford Boulevard, Mount Vernon",40.898209,-73.838855,restaurant
1,New China Garden,1.0,3.9,85.0,"724 Nereid Avenue, The Bronx",40.897919,-73.853364,restaurant
2,Dunkin',1.0,3.8,280.0,"980 East 233rd Street, The Bronx",40.890459,-73.849089,restaurant
3,Subway,1.0,3.6,121.0,"980 East 233rd Street, The Bronx",40.890468,-73.849152,restaurant
4,Popeyes Louisiana Kitchen,1.0,3.8,467.0,"1201 East 233rd Street, The Bronx",40.889492,-73.843383,restaurant


In [None]:
# splitting Address into Address & Borough columns
gp_food[['Address','Borough']] = gp_food['Address'].str.split(',',1,expand=True)
gp_food = gp_food[['Name','Price','Rating','Reviews','Address','Borough','Latitude','Longitude','Category']]

In [None]:
gp_food

Unnamed: 0,Name,Price,Rating,Reviews,Address,Borough,Latitude,Longitude,Category
0,Ripe Kitchen & Bar,2.0,4.3,714.0,151 West Sandford Boulevard,Mount Vernon,40.898209,-73.838855,restaurant
1,New China Garden,1.0,3.9,85.0,724 Nereid Avenue,The Bronx,40.897919,-73.853364,restaurant
2,Dunkin',1.0,3.8,280.0,980 East 233rd Street,The Bronx,40.890459,-73.849089,restaurant
3,Subway,1.0,3.6,121.0,980 East 233rd Street,The Bronx,40.890468,-73.849152,restaurant
4,Popeyes Louisiana Kitchen,1.0,3.8,467.0,1201 East 233rd Street,The Bronx,40.889492,-73.843383,restaurant
...,...,...,...,...,...,...,...,...,...
5614,Romeo's Pizza,1.0,4.3,91.0,117 Broad Street,Staten Island,40.624812,-74.079342,restaurant
5615,Amins fried chiken,,4.8,21.0,610 Richmond Road,Staten Island,40.611662,-74.088093,restaurant
5616,Taco Bell,1.0,3.9,21.0,429 Tompkins Avenue,Staten Island,40.615877,-74.072007,restaurant
5617,Palm Grill,,3.5,2.0,160 Saint Marys Avenue #1/2,Staten Island,40.613963,-74.072582,restaurant


In [None]:
# Check for unique values
unique_values(gp_food)

Column Name: Name
# of Unique Values: 3078
Unique Values: ['Ripe Kitchen & Bar' 'New China Garden' "Dunkin'" ... 'Exquisito'
 'Greek Panorama' 'Kum Fung']
Column Name: Price
# of Unique Values: 5
Unique Values: [ 2.  1. nan  3.  4.]
Column Name: Rating
# of Unique Values: 35
Unique Values: [4.3 3.9 3.8 3.6 3.3 4.2 3.7 4.1 2.2 1.  4.  4.5 5.  4.6 2.9 4.4 4.7 3.
 3.4 2.4 3.2 2.5 3.5 2.7 3.1 2.8 4.9 nan 2.  2.6 2.1 4.8 2.3 1.9 1.8]
Column Name: Reviews
# of Unique Values: 1434
Unique Values: [ 714.   85.  280. ... 1007. 1291. 1732.]
Column Name: Address
# of Unique Values: 3858
Unique Values: ['151 West Sandford Boulevard' '724 Nereid Avenue' '980 East 233rd Street'
 ... '24 -11 41st Avenue' '455 Main Street' '838 Bay Street']
Column Name: Borough
# of Unique Values: 161
Unique Values: [' Mount Vernon' ' The Bronx' ' 2090 Bartow Avenue, The Bronx'
 ' 1930 Bartow Avenue, The Bronx' None ' Pelham Manor' ' Pelham'
 ' New York' ' Yonkers' ' 4201 Webster Avenue, The Bronx'
 ' 3059 Webster Aven

In [None]:
# Check for nulls
gp_food.isnull().sum()

Name           0
Price        833
Rating        97
Reviews       97
Address        0
Borough       39
Latitude       0
Longitude      0
Category       0
dtype: int64

There are null values in price, rating, and reviews. However it has low impact, as it simply signifies that some restuarants have no price, rating, or review information, which does not effect our objective.

Some boroughs have null values, but we can assign that information when we join with our other data .

In [None]:
# Check for duplicates
gp_food = gp_food[gp_food.duplicated()]
len(gp_food)

1578

In [None]:
# Duplicate rows found, remove from DataFrame
gp_food = gp_food.drop_duplicates()

# Confirm removal
gp_food[gp_food.duplicated()]

Unnamed: 0,Name,Price,Rating,Reviews,Address,Borough,Latitude,Longitude,Category


In [None]:
# See variance in numeric features
gp_food.describe()

Unnamed: 0,Price,Rating,Reviews,Latitude,Longitude
count,1116.0,1246.0,1246.0,1260.0,1260.0
mean,1.660394,4.083226,740.94061,40.713778,-73.951417
std,0.671181,0.439148,1260.821341,0.094919,0.111364
min,1.0,1.0,1.0,40.522106,-74.238666
25%,1.0,3.9,148.0,40.626667,-74.000674
50%,2.0,4.2,360.5,40.713171,-73.944073
75%,2.0,4.4,847.0,40.786754,-73.869725
max,4.0,5.0,19357.0,40.894788,-73.707534


In [None]:
# Preview cleaned dataset
print(gp_food.shape)
gp_food.head()

(1260, 9)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Borough,Latitude,Longitude,Category
54,Prime 33 Banquet hall #1,2.0,4.3,150.0,3323 Merritt Avenue,The Bronx,40.881788,-73.826755,restaurant
80,An Beal Bocht Cafe,2.0,4.6,534.0,445 West 238th Street,The Bronx,40.887375,-73.904953,restaurant
82,Goodfellas Riverdale 🍕,2.0,3.8,171.0,3661 Waldo Avenue,The Bronx,40.887026,-73.904387,restaurant
84,Corner Cafe & Bakery,1.0,4.4,164.0,3718 Riverdale Avenue,The Bronx,40.887266,-73.90692,restaurant
85,Salvatore's,2.0,4.4,656.0,3738 Riverdale Avenue,The Bronx,40.887631,-73.906955,restaurant


A pizza slice!

The names can be kept as is, even with an emoji as it will not affect our modelling. This is because the name column will not be used.

In [None]:
# Export cleaned dataset to csv
gp_food.to_csv('google_places_poi_clean.csv')

### Google Places - Parks

In [None]:
# Load dataset
gp_parks = pd.read_csv('google_places_park.csv')
gp_parks.head()

Unnamed: 0,business_status,icon,icon_background_color,icon_mask_base_uri,name,photos,place_id,price_level,rating,reference,...,geometry.location.lng,geometry.viewport.northeast.lat,geometry.viewport.northeast.lng,geometry.viewport.southwest.lat,geometry.viewport.southwest.lng,opening_hours.open_now,plus_code.compound_code,plus_code.global_code,permanently_closed,req_cat
0,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Ripe Kitchen & Bar,"[{'height': 2471, 'html_attributions': ['<a hr...",ChIJ9aOdYiyNwokRS49czrNTQo4,2.0,4.3,ChIJ9aOdYiyNwokRS49czrNTQo4,...,-73.838855,40.899498,-73.837452,40.8968,-73.84015,False,"V5X6+7F Mount Vernon, NY, USA",87G8V5X6+7F,,park
1,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4DB546,https://maps.gstatic.com/mapfiles/place_api/ic...,Bissel Gardens,"[{'height': 3264, 'html_attributions': ['<a hr...",ChIJWcfdYdHywokRwdVd0ZGEGQU,,4.2,ChIJWcfdYdHywokRwdVd0ZGEGQU,...,-73.850196,40.901136,-73.848742,40.898439,-73.85144,,"V4XX+WW New York, NY, USA",87G8V4XX+WW,,park
2,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4DB546,https://maps.gstatic.com/mapfiles/place_api/ic...,Givans Creek Woods,"[{'height': 4128, 'html_attributions': ['<a hr...",ChIJgZk8XsWMwokR1QLdmJSK07A,,4.2,ChIJgZk8XsWMwokR1QLdmJSK07A,...,-73.830511,40.880394,-73.829395,40.877696,-73.832093,,"V5H9+MQ The Bronx, NY, USA",87G8V5H9+MQ,,park
3,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4DB546,https://maps.gstatic.com/mapfiles/place_api/ic...,Northeast Educational Park,"[{'height': 814, 'html_attributions': ['<a hre...",ChIJIR4lh8iMwokRwkigEwrhiv0,,5.0,ChIJIR4lh8iMwokRwkigEwrhiv0,...,-73.830569,40.875423,-73.82922,40.872725,-73.831918,True,"V5F9+JQ The Bronx, NY, USA",87G8V5F9+JQ,,park
4,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4DB546,https://maps.gstatic.com/mapfiles/place_api/ic...,Co-op City Greenway,"[{'height': 3024, 'html_attributions': ['<a hr...",ChIJm2DXRMaMwokRtMMtdMysYHY,,4.6,ChIJm2DXRMaMwokRtMMtdMysYHY,...,-73.827847,40.875427,-73.826498,40.872729,-73.829196,True,"V5FC+JV New York, NY, USA",87G8V5FC+JV,,park


In [None]:
gp_parks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3459 entries, 0 to 3458
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   business_status                  3377 non-null   object 
 1   icon                             3459 non-null   object 
 2   icon_background_color            3459 non-null   object 
 3   icon_mask_base_uri               3459 non-null   object 
 4   name                             3459 non-null   object 
 5   photos                           2861 non-null   object 
 6   place_id                         3459 non-null   object 
 7   price_level                      27 non-null     float64
 8   rating                           2894 non-null   float64
 9   reference                        3459 non-null   object 
 10  scope                            3459 non-null   object 
 11  types                            3459 non-null   object 
 12  user_ratings_total  

In [None]:
gp_parks.columns

Index(['business_status', 'icon', 'icon_background_color',
       'icon_mask_base_uri', 'name', 'photos', 'place_id', 'price_level',
       'rating', 'reference', 'scope', 'types', 'user_ratings_total',
       'vicinity', 'geometry.location.lat', 'geometry.location.lng',
       'geometry.viewport.northeast.lat', 'geometry.viewport.northeast.lng',
       'geometry.viewport.southwest.lat', 'geometry.viewport.southwest.lng',
       'opening_hours.open_now', 'plus_code.compound_code',
       'plus_code.global_code', 'permanently_closed', 'req_cat'],
      dtype='object')

In [None]:
# Drop irrelevant or repetitive columns
gp_parks.drop(columns=['business_status','icon','icon_background_color','icon_mask_base_uri',
                    'photos','place_id','reference','scope','types','geometry.viewport.northeast.lat',
                    'geometry.viewport.northeast.lng','geometry.viewport.southwest.lat',
                    'geometry.viewport.southwest.lng','opening_hours.open_now',
                    'plus_code.compound_code','plus_code.global_code','permanently_closed'], 
                    inplace=True)

gp_parks.head()

Unnamed: 0,name,price_level,rating,user_ratings_total,vicinity,geometry.location.lat,geometry.location.lng,req_cat
0,Ripe Kitchen & Bar,2.0,4.3,714.0,"151 West Sandford Boulevard, Mount Vernon",40.898209,-73.838855,park
1,Bissel Gardens,,4.2,25.0,"4525 Barnes Avenue, The Bronx",40.899858,-73.850196,park
2,Givans Creek Woods,,4.2,151.0,"4567, 1000 Baychester Avenue, The Bronx",40.87918,-73.830511,park
3,Northeast Educational Park,,5.0,2.0,The Bronx,40.874074,-73.830569,park
4,Co-op City Greenway,,4.6,207.0,The Bronx,40.874078,-73.827847,park


In [None]:
# Rename columns for better interpretability
gp_parks.columns = ['Name','Price','Rating','Reviews','Address','Latitude','Longitude','Category']
print(gp_parks.shape)
gp_parks.head()

(3459, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Ripe Kitchen & Bar,2.0,4.3,714.0,"151 West Sandford Boulevard, Mount Vernon",40.898209,-73.838855,park
1,Bissel Gardens,,4.2,25.0,"4525 Barnes Avenue, The Bronx",40.899858,-73.850196,park
2,Givans Creek Woods,,4.2,151.0,"4567, 1000 Baychester Avenue, The Bronx",40.87918,-73.830511,park
3,Northeast Educational Park,,5.0,2.0,The Bronx,40.874074,-73.830569,park
4,Co-op City Greenway,,4.6,207.0,The Bronx,40.874078,-73.827847,park


In [None]:
# Check for unique values
unique_values(gp_parks)

Column Name: Name
# of Unique Values: 2247
Unique Values: ['Ripe Kitchen & Bar' 'Bissel Gardens' 'Givans Creek Woods' ...
 'Outdoor PingPong table' 'Roosevelt Island' 'City Viewpoint']
Column Name: Price
# of Unique Values: 5
Unique Values: [ 2. nan  1.  4.  0.]
Column Name: Rating
# of Unique Values: 32
Unique Values: [4.3 4.2 5.  4.6 nan 4.1 4.7 3.9 4.4 4.5 1.  4.  3.6 3.7 3.8 3.5 4.8 3.
 2.5 3.3 2.  1.5 1.6 2.9 2.7 4.9 3.1 3.4 3.2 2.3 2.8 2.4]
Column Name: Reviews
# of Unique Values: 519
Unique Values: [7.14000e+02 2.50000e+01 1.51000e+02 2.00000e+00 2.07000e+02         nan
 4.60000e+01 4.03000e+02 3.34000e+02 1.58600e+03 3.80000e+01 6.00000e+00
 4.10000e+01 5.00000e+01 1.29000e+02 2.17000e+02 4.00000e+00 8.50000e+01
 1.00000e+00 3.35000e+02 2.00000e+01 3.10000e+02 8.70000e+01 5.00000e+00
 2.80000e+01 1.90000e+02 1.67000e+02 2.37000e+02 2.04000e+02 7.00000e+00
 1.80000e+01 8.40000e+01 7.80000e+01 1.50000e+01 7.90000e+01 6.70000e+01
 7.30000e+01 2.59000e+02 1.60000e+01 7.20000e+01 3.

In [None]:
# Check for nulls
gp_parks.isnull().sum()

Name            0
Price        3432
Rating        565
Reviews       565
Address         0
Latitude        0
Longitude       0
Category        0
dtype: int64

There are null values in price, rating, and reviews. However it has low impact, as it simply signifies that some parks have no price, rating, or review information, which does not affect our objective.

In [None]:
# Check for duplicates
gp_parks = gp_parks[gp_parks.duplicated()]
len(gp_parks)

1150

In [None]:
# Duplicate rows found, remove from DataFrame
gp_parks = gp_parks.drop_duplicates().reset_index(drop=True)

# Confirm removal
gp_parks[gp_parks.duplicated()]

Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category


In [None]:
# See variance in numeric feautures
gp_parks.describe()

Unnamed: 0,Price,Rating,Reviews,Latitude,Longitude
count,8.0,720.0,720.0,855.0,855.0
mean,1.875,4.289861,503.305556,40.734356,-73.941078
std,1.125992,0.505266,3407.031268,0.088809,0.088196
min,0.0,1.0,1.0,40.512033,-74.228107
25%,1.75,4.0,7.0,40.676442,-73.988292
50%,2.0,4.3,37.5,40.729272,-73.942442
75%,2.0,4.6,156.25,40.8149,-73.884535
max,4.0,5.0,75227.0,40.902327,-73.706385


In [None]:
# Preview cleaned dataset
print(gp_parks.shape)
gp_parks.head()

(855, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Givans Creek Woods,,4.2,151.0,"4567, 1000 Baychester Avenue, The Bronx",40.87918,-73.830511,park
1,Wave Hill Public Garden & Cultural Center,,4.7,1586.0,"4900 Independence Avenue, The Bronx",40.897852,-73.911445,park
2,Riverdale Park,,4.5,217.0,"254 Palisade Avenue, The Bronx",40.89474,-73.916803,park
3,Bell Tower Park,,4.5,50.0,"W. 239th St. &, Riverdale Avenue, The Bronx",40.889209,-73.908388,park
4,Henry Hudson Parkway,,,,The Bronx,40.893249,-73.9084,park


In [None]:
# Export cleaned dataset to csv
gp_parks.to_csv('google_places_parks_clean.csv')

### Google Places - Supermarkets

In [None]:
# Load dataset
gp_market = pd.read_csv('google_places_supermarket.csv')
gp_market.head()

Unnamed: 0,business_status,icon,icon_background_color,icon_mask_base_uri,name,photos,place_id,rating,reference,scope,...,geometry.viewport.northeast.lat,geometry.viewport.northeast.lng,geometry.viewport.southwest.lat,geometry.viewport.southwest.lng,opening_hours.open_now,plus_code.compound_code,plus_code.global_code,price_level,permanently_closed,req_cat
0,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4B96F3,https://maps.gstatic.com/mapfiles/place_api/ic...,Associated Supermarkets of Edenwald,"[{'height': 5312, 'html_attributions': ['<a hr...",ChIJE0hQCSzzwokRNXtfA5ppMfs,3.9,ChIJE0hQCSzzwokRNXtfA5ppMfs,GOOGLE,...,40.889539,-73.845599,40.886841,-73.848297,True,"V5Q3+85 New York, NY, USA",87G8V5Q3+85,,,supermarket
1,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4B96F3,https://maps.gstatic.com/mapfiles/place_api/ic...,Fine Fare Supermarkets,"[{'height': 2592, 'html_attributions': ['<a hr...",ChIJwQqYaSvzwokR2vl44PAU-mw,4.1,ChIJwQqYaSvzwokR2vl44PAU-mw,GOOGLE,...,40.890697,-73.842325,40.887999,-73.845023,True,"V5Q4+QH New York, NY, USA",87G8V5Q4+QH,,,supermarket
2,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4B96F3,https://maps.gstatic.com/mapfiles/place_api/ic...,Foodtown of White Plains Road,"[{'height': 2988, 'html_attributions': ['<a hr...",ChIJNRBLRNjywokRwgIuMLIdBkY,4.1,ChIJNRBLRNjywokRwgIuMLIdBkY,GOOGLE,...,40.897503,-73.854237,40.894805,-73.856935,True,"V4WV+FQ New York, NY, USA",87G8V4WV+FQ,,,supermarket
3,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4B96F3,https://maps.gstatic.com/mapfiles/place_api/ic...,233rd Caribbean meat market,"[{'height': 768, 'html_attributions': ['<a hre...",ChIJG56qDCfzwokREsftljqbKKg,4.7,ChIJG56qDCfzwokREsftljqbKKg,GOOGLE,...,40.895154,-73.855691,40.892456,-73.858389,True,"V4VV+F6 The Bronx, NY, USA",87G8V4VV+F6,,,supermarket
4,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4B96F3,https://maps.gstatic.com/mapfiles/place_api/ic...,City Fresh Market,,ChIJ--GWDyzzwokRMvHRh43j9oA,4.3,ChIJ--GWDyzzwokRMvHRh43j9oA,GOOGLE,...,40.889492,-73.845551,40.886794,-73.848249,True,"V5Q3+75 The Bronx, NY, USA",87G8V5Q3+75,,,supermarket


In [None]:
gp_market.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2449 entries, 0 to 2448
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   business_status                  2449 non-null   object 
 1   icon                             2449 non-null   object 
 2   icon_background_color            2449 non-null   object 
 3   icon_mask_base_uri               2449 non-null   object 
 4   name                             2449 non-null   object 
 5   photos                           2195 non-null   object 
 6   place_id                         2449 non-null   object 
 7   rating                           2319 non-null   float64
 8   reference                        2449 non-null   object 
 9   scope                            2449 non-null   object 
 10  types                            2449 non-null   object 
 11  user_ratings_total               2319 non-null   float64
 12  vicinity            

In [None]:
gp_market.columns

Index(['business_status', 'icon', 'icon_background_color',
       'icon_mask_base_uri', 'name', 'photos', 'place_id', 'rating',
       'reference', 'scope', 'types', 'user_ratings_total', 'vicinity',
       'geometry.location.lat', 'geometry.location.lng',
       'geometry.viewport.northeast.lat', 'geometry.viewport.northeast.lng',
       'geometry.viewport.southwest.lat', 'geometry.viewport.southwest.lng',
       'opening_hours.open_now', 'plus_code.compound_code',
       'plus_code.global_code', 'price_level', 'permanently_closed',
       'req_cat'],
      dtype='object')

In [None]:
# Drop irrelevant or repetitive columns
gp_market.drop(columns=['business_status','icon','icon_background_color','icon_mask_base_uri',
                    'photos','place_id','reference','scope','types','geometry.viewport.northeast.lat',
                    'geometry.viewport.northeast.lng','geometry.viewport.southwest.lat',
                    'geometry.viewport.southwest.lng','opening_hours.open_now',
                    'plus_code.compound_code','plus_code.global_code','permanently_closed'], 
                    inplace=True)

gp_market.head()

Unnamed: 0,name,rating,user_ratings_total,vicinity,geometry.location.lat,geometry.location.lng,price_level,req_cat
0,Associated Supermarkets of Edenwald,3.9,372.0,"4141 Laconia Avenue, The Bronx",40.888266,-73.847102,,supermarket
1,Fine Fare Supermarkets,4.1,547.0,"1199 East 233rd Street, The Bronx",40.889463,-73.84362,,supermarket
2,Foodtown of White Plains Road,4.1,1195.0,"4332 White Plains Road, The Bronx",40.896172,-73.855542,,supermarket
3,233rd Caribbean meat market,4.7,35.0,"4206 White Plains Road, The Bronx",40.893745,-73.856977,,supermarket
4,City Fresh Market,4.3,7.0,"4139 Laconia Avenue, The Bronx",40.888146,-73.847006,,supermarket


In [None]:
# Rename columns for better interpretability
gp_market.columns = ['Name','Rating','Reviews','Address','Latitude','Longitude','Price','Category']
gp_market = gp_market[['Name','Price','Rating','Reviews','Address','Latitude','Longitude','Category']]
print(gp_market.shape)
gp_market.head()

(2449, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Associated Supermarkets of Edenwald,,3.9,372.0,"4141 Laconia Avenue, The Bronx",40.888266,-73.847102,supermarket
1,Fine Fare Supermarkets,,4.1,547.0,"1199 East 233rd Street, The Bronx",40.889463,-73.84362,supermarket
2,Foodtown of White Plains Road,,4.1,1195.0,"4332 White Plains Road, The Bronx",40.896172,-73.855542,supermarket
3,233rd Caribbean meat market,,4.7,35.0,"4206 White Plains Road, The Bronx",40.893745,-73.856977,supermarket
4,City Fresh Market,,4.3,7.0,"4139 Laconia Avenue, The Bronx",40.888146,-73.847006,supermarket


In [None]:
# Check for unique values
unique_values(gp_market)

Column Name: Name
# of Unique Values: 973
Unique Values: ['Associated Supermarkets of Edenwald' 'Fine Fare Supermarkets'
 'Foodtown of White Plains Road' '233rd Caribbean meat market'
 'City Fresh Market' 'Met Foodmarkets' 'King of Kings African Market'
 'Key Food Supermarkets' 'LUPITAS MEXICAN GROCERY II INC'
 'Food Universe Marketplace' 'Stop & Shop'
 'CO-OP CITY AFRICAN MARKET Inc.' 'ShopRite of Pelham Parkway'
 'CTown Supermarkets' 'SY Grace West Indian Supermarket'
 'Garden Gourmet Market - Supermarket' 'Food Dynasty Supermarkets'
 'Foodtown of Riverdale' 'ALDI' 'Broadway Deals' 'NY 99¢ & More'
 'Target Grocery' "Ben's Market" 'Irish Mini Mart' 'Family Dollar'
 'Foodtown of Bainbridge' "Moronta's Supermarket Corp"
 'Kings Fruit Market' 'H & B FRESH PRODUCE' 'D & J Supermarket'
 'Gausia Foods Halal Supermarket' 'Dhaka Supermarket and Halal Meat'
 'Bangla Town Supermarket' 'Foodtown of White Plains'
 'Foodtown of Boston Road' 'Cherry Valley Marketplace'
 'Western Beef Supermarket' '

In [None]:
# Check for nulls
gp_market.isnull().sum()

Name            0
Price        2050
Rating        130
Reviews       130
Address         0
Latitude        0
Longitude       0
Category        0
dtype: int64

There are null values in price, rating, and reviews. However it has low impact, as it simply signifies that some restuarants have no price, rating, or review information, which does not affect our objective.

In [None]:
# Check for duplicates
gp_market = gp_market[gp_market.duplicated()]
len(gp_market)

973

In [None]:
# Duplicate rows found, remove from DataFrame
gp_market = gp_market.drop_duplicates().reset_index(drop=True)

# Confirm removal
gp_market[gp_market.duplicated()]

Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category


In [None]:
# See variance in numeric feautures
gp_market.describe()

Unnamed: 0,Price,Rating,Reviews,Latitude,Longitude
count,123.0,670.0,670.0,707.0,707.0
mean,1.780488,4.086418,447.519403,40.741506,-73.929199
std,0.659565,0.493889,725.941919,0.080656,0.069386
min,1.0,1.0,1.0,40.566074,-74.165802
25%,1.0,3.9,48.25,40.680266,-73.973771
50%,2.0,4.1,201.0,40.735935,-73.928134
75%,2.0,4.4,501.5,40.819564,-73.881706
max,4.0,5.0,5848.0,40.893737,-73.71624


In [None]:
# Preview cleaned dataset
print(gp_market.shape)
gp_market.head()

(707, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Key Food Supermarkets,,3.6,410.0,"540 West 235th Street, The Bronx",40.885437,-73.909246,supermarket
1,CTown Supermarkets,2.0,3.7,270.0,"5249 Broadway, The Bronx",40.875843,-73.908714,supermarket
2,Garden Gourmet Market - Supermarket,,4.5,1800.0,"5665 Broadway, The Bronx",40.881037,-73.903533,supermarket
3,Stop & Shop,2.0,4.2,2514.0,"5716 Broadway, The Bronx",40.882063,-73.902058,supermarket
4,Foodtown of Riverdale,,4.3,1706.0,"5555 Broadway, The Bronx",40.878621,-73.905335,supermarket


In [None]:
# Export cleaned dataset to csv
gp_market.to_csv('google_places_market_clean.csv')

### Google Places - Schools (adds more than just schools)

In [None]:
# Load dataset
gp_schools = pd.read_csv('google_places_school.csv')
gp_schools.head()

Unnamed: 0,business_status,icon,icon_background_color,icon_mask_base_uri,name,photos,place_id,price_level,rating,reference,...,geometry.location.lng,geometry.viewport.northeast.lat,geometry.viewport.northeast.lng,geometry.viewport.southwest.lat,geometry.viewport.southwest.lng,opening_hours.open_now,plus_code.compound_code,plus_code.global_code,permanently_closed,req_cat
0,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#FF9E67,https://maps.gstatic.com/mapfiles/place_api/ic...,Ripe Kitchen & Bar,"[{'height': 2471, 'html_attributions': ['<a hr...",ChIJ9aOdYiyNwokRS49czrNTQo4,2.0,4.3,ChIJ9aOdYiyNwokRS49czrNTQo4,...,-73.838855,40.899498,-73.837452,40.8968,-73.84015,False,"V5X6+7F Mount Vernon, NY, USA",87G8V5X6+7F,,school
1,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4DB546,https://maps.gstatic.com/mapfiles/place_api/ic...,Bissel Gardens,"[{'height': 3264, 'html_attributions': ['<a hr...",ChIJWcfdYdHywokRwdVd0ZGEGQU,,4.2,ChIJWcfdYdHywokRwdVd0ZGEGQU,...,-73.850196,40.901136,-73.848742,40.898439,-73.85144,,"V4XX+WW New York, NY, USA",87G8V4XX+WW,,school
2,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4DB546,https://maps.gstatic.com/mapfiles/place_api/ic...,Givans Creek Woods,"[{'height': 4128, 'html_attributions': ['<a hr...",ChIJgZk8XsWMwokR1QLdmJSK07A,,4.2,ChIJgZk8XsWMwokR1QLdmJSK07A,...,-73.830511,40.880394,-73.829395,40.877696,-73.832093,,"V5H9+MQ The Bronx, NY, USA",87G8V5H9+MQ,,school
3,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4DB546,https://maps.gstatic.com/mapfiles/place_api/ic...,Northeast Educational Park,"[{'height': 814, 'html_attributions': ['<a hre...",ChIJIR4lh8iMwokRwkigEwrhiv0,,5.0,ChIJIR4lh8iMwokRwkigEwrhiv0,...,-73.830569,40.875423,-73.82922,40.872725,-73.831918,True,"V5F9+JQ The Bronx, NY, USA",87G8V5F9+JQ,,school
4,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#4DB546,https://maps.gstatic.com/mapfiles/place_api/ic...,Co-op City Greenway,"[{'height': 3024, 'html_attributions': ['<a hr...",ChIJm2DXRMaMwokRtMMtdMysYHY,,4.6,ChIJm2DXRMaMwokRtMMtdMysYHY,...,-73.827847,40.875427,-73.826498,40.872729,-73.829196,True,"V5FC+JV New York, NY, USA",87G8V5FC+JV,,school


In [None]:
gp_schools.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8749 entries, 0 to 8748
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   business_status                  8667 non-null   object 
 1   icon                             8749 non-null   object 
 2   icon_background_color            8749 non-null   object 
 3   icon_mask_base_uri               8749 non-null   object 
 4   name                             8749 non-null   object 
 5   photos                           5385 non-null   object 
 6   place_id                         8749 non-null   object 
 7   price_level                      32 non-null     float64
 8   rating                           6487 non-null   float64
 9   reference                        8749 non-null   object 
 10  scope                            8749 non-null   object 
 11  types                            8749 non-null   object 
 12  user_ratings_total  

In [None]:
gp_schools.columns

Index(['business_status', 'icon', 'icon_background_color',
       'icon_mask_base_uri', 'name', 'photos', 'place_id', 'price_level',
       'rating', 'reference', 'scope', 'types', 'user_ratings_total',
       'vicinity', 'geometry.location.lat', 'geometry.location.lng',
       'geometry.viewport.northeast.lat', 'geometry.viewport.northeast.lng',
       'geometry.viewport.southwest.lat', 'geometry.viewport.southwest.lng',
       'opening_hours.open_now', 'plus_code.compound_code',
       'plus_code.global_code', 'permanently_closed', 'req_cat'],
      dtype='object')

In [None]:
# Drop irrelevant or repetitive columns
gp_schools.drop(columns=['business_status','icon','icon_background_color','icon_mask_base_uri',
                    'photos','place_id','reference','scope','types','geometry.viewport.northeast.lat',
                    'geometry.viewport.northeast.lng','geometry.viewport.southwest.lat',
                    'geometry.viewport.southwest.lng','opening_hours.open_now',
                    'plus_code.compound_code','plus_code.global_code','permanently_closed'], 
                    inplace=True)

gp_schools.head()

Unnamed: 0,name,price_level,rating,user_ratings_total,vicinity,geometry.location.lat,geometry.location.lng,req_cat
0,Ripe Kitchen & Bar,2.0,4.3,714.0,"151 West Sandford Boulevard, Mount Vernon",40.898209,-73.838855,school
1,Bissel Gardens,,4.2,25.0,"4525 Barnes Avenue, The Bronx",40.899858,-73.850196,school
2,Givans Creek Woods,,4.2,151.0,"4567, 1000 Baychester Avenue, The Bronx",40.87918,-73.830511,school
3,Northeast Educational Park,,5.0,2.0,The Bronx,40.874074,-73.830569,school
4,Co-op City Greenway,,4.6,207.0,The Bronx,40.874078,-73.827847,school


In [None]:
# Rename columns for better interpretability
gp_schools.columns = ['Name','Price','Rating','Reviews','Address','Latitude','Longitude','Category']
print(gp_schools.shape)
gp_schools.head()

(8749, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Ripe Kitchen & Bar,2.0,4.3,714.0,"151 West Sandford Boulevard, Mount Vernon",40.898209,-73.838855,school
1,Bissel Gardens,,4.2,25.0,"4525 Barnes Avenue, The Bronx",40.899858,-73.850196,school
2,Givans Creek Woods,,4.2,151.0,"4567, 1000 Baychester Avenue, The Bronx",40.87918,-73.830511,school
3,Northeast Educational Park,,5.0,2.0,The Bronx,40.874074,-73.830569,school
4,Co-op City Greenway,,4.6,207.0,The Bronx,40.874078,-73.827847,school


In [None]:
# Check for unique values
unique_values(gp_schools)

Column Name: Name
# of Unique Values: 5659
Unique Values: ['Ripe Kitchen & Bar' 'Bissel Gardens' 'Givans Creek Woods' ...
 'Kathryn Brickell Music - Far Rockaway' 'Yeshiva Gedola Meor Hatalmud'
 'Main Street Theatre & Dance Alliance']
Column Name: Price
# of Unique Values: 5
Unique Values: [ 2. nan  1.  4.  0.]
Column Name: Rating
# of Unique Values: 39
Unique Values: [4.3 4.2 5.  4.6 nan 4.1 4.7 3.9 4.4 4.5 1.  4.  3.6 3.7 3.8 3.5 4.8 3.
 2.5 3.3 2.  1.5 1.6 2.9 2.7 4.9 3.1 3.4 3.2 2.3 2.8 2.4 2.6 2.2 1.4 2.1
 1.7 1.9 1.8]
Column Name: Reviews
# of Unique Values: 532
Unique Values: [7.14000e+02 2.50000e+01 1.51000e+02 2.00000e+00 2.07000e+02         nan
 4.60000e+01 4.03000e+02 3.34000e+02 1.58600e+03 3.80000e+01 6.00000e+00
 4.10000e+01 5.00000e+01 1.29000e+02 2.17000e+02 4.00000e+00 8.50000e+01
 1.00000e+00 3.35000e+02 2.00000e+01 3.10000e+02 8.70000e+01 5.00000e+00
 2.80000e+01 1.90000e+02 1.67000e+02 2.37000e+02 2.04000e+02 7.00000e+00
 1.80000e+01 8.40000e+01 7.80000e+01 1.50000e

In [None]:
# Check for nulls
gp_schools.isnull().sum()

Name            0
Price        8717
Rating       2262
Reviews      2262
Address         0
Latitude        0
Longitude       0
Category        0
dtype: int64

There are null values in price, rating, and reviews. However it has low impact, as it simply signifies that some schools have no price, rating, or review information, which does not affect our objective.

In [None]:
# Check for duplicates
gp_schools = gp_schools[gp_schools.duplicated()]
len(gp_schools)

2849

In [None]:
# Duplicate rows found, remove from DataFrame
gp_schools = gp_schools.drop_duplicates().reset_index(drop=True)

# Confirm removal
gp_schools[gp_schools.duplicated()]

Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category


In [None]:
# See variance in numeric feautures
gp_schools.describe()

Unnamed: 0,Price,Rating,Reviews,Latitude,Longitude
count,9.0,1638.0,1638.0,2128.0,2128.0
mean,1.666667,4.202503,234.302808,40.723136,-73.946601
std,1.224745,0.669083,2270.741549,0.093409,0.099538
min,0.0,1.0,1.0,40.512033,-74.238592
25%,1.0,3.9,6.0,40.643244,-73.993266
50%,2.0,4.3,17.0,40.720216,-73.943868
75%,2.0,4.675,49.0,40.808596,-73.881272
max,4.0,5.0,75227.0,40.90347,-73.706385


In [None]:
# Preview cleaned dataset
print(gp_schools.shape)
gp_schools

(2128, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Givans Creek Woods,,4.2,151.0,"4567, 1000 Baychester Avenue, The Bronx",40.879180,-73.830511,school
1,Wave Hill Public Garden & Cultural Center,,4.7,1586.0,"4900 Independence Avenue, The Bronx",40.897852,-73.911445,school
2,Riverdale Park,,4.5,217.0,"254 Palisade Avenue, The Bronx",40.894740,-73.916803,school
3,Bell Tower Park,,4.5,50.0,"W. 239th St. &, Riverdale Avenue, The Bronx",40.889209,-73.908388,school
4,Henry Hudson Parkway,,,,The Bronx,40.893249,-73.908400,school
...,...,...,...,...,...,...,...,...
2123,School Buildings Division,,5.0,1.0,"2811 Queens Plaza North, Long Island City",40.750235,-73.938086,school
2124,Citywide Council on Special Education,,5.0,1.0,Long Island City,40.750262,-73.938082,school
2125,Industrial Management & Train,,,,"43-82 Vernon Boulevard, Long Island City",40.751245,-73.952622,school
2126,Bright Horizons at Long Island City,,4.7,20.0,"42-09 28th Street, Queens",40.749357,-73.939062,school


In [None]:
gp_schools['Name'].unique().tolist()

['Givans Creek Woods',
 'Wave Hill Public Garden & Cultural Center',
 'Riverdale Park',
 'Bell Tower Park',
 'Henry Hudson Parkway',
 'Brust Park',
 'Quad',
 'Pure Love Organic Farms',
 'Ewen Park',
 'Ewen Park Dog Run',
 'Siren Slope',
 'Cooney Grauer Field',
 'Enchanted Garden',
 'Marble Hill Houses Park',
 'Crescent Park',
 'Fort #4 Playground',
 "Harley J. Mosley, Sr. Veteran's Memorial Park",
 'Kiddie Park, Alcott Place',
 'Haffen Park',
 'Northeast Educational Park',
 'Northeast Bronx Educational Park',
 'Mosholu Parkway',
 'Kossuth Playground',
 'Risse Street Park',
 'Bainbridge Avenue Garden',
 'Mosholu Pkwy',
 'Whalen Playground',
 'Roberto Clemente State Park',
 'Morton Playground',
 'Galileo Playground',
 'Mount Hope Playground',
 'Jardin De Las Rosas',
 'Bergen Triangle',
 'River Garden',
 'Vidalia Park',
 'Boone Slope',
 'Eae J Mitchell Park',
 'Krystal Garden Group',
 'Daly Avenue Garden',
 'Rock Garden Park',
 'Mohegan Triangle',
 'Hylan Park',
 'Miracle Garden',
 'Seabu

In [None]:
# Export cleaned dataset to csv
gp_schools.to_csv('google_places_schools_clean.csv')

### Google Places - Transit (when dropping duplicates, lose alot of unique rows)

In [None]:
# Load dataset
gp_transit = pd.read_csv('google_places_transit.csv')
gp_transit.head()

Unnamed: 0,business_status,icon,icon_background_color,icon_mask_base_uri,name,place_id,reference,scope,types,vicinity,...,geometry.viewport.southwest.lng,plus_code.compound_code,plus_code.global_code,photos,rating,user_ratings_total,opening_hours.open_now,price_level,permanently_closed,req_cat
0,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#7B9EB0,https://maps.gstatic.com/mapfiles/place_api/ic...,Baychester Av/Pitman Av,ChIJhYtMWNTywokRmxLQLfbyUhg,ChIJhYtMWNTywokRmxLQLfbyUhg,GOOGLE,"['transit_station', 'point_of_interest', 'esta...",United States,...,-73.846383,"V5W3+HX New York, NY, USA",87G8V5W3+HX,,,,,,,transit_station
1,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#7B9EB0,https://maps.gstatic.com/mapfiles/place_api/ic...,Baychester Av/Edenwald Av,ChIJbST_YdXywokRJjZ5163rsOI,ChIJbST_YdXywokRJjZ5163rsOI,GOOGLE,"['transit_station', 'point_of_interest', 'esta...",United States,...,-73.845095,"V5V4+6G New York, NY, USA",87G8V5V4+6G,,,,,,,transit_station
2,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#7B9EB0,https://maps.gstatic.com/mapfiles/place_api/ic...,Nereid Av/Bruner Av,ChIJzWjJztbywokRtc3_GWi-3so,ChIJzWjJztbywokRtc3_GWi-3so,GOOGLE,"['transit_station', 'point_of_interest', 'esta...",United States,...,-73.850993,"V5X2+94 New York, NY, USA",87G8V5X2+94,,,,,,,transit_station
3,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#7B9EB0,https://maps.gstatic.com/mapfiles/place_api/ic...,Nereid Av/Byron Av,ChIJcYpjadfywokRFmPl3dBM4V8,ChIJcYpjadfywokRFmPl3dBM4V8,GOOGLE,"['transit_station', 'point_of_interest', 'esta...",United States,...,-73.853909,"V4XW+4X New York, NY, USA",87G8V4XW+4X,,,,,,,transit_station
4,OPERATIONAL,https://maps.gstatic.com/mapfiles/place_api/ic...,#7B9EB0,https://maps.gstatic.com/mapfiles/place_api/ic...,Baychester Ave/Pitman Av,ChIJg7ZlV9TywokR9IEhSF7JIPI,ChIJg7ZlV9TywokR9IEhSF7JIPI,GOOGLE,"['transit_station', 'point_of_interest', 'esta...",Bronx,...,-73.846445,"V5W3+HX New York, NY, USA",87G8V5W3+HX,"[{'height': 4032, 'html_attributions': ['<a hr...",4.0,1.0,,,,transit_station


In [None]:
gp_transit.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5996 entries, 0 to 5995
Data columns (total 25 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   business_status                  5996 non-null   object 
 1   icon                             5996 non-null   object 
 2   icon_background_color            5996 non-null   object 
 3   icon_mask_base_uri               5996 non-null   object 
 4   name                             5996 non-null   object 
 5   place_id                         5996 non-null   object 
 6   reference                        5996 non-null   object 
 7   scope                            5996 non-null   object 
 8   types                            5996 non-null   object 
 9   vicinity                         5994 non-null   object 
 10  geometry.location.lat            5996 non-null   float64
 11  geometry.location.lng            5996 non-null   float64
 12  geometry.viewport.no

In [None]:
gp_transit.columns

Index(['business_status', 'icon', 'icon_background_color',
       'icon_mask_base_uri', 'name', 'place_id', 'reference', 'scope', 'types',
       'vicinity', 'geometry.location.lat', 'geometry.location.lng',
       'geometry.viewport.northeast.lat', 'geometry.viewport.northeast.lng',
       'geometry.viewport.southwest.lat', 'geometry.viewport.southwest.lng',
       'plus_code.compound_code', 'plus_code.global_code', 'photos', 'rating',
       'user_ratings_total', 'opening_hours.open_now', 'price_level',
       'permanently_closed', 'req_cat'],
      dtype='object')

In [None]:
# Drop irrelevant or repetitive columns
gp_transit.drop(columns=['business_status','icon','icon_background_color','icon_mask_base_uri',
                    'photos','place_id','reference','scope','types','geometry.viewport.northeast.lat',
                    'geometry.viewport.northeast.lng','geometry.viewport.southwest.lat',
                    'geometry.viewport.southwest.lng','opening_hours.open_now',
                    'plus_code.compound_code','plus_code.global_code','permanently_closed'], 
                    inplace=True)

gp_transit.head()

Unnamed: 0,name,vicinity,geometry.location.lat,geometry.location.lng,rating,user_ratings_total,price_level,req_cat
0,Baychester Av/Pitman Av,United States,40.896448,-73.84502,,,,transit_station
1,Baychester Av/Edenwald Av,United States,40.893101,-73.843734,,,,transit_station
2,Nereid Av/Bruner Av,United States,40.89846,-73.849651,,,,transit_station
3,Nereid Av/Byron Av,United States,40.897842,-73.852588,,,,transit_station
4,Baychester Ave/Pitman Av,Bronx,40.896377,-73.845116,4.0,1.0,,transit_station


In [None]:
# Rename columns for better interpretability
gp_transit.columns = ['Name','Address','Latitude','Longitude','Rating','Reviews','Price','Category']
gp_transit = gp_transit[['Name','Price','Rating','Reviews','Address','Latitude','Longitude','Category']]
print(gp_transit.shape)
gp_transit.head()

(5996, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Baychester Av/Pitman Av,,,,United States,40.896448,-73.84502,transit_station
1,Baychester Av/Edenwald Av,,,,United States,40.893101,-73.843734,transit_station
2,Nereid Av/Bruner Av,,,,United States,40.89846,-73.849651,transit_station
3,Nereid Av/Byron Av,,,,United States,40.897842,-73.852588,transit_station
4,Baychester Ave/Pitman Av,,4.0,1.0,Bronx,40.896377,-73.845116,transit_station


In [None]:
# Check for unique values
unique_values(gp_transit)

Column Name: Name
# of Unique Values: 4339
Unique Values: ['Baychester Av/Pitman Av' 'Baychester Av/Edenwald Av'
 'Nereid Av/Bruner Av' ... 'Vanderbilt Ave/Hillside Ave'
 'Van Duzer St/Baring Pl' 'Van Duzer St/Roff St']
Column Name: Price
# of Unique Values: 2
Unique Values: [nan  2.]
Column Name: Rating
# of Unique Values: 36
Unique Values: [nan 4.  5.  4.5 1.  3.  3.4 2.  3.1 3.3 4.8 2.6 3.8 4.4 4.6 2.5 4.2 3.7
 4.3 4.7 2.9 3.5 2.7 3.9 4.1 3.6 2.3 1.8 2.8 2.1 1.5 2.4 1.9 4.9 3.2 1.7]
Column Name: Reviews
# of Unique Values: 119
Unique Values: [      nan 1.000e+00 2.000e+00 1.900e+01 4.000e+00 5.800e+01 3.000e+00
 5.000e+00 2.900e+01 4.700e+01 5.900e+01 3.100e+01 3.500e+01 7.000e+00
 9.700e+01 1.040e+02 1.200e+02 8.500e+01 8.000e+00 1.170e+02 5.400e+01
 6.900e+01 1.100e+01 4.300e+02 6.000e+00 3.600e+01 3.300e+01 7.000e+01
 7.700e+01 1.700e+01 2.300e+01 2.500e+01 4.500e+01 5.100e+01 5.200e+01
 3.400e+01 7.300e+01 3.700e+01 1.000e+02 1.160e+02 3.800e+01 6.300e+01
 4.800e+01 4.100e+01 4.

In [None]:
# Check for nulls
gp_transit.isnull().sum()

Name            0
Price        5992
Rating       4165
Reviews      4165
Address         2
Latitude        0
Longitude       0
Category        0
dtype: int64

There are null values in price, price, rating, and reviews. However it has low impact, as it simply signifies that some forms of transit have no price, rating, or review information, which does not affect our objective.

In [1]:
# Check for duplicates
gp_transit = gp_transit[gp_transit.duplicated()]
len(gp_transit)

NameError: name 'gp_transit' is not defined

In [None]:
# Duplicate rows found, remove from DataFrame
gp_transit = gp_transit.drop_duplicates().reset_index(drop=True)

# Confirm removal
gp_transit[gp_transit.duplicated()]

Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category


In [None]:
# See variance in numeric feautures
gp_transit.describe()

Unnamed: 0,Price,Rating,Reviews,Latitude,Longitude
count,1.0,156.0,156.0,509.0,509.0
mean,2.0,3.972436,108.75,40.697372,-73.975161
std,,1.091644,623.715719,0.095966,0.113418
min,2.0,1.0,1.0,40.519108,-74.230279
25%,2.0,3.6,1.0,40.612717,-74.069244
50%,2.0,4.1,2.0,40.69938,-73.968803
75%,2.0,5.0,22.25,40.757742,-73.907362
max,2.0,5.0,6010.0,40.893913,-73.712307


In [None]:
# Preview cleaned dataset
print(gp_transit.shape)
gp_transit

(509, 8)


Unnamed: 0,Name,Price,Rating,Reviews,Address,Latitude,Longitude,Category
0,Henry Hudson Pky E/w 246 St,,,,United States,40.893724,-73.907913,transit_station
1,Henry Hudson Pkwy W/W 239 St,,,,United States,40.889652,-73.909019,transit_station
2,W 239 St/Riverdale Av,,4.0,2.0,United States,40.889137,-73.908577,transit_station
3,Henry Hudson Pkwy E/W 239 St,,,,United States,40.889648,-73.908051,transit_station
4,Henry Hudson Pkwy W/W 246 St,,5.0,2.0,United States,40.892857,-73.908569,transit_station
...,...,...,...,...,...,...,...,...
504,Vanderbilt Av/Tompkins Av,,,,United States,40.620221,-74.077191,transit_station
505,Targee St/Osgood Ave,,,,United States,40.619289,-74.084633,transit_station
506,Targee St/Young St,,1.0,1.0,United States,40.620682,-74.084282,transit_station
507,Targee St/Sobel Court,,,,United States,40.613884,-74.084702,transit_station


In [None]:
len(gp_transit['Name'].unique().tolist())
gp_transit['Name'].unique().tolist()

['Henry Hudson Pky E/w 246 St',
 'Henry Hudson Pkwy W/W 239 St',
 'W 239 St/Riverdale Av',
 'Henry Hudson Pkwy E/W 239 St',
 'Henry Hudson Pkwy W/W 246 St',
 'Henry Hudson Pkwy E/W 246 St',
 'W 230 St / Broadway',
 'Broadway/w 231 St',
 'Baychester Av/Aldrich St',
 'Asch Loop/Bartow Av',
 'Bainbridge Av/E Mosholu Py N',
 'Bainbridge Av/E 204 St',
 'Boston Rd/Vyse Av',
 'E 138 St/Jackson Av',
 'E 138 St/Cypress Av',
 'E 135 St/Cypress Ave',
 'E 138 St/Willow Av',
 'E 138 St/Bruckner Blvd',
 'Cypress Av',
 'E 138 St & St Anns Av',
 "E 135 St/St Ann's Av",
 "E 138 St/St Ann's Ave",
 'St Anns Avenue & East 138 Street',
 'Prospect Av/E 156 St',
 'Tiffany St/Randall Av',
 'E 163 St/Tinton Av',
 'E 163 St/Trinity Av',
 'E 163 St/3 Av',
 'Williamsbridge Rd/Rhinelander Av',
 'E Tremont Av/Castle Hill Av',
 'Fordham',
 '3 Av/E 188 St',
 'Webster Av/E 187 St',
 'Kappock St/Knolls Cres',
 'Kappock St/Johnson Av',
 'Irwin Av/Johnson Av',
 'Henry Hudson Pkwy E/W 235 St',
 'E Tremont Av/Sampson Av',


In [None]:
# Export cleaned dataset to csv
gp_transit.to_csv('google_places_transit_clean.csv')