## Visualizations of NYC POI Data

### Inner Join

In [187]:
# Import modules
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as spstats
%matplotlib inline

In [188]:
# Load POI datasets
gp = pd.read_csv('clean_data/gp_poi_clean.csv',)
yelp = pd.read_csv('clean_data/yelp_poi_cleaned.csv')
fsq = pd.read_csv('clean_data/fsq_poi_clean.csv')

In [189]:
yelp.drop(columns=['Unnamed: 0','Address','Reviews','Distance (m)'], inplace=True)

In [190]:
yelp.head()

Unnamed: 0,Name,Rating,Price,Latitude,Longitude,Borough,Zipcode,Category
0,Ripe Kitchen & Bar,4.0,$$,40.898214,-73.838821,Mount Vernon,10550,restaurant
1,H.I.M Ital Health Food Market,4.5,$,40.897633,-73.854665,Bronx,10466,restaurant
2,Ali's Roti Shop,4.0,$,40.894,-73.85684,Bronx,10466,restaurant
3,Paula's Soul Cafe,3.0,$$,40.89269,-73.85568,Bronx,10466,restaurant
4,Jerk House,3.5,$$,40.89464,-73.85648,Bronx,10466,restaurant


In [191]:
fsq.drop(columns=['Unnamed: 0', 'Neighborhood', 'Distance (m)'], inplace=True)

In [192]:
fsq = fsq[['Name','Rating','Price','Latitude','Longitude','Borough','Zipcode','Category']]
fsq.head()

Unnamed: 0,Name,Rating,Price,Latitude,Longitude,Borough,Zipcode,Category
0,Lollipops Gelato,8.6,1.0,40.894705,-73.847201,Bronx,10466.0,restaurant
1,Ripe Kitchen and Bar,8.4,2.0,40.894705,-73.847201,Mount Vernon,10550.0,restaurant
2,Ali's Roti Shop,8.1,1.0,40.894705,-73.847201,Bronx,10466.0,restaurant
3,Jimbo's Hamburger Palace,8.0,1.0,40.894705,-73.847201,Bronx,10466.0,restaurant
4,Cooler Runnings Jamaican Restaurant,6.4,2.0,40.894705,-73.847201,Bronx,10466.0,restaurant


In [193]:
gp.drop(columns=['Reviews','Neighbourhood','Address'], inplace=True)

In [194]:
gp = gp[['Name','Rating','Price','Latitude','Longitude','Borough','Zipcode','Category']]
gp.head()

Unnamed: 0,Name,Rating,Price,Latitude,Longitude,Borough,Zipcode,Category
0,Ripe Kitchen & Bar,4.3,2.0,40.898209,-73.838855,Mount Vernon,,restaurant
1,New China Garden,3.9,1.0,40.897919,-73.853364,Bronx,10466.0,restaurant
2,Dunkin',3.8,1.0,40.890459,-73.849089,Bronx,10466.0,restaurant
3,Subway,3.6,1.0,40.890468,-73.849152,Bronx,10466.0,restaurant
4,Popeyes Louisiana Kitchen,3.8,1.0,40.889492,-73.843383,Bronx,10466.0,restaurant


In [195]:
full_poi = pd.concat([gp, yelp, fsq], axis=0)
print(full_poi.shape)
full_poi.head()

(29876, 8)


Unnamed: 0,Name,Rating,Price,Latitude,Longitude,Borough,Zipcode,Category
0,Ripe Kitchen & Bar,4.3,2.0,40.898209,-73.838855,Mount Vernon,,restaurant
1,New China Garden,3.9,1.0,40.897919,-73.853364,Bronx,10466.0,restaurant
2,Dunkin',3.8,1.0,40.890459,-73.849089,Bronx,10466.0,restaurant
3,Subway,3.6,1.0,40.890468,-73.849152,Bronx,10466.0,restaurant
4,Popeyes Louisiana Kitchen,3.8,1.0,40.889492,-73.843383,Bronx,10466.0,restaurant


In [196]:
full_poi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29876 entries, 0 to 7000
Data columns (total 8 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Name       29874 non-null  object 
 1   Rating     22609 non-null  float64
 2   Price      15988 non-null  object 
 3   Latitude   29876 non-null  float64
 4   Longitude  29876 non-null  float64
 5   Borough    29652 non-null  object 
 6   Zipcode    29597 non-null  float64
 7   Category   29876 non-null  object 
dtypes: float64(4), object(4)
memory usage: 2.1+ MB


In [197]:
full_poi.isnull().sum()

Name             2
Rating        7267
Price        13888
Latitude         0
Longitude        0
Borough        224
Zipcode        279
Category         0
dtype: int64

In [198]:
len(full_poi[full_poi.duplicated()])

2654

In [199]:
full_poi.describe()

Unnamed: 0,Rating,Latitude,Longitude,Zipcode
count,22609.0,29876.0,29876.0,29597.0
mean,4.824021,40.71287,-73.935804,10803.456735
std,1.695686,0.092993,0.110578,542.766539
min,1.0,40.205016,-74.448224,7047.0
25%,4.0,40.635703,-73.989455,10308.0
50%,4.3,40.713724,-73.929113,11101.0
75%,5.0,40.773529,-73.858855,11235.0
max,9.6,40.916627,-73.100648,12234.0


In [200]:
full_poi = full_poi.drop_duplicates()
full_poi.head()

Unnamed: 0,Name,Rating,Price,Latitude,Longitude,Borough,Zipcode,Category
0,Ripe Kitchen & Bar,4.3,2.0,40.898209,-73.838855,Mount Vernon,,restaurant
1,New China Garden,3.9,1.0,40.897919,-73.853364,Bronx,10466.0,restaurant
2,Dunkin',3.8,1.0,40.890459,-73.849089,Bronx,10466.0,restaurant
3,Subway,3.6,1.0,40.890468,-73.849152,Bronx,10466.0,restaurant
4,Popeyes Louisiana Kitchen,3.8,1.0,40.889492,-73.843383,Bronx,10466.0,restaurant


In [201]:
full_poi['Price'] = full_poi['Price'].replace('$',1.0)
full_poi['Price'] = full_poi['Price'].replace('$$',2.0)
full_poi['Price'] = full_poi['Price'].replace('$$$',3.0)
full_poi['Price'] = full_poi['Price'].replace('$$$$',4.0)

In [202]:
full_poi['Price'].unique()

array([ 2.,  1., nan,  3.,  4.,  0.])

In [203]:
full_poi['Rating'].unique()

array([4.3, 3.9, 3.8, 3.6, 3.3, 4.2, 3.7, 4.1, 2.2, 1. , 4. , 4.5, 5. ,
       4.6, 2.9, 4.4, 4.7, 3. , 3.4, 2.4, 3.2, 2.5, 3.5, 2.7, 3.1, 2.8,
       4.9, nan, 2. , 2.6, 2.1, 4.8, 2.3, 1.9, 1.8, 1.5, 1.6, 1.4, 1.7,
       8.6, 8.4, 8.1, 8. , 6.4, 6.7, 6.6, 5.8, 5.7, 7.9, 7.6, 7.3, 6.8,
       6.3, 5.9, 6.1, 8.3, 8.2, 7.5, 7.7, 7. , 7.2, 6.9, 6. , 6.5, 5.6,
       8.8, 8.9, 8.7, 7.8, 7.1, 6.2, 5.3, 8.5, 9.2, 9. , 5.4, 7.4, 5.2,
       9.1, 5.5, 9.3, 9.4, 9.5, 9.6, 5.1])

In [215]:
from sklearn.preprocessing import MinMaxScaler

# Rescale price
scaler = MinMaxScaler((0,5))
full_poi['Price'] = pd.DataFrame(scaler.fit_transform(full_poi['Price'].values.reshape(-1,1)))

# Rescale rating
full_poi['Rating'] = pd.DataFrame(scaler.fit_transform(full_poi['Rating'].values.reshape(-1,1)))
full_poi.head()

Unnamed: 0,Name,Price,Rating,Reviews,Borough,Neighbourhood,Address,Zipcode,Latitude,Longitude,Category,Distance (m)
0,Ripe Kitchen & Bar,2.5,4.125,714.0,Mount Vernon,,"151 West Sandford Boulevard, Mount Vernon",,40.898209,-73.838855,restaurant,
1,New China Garden,1.25,3.625,85.0,Bronx,Northeast Bronx,"724 Nereid Avenue, The Bronx",10466.0,40.897919,-73.853364,restaurant,
2,Dunkin',1.25,3.5,280.0,Bronx,Northeast Bronx,"980 East 233rd Street, The Bronx",10466.0,40.890459,-73.849089,restaurant,
3,Subway,1.25,3.25,121.0,Bronx,Northeast Bronx,"980 East 233rd Street, The Bronx",10466.0,40.890468,-73.849152,restaurant,
4,Popeyes Louisiana Kitchen,1.25,3.5,467.0,Bronx,Northeast Bronx,"1201 East 233rd Street, The Bronx",10466.0,40.889492,-73.843383,restaurant,


### Full Outer Join

In [205]:
# Load POI datasets
gp = pd.read_csv('clean_data/gp_poi_clean.csv',)
yelp = pd.read_csv('clean_data/yelp_poi_cleaned.csv')
fsq = pd.read_csv('clean_data/fsq_poi_clean.csv')

In [206]:
full_poi = pd.concat([gp, yelp, fsq], axis=0)
print(full_poi.shape)
full_poi.head()

(29876, 14)


Unnamed: 0.1,Name,Price,Rating,Reviews,Borough,Neighbourhood,Address,Zipcode,Latitude,Longitude,Category,Unnamed: 0,Distance (m),Neighborhood
0,Ripe Kitchen & Bar,2.0,4.3,714.0,Mount Vernon,,"151 West Sandford Boulevard, Mount Vernon",,40.898209,-73.838855,restaurant,,,
1,New China Garden,1.0,3.9,85.0,Bronx,Northeast Bronx,"724 Nereid Avenue, The Bronx",10466.0,40.897919,-73.853364,restaurant,,,
2,Dunkin',1.0,3.8,280.0,Bronx,Northeast Bronx,"980 East 233rd Street, The Bronx",10466.0,40.890459,-73.849089,restaurant,,,
3,Subway,1.0,3.6,121.0,Bronx,Northeast Bronx,"980 East 233rd Street, The Bronx",10466.0,40.890468,-73.849152,restaurant,,,
4,Popeyes Louisiana Kitchen,1.0,3.8,467.0,Bronx,Northeast Bronx,"1201 East 233rd Street, The Bronx",10466.0,40.889492,-73.843383,restaurant,,,


In [207]:
full_poi.drop(columns=['Unnamed: 0','Neighborhood'], inplace=True)

In [208]:
full_poi.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29876 entries, 0 to 7000
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           29874 non-null  object 
 1   Price          15988 non-null  object 
 2   Rating         22609 non-null  float64
 3   Reviews        17924 non-null  float64
 4   Borough        29652 non-null  object 
 5   Neighbourhood  14979 non-null  object 
 6   Address        22707 non-null  object 
 7   Zipcode        29597 non-null  float64
 8   Latitude       29876 non-null  float64
 9   Longitude      29876 non-null  float64
 10  Category       29876 non-null  object 
 11  Distance (m)   14114 non-null  float64
dtypes: float64(6), object(6)
memory usage: 3.0+ MB


In [209]:
full_poi.isnull().sum()

Name                 2
Price            13888
Rating            7267
Reviews          11952
Borough            224
Neighbourhood    14897
Address           7169
Zipcode            279
Latitude             0
Longitude            0
Category             0
Distance (m)     15762
dtype: int64

In [210]:
len(full_poi[full_poi.duplicated(subset=['Name','Address'])])

5120

In [211]:
full_poi.describe()

Unnamed: 0,Rating,Reviews,Zipcode,Latitude,Longitude,Distance (m)
count,22609.0,17924.0,29597.0,29876.0,29876.0,14114.0
mean,4.824021,386.779235,10803.456735,40.71287,-73.935804,1078.935906
std,1.695686,2194.796764,542.766539,0.092993,0.110578,3114.896095
min,1.0,1.0,7047.0,40.205016,-74.448224,3.0
25%,4.0,16.0,10308.0,40.635703,-73.989455,398.0
50%,4.3,89.0,11101.0,40.713724,-73.929113,630.0
75%,5.0,344.0,11235.0,40.773529,-73.858855,855.501531
max,9.6,244164.0,12234.0,40.916627,-73.100648,77147.675762


In [212]:
full_poi = full_poi.drop_duplicates(subset=['Name','Address'])

In [213]:
full_poi['Price'] = full_poi['Price'].replace('$',1.0)
full_poi['Price'] = full_poi['Price'].replace('$$',2.0)
full_poi['Price'] = full_poi['Price'].replace('$$$',3.0)
full_poi['Price'] = full_poi['Price'].replace('$$$$',4.0)

In [214]:
# Rescale price
scaler = MinMaxScaler((0,5))
full_poi['Price'] = pd.DataFrame(scaler.fit_transform(full_poi['Price'].values.reshape(-1,1)))

# Rescale rating
full_poi['Rating'] = pd.DataFrame(scaler.fit_transform(full_poi['Rating'].values.reshape(-1,1)))
full_poi.head()

Unnamed: 0,Name,Price,Rating,Reviews,Borough,Neighbourhood,Address,Zipcode,Latitude,Longitude,Category,Distance (m)
0,Ripe Kitchen & Bar,2.5,1.918605,714.0,Mount Vernon,,"151 West Sandford Boulevard, Mount Vernon",,40.898209,-73.838855,restaurant,
1,New China Garden,1.25,1.686047,85.0,Bronx,Northeast Bronx,"724 Nereid Avenue, The Bronx",10466.0,40.897919,-73.853364,restaurant,
2,Dunkin',1.25,1.627907,280.0,Bronx,Northeast Bronx,"980 East 233rd Street, The Bronx",10466.0,40.890459,-73.849089,restaurant,
3,Subway,1.25,1.511628,121.0,Bronx,Northeast Bronx,"980 East 233rd Street, The Bronx",10466.0,40.890468,-73.849152,restaurant,
4,Popeyes Louisiana Kitchen,1.25,1.627907,467.0,Bronx,Northeast Bronx,"1201 East 233rd Street, The Bronx",10466.0,40.889492,-73.843383,restaurant,
