# Data Preparation

### Task
Parse the JSON file **nyc_geo.json** into a dataframe with the following columns:
- Borough
- Neighborhood
- Latitude
- Longitude

### Import Modules

In [17]:
# import modules
import pandas as pd
from pandas import json_normalize
import json
import os
import requests

### Import Dataset

In [18]:
# Read the raw JSON file into a Python dict.
# The encoding is passed explicitly so parsing does not depend on the platform's
# default locale encoding (JSON files are conventionally UTF-8).
with open('nyc_geo.json', encoding='utf-8') as f:
    nyc_json = json.load(f)

# Flatten only the top level to get an overview: nested dicts become dot-separated
# columns, while the 'features' list stays packed inside a single cell
pd.json_normalize(nyc_json)

Unnamed: 0,type,totalFeatures,features,bbox,crs.type,crs.properties.name
0,FeatureCollection,306,"[{'type': 'Feature', 'id': 'nyu_2451_34572.1',...","[-74.2492599487305, 40.5033187866211, -73.7061...",name,urn:ogc:def:crs:EPSG::4326


In [19]:
# Inspect the first feature to learn the record layout: the fields needed for the task
# (name, borough, coordinates) are nested inside each entry of the 'features' list
nyc_json['features'][0]

{'type': 'Feature',
 'id': 'nyu_2451_34572.1',
 'geometry': {'type': 'Point',
  'coordinates': [-73.84720052054902, 40.89470517661]},
 'geometry_name': 'geom',
 'properties': {'name': 'Wakefield',
  'stacked': 1,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'annoangle': 0.0,
  'borough': 'Bronx',
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661]}}

In [20]:
# Expand the 'features' list into one row per feature; nested dicts become
# dot-separated columns (e.g. properties.name, geometry.coordinates)
nyc = pd.json_normalize(nyc_json, record_path='features')

# Examine the nyc dataframe.
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() only appends a stray "None" line.
nyc.info()
nyc.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306 entries, 0 to 305
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  306 non-null    object 
 1   id                    306 non-null    object 
 2   geometry_name         306 non-null    object 
 3   geometry.type         306 non-null    object 
 4   geometry.coordinates  306 non-null    object 
 5   properties.name       306 non-null    object 
 6   properties.stacked    306 non-null    int64  
 7   properties.annoline1  306 non-null    object 
 8   properties.annoline2  163 non-null    object 
 9   properties.annoline3  9 non-null      object 
 10  properties.annoangle  306 non-null    float64
 11  properties.borough    306 non-null    object 
 12  properties.bbox       306 non-null    object 
dtypes: float64(1), int64(1), object(11)
memory usage: 31.2+ KB
None


Unnamed: 0,type,id,geometry_name,geometry.type,geometry.coordinates,properties.name,properties.stacked,properties.annoline1,properties.annoline2,properties.annoline3,properties.annoangle,properties.borough,properties.bbox
0,Feature,nyu_2451_34572.1,geom,Point,"[-73.84720052054902, 40.89470517661]",Wakefield,1,Wakefield,,,0.0,Bronx,"[-73.84720052054902, 40.89470517661, -73.84720..."
1,Feature,nyu_2451_34572.2,geom,Point,"[-73.82993910812398, 40.87429419303012]",Co-op City,2,Co-op,City,,0.0,Bronx,"[-73.82993910812398, 40.87429419303012, -73.82..."
2,Feature,nyu_2451_34572.3,geom,Point,"[-73.82780644716412, 40.887555677350775]",Eastchester,1,Eastchester,,,0.0,Bronx,"[-73.82780644716412, 40.887555677350775, -73.8..."
3,Feature,nyu_2451_34572.4,geom,Point,"[-73.90564259591682, 40.89543742690383]",Fieldston,1,Fieldston,,,0.0,Bronx,"[-73.90564259591682, 40.89543742690383, -73.90..."
4,Feature,nyu_2451_34572.5,geom,Point,"[-73.9125854610857, 40.890834493891305]",Riverdale,1,Riverdale,,,0.0,Bronx,"[-73.9125854610857, 40.890834493891305, -73.91..."


### Data Cleaning

In [21]:
# Every geometry.coordinates cell holds a [longitude, latitude] list; expand the lists
# into a two-column frame so each value gets a column of its own
coords = pd.DataFrame(nyc['geometry.coordinates'].to_list(), columns=['long', 'lat'])

# properties.bbox only repeats the same coordinates twice, and geometry.coordinates
# has just been split into `coords`, so both list-valued columns are removed here
nyc_drop = nyc.drop(['geometry.coordinates', 'properties.bbox'], axis=1)

In [22]:
# check for unique values

def unique_values(df):
    """Print the distinct values found in every column of ``df``.

    For each column, in column order, this prints the column name, the number
    of distinct values, the distinct values themselves and a separator line.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are summarised.

    Returns
    -------
    None
        The summary is printed to stdout; nothing is returned.
    """
    for col in df.columns:
        # Compute the distinct values once and reuse them for the count and the listing
        distinct = df[col].unique()
        print('Column Name:', col)
        print('# of Unique Values:', len(distinct))
        print('Unique Values:', distinct)
        print('==============================================================')
    
# Print the per-column summary for nyc_drop (the frame without the two list-valued columns)
unique_values(nyc_drop)

Column Name: type
# of Unique Values: 1
Unique Values: ['Feature']
Column Name: id
# of Unique Values: 306
Unique Values: ['nyu_2451_34572.1' 'nyu_2451_34572.2' 'nyu_2451_34572.3'
 'nyu_2451_34572.4' 'nyu_2451_34572.5' 'nyu_2451_34572.6'
 'nyu_2451_34572.7' 'nyu_2451_34572.8' 'nyu_2451_34572.9'
 'nyu_2451_34572.10' 'nyu_2451_34572.11' 'nyu_2451_34572.12'
 'nyu_2451_34572.13' 'nyu_2451_34572.14' 'nyu_2451_34572.15'
 'nyu_2451_34572.16' 'nyu_2451_34572.17' 'nyu_2451_34572.18'
 'nyu_2451_34572.19' 'nyu_2451_34572.20' 'nyu_2451_34572.21'
 'nyu_2451_34572.22' 'nyu_2451_34572.23' 'nyu_2451_34572.24'
 'nyu_2451_34572.25' 'nyu_2451_34572.26' 'nyu_2451_34572.27'
 'nyu_2451_34572.28' 'nyu_2451_34572.29' 'nyu_2451_34572.30'
 'nyu_2451_34572.31' 'nyu_2451_34572.32' 'nyu_2451_34572.33'
 'nyu_2451_34572.34' 'nyu_2451_34572.35' 'nyu_2451_34572.36'
 'nyu_2451_34572.37' 'nyu_2451_34572.38' 'nyu_2451_34572.39'
 'nyu_2451_34572.40' 'nyu_2451_34572.41' 'nyu_2451_34572.42'
 'nyu_2451_34572.43' 'nyu_2451_34

In [23]:
# Remove every column that is of no use for the task, judged from the unique-value
# check and the assignment criteria
nyc_drop2 = nyc_drop.drop(
    columns=[
        'id',
        'type',
        'geometry_name',
        'geometry.type',
        'properties.stacked',
        'properties.annoline1',
        'properties.annoline2',
        'properties.annoline3',
        'properties.annoangle',
    ]
)

In [24]:
# Display the trimmed dataframe built in the previous cell instead of repeating
# the identical drop() call just to look at its result
nyc_drop2

Unnamed: 0,properties.name,properties.borough
0,Wakefield,Bronx
1,Co-op City,Bronx
2,Eastchester,Bronx
3,Fieldston,Bronx
4,Riverdale,Bronx
...,...,...
301,Hudson Yards,Manhattan
302,Hammels,Queens
303,Bayswater,Queens
304,Queensbridge,Queens


In [25]:
# Attach the long/lat columns to the name/borough columns; join() lines the two
# frames up on their row index (left join, which is join()'s default)
nyc_complete = nyc_drop2.join(coords, how='left')
nyc_complete

Unnamed: 0,properties.name,properties.borough,long,lat
0,Wakefield,Bronx,-73.847201,40.894705
1,Co-op City,Bronx,-73.829939,40.874294
2,Eastchester,Bronx,-73.827806,40.887556
3,Fieldston,Bronx,-73.905643,40.895437
4,Riverdale,Bronx,-73.912585,40.890834
...,...,...,...,...
301,Hudson Yards,Manhattan,-74.000111,40.756658
302,Hammels,Queens,-73.805530,40.587338
303,Bayswater,Queens,-73.765968,40.611322
304,Queensbridge,Queens,-73.945631,40.756091


In [26]:
# Count the missing values in each column
nyc_complete.isna().sum()

# every count is 0, so there are no null values

properties.name       0
properties.borough    0
long                  0
lat                   0
dtype: int64

In [27]:
# Show any row that is an exact repeat of an earlier row
nyc_complete.loc[nyc_complete.duplicated()]

# the result is empty, so there are no duplicate rows

Unnamed: 0,properties.name,properties.borough,long,lat


In [28]:
# Pick the four required columns in their final order and rename them through an
# explicit old-name -> new-name mapping.
# NOTE(review): the task statement spells the column "Neighborhood"; the
# "Neighbourhood" spelling is kept unchanged here - confirm which spelling the
# files that consume this table expect.
nyc_clean = nyc_complete[['properties.borough', 'properties.name', 'lat', 'long']].rename(
    columns={
        'properties.borough': 'Borough',
        'properties.name': 'Neighbourhood',
        'lat': 'Latitude',
        'long': 'Longitude',
    }
)


In [29]:
# Preview the cleaned dataset: first five rows with the final column names and order
nyc_clean.head()

Unnamed: 0,Borough,Neighbourhood,Latitude,Longitude
0,Bronx,Wakefield,40.894705,-73.847201
1,Bronx,Co-op City,40.874294,-73.829939
2,Bronx,Eastchester,40.887556,-73.827806
3,Bronx,Fieldston,40.895437,-73.905643
4,Bronx,Riverdale,40.890834,-73.912585


In [30]:
# Make sure the output folder exists first: to_csv() does not create missing directories
os.makedirs('clean_data', exist_ok=True)

# index=False keeps the row index out of the written file
nyc_clean.to_csv('clean_data/nyc_geo_cleaned.csv', index=False)

### Task
Use different data sources and APIs to collect information about the neighborhoods that can be used for segmentation.

* api_helper_functions
* api_pulls (fsq, yelp)
* google_places_parsing
* google_places_data_cleaning
* rat_data_cleaning
* uber_data_cleaning

### Task

Visualize the neighborhoods with graphs.

In [31]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import plotly.express as px
%matplotlib inline

In [32]:
# Load the merged places-of-interest (POI) table and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column whose values (0, 2, 4, 6, 7)
# look like a leftover row index - presumably the CSV was written with its index;
# confirm and drop it where the file is produced.
poi = pd.read_csv('clean_data/merged_pois_cleaned.csv')
poi.head()

Unnamed: 0.1,Unnamed: 0,Distance (m),Name,Neighbourhood,Zipcode,Reviews,Rating,Price,Latitude,Longitude,Borough,Category
0,0,127.0,Lollipops Gelato,Northeast Bronx,10466.0,,4.215686,1.0,40.894705,-73.847201,Bronx,restaurant
1,2,821.0,Ali's Roti Shop,Northeast Bronx,10466.0,545.0,4.0,1.0,40.894,-73.85684,Bronx,restaurant
2,4,983.0,Jimbo's Hamburger Palace,Northeast Bronx,10466.0,24.0,3.0,1.0,40.891804,-73.858604,Bronx,restaurant
3,6,454.0,Cooler Runnings Jamaican Restaurant,Northeast Bronx,10466.0,50.0,3.0,1.0,40.898157,-73.85029,Bronx,restaurant
4,7,902.0,McDonald's,Northeast Bronx,10470.0,,2.72549,1.0,40.894705,-73.847201,Bronx,restaurant


In [33]:
# List the distinct borough names present in the POI data
poi['Borough'].unique()

array(['Bronx', 'Manhattan', 'Brooklyn', 'Queens', 'Staten Island'],
      dtype=object)

#### [Scatter Plot](https://plotly.com/python/line-and-scatter/) - Average Rating/Price for POIs by Borough

In [34]:
# Build a per-borough table with the mean rating and mean price of the places of interest.
# The grouping key is given by column name (not as a Series object), so pandas always
# treats 'Borough' as the key and never as a column to average. as_index=False returns
# the key as a regular column.

# Mean price per borough, kept available under its original name
avg_price = poi.groupby('Borough', as_index=False)['Price'].mean()

# Both means come from one groupby, so rating and price are matched by borough rather
# than by row position; the rename uses an explicit mapping instead of column order
borough_avg = (
    poi.groupby('Borough', as_index=False)[['Rating', 'Price']]
    .mean()
    .rename(columns={'Rating': 'Average Rating', 'Price': 'Average Price'})
)
borough_avg

Unnamed: 0,Borough,Average Rating,Average Price
0,Bronx,3.886331,1.460526
1,Brooklyn,4.017779,1.647059
2,Manhattan,4.168687,2.013906
3,Queens,3.918687,1.576894
4,Staten Island,3.88129,1.582027


In [35]:
# Scatter plot of the borough averages: average price on x, average rating on y,
# one colour per borough
fig = px.scatter(
    borough_avg,
    x='Average Price',
    y='Average Rating',
    color='Borough',
    title="Average Rating/Prices for POI\'s by Boroughs",
    width=800,
    height=800,
)

# Enlarge the points and give them a dark outline (applied to traces in 'markers' mode)
fig.update_traces(
    marker={'size': 20, 'line': {'width': 2, 'color': 'DarkSlateGrey'}},
    selector={'mode': 'markers'},
)

# Tighten the margins and use a white page background
fig.update_layout(
    margin={'l': 5, 'r': 5, 't': 35, 'b': 5},
    paper_bgcolor="White",
)

# Render the figure (axes are autoscaled)
fig.show()

In [36]:
# Count the places of interest for every (borough, category) pair, then pivot the
# categories into columns; pairs that never occur are filled with 0
categories = (
    poi.groupby(['Borough', 'Category'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)
categories


Category,Borough,park,restaurant,school,supermarket,transit_station
0,Bronx,409,1738,591,256,687
1,Brooklyn,618,2681,958,464,977
2,Manhattan,490,1583,460,272,541
3,Queens,531,2982,1061,340,1097
4,Staten Island,159,1256,373,42,685


#### [Vertical Stacked Bar Chart](https://plotly.com/python/bar-charts/) - Places of Interest by Borough

In [37]:
# Category columns of `categories` that are plotted as bar segments
y_labels = ['park', 'restaurant', 'school', 'supermarket', 'transit_station']

# Bar chart with one bar per borough and one segment per category
fig = px.bar(
    categories,
    x='Borough',
    y=y_labels,
    labels={'value': 'Count of POI\'s', 'variable': 'Place of Interest'},
    title='Places of Interest by Borough',
    width=700,
    height=750,
)

# Outline every bar segment
fig.update_traces(marker_line_color='DarkSlateGrey', marker_line_width=1.5)

# Tighten the margins and use a white page background
fig.update_layout(
    margin={'l': 5, 'r': 5, 't': 35, 'b': 5},
    paper_bgcolor="White",
)

# Render the figure (axes are autoscaled)
fig.show()