## Data Preprocessing for New York City 

In [None]:
import pandas as pd
import numpy as np
import json

!pip install geopy
from geopy.geocoders import Nominatim

import requests
!pip install folium
import folium

import json

### New-York Location json file download

In [None]:
import urllib.request
url = 'https://cocl.us/new_york_dataset'
filename = 'nyu_2451_34572-geojson.json'
urllib.request.urlretrieve(url, filename)

### Using NewYork JSON data

In [2]:
with open('nyu_2451_34572-geojson.json') as json_data:
    newyork_data=json.load(json_data)

In [3]:
NY_nbhood_data=newyork_data['features']
NY_nbhood_data[0]

{'geometry': {'coordinates': [-73.84720052054902, 40.89470517661],
  'type': 'Point'},
 'geometry_name': 'geom',
 'id': 'nyu_2451_34572.1',
 'properties': {'annoangle': 0.0,
  'annoline1': 'Wakefield',
  'annoline2': None,
  'annoline3': None,
  'bbox': [-73.84720052054902,
   40.89470517661,
   -73.84720052054902,
   40.89470517661],
  'borough': 'Bronx',
  'name': 'Wakefield',
  'stacked': 1},
 'type': 'Feature'}

In [4]:
NY_headers=['Borough','Neighborhood','Latitude','Longitude'] 
NY_nbhd_df=pd.DataFrame(columns=NY_headers)
NY_nbhd_df #Empty DataFrame created with column headers

Unnamed: 0,Borough,Neighborhood,Latitude,Longitude


In [5]:
#Pushing json data to DataFrame 
for data in NY_nbhood_data:
    borough=data["properties"]["borough"]
    neighborhood=data["properties"]["name"]
    nbhd_lat=data["geometry"]["coordinates"][1]
    nbhd_long=data["geometry"]["coordinates"][0]

    NY_nbhd_df=NY_nbhd_df.append({"Borough":borough,
                                  "Neighborhood":neighborhood,
                                  "Latitude":nbhd_lat,
                                  "Longitude":nbhd_long},ignore_index=True)


In [6]:
NY_nbhd_df.to_csv('NY_Neighborhood_Locations.csv', index=None, header=True, encoding='utf8') #to convert to .csv file format

First Dataset created! `NY_Neighborhood_Locations.csv`

### Generating a map to show the Neighborhood Locations

In [7]:
address='New York City, NY'

geolocator=Nominatim(user_agent='NY_app') #Nominatim highly discourages using default 'user_agent'
location=geolocator.geocode(address)
latitude=location.latitude
longitude=location.longitude

print('The coordinates of New York is {},{}'.format(latitude,longitude)) #

The coordinates of New York is 40.7127281,-74.0060152


In [8]:
NYmap=folium.Map(location=[latitude,longitude], zoom_start=12,tiles='OpenStreetMap')
for lat,long,borough,neighborhood in zip(NY_nbhd_df['Latitude'],NY_nbhd_df['Longitude'],
                                         NY_nbhd_df['Borough'],NY_nbhd_df['Neighborhood']):
    location=[lat,long]
    label='Place:{},{}'.format(borough,neighborhood)
    label = folium.Popup(label, parse_html = True)
    folium.CircleMarker(location=[lat,long],
                       radius=5,
                       popup=label,
                       color='red',
                       fill=True,
                       fill_color='#ccc431',
                       fill_opacity=0.7,
                       parse_html=False).add_to(NYmap)
NYmap

### Foursquare API feature to extract Venues near Neighborhoods

In [9]:
CLIENT_ID ='...'      #Foursquare Client ID
CLIENT_SECRET ='...'  #Foursquare Client secret
VERSION ='20180605'   #Foursquare Version

In [14]:
LIMIT = 200
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    venues=[]
    for name, lat, long in zip(names, latitudes, longitudes):
        print(name)
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            long, 
            radius, 
            LIMIT)
        results = requests.get(url).json()["response"]['groups'][0]['items']
        venues.append([(
            name, 
            lat, 
            long, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])
    
    nearby_venues=pd.DataFrame([item for venue in venues for item in venue])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [15]:
NY_venues = getNearbyVenues(names = NY_nbhd_df['Neighborhood'], latitudes = NY_nbhd_df['Latitude'], longitudes = NY_nbhd_df['Longitude'])

Wakefield
Co-op City
Eastchester
Fieldston
Riverdale
Kingsbridge
Marble Hill
Woodlawn
Norwood
Williamsbridge
Baychester
Pelham Parkway
City Island
Bedford Park
University Heights
Morris Heights
Fordham
East Tremont
West Farms
High  Bridge
Melrose
Mott Haven
Port Morris
Longwood
Hunts Point
Morrisania
Soundview
Clason Point
Throgs Neck
Country Club
Parkchester
Westchester Square
Van Nest
Morris Park
Belmont
Spuyten Duyvil
North Riverdale
Pelham Bay
Schuylerville
Edgewater Park
Castle Hill
Olinville
Pelham Gardens
Concourse
Unionport
Edenwald
Bay Ridge
Bensonhurst
Sunset Park
Greenpoint
Gravesend
Brighton Beach
Sheepshead Bay
Manhattan Terrace
Flatbush
Crown Heights
East Flatbush
Kensington
Windsor Terrace
Prospect Heights
Brownsville
Williamsburg
Bushwick
Bedford Stuyvesant
Brooklyn Heights
Cobble Hill
Carroll Gardens
Red Hook
Gowanus
Fort Greene
Park Slope
Cypress Hills
East New York
Starrett City
Canarsie
Flatlands
Mill Island
Manhattan Beach
Coney Island
Bath Beach
Borough Park
Dyker

In [17]:
NY_venues.head(20)

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Wakefield,40.894705,-73.847201,Lollipops Gelato,40.894123,-73.845892,Dessert Shop
1,Wakefield,40.894705,-73.847201,Rite Aid,40.896649,-73.844846,Pharmacy
2,Wakefield,40.894705,-73.847201,Carvel Ice Cream,40.890487,-73.848568,Ice Cream Shop
3,Wakefield,40.894705,-73.847201,Walgreens,40.896687,-73.84485,Pharmacy
4,Wakefield,40.894705,-73.847201,Dunkin',40.890459,-73.849089,Donut Shop
5,Wakefield,40.894705,-73.847201,Shell,40.894187,-73.845862,Gas Station
6,Wakefield,40.894705,-73.847201,Cooler Runnings Jamaican Restaurant Inc,40.898083,-73.850259,Caribbean Restaurant
7,Wakefield,40.894705,-73.847201,SUBWAY,40.890468,-73.849152,Sandwich Place
8,Wakefield,40.894705,-73.847201,Koss Quick Wash,40.891281,-73.849904,Laundromat
9,Co-op City,40.874294,-73.829939,Dollar Tree,40.870125,-73.828989,Discount Store


In [18]:
NY_venues.groupby('Neighborhood').count()

Unnamed: 0_level_0,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Allerton,30,30,30,30,30,30
Annadale,11,11,11,11,11,11
Arden Heights,5,5,5,5,5,5
Arlington,5,5,5,5,5,5
Arrochar,20,20,20,20,20,20
Arverne,18,18,18,18,18,18
Astoria,100,100,100,100,100,100
Astoria Heights,14,14,14,14,14,14
Auburndale,19,19,19,19,19,19
Bath Beach,50,50,50,50,50,50


In [19]:
print('There are {} unique categories.'.format(len(NY_venues['Venue Category'].unique())))

There are 431 unique categories.


In [20]:
NY_onehot = pd.get_dummies(NY_venues[['Venue Category']], prefix = "", prefix_sep = "")

NY_onehot['Neighborhood'] = NY_venues['Neighborhood']

col_list = list(NY_onehot.columns)
col_list.remove('Neighborhood')
cols = ['Neighborhood'] + [col for col in col_list]

NY_onehot = NY_onehot[cols]

NY_onehot.head()

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,...,Warehouse Store,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Wakefield,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
NY_onehot.shape

(10263, 431)

In [22]:
NY_grouped = NY_onehot.groupby('Neighborhood').mean().reset_index()
NY_grouped

Unnamed: 0,Neighborhood,Accessories Store,Adult Boutique,Afghan Restaurant,African Restaurant,Airport Terminal,American Restaurant,Antique Shop,Arcade,Arepa Restaurant,...,Warehouse Store,Waste Facility,Waterfront,Weight Loss Center,Whisky Bar,Wine Bar,Wine Shop,Wings Joint,Women's Store,Yoga Studio
0,Allerton,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
1,Annadale,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,Arden Heights,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,Arlington,0.000000,0.0,0.000000,0.000000,0.0,0.200000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,Arrochar,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
5,Arverne,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.055556,0.000000,0.000000,0.000000
6,Astoria,0.000000,0.0,0.000000,0.000000,0.0,0.010000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.010000,0.000000,0.000000,0.000000
7,Astoria Heights,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
8,Auburndale,0.000000,0.0,0.000000,0.000000,0.0,0.052632,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
9,Bath Beach,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [23]:
NY_grouped.shape

(300, 431)

In [25]:
export_csv = NY_grouped.to_csv('NY_grouped.csv', index = None, header=True, encoding = 'utf8')

Another Dataset is ready! `NY_grouped.csv`