# Capstone Project- The Battle of the Neighborhoods

In [138]:
# Importing packages
print('Importing libraries...')
import numpy as np
import pandas as pd
import json
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values
import requests 
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe
import matplotlib.cm as cm
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import folium
print('Libraries imported! :D')

Importing libraries...
Libraries imported! :D


In [139]:
#Importing cities dataset
!wget -q -O 'cities.csv' https://public.opendatasoft.com/explore/dataset/1000-largest-us-cities-by-population-with-geographic-coordinates/download/?format=csv&timezone=Asia/Shanghai&lang=en&use_labels_for_header=true&csv_separator=%3B
print('Data downloaded!')

Data downloaded!


In [140]:
with open('cities.csv') as cities_data:
    cities= pd.read_csv(cities_data, sep=';')
cities.head(5)

Unnamed: 0,City,Rank,State,Growth From 2000 to 2013,Population,Coordinates
0,Marysville,552,Washington,115.7,63269,"48.0517637,-122.1770818"
1,Perris,466,California,98.7,72326,"33.7825194,-117.2286478"
2,Cleveland,48,Ohio,-18.1,390113,"41.49932,-81.6943605"
3,Worcester,129,Massachusetts,5.8,182544,"42.2625932,-71.8022934"
4,Columbia,192,South Carolina,11.7,133358,"34.0007104,-81.0348144"


In [141]:
#Splitting Coordinates column
lat = []
lon = []

# For each row in a varible,
for row in cities['Coordinates']:
    # Try to,
    try:
        # Split the row by comma and append
        # everything before the comma to lat
        lat.append(row.split(',')[0])
        # Split the row by comma and append
        # everything after the comma to lon
        lon.append(row.split(',')[1])
    # But if you get an error
    except:
        # append a missing value to lat
        lat.append(np.NaN)
        # append a missing value to lon
        lon.append(np.NaN)

# Create two new columns from lat and lon
cities['latitude'] = lat
cities['longitude'] = lon

cities.head(5)

Unnamed: 0,City,Rank,State,Growth From 2000 to 2013,Population,Coordinates,latitude,longitude
0,Marysville,552,Washington,115.7,63269,"48.0517637,-122.1770818",48.0517637,-122.1770818
1,Perris,466,California,98.7,72326,"33.7825194,-117.2286478",33.7825194,-117.2286478
2,Cleveland,48,Ohio,-18.1,390113,"41.49932,-81.6943605",41.49932,-81.6943605
3,Worcester,129,Massachusetts,5.8,182544,"42.2625932,-71.8022934",42.2625932,-71.8022934
4,Columbia,192,South Carolina,11.7,133358,"34.0007104,-81.0348144",34.0007104,-81.0348144


In [142]:
#Selecting relevent columns
cities.drop(['Rank', 'Growth From 2000 to 2013', 'Coordinates'], axis= 1, inplace= True)
cities

Unnamed: 0,City,State,Population,latitude,longitude
0,Marysville,Washington,63269,48.0517637,-122.1770818
1,Perris,California,72326,33.7825194,-117.2286478
2,Cleveland,Ohio,390113,41.49932,-81.6943605
3,Worcester,Massachusetts,182544,42.2625932,-71.8022934
4,Columbia,South Carolina,133358,34.0007104,-81.0348144
...,...,...,...,...,...
995,Santa Barbara,California,90412,34.4208305,-119.6981901
996,Addison,Illinois,37385,41.931696,-87.9889556
997,Rio Rancho,New Mexico,91956,35.2327544,-106.6630437
998,Normal,Illinois,54664,40.5142026,-88.9906312


In [143]:
#Checking data cleaning- Removing NAs and Duplicates
display(cities.isnull().any())
print('Are any rows duplicated? ' + str(cities.duplicated().any()))

City          False
State         False
Population    False
latitude      False
longitude     False
dtype: bool

Are any rows duplicated? False


In [144]:
#Checking Central location for United States
address = 'United States of America'

geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(address)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of U.S. are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of U.S. are 39.7837304, -100.4458825.


In [145]:
#Creating map showing all cities in consideration
map_us = folium.Map(location=[latitude, longitude], zoom_start=3)

# add markers to map
for lat, lng, city, state in zip(cities['latitude'], cities['longitude'], cities['City'], cities['State']):
    label = '{}, {}'.format(city, state)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_us)  
    
map_us

In [146]:
#Defining Foursquare Credentials
CLIENT_ID = 'S5CN3L51JZBRUYZVXCMZQOSAMNPOFYXY30PH32JLV0MK5SXT' # your Foursquare ID
CLIENT_SECRET = 'ENVGYBCT5JOAPJH2DD4LQD0ZIOZQYP3TPTUIYSBPYA4OFAU0' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: S5CN3L51JZBRUYZVXCMZQOSAMNPOFYXY30PH32JLV0MK5SXT
CLIENT_SECRET:ENVGYBCT5JOAPJH2DD4LQD0ZIOZQYP3TPTUIYSBPYA4OFAU0


In [147]:
#Defining function that will get all the services 
def getNearbyVenues(names, latitudes, longitudes, radius=25000, LIMIT=2000):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        try:
            results = requests.get(url).json()["response"]['groups'][0]['items']
        except KeyError:
            results= [np.nan, np.nan, np.nan, np.nan]
        
        # return only relevant information for each nearby venue
        try:
            venues_list.append([(
                name, 
                lat, 
                lng, 
                v['venue']['name'], 
                v['venue']['location']['lat'], 
                v['venue']['location']['lng'],  
                v['venue']['categories'][0]['name']) for v in results])
        except TypeError: 
            venues_list.append([(
            np.nan, np.nan, np.nan,np.nan,np.nan,np.nan,np.nan,)])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['City', 
                  'City Latitude', 
                  'City Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [159]:
us_venues = getNearbyVenues(names=cities['City'],
                                   latitudes=cities['latitude'],
                                   longitudes=cities['longitude'])

Marysville
Perris
Cleveland
Worcester
Columbia
Waterbury
Eagan
Southfield
Lafayette
Boise City
Elkhart
Buena Park
Marietta
Parker
West Palm Beach
Salem
Aurora
Leesburg
Doral
Westminster
Lubbock
Overland Park
Jackson
Gastonia
Daytona Beach
Kansas City
Noblesville
Southaven
Chino
Gaithersburg
Fountain Valley
Frederick
Greenville
DeKalb
West Allis
Thornton
West Des Moines
Fall River
Tulare
Chula Vista
Coral Springs
Concord
Smyrna
Apopka
Grand Forks
Lancaster
Naperville
Hollywood
Rogers
New Orleans
Lakewood
Oak Lawn
Caldwell
Portland
Concord
Burien
St. Charles
Waukesha
Flagstaff
Puyallup
Schenectady
Denver
Springfield
Campbell
Moreno Valley
Kettering
Lincoln Park
Lake Havasu City
North Little Rock
Reno
Littleton
Prescott
Dublin
Texarkana
Portland
Brooklyn Park
West Haven
El Centro
The Colony
Flint
Pittsburg
Buckeye
Brea
Indio
Scottsdale
Columbus
Lancaster
Lincoln
Trenton
Boston
Glendora
Rochester
Florissant
Annapolis
Chapel Hill
Lawrence
Calexico
Calumet City
Missoula
Clovis
Pearland
Gilbe

Mankato
New York
Simi Valley
West Covina
Castle Rock
Valdosta
Fort Worth
Carson
Evansville
Santa Maria
Brentwood
La Mesa
Westminster
Reading
Turlock
Greeley
Tyler
Winter Garden
Waco
Martinez
Thousand Oaks
Azusa
Bridgeport
St. Louis Park
Chesterfield
Greenacres
Pasco
Torrance
Madison
Janesville
Waukegan
Bowling Green
Arlington
Lewisville
Farmington
Oakley
Palo Alto
Springfield
New Bedford
Redmond
Brentwood
Davis
La Mirada
Smyrna
Altoona
Dothan
East Orange
Tinley Park
Biloxi
Urbana
Garden Grove
Lynwood
Northglenn
Canton
Brookhaven
Baton Rouge
Bristol
Independence
Lansing
Orland Park
Vineland
Green Bay
Evanston
Ocoee
Los Angeles
Federal Way
West Jordan
Yucaipa
Pittsburgh
Downey
Cicero
South Gate
Panama City
Joplin
League City
Salt Lake City
Fort Pierce
Phoenix
Corvallis
Cedar Hill
Chicopee
Ocala
Roswell
Park Ridge
Concord
Olympia
Huntington Beach
Corpus Christi
Haverhill
Hanover Park
Cambridge
Glendale
Stanton
Summerville
Davenport
Boca Raton
New Britain
Roseville
Cleveland Heights
Gresha

In [160]:
#Checking DataFrame
print(us_venues.shape)
us_venues.head()

(1000, 7)


Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,,,,,,,
1,,,,,,,
2,,,,,,,
3,,,,,,,
4,,,,,,,


In [161]:
#removing na values from dataframe
us_venues.dropna(axis=0, inplace=True)
print(us_venues.shape)
us_venues.head()

(0, 7)


Unnamed: 0,City,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category


In [151]:
#Looking at counts per city
us_venues.groupby('City').count().head()

Unnamed: 0_level_0,City Latitude,City Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [152]:
#How many unique categories
print('There are {} uniques categories.'.format(len(us_venues['Venue Category'].unique())))

There are 0 uniques categories.


In [153]:
# one hot encoding
us_onehot = pd.get_dummies(us_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
us_onehot['City'] = us_venues['City'] 

# move neighborhood column to the first column
fixed_columns = [us_onehot.columns[-1]] + list(us_onehot.columns[:-1])
us_onehot = us_onehot[fixed_columns]

us_onehot.head()

Unnamed: 0,City,Venue Category


In [154]:
#Fixing Columns
us_onehot=us_onehot.set_index('City')
us_onehot

Unnamed: 0_level_0,Venue Category
City,Unnamed: 1_level_1


In [155]:
#Grouped by city
us_grouped = us_onehot.groupby('City').mean().reset_index()
us_grouped

Unnamed: 0,City,Venue Category


In [156]:
#Function for most popular venue types per city
def return_most_common_venues(row, num_top_venues):
    row_categories = row.iloc[1:]
    row_categories_sorted = row_categories.sort_values(ascending=False)
    
    return row_categories_sorted.index.values[0:num_top_venues]

In [157]:
num_top_venues = 10

indicators = ['st', 'nd', 'rd']

# create columns according to number of top venues
columns = ['City']
for ind in np.arange(num_top_venues):
    try:
        columns.append('{}{} Most Common Venue'.format(ind+1, indicators[ind]))
    except:
        columns.append('{}th Most Common Venue'.format(ind+1))

# create a new dataframe
city_venues_sorted = pd.DataFrame(columns=columns)
city_venues_sorted['City'] = us_grouped['City']

for ind in np.arange(us_grouped.shape[0]):
    city_venues_sorted.iloc[ind, 1:] = return_most_common_venues(us_grouped.iloc[ind, :], num_top_venues)

city_venues_sorted.head()

Unnamed: 0,City,1st Most Common Venue,2nd Most Common Venue,3rd Most Common Venue,4th Most Common Venue,5th Most Common Venue,6th Most Common Venue,7th Most Common Venue,8th Most Common Venue,9th Most Common Venue,10th Most Common Venue


In [158]:
#K means clustering to visualize
# set number of clusters
kclusters = 10

us_grouped_clustering = us_grouped.drop('City', 1)

# run k-means clustering
kmeans = KMeans(n_clusters=kclusters, random_state=0).fit(us_grouped_clustering)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10] 

ValueError: Found array with 0 sample(s) (shape=(0, 1)) while a minimum of 1 is required.

In [None]:
# add clustering labels
city_venues_sorted.insert(0, 'Cluster Labels', kmeans.labels_)

city_merged = cities

# merge toronto_grouped with toronto_data to add latitude/longitude for each neighborhood
city_merged = city_merged.join(city_venues_sorted.set_index('City'), on='City')

city_merged.dropna(axis=0, inplace=True)
city_merged['Cluster Labels']= city_merged['Cluster Labels'].astype(int)

city_merged.head() # check the last columns!

In [None]:
# create map
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=3)

# set color scheme for the clusters
x = np.arange(kclusters)
ys = [i + x + (i*x)**2 for i in range(kclusters)]
colors_array = cm.rainbow(np.linspace(0, 1, len(ys)))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# add markers to the map
markers_colors = []
for lat, lon, poi, cluster in zip(city_merged['latitude'], city_merged['longitude'], city_merged['City'], city_merged['Cluster Labels']):
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=5,
        popup=label,
        color=rainbow[cluster-1],
        fill=True,
        fill_color=rainbow[cluster-1],
        fill_opacity=0.7).add_to(map_clusters)
       
map_clusters

In [None]:
user_reftable=us_grouped

In [135]:
#defining user needed data

c_name= input("What is your name? " )
imp= []
print('Please choose services from the following list: ')
data= list(user_reftable.columns)
chunks = [data[x:x+10] for x in range(0, len(data), 10)]
cat_table= pd.DataFrame(chunks)
display(cat_table)
for i in range(10):
    if i==0:
        p= input('What is the most important service? ')
    elif i==1:
        p= input('What is the ' + str(i+1) + 'nd most important service?')
    elif i==2:
        p= input('What is the ' + str(i+1) + 'rd most important service?')
    else:
        p= input('What is the ' + str(i+1) + 'th most important service?')
    imp.append(p)

print('Thank you for your input!')

What is your name? Avnika
Please choose services from the following list: 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,City,Zoo Exhibit,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court
1,Airport Lounge,Airport Service,Airport Terminal,American Restaurant,Amphitheater,Animal Shelter,Antique Shop,Apres Ski Bar,Aquarium,Arcade
2,Arepa Restaurant,Argentinian Restaurant,Art Gallery,Art Museum,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auditorium,Australian Restaurant,Austrian Restaurant
3,Auto Dealership,Auto Garage,Auto Workshop,Automotive Shop,BBQ Joint,Bagel Shop,Bakery,Bank,Bar,Baseball Field
4,Baseball Stadium,Basketball Court,Basketball Stadium,Bath House,Bathing Area,Bavarian Restaurant,Bay,Beach,Beach Bar,Bed & Breakfast
5,Beer Bar,Beer Garden,Beer Store,Belgian Restaurant,Big Box Store,Bike Shop,Bike Trail,Bistro,Board Shop,Boat or Ferry
6,Bookstore,Border Crossing,Botanical Garden,Boutique,Bowling Alley,Boxing Gym,Brazilian Restaurant,Breakfast Spot,Brewery,Bridal Shop
7,Bridge,Bubble Tea Shop,Buffet,Building,Burger Joint,Burmese Restaurant,Burrito Place,Business Service,Butcher,Café
8,Cajun / Creole Restaurant,Cambodian Restaurant,Camera Store,Campground,Canal,Canal Lock,Candy Store,Capitol Building,Car Wash,Caribbean Restaurant
9,Carpet Store,Casino,Castle,Cave,Cemetery,Cheese Shop,Chinese Restaurant,Chocolate Shop,Church,Churrascaria


What is the most important service? University
What is the 2nd most important service?Coffee Shop
What is the 3rd most important service?Store
What is the 4th most important service?Waterfront
What is the 5th most important service?Theme Park
What is the 6th most important service?Water PArk
What is the 7th most important service?School
What is the 8th most important service?High School
What is the 9th most important service?Train Station
What is the 10th most important service?Trail
Thank you for your input!


In [136]:
user_reftable= user_reftable.append({'City': c_name+"'s City"}, ignore_index=True)
user_row=user_reftable[user_reftable['City']==c_name+"'s City"].index[0]


In [137]:
user_reftable.set_index('City', inplace=True)
user_reftable

Unnamed: 0_level_0,Zoo Exhibit,ATM,Accessories Store,Adult Boutique,Advertising Agency,Afghan Restaurant,African Restaurant,Airport,Airport Food Court,Airport Lounge,...,Waterfall,Waterfront,Whisky Bar,Wine Bar,Wine Shop,Winery,Wings Joint,Women's Store,Yoga Studio,Zoo
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avnika,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.013514,0.000000,0.00,0.027027,0.0,0.00,0.013514
Avnika,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.010000,0.000000,0.01,0.020000,0.0,0.00,0.010000
Avnika,0.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.020000,0.00,0.000000,0.0,0.01,0.000000
Avnika,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.007692,0.011538,0.00,0.003846,0.0,0.00,0.003846
Avnika,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.010000,0.00,0.000000,0.0,0.00,0.010000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Avnika,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.00,0.000000
Avnika,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.00,0.000000,0.0,0.00,0.000000
Avnika,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.020000,0.000000,0.00,0.010000,0.0,0.00,0.000000
Avnika,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.000000,0.000000,0.00,0.010000,0.0,0.00,0.000000


In [131]:
user_reftable.loc[user_row, imp[0]]= 0.23
user_reftable.loc[user_row, imp[1]]= 0.20
user_reftable.loc[user_row, imp[2]]=0.18
user_reftable.loc[user_row, imp[3]]=0.15
user_reftable.loc[user_row, imp[4]]=0.13
user_reftable.loc[user_row, imp[5]]=0.10
user_reftable.loc[user_row, imp[6]]=0.08
user_reftable.loc[user_row, imp[7]]=0.05
user_reftable.loc[user_row, imp[8]]=0.03
user_reftable.loc[user_row, imp[9]]=0.00

user_reftable.replace(np.nan, 0.0, inplace= True)
user_reftable[user_row]

TypeError: '<' not supported between instances of 'str' and 'int'

In [122]:
np.linspace(0,.23,10)

array([0.        , 0.02555556, 0.05111111, 0.07666667, 0.10222222,
       0.12777778, 0.15333333, 0.17888889, 0.20444444, 0.23      ])

In [127]:
user_reftable.iloc[user_row]

City                    Avnika's City
Zoo Exhibit                         0
ATM                                 0
Accessories Store                   0
Adult Boutique                      0
                            ...      
(652, Train Station)              0.1
(652, Video Store)               0.08
(652, Field )                    0.05
(652, Fountain)                  0.03
(652, Coffee Shop)                  0
Name: 652, Length: 525, dtype: object