## Installing needed packages

In [None]:
# One-time environment setup: uncomment the lines below to install the
# packages this notebook imports, then comment them out again.
# !conda install -c conda-forge BeautifulSoup4 --yes
# !conda install -c conda-forge requests --yes
# !conda install -c conda-forge lxml --yes
# !conda install -c conda-forge html5lib --yes
# !conda install -c conda-forge geopy --yes
# !conda install -c conda-forge geocoder --yes

# NOTE: this message is printed unconditionally; it does not check that the
# packages listed above are actually installed.
print('all installed...')

### Importing needed libraries

In [None]:
# standard library
import json  # for JSON files
import os  # to read the Foursquare credentials from the environment

# third-party
import folium  # for maps
import geocoder
import matplotlib.cm as cm  # Matplotlib and associated plotting modules
import matplotlib.colors as colors
import numpy as np
import pandas as pd
import requests  # to call the link
from bs4 import BeautifulSoup  # for web scraping
from geopy.geocoders import Nominatim  # latitude/longitude lookup, used because geocoder is not working
from pandas.io.json import json_normalize  # transform JSON file into a pandas dataframe
from sklearn.cluster import KMeans  # k-means for the clustering stage
print('All imported')

## Part 1 begins here

### Setting the source URL (the data is a CSV file, so no HTML parsing with Beautiful Soup is needed)

In [None]:
# Location of the pincode CSV; it is read directly with pandas in the next cell.
url_IN = 'https://raw.githubusercontent.com/sanand0/pincode/master/data/IN.csv'
# Earlier requests/BeautifulSoup attempt, left disabled because the source is
# a CSV file rather than an HTML page:
#source =  requests.get ('https://raw.githubusercontent.com/sanand0/pincode/master/data/IN.csv')
#soup  = BeautifulSoup(source,'csv')
#print (soup)

#### Loading the CSV from the link into a dataframe and filtering it for New Delhi

In [None]:
# Load the pincode CSV and give its columns the names used in this notebook.
data = pd.read_csv(url_IN)
data = data.rename(columns={'key' :'Zip','place_name':'District','admin_name1':'State','latitude':'Latitude','longitude':'Longitude'})

# Filter for New Delhi. na=False counts rows with a missing State as
# non-matching (a NaN in the mask would make the boolean indexing raise), and
# regex=False makes this a plain substring test.
is_new_delhi = data['State'].str.contains('New Delhi', na=False, regex=False)
df_data = data[is_new_delhi].reset_index(drop=True)

# 'accuracy' and 'Zip' are not used by the rest of the analysis.
df_data = df_data.drop(columns=['accuracy','Zip'])

# Keep one row per (Latitude, Longitude) pair: after the descending sort on
# District, head(1) retains the first row of every coordinate group.
df_data = df_data.sort_values('District',ascending = False).groupby(['Latitude','Longitude']).head(1).reset_index(drop=True)
df_data

#### Creating the final dataset as per requirements

In [None]:
# Report how many rows remain after filtering and de-duplication.
row_count = len(df_data)
print('Final data has {} rows'.format(row_count))

### Visualizing New Delhi in the map

#### First checking co-ordinates

In [None]:
# Look up the coordinates of New Delhi; they are used to centre the maps below.
address = 'New Delhi, India'
geolocator = Nominatim(user_agent="IN_explorer")
location = geolocator.geocode(address)

# geocode() returns None when the service finds no match; fail with a clear
# message instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Nominatim could not geocode {!r}'.format(address))

latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of central New Delhi are {}, {}.'.format(latitude, longitude))

In [None]:
# Geocode the address once and reuse the result (the lookup is a network
# request; the original cell issued it twice for the same address).
add = address
geoloc = Nominatim(user_agent='P')
loc = geoloc.geocode(add)
if loc is None:
    # geocode() returns None when nothing matches the query.
    raise ValueError('Nominatim could not geocode {!r}'.format(add))
print(loc)

lat = loc.latitude
lng = loc.longitude
print(lat, lng)

# Map centred on the geocoded point, with a single green marker labelled 'ND'.
label = 'ND'
map_dt1 =  folium.Map(location=[lat, lng], zoom_start=11)
label = folium.Popup(label, parse_html= True)
folium.CircleMarker(
    [lat, lng],
    radius=3,
    popup=label,
    color='green',
    fill=True,
    fill_color='#33FF4F',
    fill_opacity=0.7,
    parse_html=False).add_to(map_dt1)
#map_dt1

#### Now checking the areas of Delhi from the dataframe table on the map marking the Neighborhoods

In [None]:
# Base map centred on the geocoded New Delhi coordinates.
map_DT = folium.Map(location=[latitude, longitude], zoom_start=10)

# Appearance shared by every district marker.
district_marker_style = dict(
    radius=3,
    color='green',
    fill=True,
    fill_color='#33FF4F',
    fill_opacity=0.7,
    parse_html=False,
)

# One circle marker per row of df_data, with the district name as its popup.
district_rows = zip(df_data['Latitude'], df_data['Longitude'], df_data['District'])
for lat, lng, label in district_rows:
    label = folium.Popup(label, parse_html=True)
    marker = folium.CircleMarker([lat, lng], popup=label, **district_marker_style)
    marker.add_to(map_DT)
map_DT

### Exploring the neighborhoods of New Delhi

###### Using existing Foursquare creds

In [None]:
# Foursquare credentials are read from the environment so that secrets are
# neither stored in the notebook nor echoed into its saved output.
# NOTE: the key pair that used to be hard-coded in this cell has been exposed
# and should be regenerated in the Foursquare developer console.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '') # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '') # your Foursquare Secret
VERSION = '20180628' # Foursquare API version

if not CLIENT_ID or not CLIENT_SECRET:
    raise RuntimeError(
        'Set the FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET '
        'environment variables before running this notebook.')

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
# The secret is masked on purpose: cell output is saved with the notebook.
print('CLIENT_SECRET:' + '*' * 8)

#### Taking any one of the neighborhoods to explore first - Connaught Place

In [None]:
# Row label 14 of df_data is the neighbourhood explored first. The label is
# hard-coded, so the result depends on the current row order of df_data.
# NOTE(review): the heading expects this row to be Connaught Place — confirm.
df_data.loc[14,'District']

###### Let's shorten Connaught Place to CP for the upcoming analysis

In [None]:
# Name and coordinates of the neighbourhood stored at row label 14 of df_data.
CP_name, CP_lat, CP_lng = (
    df_data.loc[14, column] for column in ('District', 'Latitude', 'Longitude')
)
print('Location credentials for CP : Name {}, Lat {}, Long {}'.format(CP_name, CP_lat, CP_lng))

##### Getting url for exploration using Foursquare API and limiting the results to 50

In [None]:
radius = 10000  # value sent as the 'radius' query parameter
LIMIT = 50      # value sent as the 'limit' query parameter

search_url_template = 'https://api.foursquare.com/v2/venues/search?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'
url = search_url_template.format(CLIENT_ID, CLIENT_SECRET, CP_lat, CP_lng, VERSION, radius, LIMIT)

# Show the request with the secret masked: the real URL embeds CLIENT_SECRET
# and cell output is stored in the notebook file.
print(search_url_template.format(CLIENT_ID, '<hidden>', CP_lat, CP_lng, VERSION, radius, LIMIT))

# A timeout stops the cell from hanging indefinitely, and raise_for_status()
# turns an HTTP error into an exception here rather than a KeyError later on.
response = requests.get(url, timeout=30)
response.raise_for_status()
results = response.json()

In [None]:
# Keep only the list of venues from the search response.
venues = results['response']['venues']
print (venues)

# Flatten the nested venue records into a table; nested keys become dotted
# column names such as 'location.lat', which the later cells select by name.
df_CP = json_normalize(venues)
df_CP

In [None]:
# Columns of the flattened venue table that are kept for the analysis.
filtered_columns = ['name','categories','location.lat','location.lng']
filtered_columns

#### Leveraging the get category type function

In [None]:
def get_category_type(i):
    """Return the name of a venue's first category, or None if it has none.

    Parameters
    ----------
    i : mapping-like venue record (for example a dict or a DataFrame row)
        The category list is read from the 'venue.categories' key when that
        key exists, otherwise from 'categories'.

    Returns
    -------
    The 'name' entry of the first category, or None when the list is empty.

    Raises
    ------
    KeyError
        If the record has neither 'venue.categories' nor 'categories'.
    """
    try:
        categories = i['venue.categories']
    except KeyError:
        categories = i['categories']

    # This check must run for both key layouts; it used to sit inside the
    # except branch, so records with 'venue.categories' always yielded None.
    if len(categories) == 0:
        return None
    return categories[0]['name']

#### Cleaning the json and loading in dataframe

In [None]:
                   
# Keep only the columns of interest. .copy() makes CP_venues an independent
# frame, so the column assignment below does not raise pandas'
# chained-assignment warning.
CP_venues = df_CP.loc[:, filtered_columns].copy()

# Replace each venue's category list with the name of its first category.
CP_venues['categories'] = CP_venues.apply(get_category_type,axis=1)

# Drop the dotted prefix from column names ('location.lat' -> 'lat').
CP_venues.columns= [col.split(".")[-1] for col in CP_venues.columns]

# Discard venues for which no category name was found.
CP_venues=CP_venues[CP_venues['categories'].notna()]

# Export the data as a local backup in case access to the Foursquare API is
# blocked for 24 hours.
CP_venues.to_csv("CP_Venues.csv")

##### Rows returned by Foursquare

In [None]:
# HF_venues.categories.unique()
# #LIMIT == HF_venues.shape[0]
# HF_venues.groupby('categories').count().sort_values('name',ascending=False)
# HF_venues=HF_venues[HF_venues['categories'].notna()]
# d = HF_venues[HF_venues['categories'].str.contains('')]#('Restaurant|Pub|Caf|Bar|Snack|Coffee|Juic')]
# HF_venues.groupby('categories').count().sort_values('name',ascending=False)

##### Limit that we set matches with the shape

#### Now to replicate the process for all other areas of ND - we will create a function

In [None]:
def getNearbyVenues(names, lats, lngs, radius = 10000):
    """Collect the venues Foursquare's "explore" endpoint returns per location.

    Parameters
    ----------
    names, lats, lngs : iterables
        Neighbourhood names and their coordinates, consumed in parallel with
        zip() (iteration stops at the shortest one).
    radius : optional
        Value sent as the 'radius' query parameter (default 10000).

    Returns
    -------
    pandas.DataFrame
        One row per venue with the columns 'Neighborhood',
        'Neighborhood Latitude', 'Neighborhood Longitude', 'Venue',
        'Venue Latitude', 'Venue Longitude' and 'Venue Category'. The frame is
        empty, but keeps these columns, when no venue is returned.

    Notes
    -----
    Uses the notebook-level CLIENT_ID, CLIENT_SECRET, VERSION and LIMIT.
    """
    columns = ['Neighborhood',
               'Neighborhood Latitude',
               'Neighborhood Longitude',
               'Venue',
               'Venue Latitude',
               'Venue Longitude',
               'Venue Category']

    venue_rows = []
    for name, lat, lng in zip(names, lats, lngs):
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID,
            CLIENT_SECRET,
            VERSION,
            lat,
            lng,
            radius,
            LIMIT)
        results = requests.get(url).json()['response']['groups'][0]['items']

        for v in results:
            venue = v['venue']
            categories = venue['categories']
            venue_rows.append((
                name,
                lat,
                lng,
                venue['name'],
                venue['location']['lat'],
                venue['location']['lng'],
                # A venue without any category gets None instead of raising.
                categories[0]['name'] if categories else None))

    # Build the frame once, after all requests: rebuilding it inside the loop
    # repeated the work on every iteration and left the result undefined when
    # there were no locations to query.
    return pd.DataFrame(venue_rows, columns=columns)

In [None]:
# Query venues for every row of df_data; radius=500 replaces the function's
# default of 10000 for this run.
neighborhood_inputs = dict(
    names=df_data['District'],
    lats=df_data['Latitude'],
    lngs=df_data['Longitude'],
)
ND_venues = getNearbyVenues(radius=500, **neighborhood_inputs)

In [None]:
# (rows, columns) of the collected venue table.
ND_venues.shape

In [None]:
# Save the collected venues to ND_venues.csv (relative path) so the data is
# available without calling the API again.
ND_venues.to_csv("ND_venues.csv")

##### Checking number of venues for each neighborhood

In [None]:
# Per neighbourhood, count the non-null entries in every other column.
ND_venues.groupby('Neighborhood').count().reset_index()

##### Neighborhoods for which no venue data could be retrieved do not appear in the table above

In [None]:
# Distinct venue categories found across all neighbourhoods.
venue_categories = ND_venues['Venue Category'].unique()
print('There are {} uniques categories.'.format(len(venue_categories)))
venue_categories

### Analyzing each neighborhood before clustering

In [None]:
# One indicator column per venue category. A category that is itself called
# 'Neighborhood' is renamed so it cannot clash with the neighbourhood column
# added next.
category_indicators = pd.get_dummies(ND_venues['Venue Category']).rename(
    columns={"Neighborhood": "Neighboring"}
)

# Put the neighbourhood name in front of the indicator columns.
ND_onehot = pd.concat([ND_venues['Neighborhood'], category_indicators], axis=1)
ND_onehot

#### One hot encoding for frequency calculation

In [None]:
# Averaging the indicator columns per neighbourhood gives the share of that
# neighbourhood's venues falling in each category.
venues_by_neighborhood = ND_onehot.groupby('Neighborhood')
ND_grouped = venues_by_neighborhood.mean().reset_index()

###### Checking sample data

In [None]:
# Display the per-neighbourhood category frequencies.
ND_grouped

#### Checking top venues in the neighborhoods

In [None]:
top_venue_num = 10

# For each neighbourhood, reshape its row of ND_grouped into a two-column
# Venue/Freq table. Nothing is printed here, so only the table built for the
# last neighbourhood remains in `temp` once the loop ends.
for hood in ND_grouped['Neighborhood']:
    # Transpose the neighbourhood's row so that categories run down the rows.
    hood_frequencies = ND_grouped[ND_grouped['Neighborhood'] == hood].T.reset_index()

    # The first transposed row holds the neighbourhood name, not a frequency.
    temp = hood_frequencies.iloc[1:]
    temp.columns = ['Venue', 'Freq']

In [None]:
# Venue/Freq table left over from the final iteration of the loop above.
temp

#### Creating a function to get top venues for each neighborhood and storing them in pandas dataframe

In [None]:
def get_top_venues(row, num_top):
    """Return the labels of the `num_top` largest values in `row`.

    The first element of `row` is skipped before ranking (callers pass rows
    whose first entry is the neighbourhood name). The result is an array of
    index labels ordered from the largest value to the smallest.
    """
    frequencies = row.iloc[1:]
    ranked = frequencies.sort_values(ascending=False)
    return ranked.iloc[:num_top].index.values

# Disabled scratch calls; they refer to `downtown_grouped`, a name this
# notebook never defines.
#downtown_grouped.iloc[1, 1:]
#get_top_venues(downtown_grouped.iloc[1,1:], 5)
# Display the frequency table that get_top_venues is applied to below.
ND_grouped

##### Creating the dataframe now for all neighborhoods

In [None]:
num_top = 10  # number of top venue categories to keep per neighborhood
columns = ['Neighborhood']
indicators = ['st', 'nd', 'rd']  # ordinal suffixes for 1st, 2nd and 3rd; later ranks use 'th'

# Ranks run from 1 to num_top inclusive. The previous range(1, num_top)
# stopped one short and produced only num_top - 1 venue columns.
for i in range(1, num_top + 1):
    suffix = indicators[i - 1] if i <= len(indicators) else 'th'
    columns.append('{}{} Most Common Venue'.format(i, suffix))

neighborhood_venues_top = pd.DataFrame(columns=columns)  # empty frame with the final column layout
neighborhood_venues_top['Neighborhood'] = ND_grouped['Neighborhood']  # one row per neighborhood

# Fill every row with that neighborhood's num_top most frequent categories.
for j in range(0, ND_grouped.shape[0]):
    neighborhood_venues_top.iloc[j, 1:] = get_top_venues(ND_grouped.iloc[j, :], num_top)

In [None]:
# Display the table of most common venue categories per neighbourhood.
neighborhood_venues_top

## Clustering Neighborhoods

In [None]:
kclusters = 6  # number of clusters to form

# The neighbourhood name is not a feature, so it is left out of the matrix
# handed to k-means.
downtown_clustering = ND_grouped.drop(columns='Neighborhood')

kmeans = KMeans(n_clusters=kclusters, random_state=0)
kmeans.fit(downtown_clustering)

# Cluster label of each row of ND_grouped (at most the first 142 are shown).
kmeans.labels_[:142]

#### Merging two datasets to get additional fields along with clusters for the neighborhood

In [None]:
# Attach each neighbourhood's cluster label as the first column. A label
# column left behind by an earlier run of this cell is dropped first, because
# DataFrame.insert raises when the column already exists, which made the cell
# fail whenever it was re-run.
if 'Cluster Labels' in neighborhood_venues_top.columns:
    neighborhood_venues_top = neighborhood_venues_top.drop(columns='Cluster Labels')
neighborhood_venues_top.insert(0,'Cluster Labels',kmeans.labels_)

# df_data names the neighbourhood column 'District'; rename it so that both
# frames share the join key.
df_data = df_data.rename(columns={'District':'Neighborhood'})

# The inner join keeps only neighbourhoods present in both frames.
merged_data = pd.merge(df_data,neighborhood_venues_top,how='inner',on='Neighborhood')
merged_data.head(5)

#### Seeing the clustering on the map

In [None]:
# Map centred on the geocoded New Delhi coordinates.
map_clusters = folium.Map(location=[latitude, longitude], zoom_start=6)

# One colour per cluster, spread evenly over the rainbow colormap.
colors_array = cm.rainbow(np.linspace(0, 1, kclusters))
rainbow = [colors.rgb2hex(i) for i in colors_array]

# One marker per neighbourhood, coloured by its cluster. The cluster label is
# used directly as the colour index; the former `cluster-1` only worked for
# cluster 0 through negative-index wraparound.
for lat, lon, poi, cluster in zip(merged_data['Latitude'], merged_data['Longitude'], merged_data['Neighborhood'], merged_data['Cluster Labels']):
    cluster_color = rainbow[int(cluster)]
    label = folium.Popup(str(poi) + ' Cluster ' + str(cluster), parse_html=True)
    folium.CircleMarker(
        [lat, lon],
        radius=3,
        popup=label,
        color=cluster_color,
        fill=True,
        fill_color=cluster_color,
        fill_opacity=0.7).add_to(map_clusters)
map_clusters

#### We can check the data for each cluster using below

In [None]:
downtown_merged = merged_data

# Rows assigned to cluster 0. Columns are chosen by position: column 1 plus
# everything from column 5 onwards.
# NOTE(review): sort_values needs 'Cluster Labels' among the selected columns —
# confirm against the column order of merged_data.
in_cluster = downtown_merged['Cluster Labels'] == 0
display_columns = downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]
downtown_merged.loc[in_cluster, display_columns].sort_values('Cluster Labels').reset_index(drop=True)

In [None]:

# Rows assigned to cluster 1; columns are chosen by position (column 1 plus
# everything from column 5 onwards).
in_cluster = downtown_merged['Cluster Labels'] == 1
display_columns = downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]
downtown_merged.loc[in_cluster, display_columns].sort_values('Cluster Labels').reset_index(drop=True)

In [None]:

# Rows assigned to cluster 2; columns are chosen by position (column 1 plus
# everything from column 5 onwards).
in_cluster = downtown_merged['Cluster Labels'] == 2
display_columns = downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]
downtown_merged.loc[in_cluster, display_columns].sort_values('Cluster Labels').reset_index(drop=True)

In [None]:

# Rows assigned to cluster 3; columns are chosen by position (column 1 plus
# everything from column 5 onwards).
in_cluster = downtown_merged['Cluster Labels'] == 3
display_columns = downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]
downtown_merged.loc[in_cluster, display_columns].sort_values('Cluster Labels').reset_index(drop=True)

In [None]:

# Rows assigned to cluster 4; columns are chosen by position (column 1 plus
# everything from column 5 onwards).
in_cluster = downtown_merged['Cluster Labels'] == 4
display_columns = downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]
downtown_merged.loc[in_cluster, display_columns].sort_values('Cluster Labels').reset_index(drop=True)

In [None]:

# Rows assigned to cluster 5; columns are chosen by position (column 1 plus
# everything from column 5 onwards).
in_cluster = downtown_merged['Cluster Labels'] == 5
display_columns = downtown_merged.columns[[1] + list(range(5, downtown_merged.shape[1]))]
downtown_merged.loc[in_cluster, display_columns].sort_values('Cluster Labels').reset_index(drop=True)