In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
from bs4 import BeautifulSoup
import requests


In [5]:
from sklearn.cluster import KMeans

In [6]:
from pandas.io.json import json_normalize

# PART A

# Scraping Wikipedia Website  and putting data it in the data frame

In [117]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [118]:
source = requests.get(url).text

In [119]:
soup = BeautifulSoup(source)

In [120]:
table = soup.find('table')

### Rows array will have all of the rows html data, each element of the array representing every row

In [121]:
rows_arr = table.find_all('tr')

### Getting the relevant data stored in the table

In [122]:
rows = []
for row_data in rows_arr:
    row_arr = []
    for element in row_data.find_all('td'):
        row_arr.append(element.text.rstrip('\n'))
    rows.append(row_arr)
    


In [123]:
rows[0] = [heading.text.rstrip('\n') for heading in rows_arr[0].find_all('th')]

In [124]:
cols = rows[0]
cols

['Postcode', 'Borough', 'Neighbourhood']

In [125]:
df = pd.DataFrame(rows)

In [126]:
df

Unnamed: 0,0,1,2
0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront
6,M5A,Downtown Toronto,Regent Park
7,M6A,North York,Lawrence Heights
8,M6A,North York,Lawrence Manor
9,M7A,Queen's Park,Not assigned


### Setting column names in the df

In [127]:
df.columns = cols
if df.iloc[0].tolist() == cols:
    df = df.drop(0)

In [128]:
df.columns = ['PostalCode'] + [cols[1], ] + ['Neighborhood']

In [129]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


### Now, we have the data frame. All we have to do now is clean the data

### Removing the  rows which doesn't have a Borough

In [130]:
df = df[df.Borough != 'Not assigned'].reset_index(drop = True)
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights


In [131]:
df.Borough.value_counts()

Etobicoke           45
North York          38
Scarborough         37
Downtown Toronto    37
Central Toronto     17
West Toronto        13
York                 9
East Toronto         7
East York            6
Mississauga          1
Queen's Park         1
Name: Borough, dtype: int64

### First, finding those values for which a same postal code have different neighborhoods and then deleting all of the rows and concatenating the neighborhoods by ','

In [132]:
## Pseudo Code
# 1. Iterate through the postal codes
# 2. For a postal code, use
#     df[df.PostalCode == <postal code in for loop]
#     to get the df with the give postal code
# 3. Fetch all of the rows of the neighborhood column
# 4. Use .tolist()  to convert into python list
# 5. Then, use ','.join(list) to seprate the list with a ',' and save the string
# 6. At this point all of the postal code would be same because we're iterating through it bitch.
#      and we most likely should have all the boroughs same too!!! We have the string of neighborhoods also
# 7. Now delete all the irrelevant rows, and make a new relevant row and append it


In [133]:
postal_codes = df.PostalCode.unique()

for postal_code in postal_codes:
    postal_code_df = df[df.PostalCode == postal_code] 
    neighs = postal_code_df.Neighborhood
    neighs_str = ','.join(neighs)
    #In case we have more than one borough for same postal code area, we'll concatenate boroughs too then motehrfuker
    boroughs_str = ','.join(postal_code_df.Borough.unique())
    
    row = [postal_code, boroughs_str, neighs_str]
    #First binding the index so that mutating the df won't give any out of range / bounds
    indexes = postal_code_df.index.tolist()
    
    # Removing all of the fucking rows
    # Get the index of all of the rows with postal_code, delete them and add the  newly update postal_code with
    #Concated values
    df.drop(indexes, inplace = True)
    df = df.append({'PostalCode': postal_code, 'Neighborhood' : neighs_str, 'Borough' : boroughs_str}, ignore_index = True).reset_index(drop = True)


In [134]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Not assigned


### Now, if the neighborhood has an unassingned name, then assigning the name of the neighborhood to the respective name of it's borough

In [135]:
# Pseudo code

#Get the list of indexes of those neighborhoods which have unassigned values. Once you did that then 
#iterate through the indexes using df.loc[index] and get the borough name using
# df.loc[index, borough] and assign this to the neighborhood using
# df.loc[index, neighborhood] = df.loc[index, borough]
# code looks like this

#indexes = df[df.Neighborhood == 'Not assigned'].index.tolist()

# for i in indexes:
#    borough = df.loc[i, 'borough']
#    df.loc[i, 'neighborhood'] = borough


In [136]:
null_neighs_indexes = np.array(df[df.Neighborhood  == 'Not assigned'].index)

for i in null_neighs_indexes:
    #Getting the borough name
    borough = df.loc[i, 'Borough']
    #Assigning the borough name to the neighborhood name(neighborhood was null)
    df.loc[i, 'Neighborhood'] = borough

In [138]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,"Harbourfront,Regent Park"
3,M6A,North York,"Lawrence Heights,Lawrence Manor"
4,M7A,Queen's Park,Queen's Park


In [137]:
df.shape

(103, 3)

# PART B

###  Done with the  cleaning

### Now,we're supposed to get the lat and long using the geocoder

### Getting the latitude and longitude

In [29]:
import geocoder
from geopy.geocoders import Nominatim

In [38]:
# coordinates = np.empty(shape = (5, 2))
# count = 0

# geocoder = Nominatim(user_agent = 'my_app')
# for i in range(5):
#     PostalCode, Borough = df.loc[i, 'PostalCode'], df.loc[i, 'Borough']
#     neigh = df.loc[i, 'Neighborhood'].split(',')[0]
#     query = f'{neigh}, {Borough}, Toronto, Ontario, Canada'
#     error = True
#     while_count = 0
    
#     while error:
#         while_count += 1
#         try:
#             location = geocoder.geocode(query)
            
#         except:
#             location = None
#         if while_count >= 50:
#             lat,lng = None,None
#             print('Tried more than 50 times to feth coordinates but failed...', 'Neighborhood: ',neigh, 'Borough', Borough)
#             break
            
#         #If the type of location is not none, then error = false to get out of while loop
#         if type(location) != type(None):
#             error = False
#             lat = location.latitude
#             lng = location.longitude
#             coordinates[i] = [lat, lng]
        
        
#     count += 1
#     print(f'{count} Number of coordinates Found')

### <IMPORTANT NOTE
### >>

### The above function was taking very long to process and the requests were also not very successful. Hence, I imported the csv file of the location data

In [39]:
# with open('Geospatial_Coordinates.csv') as file:
#     loc_df = pd.read_csv(file)

In [30]:
link = 'https://cocl.us/Geospatial_data'
loc_df = pd.read_csv(link)

In [31]:
loc_df.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


### Combining the data of df_loc with df

#### Pseudo Code

In [42]:
#  Pseudo Code for the  above

# for i in range(len(loc_df)):
#     lat = df_loc.loc[i, 'latitude']
#     long = df_loc.loc[i, 'longitude']
#     postal_code = df_loc.loc[i, 'postal code']
    
#     index_df = df[df.postalcode == postal_code]
    
#     df.loc[index_df, 'latitude'] = lat
#     df.loc[index_df, 'longitude'] = long

### Creating lats and longs variables in df

In [34]:
df['Latitude'] = np.zeros(len(df))
df['Longitude'] = np.zeros(len(df))

### Fetching the postal  code from loc_df, finding index of that in the orignal df and then doing row operations as described in pseudo code

In [35]:
for i in range(len(loc_df)):
    lat = loc_df.loc[i, 'Latitude']
    lng = loc_df.loc[i, 'Longitude']
    postal_code = loc_df.loc[i, 'Postal Code']
    
    index_df = df[df.PostalCode == postal_code].index[0]
    
    df.loc[index_df, 'Latitude'] = lat
    df.loc[index_df, 'Longitude'] = lng
    

In [36]:
## Checking if each postal code's lat and lng values are correct in df by cross checking the same with the loc_Df
errors_at_index = []
for i in range(len(df)):
    code = df.loc[i]['PostalCode']
    lat_df = df[df.PostalCode == code].Latitude[i]
    lng_df = df[df.PostalCode == code].Longitude[i]
    
    
    #Index of the same row with the code postal_code in the loc_df
    i2 = loc_df[loc_df['Postal Code'] == code].index[0]
    lat_loc_df = loc_df[loc_df['Postal Code'] == code].Latitude[i2]
    lng_loc_df = loc_df[loc_df['Postal Code'] == code].Longitude[i2]
    
    if lat_df != lat_loc_df or lat_df != lat_loc_df:
        errors_at_index.append(i)
        

### Final df for PART A of the week - 3 project!!

In [40]:
df.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,M3A,North York,Parkwoods,43.753259,-79.329656
1,M4A,North York,Victoria Village,43.725882,-79.315572
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494


# PART C

### Using the explore endpoint of the  foursquare data to find the top venues that are there

In [41]:
#foursquare credentials

CLIENT_ID = '0GAHQBPMD4VB2FOP135GALSMFMFTTUSLIHOG4AGYNZSF2USK'
CLIENT_SECRET = 'D3P5H1X1D0SD0B0F1MEKBOGYB0O4HMCAOW3SX2BXVBVSEEYS'
VERSION = '20180605'

### LIMIT, RADIUS AND OTHER DETAILS

In [42]:
LIMIT = 50
RADIUS = 8000

### For a given location coordinates, we'll be getting the top n venues and putting it in a data frame
### First, set the limit and the radius and the version of the four square api
### After that, get the location data and then get the imporant information which is:
### 1. Venue's Name
### 2. Venue's Category
### 3. Which neighborhood are the venues in(for a given neighborhood, there'll be n venues and all of those will have same borough / neighborhood..
### 4. Venue's Lats and longs
### 5. It's neighborhood's lats and longs
### All of that in a data frame before we can do anything else

In [43]:
base_uri = 'https://api.foursquare.com/v2/'

In [44]:
def get_venues_url(lat, lng, limit = LIMIT, radius = RADIUS,client_id = CLIENT_ID, client_secret = CLIENT_SECRET):
    url = f'{base_uri}venues/explore?client_id={CLIENT_ID}&client_secret={CLIENT_SECRET}&v={VERSION}&ll={lat},{lng}&limit={LIMIT}&radius={RADIUS}'
    
    return url

In [45]:
c = df.iloc[0,-2:]

In [46]:
url = get_venues_url(c[0], c[1])

In [47]:
json = requests.get(url).json()

In [48]:
json['response']['groups'][0]['items'][0]

{'reasons': {'count': 0,
  'items': [{'summary': 'This spot is popular',
    'type': 'general',
    'reasonName': 'globalInteractionReason'}]},
 'venue': {'id': '4b8991cbf964a520814232e3',
  'name': "Allwyn's Bakery",
  'location': {'address': '81 Underhill drive',
   'lat': 43.75984035203157,
   'lng': -79.32471879917513,
   'labeledLatLngs': [{'label': 'display',
     'lat': 43.75984035203157,
     'lng': -79.32471879917513}],
   'distance': 833,
   'postalCode': 'M3A 1Z5',
   'cc': 'CA',
   'neighborhood': 'Parkwoods - Donalda',
   'city': 'Toronto',
   'state': 'ON',
   'country': 'Canada',
   'formattedAddress': ['81 Underhill drive', 'Toronto ON M3A 1Z5', 'Canada']},
  'categories': [{'id': '4bf58dd8d48988d144941735',
    'name': 'Caribbean Restaurant',
    'pluralName': 'Caribbean Restaurants',
    'shortName': 'Caribbean',
    'icon': {'prefix': 'https://ss3.4sqi.net/img/categories_v2/food/caribbean_',
     'suffix': '.png'},
    'primary': True}],
  'photos': {'count': 0, 'gro

In [49]:
def get_venues_dic(url, postal_code, borough):
    '''
    Returns the dictionary of the venues related to the url
    
    '''
    
    while True:
        
        try:
            venues_json = requests.get(url).json()
            if not type(venues_json) == type(None):
                break
        
        except:
             None
        
    
    try:
        items = venues_json['response']['groups'][0]['items']
    except:
        
        Exception('Key Error')
    
    
    
    names_arr = []
    id_arr = []
    cats_arr = []
    lats_arr = []
    lngs_arr = []
    
    
    for item, i in zip(items, range(len(items))):
        id_arr.append(item['venue']['id'])
        names_arr.append(item['venue']['name'])
        lats_arr.append(item['venue']['location']['lat'])
        lngs_arr.append(item['venue']['location']['lng'])
        category = item['venue']['categories'][0]['name']
        cats_arr.append(category)
    
    venues_dict = {
        'venue_names': names_arr,
        'venue_ids' : id_arr,  
        'venue_categories' : cats_arr,
        'venue_latitudes' : lats_arr,
        'venue_longitudes' : lngs_arr, 
        'borough' : np.array([borough for i in range(len(items))]), 
        'postal_code' : np.array([postal_code for i in range(len(items))])
        
                  }                  
    
                       
    return venues_dict

### Making a pandas data frame and  appending all of the venues of each and every neighborhood to it

In [50]:
venues_df = pd.DataFrame(columns = ['venue_names', 'venue_categories', 'venue_longitudes', 'venue_latitudes', 'borough', 'postal_code', 'venue_ids'])

In [51]:
venues_df = pd.DataFrame(columns = ['venue_names', 'venue_categories', 'venue_longitudes', 'venue_latitudes', 'borough', 'postal_code', 'venue_ids'])

for i in range(len(df)):
    lat = df.loc[i]['Latitude']
    lng = df.loc[i]['Longitude']
    url = get_venues_url(lat, lng)
    borough = df.loc[i]['Borough']
    postal_code = df.loc[i]['PostalCode']
    flag = True
    while flag:
        venues_dic = get_venues_dic(url, postal_code, borough)
        if type(venues_dic) == type(None):
            flag = True
            
        else:
            flag = False
    
    
    venues_df = venues_df.append(pd.DataFrame(venues_dic), ignore_index = True)
    print(str(int(i + 1)) + '. ', postal_code, 'postal code venues added!')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


1.  M3A postal code venues added!
2.  M4A postal code venues added!
3.  M5A postal code venues added!
4.  M6A postal code venues added!
5.  M7A postal code venues added!
6.  M9A postal code venues added!
7.  M1B postal code venues added!
8.  M3B postal code venues added!
9.  M4B postal code venues added!
10.  M5B postal code venues added!
11.  M6B postal code venues added!
12.  M9B postal code venues added!
13.  M1C postal code venues added!
14.  M3C postal code venues added!
15.  M4C postal code venues added!
16.  M5C postal code venues added!
17.  M6C postal code venues added!
18.  M9C postal code venues added!
19.  M1E postal code venues added!
20.  M4E postal code venues added!
21.  M5E postal code venues added!
22.  M6E postal code venues added!
23.  M1G postal code venues added!
24.  M4G postal code venues added!
25.  M5G postal code venues added!
26.  M6G postal code venues added!
27.  M1H postal code venues added!
28.  M2H postal code venues added!
29.  M3H postal code venues a

In [62]:
venues_df.shape

(5750, 7)

In [None]:
venues_df.head()

### Checking if each postal code have 50 number of venues. 

In [53]:
venues_df.postal_code.value_counts().head()

M6M    50
M2J    50
M9P    50
M3N    50
M3C    50
Name: postal_code, dtype: int64

In [54]:
cols = ['Borough', 'Postal_Code', 'Venue_Category', 'Venue_Id', 'Venue_Latitude', 'Venue_Longitude', 'Venue_Name']
venues_df.columns = cols

In [55]:
venues_df.head()

Unnamed: 0,Borough,Postal_Code,Venue_Category,Venue_Id,Venue_Latitude,Venue_Longitude,Venue_Name
0,North York,M3A,Caribbean Restaurant,4b8991cbf964a520814232e3,43.75984,-79.324719,Allwyn's Bakery
1,North York,M3A,Golf Course,4bd4846a6798ef3bd0c5618d,43.752816,-79.342741,Donalda Golf & Country Club
2,North York,M3A,Mediterranean Restaurant,4d0bf09d3bc0b60c7e5bd174,43.741839,-79.309296,Damas Grillhouse & Juice Bar
3,North York,M3A,Event Space,4b8ec91af964a520053733e3,43.763923,-79.342961,Graydon Hall Manor
4,North York,M3A,Coffee Shop,5a8743ed1543c734d2840194,43.735764,-79.344156,Starbucks Reserve Bar


### Making Dummies of different categories

In [60]:
category_dummies = pd.get_dummies(venues_df['Venue_Category'])

### Adding postal code to category_dummies

In [61]:
category_dummies['Postal_Code'] = venues_df['Postal_Code']
if category_dummies.columns[0] != 'Postal_Code':
    cols = [category_dummies.columns[-1], ] + category_dummies.columns[:-1].tolist()
    category_dummies = category_dummies[cols]

In [62]:
category_dummies['Borough'] = venues_df['Borough']

In [63]:
category_dummies.head()

Unnamed: 0,Postal_Code,Afghan Restaurant,Airport Lounge,American Restaurant,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,...,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit,Borough
0,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York
1,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York
2,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York
3,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York
4,M3A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,North York


In [64]:
df_grouped = category_dummies.groupby('Postal_Code').sum()
if df_grouped.index.name == 'Postal_Code':
    df_grouped = df_grouped.reset_index()

### The below function takes in a grouped_df and returns top n venues df

In [65]:
def get_top_n_venues(df_grouped, n_venues, postal_code_col_name):
    
    
    '''
    
    
    '''
    
#     #making a  numpy  array  with rows = df_grouped and cols = n_venues + 1, 1 extra for the postal_code
#     top_n_venues_arr = np.zeros(shape = (len(df_grouped), n_venues + 1))
    
    top_n_venues_arr = []
    
    if not postal_code_col_name in df_grouped.columns:
        raise Exception('postal_code_col_name is not found in the columns, provide the correct postal_code column name')
        
    for i in range(len(df_grouped)):
        # Series object having index as col names and values as row values
        row = df_grouped.iloc[i]
        
        # Binding the postal code of df to postal_code
        postal_code = row[postal_code_col_name]
        #Removing the postal_code since can't perform sort_values on categorical data, so making sure only category type
        # of venues are available
        row = row.drop(postal_code_col_name)
        
        try:
            top_n_venues = row.sort_values(ascending = False)[:n_venues].index.tolist()
        except:
            top_n_venues = np.full(shape = n_venues, fill_value = np.nan)
            
        # Row of top_n_venues which have postal code and top n venues
        top_n_venues_row = (postal_code, ) + tuple(top_n_venues)
        #Appending the row in a 2-d numpy array, each index representing a row and inner list scrolling through cols
    
        top_n_venues_arr.append(top_n_venues_row)
        
    df = pd.DataFrame(top_n_venues_arr)
    cols = [postal_code_col_name] + [str(i) + '_venue' for i in range(1, n_venues + 1)]
    df.columns = cols
        
    return df
        

In [66]:
top_venues_df = get_top_n_venues(df_grouped, 10, 'Postal_Code')

### The above data is ready for clustering now! Let's start with clustering

We'll be using the number of clusters similar to the lab in this week ie 5.

Our feature matrix will consist of the top n venues ( in this case 10) and each row is represented by a Postal Code

In [67]:
k_clusters = 5
cluster_df = df_grouped.drop('Postal_Code', axis = 1)
cluster_df.head()

Unnamed: 0,Afghan Restaurant,Airport Lounge,American Restaurant,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,Automotive Shop,...,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,2,8
1,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,2,6
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,2,1
3,0,0,0,0,0,1,2,1,0,0,...,0,1,0,0,0,0,0,0,1,1
4,0,0,0,0,0,1,2,1,0,0,...,0,1,0,0,0,0,0,0,0,0


### Initializing a KMeans object

In [68]:
knn_clusters = KMeans(n_clusters = k_clusters, random_state = 0).fit(cluster_df)

In [69]:
cluster_labels = knn_clusters.labels_

In [70]:
labels_arr = []

for i in range(len(cluster_labels)):
    row = [df_grouped.loc[i]['Postal_Code'], cluster_labels[i]]
    labels_arr.append(row)

In [71]:
labels_arr[:5]

[['M1B', 0], ['M1C', 0], ['M1E', 0], ['M1G', 0], ['M1H', 0]]

In [72]:
df_label = pd.DataFrame(labels_arr, columns = ['PostalCode', 'Label'])
df_label.head()

Unnamed: 0,PostalCode,Label
0,M1B,0
1,M1C,0
2,M1E,0
3,M1G,0
4,M1H,0


### Now we need to merge the top_venues_df with the lats and longs of the postal_code and the neighborhood name / neighborhood names and the borough names of the respective Postal_Code

In [73]:
df_grouped.head()

Unnamed: 0,Postal_Code,Afghan Restaurant,Airport Lounge,American Restaurant,Aquarium,Art Gallery,Arts & Crafts Store,Asian Restaurant,Athletics & Sports,Auto Dealership,...,Vegetarian / Vegan Restaurant,Vietnamese Restaurant,Warehouse Store,Whisky Bar,Wine Bar,Wings Joint,Women's Store,Yoga Studio,Zoo,Zoo Exhibit
0,M1B,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,2,8
1,M1C,0,0,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,2,6
2,M1E,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,2,1
3,M1G,0,0,0,0,0,1,2,1,0,...,0,1,0,0,0,0,0,0,1,1
4,M1H,0,0,0,0,0,1,2,1,0,...,0,1,0,0,0,0,0,0,0,0


In [74]:
top_venues_df

Unnamed: 0,Postal_Code,1_venue,2_venue,3_venue,4_venue,5_venue,6_venue,7_venue,8_venue,9_venue,10_venue
0,M1B,Zoo Exhibit,Park,Burger Joint,Caribbean Restaurant,Zoo,Indian Restaurant,Hakka Restaurant,Cosmetics Shop,Sports Bar,Fried Chicken Joint
1,M1C,Zoo Exhibit,Park,Beach,Zoo,Grocery Store,Indian Restaurant,Fast Food Restaurant,Caribbean Restaurant,Burger Joint,Pub
2,M1E,Park,Caribbean Restaurant,Burger Joint,Pub,Pizza Place,Indian Restaurant,Coffee Shop,Steakhouse,Zoo,Gym
3,M1G,Caribbean Restaurant,Park,Burger Joint,Gym,Steakhouse,Asian Restaurant,Indian Restaurant,Coffee Shop,Pub,Zoo Exhibit
4,M1H,Caribbean Restaurant,Coffee Shop,Chinese Restaurant,Park,Steakhouse,Indian Restaurant,Asian Restaurant,Burger Joint,Pub,Bookstore
5,M1J,Coffee Shop,Indian Restaurant,Park,Gym,Grocery Store,Caribbean Restaurant,Middle Eastern Restaurant,Burger Joint,Pub,Supermarket
6,M1K,Park,Burger Joint,Middle Eastern Restaurant,Indian Restaurant,Thai Restaurant,Gastropub,Supermarket,Grocery Store,Gym,Liquor Store
7,M1L,Park,Beach,Middle Eastern Restaurant,Café,Gastropub,Breakfast Spot,Thai Restaurant,Sandwich Place,Burger Joint,Indie Movie Theater
8,M1M,Park,Beach,Indian Restaurant,Middle Eastern Restaurant,Burger Joint,Thai Restaurant,Coffee Shop,Sandwich Place,Ice Cream Shop,Indie Movie Theater
9,M1N,Beach,Park,Breakfast Spot,Café,Indian Restaurant,Gastropub,Ice Cream Shop,Coffee Shop,Pub,French Restaurant


In [75]:
# ###
# Make the column name postal code same on both the dfs ie df and top_Venues_df

# then use
#     pd.merge(df1, df2, on = ['postal_code'], how = 'inner')
    
# but make sure that the column name postalcode is same in both the data frames! that's necessary

### Merging these dfs : df and top_venues_df on the basis of a common column postalcode

In [78]:
#Making the column names of postal code same on both the dfs

In [79]:
top_venues_df.columns

Index(['Postal_Code', '1_venue', '2_venue', '3_venue', '4_venue', '5_venue',
       '6_venue', '7_venue', '8_venue', '9_venue', '10_venue'],
      dtype='object')

In [80]:
cols = ['PostalCode'] + top_venues_df.columns.tolist()[1:]
top_venues_df.columns = cols

In [81]:
cols = ['PostalCode'] + df_grouped.columns.tolist()[1:]
df_grouped.columns = cols
df_grouped.columns.tolist()[:5]

['PostalCode',
 'Afghan Restaurant',
 'Airport Lounge',
 'American Restaurant',
 'Aquarium']

In [83]:
df_merged = pd.merge(df, top_venues_df, on = ['PostalCode'])
df_merged.head()

Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude,1_venue,2_venue,3_venue,4_venue,5_venue,6_venue,7_venue,8_venue,9_venue,10_venue
0,M3A,North York,Parkwoods,43.753259,-79.329656,Middle Eastern Restaurant,Supermarket,Gym / Fitness Center,Burger Joint,Mediterranean Restaurant,Caribbean Restaurant,Mexican Restaurant,Breakfast Spot,Steakhouse,Indian Restaurant
1,M4A,North York,Victoria Village,43.725882,-79.315572,Middle Eastern Restaurant,Gym / Fitness Center,Café,Park,Sandwich Place,Afghan Restaurant,Coffee Shop,Pastry Shop,Pet Store,Greek Restaurant
2,M5A,Downtown Toronto,"Harbourfront,Regent Park",43.65426,-79.360636,Coffee Shop,Café,Japanese Restaurant,Park,Restaurant,Farmers Market,Hotel,Museum,Mediterranean Restaurant,Sandwich Place
3,M6A,North York,"Lawrence Heights,Lawrence Manor",43.718518,-79.464763,Clothing Store,Furniture / Home Store,Café,Liquor Store,Vietnamese Restaurant,Bakery,Cosmetics Shop,Burger Joint,Boutique,Breakfast Spot
4,M7A,Queen's Park,Queen's Park,43.662301,-79.389494,Café,Coffee Shop,Hotel,Concert Hall,Pizza Place,Japanese Restaurant,Record Shop,Sporting Goods Shop,Ice Cream Shop,Organic Grocery


In [84]:
for i in range(len(df)):
    row = df.loc[i]
    postal_code_df = row['PostalCode']
    lat_df = row['Latitude']
    lng_df = row['Longitude']
    
    i = df_merged[df_merged.PostalCode == postal_code_df].index[0]
    lat_merge_df = df_merged.loc[i]['Latitude']
    lng_merge_df = df_merged.loc[i]['Longitude']
    if lat_df != lat_merge_df or lng_df != lng_merge_df:
        print('The two dfs are not joined properly')

### Visualizing the df_merged on a folium map!

In [85]:
import folium

In [86]:
geolocator = Nominatim(user_agent = 'app')
address = 'Toronto, Canada'
while True:
    try:
        location = geolocator.geocode(address)
        break
    except:
        pass

In [87]:
toronto_map = folium.Map(location = [location.latitude, location.longitude], zoom_start = 10)

### Merging the labels of clusters with the df_merged

In [89]:
if not 'Label' in df_merged.columns:
    df_merged = df_label.merge(df_merged, on = 'PostalCode')
    df_merged.head()

### Plotting the markers on the map

In [90]:
from colour import Color

In [91]:
r = Color('blue', hue = 0)

In [93]:
geolocator = Nominatim(user_agent = 'app')
address = 'Toronto, Canada'
while True:
    try:
        location = geolocator.geocode(address)
        if type(location)!= type(None):
            break
    except:
        pass
coordinates = location.latitude, location.longitude

In [94]:
fill_colors = ['blue', 'red', 'orange', 'green', 'purple']
colors = ['#1b998b',
'#faf6ff',
'#decdf5',
'#656176',
'#534d56']

In [95]:
toronto_map = folium.Map(location = coordinates, zoom_start = 10)

for i in range(len(df_merged)):
    row = df_merged.loc[i]
    postal_code = row['PostalCode']
    label = row['Label']
    borough = row['Borough']
    text = f'{postal_code}, label:{int(label)}'
    coordinates = (row['Latitude'], row['Longitude'])
    
    #Since label ranges from 0 to 4, the same indexes as the colors arr. We can use labels as indexes to select the color
    color = colors[int(label)]
    fill_color = fill_colors[int(label)]
    
    
    
    
    folium.CircleMarker(coordinates, popup = text, color = color, fill = True, fill_color = fill_color, fill_opacity = .4).add_to(toronto_map)
    

In [96]:
toronto_map

### Analyzing Clusters

### First, we'll need a function that will return the most occuring venue categories and their count. So, that we can see what kind of venue categories are famous for a particular cluster

In [97]:
def get_most_common_cats(df):
    '''
    input : A data frame which have m postal codes x n venue categories, each column of a data frame will represent
    a venue category for different neighborhoods / postaal codes
    
    d is a dictionary whose key will be category and whose value will be number of times that category appeared
    in different rows
    
    df will be a pandas data frame with different categoryies and their counts. Categories and counts are columns!
    '''
    
    d = {}
    for i in range(len(df)):
        row = df.loc[i]
        for category in row:
            if not category in d:
                d[category] = 1
            
            elif category in d:
                d[category] = d[category] + 1
                
    
    df = pd.DataFrame([[category, count] for category, count in d.items()])
    df.columns = ['Category', 'Count']
    
    
    return df
    

In [98]:
L = [str(i) + '_venue' for i in range(1, 11)]

### Cluster 1

In [99]:
c1 = df_merged[df_merged.Label == 0]

### Let's find the most common venues in this cluster

In [100]:
common_cat_1 = get_most_common_cats(c1[L].reset_index(drop = True))

In [101]:
common_cat_1.sort_values('Count', ascending = False)

Unnamed: 0,Category,Count
2,Burger Joint,10
5,Indian Restaurant,10
1,Park,9
3,Caribbean Restaurant,8
15,Coffee Shop,6
13,Pub,5
0,Zoo Exhibit,4
16,Steakhouse,4
17,Gym,4
4,Zoo,4


### Cluster 2

In [102]:
c2 = df_merged[df_merged['Label'] == 1]

### Getting most frequent categories of this cluster

In [103]:
common_cat_2 = get_most_common_cats(c2[L].reset_index(drop = True))

In [104]:
common_cat_2.sort_values('Count', ascending = False)

Unnamed: 0,Category,Count
6,Bakery,13
2,Coffee Shop,10
0,Café,9
1,Italian Restaurant,9
3,Park,8
7,Burger Joint,7
31,Seafood Restaurant,6
10,Sandwich Place,5
27,Burrito Place,5
15,Pizza Place,5


### Cluster 3

In [105]:
c3 = df_merged[df_merged['Label'] == 2]

### Most frequent categories of this cluster

In [106]:
common_cat_3 = get_most_common_cats(c3[L].reset_index(drop = True))

In [107]:
common_cat_3

Unnamed: 0,Category,Count
0,Coffee Shop,15
1,Park,12
2,Brewery,1
3,Café,13
4,Farmers Market,10
5,Pizza Place,8
6,Japanese Restaurant,13
7,Restaurant,13
8,Hotel,15
9,Bakery,1


### Cluster 4

In [108]:
c4 = df_merged[df_merged['Label'] == 3]

### Getting most frequent categories from this cluster

In [109]:
common_cat_4 = get_most_common_cats(c4[L].reset_index(drop = True))

In [110]:
common_cat_4.sort_values('Count', ascending = False)

Unnamed: 0,Category,Count
2,Coffee Shop,20
16,Japanese Restaurant,18
27,Furniture / Home Store,14
5,Liquor Store,13
9,Middle Eastern Restaurant,13
1,Caribbean Restaurant,12
3,Supermarket,10
6,Burger Joint,10
31,Steakhouse,10
47,Clothing Store,9


### Cluster 5

In [111]:
c5 = df_merged[df_merged['Label'] == 4]

### Getting most frequent cateogries of this cluster

In [112]:
common_cat_5 = get_most_common_cats(c5[L].reset_index(drop = True))

In [113]:
common_cat_5.sort_values('Count',ascending = False)

Unnamed: 0,Category,Count
0,Park,32
3,Café,31
12,Coffee Shop,24
10,Indian Restaurant,18
20,Brewery,16
16,Grocery Store,14
23,Pizza Place,13
4,Gastropub,10
22,BBQ Joint,9
46,Bar,9


# Conclusion: 

#### In week 3, we've done a lot of things
1. WE've gotten the data of postal codes through beautiful soup library
2. Clean the data
3. Get the location coordinates using the google api
4. Done the similar cluster analyzes and cluster the data using the common n venues as the features for clustering
5. Visualize the data with the folium maps
6. Found out which venues categories are the most famous ones for a given cluster
    The results of which are stored in common_cat_1 to common_cat_5

### Cluster Representation with most common venue categories

We have 5 different clusters. Each cluster is represented by some kind of common  / popular venues. These are:


### For cluster 1

1. Burger Joints
2. Indian Restaurant
3. Park
4. Caribbean Restaurant
5. Coffee Shops

### For cluster 2

1. Bakery
2. Coffee Shop
3. Cafe
4. Indian Restaurant
5. Park

### For cluster 3

1. Coffee Shop
2. Hotel
3. Cafe
4. Japanese Restuarant
5. Restaurant

### For Cluster 4

1. Coffee Shop
2. Japanese Restaurant
3. Furniture / HOme store
4. Liquor Store
5. Middle Eastern Restaurants

### For Cluster 5

1. Park
2. Cafe
3. Coffee Shop
4. Indian Restaurant
5. Brewery