# Capstone Project File

I am not importing the libraries here to keep track of the purpose of various libraries, will import them based on the need

In [2]:
url = 'https://en.wikipedia.org/wiki/List_of_postal_codes_of_Canada:_M'

In [3]:
import requests # To request the page text 

html_file = requests.get(url).text

In [4]:
#Using BeautifulSoup to parse this file

from bs4 import BeautifulSoup

In [5]:
soup = BeautifulSoup(html_file,'lxml')

## Finding the table in html file

In [6]:
html_table = soup.find('tbody')
features = []
total_features = html_table.find_all('th')
for feature in total_features:
    features.append(feature.text.strip('\n')) 
    
features

['Postcode', 'Borough', 'Neighbourhood']

In [7]:
# Now we have list of features in the data, lets create  a data frame

import pandas as pd
df = pd.DataFrame(columns=features)
df

Unnamed: 0,Postcode,Borough,Neighbourhood


In [8]:
html_table_allrows = html_table.find_all('tr')

for r_index, rows in enumerate(html_table_allrows):
    columns = rows.find_all('td') #Finding the total data entries in a single row
    for col,feat in zip(columns,features):
        df.loc[r_index,feat] = col.text.strip('\n')
df.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
1,M1A,Not assigned,Not assigned
2,M2A,Not assigned,Not assigned
3,M3A,North York,Parkwoods
4,M4A,North York,Victoria Village
5,M5A,Downtown Toronto,Harbourfront


## Cleaning the data


In [9]:
#Ignore cells with a Borough that is Not assigned.

df1=df[df.Borough != 'Not assigned']
df1.reset_index(inplace=True)
df1.drop('index',axis=1,inplace=True)
df1.head(10)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,Postcode,Borough,Neighbourhood
0,M3A,North York,Parkwoods
1,M4A,North York,Victoria Village
2,M5A,Downtown Toronto,Harbourfront
3,M5A,Downtown Toronto,Regent Park
4,M6A,North York,Lawrence Heights
5,M6A,North York,Lawrence Manor
6,M7A,Queen's Park,Not assigned
7,M9A,Etobicoke,Islington Avenue
8,M1B,Scarborough,Rouge
9,M1B,Scarborough,Malvern


In [10]:
# Combining the Neighbourhoods having same Postcode
df_clean = df1.groupby(['Postcode','Borough'])['Neighbourhood'].apply(','.join).reset_index()
df_clean.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


##  Assigning Borough where neighbourhood is not assigned

In [11]:
t=(df_clean.loc[df_clean['Neighbourhood']=='Not assigned']).index.values
for ii in t:
    df_clean.replace(to_replace=df_clean.iloc[ii]['Neighbourhood'], value=df_clean.iloc[ii]['Borough'],inplace=True)
df_clean.head()

Unnamed: 0,Postcode,Borough,Neighbourhood
0,M1B,Scarborough,"Rouge,Malvern"
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union"
2,M1E,Scarborough,"Guildwood,Morningside,West Hill"
3,M1G,Scarborough,Woburn
4,M1H,Scarborough,Cedarbrae


In [12]:
print(df_clean.shape)

(103, 3)


## Getting the Latitude and Longitude using the given file as geocoder seems to not working propoerly

In [13]:
df_latlong = pd.read_csv('Geospatial_Coordinates.csv')
df_latlong.head()

Unnamed: 0,Postal Code,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [14]:
df_latlong.rename(columns={'Postal Code':'Postcode'},inplace=True)
df_latlong.head()

Unnamed: 0,Postcode,Latitude,Longitude
0,M1B,43.806686,-79.194353
1,M1C,43.784535,-79.160497
2,M1E,43.763573,-79.188711
3,M1G,43.770992,-79.216917
4,M1H,43.773136,-79.239476


In [15]:
df_latlong.shape  # Just to make sure the entries in the csv file

(103, 3)

In [16]:
df_final = pd.merge(df_clean,df_latlong,on='Postcode')
df_final.head()

Unnamed: 0,Postcode,Borough,Neighbourhood,Latitude,Longitude
0,M1B,Scarborough,"Rouge,Malvern",43.806686,-79.194353
1,M1C,Scarborough,"Highland Creek,Rouge Hill,Port Union",43.784535,-79.160497
2,M1E,Scarborough,"Guildwood,Morningside,West Hill",43.763573,-79.188711
3,M1G,Scarborough,Woburn,43.770992,-79.216917
4,M1H,Scarborough,Cedarbrae,43.773136,-79.239476


## Lets see how many postcodes are there in each Borough

In [17]:
df_final.groupby(['Borough']).count().Postcode

Borough
Central Toronto      9
Downtown Toronto    18
East Toronto         5
East York            5
Etobicoke           12
Mississauga          1
North York          24
Queen's Park         1
Scarborough         17
West Toronto         6
York                 5
Name: Postcode, dtype: int64

### North York has the maximum Post codes, lets select North York as a central point and analyze its surroundings 

In [18]:
# Finding the coordinates of the North York

!conda install -c conda-forge geopy --yes # Installing geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

central_point = 'North York, Ontario, Canada'
geolocator = Nominatim(user_agent="ny_explorer")
location = geolocator.geocode(central_point)
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of North York are {}, {}.'.format(latitude, longitude))

Collecting package metadata (current_repodata.json): done
Solving environment: done

# All requested packages already installed.



GeocoderServiceError: [Errno 65] No route to host

In [None]:
#Creating a data frame containing only North York data
df_NY = df_final[df_final['Borough'] == 'North York'].reset_index().drop('index',axis=1)
df_NY.head()

In [None]:
# Lets show the points on the map
!conda install -c conda-forge folium=0.5.0 --yes
import folium 

In [None]:
map_NY = folium.Map(location=[latitude,longitude],zoom_start=11)
for lat, lng, label in zip(df_NY['Latitude'], df_NY['Longitude'], df_NY['Postcode']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_NY)
    
map_NY

## Defining FourSquare Credentials

In [None]:
# @hidden_cell
CLIENT_ID = '3ZOLFC23B0ORXRPPPQ2LLXZFJFNRGD1VB2VSQR02VIAXBYIZ' # your Foursquare ID
CLIENT_SECRET = 'YXQQI3I3FLOCI3UZQ5ME0G1KDDENJUYHFTI5D5JISLWSV4YD' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

In [None]:
neighborhood_latitude = df_NY.loc[0, 'Latitude'] # neighborhood latitude value
neighborhood_longitude = df_NY.loc[0, 'Longitude'] # neighborhood longitude value

neighborhood_name = df_NY.loc[0, 'Postcode'] # neighborhood name

print('Latitude and longitude values of {} are {}, {}.'.format(neighborhood_name, 
                                                               neighborhood_latitude, 
                                                               neighborhood_longitude))

## Now lets find the top 100 venues within M2H postcode

In [None]:
LIMIT = 100
radius = 500

url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url

In [None]:
import requests # library to request the data
results = requests.get(url).json()
results

In [None]:
venues = results['response']['groups'][0]['items']
venues

In [None]:
import json
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

In [None]:
nearby_venues = json_normalize(venues) # flatten JSON
nearby_venues

In [None]:
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]

In [None]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

In [None]:
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues.head()

In [None]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Postcode', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    return(nearby_venues)

In [None]:
NY_venues = getNearbyVenues(names=df_NY['Postcode'],
                                   latitudes=df_NY['Latitude'],
                                   longitudes=df_NY['Longitude']
                                  )

In [None]:
NY_venues.head()

In [None]:
NY_venues.groupby('Postcode').count()

In [None]:
print('There are {} uniques categories.'.format(len(NY_venues['Venue Category'].unique())))

In [None]:
# one hot encoding
NY_onehot = pd.get_dummies(NY_venues[['Venue Category']], prefix="", prefix_sep="")

# add neighborhood column back to dataframe
NY_onehot['Postcode'] = NY_venues['Postcode'] 

# move neighborhood column to the first column
fixed_columns = [NY_onehot.columns[-1]] + list(NY_onehot.columns[:-1])
NY_onehot = NY_onehot[fixed_columns]

NY_onehot.head()

In [None]:
NY_grouped = NY_onehot.groupby('Postcode').mean().reset_index()
NY_grouped

In [None]:
num_top_venues = 5

for hood in NY_grouped['Postcode']:
    print("----"+hood+"----")
    temp = NY_grouped[NY_grouped['Postcode'] == hood].T.reset_index()
    temp.columns = ['venue','freq']
    temp = temp.iloc[1:]
    temp['freq'] = temp['freq'].astype(float)
    temp = temp.round({'freq': 2})
    print(temp.sort_values('freq', ascending=False).reset_index(drop=True).head(num_top_venues))
    print('\n')