# A Recommender System for Organic Produce Distribution Warehouse

The purpose of this project is to find the best neighborhoods in the Scarborough borough of Toronto to open a distribution center. Restaraunts are the target customers for the organic produce so restaraunt density is the key metric.

In [1]:
# import libraries
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import requests
import json
from pandas.io.json import json_normalize

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# !conda install -c conda-forge geopy --yes # uncomment this line if you haven't completed the Foursquare API lab
import geopy.geocoders # convert an address into latitude and longitude values

# !conda install -c conda-forge folium=0.5.0 --yes # uncomment this line if you haven't completed the Foursquare API lab
import folium # map rendering library

#### Postal Codes in Toronto

In [5]:
# Loading the dataset of postal codes in Toronto
# This dataset was created in week 3.
df_toronto = pd.read_csv('toronto_data.csv')
df_toronto.head()

Unnamed: 0.1,Unnamed: 0,PostalCode,Borough,Neighborhood,Latitude,Longitude
0,0,M5G,Downtown Toronto,Central Bay Street,43.657952,-79.387383
1,1,M2H,North York,Hillcrest Village,43.803762,-79.363452
2,2,M4B,East York,"Parkview Hill, Woodbine Gardens",43.706397,-79.309937
3,3,M1J,Scarborough,Scarborough Village,43.744734,-79.239476
4,4,M4G,East York,Leaside,43.70906,-79.363452


In [53]:
df_toronto = df_toronto.reset_index(drop=True).drop(columns = 'Unnamed: 0')

In [55]:
# Some rows had multiple neighborhoods. Split rows that have multiple neighborhoods so each row is a separate neighborhood
df_toronto['Neighborhood'] = df_toronto['Neighborhood'].str.split(',')
df_toronto = (df_toronto
 .set_index(['PostalCode','Borough','Latitude','Longitude'])['Neighborhood']
 .apply(pd.Series)
 .stack()
 .reset_index()
 .drop('level_4', axis=1)
 .rename(columns={0:'Neighborhood'}))
df_toronto

Unnamed: 0,PostalCode,Borough,Latitude,Longitude,Neighborhood
0,M5G,Downtown Toronto,43.657952,-79.387383,Central Bay Street
1,M2H,North York,43.803762,-79.363452,Hillcrest Village
2,M4B,East York,43.706397,-79.309937,Parkview Hill
3,M4B,East York,43.706397,-79.309937,Woodbine Gardens
4,M1J,Scarborough,43.744734,-79.239476,Scarborough Village
5,M4G,East York,43.70906,-79.363452,Leaside
6,M4M,East Toronto,43.659526,-79.340923,Studio District
7,M1R,Scarborough,43.750071,-79.295849,Wexford
8,M1R,Scarborough,43.750071,-79.295849,Maryvale
9,M9V,Etobicoke,43.739416,-79.588437,South Steeles


#### Create a Map of Toronto Using Postal Codes

In [56]:
# Latitude and longtitude are manually extracted via google search
toronto_latitude = 43.6932; toronto_longitude = -79.3832
map_toronto = folium.Map(location = [toronto_latitude, toronto_longitude], zoom_start = 10.7)

# add markers to map
for lat, lng, borough, neighborhood in zip(df_toronto['Latitude'], df_toronto['Longitude'], df_toronto['Borough'], df_toronto['Neighborhood']):
    label = '{}, {}'.format(neighborhood, borough)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7).add_to(map_toronto)  
    

map_toronto

#### Focus on the "Scarorough" Borough in Toronto

In [57]:
# select only neighborhoods in the "Scarborough" borough.
scarborough_data = df_toronto[df_toronto['Borough'] == 'Scarborough']
scarborough_data.head()

Unnamed: 0,PostalCode,Borough,Latitude,Longitude,Neighborhood
4,M1J,Scarborough,43.744734,-79.239476,Scarborough Village
7,M1R,Scarborough,43.750071,-79.295849,Wexford
8,M1R,Scarborough,43.750071,-79.295849,Maryvale
25,M1B,Scarborough,43.806686,-79.194353,Malvern
26,M1B,Scarborough,43.806686,-79.194353,Rouge


#### Create Map of Scarborough and Its Neighborhoods

In [58]:
address_scar = 'Scarborough, Toronto'
latitude_scar = 43.773077
longitude_scar = -79.257774
print('The geograpical coordinate of "Scarborough" are: {}, {}.'.format(latitude_scar, longitude_scar))

map_Scarborough = folium.Map(location=[latitude_scar, longitude_scar], zoom_start=11.5)

# add markers to map
for lat, lng, label in zip(scarborough_data['Latitude'], scarborough_data['Longitude'], scarborough_data['Neighborhood']):
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius = 10,
        popup = label,
        color ='blue',
        fill = True,
        fill_color = '#3186cc',
        fill_opacity = 0.7).add_to(map_Scarborough)  
    
map_Scarborough

The geograpical coordinate of "Scarborough" are: 43.773077, -79.257774.


#### Prepare function to pull needed data from Fourscare API

In [59]:
def foursquare_crawler (postal_code_list, neighborhood_list, lat_list, lng_list, LIMIT = 500, radius = 1000):
    result_ds = []
    counter = 0
    for postal_code, neighborhood, lat, lng in zip(postal_code_list, neighborhood_list, lat_list, lng_list):
         
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, CLIENT_SECRET, VERSION, 
            lat, lng, radius, LIMIT)
            
        # GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        tmp_dict = {}
        tmp_dict['Postal Code'] = postal_code; tmp_dict['Neighborhood(s)'] = neighborhood; 
        tmp_dict['Latitude'] = lat; tmp_dict['Longitude'] = lng;
        tmp_dict['Crawling_result'] = results;
        result_ds.append(tmp_dict)
        counter += 1
        print('{}.'.format(counter))
        print('Data is Obtained, for the Postal Code {} (and Neighborhoods {}) SUCCESSFULLY.'.format(postal_code, neighborhood))
    return result_ds;

In [60]:
CLIENT_ID = 'TNK1X5SMMHZAWJ4JIUD4VQBY0IQJF2BNSIABFFL2IA2BTSLS'
CLIENT_SECRET = 'OAUBZBKLYDTWGKO0ZV3XEKNB5L4NXAYGKVIDGOMMT1SG1Q3N'
VERSION = '20180605' # Foursquare API version

#### Venues by Neighborhood inside "Scarborough"

In [61]:
print('Crawling different neighborhoods inside "Scarborough"')
Scarborough_foursquare_dataset = foursquare_crawler(list(scarborough_data['PostalCode']),
                                                   list(scarborough_data['Neighborhood']),
                                                   list(scarborough_data['Latitude']),
                                                   list(scarborough_data['Longitude']),)

Crawling different neighborhoods inside "Scarborough"
1.
Data is Obtained, for the Postal Code M1J (and Neighborhoods Scarborough Village) SUCCESSFULLY.
2.
Data is Obtained, for the Postal Code M1R (and Neighborhoods Wexford) SUCCESSFULLY.
3.
Data is Obtained, for the Postal Code M1R (and Neighborhoods  Maryvale) SUCCESSFULLY.
4.
Data is Obtained, for the Postal Code M1B (and Neighborhoods Malvern) SUCCESSFULLY.
5.
Data is Obtained, for the Postal Code M1B (and Neighborhoods  Rouge) SUCCESSFULLY.


#### Clean the Data Received from Foursquare API

In [62]:
# This function will extract each venue for every neighborhood in the data

def get_venue_dataset(foursquare_dataset):
    result_df = pd.DataFrame(columns = ['Postal Code', 'Neighborhood', 
                                           'Neighborhood Latitude', 'Neighborhood Longitude',
                                          'Venue', 'Venue Summary', 'Venue Category', 'Distance'])
    # print(result_df)
    
    for neigh_dict in foursquare_dataset:
        postal_code = neigh_dict['Postal Code']; neigh = neigh_dict['Neighborhood(s)']
        lat = neigh_dict['Latitude']; lng = neigh_dict['Longitude']
        print('Number of Venuse in Coordination "{}" Posal Code and "{}" Negihborhood(s) is:'.format(postal_code, neigh))
        print(len(neigh_dict['Crawling_result']))
        
        for venue_dict in neigh_dict['Crawling_result']:
            summary = venue_dict['reasons']['items'][0]['summary']
            name = venue_dict['venue']['name']
            dist = venue_dict['venue']['location']['distance']
            cat =  venue_dict['venue']['categories'][0]['name']
            
            result_df = result_df.append({'Postal Code': postal_code, 'Neighborhood': neigh, 
                              'Neighborhood Latitude': lat, 'Neighborhood Longitude':lng,
                              'Venue': name, 'Venue Summary': summary, 
                              'Venue Category': cat, 'Distance': dist}, ignore_index = True)
            # print(result_df)
    
    return(result_df)

In [63]:
scarborough_venues = get_venue_dataset(Scarborough_foursquare_dataset)

Number of Venuse in Coordination "M1J" Posal Code and "Scarborough Village" Negihborhood(s) is:
12
Number of Venuse in Coordination "M1R" Posal Code and "Wexford" Negihborhood(s) is:
32
Number of Venuse in Coordination "M1R" Posal Code and " Maryvale" Negihborhood(s) is:
32
Number of Venuse in Coordination "M1B" Posal Code and "Malvern" Negihborhood(s) is:
17
Number of Venuse in Coordination "M1B" Posal Code and " Rouge" Negihborhood(s) is:
17


#### View Neighborhood Venues

In [64]:
scarborough_venues.head()

Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
0,M1J,Scarborough Village,43.744734,-79.239476,Diamond Pizza,This spot is popular,Pizza Place,530
1,M1J,Scarborough Village,43.744734,-79.239476,Tim Hortons,This spot is popular,Coffee Shop,640
2,M1J,Scarborough Village,43.744734,-79.239476,Dairy Queen,This spot is popular,Ice Cream Shop,617
3,M1J,Scarborough Village,43.744734,-79.239476,Dairy Queen,This spot is popular,Ice Cream Shop,607
4,M1J,Scarborough Village,43.744734,-79.239476,Subway,This spot is popular,Sandwich Place,749


#### Save a Cleaned Version of the Data

In [65]:
# The purpose of this is that the Foursquare API will not have to be utilized again
scarborough_venues.to_csv('scarborough_venues.csv')

#### Load the Data

In [2]:
scarborough_venues = pd.read_csv('scarborough_venues.csv')

#### Summary Information Regarding Neighborhoods inside "Scarborough"

In [3]:
neigh_list = list(scarborough_venues['Neighborhood'].unique())
print('Number of Neighborhoods inside Scarborough:')
print(len(neigh_list))
print('List of Neighborhoods inside Scarborough:')
neigh_list

Number of Neighborhoods inside Scarborough:
5
List of Neighborhoods inside Scarborough:


['Scarborough Village', 'Wexford', ' Maryvale', 'Malvern', ' Rouge']

In [4]:
neigh_venue_summary = scarborough_venues.groupby('Neighborhood').count()
neigh_venue_summary.drop(columns = ['Unnamed: 0']).head()

Unnamed: 0_level_0,Postal Code,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Venue Category,Distance
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Maryvale,32,32,32,32,32,32,32
Rouge,17,17,17,17,17,17,17
Malvern,17,17,17,17,17,17,17
Scarborough Village,12,12,12,12,12,12,12
Wexford,32,32,32,32,32,32,32


In [5]:
print('There are {} unique categories.'.format(len(scarborough_venues['Venue Category'].unique())))

print('List of categories:')
list(scarborough_venues['Venue Category'].unique())

There are 39 unique categories.
List of categories:


['Pizza Place',
 'Coffee Shop',
 'Ice Cream Shop',
 'Sandwich Place',
 'Restaurant',
 'Grocery Store',
 'Convenience Store',
 'Japanese Restaurant',
 'Bowling Alley',
 'Train Station',
 'Fast Food Restaurant',
 'Vietnamese Restaurant',
 'Fish Market',
 'Bakery',
 'African Restaurant',
 'Korean Restaurant',
 'Supermarket',
 'Middle Eastern Restaurant',
 'Seafood Restaurant',
 'Indian Restaurant',
 'Breakfast Spot',
 'Burger Joint',
 'Asian Restaurant',
 'Badminton Court',
 'Gas Station',
 'Rental Car Location',
 'Intersection',
 'Furniture / Home Store',
 'Flea Market',
 'Soccer Field',
 'Indian Chinese Restaurant',
 'Spa',
 'Bank',
 'Paper / Office Supplies Store',
 'Caribbean Restaurant',
 'Martial Arts Dojo',
 'Trail',
 'Auto Workshop',
 'Chinese Restaurant']

#### One-Hot Encoding the Categroies

In [6]:
scarborough_onehot = pd.get_dummies(data = scarborough_venues, drop_first  = False, 
                              prefix = "", prefix_sep = "", columns = ['Venue Category'])
scarborough_onehot.head()

Unnamed: 0.1,Unnamed: 0,Postal Code,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Summary,Distance,African Restaurant,Asian Restaurant,Auto Workshop,Badminton Court,Bakery,Bank,Bowling Alley,Breakfast Spot,Burger Joint,Caribbean Restaurant,Chinese Restaurant,Coffee Shop,Convenience Store,Fast Food Restaurant,Fish Market,Flea Market,Furniture / Home Store,Gas Station,Grocery Store,Ice Cream Shop,Indian Chinese Restaurant,Indian Restaurant,Intersection,Japanese Restaurant,Korean Restaurant,Martial Arts Dojo,Middle Eastern Restaurant,Paper / Office Supplies Store,Pizza Place,Rental Car Location,Restaurant,Sandwich Place,Seafood Restaurant,Soccer Field,Spa,Supermarket,Trail,Train Station,Vietnamese Restaurant
0,0,M1J,Scarborough Village,43.744734,-79.239476,Diamond Pizza,This spot is popular,530,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,M1J,Scarborough Village,43.744734,-79.239476,Tim Hortons,This spot is popular,640,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,M1J,Scarborough Village,43.744734,-79.239476,Dairy Queen,This spot is popular,617,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,M1J,Scarborough Village,43.744734,-79.239476,Dairy Queen,This spot is popular,607,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,M1J,Scarborough Village,43.744734,-79.239476,Subway,This spot is popular,749,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


#### Manually subsetting restaraunts that would be likely buyers of organic produce

In [7]:
# This list is created manually from all the restaraunt venues in the foursquare API categories
important_list_of_features = [
 
 'Neighborhood',
 'Neighborhood Latitude',
 'Neighborhood Longitude',
 'Pizza Place',
 'Sandwich Place',
 'Restaurant',
 'Japanese Restaurant',
 'Fast Food Restaurant',
 'Vietnamese Restaurant',
 'Bakery',
 'African Restaurant',
 'Korean Restaurant',
 'Middle Eastern Restaurant',
 'Seafood Restaurant',
 'Indian Restaurant',
 'Breakfast Spot',
 'Burger Joint',
 'Asian Restaurant',
 'Indian Chinese Restaurant',
 'Caribbean Restaurant',
 'Chinese Restaurant']

#### Update the One-Hot Encoded DataFrame and Group the Data by Neighborhoods

In [8]:
scarborough_onehot = scarborough_onehot[important_list_of_features].drop(
    columns = ['Neighborhood Latitude', 'Neighborhood Longitude']).groupby(
    'Neighborhood').sum()

scarborough_onehot.head()

Unnamed: 0_level_0,Pizza Place,Sandwich Place,Restaurant,Japanese Restaurant,Fast Food Restaurant,Vietnamese Restaurant,Bakery,African Restaurant,Korean Restaurant,Middle Eastern Restaurant,Seafood Restaurant,Indian Restaurant,Breakfast Spot,Burger Joint,Asian Restaurant,Indian Chinese Restaurant,Caribbean Restaurant,Chinese Restaurant
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
Maryvale,3,0,1,0,0,1,1,1,1,4,1,1,1,2,1,1,0,0
Rouge,0,1,2,0,2,0,0,0,0,0,0,0,0,0,0,0,1,1
Malvern,0,1,2,0,2,0,0,0,0,0,0,0,0,0,0,0,1,1
Scarborough Village,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
Wexford,3,0,1,0,0,1,1,1,1,4,1,1,1,2,1,1,0,0


#### Integrating Different Restaurants and Different Joints

This assumption was made due to not having a very large dataset of restaraunts

In [9]:
feat_name_list = list(scarborough_onehot.columns)
restaurant_list = []

for counter, value in enumerate(feat_name_list):
    if value.find('Restaurant') != (-1):
        restaurant_list.append(value)
        
scarborough_onehot['Total Restaurants'] = scarborough_onehot[restaurant_list].sum(axis = 1)
scarborough_onehot = scarborough_onehot.drop(columns = restaurant_list)

feat_name_list = list(scarborough_onehot.columns)
joint_list = []

for counter, value in enumerate(feat_name_list):
    if value.find('Joint') != (-1):
        joint_list.append(value)
        
scarborough_onehot['Total Joints'] = scarborough_onehot[joint_list].sum(axis = 1)
scarborough_onehot = scarborough_onehot.drop(columns = joint_list)

#### Vizualize DataFrame To Verify Readiness for ML Algorithm.

In [10]:
scarborough_onehot

Unnamed: 0_level_0,Pizza Place,Sandwich Place,Bakery,Breakfast Spot,Total Restaurants,Total Joints
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Maryvale,3,0,1,1,12,2
Rouge,0,1,0,0,6,0
Malvern,0,1,0,0,6,0
Scarborough Village,1,1,0,0,3,0
Wexford,3,0,1,1,12,2


#### Run k-means to Cluster Neighborhoods

In [20]:
# import k-means from clustering
from sklearn.cluster import KMeans

# train k-means
kmeans = KMeans(n_clusters = 3, random_state = 0).fit(scarborough_onehot)

In [21]:
# View centers of clusters
means_df = pd.DataFrame(kmeans.cluster_centers_)
means_df.columns = scarborough_onehot.columns
means_df.index = ['G1','G2','G3']
means_df['Total Sum'] = means_df.sum(axis = 1)
means_df.sort_values(axis = 0, by = ['Total Sum'], ascending=False)

Unnamed: 0,Pizza Place,Sandwich Place,Bakery,Breakfast Spot,Total Restaurants,Total Joints,Total Sum
G1,3.0,0.0,1.0,1.0,12.0,2.0,19.0
G3,0.0,1.0,0.0,0.0,6.0,0.0,7.0
G2,1.0,1.0,0.0,0.0,3.0,0.0,5.0


#### Display the Corresponding Group Label For Each Neighborhood.

In [22]:
neigh_summary = pd.DataFrame([scarborough_onehot.index, 1 + kmeans.labels_]).T
neigh_summary.columns = ['Neighborhood', 'Group']
neigh_summary

Unnamed: 0,Neighborhood,Group
0,Maryvale,1
1,Rouge,3
2,Malvern,3
3,Scarborough Village,2
4,Wexford,1


#### Analysis

In [23]:
# Groups from best to worst based on above DF
neigh_summary[neigh_summary['Group'] == 1] # best neighborhood

Unnamed: 0,Neighborhood,Group
0,Maryvale,1
4,Wexford,1


In [24]:
name_of_neigh = list(neigh_summary[neigh_summary['Group'] == 1]['Neighborhood'])[0]
scarborough_venues[scarborough_venues['Neighborhood'] == name_of_neigh].iloc[0,1:5].to_dict()

{'Postal Code': 'M1R',
 'Neighborhood': ' Maryvale',
 'Neighborhood Latitude': 43.750071500000004,
 'Neighborhood Longitude': -79.2958491}

In [25]:
neigh_summary[neigh_summary['Group'] == 3] # 2nd best

Unnamed: 0,Neighborhood,Group
1,Rouge,3
2,Malvern,3


In [26]:
name_of_neigh = list(neigh_summary[neigh_summary['Group'] == 3]['Neighborhood'])[0]
scarborough_venues[scarborough_venues['Neighborhood'] == name_of_neigh].iloc[0,1:5].to_dict()

{'Postal Code': 'M1B',
 'Neighborhood': ' Rouge',
 'Neighborhood Latitude': 43.806686299999996,
 'Neighborhood Longitude': -79.19435340000003}

In [27]:
neigh_summary[neigh_summary['Group'] == 2] # 3rd best

Unnamed: 0,Neighborhood,Group
3,Scarborough Village,2


In [28]:
name_of_neigh = list(neigh_summary[neigh_summary['Group'] == 2]['Neighborhood'])[0]
scarborough_venues[scarborough_venues['Neighborhood'] == name_of_neigh].iloc[0,1:5].to_dict()

{'Postal Code': 'M1J',
 'Neighborhood': 'Scarborough Village',
 'Neighborhood Latitude': 43.7447342,
 'Neighborhood Longitude': -79.23947609999998}

Group 1, consisting of the neighborhoods Maryvale and Wexford, has the highest density of restaraunts and would be the best location for a organic produce distribution center in Scarborough.