# Aditya Murthy Coursera Capstone Project Notebook - Battle of the Neighborhoods Part 2

### The data I will be using is a dataframe for all the neighborhoods in San Francisco with their latitudes and longitudes.
### Later, I will use foursqure to get venue data for each neighborhood.

First, I import the modules

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd 

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

from geopy.geocoders import Nominatim

import requests

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

print('Libraries imported.')

Libraries imported.


Then I parse through the wikipedia page and get all the neighborhoods in San Francisco and remove any incorrect values

In [2]:
URL = "https://en.wikipedia.org/wiki/List_of_neighborhoods_in_San_Francisco"
page = requests.get(URL)

soup = BeautifulSoup(page.content, "html.parser")
data_list = soup.find_all('span', attrs = {'class':"toctext"})

    #removes the non-neighborhood sections of the table of contents to create a list of the neighborhoods
df=pd.DataFrame(data_list[0:-4])
df.columns = ["Neighborhood"]
df.drop(df.index[df["Neighborhood"] == 'Sunnyside'], inplace = True)


This gets the latitude and longitude for each neighborhod

In [3]:
def getLat(neighborhood):
    address = '{} San Francisco, CA'.format(neighborhood)
    geolocator = Nominatim(user_agent="ca_explorer")
    location = geolocator.geocode(address)
    if location != None:
        latitude = location.latitude
        longitude = location.longitude
        return(latitude)
    else:
        return(float('NaN'))
df['Latitude']= df['Neighborhood'].apply(lambda x: getLat(x))

def getLong(neighborhood):
    address = '{} San Francisco, CA'.format(neighborhood)
    geolocator = Nominatim(user_agent="ca_explorer")
    location = geolocator.geocode(address)
    if location != None:
        latitude = location.latitude
        longitude = location.longitude
        return(longitude)
    else:
        return(float('NaN'))
df['Longitude']= df['Neighborhood'].apply(lambda x: getLong(x))

I created a copy of the dataset down below, since the previous step took a long time. This way, if I make any changes to the dataset and I need to revert those changes, I can simply pass this line of code to create a fresh set quickly

In [4]:
df_SF = df.copy()

This ensures that all the latitudes and longitudes are correct, and drops any incorrect values. For example, if the code gave the coordinates outside the thresholds, like the coordinates for New York City, then it would be dropped.

In [5]:
df_SF.dropna(inplace = True)
def longcheck(longcoord):
    if ((longcoord>-122.57425) and (longcoord<-122.31676)):
        return(longcoord)
    else:
        return(float('NaN'))
def latcheck(latcoord):
    if (latcoord>37.7) and (latcoord<37.84171): 
        return(latcoord)
    else:
        return(float('NaN'))

df_SF['Longitude']= df_SF['Longitude'].apply(lambda x: longcheck(float(x)))
df_SF['Latitude']= df_SF['Latitude'].apply(lambda x: latcheck(float(x)))

df_SF.dropna(inplace = True)

I generated a map of all the data values to make sure that they were all in San Francisco.

In [6]:
map_SF = folium.Map(location=[37.7749, -122.4194], zoom_start=11)
    #coordinates for San Francsico

for lat, lng, neighborhood in zip(df_SF['Latitude'], df_SF['Longitude'], df_SF['Neighborhood']):
    label = neighborhood
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_SF)  
    
map_SF

## And we're finished with the base dataset. From here, I can use foursquare to get venue data.

In [7]:
CLIENT_ID = 'XFUA0JZMI5HA3EPAENLM1G3BDAWYZNTHFPYM4GSJR55HL53R'
CLIENT_SECRET = 'JOVLLF4AP4JMCZU2KHNSJAY14E0E30A04CQQNZQIXFRTQDOI'
VERSION = '20180605'
LIMIT = 100

def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):

        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

SF_venues = getNearbyVenues(df_SF['Neighborhood'], df_SF['Latitude'], df_SF['Longitude'], radius=500)

### Creating copies of the dataset, so that I can quickly undo any changes I make to the dataframes

In [8]:
SF_venuesC = SF_venues.copy()

In [9]:
SF_venues = SF_venuesC.copy()

### One Hot encoding the dataframe and normalizing the values

In [10]:
SF_onehot = pd.get_dummies(SF_venues[['Venue Category']], prefix="", prefix_sep="")

SF_onehot['Neighborhood'] = SF_venues['Neighborhood'] 

fixed_columns = [SF_onehot.columns[-1]] + list(SF_onehot.columns[:-1])
SF_onehot = SF_onehot[fixed_columns]


SF_grouped = (SF_onehot.groupby('Neighborhood').sum()*SF_onehot.groupby('Neighborhood').mean()).reset_index()

### Creating a user profile and weighting it

In [11]:
ratings = ['Coffee Shop',	'Café', 'Park',	'Italian Restaurant',	'Liquor Store',	'Gym',	'Bakery',	'BBQ Joint',	'Indian Restaurant']
score = [100, 75, 50, 35, 25, 20, 17, 15, 10]

weight_df = SF_grouped.copy()
weight_df.drop(weight_df.index[weight_df['Neighborhood'] != 'Alamo Square'], inplace = True)
for i in range(len(score)):
    weight_df[ratings[i]] = score[i]
weight_df.drop('Neighborhood', axis=1, inplace = True)
weight_df = weight_df.applymap(lambda x: int(x))

In [12]:
SF_groupedC = SF_grouped.copy()
SF_groupedC.drop('Neighborhood', axis=1, inplace = True)

In [13]:
weight_df.index = ['Number']
weight_transposed = weight_df.transpose()

In [14]:
weight_series = weight_transposed.squeeze()


### Multiplying the weighted user profile with the normalized venue data

In [15]:
recommendations_df = (weight_series*SF_groupedC).sum(axis=1)/(weight_series.sum())

In [16]:
neighborhoodValues = SF_grouped['Neighborhood']
neighborhoodValues.reset_index(drop = True, inplace = True)
recommendation_df = pd.DataFrame(recommendations_df, columns = ['Rating'])
recommendation_df['Neighborhood'] = neighborhoodValues

In [17]:
recommendation_df.sort_values(by=['Rating'],ascending = False, inplace = True)
recommendation_df.reset_index(drop = True, inplace = True)

### Getting the latitude and longitude coordinates for the top 3 reccomended neighborhoods

In [18]:
top3 = pd.DataFrame([[0, 0, 0], [0, 0, 0], [0, 0, 0]], columns = ['Neighborhood', 'Latitude', 'Longitude',])
for i in range(3):
    top3.iloc[i] = df_SF[df['Neighborhood'] == recommendation_df['Neighborhood'][i]].copy()
    top3['Neighborhood'][i] = recommendation_df['Neighborhood'][i]
top3['Rank'] = ['', '2nd', '3rd']

  top3.iloc[i] = df_SF[df['Neighborhood'] == recommendation_df['Neighborhood'][i]].copy()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top3['Neighborhood'][i] = recommendation_df['Neighborhood'][i]


### Creating a map on Folium as the final product with recommended neighborhoods the user would want to live in

In [19]:
map_SF_top = folium.Map(location=[37.7749, -122.4194], zoom_start=13)
    #coordinates for San Francsico

for lat, lng, neighborhood, rank in zip(top3['Latitude'], top3['Longitude'], top3['Neighborhood'], top3['Rank']):
    label = '{} Best Match is: {}'.format(rank, neighborhood)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [float(lat), float(lng)],
        radius=15,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_SF_top)
    
map_SF_top