# Data

### The data I will be using is a dataframe for all the neighborhoods in San Francisco with their latitudes and longitudes.
### Later, I will use Foursquare to get venue data for each neighborhood.

First, I import the modules

In [58]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd 

# Lift pandas' display limits so DataFrames are shown with every column and
# every row instead of being truncated.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

from geopy.geocoders import Nominatim

import requests

import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium

print('Libraries imported.')

Libraries imported.


Then I parse the Wikipedia page to get all the neighborhoods in San Francisco and remove any incorrect values.

In [77]:
URL = "https://en.wikipedia.org/wiki/List_of_neighborhoods_in_San_Francisco"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")
# Table-of-contents entries are selected by their <span class="toctext"> markup.
data_list = soup.find_all('span', attrs = {'class':"toctext"})

# Removes the non-neighborhood sections of the table of contents (its last four
# entries) and keeps only the visible text of each remaining entry, so the
# column holds plain strings rather than BeautifulSoup objects.
neighborhood_names = [entry.get_text().strip() for entry in data_list[0:-4]]
if not neighborhood_names:
    # Fail with an explicit message rather than a column-length mismatch later.
    raise ValueError(
        "No table-of-contents entries found at {}; "
        "the page markup may have changed.".format(URL)
    )

df = pd.DataFrame({"Neighborhood": neighborhood_names})
# 'Sunnyside' is dropped as an incorrect value. drop() returns a new frame and
# keeps the original row labels, leaving a gap where the row was.
df = df.drop(df.index[df["Neighborhood"] == 'Sunnyside'])


This gets the latitude and longitude for each neighborhood.

In [78]:
# Geocoding results keyed by neighborhood name. Shared by getLat and getLong so
# each neighborhood triggers one geocoding request instead of two.
_location_cache = {}


def _geocode(neighborhood):
    """Return the geocoder's result for a neighborhood, or None if not found.

    The lookup address is '<neighborhood> San Francisco, CA'. The result
    (including a None "not found" result) is cached by neighborhood name, so
    repeated calls for the same name do not repeat the request.
    """
    if neighborhood not in _location_cache:
        address = '{} San Francisco, CA'.format(neighborhood)
        geolocator = Nominatim(user_agent="ca_explorer")
        _location_cache[neighborhood] = geolocator.geocode(address)
    return _location_cache[neighborhood]


def getLat(neighborhood):
    """Return the latitude geocoded for a neighborhood, or NaN if not found."""
    location = _geocode(neighborhood)
    if location is None:
        return float('NaN')
    return location.latitude


df['Latitude'] = df['Neighborhood'].apply(getLat)


def getLong(neighborhood):
    """Return the longitude geocoded for a neighborhood, or NaN if not found."""
    location = _geocode(neighborhood)
    if location is None:
        return float('NaN')
    return location.longitude


# Served from the cache filled while computing 'Latitude', so the longitude
# always comes from the same geocoding result as the latitude.
df['Longitude'] = df['Neighborhood'].apply(getLong)

I create a copy of the dataset below, since the previous step takes a long time. This way, if I make changes to the dataset and need to revert them, I can simply rerun this line of code to get a fresh copy quickly.

In [107]:
# Independent working copy of the geocoded frame; the cleaning steps below
# modify df_SF and leave df untouched.
df_SF = df.copy(deep=True)

This ensures that all the latitudes and longitudes are correct and drops any incorrect values. For example, if the geocoder returned coordinates outside the thresholds, such as the coordinates for New York City, that row would be dropped.

In [110]:
# Discard, in place, every row that has at least one missing value.
df_SF.dropna(axis=0, how='any', inplace=True)
def longcheck(longcoord, min_long=-122.57425, max_long=-122.31676):
    """Return longcoord if it lies strictly between the bounds, else NaN.

    Parameters:
        longcoord: longitude to validate.
        min_long: exclusive lower bound (default: the threshold this notebook
            uses for San Francisco).
        max_long: exclusive upper bound (default: the threshold this notebook
            uses for San Francisco).

    Returns:
        longcoord unchanged when min_long < longcoord < max_long; otherwise
        float('NaN'). A NaN input fails both comparisons and yields NaN.
    """
    if min_long < longcoord < max_long:
        return longcoord
    return float('NaN')
def latcheck(latcoord, min_lat=37.7, max_lat=37.84171):
    """Return latcoord if it lies strictly between the bounds, else NaN.

    Parameters:
        latcoord: latitude to validate.
        min_lat: exclusive lower bound (default: the threshold this notebook
            uses for San Francisco).
        max_lat: exclusive upper bound (default: the threshold this notebook
            uses for San Francisco).

    Returns:
        latcoord unchanged when min_lat < latcoord < max_lat; otherwise
        float('NaN'). A NaN input fails both comparisons and yields NaN.
    """
    if min_lat < latcoord < max_lat:
        return latcoord
    return float('NaN')

# Run each coordinate through its range check; values outside the accepted
# range come back as NaN.
df_SF['Longitude'] = df_SF['Longitude'].astype(float).apply(longcheck)
df_SF['Latitude'] = df_SF['Latitude'].astype(float).apply(latcheck)

# Remove the rows whose latitude or longitude was just replaced by NaN.
df_SF.dropna(inplace = True)

I generated a map of all the data values to make sure that they were all in San Francisco.

In [111]:
# Base map centred on the coordinates for San Francisco.
map_SF = folium.Map(location=[37.7749, -122.4194], zoom_start=11)

# Add one blue circle marker per neighborhood, with the neighborhood name as
# its popup.
for lat, lng, neighborhood in zip(
    df_SF['Latitude'], df_SF['Longitude'], df_SF['Neighborhood']
):
    label = folium.Popup(neighborhood, parse_html=True)
    marker = folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False,
    )
    marker.add_to(map_SF)

# Last expression of the cell: renders the map as the cell output.
map_SF

## And we're finished with the base dataset. From here, I can use Foursquare to get venue data.

In [112]:
# Show the cleaned neighborhood table as the cell's output.
df_SF

Unnamed: 0,Neighborhood,Latitude,Longitude
0,Alamo Square,37.77636,-122.4347
1,Anza Vista,37.780836,-122.443149
3,Balboa Hollow,37.798794,-122.436098
5,The Bayview,37.728889,-122.3925
6,Belden Place,37.791744,-122.403886
7,Bernal Heights,37.742986,-122.415804
8,Buena Vista,37.806532,-122.420648
10,The Castro,37.760856,-122.434957
11,Cathedral Hill,37.791821,-122.413497
12,Cayuga Terrace,37.730297,-122.432929
