# Coursera - IBM Data Science Professional Certification - Week5


### Import Modules to run analysis

In [1]:
import requests
from bs4 import BeautifulSoup as bs

import json # library to handle JSON files

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

import geocoder

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors

from sklearn.cluster import KMeans

import folium # map rendering library

import warnings
warnings.filterwarnings("ignore")

## A description of the problem and a discussion of the background.

In San Diego, California, a client has hired your firm to help them identify in which neighbourhood they should open the 2nd location of their very successful French Restaurant.  

The client has indicated they are happy with the success they have found in the neighbourhood of Nork Park and that they would like to open their 2nd location in the neighbourhood that is the most similar to North Park.  There are over 100 neighbourhoods in San Diego, so choosing which neighbourhood is the most simliar won't be easy. 

## A description of the data

The following wikipage contains a list of all of over 100 neighbourhoods in San Diego, with links to wikipages for just about all of them.  

https://en.wikipedia.org/wiki/List_of_communities_and_neighborhoods_of_San_Diego

This list of neighborhoods will be used to create a table of latitude and longitude coordinates for each neighborhood either by scraping the coordinates from the respective wikipage about the neighborhood or by using the Nominatum package in python.  

These latitude and longidue coordinates will be used to obtain the Foursqaure API data about the venue types in each neighborhood.  This foursquare API data about the venues and venue categories awill be used to identify which neighborhoods are the most similar to North Park.

An example of how the data will be used is outlined below, by grabbing the relevant data for the neighborhood of North Park.

### Functions to complete the analysis

In [2]:
def getNearbyVenues(names, latitudes, longitudes, radius=500):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
        
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhoods', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category']
    
    return(nearby_venues)

In [3]:
# function to get the lat/long of each neighborhood
def get_lat_long(neighborhood):
    address = "{}, San Diego, CA".format(neighborhood.strip())
    geolocator = Nominatim()
    location = geolocator.geocode(address)
    try:
        latitude = location.latitude
        longitude = location.longitude
        return latitude, longitude
    except:
        return -1,-1



In [4]:
# function that extracts the category of the venue
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

### Foursquare credentials

In [5]:
CLIENT_ID = '42R2YOP4MTNEEGBLCFEBVLKE1S1EFC5AUWUIS45M3Q4Y00SK' # your Foursquare ID
CLIENT_SECRET = 'X4THSP32AF2EVUC4M4O2IP3KYLKSF2WHW5BNKSQM4GYBF2N1' # your Foursquare Secret
VERSION = '20180604'
#LIMIT = 30
print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: 42R2YOP4MTNEEGBLCFEBVLKE1S1EFC5AUWUIS45M3Q4Y00SK
CLIENT_SECRET:X4THSP32AF2EVUC4M4O2IP3KYLKSF2WHW5BNKSQM4GYBF2N1


### We are looking to find the neighborhood that is the most similar to North Park according to the venue information contained in the Foursquare API venue data.
1.  Start by getting the latitude and longitude of North Park to be used as input into the Foursquare API

In [6]:
neighborhood_latitude, neighborhood_longitude, = get_lat_long('North Park')

#### Gather the category information about each venue in North Park, within 500 meters of the neighborhoods lat/long

In [7]:
LIMIT = 100 # limit of number of venues returned by Foursquare API
radius = 1000 # define radius
# create URL
url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
    CLIENT_ID, 
    CLIENT_SECRET, 
    VERSION, 
    neighborhood_latitude, 
    neighborhood_longitude, 
    radius, 
    LIMIT)
url # display URL

'https://api.foursquare.com/v2/venues/explore?&client_id=42R2YOP4MTNEEGBLCFEBVLKE1S1EFC5AUWUIS45M3Q4Y00SK&client_secret=X4THSP32AF2EVUC4M4O2IP3KYLKSF2WHW5BNKSQM4GYBF2N1&v=20180604&ll=32.7408842,-117.1305877&radius=1000&limit=100'

### Convert the JSON results of the North Park venue category data to a DataFrame

In [8]:
results = requests.get(url).json()
#results.keys()
venues = results['response']['groups'][0]['items']
nearby_venues = json_normalize(venues) # flatten JSON
print(nearby_venues.shape)
nearby_venues.head()

(97, 28)


Unnamed: 0,reasons.count,reasons.items,referralId,venue.categories,venue.delivery.id,venue.delivery.provider.icon.name,venue.delivery.provider.icon.prefix,venue.delivery.provider.icon.sizes,venue.delivery.provider.name,venue.delivery.url,venue.id,venue.location.address,venue.location.cc,venue.location.city,venue.location.country,venue.location.crossStreet,venue.location.distance,venue.location.formattedAddress,venue.location.labeledLatLngs,venue.location.lat,venue.location.lng,venue.location.neighborhood,venue.location.postalCode,venue.location.state,venue.name,venue.photos.count,venue.photos.groups,venue.venuePage.id
0,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-56391bfacd105fac6c403c63-0,"[{'pluralName': 'Seafood Restaurants', 'id': '...",,,,,,,56391bfacd105fac6c403c63,3382 30th St,US,San Diego,United States,,117,"[3382 30th St, San Diego, CA 92104, United Sta...","[{'lat': 32.74087051063667, 'lng': -117.129334...",32.740871,-117.129334,,92104,CA,Pete's Seafood & Sandwich,0,[],
1,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-536add8e11d21c7853ef0883-1,"[{'pluralName': 'Breweries', 'id': '50327c8591...",,,,,,,536add8e11d21c7853ef0883,3000 Upas St,US,San Diego,United States,at 30th St,91,"[3000 Upas St (at 30th St), San Diego, CA 9210...","[{'lat': 32.741510883435105, 'lng': -117.12994...",32.741511,-117.129949,"North Park, San Diego, CA",92104,CA,Modern Times Flavordome,0,[],
2,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-577d9c47498e861ea1dbf937-2,"[{'pluralName': 'Burger Joints', 'id': '4bf58d...",,,,,,,577d9c47498e861ea1dbf937,3501 30th St,US,San Diego,United States,Myrtle Ave,165,"[3501 30th St (Myrtle Ave), San Diego, CA 9210...","[{'lat': 32.742278423256074, 'lng': -117.12995...",32.742278,-117.12996,,92104,CA,Dunedin North Park,0,[],
3,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-480262c5f964a520f14e1fe3-3,"[{'pluralName': 'Pizza Places', 'id': '4bf58dd...",,,,,,,480262c5f964a520f14e1fe3,3448 30th St,US,San Diego,United States,at Myrtle Ave,130,"[3448 30th St (at Myrtle Ave), San Diego, CA 9...","[{'lat': 32.742004958632215, 'lng': -117.13017...",32.742005,-117.130177,,92104,CA,Lefty's Chicago Pizzeria,0,[],
4,0,"[{'summary': 'This spot is popular', 'type': '...",e-0-53aee984498e7d85f525e67d-4,"[{'pluralName': 'Cafés', 'id': '4bf58dd8d48988...",,,,,,,53aee984498e7d85f525e67d,3000 Upas St Ste 103,US,San Diego,United States,30th St,83,"[3000 Upas St Ste 103 (30th St), San Diego, CA...","[{'lat': 32.74144910740029, 'lng': -117.129994...",32.741449,-117.129995,,92104,CA,Influx,0,[],


### Grab only the relevant data

In [9]:
# filter columns
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues =nearby_venues.loc[:, filtered_columns]
nearby_venues.shape

(97, 4)

In [10]:
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

In [11]:
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

### Take a look at the first few rows of cleaned up North Park venue data 

In [12]:
nearby_venues.head()

Unnamed: 0,name,categories,lat,lng
0,Pete's Seafood & Sandwich,Seafood Restaurant,32.740871,-117.129334
1,Modern Times Flavordome,Brewery,32.741511,-117.129949
2,Dunedin North Park,Burger Joint,32.742278,-117.12996
3,Lefty's Chicago Pizzeria,Pizza Place,32.742005,-117.130177
4,Influx,Café,32.741449,-117.129995


### Take a quick look at the category breakdown for North Park

In [13]:
nearby_venues['categories'].value_counts()[:10]

Coffee Shop            6
Brewery                5
Pizza Place            5
Café                   5
Breakfast Spot         4
Park                   4
Ice Cream Shop         3
Mexican Restaurant     3
American Restaurant    3
Liquor Store           3
Name: categories, dtype: int64

### One hot encode the the Venue Categories for all of the Neighborhoods

In [14]:
# one hot encoding
northpark_onehot = pd.get_dummies(nearby_venues[['categories']], prefix="", prefix_sep="")

# add neighborhood column to dataframe
northpark_onehot['Neighborhood'] = 'North Park'

# move neighborhood column to the first column
fixed_columns = [northpark_onehot.columns[-1]] + list(northpark_onehot.columns[:-1])
northpark_onehot = northpark_onehot[fixed_columns]

print(northpark_onehot.shape)
northpark_onehot.head()

(97, 59)


Unnamed: 0,Neighborhood,ATM,Accessories Store,American Restaurant,Art Gallery,BBQ Joint,Bakery,Bar,Beer Bar,Beer Store,Breakfast Spot,Brewery,Burger Joint,Café,Camera Store,Chinese Restaurant,Coffee Shop,Convenience Store,Disc Golf,Donut Shop,Farmers Market,Fast Food Restaurant,French Restaurant,Gastropub,Grocery Store,Gym / Fitness Center,Health & Beauty Service,Hobby Shop,Ice Cream Shop,Italian Restaurant,Liquor Store,Lounge,Massage Studio,Mexican Restaurant,Mobile Phone Shop,Music Venue,New American Restaurant,Noodle House,Park,Performing Arts Venue,Pet Store,Pharmacy,Pizza Place,Poke Place,Pool,Salon / Barbershop,Seafood Restaurant,Spa,Sushi Restaurant,Taco Place,Tennis Court,Thai Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store,Yoga Studio
0,North Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,North Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,North Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,North Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,North Park,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Get the mean score for each venue type

In [15]:
northpark_onehot.groupby('Neighborhood').mean()

Unnamed: 0_level_0,ATM,Accessories Store,American Restaurant,Art Gallery,BBQ Joint,Bakery,Bar,Beer Bar,Beer Store,Breakfast Spot,Brewery,Burger Joint,Café,Camera Store,Chinese Restaurant,Coffee Shop,Convenience Store,Disc Golf,Donut Shop,Farmers Market,Fast Food Restaurant,French Restaurant,Gastropub,Grocery Store,Gym / Fitness Center,Health & Beauty Service,Hobby Shop,Ice Cream Shop,Italian Restaurant,Liquor Store,Lounge,Massage Studio,Mexican Restaurant,Mobile Phone Shop,Music Venue,New American Restaurant,Noodle House,Park,Performing Arts Venue,Pet Store,Pharmacy,Pizza Place,Poke Place,Pool,Salon / Barbershop,Seafood Restaurant,Spa,Sushi Restaurant,Taco Place,Tennis Court,Thai Restaurant,Vegetarian / Vegan Restaurant,Video Store,Vietnamese Restaurant,Whisky Bar,Wine Bar,Women's Store,Yoga Studio
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1
North Park,0.010309,0.010309,0.030928,0.010309,0.010309,0.010309,0.020619,0.010309,0.010309,0.041237,0.051546,0.010309,0.051546,0.010309,0.010309,0.061856,0.010309,0.010309,0.010309,0.010309,0.010309,0.020619,0.010309,0.010309,0.020619,0.010309,0.010309,0.030928,0.010309,0.030928,0.010309,0.010309,0.030928,0.010309,0.010309,0.010309,0.010309,0.041237,0.010309,0.010309,0.010309,0.051546,0.010309,0.010309,0.010309,0.010309,0.010309,0.030928,0.020619,0.010309,0.020619,0.020619,0.010309,0.010309,0.010309,0.010309,0.010309,0.010309


### This process will be repeated  for all 100+ neighborhoods in San Diego
- the data will be aggregated into a single DataFrame and grouped by neighborhood using the mean value for each venue category in that neighborhood.
- The neighborhoods will then be clustered using Kmeans to locate those neighborhoods which are most similar to North Park.

### Scrape the list of Neighborhoods in San Diego from Wikipedia and store results in a list

In [16]:
url = 'https://en.wikipedia.org/wiki/List_of_communities_and_neighborhoods_of_San_Diego'
html = requests.get(url)
html.text[:100]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title'

In [17]:
soup = bs(html.content,'html.parser')
soup.text[:100]

'\n\n\n\nList of communities and neighborhoods of San Diego - Wikipedia\ndocument.documentElement.classNam'

In [18]:
table = soup.find('table')
table.text[:100]

'\n\nBalboa Park\nBankers Hill\nBarrio Logan\nBay Ho\nBay Park\nBirdland\nBlack Mountain Ranch\nBorder\nBurling'

In [19]:
san_diego_neighborhoods = [neighborhood for neighborhood in table.text.split('\n') if neighborhood != ""]
san_diego_neighborhoods[:10]
print("Total Neighborhoods for Analysis: {}.".format(len(san_diego_neighborhoods)))

Total Neighborhoods for Analysis: 127.


### Get the Latitude and Longitude for each of the neighborhoods

In [52]:
latitudes = []
longitudes = []

batch_size = [each for each in range(0,127,25)]
for i in range(len(batch_size)-1):
    start = batch_size[i]
    stop = batch_size[i+1]
    for neighborhood in san_diego_neighborhoods[start:stop]:
        print(neighborhood)
        coordinates = get_lat_long(neighborhood)
        print(coordinates)
        temp_lat = coordinates[0]
        latitudes.append(temp_lat)

        temp_long = coordinates[1]
        longitudes.append(temp_long)

Balboa Park
(32.73135675, -117.146526555704)
Bankers Hill
(32.7260727, -117.1612254)
Barrio Logan
(32.697552, -117.1419765)
Bay Ho
(-1, -1)
Bay Park
(32.781716, -117.2064242)
Birdland
(32.7882923, -117.1562231)
Black Mountain Ranch
(32.9841169, -117.1319318)
Border
(32.5563803, -117.096731755882)
Burlingame
(32.7337039, -117.12754)
Carmel Mountain Ranch
(32.9803929, -117.0783641)
Carmel Valley
(32.9457386, -117.2310168)
City Heights
(32.7497278, -117.101029)
Clairemont
(32.7972712, -117.1925348)
College Area
(40.3894387, -3.66774112631767)
Del Mar Heights
(32.9483783, -117.2608701)
Del Mar Mesa
(32.9414344, -117.1825348)
Downtown San Diego (Centre City)
(-1, -1)
Columbia
(10.42889065, -75.5463785239698)
Core
(32.7172767, -117.1596804)
Cortez Hill
(32.721465, -117.1598091)
East Village
(32.713731, -117.1536398)
Gaslamp Quarter
(32.7109444, -117.1597029)
Horton Plaza
(32.7139859, -117.1637894)
Little Italy
(32.7234579, -117.168272)
Marina
(32.7857377, -117.0416783)
East Elliott
(-1, -1)


In [53]:
for neighborhood in san_diego_neighborhoods[stop:]:
    print(neighborhood)
    coordinates = get_lat_long(neighborhood)
    print(coordinates)
    temp_lat = coordinates[0]
    latitudes.append(temp_lat)

    temp_long = coordinates[1]
    longitudes.append(temp_long)

Uptown
(32.7505901, -117.1691744)
Webster
(32.721534, -117.0937732)


### Create DataFrame containing Neighborhoods and their respective latitude and longitude

In [54]:
[len(each) for each in [latitudes,longitudes,san_diego_neighborhoods]]

[127, 127, 127]

In [55]:
san_diego = pd.DataFrame()
san_diego['Neighborhood'] = san_diego_neighborhoods
san_diego['Neighborhood_Latitude'] = latitudes
san_diego['Neighborhood_Longitude'] = longitudes

In [56]:
san_diego.shape

(127, 3)

In [57]:
san_diego.head()

Unnamed: 0,Neighborhood,Neighborhood_Latitude,Neighborhood_Longitude
0,Balboa Park,32.731357,-117.146527
1,Bankers Hill,32.726073,-117.161225
2,Barrio Logan,32.697552,-117.141976
3,Bay Ho,-1.0,-1.0
4,Bay Park,32.781716,-117.206424


### Save results to file for ease of access after restarting Kernel

In [114]:
san_diego.to_excel('san_diego_neighborhood_latitudes_and_longitudes.xlsx')

### Determine which neighborhoods have missing/incorrect Lat/Long

In [63]:
missing_lat_long = san_diego.Neighborhood.loc[san_diego.Neighborhood_Latitude == -1].values.tolist()
print("There are {} neighborhoods with missing Latitude and Longitude values".format(len(missing_lat_long)))
missing_lat_long

There are 14 neighborhoods with missing Latitude and Longitude values


['Bay Ho',
 'Downtown San Diego (Centre City)',
 'East Elliott',
 'Egger Highlands',
 'Village of La Jolla',
 'Lake Murray',
 'Marston Hills',
 'North City',
 'Ocean Crest',
 'Ocean View Hills',
 'Point Loma Heights',
 'Redwood Village',
 'Miramar Ranch North',
 ' Map of San Diego neighborhoods']

In [71]:
good_lats = san_diego[((san_diego.Neighborhood_Latitude > 31) & (san_diego.Neighborhood_Latitude < 34))].index.values.tolist()

In [87]:
bad_lats = [idx for idx in range(san_diego.shape[0]) if idx not in good_lats]
sd_bad_lats = san_diego.iloc[bad_lats]

In [88]:
good_longs = san_diego[((san_diego.Neighborhood_Longitude < -115) & (san_diego.Neighborhood_Longitude > -119))].index.values.tolist()
bad_longs = [idx for idx in range(san_diego.shape[0]) if idx not in good_longs]
sd_bad_longs = san_diego.iloc[bad_longs]

In [104]:
n_bad_lats = sd_bad_lats.Neighborhood.values.tolist()
n_bad_longs = sd_bad_longs.Neighborhood.values.tolist()
neighborhoods_missing_coordinates = list(set(n_bad_lats + n_bad_longs))
neighborhoods_missing_coordinates = sorted([each.strip() for each in neighborhoods_missing_coordintes])
print(neighborhoods_missing_coordinates)

['Bay Ho', 'College Area', 'Columbia', 'Downtown San Diego (Centre City)', 'East Elliott', 'Egger Highlands', 'El Cerrito', 'Lake Murray', 'Map of San Diego neighborhoods', 'Marston Hills', 'Memorial', 'Miramar Ranch North', 'North City', 'Ocean Crest', 'Ocean View Hills', 'Point Loma Heights', 'Redwood Village', 'Village of La Jolla']


### Remove entries from webscrape that are not actual neighborhoods
1. ie, Map of San Diego Neighborhoods: This is title of a map on the page that was scraped inadvertently.
2. Downtown San Diego (Centre City): This is a title representing a group of several actual neighborhoods which we will analyze individually.

In [168]:
neighborhoods_missing_coordinates = [each for each in neighborhoods_missing_coordinates if each not in ['Downtown San Diego (Centre City)','Map of San Diego neighborhoods']]
neighborhoods_missing_coordinates

['Bay Ho',
 'College Area',
 'Columbia',
 'East Elliott',
 'Egger Highlands',
 'El Cerrito',
 'Lake Murray',
 'Marston Hills',
 'Memorial',
 'Miramar Ranch North',
 'North City',
 'Ocean Crest',
 'Ocean View Hills',
 'Point Loma Heights',
 'Redwood Village',
 'Village of La Jolla']

### Take a quick look at how many neighborhoods are still missing coordinates

In [169]:
print("There are {} neighborhoods with missing Latitude and Longitude values.".format(len(neighborhoods_missing_coordinates)))

There are 16 neighborhoods with missing Latitude and Longitude values.


### Scrape the missing values from google search results
- use the requests and BeautifulSoup libraries to query and extract lat/long coordinates from google searches

In [190]:
temp_neighborhood = neighborhoods_missing_coordinates[-1]
temp_neighborhood

'Village of La Jolla'

In [191]:
query = "https://www.google.com/search?q=%22{}%22+latitude+and+longitude".format(temp_neighborhood)
query

'https://www.google.com/search?q=%22Village of La Jolla%22+latitude+and+longitude'

In [192]:
html = requests.get(query)
html

<Response [200]>

In [193]:
"Coordinates" in html.text

True

In [178]:
soup = bs(html.text, 'html.parser')

In [181]:
"Coordinates" in html.text

False

In [182]:
coordinates = soup.find_all("span", class_="mrH1y",text=True)
coordinates

[]

In [135]:
query = input("What would you like to search: ")
query = query.replace(" ","+")
query = "https://www.google.com/search?q=" + query
query

What would you like to search: bar


'https://www.google.com/search?q=bar'

In [142]:
test = "%22village+of+la+jolla%22+latitude+and+longitude"
test

'%22village+of+la+jolla%22+latitude+and+longitude'

In [175]:
query = "https://www.google.com/search?q=%22{}%22+latitude+and+longitude".format(temp_neighborhood)
query

'https://www.google.com/search?q=%22Bay Ho%22+latitude+and+longitude'

In [144]:
query = "https://www.google.com/search?q="+test
query

'https://www.google.com/search?q=%22village+of+la+jolla%22+latitude+and+longitude'

In [None]:
html = requests.get(query)

In [146]:
len(html.text)

46674

In [147]:
"Coordinates" in html.text

True

In [153]:
html.text.index("117.2706")

22694

In [159]:
start = 22500
end = start+250
html.text[start:end]

'="resultStats">About 1,940 results</div><div id="res"><div id="topstuff"></div><div id="search"><div id="ires"><ol><div class="g"><div class="g"><div id="Db7kif"><span class="mrH1y">32.8431° N, 117.2706° W</span><div class="PZ6wOb"> Village of La Jol'

In [150]:
soup = bs(html.text, 'html.parser')

In [None]:
<div class="Z0LcW">32.8431° N, 117.2706° W</div>

In [167]:
b = soup.find_all("span", class_="mrH1y",text=True)[0]
b.text

'32.8431° N, 117.2706° W'

In [None]:
r = requests.get(query)
html_doc = r.text

In [139]:
a = "https://www.google.com/search?client=firefox-b-1-ab&ei=kUn4W7HgNIji0gKEubTIBw&q=%22village+of+la+jolla%22+latitude+and+longitude&oq=%22village+of+la+jolla%22+latitude+and+longitude&gs_l=psy-ab.3..35i39.11603.13176..13400...0.0..0.125.337.3j1......0....1..gws-wiz.Ty07VvistWo"
a

'https://www.google.com/search?client=firefox-b-1-ab&ei=kUn4W7HgNIji0gKEubTIBw&q=%22village+of+la+jolla%22+latitude+and+longitude&oq=%22village+of+la+jolla%22+latitude+and+longitude&gs_l=psy-ab.3..35i39.11603.13176..13400...0.0..0.125.337.3j1......0....1..gws-wiz.Ty07VvistWo'

In [136]:
query = "https://www.google.com/search?q={}+latitude+and+longitude&ie=utf-8&oe=utf-8&client=firefox-b-1-ab".format(b)
query

'https://www.google.com/search?q=Village+of+La+Jolla+latitude+and+longitude&ie=utf-8&oe=utf-8&client=firefox-b-1-ab'

In [128]:
query = "https://www.google.com/search?q={}+latitude+and+longitude&ie=utf-8&oe=utf-8&client=firefox-b-1-ab".format(b)
html = requests.get(query)

In [129]:
soup = bs(html.content,'html.parser')

In [None]:
<div class="Z0LcW">32.8635° N, 117.2338° W</div>

In [133]:
"Z0LcW" in soup.text

False

In [125]:
soup.find_all("div", {"class": "Z0LcW"})

[]

In [None]:
coordinates = soup.find('table')
table.text[:100]

In [108]:
a = neighborhoods_missing_coordinates[-1]

In [111]:
b = "+".join(a.split())

In [None]:
neighborhood = "Village+of+La+Jolla

In [None]:
bad_longs = [idx for idx in range(san_diego.shape[0]) if idx not in good_longs]
san_diego.iloc[bad_logs]

In [None]:
incorr = san_diego.Neighborhood.loc[san_diego.Neighborhood_Latitude == -1].values.tolist()
print("There are {} neighborhoods with missing Latitude and Longitude values".format(len(missing_lat_long)))
missing_lat_long