# Capstone Project notebook

This notebook is used for the capstone project

In [2]:
import numpy as np
import pandas as pd

## Problem Description

The goal of the project is to find a neighborhood in Raleigh, NC, that is similar to the Murraywood neighborhood in Columbia, SC. Murraywood Ct was the address of the author's previous residence, and he wants to find a similar neighborhood in North Carolina.

# Tools
## Convert neighborhoods to coordinates

In [29]:
from geopy.geocoders import Nominatim

import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe


In [26]:
inputAddress = 'Anderson Heights, Raleigh, NC'

# Nominatim's usage policy asks every client to identify itself; geopy warns
# about (and newer releases reject) the default user agent.
geolocator = Nominatim(user_agent='capstone-neighborhood-notebook')
location = geolocator.geocode(inputAddress)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(inputAddress))
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)



35.8178634 -78.6377824196471


In [31]:
# Pull API keys from the file: the first non-blank line is the client id,
# the second one the client secret.
with open('foursquare_keys.txt','r') as f:
    # Ignore blank lines (e.g. a trailing newline at the end of the file) so
    # the two-value unpacking does not fail on them.
    CLIENT_ID, CLIENT_SECRET = [line.strip() for line in f if line.strip()]

VERSION = '20180605' # Foursquare API version

In [32]:
# Define an auxiliary function to pull the category
def get_category_type(row):
    """Return the name of the first category attached to a venue row.

    Parameters
    ----------
    row : mapping-like (for example a dict or a pandas Series)
        Holds the category list under ``'categories'`` or, failing that,
        under ``'venue.categories'``.

    Returns
    -------
    The ``'name'`` entry of the first category, or ``None`` when the
    category list is empty.

    Raises
    ------
    KeyError
        If ``row`` has neither key.
    """
    try:
        categories_list = row['categories']
    except KeyError:
        # Only a missing key should trigger the fallback; a bare ``except``
        # would also swallow KeyboardInterrupt and unrelated errors.
        categories_list = row['venue.categories']

    if len(categories_list) == 0:
        return None
    return categories_list[0]['name']

In [47]:
# Pull up to LIMIT venues within `radius` meters (1600 m, roughly one mile)
# of the current `latitude` / `longitude`.
# NOTE: this cell uses whatever coordinates were set last, so re-run the
# geocoding cell for the neighborhood of interest first.
radius = 1*1600
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore'
params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'll': '{},{}'.format(latitude, longitude),
    'radius': radius,
    'limit': LIMIT,
}

# Let requests build and escape the query string, and fail loudly on an HTTP
# error instead of hitting a confusing KeyError on 'response' further down.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
results = response.json()
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# keep only the columns of interest; .copy() gives an independent frame for
# the column assignment below
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Jersey Mike's Subs,Sandwich Place,34.075935,-81.178343
1,Hungry Howie's Pizza,Pizza Place,34.071897,-81.175538
2,Chapala,Mexican Restaurant,34.07501,-81.17808
3,Loveland Coffee,Coffee Shop,34.075771,-81.177871
4,Subway,Sandwich Place,34.07115,-81.175505
5,Sun Ming,Chinese Restaurant,34.087824,-81.184906
6,Shear Indulgence,Cosmetics Shop,34.083716,-81.18272
7,Moe's Southwest Grill,Mexican Restaurant,34.089911,-81.172793
8,Jimmy John's,Sandwich Place,34.068211,-81.166314
9,Lizard's Thicket,Southern / Soul Food Restaurant,34.089574,-81.185936


## Murraywood neighborhood

In [36]:
inputAddress = 'Murraywood, Irmo, SC'

# Nominatim's usage policy asks every client to identify itself; geopy warns
# about (and newer releases reject) the default user agent.
geolocator = Nominatim(user_agent='capstone-neighborhood-notebook')
location = geolocator.geocode(inputAddress)
# geocode() returns None when nothing matches; fail with a clear message
# instead of an AttributeError on the attribute access below.
if location is None:
    raise ValueError('Could not geocode address: {}'.format(inputAddress))
latitude = location.latitude
longitude = location.longitude
print(latitude, longitude)

  app.launch_new_instance()


34.0767364 -81.1789118


In [45]:
# Pull up to LIMIT venues within `radius` meters (1600 m, roughly one mile)
# of the current `latitude` / `longitude`.
radius = 1*1600
LIMIT = 100
url = 'https://api.foursquare.com/v2/venues/explore'
params = {
    'client_id': CLIENT_ID,
    'client_secret': CLIENT_SECRET,
    'v': VERSION,
    'll': '{},{}'.format(latitude, longitude),
    'radius': radius,
    'limit': LIMIT,
}

# Let requests build and escape the query string, and fail loudly on an HTTP
# error instead of hitting a confusing KeyError on 'response' further down.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()
results = response.json()
venues = results['response']['groups'][0]['items']

nearby_venues = json_normalize(venues) # flatten JSON

# keep only the columns of interest; .copy() gives an independent frame for
# the column assignment below
filtered_columns = ['venue.name', 'venue.categories', 'venue.location.lat', 'venue.location.lng']
nearby_venues = nearby_venues.loc[:, filtered_columns].copy()

# replace each row's category list with the name of its first category
nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)

# clean columns: keep only the last dotted component ('venue.location.lat' -> 'lat')
nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

nearby_venues

Unnamed: 0,name,categories,lat,lng
0,Jersey Mike's Subs,Sandwich Place,34.075935,-81.178343
1,Hungry Howie's Pizza,Pizza Place,34.071897,-81.175538
2,Chapala,Mexican Restaurant,34.07501,-81.17808
3,Loveland Coffee,Coffee Shop,34.075771,-81.177871
4,Subway,Sandwich Place,34.07115,-81.175505
5,Sun Ming,Chinese Restaurant,34.087824,-81.184906
6,Shear Indulgence,Cosmetics Shop,34.083716,-81.18272
7,Moe's Southwest Grill,Mexican Restaurant,34.089911,-81.172793
8,Jimmy John's,Sandwich Place,34.068211,-81.166314
9,Lizard's Thicket,Southern / Soul Food Restaurant,34.089574,-81.185936


## Scrape neighborhoods data

In [49]:
import requests
from bs4 import BeautifulSoup
from pandas.io.json import json_normalize

In [68]:

url = "http://www.city-data.com/neighborhood/Murraywood-Columbia-SC.html"
page = requests.get(url, timeout=30)
# Stop here on an HTTP error instead of parsing an error page as if it were
# the neighborhood profile.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')


In [69]:
# Take the first element with CSS class "row"; the cells below read the
# neighborhood statistics from inside it.
row_data = soup.find(class_='row')
row_data

<div class="row"><div class="col-xs-12">
<div class="row"><div class="col-md-7"><br/><a href="/nbmaps/neigh-Columbia-South-Carolina.html">Columbia, South Carolina Neighborhood Map</a><br/><br/></div><div class="col-md-5"><script type="text/javascript"><!--function bse(enable){document.getElementById("bssubmit").disabled=enable;}--></script><div class="search_box_bs"><b>Business Search</b> <span id="bsinfomsg">- <b>14 Million</b> verified businesses</span><br><form action="http://www.city-data.com/bs/" method="get" onchange="bse(false);" onsubmit="bse(true);">Search for: <input class="bs_input" name="q" onchange="bse(false);" value=""/>near: <input class="bs_input" name="w" onchange="bse(false);" value="Columbia, SC"/> <input class="li sp-finds" id="bssubmit" name="bssubmit" onclick="urchinTracker('/forum_searchbusiness.html');" src="http://pics3.city-data.com/trn.gif" type="image" value="Find It"/></form></br></div></div></div><div class="content-item"><br/><b>Area:</b> 7.865 <b>square

In [70]:
# Narrow down to the first "content-item" element inside row_data; its
# content is displayed below for inspection.
item = row_data.find(class_='content-item')
item

<div class="content-item"><br/><b>Area:</b> 7.865 <b>square miles</b><br/><br/>
<b>Population:</b> 6,110<br/><br/>
<b>Population density:</b><br/><div class="hgraph"><table><tr><td><b>Murraywood (Murray Woods):</b></td><td><p class="h" style="padding-left:108px;"></p>777 <b>people per square mile</b></td></tr><tr><td><b>Columbia:</b></td><td><p class="a" style="padding-left:150px;"></p>1,070 <b>people per square mile</b></td></tr></table></div>
</div>

In [218]:
# Collect "label: value" pairs from the summary block: each <b> tag is a
# label and the text node that immediately follows it is the value.
d = {}
for i in item.find_all('b'):
    value = i.next_sibling
    # Labels followed by another tag (or by nothing at all) carry no inline
    # text value; skip them explicitly instead of hiding every error behind
    # a bare `except: pass`.
    if isinstance(value, str):
        d[i.text.strip()] = value.strip()
d

{'Area:': '7.865', 'Population:': '6,110'}

In [219]:
def pull_value(hg, item_pos):
    """Return the text of the second cell in the first table row of ``hg[item_pos]``.

    ``hg`` is a sequence of parsed elements, each of which must contain a
    ``<table>``; ``item_pos`` selects the element to read.
    """
    table = hg[item_pos].find('table')
    first_row = table.find_all('tr')[0]
    value_cell = first_row.find_all('td')[1]
    return value_cell.text

# `hg` has to exist before pull_value can be called. Build it here so the
# cell works on a fresh kernel instead of relying on a cell that only
# appears further down in the notebook.
hg = row_data.find_all(class_='hgraph')[1:]
pull_value(hg, 0)

'$81,120'

In [279]:
# Collect every "hgraph" chart under row_data, dropping the first one
# (presumably the population-density chart already visible in `item` -- TODO confirm).
hg = row_data.find_all(class_ = 'hgraph')[1:]
# Spot check: position 2 is the entry that the `keys` list labels 'males'.
pull_value(hg,2)

'2,822'

In [220]:
# Labels for the statistics; they are paired by position with the charts in `hg`.
keys = [
    'med household income',
    'med rent',
    'males',
    'med age males',
    'med age females',
    'avg household size',
    'pct family household',
    'pct married couple',
    'pct families with children',
    'pct single mother',
    'pct never married males > 15',
    'pct never married females > 15',
    'pct not speak English well',
    'pct born in state',
    'pct born in another us state',
    'pct native residents born outside us',
    'pct foreign born residents',
    'avg number of cars houses',
    'avg number of cars apts',
    'pct units mortgage',
]
# One value per label: derive the count from `keys` instead of hard-coding 20
# so the two sequences cannot drift apart.
values = [pull_value(hg, pos) for pos in range(len(keys))]
dict(zip(keys,values))

{'avg household size': '7.8 people',
 'avg number of cars apts': '1.8',
 'avg number of cars houses': '1.9',
 'males': '2,822',
 'med age females': '37.8 years',
 'med age males': '40.4 years',
 'med household income': '$81,120',
 'med rent': '$679',
 'pct born in another us state': '37.4%',
 'pct born in state': '56.6%',
 'pct families with children': '30.7%',
 'pct family household': '52.0%',
 'pct foreign born residents': '3.4%',
 'pct married couple': '46.5%',
 'pct native residents born outside us': '2.6%',
 'pct never married females > 15': '14.8%',
 'pct never married males > 15': '15.4%',
 'pct not speak English well': '2.0%',
 'pct single mother': '12.1%',
 'pct units mortgage': '74.3%'}

In [222]:
# Combine the summary fields (area is in sq. miles) with the per-chart
# statistics; a chart statistic overwrites a summary field with the same label.
z = dict(d)
for stat_name, stat_value in zip(keys, values):
    z[stat_name] = stat_value
z

{'Area:': '7.865',
 'Population:': '6,110',
 'avg household size': '7.8 people',
 'avg number of cars apts': '1.8',
 'avg number of cars houses': '1.9',
 'males': '2,822',
 'med age females': '37.8 years',
 'med age males': '40.4 years',
 'med household income': '$81,120',
 'med rent': '$679',
 'pct born in another us state': '37.4%',
 'pct born in state': '56.6%',
 'pct families with children': '30.7%',
 'pct family household': '52.0%',
 'pct foreign born residents': '3.4%',
 'pct married couple': '46.5%',
 'pct native residents born outside us': '2.6%',
 'pct never married females > 15': '14.8%',
 'pct never married males > 15': '15.4%',
 'pct not speak English well': '2.0%',
 'pct single mother': '12.1%',
 'pct units mortgage': '74.3%'}

In [280]:
# Labels for the scraped statistics (same list as in the earlier cell).
keys = [
    'med household income',
    'med rent',
    'males',
    'med age males',
    'med age females',
    'avg household size',
    'pct family household',
    'pct married couple',
    'pct families with children',
    'pct single mother',
    'pct never married males > 15',
    'pct never married females > 15',
    'pct not speak English well',
    'pct born in state',
    'pct born in another us state',
    'pct native residents born outside us',
    'pct foreign born residents',
    'avg number of cars houses',
    'avg number of cars apts',
    'pct units mortgage',
]

def pull_neigh_data(url):
    """Scrape one city-data.com neighborhood page into a flat dict.

    Parameters
    ----------
    url : str
        Address of the neighborhood page.

    Returns
    -------
    dict
        The "label: value" pairs found in the page's ``content-item`` block
        (e.g. ``'Area:'``, ``'Population:'``) plus one entry per name in the
        module-level ``keys`` list, read from the page's ``hgraph`` charts.
        All values are the raw strings found on the page.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page lacks the expected ``row`` / ``content-item`` elements.
    IndexError
        If the page has fewer charts than there are names in ``keys``.

    Notes
    -----
    Charts are matched to ``keys`` purely by position, so a page that omits
    or adds a chart yields values under the wrong label.
    NOTE(review): the scraped results contain rows with a percentage under
    'avg number of cars apts' and a dollar amount under 'pct units mortgage',
    which looks like exactly this; matching on chart captions would be more
    reliable.
    """
    page = requests.get(url, timeout=30)
    # Do not parse an error page as if it were a neighborhood profile.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    row_data = soup.find(class_='row')
    item = row_data.find(class_='content-item') if row_data is not None else None
    if item is None:
        raise ValueError('Unexpected page layout: %s' % url)

    z = {}
    for label in item.find_all('b'):
        value = label.next_sibling
        # Only a text node directly after the label is a value; labels that
        # are followed by a tag or by nothing are skipped.
        if isinstance(value, str):
            z[label.text.strip()] = value.strip()

    hg = row_data.find_all(class_ = 'hgraph')[1:]
    # One value per label, so the count follows `keys` rather than a literal 20.
    values = [pull_value(hg, pos) for pos in range(len(keys))]
    z.update(zip(keys, values))
    return z

pull_neigh_data("http://www.city-data.com/neighborhood/Murraywood-Columbia-SC.html")

{'Area:': '7.865',
 'Population:': '6,110',
 'avg household size': '7.8 people',
 'avg number of cars apts': '1.8',
 'avg number of cars houses': '1.9',
 'males': '2,822',
 'med age females': '37.8 years',
 'med age males': '40.4 years',
 'med household income': '$81,120',
 'med rent': '$679',
 'pct born in another us state': '37.4%',
 'pct born in state': '56.6%',
 'pct families with children': '30.7%',
 'pct family household': '52.0%',
 'pct foreign born residents': '3.4%',
 'pct married couple': '46.5%',
 'pct native residents born outside us': '2.6%',
 'pct never married females > 15': '14.8%',
 'pct never married males > 15': '15.4%',
 'pct not speak English well': '2.0%',
 'pct single mother': '12.1%',
 'pct units mortgage': '74.3%'}

In [289]:
# Run the same scrape on a Raleigh neighborhood page to check the function on a second page.
pull_neigh_data('http://www.city-data.com/neighborhood/Anderson-Heights-Raleigh-NC.html')

{'Area:': '0.370',
 'Population:': '964',
 'avg household size': '2.8 people',
 'avg number of cars apts': '1.9',
 'avg number of cars houses': '2.3',
 'males': '485',
 'med age females': '39.2 years',
 'med age males': '45.1 years',
 'med household income': '$161,702',
 'med rent': '$1,125',
 'pct born in another us state': '32.7%',
 'pct born in state': '62.0%',
 'pct families with children': '32.5%',
 'pct family household': '66.2%',
 'pct foreign born residents': '3.2%',
 'pct married couple': '77.0%',
 'pct native residents born outside us': '2.2%',
 'pct never married females > 15': '8.5%',
 'pct never married males > 15': '6.8%',
 'pct not speak English well': '0.0%',
 'pct single mother': '1.8%',
 'pct units mortgage': '72.3%'}

## Scrape NC neighborhoods

In [228]:
page_no = 1
url = "http://www.city-data.com/indexes/neighborhoods/NC/%d/"%page_no

page = requests.get(url, timeout=30)
# Fail on an HTTP error rather than searching an error page for <li> links.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'html.parser')

In [244]:
# Every <li> element on the index page; entry 100 is displayed as a sample
# to inspect the markup.
li = soup.find_all('li')
li[100]

<li><a href="/neighborhood/Anderson-Heights-Raleigh-NC.html">Anderson Heights neighborhood in Raleigh, NC</a></li>

In [251]:
url_prefix = 'http://www.city-data.com'

def get_neigh_url(li_item):
    """Turn a list item holding a link into a ``(label, absolute_url)`` pair.

    The label is the item's text; the URL is ``url_prefix`` followed by the
    ``href`` of the first ``<a>`` inside the item.
    """
    anchor = li_item.find('a')
    absolute_url = url_prefix + anchor.get('href')
    label = li_item.text
    return (label, absolute_url)
    
# Sanity check on the sample list item displayed above.
print(get_neigh_url(li[100]))


('Anderson Heights neighborhood in Raleigh, NC', 'http://www.city-data.com/neighborhood/Anderson-Heights-Raleigh-NC.html')


In [None]:
def scrape_page(page_no):
    """Collect the Raleigh neighborhoods listed on one NC index page.

    Parameters
    ----------
    page_no : int
        Number of the index page, inserted into the index URL.

    Returns
    -------
    list of tuple
        One ``(label, url)`` pair, as produced by ``get_neigh_url``, for every
        ``<li>`` whose text contains ``'Raleigh, NC'`` and that holds a link.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    url = "http://www.city-data.com/indexes/neighborhoods/NC/%d/"%page_no
    page = requests.get(url, timeout=30)
    # Do not scan an error page for links.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')
    return [
        get_neigh_url(li_item)
        for li_item in soup.find_all('li')
        # Items without an <a> are skipped: get_neigh_url needs one to read
        # the href from and would otherwise abort the whole page.
        if 'Raleigh, NC' in li_item.text and li_item.find('a') is not None
    ]

scrape_page(2)

In [259]:
from random import randint
from time import sleep

# NOTE(review): pages 1..50 are crawled, although the progress message used
# to say "of 51" -- confirm how many index pages actually exist.
N_PAGES = 50

urls_dict = []  # despite the name, a list of (neighborhood label, url) tuples

for page_no in range(1, N_PAGES + 1):
    # Use a dedicated name rather than `d`, which other cells use for a dict.
    page_urls = scrape_page(page_no)
    urls_dict.extend(page_urls)
    print("Processed page %d of %d"%(page_no, N_PAGES))
    if page_no < N_PAGES:
        # Announce the pause before it starts, then space the requests out
        # with a random delay; no pause is needed after the final page.
        print("waiting...")
        sleep(randint(3,20))

print(urls_dict)

Processed page 1 of 51
waiting...
Processed page 2 of 51
waiting...
Processed page 3 of 51
waiting...
Processed page 4 of 51
waiting...
Processed page 5 of 51
waiting...
Processed page 6 of 51
waiting...
Processed page 7 of 51
waiting...
Processed page 8 of 51
waiting...
Processed page 9 of 51
waiting...
Processed page 10 of 51
waiting...
Processed page 11 of 51
waiting...
Processed page 12 of 51
waiting...
Processed page 13 of 51
waiting...
Processed page 14 of 51
waiting...
Processed page 15 of 51
waiting...
Processed page 16 of 51
waiting...
Processed page 17 of 51
waiting...
Processed page 18 of 51
waiting...
Processed page 19 of 51
waiting...
Processed page 20 of 51
waiting...
Processed page 21 of 51
waiting...
Processed page 22 of 51
waiting...
Processed page 23 of 51
waiting...
Processed page 24 of 51
waiting...
Processed page 25 of 51
waiting...
Processed page 26 of 51
waiting...
Processed page 27 of 51
waiting...
Processed page 28 of 51
waiting...
Processed page 29 of 51
waiti

In [261]:
len(urls_dict)

1499

In [264]:
import pickle
# Write the collected (label, url) list to disk -- presumably so the slow
# crawl does not have to be repeated in a later session.
with open('urls_dict.pickle','wb') as f:
    pickle.dump(urls_dict, f)

## Pull Neighborhoods Data

In [292]:
# Peek at the URL part of one collected (label, url) entry.
urls_dict[100][1]

'http://www.city-data.com/neighborhood/Bellewood-Farms-Raleigh-NC.html'

In [296]:
# Scrape every collected neighborhood page; pages that cannot be fetched or
# parsed are reported together with the reason and then skipped.
l = []
count = 0
for name, url in urls_dict:
    print(url)
    try:
        d = pull_neigh_data(url)
    except Exception as exc:
        # `except Exception` rather than a bare `except`, so interrupting the
        # kernel still stops this long crawl; include why the page failed.
        print("%s unable to pull data (%s: %s)"%(name, type(exc).__name__, exc))
    else:
        d['neighborhood'] = name
        l.append(d)
        count += 1
        print("%s processed (%d), now waiting..."%(name,count))
    # Random pause between requests to avoid hammering the server.
    sleep(randint(3,20))

# Show a small sample instead of dumping every scraped record.
l[:3]

http://www.city-data.com/neighborhood/Abbington-Ridge-Raleigh-NC.html
Abbington Ridge neighborhood in Raleigh, NC unable to pull data
http://www.city-data.com/neighborhood/Adams-Mountain-Raleigh-NC.html
Adams Mountain neighborhood in Raleigh, NC unable to pull data
http://www.city-data.com/neighborhood/Addison-Reserve-Raleigh-NC.html
Addison Reserve neighborhood in Raleigh, NC unable to pull data
http://www.city-data.com/neighborhood/Alanda-Raleigh-NC.html
Alanda neighborhood in Raleigh, NC processed (1), now waiting...
http://www.city-data.com/neighborhood/Alexander-Place-Townhomes-Raleigh-NC.html
Alexander Place Townhomes neighborhood in Raleigh, NC processed (2), now waiting...
http://www.city-data.com/neighborhood/Alexandria-Square-Raleigh-NC.html
Alexandria Square neighborhood in Raleigh, NC processed (3), now waiting...
http://www.city-data.com/neighborhood/Alpha-Raleigh-NC.html
Alpha neighborhood in Raleigh, NC processed (4), now waiting...
http://www.city-data.com/neighborhood/

[{'Area:': '0.019',
  'Population:': '4',
  'avg household size': '21.0 people',
  'avg number of cars apts': '1.7',
  'avg number of cars houses': '1.7',
  'males': '2',
  'med age females': '36.7 years',
  'med age males': '30.7 years',
  'med household income': '$65,332',
  'med rent': '$853',
  'neighborhood': 'Alanda neighborhood in Raleigh, NC',
  'pct born in another us state': '48.6%',
  'pct born in state': '38.8%',
  'pct families with children': '48.3%',
  'pct family household': '37.7%',
  'pct foreign born residents': '11.4%',
  'pct married couple': '30.3%',
  'pct native residents born outside us': '1.1%',
  'pct never married females > 15': '12.6%',
  'pct never married males > 15': '39.9%',
  'pct not speak English well': '0.0%',
  'pct single mother': '8.5%',
  'pct units mortgage': '96.9%'},
 {'Area:': '0.089',
  'Population:': '12',
  'avg household size': '2.2 people',
  'avg number of cars apts': '88.6%',
  'avg number of cars houses': '1.9',
  'males': '6',
  'me

In [297]:
# pandas is already imported as `pd` at the top of the notebook; this
# re-import is redundant but harmless.
import pandas as pd
# One row per scraped neighborhood dict, one column per dict key.
df = pd.DataFrame(l)
df

Unnamed: 0,Area:,Population:,avg household size,avg number of cars apts,avg number of cars houses,males,med age females,med age males,med household income,med rent,...,pct families with children,pct family household,pct foreign born residents,pct married couple,pct native residents born outside us,pct never married females > 15,pct never married males > 15,pct not speak English well,pct single mother,pct units mortgage
0,0.019,4,21.0 people,1.7,1.7,2,36.7 years,30.7 years,"$65,332",$853,...,48.3%,37.7%,11.4%,30.3%,1.1%,12.6%,39.9%,0.0%,8.5%,96.9%
1,0.089,12,2.2 people,88.6%,1.9,6,37.9 years,38.0 years,"$77,721","$1,159",...,39.2%,50.2%,1.9,56.5%,7.4%,10.0%,12.6%,52.6%,7.9%,"$199,712"
2,0.025,55,2.1 people,1.2,2.2,28,59.9 years,57.1 years,"$99,318",$872,...,47.0%,34.8%,5.2%,72.5%,0.9%,3.2%,5.5%,0.0%,14.2%,73.4%
3,0.005,29,2.4 people,1.7,1.7,10,24.0 years,33.5 years,"$45,978",$677,...,7.9%,60.8%,7.7%,34.3%,0.3%,41.0%,17.5%,0.0%,33.8%,87.7%
4,0.091,55,2.3 people,1.7,1.8,26,40.8 years,36.0 years,"$53,929",$895,...,25.0%,37.7%,10.9%,34.1%,1.3%,24.9%,22.6%,0.0%,19.1%,86.6%
5,0.026,71,2.5 people,1.3,2.1,33,32.6 years,30.9 years,"$34,719",$693,...,23.9%,60.2%,25.0%,40.0%,1.3%,17.8%,17.6%,4.7%,11.2%,84.7%
6,0.027,33,2.3 people,1.6,2.2,15,41.3 years,42.4 years,"$76,395","$1,454",...,35.0%,41.9%,4.6%,63.0%,0.7%,12.8%,14.3%,0.0%,5.9%,57.3%
7,0.370,964,2.8 people,1.9,2.3,485,39.2 years,45.1 years,"$161,702","$1,125",...,32.5%,66.2%,3.2%,77.0%,2.2%,8.5%,6.8%,0.0%,1.8%,72.3%
8,0.116,209,3.1 people,1.0,1.6,112,39.2 years,27.4 years,"$28,530",$736,...,15.0%,32.1%,7.6%,19.6%,0.5%,24.4%,25.2%,6.6%,13.2%,79.2%
9,0.043,15,2.3 people,1.1,1.6,6,37.8 years,36.2 years,"$54,200",$795,...,26.0%,59.3%,21.5%,51.8%,2.0%,19.9%,9.2%,0.0%,19.1%,81.9%


In [299]:
# Persist the raw scraped table; values are still the strings taken from the
# pages (e.g. '$65,332', '48.3%', '2.2 people').
df.to_csv('raleigh_neighs.csv')

## Data Preparation

In [301]:
df.describe()

Unnamed: 0,Area:,Population:,avg household size,avg number of cars apts,avg number of cars houses,males,med age females,med age males,med household income,med rent,...,pct families with children,pct family household,pct foreign born residents,pct married couple,pct native residents born outside us,pct never married females > 15,pct never married males > 15,pct not speak English well,pct single mother,pct units mortgage
count,661.0,664,668,668.0,668.0,668,668,668,668,668,...,668,668,668,668,668,668,668,668,668,668
unique,190.0,224,64,48.0,24.0,169,169,164,205,178,...,296,154,85,160,38,140,140,102,234,158
top,0.002,7,2.6 people,1.2,1.7,3,40.8 years,41.7 years,"$53,929",$853,...,15.0%,37.7%,5.5%,34.1%,0.5%,12.6%,9.2%,0.0%,0.8%,86.6%
freq,31.0,19,86,75.0,113.0,46,18,20,20,30,...,8,38,32,31,68,29,26,461,14,25


In [302]:
df.columns

Index(['Area:', 'Population:', 'avg household size', 'avg number of cars apts',
       'avg number of cars houses', 'males', 'med age females',
       'med age males', 'med household income', 'med rent', 'neighborhood',
       'pct born in another us state', 'pct born in state',
       'pct families with children', 'pct family household',
       'pct foreign born residents', 'pct married couple',
       'pct native residents born outside us',
       'pct never married females > 15', 'pct never married males > 15',
       'pct not speak English well', 'pct single mother',
       'pct units mortgage'],
      dtype='object')