# Housing Inventory Import

This notebook imports the SF Housing Inventory data from the SF open data portal. This is an alternative to development activity from the SF Development Pipeline, which I used for my CP 255 final project. 

In [37]:
#import pandas
import pandas as pd
import numpy as np
import re as re
import os
import requests  # library for accessing content from web URLs
import pprint  # library for making Python data structures readable
pp = pprint.PrettyPrinter()
import json    # library for working with JSON-formatted text strings
import time
from geopy.geocoders import GoogleV3 #google geocoding
pd.set_option('display.max_row', 1000) #sets max rows to be seen
pd.set_option('display.max_columns', 50) #Set max column width to 50

# Import Data

In [38]:
#set directory for one file not available through API
root = '/Users/briangoggin/Dropbox/CP 255/SF Development Project'
raw = root+'/Raw Data/'

d2014 = 'https://data.sfgov.org/resource/b8d6-zthg.json'
d2013 = 'https://data.sfgov.org/resource/sjse-8gyy.json'
d2012 = 'https://data.sfgov.org/resource/a64c-96a5.json'
d2011 = 'https://data.sfgov.org/resource/pwiv-ej3p.json'

In [39]:
#import via excel
df2015 = pd.read_excel(raw+'Housing Inventory/2015_datasf.xlsx', sheetname = '2015Completes')

In [40]:
df2015['address'] = df2015['Address']
df2015['address'] = df2015['address'] + ", San Francisco, CA" #add city and state to address field for consistency
df2015['block'] = df2015['BLOCK']
df2015['lot'] = df2015['LOT']
df2015['status'] = df2015['ACTION']
df2015['latest_date'] = df2015['ACTDATE']
df2015['units'] = df2015['UNITS']
df2015['net_units'] = df2015['NETUNITS']
df2015['affordable_units'] = df2015['AFFHSG']
df2015['zone'] = df2015['ZONING']
df2015['lat'] = df2015['Y']
df2015['lon'] = df2015['X']
df2015['year'] = '2015'

varlist = ['address', 'block', 'lot', 'status', 'latest_date', 'units', 
           'net_units', 'affordable_units', 'zone', 'year', 'lat' ,'lon']

df2015= df2015[varlist]
df2015.head()

Unnamed: 0,address,block,lot,status,latest_date,units,net_units,affordable_units,zone,year,lat,lon
0,"1400 MISSION ST, San Francisco, CA",3507,42,COMPLETE,2015-12-22,190.0,190.0,167.0,C-3-G,2015,37.775283,-122.41649
1,"255 BROADWAY *, San Francisco, CA",165,21,COMPLETE,2015-02-12,75.0,75.0,74.0,C-2,2015,37.798299,-122.401629
2,"1100 OCEAN AV, San Francisco, CA",3180,1,COMPLETE,2015-02-23,71.0,71.0,70.0,OCEAN AVE NCT,2015,37.725577,-122.454153
3,"280 BEALE ST, San Francisco, CA",3738,4,TCO ISSUED,2015-06-23,70.0,70.0,69.0,TB DTR,2015,37.788706,-122.393609
4,"100 VAN NESS AV, San Francisco, CA",814,20,PRE-FINAL,2015-05-05,399.0,399.0,48.0,C-3-G,2015,37.776721,-122.419177


In [41]:
#define function for importing other data, which can be called via SF open data portal API

def importdata(year, yearvalue, field1, field2, field3, field4, field5, field6, field7, field8, field9):
    '''
    This function calls the API endpoint year and returns a dataframe with the desired columns.
    field1=geogfield2 represent the desired fields as they are named at the API endpoint
    '''
    
    def includekey(field):
        '''
        This function takes a given field from the API endpoint and creates a list out of its entires.
        '''
        list = []
        for item in data: 
            if field in item.keys():
                list.append(item[field])
            else:
                list.append(np.nan)
        return list
    
    response = requests.get(year)
    results = response.text
    data = json.loads(results) #data is a list at this point
    
    #import fields
    d = {}
    d['block'] = includekey(field1)
    d['lot'] = includekey(field2)
    d['address'] = includekey(field3)
    d['status'] = includekey(field4)
    d['latest_date'] = includekey(field5)
    d['units'] = includekey(field6)
    d['net_units'] = includekey(field7)
    d['affordable_units'] = includekey(field8)
    d['zone'] = includekey(field9)

    df = pd.DataFrame.from_dict(d)
    df['year'] = yearvalue
    
    return df

In [42]:
#import data
start = time.time()
df2014 = importdata(d2014, '2014', 'block', 'lot', 'stdadd', 'action', 'actdate', 'units', 'netunits', 'affhsg', 'zoning')
df2013 = importdata(d2013, '2013', 'block', 'lot', 'standardaddress', 'change_type', 'actdate', 'units', 'netunits', 'aff_hsg', 'zoning')
df2012 = importdata(d2012, '2012', 'block', 'lot', 'standardaddress', 'change_type', 'actdate', 'units', 'netunits', 'aff_hsg', 'zoning')
df2011 = importdata(d2011, '2011', 'block', 'lot', 'standardad', 'change_type', 'actdate', 'units', 'netunits', 'affordable', 'zoning')
end = time.time()
print(end - start)

2.7346601486206055


In [43]:
#append data together
years = [df2013, df2012, df2011]
full_df = df2014.append(years)

In [44]:
full_df.head()

Unnamed: 0,address,affordable_units,block,latest_date,lot,net_units,status,units,zone,year
0,1000 MASON ST,0.0,223,27-Mar-14,8,3,COMPLETE,51,RM-4,2014
1,1000 POWELL ST,0.0,211,24-Jun-14,15,1,COMPLETE,48,RC-4,2014
2,101 DUBOCE AV,2.0,3533,07-Aug-14,1,2,COMPLETE,7,NCT-3,2014
3,1017 CAPITOL AV,0.0,6985,12-Sep-14,10,-1,CFC ISSUED,1,RH-1,2014
4,1028 WISCONSIN ST,0.0,4219,06-Nov-14,3,2,CFC ISSUED,2,RH-2,2014


In [45]:
# Fill out address field prior to geocoding
full_df['address'] = df2014['address'] + ", San Francisco, CA"

# Geocoding

In [46]:
# set the pause duration between api requests
pause = 0.1

In [47]:
# function that accepts an address string, sends it to the Google API, and returns the lat-long API result
def geocode(address):
    time.sleep(pause) #pause for some duration before each request, to not hammer their server
    url = 'http://maps.googleapis.com/maps/api/geocode/json?address={}&sensor=false' #api url with placeholders
    request = url.format(address) #fill in the placeholder with a variable
    response = requests.get(request) #send the request to the server and get the response
    data = response.json() #convert the response json string into a dict
    
    if len(data['results']) > 0: #if google was able to geolocate our address, extract lat-long from result
        latitude = data['results'][0]['geometry']['location']['lat']
        longitude = data['results'][0]['geometry']['location']['lng']
        status = data['status'] #return geocode status
        return '{},{},{}'.format(latitude, longitude, status) #return lat-long as a string in the format google likes

In [48]:
# for each value in the address column, geocode it, save results as new df column
start = time.time()
full_df['latlng'] = full_df['address'].map(geocode)
full_df['lat'] = full_df['latlng'].str.split(',').str[0]
full_df['lon'] = full_df['latlng'].str.split(',').str[1]
full_df['geocode_status'] = full_df['latlng'].str.split(',').str[2]
end = time.time()
print(end - start)

380.8613860607147


In [49]:
full_df['lon'] = full_df['lon'].astype(float)
full_df['lat'] = full_df['lat'].astype(float)
full_df['net_units'] = full_df['net_units'].astype(int)

# Merge in 2015

In [50]:
full_df = df2015.append(full_df)

In [51]:
full_df[full_df['address'].isnull()] #only a few empty rows in 2015 with no address. Get rid of these
full_df = full_df[full_df['address'].notnull()]

# Final Cleaning and Export

In [53]:
#export data
export_path = root+"/Code/Maps/Examples Under Construction/"
full_df.to_csv(export_path+"intermediate_HI.csv")

In [54]:
full_df[full_df['lat'].isnull()].count()

address             35
affordable_units    11
block               35
lat                  0
latest_date         25
latlng               0
lon                  0
lot                 31
net_units           35
status               0
units               35
year                35
zone                35
unitcat             35
dtype: int64