# Housing Inventory Import

This notebook imports the SF Housing Inventory data from the SF open data portal. The Housing Inventory is an alternative source of development activity data to the SF Development Pipeline, which I used for my CP 255 final project.

In [110]:
#import pandas
import pandas as pd
import numpy as np
import re as re
import os
import requests  # library for accessing content from web URLs
import pprint  # library for making Python data structures readable
pp = pprint.PrettyPrinter()
import json    # library for working with JSON-formatted text strings
import time
from geopy.geocoders import GoogleV3 #google geocoding
pd.set_option('display.max_row', 1000) # show up to 1000 rows when a dataframe is displayed
pd.set_option('display.max_columns', 50) # show up to 50 columns when a dataframe is displayed

# Import Data

In [111]:
# Project directory, needed for the one file (2015) that is not available through the API.
# Set the SF_DEV_PROJECT_ROOT environment variable to run the notebook on another machine;
# when it is unset, the original Dropbox location is used.
root = os.environ.get('SF_DEV_PROJECT_ROOT',
                      '/Users/briangoggin/Dropbox/CP 255/SF Development Project')
raw = root+'/Raw Data/'

# data.sfgov.org JSON endpoints, one per year imported through the API
d2014 = 'https://data.sfgov.org/resource/b8d6-zthg.json'
d2013 = 'https://data.sfgov.org/resource/sjse-8gyy.json'
d2012 = 'https://data.sfgov.org/resource/a64c-96a5.json'
d2011 = 'https://data.sfgov.org/resource/pwiv-ej3p.json'

In [112]:
# Import the 2015 data from a local Excel workbook (the other years come from the API).
# `sheet_name` is the supported keyword; the older `sheetname` spelling was deprecated
# and later removed from pandas, where it now raises a TypeError.
df2015 = pd.read_excel(raw+'Housing Inventory/2015_datasf.xlsx', sheet_name='2015Completes')

In [113]:
# Map the 2015 workbook's column names onto the common column names used for every year.
column_map = {
    'Address': 'address',
    'BLOCK': 'block',
    'LOT': 'lot',
    'ACTION': 'status',
    'ACTDATE': 'latest_date',
    'UNITS': 'units',
    'NETUNITS': 'net_units',
    'AFFHSG': 'affordable_units',
    'ZONING': 'zone',
}

varlist = ['address', 'block', 'lot', 'status', 'latest_date', 'units',
           'net_units', 'affordable_units', 'zone', 'year']

# rename() ignores keys that are no longer present, so this cell can be re-run safely;
# copying columns one by one raised a KeyError on a second run because the source
# columns had already been dropped by the subsetting step.
df2015 = df2015.rename(columns=column_map).assign(year='2015')[varlist]
df2015.head()

Unnamed: 0,address,block,lot,status,latest_date,units,net_units,affordable_units,zone,year
0,1400 MISSION ST,3507,42,COMPLETE,2015-12-22,190.0,190.0,167.0,C-3-G,2015
1,255 BROADWAY *,165,21,COMPLETE,2015-02-12,75.0,75.0,74.0,C-2,2015
2,1100 OCEAN AV,3180,1,COMPLETE,2015-02-23,71.0,71.0,70.0,OCEAN AVE NCT,2015
3,280 BEALE ST,3738,4,TCO ISSUED,2015-06-23,70.0,70.0,69.0,TB DTR,2015
4,100 VAN NESS AV,814,20,PRE-FINAL,2015-05-05,399.0,399.0,48.0,C-3-G,2015


In [114]:
#define function for importing other data

def importdata(year, yearvalue, field1, field2, field3, field4, field5, field6, field7, field8, field9):
    '''
    Request the JSON API endpoint `year` and return a dataframe with the desired columns.

    Parameters
    ----------
    year : str
        URL of the API endpoint to request.
    yearvalue :
        Value written to the 'year' column of every returned row.
    field1 ... field9 : str
        The desired fields as they are named at the API endpoint. In order, they
        supply the 'block', 'lot', 'address', 'status', 'latest_date', 'units',
        'net_units', 'affordable_units' and 'zone' columns.

    Returns
    -------
    pandas.DataFrame
        One row per record returned by the endpoint. A record that lacks one of
        the requested fields gets NaN in the corresponding column.

    Raises
    ------
    requests.HTTPError
        If the endpoint answers with an error status code.
    '''
    response = requests.get(year, timeout=60)
    # Fail with a clear HTTP error instead of a confusing failure while parsing
    # an error payload further down.
    response.raise_for_status()
    data = json.loads(response.text) #data is a list of records (dicts) at this point

    # output column name -> field name at this endpoint
    column_fields = [
        ('block', field1),
        ('lot', field2),
        ('address', field3),
        ('status', field4),
        ('latest_date', field5),
        ('units', field6),
        ('net_units', field7),
        ('affordable_units', field8),
        ('zone', field9),
    ]

    # Not every record carries every field, so fall back to NaN for missing ones.
    d = {}
    for column, field in column_fields:
        d[column] = [item.get(field, np.nan) for item in data]

    df = pd.DataFrame(d)
    df['year'] = yearvalue

    return df

In [115]:
# Import 2011-2014 from the API and time the downloads.
# The field names differ between endpoints, so each call spells out that year's names.
start = time.time()
df2014 = importdata(
    d2014, '2014',
    'block', 'lot', 'stdadd', 'action', 'actdate',
    'units', 'netunits', 'affhsg', 'zoning',
)
df2013 = importdata(
    d2013, '2013',
    'block', 'lot', 'standardaddress', 'change_type', 'actdate',
    'units', 'netunits', 'aff_hsg', 'zoning',
)
df2012 = importdata(
    d2012, '2012',
    'block', 'lot', 'standardaddress', 'change_type', 'actdate',
    'units', 'netunits', 'aff_hsg', 'zoning',
)
df2011 = importdata(
    d2011, '2011',
    'block', 'lot', 'standardad', 'change_type', 'actdate',
    'units', 'netunits', 'affordable', 'zoning',
)
end = time.time()
print(end - start)

1.6651020050048828


In [116]:
# Stack all years into one dataframe, 2015 first.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported equivalent. As before, each year keeps its own row labels, so labels
# repeat across years in full_df.
years = [df2014, df2013, df2012, df2011]
full_df = pd.concat([df2015] + years)

In [117]:
# Preview the first five rows of the combined dataframe
full_df.head(5)

Unnamed: 0,address,affordable_units,block,latest_date,lot,net_units,status,units,year,zone
0,1400 MISSION ST,167,3507,2015-12-22 00:00:00,42,190,COMPLETE,190,2015,C-3-G
1,255 BROADWAY *,74,165,2015-02-12 00:00:00,21,75,COMPLETE,75,2015,C-2
2,1100 OCEAN AV,70,3180,2015-02-23 00:00:00,1,71,COMPLETE,71,2015,OCEAN AVE NCT
3,280 BEALE ST,69,3738,2015-06-23 00:00:00,4,70,TCO ISSUED,70,2015,TB DTR
4,100 VAN NESS AV,48,814,2015-05-05 00:00:00,20,399,PRE-FINAL,399,2015,C-3-G


In [118]:
# Only a few rows in 2015 have no address; get rid of these.
# .copy() makes full_df an independent dataframe rather than a slice of the stacked
# one, so the column assignments in later cells do not trigger SettingWithCopyWarning.
full_df = full_df[full_df['address'].notnull()].copy()

In [119]:
# Fill out address field prior to geocoding by adding the city and state to each row's
# own address. The right-hand side must be full_df['address']: using df2014['address']
# would replace every year's addresses with 2014 addresses matched up by row label.
full_df['address'] = full_df['address'] + ", San Francisco, CA"

# Geocoding

In [120]:
# set the pause duration between api requests, in seconds (geocode passes it to time.sleep)
pause = 0.1

In [121]:
# function that accepts an address string, sends it to the Google API, and returns the lat-long API result
def geocode(address):
    '''
    Geocode a single address with the Google Geocoding API.

    Sleeps for `pause` seconds before each request so the server is not hammered.
    If the GOOGLE_MAPS_API_KEY environment variable is set, its value is sent as
    the `key` request parameter.

    Parameters
    ----------
    address : str
        The address to look up.

    Returns
    -------
    str or None
        'latitude,longitude' of the first result, or None when the address is
        missing or the response contains no results.
    '''
    if pd.isnull(address):
        return None #nothing to look up; avoids sending the literal text "nan"

    time.sleep(pause) #pause for some duration before each request, to not hammer their server
    url = 'https://maps.googleapis.com/maps/api/geocode/json'

    # Passing the address through `params` lets requests URL-encode it, so spaces and
    # characters such as '#', '&' or '*' cannot corrupt the query string.
    params = {'address': address}
    api_key = os.environ.get('GOOGLE_MAPS_API_KEY')
    if api_key:
        params['key'] = api_key

    response = requests.get(url, params=params) #send the request to the server and get the response
    data = response.json() #convert the response json string into a dict

    results = data.get('results') or []
    if results: #if google was able to geolocate our address, extract lat-long from result
        location = results[0]['geometry']['location']
        return '{},{}'.format(location['lat'], location['lng']) #return lat-long as a string in the format google likes
    return None

In [122]:
# Geocode every distinct address once (rows that repeat an address share one API
# request), map the results back onto each row, and save them as new df columns.
start = time.time()
unique_addresses = full_df['address'].dropna().unique()
latlng_by_address = {address: geocode(address) for address in unique_addresses}
full_df['latlng'] = full_df['address'].map(latlng_by_address)

# 'latlng' is a "lat,lng" string; split it into its two parts
latlng_parts = full_df['latlng'].str.split(',')
full_df['lat'] = latlng_parts.str[0]
full_df['lon'] = latlng_parts.str[1]
end = time.time()
print(end - start)

503.7457609176636


In [None]:
# Convert the lat/lon strings to floats (no rounding is applied).
# pd.to_numeric replaces a helper that was named `round`: that helper only called
# float() and shadowed the builtin round() for the rest of the notebook.
full_df['lat'] = pd.to_numeric(full_df['lat'])
full_df['lon'] = pd.to_numeric(full_df['lon'])

# Export Data

In [151]:
#write function to export point data
def df_to_geojson(df, properties, lat='latitude', lon='longitude'):
    '''
    Convert a dataframe of points into a GeoJSON FeatureCollection dict.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per point.
    properties : list of str
        Columns to copy into each feature's 'properties'.
    lat, lon : str
        Names of the latitude and longitude columns.

    Returns
    -------
    dict
        {'type': 'FeatureCollection', 'features': [...]} with one feature per row.
        Coordinates are written as floats in [lon, lat] order. A row without a
        latitude or longitude gets a null geometry, and missing property values
        are written as None, so the result serializes to valid JSON.
    '''
    def is_missing(value):
        # None, or a float NaN (NaN is the only float that differs from itself)
        return value is None or (isinstance(value, float) and value != value)

    # create a new python dict to contain our geojson data, using geojson format
    geojson = {'type':'FeatureCollection', 'features':[]}

    # loop through each row in the dataframe and convert each row to geojson format
    for _, row in df.iterrows():
        if is_missing(row[lat]) or is_missing(row[lon]):
            # GeoJSON positions must be numbers; an unlocated feature has a null geometry
            geometry = None
        else:
            # float() also converts coordinates that are still stored as strings
            geometry = {'type':'Point',
                        'coordinates':[float(row[lon]), float(row[lat])]}

        # for each column, get the value and add it as a new feature property
        feature_properties = {}
        for prop in properties:
            value = row[prop]
            feature_properties[prop] = None if is_missing(value) else value

        # add this feature (aka, converted dataframe row) to the list of features inside our dict
        geojson['features'].append({'type':'Feature',
                                    'properties':feature_properties,
                                    'geometry':geometry})

    return geojson

In [152]:
# 'latest_date' is left out of the exported properties because it does not work with
# json output; it is not needed for now anyway.
varlist = [
    'address', 'block', 'lot', 'status', 'units',
    'net_units', 'affordable_units', 'zone', 'year',
]

geojson = df_to_geojson(full_df, properties=varlist, lat='lat', lon='lon')

In [153]:
# write the geojson result to a .js file, as an assignment to a `dataset` variable
output_path = root+'/Code/Maps/Housing Inventory'
output_filename = output_path+'/data.js'
with open(output_filename, 'w') as output_file:
    geojson_text = json.dumps(geojson, indent=4)
    output_file.write('var dataset = {};'.format(geojson_text))