# Analysis from Brian Goggin (uses newly-produced data as of September 2017)

In [None]:
import pandas as pd
import logging
import dateutil
from dateutil import parser
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point
%matplotlib inline
pd.set_option('display.max_columns', 500)

In [None]:
#import dataset of project times from "data" folder
# Both paths are relative, so they resolve against the notebook's working directory.
# The file name suggests one record per project across all quarters.
file = "../../../data/cleaned/all_quarters__one_record_per_project.csv"
# Prefix that later cells prepend to figure file names when calling plt.savefig.
output="../../Output/"
df = pd.read_csv(file)

# Initial Data Cleaning

In [None]:
#First, filter out those projects that are exclusively non-residential (defined as those without units)
df = df[df['units'] > 0]

In [None]:
#Next, keep only those that reached completion at some point over the time period
df=df[pd.notnull(df['comp_date'])]

In [None]:
df.shape

In [None]:
# Keep only projects with a usable 'firstfiled' value:
# first drop missing values, then drop empty strings.
df=df[pd.notnull(df['firstfiled'])]
df=df[df['firstfiled'] !='']

In [None]:
df.shape

In [None]:
#drop duplicates for now, until manual cleaning is done
# keep=False flags every row of a repeated 'dbi_permit', so all copies are removed
# rather than one representative being kept.
# NOTE(review): pandas treats missing values as equal when flagging duplicates, so if
# more than one row has no 'dbi_permit' they would all be dropped here — confirm
# whether that is intended.
df= df[~ df.duplicated('dbi_permit', keep=False)]

In [None]:
df.shape

In [None]:
df['project_time_years']=df['project_duration_days']/365

In [None]:
# Top 5 longest projects
df.sort_values('project_time_years', ascending = False)[0:5]

In [None]:
# Top 5 shortest projects
df.sort_values('project_time_years', ascending = True)[0:5]

In [None]:
#Manual Data Cleaning for some fishy values based on top 5 shortest list above
#df.loc[(df['address']=='55 05TH ST') & (new_df['apn']=='3705039'), 'units'] = 8 #looked up on PIM. Simple adding 8 units.
#df.loc[(df['address']=='545 POST ST') & (new_df['apn']=='0306022'), 'units'] = 4 #looked up on PIM. Simple adding 4 units
#df.loc[(df['address']=='555 POST ST') & (new_df['apn']=='0306020'), 'units'] = 17 #looked up on PIM. Simple adding 4 units
#df.loc[(df['address']=='515 JOHN MUIR DR') & (new_df['apn']=='7282005'), 'units'] = 2 #looked up on PIM. Change of use to residential. 2 additional units

#Based on the above cases, 'units' seems to be the unit count at the site at the end of
#the project rather than the unit count of the project itself. Because of this, I adjust
#unit counts that are larger than the net units added down to the net units added. If
#'unitsnet' is missing (or is zero or negative), the unit count is accepted as-is for now.

def unit_change(value):
    """
    Return the unit count to use for one project row.

    `value` is a row (anything indexable by 'units' and 'unitsnet'). The net
    figure replaces the reported count only when it is present, strictly
    positive and strictly smaller than 'units'; in every other case the
    reported 'units' value is returned unchanged.
    """
    reported_units = value['units']
    net_units = value['unitsnet']

    # No net figure available: trust the reported count.
    if pd.isnull(net_units):
        return reported_units

    # A positive net figure below the reported count is taken as the project's size.
    if 0 < net_units < reported_units:
        return net_units

    # Zero, negative, or not smaller than the reported count: keep 'units'.
    return reported_units

df['units']=df.apply(unit_change, axis=1)
 

In [None]:
#Create Big Projects Dataframe
# "Big" means at least 10 units after the unit-count adjustment above.
# df_big is rebuilt after the neighborhood join further down, so this version
# only feeds the summary statistics on the next line.
df_big=df[df['units']>=10]
df_big['project_time_years'].describe()

In [None]:
df['project_time_years'].describe()

# Merge in Geographic Information and Make Big Dataframe

In [None]:
#first, convert points to geodataframe
# Point geometries are built from the 'x' and 'y' columns and declared to be EPSG:4326.
# NOTE(review): assumes x is longitude and y is latitude — confirm against the source data.
crs = {'init' :'epsg:4326'}
geometry = [Point(xy) for xy in zip(df.x, df.y)]
devs = GeoDataFrame(df, crs=crs, geometry=geometry)
# Target CRS is the same EPSG:4326 that was just assigned above.
devs = devs.to_crs({'init': 'epsg:4326'}) 

In [None]:
#import neighborhoods
neighborhoods = gpd.read_file('../../../data/gis/41_neighborhoods/41_neighborhoods.shp')

In [None]:
#convert boundaries to geographic coordinate system to conform to points
neighborhoods = neighborhoods.to_crs({'init': 'epsg:4326'}) 

In [None]:
#Spatial join between points and neighborhood boundaries. 'how' is set to 'inner', so only
#developments that fall within a neighborhood polygon are kept (a 'left' join would preserve all developments).
# The result replaces df, so every later cell works on the joined frame.
df = gpd.sjoin(devs, neighborhoods, how = 'inner', op='within')
df.shape

In [None]:
#Create Big Projects Dataframe
# "Big" means at least 10 units. Take an explicit copy: a 'unitcat' column is assigned
# onto df_big later in the notebook, and assigning to a filtered slice of df triggers
# pandas' SettingWithCopyWarning.
df_big=df[df['units']>=10].copy()
df_big['project_time_years'].describe()

In [None]:
# Top 5 shortest projects of big projects
df_big.sort_values('project_time_years', ascending = True)[0:5]

# Zoning Exploration

In [None]:
df['zoning_simplified'].value_counts()

import re

# Zoning district codes grouped into broad classes.
conv_res = ['RH-1(D)', 'RH-1', 'RH-1(S)', 'RH-2', 'RH-3']
apt_res=['RM-1', 'RM-2', 'RM-3', 'RM-4']
mixed_res=['RC-3', 'RC-4', 'RED', 'RTO']
nhb_comm=['NC-1', 'NC-2', 'NC-3', 'NC-S', 'NCD', 'NCT'] #NCD and NCT encompass all types of NCD, NCTs
comm=['C-2', 'C-3', 'C-M']
indus=['M-1', 'M-2']
other=[]

# (class label, codes) in precedence order: when a zoning string matches several
# classes, the one listed LAST wins.
zone_groups = [
    ('Conventional Residential', conv_res),
    ('Apartment Residential', apt_res),
    ('Mixed Residential', mixed_res),
    ('Nhb Commercial', nhb_comm),
    ('Standard Commercial', comm),
    ('Industrial', indus),
]


def _zone_pattern(codes):
    """Compile a regex matching any of `codes` that is not preceded by a letter or digit."""
    alternatives = '|'.join(re.escape(code) for code in codes)
    # The look-behind keeps a code from matching inside a longer one: without it
    # 'M-1' matches 'RM-1' and 'C-3' matches 'NC-3' / 'RC-3'. Text after the code
    # is left unconstrained so 'C-3' still matches 'C-3-O' and 'NCT' matches 'NCT-3'.
    return re.compile(r'(?<![A-Za-z0-9])(?:' + alternatives + ')')


zone_patterns = [(label, _zone_pattern(codes)) for label, codes in zone_groups]


def zoning_simple(value):
    """
    Map a row's 'zoning' string to a broad zoning class.

    `value` is a row (anything indexable by 'zoning'). Returns one of
    'Conventional Residential', 'Apartment Residential', 'Mixed Residential',
    'Nhb Commercial', 'Standard Commercial', 'Industrial', or 'Other' when no
    known code is found or the zoning value is missing.
    """
    zoning = value['zoning']
    # A missing zoning value has no string methods; classify it as 'Other'.
    if pd.isnull(zoning):
        return 'Other'

    zone_class = 'Other'
    for label, pattern in zone_patterns:
        if pattern.search(zoning):
            zone_class = label
    return zone_class

df['zone_class']=df.apply(zoning_simple, axis=1)


In [None]:
df['zone_class'].value_counts()

In [None]:
def bar_graph(var_x,var_y, title, x_label, y_label, df=df, sortedvar=False, graph_options=('red', (8, 6))):
    """
    Draw a bar chart of the median of `var_y` for each distinct value of `var_x`.

    Parameters:
        var_x: name of the column whose distinct values become the bars.
        var_y: name of the numeric column whose per-value median sets the bar height.
        title, x_label, y_label: text for the plot title and the two axis labels.
        df: frame to plot. The default is evaluated once, when this `def` runs, so
            it is the object the global `df` referred to at that moment; pass a
            frame explicitly to plot anything else.
        sortedvar: if truthy, order the bars by ascending median; otherwise keep
            the order returned by `value_counts(sort=False)`.
        graph_options: two-item sequence `(bar color, figure size)`.

    Returns nothing; the figure is shown with `plt.show()`.
    """
    bar_color = graph_options[0]
    figure_size = graph_options[1]

    # Compute each value's median once and reuse it for both sorting and bar heights.
    value_names = df[var_x].value_counts(sort=False).index
    median_by_value = {name: df[df[var_x]==name][var_y].median() for name in value_names}
    if sortedvar:
        value_names = sorted(value_names, key=median_by_value.get)

    categories = [name for name in value_names]
    medians = [median_by_value[name] for name in categories]
    y_pos = np.arange(len(categories))

    fig, ax = plt.subplots(figsize=figure_size)
    plt.title(title, fontsize=20)
    ax.bar(y_pos, medians, align='center', alpha=0.5, color = bar_color)
    ax.set_xticks(y_pos)
    ax.set_xticklabels(categories, rotation=45)
    ax.set_xlabel(x_label, fontsize=20)
    ax.set_ylabel(y_label, fontsize=20)
    # Right-align the rotated labels so each one ends under its own bar.
    for tick in ax.xaxis.get_majorticklabels():
        tick.set_horizontalalignment("right")
    plt.tick_params(axis='both', which='major', labelsize=14)
    plt.show()


In [None]:
bar_graph('zone_class','project_time_years', 'Time to Completion by Zoning Category','Zoning Class', 'Median Development Time (Years)')

# Project Times by Years

In [None]:
df['year_start']=df['first_date'].str[0:4].astype(int)

In [None]:
# Median project duration for each start year, in value_counts(sort=False) order.
# NOTE(review): year_medians is not read by any other cell shown here; the
# bar_graph call below computes its own medians.
year_medians=[]
for i in df['year_start'].value_counts(sort=False).index:
    median = df[df['year_start']==i]['project_time_years'].median()
    year_medians.append(median)

In [None]:
y_pos = np.arange(len(df['year_start'].value_counts(sort=False).index))

In [None]:
categories=[]
for cat in df['year_start'].value_counts(sort=False).index:
    categories.append(cat)

In [None]:
bar_graph('year_start','project_time_years', 'Time to Completion by Year Started','Year Started', 'Median Development Time (Years)')

# Figure 1. Histograms

In [None]:
# Figure 1: distribution of total development time across all projects in df.
plt.figure(1, figsize=(8,6), )
plt.suptitle('Figure 1. SF Housing Development Times', fontsize=20)
plt.xlabel('Development Time (Years)', fontsize = 20)
plt.ylabel('Number of Developments', fontsize = 20)
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
# plt.hist returns a (counts, bin_edges, patches) tuple, so `ax` is not an Axes object here.
ax = plt.hist(df['project_time_years'], bins=25, alpha=.6, color='g')
# Written under the `output` prefix; note the file name is spelled 'hitogram.png'.
plt.savefig(output+'hitogram.png')
plt.show()

In [None]:
# Same histogram restricted to df_big (projects with at least 10 units).
# This figure is only displayed; there is no plt.savefig call in this cell.
plt.figure(1, figsize=(8,6), )
plt.suptitle('SF Housing Development Times: Big Projects', fontsize=20)
plt.xlabel('Development Time (Years)', fontsize = 20)
plt.ylabel('Number of Developments', fontsize = 20)
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
# plt.hist returns a (counts, bin_edges, patches) tuple, so `ax` is not an Axes object here.
ax = plt.hist(df_big['project_time_years'], bins=25, alpha=.6, color='g')
plt.show()

# Figure 2. Years Per Unit Histogram

In [None]:
# Project review time, normalized by amount of units
#create unit-year category so that we normalize by units when comparing geographies
df['years_per_unit']=df['project_time_years']/df['units']

In [None]:
df['years_per_unit'].describe()

In [None]:
df.sort_values('years_per_unit', ascending = False)[0:5]

In [None]:
# Figure 2: distribution of development time divided by units ('years_per_unit').
plt.figure(1, figsize=(8,6), )
plt.suptitle('Figure 2. SF Housing Development Times', fontsize=20)
plt.xlabel('Years per Unit Added', fontsize=20)
plt.ylabel('Number of Developments', fontsize=20)
plt.xticks(fontsize = 14)
plt.yticks(fontsize = 14)
# plt.hist returns a (counts, bin_edges, patches) tuple, so `ax` is not an Axes object here.
ax = plt.hist(df['years_per_unit'], bins=25, alpha=.6, color='g')
# Written under the `output` prefix; note the file name is spelled 'hitogram_yearunits.png'.
plt.savefig(output+'hitogram_yearunits.png')
plt.show()

In [None]:
#Create dataset units added by years for each unit
# One row per housing unit: each project contributes int(units) rows carrying that
# project's duration, so statistics on 'time' are weighted by units, not by projects.
# (The accumulator is deliberately not called `list`, which would shadow the builtin
# for every cell that runs afterwards.)
unit_times = [
    project_time
    for unit_count, project_time in zip(df['units'], df['project_time_years'])
    for _ in range(int(unit_count))
]
# 'id' is a 1-based running identifier across all unit rows.
df_units = pd.DataFrame({'id': range(1, len(unit_times) + 1), 'time': unit_times})

# Histogram of df_units['time']: each bar counts unit rows rather than projects.
plt.figure(1, figsize=(10,10), )
plt.suptitle('SF Housing Development Times', fontsize=20)
plt.xlabel('Review Time (Years)', fontsize = 20)
plt.ylabel('Number of Units Added', fontsize = 20)
# plt.hist returns a (counts, bin_edges, patches) tuple, so `ax` is not an Axes object here.
ax = plt.hist(df_units['time'], bins=25, alpha=.6, color='g')
# Written under the `output` prefix; note the file name is spelled 'hitogram_units.png'.
plt.savefig(output+'hitogram_units.png')
plt.show()

In [None]:
df_units['time'].describe()

# Figure 4. Scatter Plots

In [None]:
# Figure 4: one point per project, units on the x axis and development time on the y axis.
plt.figure(1, figsize=(10,10))
plt.scatter(df['units'], df['project_time_years'], s=100)
plt.suptitle('Figure 4. Development Time by Units Added', fontsize=20)
plt.ylabel('Development Time (Years)', fontsize = 18)
plt.xlabel('Number of Units Added', fontsize = 18)
plt.xticks(fontsize=14)
plt.yticks(fontsize = 14)
# Fixed axis ranges: projects with more than 800 units or more than 25 years
# fall outside the visible area.
plt.xlim([0,800])
plt.ylim([0,25])
plt.savefig(output+'scatter.png')

In [None]:
# Same scatter restricted to df_big (projects with at least 10 units).
# This figure is only displayed; there is no plt.savefig call in this cell.
plt.figure(1, figsize=(10,10))
plt.scatter(df_big['units'], df_big['project_time_years'], s=100)
plt.suptitle('Development Time by Units Added: Big Projects', fontsize=20)
plt.ylabel('Development Time (Years)', fontsize = 18)
plt.xlabel('Number of Units Added', fontsize = 18)
plt.xticks(fontsize=14)
plt.yticks(fontsize = 14)
# Fixed axis ranges: projects with more than 800 units or more than 25 years
# fall outside the visible area.
plt.xlim([0,800])
plt.ylim([0,25])

# Figure 5. Plot by unit category

In [None]:
# Ascending bin boundaries for project size. Each value is the exclusive upper
# bound of one bin and the inclusive lower bound of the next.
unit_cutoffs=[10, 50, 100, 200]
def unit_categories(value):
    """
    Return the size-category label for one project row.

    `value` is a row (anything indexable by 'units'). With the cutoffs
    [10, 50, 100, 200] the labels are '0-10', '10-50', '50-100', '100-200'
    and '>200'; a count equal to a cutoff belongs to the bin that starts at
    that cutoff. Every boundary, including the last, is read from
    `unit_cutoffs`. Returns None when the unit count is missing.
    """
    units = value['units']
    # A missing count compares False against every cutoff, so handle it explicitly.
    if pd.isnull(units):
        return None

    lower = 0
    for upper in unit_cutoffs:
        if units < upper:
            return str(lower)+'-'+str(upper)
        lower = upper
    # At or above the last cutoff.
    return '>'+str(unit_cutoffs[-1])
    
df['unitcat']=df.apply(unit_categories, axis=1)
df_big['unitcat']=df_big.apply(unit_categories, axis=1)

In [None]:
df['unitcat'].value_counts()

In [None]:
# Median project duration for each size category, in a fixed display order.
categories=['0-10', '10-50', '50-100', '100-200', '>200']
cat_medians=[df.loc[df['unitcat']==cat, 'project_time_years'].median() for cat in categories]
# One bar position per label in `categories`. Sizing this from the categories that
# happen to occur in df would break plt.bar whenever a category has no projects,
# because the positions and the medians would then differ in length.
y_pos = np.arange(len(categories))

plt.figure(1, figsize=(8,6), )
plt.suptitle('Figure 5. Development Time by Size Category', fontsize=20)
plt.ylabel('Median Development Time (Years)', fontsize=20)
plt.xlabel('Units Added', fontsize=20)
plt.bar(y_pos, cat_medians, align='center', alpha=0.5)
plt.xticks(y_pos, categories, fontsize=14)
plt.yticks(fontsize = 14)
plt.savefig(output+'bar_chart_times.png')
plt.show()

# Figures 3 and 6. Breakdown time by stage of process

In [None]:
df = df[pd.notnull(df['BP_date'])]

In [None]:
df.shape

In [None]:
df = df[pd.notnull(df['con_date'])]

In [None]:
df.shape

In [None]:
def years_between(row, start_col, end_col):
    """
    Return the time from row[start_col] to row[end_col] in years.

    Both values are date strings parsed with dateutil. The whole-day difference
    is divided by 365.0, so the result is a float on any Python version.
    """
    start = dateutil.parser.parse(row[start_col])
    end = dateutil.parser.parse(row[end_col])
    return (end - start).days / 365.0

# Consecutive stages of each project, in years:
#   permit_time: first_date -> BP_date
#   bp_time:     BP_date    -> con_date
#   con_time:    con_date   -> comp_date
df['permit_time']=df.apply(lambda row: years_between(row, 'first_date', 'BP_date'), axis=1)
df['bp_time']=df.apply(lambda row: years_between(row, 'BP_date', 'con_date'), axis=1)
df['con_time']=df.apply(lambda row: years_between(row, 'con_date', 'comp_date'), axis=1)

In [None]:
df['bp_time'].describe()

In [None]:
df['con_time'].describe()

In [None]:
df['project_time_years'].describe()

In [None]:
# Figure 3: one histogram per development stage, laid out on a 2x2 grid
# (the fourth slot is left empty).
stage_histograms = (
    (221, "Entitlement Time", 'permit_time', 'b'),
    (222, "Construction Prep Time", 'bp_time', 'r'),
    (223, "Construction Time", 'con_time', 'g'),
)

plt.figure(1, figsize=(16,14), )
plt.suptitle("Figure 3. SF Development Times by Stage", fontsize=24)

for subplot_code, stage_title, stage_column, bar_color in stage_histograms:
    ax = plt.subplot(subplot_code)
    ax.set_title(stage_title, fontsize=18)
    ax.set_xlabel('Time in Years', fontsize=18)
    ax.set_ylabel('Number of Developments', fontsize=18)
    # Identical axis ranges on every panel so the three stages can be compared directly.
    plt.xlim([0,14])
    plt.ylim([0,80])
    plt.xticks(fontsize = 16)
    plt.yticks(fontsize = 16)
    ax.hist(df[stage_column], bins=50, color=bar_color)

plt.savefig(output+'big_hist.png')

In [None]:
# Median duration of each stage within each size category, in a fixed display order.
categories=['0-10', '10-50', '50-100', '100-200', '>200']
ent_medians=[df.loc[df['unitcat']==cat, 'permit_time'].median() for cat in categories]
bp_medians=[df.loc[df['unitcat']==cat, 'bp_time'].median() for cat in categories]
cons_medians=[df.loc[df['unitcat']==cat, 'con_time'].median() for cat in categories]
# One bar position per label in `categories`. Sizing this from the categories that
# happen to occur in df would break the bar charts whenever a category has no
# projects, because the positions and the medians would then differ in length.
y_pos = np.arange(len(categories))

In [None]:
# Figure 6: median duration of each stage by size category, one panel per stage
# on a 2x2 grid (the fourth slot is left empty).
stage_bars = (
    (221, 'Median Entitlement Time', ent_medians, 'b'),
    (222, 'Median Construction Prep Time', bp_medians, 'r'),
    (223, 'Median Construction  Time', cons_medians, 'g'),
)

plt.figure(1, figsize=(16,14), )
plt.suptitle("Figure 6. Development Stages by Size Category", fontsize=24)

for subplot_code, panel_title, panel_medians, bar_color in stage_bars:
    ax = plt.subplot(subplot_code)
    ax.set_title(panel_title, fontsize=18)
    ax.set_xlabel('Number of Units', fontsize=18)
    ax.set_ylabel('Years', fontsize=18)
    ax.set_xticks(y_pos)
    ax.set_xticklabels(categories)
    # Shared 0-4 year scale so the panels can be compared directly.
    ax.set_ylim([0,4])
    ax.bar(y_pos, panel_medians, align='center', color=bar_color)

plt.savefig(output+'big_bar.png')

# Figures 7 and 8. Create Neighborhood Graphs

In [None]:
#create separate geodataframe for just those with all dates
df_nb_full = df[pd.notnull(df['BP_date'])]

In [None]:
# for these graphs, drop if neighborhood has sample less than 10
# Count the dated projects per neighborhood once, then remove the small neighborhoods
# with a single filter instead of re-filtering the frame once per neighborhood.
nhood_sizes = df_nb_full.groupby('nhood')['BP_date'].count()
small_nhoods = nhood_sizes[nhood_sizes < 10].index
# .copy() makes df_nb_full an independent frame, so adding columns to it afterwards
# does not trigger pandas' SettingWithCopyWarning.
df_nb_full = df_nb_full[~df_nb_full['nhood'].isin(small_nhoods)].copy()

In [None]:
df_nb_full['ptime_unityears']=df_nb_full['permit_time']/df_nb_full['units']

In [None]:
graph_options=['b', (12, 8)]
bar_graph('nhood','years_per_unit', 'Figure 7. SF Development Times by Neighborhood', 'Neighborhood', 'Median Development Time (Years per Unit)', df_nb_full, True, graph_options)

In [None]:
graph_options=['r', (12, 8)]
bar_graph('nhood','ptime_unityears', 'Figure 8. SF Entitlement Times by Neighborhood', 'Neighborhood', 'Median Entitlement Time (Years per Unit)', df_nb_full, True, graph_options)

In [None]:
df_nb_full.shape