In [107]:
import pandas as pd
import logging
import dateutil
from dateutil import parser
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from geopandas import GeoDataFrame
from shapely.geometry import Point
%matplotlib inline
#let pandas display up to 500 columns so wide frames are not truncated when shown
pd.set_option('display.max_columns', 500)

In [108]:
#read the final cleaned projects table from the "data" folder
#'output' is a folder path for results; it is only defined here, not used in this cell
file = '../../../data/cleaned/final_cleaned_projects.csv'
output = '../../Output/Entire Sample/'
df = pd.read_csv(filepath_or_buffer=file)

# Merge in Geographic Information and Make Big Dataframe

In [109]:
#turn each project's (x, y) pair into a point and wrap the table as a GeoDataFrame in EPSG:4326
crs = dict(init='epsg:4326')
geometry = list(map(Point, zip(df.x, df.y)))
devs = GeoDataFrame(df, crs=crs, geometry=geometry)
#NOTE(review): devs was just created with this same CRS, so this reprojection targets the CRS it already has
devs = devs.to_crs(crs)

In [110]:
#load the area plan boundaries from the shapefile
area_plans_shp = '../../../data/gis/area_plans/area_plans.shp'
neighborhoods = gpd.read_file(area_plans_shp)

In [111]:
#reproject the boundaries to EPSG:4326 so they share a coordinate system with the project points
boundary_crs = {'init': 'epsg:4326'}
neighborhoods = neighborhoods.to_crs(boundary_crs)

In [112]:
#(rows, columns) of the project points before the spatial join, for comparison with the joined frame
devs.shape

(2475, 42)

In [113]:
#left spatial join of project points to boundary polygons: 'left' keeps every development,
#and a point that falls within several polygons yields one row per match
df = gpd.sjoin(left_df=devs, right_df=neighborhoods, op='within', how='left')
df.shape

(2769, 49)

In [114]:
#to use as much data as possible, no rows are dropped here: each duration is computed
#only where both of its dates are non-null, and is NaN otherwise
def _years_between(value, start_col, end_col):
    """Return the time from value[start_col] to value[end_col] in years, or NaN.

    Parameters
    ----------
    value : row-like object indexable by column name (e.g. the row that
        DataFrame.apply(..., axis=1) passes in).
    start_col, end_col : names of the columns holding the two dates as strings
        that dateutil can parse.

    Returns
    -------
    The whole number of elapsed days divided by 365 (leap days are not
    adjusted for), or np.nan when either date is null.
    """
    end, start = value[end_col], value[start_col]
    if pd.isnull(end) or pd.isnull(start):
        return np.nan
    elapsed = dateutil.parser.parse(end) - dateutil.parser.parse(start)
    return elapsed.days / 365


def permit_time(value):
    """Years from 'first_date' to 'BP_date' for one row; NaN if either is null."""
    return _years_between(value, 'first_date', 'BP_date')


def bp_time(value):
    """Years from 'BP_date' to 'con_date' for one row; NaN if either is null."""
    return _years_between(value, 'BP_date', 'con_date')


def con_time(value):
    """Years from 'con_date' to 'comp_date' for one row; NaN if either is null."""
    return _years_between(value, 'con_date', 'comp_date')
    
#add one duration column per stage, computed row by row with the functions defined above
for time_col, time_fn in [('permit_time', permit_time), ('bp_time', bp_time), ('con_time', con_time)]:
    df[time_col] = df.apply(time_fn, axis=1)

In [115]:
#build a per-project key (address + APN + first date) used to drop the duplicates
#that arise because of overlapping planning areas, then count the distinct keys
df['unique_key'] = df['address_x'].add(df['apn']).add(df['first_date'])
df['unique_key'].nunique(dropna=False)

2475

In [116]:
#drop the extra rows created when a project falls inside overlapping planning areas, keeping the first match
#the left spatial join repeats a project's original row index once per matching polygon, so duplicates are
#detected on the index rather than on 'unique_key': that key is NaN whenever one of its parts is missing
#(the time functions already treat 'first_date' as possibly null), and drop_duplicates treats all NaN keys
#as equal, which would merge unrelated projects into a single row
df = df[~df.index.duplicated(keep='first')]

In [117]:
#(rows, columns) after dropping duplicates; the row count should match devs.shape from before the join
df.shape

(2475, 53)

In [118]:
#flag projects that matched a boundary polygon, i.e. have a non-null 'planarea' from the join
df['plan_area'] = df['planarea'].notnull()

In [119]:
#per unit variables: each stage duration divided by the project's number of units
per_unit_columns = (
    ('ptime_per_unit', 'permit_time'),
    ('bptime_per_unit', 'bp_time'),
    ('contime_per_unit', 'con_time'),
)
for per_unit_col, time_col in per_unit_columns:
    df[per_unit_col] = df[time_col] / df['units']

In [120]:
#split the sample into projects inside a plan area (df_pa) and outside one (df_npa)
in_plan_area = df['plan_area']
df_pa = df[in_plan_area]
df_npa = df[~in_plan_area]

In [121]:
#average times by plan and not plan area
#average times per unit by plan and not plan area
#t-tests, averages, and sample sizes

In [122]:
#compare stage durations between plan-area and non-plan-area projects
#for each stage print: heading, the two group means, the t-test p-value, and the two non-null sample sizes
from scipy.stats import ttest_ind

stage_columns = (
    ("Entitlement Time:", 'permit_time'),
    ("Construction Prep Time:", 'bp_time'),
    ("Construction Time:", 'con_time'),
)
for heading, column in stage_columns:
    in_plan, out_plan = df_pa[column], df_npa[column]
    #index 1 of the t-test result is the p-value; NaN durations are omitted
    p_value = ttest_ind(in_plan, out_plan, nan_policy='omit')[1]
    print(heading)
    print(in_plan.mean().astype(str), out_plan.mean().astype(str))
    print(p_value.astype(str))
    #Series.count() already counts only the non-null values
    print(str(in_plan.count()))
    print(str(out_plan.count()))

Entitlement Time:
2.7511284380700314 1.76803911676821
1.280751462704843e-14
383
706
Construction Prep Time:
1.0886971728359078 1.1328734114540353
0.6574059095099064
235
415
Construction Time:
1.3246261633378649 1.179532634971797
0.1754821607589882
262
459


In [123]:
#same comparison as for the raw durations, but on the per-unit durations
#for each stage print: heading, the two group means, the t-test p-value, and the two non-null sample sizes
from scipy.stats import ttest_ind

per_unit_stage_columns = (
    ("Entitlement Time:", 'ptime_per_unit'),
    ("Construction Prep Time:", 'bptime_per_unit'),
    ("Construction Time:", 'contime_per_unit'),
)
for heading, column in per_unit_stage_columns:
    in_plan, out_plan = df_pa[column], df_npa[column]
    #index 1 of the t-test result is the p-value; NaN values are omitted
    p_value = ttest_ind(in_plan, out_plan, nan_policy='omit')[1]
    print(heading)
    print(in_plan.mean().astype(str), out_plan.mean().astype(str))
    print(p_value.astype(str))
    #Series.count() already counts only the non-null values
    print(str(in_plan.count()))
    print(str(out_plan.count()))

Entitlement Time:
0.8673579155859166 1.3518801165366217
7.215670841100008e-10
383
706
Construction Prep Time:
0.493203284832352 0.886130392683126
1.3010946114385201e-06
235
415
Construction Time:
0.6059701504300024 0.922799526491177
0.0010284462064382863
262
459
