In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import contextily as ctx

In [2]:
# Load the three cleaned CSV inputs used throughout the notebook.
# Paths are relative to the notebook's working directory. Date columns arrive as
# plain strings here and are converted with pd.to_datetime in the next cell.
sf_sites = pd.read_csv('./data/clean_data/sf_sites.csv')
sf_new_construction_all = pd.read_csv('./data/clean_data/sf_all_construction.csv')
sf_new_construction_post_2015 = pd.read_csv('./data/clean_data/sf_construction_post_2015.csv')

In [3]:
# Every column whose name contains 'Date' is parsed to datetime in both permit
# tables. The column list is taken from the full table and reused for the
# post-2015 table.
date_cols = [col for col in sf_new_construction_all.columns if 'Date' in col]
for permits in (sf_new_construction_all, sf_new_construction_post_2015):
    permits[date_cols] = permits[date_cols].apply(pd.to_datetime)

## Make sure the `relcapcty` column is what we want (`allowden` × `locacres`):

In [4]:
# Sanity check: `allowden * locacres` (presumably allowed density × parcel acres —
# TODO confirm units) is compared against the reported `relcapcty`.
# Rows where either side is missing are dropped before comparing.
implied_capacity = sf_sites['allowden'] * sf_sites['locacres']
reported_capacity = sf_sites['relcapcty'].astype(float)
test_df = pd.DataFrame({
    'maybe_density': implied_capacity,
    'capacity': reported_capacity,
}).dropna()
# Despite the `_int` suffix, both columns are rounded to one decimal place.
test_df = test_df.assign(
    maybe_density_int=test_df['maybe_density'].round(1),
    capacity_int=test_df['capacity'].round(1),
)

In [5]:
# Share of rows where the rounded product equals the rounded reported capacity.
test_df['maybe_density_int'].eq(test_df['capacity_int']).mean()

1.0

## Now let's do some stats:

In [6]:
# Fraction of sf_sites rows whose APN appears anywhere in the full construction table.
sf_sites['apn'].isin(sf_new_construction_all['apn']).mean()

0.08379343942838584

In [7]:
# Fraction of sf_sites rows whose APN appears anywhere in the post-2015 construction table.
sf_sites['apn'].isin(sf_new_construction_post_2015['apn']).mean()

0.044982137057486195

In [8]:
# Reverse direction: fraction of rows in the full construction table whose APN is in sf_sites.
sf_new_construction_all['apn'].isin(sf_sites['apn']).mean()

0.09545454545454546

In [9]:
# Reverse direction: fraction of rows in the post-2015 construction table whose APN is in sf_sites.
sf_new_construction_post_2015['apn'].isin(sf_sites['apn']).mean()

0.10060734090308952

In [10]:
# Total `new_units` over post-2015 construction rows whose APN is in sf_sites.
apn_in_sites = sf_new_construction_post_2015['apn'].isin(sf_sites['apn'])
sf_new_construction_post_2015.loc[apn_in_sites, 'new_units'].sum()

15712.0

In [11]:
# Total `new_units` over every row of the post-2015 construction table.
sf_new_construction_post_2015['new_units'].sum()

72562.0

In [12]:
# Share of post-2015 `new_units` located on APNs that are in sf_sites.
# Computed from the frames directly rather than from numbers copied out of
# earlier cell outputs, so it stays correct if the input data changes.
apn_in_sites = sf_new_construction_post_2015['apn'].isin(sf_sites['apn'])
(
    sf_new_construction_post_2015.loc[apn_in_sites, 'new_units'].sum()
    / sf_new_construction_post_2015['new_units'].sum()
)

0.2165320691270913

## Number of units in the HE (sum of `relcapcty` over `sf_sites`):

In [13]:
# Sum of `relcapcty` over all sf_sites rows. sf_sites has not been de-duplicated
# by APN at this point, so repeated APNs each contribute their own row.
sf_sites['relcapcty'].sum()

47209.0

## Overall number of units filed, issued, started, and completed:

In [14]:
# 'Proposed Units' summed over permits in the full table that have a 'Filed Date'.
filed_all = sf_new_construction_all['Filed Date'].notna()
n_filed_ever = sf_new_construction_all.loc[filed_all, 'Proposed Units'].sum()
n_filed_ever

172275.0

In [15]:
# 'Proposed Units' summed over permits in the full table that have an 'Issued Date'.
issued_all = sf_new_construction_all['Issued Date'].notna()
n_issued_ever = sf_new_construction_all.loc[issued_all, 'Proposed Units'].sum()
n_issued_ever

123073.0

In [16]:
# 'Proposed Units' summed over permits in the full table that have a
# 'First Construction Document Date'.
started_all = sf_new_construction_all['First Construction Document Date'].notna()
n_started_ever = sf_new_construction_all.loc[started_all, 'Proposed Units'].sum()
n_started_ever

31116.0

In [17]:
# 'Proposed Units' summed over permits in the full table that have a 'Completed Date'.
completed_all = sf_new_construction_all['Completed Date'].notna()
n_completed_ever = sf_new_construction_all.loc[completed_all, 'Proposed Units'].sum()
n_completed_ever

66284.0

In [18]:
# 'Proposed Units' summed over post-2015 permits that have a 'Filed Date'.
filed_post_15 = sf_new_construction_post_2015['Filed Date'].notna()
n_filed_post_15 = sf_new_construction_post_2015.loc[filed_post_15, 'Proposed Units'].sum()
n_filed_post_15

95144.0

In [19]:
# 'Proposed Units' summed over post-2015 permits that have an 'Issued Date'.
issued_post_15 = sf_new_construction_post_2015['Issued Date'].notna()
n_issued_post_15 = sf_new_construction_post_2015.loc[issued_post_15, 'Proposed Units'].sum()
n_issued_post_15

95144.0

In [20]:
# 'Proposed Units' summed over post-2015 permits that have a
# 'First Construction Document Date'.
started_post_15 = sf_new_construction_post_2015['First Construction Document Date'].notna()
n_started_post_15 = sf_new_construction_post_2015.loc[started_post_15, 'Proposed Units'].sum()
n_started_post_15

25660.0

In [21]:
# 'Proposed Units' summed over post-2015 permits that have a 'Completed Date'.
completed_post_15 = sf_new_construction_post_2015['Completed Date'].notna()
n_completed_post_15 = sf_new_construction_post_2015.loc[completed_post_15, 'Proposed Units'].sum()
n_completed_post_15

44172.0

In [22]:
# 'Proposed Units' in permits from the full table completed on or after 2015-01-01.
# Only the completion date is filtered; there is no condition on when the
# permit was filed, issued, or started.
sf_new_construction_all[
    sf_new_construction_all['Completed Date'] >= '2015-01-01'
]['Proposed Units'].sum()

57865.0

In [23]:
# Units in permits with a completion date, relative to units in permits with an issue date
# (both over the full construction table).
n_completed_ever / n_issued_ever

0.5385746670675128

In [24]:
# NOTE(review): divides the all-table completed total by 5 — presumably meant as a
# per-year average over a 5-year window; confirm the period that
# `sf_new_construction_all` actually covers before relying on this figure.
n_completed_ever / 5

13256.8

In [25]:
# Permits issued before 2015-01-01 and completed on or after that date.
completed_2015_onward = sf_new_construction_all['Completed Date'] >= '2015-01-01'
issued_before_2015 = sf_new_construction_all['Issued Date'] < '2015-01-01'
sf_new_construction_all[completed_2015_onward & issued_before_2015]

Unnamed: 0,Permit Number,Permit Type,Permit Type Definition,Permit Creation Date,Block,Lot,Street Number,Street Number Suffix,Street Name,Street Suffix,...,Current Police Districts,Current Supervisor Districts,Analysis Neighborhoods,DELETE - Zip Codes,DELETE - Fire Prevention Districts,DELETE - Supervisor Districts,DELETE - Current Police Districts,DELETE - Supervisorial_Districts_Waterline_data_from_7pkg_wer3,apn,new_units
38,201406138386,1,new construction,2014-06-13,0281,003,832,,Sutter,St,...,6.0,3.0,21.0,28858.0,5.0,10.0,1.0,2.0,0281/003,20.0
40,201308204717,1,new construction,2013-08-20,4624,031,142,,West Point,Rd,...,2.0,9.0,1.0,58.0,10.0,8.0,3.0,6.0,4624/031,50.0
71,201307051190,1,new construction,2013-07-05,0811,031,101,,Polk,St,...,4.0,10.0,36.0,28852.0,7.0,9.0,6.0,3.0,0811/031,162.0
72,201404042522,1,new construction,2014-04-04,8711,031,588,,Mission Bay Blvd North,Bl,...,1.0,10.0,4.0,310.0,14.0,9.0,3.0,3.0,8711/031,200.0
85,201301319232,1,new construction,2013-01-31,3509,043,104,,09th,St,...,1.0,10.0,34.0,28853.0,8.0,9.0,2.0,3.0,3509/043,160.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7148,201407080672,8,otc alterations permit,2014-07-08,4146,019,2830,,22nd,St,...,3.0,2.0,20.0,28859.0,2.0,7.0,7.0,7.0,4146/019,1.0
7150,201407070539,8,otc alterations permit,2014-07-07,6349,021,717,V,Paris,St,...,9.0,1.0,7.0,28861.0,9.0,6.0,4.0,8.0,6349/021,1.0
7151,201407030495,8,otc alterations permit,2014-07-03,4591C,045,451,,Hudson,Av,...,2.0,9.0,1.0,58.0,10.0,8.0,3.0,6.0,4591C/045,9.0
7152,201407030493,8,otc alterations permit,2014-07-03,4591C,068,421,,Hudson,Av,...,2.0,9.0,1.0,58.0,10.0,8.0,3.0,6.0,4591C/068,9.0


# By capacity

In [26]:
sf_sites.columns

Index(['objectid', 'apn', 'locapn', 'genplan', 'zoning', 'gacres', 'locacres',
       'allowden', 'allowhigh', 'relcapcty', 'sitetype', 'pdaparcel',
       'existuse', 'Shape__Are', 'Shape__Len', 'geometry'],
      dtype='object')

In [27]:
sf_sites.shape

(6158, 16)

In [28]:
# Ten most frequent `relcapcty` values.
# `.head(10)` is positional. The previous `[:10]` slice was resolved by *label*
# on the float-valued index (it ran up to the label 10.0), which is why the
# earlier output showed 11 rows rather than 10.
sf_sites['relcapcty'].value_counts().head(10)

1.0     1103
2.0      959
0.0      904
3.0      687
4.0      463
5.0      291
6.0      227
7.0      194
8.0      139
9.0      117
10.0     102
Name: relcapcty, dtype: int64

In [29]:
# Number of sf_sites rows with `relcapcty` strictly greater than 10.
(sf_sites['relcapcty'] > 10).sum()

972

In [31]:
# For each capacity value 1 through 10: the share of sf_sites rows with exactly
# that `relcapcty` whose APN appears in the full construction table.
permit_apns = sf_new_construction_all['apn']
rows = [
    {
        'capacity_num_units': capacity,
        'fraction_of_projects_built': sf_sites.loc[
            sf_sites['relcapcty'] == capacity, 'apn'
        ].isin(permit_apns).mean(),
    }
    for capacity in range(1, 11)
]

pd.DataFrame(rows)

Unnamed: 0,capacity_num_units,fraction_of_projects_built
0,1,0.113327
1,2,0.084463
2,3,0.056769
3,4,0.105832
4,5,0.092784
5,6,0.0837
6,7,0.108247
7,8,0.093525
8,9,0.102564
9,10,0.039216


In [32]:
# Same share as the per-capacity table, for sites with `relcapcty` above 10.
over_ten_units = sf_sites['relcapcty'] > 10
sf_sites.loc[over_ten_units, 'apn'].isin(sf_new_construction_all['apn']).mean()

0.11625514403292181

# Merging the datasets

In [38]:
# Distribution of how many times each APN occurs in sf_sites
# (index = occurrences per APN, value = number of APNs with that many rows).
sf_sites['apn'].value_counts().value_counts()

1    6128
2      13
4       1
Name: apn, dtype: int64

In [39]:
# One row per APN, keeping the row with the largest `relcapcty`.
# sort_values places NaN last by default, which combined with keep='last' would
# let a missing capacity win over a real one; na_position='first' prevents that.
sf_sites_deduped = (
    sf_sites
    .sort_values('relcapcty', na_position='first')
    .drop_duplicates('apn', keep='last')
)

In [35]:
# Post-2015 permits issued strictly before 2020-01-01. Rows with a missing
# 'Issued Date' fail the comparison and are therefore excluded.
issued_before_2020 = sf_new_construction_post_2015['Issued Date'] < '2020-01-01'
sf_new_construction_through_2019 = sf_new_construction_post_2015.loc[issued_before_2020]

In [42]:
# Distribution of how many permits each APN has in the through-2019 table
# (index = permits per APN, value = number of APNs with that many permits).
sf_new_construction_through_2019['apn'].value_counts().value_counts()

1     2033
2      407
3       65
4       20
5       13
6        9
8        2
10       2
14       1
11       1
Name: apn, dtype: int64

In [44]:
# One permit per APN: order by issue date, then by unit count, and keep the
# final row for each APN (latest issue date; larger `new_units` breaks ties).
permits_by_issue_date = sf_new_construction_through_2019.sort_values(
    by=['Issued Date', 'new_units']
)
sf_new_construction_through_2019_deduped = permits_by_issue_date.drop_duplicates(
    subset='apn', keep='last'
)

In [45]:
# Left-join the de-duplicated permits onto the de-duplicated sites by APN.
# `indicator=True` adds a `_merge` column ('both' = site has a matching permit).
# `validate` makes pandas raise MergeError if either side ever has repeated APNs,
# rather than silently multiplying site rows.
merged_df = sf_sites_deduped.merge(
    sf_new_construction_through_2019_deduped,
    on='apn',
    how='left',
    indicator=True,
    validate='one_to_one',
)

In [46]:
# Post-merge check: every APN should occur exactly once in merged_df.
merged_df['apn'].value_counts().value_counts()

1    6142
Name: apn, dtype: int64

In [48]:
# Substring -> broad zoning category, consumed by get_zone_mapping, which returns
# the category of the FIRST key found inside a zoning code. Insertion order
# therefore matters where keys overlap: e.g. 'RM' must stay ahead of 'M-1',
# because a code such as 'RM-1' contains both.
zone_mappings = {
    'PDR': 'Industrial',
    'RH': 'Residential',
    'NC': 'Commercial',
    'RM': 'Residential',
    'RC': 'Commercial',
    'RTO': 'Residential',
    'M-1': 'Industrial',
    'UMU': 'Commercial',
    'C-2': 'Commercial',
    'C-3': 'Commercial',
    'C-M': 'Commercial',
}

In [49]:
# Exact-match lookup from the `existuse` label to a broad current-use category.
# Entries are grouped by category for readability; lookup order is irrelevant.
current_use_mappings = {
    # Vacant
    'Vacant': 'Vacant',
    # Residential
    'Dwelling': 'Residential',
    'Flat+store': 'Residential',
    'Flats+duplex': 'Residential',
    'Condominium': 'Residential',
    'Apartment': 'Residential',
    'Dwel+apts': 'Residential',
    'Store/condo': 'Residential',
    'Apt/store': 'Residential',
    'Nursing Hm': 'Residential',
    'Livework': 'Residential',
    'Dwelling Bm': 'Residential',
    '2 Dwel 1 Lot': 'Residential',
    'Flat+apt': 'Residential',
    'Co-op Units': 'Residential',
    # Commercial
    'Commercial': 'Commercial',
    'Office Bld': 'Commercial',
    'Gas Station': 'Commercial',
    'Comm Garage': 'Commercial',
    'Parking Lot': 'Commercial',
    'Bank': 'Commercial',
    'Motel': 'Commercial',
    'Club/lodge': 'Commercial',
    'Theatre': 'Commercial',
    'Other Hotel': 'Commercial',
    'Office Condo': 'Commercial',
    'Indust Data': 'Commercial',
    'Shopping Cen': 'Commercial',
    # Industrial
    'Industrial': 'Industrial',
    'Indust Ware': 'Industrial',
    # Other
    'Misc/unknown': 'Other',
    'Church': 'Other',
    'Public Prop': 'Other',
    'School': 'Other',
    'Hospital': 'Other',
    'D/f >1 Lot': 'Other',
}

In [50]:
def get_zone_mapping(zone):
    """Classify a zoning code into a broad zoning category.

    Walks ``zone_mappings`` in insertion order and returns the category of the
    first key that occurs as a substring of ``zone``. Because matching is by
    substring, key order decides the result when keys overlap (for example,
    'RM-1' contains both 'RM' and 'M-1').

    Args:
        zone: Zoning code string. Non-string values (such as the float NaN
            pandas uses for a missing entry, or None) are treated as unmatched.

    Returns:
        The matching category string, or None when nothing matches.
    """
    # `substring in <float>` raises TypeError, so a single missing zoning value
    # would abort the whole Series.map call; report it as unmatched instead.
    if not isinstance(zone, str):
        return None
    for zone_match_string, zone_type in zone_mappings.items():
        if zone_match_string in zone:
            return zone_type
    return None

# New frame (merged_df itself is left untouched) with two derived columns:
# a zoning category from substring matching and a current-use category from
# an exact dictionary lookup.
merged_df_with_type = merged_df.assign(
    zoning_type=merged_df['zoning'].map(get_zone_mapping),
    current_use_type=merged_df['existuse'].map(current_use_mappings),
)

In [51]:
# Sites per zoning category. Rows whose zoning code matched no key are None and
# are left out by value_counts (it drops missing values by default).
merged_df_with_type['zoning_type'].value_counts()

Commercial     2869
Residential    1928
Industrial     1038
Name: zoning_type, dtype: int64

In [52]:
# Sites per current-use category. `existuse` values absent from
# current_use_mappings become NaN and are left out by value_counts.
merged_df_with_type['current_use_type'].value_counts()

Residential    1703
Commercial     1578
Vacant         1534
Industrial      890
Other           437
Name: current_use_type, dtype: int64

In [53]:
# Share of sites with ('both') and without ('left_only') a matching permit.
# 'right_only' cannot occur in a left join.
merged_df_with_type['_merge'].value_counts(normalize=True)

left_only     0.960111
both          0.039889
right_only    0.000000
Name: _merge, dtype: float64

In [59]:
# Share of sites in each zoning category that matched a permit (`_merge == 'both'`).
# Taking a grouped mean of the boolean Series replaces DataFrameGroupBy.apply with
# a Python lambda; the values are the same, but the result Series is now named `_merge`.
site_matched = merged_df_with_type['_merge'].eq('both')
site_matched.groupby(merged_df_with_type['zoning_type']).mean()

zoning_type
Commercial     0.041478
Industrial     0.005780
Residential    0.054461
dtype: float64

In [61]:
# Share of sites in each current-use category that matched a permit (`_merge == 'both'`).
# Taking a grouped mean of the boolean Series replaces DataFrameGroupBy.apply with
# a Python lambda; the values are the same, but the result Series is now named `_merge`.
site_matched = merged_df_with_type['_merge'].eq('both')
site_matched.groupby(merged_df_with_type['current_use_type']).mean()

current_use_type
Commercial     0.029785
Industrial     0.040449
Other          0.057208
Residential    0.034645
Vacant         0.050847
dtype: float64

# Extrapolate the 5-year (2015–2019) rate to 8 years

In [64]:
# Observed share of de-duplicated sites that have a matching permit issued before 2020.
(merged_df['_merge'] == 'both').mean()

0.03988928687723869

In [65]:
# Linear scaling of the observed match rate from a 5-year window to 8 years.
target_years = 8
observed_years = 5
target_years / observed_years * merged_df['_merge'].eq('both').mean()

0.06382285900358191

In [62]:
# Per-zoning-category match rate, scaled linearly from the 5-year window to 8 years.
# Taking a grouped mean of the boolean Series replaces DataFrameGroupBy.apply with
# a Python lambda; the values are the same, but the result Series is now named `_merge`.
site_matched = merged_df_with_type['_merge'].eq('both')
8 / 5 * site_matched.groupby(merged_df_with_type['zoning_type']).mean()

zoning_type
Commercial     0.066365
Industrial     0.009249
Residential    0.087137
dtype: float64

In [63]:
# Per-current-use-category match rate, scaled linearly from the 5-year window to 8 years.
# Taking a grouped mean of the boolean Series replaces DataFrameGroupBy.apply with
# a Python lambda; the values are the same, but the result Series is now named `_merge`.
site_matched = merged_df_with_type['_merge'].eq('both')
8 / 5 * site_matched.groupby(merged_df_with_type['current_use_type']).mean()

current_use_type
Commercial     0.047655
Industrial     0.064719
Other          0.091533
Residential    0.055432
Vacant         0.081356
dtype: float64