In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd

## Load and clean the housing sites data

In [2]:
# Load the Bay Area Housing Opportunity Sites Inventory shapefile.
# The path is relative, so the notebook has to be run from the directory that contains ./data.
sites_df = gpd.read_file(
    './data/raw_data/housing_sites/xn--Bay_Area_Housing_Opportunity_Sites_Inventory__20072023_-it38a.shp'
)

In [3]:
# Keep only San Jose sites from the RHNA5 cycle. The .copy() detaches the subset
# so the in-place edits that follow never touch sites_df.
sj_sites = (
    sites_df
    .query('jurisdict == "San Jose" and rhnacyc == "RHNA5"')
    .copy()
)
# Normalise missing values (e.g. None) to NaN.
sj_sites.fillna(value=np.nan, inplace=True)
# Cast allowden and relcapcty to float.
for float_col in ('allowden', 'relcapcty'):
    sj_sites[float_col] = sj_sites[float_col].astype(float)

In [4]:
# Flag columns whose every value equals the value in the first row.
# NaN never compares equal to NaN, so a column with NaN in the first row is not flagged here.
first_row = sj_sites.iloc[0]
is_constant = (sj_sites == first_row).all()
constant_cols = is_constant.index[is_constant].values
constant_cols

array(['rhnacyc', 'rhnayrs', 'county', 'jurisdict', 'currunits',
       'allowlow', 'num_vl', 'num_l', 'num_m', 'num_am'], dtype=object)

In [5]:
# Constant columns carry no information within the San Jose / RHNA5 subset; remove them.
sj_sites = sj_sites.drop(columns=constant_cols)

In [6]:
# Remove columns that contain no data at all.
sj_sites = sj_sites.dropna(axis='columns', how='all')

In [8]:
# (rows, columns) remaining after the cleaning steps.
sj_sites.shape

(599, 18)

## Load building permits data

In [10]:
# This data was pulled on 2021-03-08
# San Jose building permits, stored as separate shapefiles for expired and active permits.
# Paths are relative to the notebook's working directory.
expired_permits = gpd.read_file('data/raw_data/san_jose/sj_expired_building_permits.shp')
active_permits = gpd.read_file('data/raw_data/san_jose/sj_active_building_permits.shp')

In [16]:
# Discard columns that are entirely empty in each permit table.
expired_permits = expired_permits.dropna(axis=1, how='all')
active_permits = active_permits.dropna(axis=1, how='all')

In [31]:
# Parse the date columns of the expired permits into datetimes so they can be
# binned and compared later.
for date_col in ('ISSUEDATE', 'ISSUEDATEU', 'LASTUPDATE'):
    expired_permits[date_col] = pd.to_datetime(expired_permits[date_col])

In [35]:
# Count expired permits per 10-year bin of issue date; bins are labelled by a 31 December date.
issue_decade = pd.Grouper(key='ISSUEDATE', freq='10Y')
expired_permits.groupby(issue_decade).size()

ISSUEDATE
1938-12-31        1
1948-12-31        1
1958-12-31        0
1968-12-31        1
1978-12-31       11
1988-12-31       28
1998-12-31       47
2008-12-31    12541
2018-12-31    33210
2028-12-31       27
Freq: 10A-DEC, dtype: int64

In [40]:
# Parse the date columns of the active permits into datetimes so they can be
# binned and compared later.
for date_col in ('ISSUEDATE', 'ISSUEDATEU', 'LASTUPDATE'):
    active_permits[date_col] = pd.to_datetime(active_permits[date_col])

In [42]:
# Count active permits per 10-year bin of issue date; bins are labelled by a 31 December date.
issue_decade = pd.Grouper(key='ISSUEDATE', freq='10Y')
active_permits.groupby(issue_decade).size()

ISSUEDATE
1997-12-31        2
2007-12-31     1421
2017-12-31     2901
2027-12-31    17625
Freq: 10A-DEC, dtype: int64

In [44]:
# Inspect one record (row position 1) to see which fields a permit carries.
expired_permits.iloc[1]

OBJECTID                                              2.0
APPLICANT                                            NONE
CONTRACTOR                                           None
FACILITYID                                              2
INTID                                                 2.0
ADDRESS        1490  REDMOND AV  , SAN JOSE CA 95120-4858
APN                                              57528005
WORKDESC                            Additions/Alterations
SUBDESC                                     Single-Family
PERMITAPPR                                           None
ISSUEDATE                             1994-03-16 16:00:00
ISSUEDATEU                            1994-03-17 00:00:00
FINALDATE                                            None
FINALDATEU                                           None
DWELLINGUN                                            0.0
PERMITVALU                                            0.0
SQUAREFOOT                                            0.0
FOLDERNUM     

In [47]:
# Restrict the expired permits to those whose WORKDESC is exactly 'New Construction'.
is_new_build = expired_permits['WORKDESC'].eq('New Construction')
expired_new_construction = expired_permits.loc[is_new_build]

In [49]:
# Number of expired new-construction permits per SUBDESC category, most frequent first.
expired_new_construction['SUBDESC'].value_counts()

Single-Family                   207
Apt/Condo/Townhouse             184
2nd Unit Added                   75
Retail                           42
Office                           19
Townhouse                        14
Condo                            14
Mixed Use                        14
Undefined                        12
Hotel/Motel                      10
Duplex                           10
School/Daycare                   10
Warehouse/Storage                 7
Open Public Parking Garage        6
Restaurant                        6
Industrial Plant                  5
Manufactured Home                 5
Tilt Up Warehouse                 3
Service Station                   2
Assembly                          2
Church                            2
Recreation Building               2
Manufacturing                     2
Medical/Dental Clinic             2
Multiple Use                      2
Closed Public Parking Garage      1
Bank                              1
High Rise                   

In [56]:
# Number of expired new-construction permits for each DWELLINGUN value, ordered by unit count.
# The frequency ordering is skipped because the index sort replaces it anyway.
expired_new_construction['DWELLINGUN'].value_counts(sort=False).sort_index()

0.0      246
1.0      261
2.0        8
3.0        5
4.0       14
5.0       14
6.0       22
7.0       30
8.0       18
9.0       11
10.0       4
11.0       1
12.0       5
13.0       1
14.0       1
17.0       1
22.0       5
40.0       1
45.0       1
48.0       1
80.0       1
86.0       1
109.0      1
113.0      1
117.0      1
148.0      1
165.0      1
205.0      1
243.0      1
271.0      1
330.0      1
357.0      1
403.0      1
Name: DWELLINGUN, dtype: int64

In [57]:
# Look up the expired permit that reports 403 dwelling units.
expired_new_construction.query('DWELLINGUN == 403')

Unnamed: 0,OBJECTID,APPLICANT,CONTRACTOR,FACILITYID,INTID,ADDRESS,APN,WORKDESC,SUBDESC,PERMITAPPR,...,FINALDATE,FINALDATEU,DWELLINGUN,PERMITVALU,SQUAREFOOT,FOLDERNUM,LASTUPDATE,LASTEDITOR,ENTERPRISE,geometry
34475,34507.0,HANOVER RS CONSTRUCTION LLC,HANOVER RS CONSTRUCTION LLC,34503,34503.0,"415 E TAYLOR ST , SAN JOSE CA 95112-3136",24909001,New Construction,Apt/Condo/Townhouse,"B-4. Complete, E-4. Complete, P-4. Complete, M...",...,,,403.0,0.0,390695.0,2016-132495-MF,2021-01-09 02:28:09,FME,PLN-PBEX-0000034503,POINT (6157176.189 1954347.642)


In [50]:
# Restrict the active permits to those whose WORKDESC is exactly 'New Construction'.
is_new_build = active_permits['WORKDESC'].eq('New Construction')
active_new_construction = active_permits.loc[is_new_build]

In [51]:
# Number of active new-construction permits per SUBDESC category, most frequent first.
active_new_construction['SUBDESC'].value_counts()

2nd Unit Added                   541
Single-Family                    457
Apt/Condo/Townhouse              225
Retail                            43
Office                            38
Mixed Use                         30
Hotel/Motel                       19
Warehouse/Storage                 19
Condo                             12
Closed Public Parking Garage       9
Recreation Building                8
Apartment                          7
Restaurant                         5
Duplex                             5
Townhouse                          3
Service Station                    3
Medical/Dental Clinic              3
Open Public Parking Garage         3
School/Daycare                     2
Assembly                           2
Undefined                          2
Industrial Plant                   2
High Rise                          2
Data Center                        2
Health Club                        1
Address Assignment                 1
Church                             1
C

In [59]:
# Show up to 70 rows so long value_counts() tables are not truncated.
# The fully qualified option name is required: the bare 'max_rows' pattern also matches
# 'styler.render.max_rows' in pandas >= 1.4 and raises OptionError there.
pd.set_option('display.max_rows', 70)

In [60]:
# Number of active new-construction permits for each DWELLINGUN value, ordered by unit count.
# The frequency ordering is skipped because the index sort replaces it anyway.
active_new_construction['DWELLINGUN'].value_counts(sort=False).sort_index()

0.0      222
1.0      996
2.0        6
3.0        6
4.0        4
5.0       29
6.0       30
7.0       44
8.0       28
9.0        2
10.0      20
11.0       1
12.0       1
13.0       2
16.0       1
24.0       2
25.0       1
31.0       1
48.0       1
52.0       2
55.0       3
64.0       1
71.0       1
77.0       1
82.0       1
83.0       1
84.0       1
86.0       1
87.0       1
92.0       1
96.0       1
97.0       1
102.0      1
105.0      1
106.0      1
112.0      1
119.0      1
120.0      1
130.0      1
135.0      2
136.0      1
162.0      2
165.0      1
166.0      1
176.0      1
190.0      2
200.0      1
219.0      1
230.0      1
233.0      1
249.0      1
260.0      1
267.0      1
268.0      1
269.0      1
299.0      1
301.0      2
304.0      1
315.0      1
318.0      1
321.0      1
326.0      1
357.0      1
369.0      1
640.0      1
Name: DWELLINGUN, dtype: int64

In [63]:
# Share of expired new-construction permits whose APN also appears among the active ones.
apn_also_active = expired_new_construction['APN'].isin(active_new_construction['APN'])
apn_also_active.mean()

0.1975867269984917

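About 20% of the expired new-construction permits share an APN with an active permit, so there is some duplication. Where a parcel has both, we keep the active permit over the expired one.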

In [64]:
# Stack expired and active new-construction permits, tagging each row with its source.
# Active rows are placed last so that keep='last' prefers an active permit over an
# expired one when both exist for the same APN.
permits_by_state = [
    expired_new_construction.assign(state='expired'),
    active_new_construction.assign(state='active'),
]
stacked_permits = pd.concat(permits_by_state, ignore_index=True)
# NOTE(review): only one permit per APN survives, including when a parcel has several
# permits with the same state; the survivor is simply the last row in file order.
# NOTE(review): this runs before any issue-date filtering, so a parcel's recent permit
# can be discarded in favour of an older one — confirm this is intended.
combined_df = stacked_permits.drop_duplicates(subset='APN', keep='last')

In [65]:
# Sanity check: distribution of rows per APN; after de-duplication every APN should appear once.
# groupby leaves out rows whose APN is missing.
combined_df.groupby('APN').size().value_counts()

1    1050
dtype: int64

In [68]:
# Keep permits issued on or after 2015-01-01; rows with a missing ISSUEDATE are dropped.
# NOTE(review): the cutoff is presumably the start of the RHNA5 period — confirm.
issued_since_cutoff = combined_df['ISSUEDATE'].ge('2015-01-01')
recent_combined_df = combined_df.loc[issued_since_cutoff]

In [70]:
# (rows, columns) of the de-duplicated permits issued since the cutoff date.
recent_combined_df.shape

(819, 23)

# Merge DataFrames

In [74]:
# Left-join the recent permits onto the sites by parcel number, keeping every site.
# Sites without a matching permit get missing values in all permit columns.
# NOTE(review): assumes locapn and APN share the same dtype and formatting — TODO confirm;
# a mismatch would silently leave sites unmatched.
merged_df = sj_sites.merge(
    recent_combined_df,
    left_on='locapn',
    right_on='APN',
    how='left'
)

In [75]:
# Display the merged frame; pandas elides the middle rows and columns of a frame this size.
merged_df

Unnamed: 0,objectid,apn,locapn,genplan,zoning,gacres,locacres,allowden,allowhigh,relcapcty,...,FINALDATEU,DWELLINGUN,PERMITVALU,SQUAREFOOT,FOLDERNUM,LASTUPDATE,LASTEDITOR,ENTERPRISE,geometry_y,state
0,66203,249-41-024,24941024,RN,R-M,0.150000,0.14,8.0,8,1.0,...,,,,,,NaT,,,,
1,66555,249-41-022,24941022,RN,R-M,0.168000,0.14,8.0,8,1.0,...,,,,,,NaT,,,,
2,76446,274-16-029,27416029,UV,CP,0.118000,0.86,12.4,12,11.0,...,,,,,,NaT,,,,
3,77834,274-59-001,27459001,UV,CO,0.614000,1.98,12.4,12,25.0,...,,,,,,NaT,,,,
4,77839,277-18-023,27718023,UV,CN,0.221000,0.74,12.4,12,9.0,...,,,,,,NaT,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
594,84228,48109049,48109049,UV,MS-G,1.036080,1.78,12.1,12,22.0,...,,,,,,NaT,,,,
595,84240,27441115,27441115,UV,CP,0.165030,0.16,12.4,12,2.0,...,,,,,,NaT,,,,
596,84250,61270016,61270016,LH,A,7.122983,11.71,0.2,0,2.0,...,,,,,,NaT,,,,
597,84251,66003013,66003013,RR,A(PD),2.075477,2.35,1.9,2,3.0,...,,,,,,NaT,,,,


In [76]:
# Fraction of sites that matched a permit: 'state' is only populated on matched rows.
merged_df['state'].notna().mean()

0.05175292153589316

In [79]:
# List the merged columns. Both inputs had a 'geometry' column, so the merge
# suffixed them as geometry_x (sites) and geometry_y (permits).
merged_df.columns

Index(['objectid', 'apn', 'locapn', 'genplan', 'zoning', 'gacres', 'locacres',
       'allowden', 'allowhigh', 'relcapcty', 'desafford', 'sitetype',
       'pdaparcel', 'existuse', 'localnote', 'Shape__Are', 'Shape__Len',
       'geometry_x', 'OBJECTID', 'APPLICANT', 'CONTRACTOR', 'FACILITYID',
       'INTID', 'ADDRESS', 'APN', 'WORKDESC', 'SUBDESC', 'PERMITAPPR',
       'ISSUEDATE', 'ISSUEDATEU', 'FINALDATE', 'FINALDATEU', 'DWELLINGUN',
       'PERMITVALU', 'SQUAREFOOT', 'FOLDERNUM', 'LASTUPDATE', 'LASTEDITOR',
       'ENTERPRISE', 'geometry_y', 'state'],
      dtype='object')

In [81]:
# For the sites that matched a permit, put the planned figures (allowden, allowhigh,
# relcapcty) next to the permit's DWELLINGUN.
has_permit = merged_df['state'].notna()
comparison_cols = ['ADDRESS', 'allowden', 'allowhigh', 'relcapcty', 'DWELLINGUN', 'zoning']
merged_df.loc[has_permit, comparison_cols]

Unnamed: 0,ADDRESS,allowden,allowhigh,relcapcty,DWELLINGUN,zoning
20,"1690 SOUTHWEST EX , SAN JOSE CA 95126-0000",66.0,66,64.0,0.0,A(PD)
21,"7020 LIVERY LN , SAN JOSE CA",1.2,1,7.0,1.0,A(PD)
23,"1787 LUCRETIA AV , SAN JOSE CA 95122-3814",14.4,14,20.0,1.0,A(PD)
192,"1364 TIFFANY CANYON CT , SAN JOSE CA 95120-0000",0.2,0,2.0,1.0,A(PD)
222,"734 N 12TH ST , SAN JOSE CA 95112",8.0,8,1.0,1.0,R-2
267,"201 BASSETT ST , SAN JOSE CA 95128-0000",184.9,185,135.0,0.0,DC
268,"252 N 1ST ST , SAN JOSE CA 95113-1002",140.3,140,216.0,0.0,DC
280,"6462 ALMADEN RD , SAN JOSE CA 95120-1901",8.0,8,7.0,1.0,R-1-8
306,"5048 BROOK VALLEY LP , SAN JOSE CA 95123-0000",8.0,8,20.0,1.0,LI
346,"16607 ALMADEN EX , SAN JOSE CA 95120-0000",50.0,50,40.0,0.0,CP


Only about 5% of the sites matched a recent permit, and the matched permits often report 0 or 1 dwelling units, so the permit data isn't complete enough to compare the planned densities with the densities actually built.
We can still look at development probabilities, though.

In [86]:
# Zoning distribution of the sites that matched a permit.
merged_df.loc[merged_df['state'].notna(), 'zoning'].value_counts()

A(PD)    20
DC        4
CP        2
LI        2
R-1-8     1
R-2       1
HI        1
Name: zoning, dtype: int64

* A(PD) means Agricultural, but with a Planned Development overlay
* DC means Downtown Primary Commercial District
* CP means Commercial Pedestrian
* LI means Light Industrial
* R-1-8 means residential, 8 dwelling units/acre
* R-2 means residential, at a higher density than R-1
* HI means Heavy Industrial

In [82]:
# Zoning distribution across all sites, as a baseline for the matched subset.
merged_df['zoning'].value_counts()

A(PD)        160
CP            61
R-1-8         60
County        43
DC            36
LI            34
R-2           26
CG            26
R-M           25
R-1-5         24
A             22
MS-G          16
CN            14
HI            14
CO            10
R-1-2          6
IP             3
R-1-1          3
R-1-5(PD)      3
R-1-2(PD)      2
CP(PD)         1
CG/R-M         1
R-1-8(PD)      1
R-M(PD)        1
R-1-RR         1
DC-NT1         1
CN(PD)         1
CG(PD)         1
R-2/LI         1
OS             1
MS-C           1
Name: zoning, dtype: int64

In [83]:
def _share_with_permit(zone_sites):
    """Return the fraction of rows in one zoning group that matched a permit."""
    return zone_sites['state'].notna().mean()


# Share of sites with a matching permit, per zoning designation.
merged_df.groupby('zoning').apply(_share_with_permit)

zoning
A            0.000000
A(PD)        0.125000
CG           0.000000
CG(PD)       0.000000
CG/R-M       0.000000
CN           0.000000
CN(PD)       0.000000
CO           0.000000
CP           0.032787
CP(PD)       0.000000
County       0.000000
DC           0.111111
DC-NT1       0.000000
HI           0.071429
IP           0.000000
LI           0.058824
MS-C         0.000000
MS-G         0.000000
OS           0.000000
R-1-1        0.000000
R-1-2        0.000000
R-1-2(PD)    0.000000
R-1-5        0.000000
R-1-5(PD)    0.000000
R-1-8        0.016667
R-1-8(PD)    0.000000
R-1-RR       0.000000
R-2          0.038462
R-2/LI       0.000000
R-M          0.000000
R-M(PD)      0.000000
dtype: float64