This notebook tests out the helper functions (in `utils.py`) that (1) parse the APR spreadsheets for 2018-2019 data, and (2) combine the ABAG permits dataset from 2013-2017 with the APR spreadsheets from 2018-2019 to create a dataset of all permits over the entire time period.

In [1]:
import geopandas as gpd
import pandas as pd
from IPython.display import Markdown
from housing_elements import utils

Set up logging to print to the screen:

In [2]:
import logging
import sys
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

In [3]:
from importlib import reload
# Re-import housing_elements.utils so that edits made to utils.py are picked up
# without restarting the kernel.
reload(utils)

<module 'housing_elements.utils' from '/Users/Salim/Desktop/housing-elements/housing_elements/utils.py'>

First, let's test out the APR spreadsheet helper function (`utils.load_apr_permits`) on the cities we have to ensure they look reasonable.

In [None]:
# Load every (city, year) APR spreadsheet with utils.load_apr_permits and show
# the key permit columns under a markdown heading.
# Each loaded frame is also stored in `datasets`, keyed by (city, year), so it
# can be inspected after the loop (the dict was previously created but never filled).
# NOTE(review): 'SanJose' is spelled without a space here but 'San Jose' is used
# with the other utils loaders in this notebook — confirm which form
# load_apr_permits expects.
datasets = {}
for city in ['Berkeley', 'Mountain View', 'Oakland',  'Palo Alto', 'SanJose']:
    for year in ['2018', '2019']:
        filtered_df = utils.load_apr_permits(city, year)
        datasets[(city, year)] = filtered_df

        display(Markdown('# ' + city + ' ' + year))
        display(filtered_df[['Current APN', 'Street Address', '# of Units Issued Building Permits', 
                             'Unit Category (SFA,SFD,2 to 4,5+,ADU,MH)']])

These look reasonable.

Now let's test out the functions that load the ABAG permits:

In [None]:
# Load the San Jose permits via utils.load_all_new_building_permits; the result
# is reused by the San Jose cells that follow.
sj_permits_df = utils.load_all_new_building_permits('San Jose')

In [None]:
# Sum of `totalunit` for each distinct `permyear` in the San Jose permits.
sj_permits_df.groupby('permyear')['totalunit'].sum()

Looks good!

In [None]:
# List the column names of the San Jose permits frame.
sj_permits_df.columns

In [None]:
# Load the San Jose site inventory via utils.load_site_inventory; the result is
# reused by the San Jose cells that follow.
sj_sites = utils.load_site_inventory('San Jose')

In [None]:
# NOTE(review): this repeats the earlier `sj_permits_df.columns` cell; it is a
# candidate for removal.
sj_permits_df.columns

In [None]:
# Run the helper on the San Jose sites and permits.
# Presumably the share of all permitted housing that landed on inventory sites
# (inferred from the function name) — confirm against utils.py.
utils.calculate_inventory_housing_over_all_housing(sj_sites, sj_permits_df)

In [None]:
# Show the site-inventory rows whose `apn` equals the string '25417084'.
sj_sites[sj_sites.apn == '25417084']

In [None]:
# TODO: While testing with the San Jose dataset, I saw one APN listed three times
# (apn == 25417084). We should find a fix for this during cleaning.

In [None]:
# Number of site-inventory rows whose APN also appears in the permits data.
sj_sites.apn.isin(sj_permits_df.apn).sum()

In [None]:
# Number of permit rows whose APN also appears in the site inventory.
sj_permits_df.apn.isin(sj_sites.apn).sum()

In [None]:
# TODO: Why are there multiple permits per site? Are some of these not used? Are there duplicates?

In [None]:
# Run the helper on the San Jose sites and permits.
# Presumably the mean overproduction on inventory sites (inferred from the
# function name) — confirm against utils.py.
utils.calculate_mean_overproduction_on_sites(sj_sites, sj_permits_df)

In [None]:
# NOTE(review): the same call with the same arguments appears in an earlier
# cell; this one is a candidate for removal.
utils.calculate_inventory_housing_over_all_housing(sj_sites, sj_permits_df)

In [None]:
# Run the helper on the San Jose sites and permits.
# Presumably total permitted units relative to housing-element capacity
# (inferred from the function name) — confirm against utils.py.
utils.calculate_total_units_permitted_over_he_capacity(sj_sites, sj_permits_df)

In [None]:
# Count of San Jose inventory sites for each distinct `sitetype` value.
sj_sites.sitetype.value_counts()

In [None]:
# Run the helper on the San Jose sites and permits.
# Presumably the development probability for non-vacant sites (inferred from the
# function name) — confirm against utils.py.
utils.calculate_pdev_for_nonvacant_sites(sj_sites, sj_permits_df)

In [None]:
# Run the helper on the San Jose sites and permits.
# Presumably the development probability for vacant sites (inferred from the
# function name) — confirm against utils.py.
utils.calculate_pdev_for_vacant_sites(sj_sites, sj_permits_df)

In [None]:
# Run the helper on the San Jose sites and permits.
# Presumably the development probability across the whole inventory (inferred
# from the function name) — confirm against utils.py.
utils.calculate_pdev_for_inventory(sj_sites, sj_permits_df)

In [None]:
# Display the San Jose site inventory.
# NOTE(review): this renders the whole frame; `sj_sites.head()` would keep the
# saved notebook smaller.
sj_sites

In [38]:
import numpy as np
# Read the housing-sites shapefile into `df`, then keep an independent copy of
# the rows where `jurisdict` is "Windsor" and `rhnacyc` is "RHNA5" as `sites`.
shapefile_path = "./data/raw_data/housing_sites/xn--Bay_Area_Housing_Opportunity_Sites_Inventory__20072023_-it38a.shp"
df = gpd.read_file(shapefile_path)
windsor_rhna5_filter = 'jurisdict == "Windsor" and rhnacyc == "RHNA5"'
sites = df.query(windsor_rhna5_filter).copy()


In [39]:
# Frequency of each distinct raw `allowden` value among the selected sites,
# before any cleaning helpers are applied.
sites.allowden.value_counts()

8-12        2
12-32       2
3-6         2
12-24       1
FAR 2.5     1
12-16       1
3-6, 5-8    1
5-8         1
Name: allowden, dtype: int64

In [40]:
# Apply the two allowden cleaning helpers in order, feeding each result into the
# next: first remove_units_in_allowden, then remove_range_in_allowden.
for allowden_cleaner in (utils.remove_units_in_allowden, utils.remove_range_in_allowden):
    sites = allowden_cleaner(sites)

In [41]:
# Frequency of each distinct `allowden` value after the cleaning helpers ran;
# compare with the value counts taken before cleaning.
sites.allowden.value_counts()

8          2
6          2
32         2
12         2
FAR 2.5    1
16         1
24         1
Name: allowden, dtype: int64

In [37]:
# Display the current `sites` frame.
# NOTE(review): the saved output below lists Portola Valley rows and carries an
# earlier execution count (In [37]) than the cell that builds `sites` for Windsor
# (In [38]) — the output looks stale; re-run before relying on it.
sites

Unnamed: 0,objectid,rhnacyc,rhnayrs,county,jurisdict,apn,locapn,genplan,zoning,gacres,...,infcapcty,siteconst,sitetype,pdaparcel,bundled,existuse,localnote,Shape__Are,Shape__Len,geometry
33252,75386,RHNA5,2015-2023,6081,Portola Valley,077-070-110,077-070-110,Conservation Residential,R-E,4.021,...,N,No sewer,Vacant,,,,Density District: 2.5A,1.657016e-06,0.006725,"POLYGON ((-122.21251 37.39960, -122.21253 37.3..."
33253,75387,RHNA5,2015-2023,6081,Portola Valley,077-011-050,077-011-050,Conservation Residential,R-E,2.554,...,N,No sewer,Vacant,,,,Density District: 2.5A,1.052263e-06,0.004483,"POLYGON ((-122.22364 37.39558, -122.22477 37.3..."
33254,75388,RHNA5,2015-2023,6081,Portola Valley,077-060-290,077-060-290,Conservation Residential,R-E,7.400,...,N,No sewer,Vacant,,,,Density District: 2.5A,3.049226e-06,0.011562,"POLYGON ((-122.21962 37.39518, -122.21953 37.3..."
33255,75389,RHNA5,2015-2023,6081,Portola Valley,077-090-140,077-090-140,Conservation Residential,R-E,2.501,...,N,No sewer,Vacant,,,,Density District: 2.5A,1.030435e-06,0.006758,"POLYGON ((-122.21062 37.39295, -122.21062 37.3..."
33256,75390,RHNA5,2015-2023,6081,Portola Valley,077-101-170,077-101-170,Low Intensity Residential,R-E,2.702,...,N,No sewer,Vacant,,,,Density District: 1A,1.113151e-06,0.004949,"POLYGON ((-122.20902 37.39137, -122.20968 37.3..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35974,78108,RHNA5,2015-2023,6081,Portola Valley,080-241-230,080-241-230,Conservation Residential,R-E,0.611,...,N,Sewer only infrastrucutre,Vacant,,Y - 080-241-230,,Density District: 3.5A,2.516003e-07,0.002100,"POLYGON ((-122.20309 37.35538, -122.20333 37.3..."
35975,78109,RHNA5,2015-2023,6081,Portola Valley,080-241-240,080-241-240,,,0.726,...,,,,,Y - 080-241-230,,Density District: 3.5A,2.989332e-07,0.002263,"POLYGON ((-122.20333 37.35508, -122.20340 37.3..."
35976,78110,RHNA5,2015-2023,6081,Portola Valley,080-241-280,080-241-280,Conservation Residential,R-E,1.674,...,N,Sewer only infrastrucutre,Vacant,,,,Density District: 3.5A,6.893856e-07,0.003698,"POLYGON ((-122.20541 37.35551, -122.20545 37.3..."
35977,78111,RHNA5,2015-2023,6081,Portola Valley,080-471-030,080-471-030,Conservation Residential,R-E,0.480,...,N,Sewer only infrastrucutre,Vacant,,,,Density District: 2A,1.979165e-07,0.001813,"POLYGON ((-122.21210 37.36455, -122.21230 37.3..."


In [5]:
# Distinct `jurisdict` values in the sites shapefile; iterated over by the
# loader smoke-test loops further down.
# NOTE(review): this cell's execution count (In [5]) predates the cell that
# defines `df` (In [38]) — confirm the notebook runs top to bottom on a fresh kernel.
cities = df.jurisdict.unique()

In [53]:
from importlib import reload
# Reload utils again to pick up edits to utils.py; this repeats the reload cell
# near the top of the notebook.
reload(utils)

<module 'housing_elements.utils' from '/Users/Salim/Desktop/housing-elements/housing_elements/utils.py'>

In [44]:
import os, sys

class HiddenPrints:
    """Context manager that silences ``print`` output inside a ``with`` block.

    On entry, ``sys.stdout`` is replaced by a write handle on ``os.devnull``.
    On exit, the stream that was active at entry is restored and the devnull
    handle is closed. Exceptions raised inside the block are not suppressed.
    """

    def __enter__(self):
        """Redirect ``sys.stdout`` to the null device and return this manager."""
        self._original_stdout = sys.stdout
        # Keep our own reference to the sink so __exit__ closes exactly this
        # handle, even if code inside the block reassigns sys.stdout.
        self._devnull = open(os.devnull, 'w')
        sys.stdout = self._devnull
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Restore the original ``sys.stdout`` and close the devnull handle."""
        # Restore first so a failure while closing cannot leave stdout unusable.
        sys.stdout = self._original_stdout
        self._devnull.close()

Unfortunate cases of casting values to nan:
 - San Ramon has "up to 1.35 FAR" as allowden for one parcel. I will cast it to nan.
 - Newark has "2500 sf/ac" as allowden for one parcel. It will be reduced to 'ac' and cast to nan.
 - Danville uses '1 du/20 ac' as allowden. I will not do the conversion because it is not worth the time. These are cast to nan.
 - El Cerrito uses non-standardized plain English in allowden.
 - Walnut Creek mostly uses FAR instead of du/ac.
 - Pittsburg has one allowden value of 'Max 96 units'. I'm not going to support this one parcel.
 - Sausalito expresses everything as p units per k square feet, where p and k vary. I'm not bothering with this.
 - Fairfax has a lot of values of "project specific - no maximum".
 - Novato has some FAR values.
 - Portola Valley has a few allowden values of 'PD' whose meaning I cannot determine.
 
Other data info:
- Orinda has no sites in the cycle in the ABAG dataset.
- Piedmont's allowden is just nonsensical.
- Woodside's allowden is also nonsensical.

For any quantity of interest where over 50% of an input variable is nan, we should just mark it as ignored in the results table.

In [26]:
# Smoke-test utils.load_site_inventory on every jurisdiction in `cities`.
# For each one, print its index, then either SUCCESS or an error report
# (marker, city name, exception message, separator). The loader's own prints
# are silenced with HiddenPrints so only this summary is shown.
for i, city in enumerate(cities):
    print(i)
    try:
        with HiddenPrints():
            utils.load_site_inventory(city)
    except Exception as load_error:
        print('ERROR FOUND', city, str(load_error), '-------', sep='\n')
    else:
        print('SUCCESS')


0
SUCCESS
1
SUCCESS
2
SUCCESS
3
SUCCESS
4
SUCCESS
5
ERROR FOUND
San Ramon
could not convert string to float: '1.35 FAR'
-------
6
ERROR FOUND
Newark
could not convert string to float: 'ac'
-------
7
SUCCESS
8
SUCCESS
9
SUCCESS
10
SUCCESS
11
SUCCESS
12
SUCCESS
13
SUCCESS
14
SUCCESS
15
SUCCESS
16
SUCCESS
17
SUCCESS
18
SUCCESS
19
SUCCESS
20
SUCCESS
21
SUCCESS
22
SUCCESS
23
SUCCESS
24
ERROR FOUND
Danville
could not convert string to float: 'GOS'
-------
25
SUCCESS
26
SUCCESS
27
ERROR FOUND
El Cerrito
could not convert string to float: '45 with  incentives'
-------
28
SUCCESS
29
ERROR FOUND
Walnut Creek
could not convert string to float: 'no max; FAR 1.15'
-------
30
ERROR FOUND
Corte Madera
could not convert string to float: '5 ac'
-------
31
SUCCESS
32
SUCCESS
33
SUCCESS
34
ERROR FOUND
Orinda
'GeoDataFrame' object has no attribute 'apn'
-------
35
SUCCESS
36
ERROR FOUND
Pittsburg
could not convert string to float: 'Max 96 units'
-------
37
SUCCESS
38
SUCCESS
39
SUCCESS
40
ERROR FOUND
Pied

In [None]:
# Strip dashes from the APN strings and cast the result to float.
# NOTE(review): `la_sites` is not assigned in any cell visible above — confirm
# where it comes from. Also, an earlier cell compares `sj_sites.apn` against the
# string '25417084', whereas this stores APNs as floats — confirm the intended dtype.
la_sites.apn = la_sites.apn.str.replace('-','').astype('float')

In [None]:
# Shape after keeping one row per APN: rows are sorted by `totalunit` descending
# and drop_duplicates keeps the first occurrence, i.e. the largest `totalunit`.
# NOTE(review): only the last expression of a cell is displayed, so this value is
# never shown; `permits` is also not assigned in any cell visible above.
permits.sort_values('totalunit', ascending=False).drop_duplicates('apn').shape

# Display the permits frame (this is the cell's displayed output).
permits

In [None]:
# (rows, columns) of the `permits` frame, without de-duplicating by APN.
permits.shape

In [45]:
# Show the array of jurisdiction names held in `cities`.
cities

array(['Berkeley', 'Albany', 'Alameda', 'Livermore', 'Fremont',
       'San Ramon', 'Newark', 'Brentwood', 'Hayward',
       'Contra Costa County', 'Emeryville', 'Alameda County',
       'Pleasanton', 'San Leandro', 'Concord', 'Richmond', 'Martinez',
       'Clayton', 'Pinole', 'Oakland', 'San Francisco', 'Dublin',
       'Antioch', 'Lafayette', 'Danville', 'San Pablo', 'Napa',
       'El Cerrito', 'Union City', 'Walnut Creek', 'Corte Madera',
       'Moraga', 'Hercules', 'Oakley', 'Orinda', 'Marin County',
       'Pittsburg', 'Pleasant Hill', 'American Canyon', 'Larkspur',
       'Piedmont', 'San Rafael', 'Calistoga', 'Tiburon', 'Sausalito',
       'Saint Helena', 'Yountville', 'Napa County', 'San Anselmo',
       'Belvedere', 'Fairfax', 'Ross', 'Novato', 'Half Moon Bay',
       'Millbrae', 'San Bruno', 'Mill Valley', 'Brisbane', 'Atherton',
       'Menlo Park', 'Pacifica', 'Redwood City', 'Belmont', 'San Mateo',
       'Colma', 'Daly City', 'San Carlos', 'Hillsborough', 'Woodside',
 

In [55]:
# Smoke-test utils.load_all_new_building_permits on every jurisdiction in
# `cities`. The upper-cased city name is printed first; on failure an error
# report (marker, city name, exception message, separator) follows. The loader's
# own prints are silenced with HiddenPrints.
for city in cities:
    print(city.upper())
    try:
        with HiddenPrints():
            utils.load_all_new_building_permits(city)
    except Exception as load_error:
        print('ERROR FOUND', city, str(load_error), '-------', sep='\n')

BERKELEY
ALBANY
ALAMEDA
LIVERMORE
FREMONT
SAN RAMON
ERROR FOUND
San Ramon
[Errno 2] No such file or directory: 'data/raw_data/APRs/SanRamon2018.xlsm'
-------
NEWARK
BRENTWOOD
HAYWARD
CONTRA COSTA COUNTY
ERROR FOUND
Contra Costa County

-------
EMERYVILLE
ALAMEDA COUNTY
ERROR FOUND
Alameda County

-------
PLEASANTON
SAN LEANDRO
CONCORD
RICHMOND
MARTINEZ
ERROR FOUND
Martinez
[Errno 2] No such file or directory: 'data/raw_data/APRs/Martinez2018.xlsm'
-------
CLAYTON
ERROR FOUND
Clayton
[Errno 2] No such file or directory: 'data/raw_data/APRs/Clayton2018.xlsm'
-------
PINOLE
OAKLAND
SAN FRANCISCO
DUBLIN
ANTIOCH
LAFAYETTE
DANVILLE
SAN PABLO
ERROR FOUND
San Pablo
[Errno 2] No such file or directory: 'data/raw_data/APRs/SanPablo2018.xlsm'
-------
NAPA
EL CERRITO
UNION CITY
WALNUT CREEK
CORTE MADERA
ERROR FOUND
Corte Madera
[Errno 2] No such file or directory: 'data/raw_data/APRs/CorteMadera2018.xlsm'
-------
MORAGA
ERROR FOUND
Moraga
[Errno 2] No such file or directory: 'data/raw_data/APRs/Mo