In [1]:
import numpy as np
import pandas as pd

## Investigating the Berkeley Dataset

**What do the data contain?**

Subset of the Alameda Dataset, specifically where SA_SITE_CITY is Berkeley.

2 Subsets: Quantifications for Berkeley's JCE and Berkeley's RC

In [3]:
# import alameda
import pickle
with open("ALAMEDA_V2_SUB.csv", "rb") as f:
    alameda = pickle.load(f)

In [17]:
alameda[]

Unnamed: 0,SA_PROPERTY_ID,SA_SCM_ID,MM_STATE_CODE,MM_MUNI_NAME,MM_FIPS_STATE_CODE,MM_FIPS_MUNI_CODE,MM_FIPS_COUNTY_NAME,SA_PARCEL_NBR_PRIMARY,SA_PARCEL_NBR_REFERENCE,SA_PARCEL_ACCOUNT_NBR,...,SA_GEO_QLTY_CODE,SA_CENSUS_TRACT,SA_CENSUS_BLOCK_GROUP,CORE_BASED_STATISTICAL_AREA_CODE,MINOR_CIVIL_DIVISION_CODE,FIPS_PLACE_CODE,SA_INACTIVE_PARCEL_FLAG,SA_SHELL_PARCEL_FLAG,FILLER,Subsidized
0,38146934,33,CA,ALAMEDA,6,1,ALAMEDA,018 035500900,,,...,0,982000.0,1.0,41860.0,92230.0,53000.0,,,,N
1,38146935,33,CA,ALAMEDA,6,1,ALAMEDA,018 035501000,,,...,0,981900.0,1.0,41860.0,92230.0,53000.0,,,,N
2,38146936,33,CA,ALAMEDA,6,1,ALAMEDA,018 037500100,,,...,0,981900.0,1.0,41860.0,92230.0,53000.0,,,,N
3,38146937,33,CA,ALAMEDA,6,1,ALAMEDA,018 037500302,,,...,0,981900.0,1.0,41860.0,92230.0,53000.0,,,,N
4,38146949,33,CA,ALAMEDA,6,1,ALAMEDA,018 038500100,,,...,0,982000.0,1.0,41860.0,92230.0,53000.0,,,,N


In [9]:
# Filter for Berkeley JCE only
alameda = alameda[alameda['Subsidized'] == 'N']
berkeley = alameda[alameda['SA_SITE_CITY'] == 'BERKELEY']
# Save Berkeley Dataset and save again later
#import pickle
#with open("data/BERKELEY.csv", "wb") as f:
#    pickle.dump(berkeley, f)

In [10]:
# Check structure 
# (number of records, number of columns)
berkeley.shape


(29957, 191)

In [11]:
# Quoted comments refer to the lines in the JC and RC inventory. Specifically each coverage/exemption.

# "Units owned by a government agency"
# "Nursing or hospital units or home for the aged and the like"
# Filter rows to only include residential buildings
berkeley = berkeley[(berkeley['USE_CODE_STD'] == 'RAPT') | (berkeley['USE_CODE_STD'] == 'RCON') | (berkeley['USE_CODE_STD'] == 'RCOO') |
                 (berkeley['USE_CODE_STD'] == 'RDUP') | (berkeley['USE_CODE_STD'] == 'RMFD') | (berkeley['USE_CODE_STD'] == 'RMOB') |
                 (berkeley['USE_CODE_STD'] == 'RQUA') | (berkeley['USE_CODE_STD'] == 'RSFR') | (berkeley['USE_CODE_STD'] == 'RTIM') |
                 (berkeley['USE_CODE_STD'] == 'RTRI')]

# "Two unit properties where one unit is owner-occupied"
# Filter out properties with 2 or more units and owner occupied
berkeley = berkeley[~((berkeley['SA_SITE_MAIL_SAME'] == 'Y') & (berkeley['SA_NBR_UNITS'] >= 2))]

# Could not figure out some things:
# 1. "Unit where tenant shares a bathroom or kitchen with the owner who maintains his principle residence there"
# 2. "Units rented to transcient guests"
# 3. the "for the aged and the like" part in "Nursing or hospital units or home for the aged and the like"
# 4. "Units rented by certain instituations of higher learning to staff, students, or faculty"
# 5. "Nonprofit housing owned and controlled by residents"

# I believe these 5 things might not be possible to find with our assessor data?

In [12]:
# Berkeley's JCE number of records after filtering
len(berkeley)

23088

In [13]:
# Filter for Berkeley RC only
berk_rc = alameda[alameda['SA_SITE_CITY'] == 'BERKELEY']

In [14]:
# Quoted comments refer to the lines in the JC and RC inventory. Specifically each coverage/exemption.

# Filter rows to only include residential buildings
berk_rc = berk_rc[(berk_rc['USE_CODE_STD'] == 'RAPT') | (berk_rc['USE_CODE_STD'] == 'RCON') | (berk_rc['USE_CODE_STD'] == 'RCOO') |
                 (berk_rc['USE_CODE_STD'] == 'RDUP') | (berk_rc['USE_CODE_STD'] == 'RMFD') | (berk_rc['USE_CODE_STD'] == 'RMOB') |
                 (berk_rc['USE_CODE_STD'] == 'RQUA') | (berk_rc['USE_CODE_STD'] == 'RSFR') | (berk_rc['USE_CODE_STD'] == 'RTIM') |
                 (berk_rc['USE_CODE_STD'] == 'RTRI')]


In [15]:
# "Units eligible for RC:"
# "Most multi-unit properties that were built before June 1980"
# "Units eligible for RC: Single family homes with tenants who moved in prior to 1996"

# "Units NOT eligible for RC: "
# "Single-family homes first re-rented on or after 1/1/1996"
# APPROXIMATING THOSE ABOVE^^^^^^^^
# Select buildings before 1996, this automatically include all buildings built before 1980. 
# It will also approximate buildings where people rented or moved in before 1996
berk_rc = berk_rc[berk_rc['SA_YR_BLT'] < 1996]

# "Units eligible for RC:"
# "Single family homes with 5 or more rooms rented out individually with seperate leases"
# Approximating above ^^^
# Approximating by automatically removing single family with less than 5 rooms
berk_rc = berk_rc[~((berk_rc['SA_NBR_BEDRMS'] < 5) & (berk_rc['USE_CODE_STD'] == 'RSFR'))]

# "Units NOT eligible for RC:"
# "Any Duplex that was owner occupied on December 31, 1979 and currently has a owner living in one of the units"
# Approximates above ^^^
berk_rc = berk_rc[~((berkeley['SA_SITE_MAIL_SAME'] == 'Y') & (berk_rc['USE_CODE_STD'] == 'RDUP'))]

# Could not figure out some things:
# 1. ""New Construction": Units that were built and received a Certificate of Occupancy after June of 1970"
# 2. "Section 8 Tenancies"
# 3. "Units where tentants shares kitchen or bath with landlord"
# 4. How to figure out if tenant rented or moved in prior to 1996

# I believe these 4 things might not be possible to find with our assessor data?




In [16]:
# Berkeley's RC number of records after filtering
len(berk_rc)

8522