# Breaking Down the "Data Model" Code

In [152]:
import pandas as pd
import logging
import dateutil
from dateutil import parser
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Display tweaks: render floats with two decimal places and allow up to
# 500 columns before pandas truncates a frame's repr.
pd.set_option('display.float_format', '{:.2f}'.format)
pd.set_option('display.max_columns', 500)

In [153]:
# Configure the root logger so that messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

In [154]:
# Load the merged quarterly records.
# low_memory=False makes pandas infer each column's dtype from the whole file
# in one pass instead of chunk by chunk, which avoids the mixed-type
# DtypeWarning and mixed int/str values within a column. `apn` is read as
# text because it is an identifier (values such as '0867033' and '4591C068'
# appear in the group keys), so leading zeros must be preserved.
df = pd.read_csv(
    "cleaned/all_quarters_merged.csv",
    dtype={'apn': str},
    low_memory=False,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [155]:
# Keep only the variables of interest.
# The column list is deliberately NOT called `list`, so the built-in `list`
# type stays usable for the rest of the notebook. `.copy()` makes the subset
# an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
columns_of_interest = [
    'address', 'apn', 'best_date', 'best_stat', 'firstfiled',
    'report_quarter', 'report_year', 'units', 'unitsnet', 'dbi_permit',
    'zoning_simplified', 'x', 'y',
]
df = df[columns_of_interest].copy()

In [156]:
#consolidate status categories. Start with 3 for now
def status_function(value):
    """Map a record's raw ``best_stat`` onto one of three consolidated labels.

    Parameters
    ----------
    value : mapping or pandas.Series
        A record exposing a ``'best_stat'`` entry.

    Returns
    -------
    str
        ``'Under Construction'`` for ``"CONSTRUCTION"``,
        ``'Building Permit Approved'`` for any of the three ``BP ...`` codes,
        and ``'Proposed'`` for everything else (including missing values).
    """
    permit_codes = ('BP APPROVED', 'BP ISSUED', 'BP REINSTATED')
    raw_stat = value['best_stat']
    if raw_stat == "CONSTRUCTION":
        return 'Under Construction'
    if raw_stat in permit_codes:
        return 'Building Permit Approved'
    return 'Proposed'
        
# Derive the consolidated `status` column by applying status_function to every row.
df['status']=df.apply(status_function, axis=1)

In [157]:
# Group the records by parcel number and address.
# NOTE(review): presumably each (apn, address) pair identifies one project — confirm.
gb = df.groupby(['apn', 'address'])

In [158]:
# Inspect the mapping from each (apn, address) key to the row labels in that group.
# Note: this displays every group, so the output is very long.
gb.groups

{('2819036', '11 CASTENADA AV'): Int64Index([510], dtype='int64'),
 ('3736084',
  '48 TEHAMA ST'): Int64Index([24779, 26012, 27471, 28886, 30780], dtype='int64'),
 ('0867033',
  '445 WALLER ST'): Int64Index([ 1659,  2182,  3199,  3419,  4634,  5322,  6316,  7060,  7636,
              8312,  9005, 10211, 10681, 11420, 12030, 12904, 13810, 14668,
             15526, 16366, 17514, 18151, 19512, 20699, 21939, 23417],
            dtype='int64'),
 ('3507042', '1420 MISSION ST'): Int64Index([19104], dtype='int64'),
 ('7104024',
  '454 CAPITOL AV'): Int64Index([25217, 27430, 28843, 30475], dtype='int64'),
 ('1797006',
  '1333 42ND AV'): Int64Index([19761, 20936, 22115, 22706, 24012, 25271], dtype='int64'),
 ('4591C068',
  '421 HUDSON AV'): Int64Index([13282, 14243, 14944, 15896, 16569, 17241, 18544, 19163, 20328], dtype='int64'),
 ('3579022',
  '505 CHURCH ST'): Int64Index([23477, 24812, 26072, 27496, 28915, 30509], dtype='int64'),
 ('0024011C', '2718 HYDE ST'): Int64Index([28594, 30768], dtyp

In [165]:
# Pull out the records of one example group to step through the logic below.
group_df = gb.get_group(('4646020','900 INNES AVE'))

In [166]:
# Order the group's records by best_date, then report year and quarter
# (all ascending), and renumber the rows from 0.
sort_keys = ['best_date', 'report_year', 'report_quarter']
group_df = group_df.sort_values(by=sort_keys).reset_index(drop=True)

In [167]:
# Display the sorted records of the selected group.
group_df

Unnamed: 0,address,apn,best_date,best_stat,firstfiled,report_quarter,report_year,units,unitsnet,dbi_permit,zoning_simplified,x,y,status
0,900 INNES AVE,4646020,2004-09-17,PL FILED,2004-09-17,3,2009,128.0,128.0,,NC-2,-122.38,37.73,Proposed


In [150]:
# Start the summary row from an independent, zero-indexed copy of the
# group's final record.
last_row = group_df.iloc[-1:].copy().reset_index(drop=True)
        
# Identify the building permit ID: the most recent non-null `dbi_permit`
# among the group's records (NaN when no record carries one).
permits_on_record = group_df['dbi_permit'].dropna()
building_permit = permits_on_record.iloc[-1] if len(permits_on_record) else np.nan

# Standardize the permit ID to a string: missing -> '', float -> integer
# digits without a trailing ".0", anything else -> str().
if pd.isnull(building_permit):
    building_permit = ''
elif isinstance(building_permit, float):
    building_permit = str(int(building_permit))
else:
    building_permit = str(building_permit)

# Identify the completion quarter for projects that have reached completion.
# Only the group's final record decides this: if its best_stat is
# CONSTRUCTION, that record's report quarter/year is the last quarter the
# project was seen under construction; otherwise there is no completion.
# Defaults are set first so an empty group leaves both values as NaN.
comp_quarter = np.nan
comp_year = np.nan
if len(group_df) > 0:
    final_record = group_df.iloc[-1]
    if final_record['best_stat'] == 'CONSTRUCTION':
        comp_quarter = final_record['report_quarter']
        comp_year = final_record['report_year']

# Completion is dated to the start of the quarter AFTER the last
# CONSTRUCTION report (Q4 rolls over to Q1 of the next year).
# NOTE(review): a final CONSTRUCTION report in Q1 2017 is treated as "not yet
# complete" — presumably because that is the latest reporting period; the
# drop-out check further down compares against Q2 2017 instead. Confirm which
# period is the true end of the data.
if pd.isnull(comp_quarter) or (comp_quarter == 1 and comp_year == 2017):
    comp_quarter = np.nan
elif comp_quarter == 4:
    comp_quarter = 1
    comp_year = comp_year + 1
else:
    comp_quarter = comp_quarter + 1

# Month/day on which each quarter starts; a missing or unexpected quarter
# maps to NaN instead of leaving comp_daymth unbound.
quarter_start_daymth = {1: '01/01', 2: '04/01', 3: '07/01', 4: '10/01'}
comp_daymth = quarter_start_daymth.get(comp_quarter, np.nan)

if pd.isnull(comp_daymth) or pd.isnull(comp_year):
    comp_date = np.nan
else:
    # int() keeps a float year such as 2016.0 from rendering as "2016.0".
    comp_date = comp_daymth + "/" + str(int(comp_year))

# If the project was not completed, work out whether it dropped out.
# Positional access (.iloc[0]) is used so this does not depend on the
# summary row carrying the index label 0.
final_status = last_row['best_stat'].iloc[0]
final_quarter = last_row['report_quarter'].iloc[0]
final_year = last_row['report_year'].iloc[0]

# A project has dropped out when its final record is not under construction
# AND that record is not from the latest reporting period. The period must be
# compared as a (quarter, year) pair: testing `quarter != 2 and year != 2017`
# separately would never flag a project last seen in any Q2 or any time in 2017.
# NOTE(review): Q2 2017 is used as the latest reporting period here, while the
# completion logic earlier in this cell special-cases Q1 2017 — confirm.
in_latest_report = (final_quarter == 2) and (final_year == 2017)
dropped_out = bool(final_status != 'CONSTRUCTION' and not in_latest_report)

# Identify the earliest "firstfiled" date across the group's records.
# Null entries are skipped; the first non-null value seeds the result and is
# replaced only by a strictly earlier (parsed) date. Stays '' if none exist.
firstfiled = ''
for filed_on in group_df['firstfiled'].dropna():
    if len(firstfiled) == 0 or dateutil.parser.parse(filed_on) < dateutil.parser.parse(firstfiled):
        firstfiled = filed_on
        
# Identify the earliest best date: the first non-null, non-empty best_date
# in the group's current row order ('' when there is none).
earliest_BD = next(
    (recorded for recorded in group_df['best_date'].dropna() if len(recorded) != 0),
    '',
)

# Finalize the first date variable (the earlier of firstfiled and the
# earliest best_date). The two are compared as parsed dates, matching how
# firstfiled values are compared against each other, rather than as raw
# strings whose ordering depends on the date format. When neither date is
# available, first_date is set to NaN so it is always defined and the
# null check before the duration calculation works.
if firstfiled != '' and earliest_BD != '':
    first_date = min(firstfiled, earliest_BD, key=dateutil.parser.parse)
elif firstfiled != '':
    first_date = firstfiled
elif earliest_BD != '':
    first_date = earliest_BD
else:
    first_date = np.nan

# Start both dates blank; groups that never reach a status keep '' for it.
BP_date = ''
con_date = ''

# Walk the records in order and, each time a run of consecutive
# 'Building Permit Approved' or 'Under Construction' records begins, store
# that record's best_date for the status. A later run of the same status
# overwrites the earlier value.
status_previous = 'blah'  # sentinel that matches no real status
for index, row in group_df.iterrows():
    status_now = row['status']
    run_begins = index == 0 or status_previous != status_now
    if status_now == 'Building Permit Approved' and run_begins:
        BP_date = row['best_date']
    elif status_now == 'Under Construction' and run_begins:
        con_date = row['best_date']
    status_previous = status_now

def _last_non_null(frame, column):
    """Return the value of `column` from the last row of `frame` where it is not null (NaN if none)."""
    latest = np.nan
    for _, record in frame.iterrows():
        if not pd.isnull(record[column]):
            latest = record[column]
    return latest


# Identify the latest unit counts (gross and net), ignoring null entries.
units = _last_non_null(group_df, 'units')
unitsnet = _last_non_null(group_df, 'unitsnet')

# A few projects have a construction best date after the completion date.
# In these cases, match the two. con_date is '' when the group has no
# construction record; '' is not null, so it is excluded explicitly to keep
# it from reaching the date parser.
if pd.notnull(comp_date) and pd.notnull(con_date) and con_date != '':
    if dateutil.parser.parse(comp_date) < dateutil.parser.parse(con_date):
        comp_date = con_date
        
# Identify the latest zoning designation: the value on the group's final
# record (taken as-is, even when null). Falls back to NaN for an empty group
# so `zoning` is always defined.
zoning = group_df['zoning_simplified'].iloc[-1] if len(group_df) else np.nan
        
# Write every derived value onto the summary row. The dict preserves
# insertion order, so new columns are appended in the order listed.
summary_columns = {
    'firstfiled': firstfiled,
    'dbi_permit': building_permit,
    'comp_date': comp_date,
    'BP_date': BP_date,
    'con_date': con_date,
    'first_date': first_date,
    'latest_project_record_date': last_row.best_date,
    'first_project_record_date': group_df.iloc[0].best_date,
    'latest_project_status': last_row.best_stat,
    'units': units,
    'unitsnet': unitsnet,
    'zoning': zoning,
    'dropped_out': dropped_out,
    # Parseable lists of every reported state and the date it was reported.
    'project_dates': str(tuple(group_df.best_date)),
    'project_statuses': str(tuple(group_df.best_stat)),
}
for column_name, column_value in summary_columns.items():
    last_row[column_name] = column_value

## Store the project duration in days.
## The column is always created (NaN when either endpoint is missing) so
## every summary row has the same set of columns.
if pd.isnull(comp_date) or pd.isnull(first_date):
    last_row['project_duration_days'] = np.nan
else:
    last_row['project_duration_days'] = (dateutil.parser.parse(comp_date) - dateutil.parser.parse(first_date)).days

In [151]:
# Display the finished one-row project summary.
# NOTE(review): this cell and the one before it carry execution counts
# In [150]/In [151], lower than the cells that select and sort the group
# (In [165]-In [167]), and the stored output shows a different address from
# the group selected there. Re-run the notebook top to bottom before
# relying on the displayed values.
last_row

Unnamed: 0,address,apn,best_date,best_stat,firstfiled,report_quarter,report_year,units,unitsnet,dbi_permit,zoning_simplified,x,y,status,comp_date,BP_date,con_date,first_date,latest_project_record_date,first_project_record_date,latest_project_status,zoning,dropped_out,project_dates,project_statuses,project_duration_days
0,270 OCEAN AV,3211010A,2015-06-26,CONSTRUCTION,2006-07-18,4,2015,6.0,6.0,200607186852,NC-1,-122.44,37.72,Under Construction,01/01/2016,2008-01-25,2015-01-30,2006-07-18,2015-06-26,2008-01-25,CONSTRUCTION,NC-1,False,"('2008-01-25', '2008-01-25', '2008-01-25', '20...","('BP REINSTATED', 'BP REINSTATED', 'BP REINSTA...",3454
