In [70]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
from statsmodels.genmod.generalized_estimating_equations import GEE
from statsmodels.genmod.cov_struct import (Exchangeable,
    Independence,Autoregressive)
from statsmodels.genmod.families import Poisson


#### Demographic Data Processing


In [71]:
def remove_minus(x):
    if (x == "-"):
        return "NaN"
    else:
        return x.replace("-", "").replace("+", "").replace(",", "")

In [72]:
#this data is for 11 variables, all census tracts in California, using ACS 2009 5YR estimates 

demog_data_path = '/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/demographic_data/'
#get percentages of each race

race = pd.read_csv(demog_data_path+ 'ACS_09_5YR_B03002_race_clean.csv')
race["Percent NH White alone"] = race["Estimate; Not Hispanic or Latino: - White alone"]/race["Estimate; Total:"]
race["Percent NH Black or African African alone"] = race["Estimate; Not Hispanic or Latino: - Black or African American alone"]/race["Estimate; Total:"]
race["Percent NH Asian alone"] = race["Estimate; Not Hispanic or Latino: - Asian alone"]/race["Estimate; Total:"]
race["Percent Hispanic"] = race["Estimate; Hispanic or Latino:"]/race["Estimate; Total:"]


edu = pd.read_csv(demog_data_path+ 'ACS_09_5YR_S1501_bach_clean.csv')
edu = edu.rename({"Percent of population 25 years and over  with Bachelor\'s degree": "Percent of population 25 years and over with Bachelor\'s degree"}, axis=1)
edu["Percent of population 25 years and over with Bachelor\'s degree"] = edu["Percent of population 25 years and over with Bachelor\'s degree"].apply(lambda x: x.replace("+", "").replace("-", "NaN"))
edu["Percent of population 25 years and over with Bachelor\'s degree"] = edu["Percent of population 25 years and over with Bachelor\'s degree"].astype(float).dropna()


#get rid of + and ***
hvalue = pd.read_csv(demog_data_path+ 'ACS_09_5YR_B25077_hvalue_clean.csv')
hvalue["Estimate; Median value (dollars)"] = hvalue["Estimate; Median value (dollars)"].apply(lambda x: x.replace("+", "").replace("-", "NaN").replace(",", ""))
hvalue["Estimate; Median value (dollars)"] = hvalue["Estimate; Median value (dollars)"].astype(float)

#hvalue["Estimate; Median value (dollars)"] = hvalue["Estimate; Median value (dollars)"].astype(float).dropna()

#add attached and detached and find the %
single = pd.read_csv(demog_data_path+ 'ACS_09_5YR_B25024_singlefam_clean.csv')
single["Total Single Family"] = single["Estimate; 1, attached"]+ single["Estimate; 1, detached"]
single["% Single Family"] = single["Total Single Family"] / single["Estimate; Total:"]
#single["Tract"] = (single["Id2"].astype(str)).apply(lambda x: int(x[4:8]))

poverty = pd.read_csv(demog_data_path+"ACS_09_5YR_S1702_povertyfam_clean.csv")

# poverty = pd.read_csv(demog_data_path+ "ACS_17_5YR_S1701_poverty_clean.csv")
poverty['All families - Percent  below poverty level; Estimate; Families'] = poverty['All families - Percent  below poverty level; Estimate; Families'].apply(lambda x: x.replace("+", "").replace("-", "NaN"))
poverty['All families - Percent  below poverty level; Estimate; Families'] = poverty['All families - Percent  below poverty level; Estimate; Families'].astype(float).dropna()



units = pd.read_csv(demog_data_path+'ACS_09_5YR_B25001_units_clean.csv')
#units["Tract"] = (units["Id2"].astype(str)).apply(lambda x: int(x[4:8]))


#find % renter occupied
tenure = pd.read_csv(demog_data_path+ 'ACS_09_5YR_B25003_tenure_clean.csv')
tenure["% Renter Occupied"] = tenure["Estimate; Renter occupied"]/tenure["Estimate; Total:"]
tenure["% Owner Occupied"] = tenure["Estimate; Owner occupied"]/tenure["Estimate; Total:"]


income = pd.read_csv(demog_data_path + 'ACS_09_5YR_S1903_income.csv')
income["Median income (dollars); All households"] = income["Median income (dollars); All households"].apply(remove_minus)

income["Median income (dollars); All households"] = income["Median income (dollars); All households"].astype(float)





#### Create Controlled features dataframe

In [73]:
race_units_merged = race.merge(units, how='left', left_on="Id2", right_on="Id2")
print("race and units merged", "length", len(race_units_merged))
#print("race and units merged", "dtypes", race_units_merged.dtypes)

edu_merged = race_units_merged.merge(edu, how='left', left_on="Id2", right_on="Id2")
print("edu merged", "length", len(edu_merged))
#print("edu merged", "dtypes", edu_merged.dtypes)

income_merged = edu_merged.merge(income, how='left', left_on="Id2", right_on="Id2")
print("income merged", "length", len(income_merged))
#print("income", "dtypes", income_merged.dtypes)

poverty_merged = income_merged.merge(poverty, how='left', left_on="Id2", right_on="Id2")
print("poverty merged", "length", len(poverty_merged))
#print("poverty", "dtypes", poverty_merged.dtypes)

hvalue_merged = poverty_merged.merge(hvalue, how='left', left_on="Id2", right_on="Id2")
print("hvalue merged", "length", len(hvalue_merged))
#print("hvalue", "dtypes", hvalue_merged.dtypes)

single_merged = hvalue_merged.merge(single, how='left', left_on="Id2", right_on="Id2")
print("single merged", "length", len(single_merged))
#print("singe merged", "dtypes", single_merged.dtypes)


all_demog_vars = single_merged.merge(tenure, how='left', left_on="Id2", right_on="Id2")
print("tenure merged", "length", len(all_demog_vars))
all_demog_vars.head()


race and units merged length 7049
edu merged length 7049
income merged length 7049
poverty merged length 7049
hvalue merged length 7049
single merged length 7049
tenure merged length 7049


Unnamed: 0,Id2,Geography_x,Estimate; Total:_x,Margin of Error; Total:_x,Estimate; Not Hispanic or Latino:,Estimate; Not Hispanic or Latino: - White alone,Estimate; Not Hispanic or Latino: - Black or African American alone,Estimate; Not Hispanic or Latino: - Asian alone,Estimate; Hispanic or Latino:,Estimate; Hispanic or Latino: - White alone,...,% Single Family,Geography_y,Estimate; Total:,Margin of Error; Total:,Estimate; Owner occupied,Margin of Error; Owner occupied,Estimate; Renter occupied,Margin of Error; Renter occupied,% Renter Occupied,% Owner Occupied
0,6001400100,"Census Tract 4001, Alameda County, California",2872,233,2851,2209,119,318,21,15,...,0.912439,"Census Tract 4001, Alameda County, California",1308,108,1171,107,137,74,0.10474,0.89526
1,6001400200,"Census Tract 4002, Alameda County, California",2076,165,1854,1594,47,144,222,56,...,0.665236,"Census Tract 4002, Alameda County, California",902,42,593,71,309,73,0.342572,0.657428
2,6001400300,"Census Tract 4003, Alameda County, California",4964,369,4541,3545,492,375,423,247,...,0.478758,"Census Tract 4003, Alameda County, California",2626,132,1064,150,1562,182,0.594821,0.405179
3,6001400400,"Census Tract 4004, Alameda County, California",4223,553,3957,2948,391,330,266,149,...,0.555446,"Census Tract 4004, Alameda County, California",1964,87,852,133,1112,152,0.566191,0.433809
4,6001400500,"Census Tract 4005, Alameda County, California",3759,308,3323,1577,1303,249,436,179,...,0.44438,"Census Tract 4005, Alameda County, California",1686,82,738,102,948,119,0.562278,0.437722


In [74]:
all_demog_vars.columns
all_demog_vars = all_demog_vars[['Id2','Percent of population 25 years and over with Bachelor\'s degree', 
      'All families - Percent  below poverty level; Estimate; Families', 
      '% Single Family', '% Owner Occupied', 'Percent NH White alone',
       'Percent NH Black or African African alone', 'Percent NH Asian alone',
       'Percent Hispanic', 'Estimate; Total Number of Housing Units', 'Estimate; Median value (dollars)', 
        'Median income (dollars); All households']]
all_demog_vars_rename = all_demog_vars.rename({'All families - Percent  below poverty level; Estimate; Families': "% below poverty level",
              'Estimate; Total Number of Housing Units': 'Total number of housing units',
              'Estimate; Median value (dollars)': "Median home value",
              'Median income (dollars); All households': "Median income", 'Id2': "Geoid"}, axis=1)
#print("num tracts", len(merged))
      
all_demog_vars_rename.head()

#all_demog_vars_rename.to_csv("alldemog.csv")



Unnamed: 0,Geoid,Percent of population 25 years and over with Bachelor's degree,% below poverty level,% Single Family,% Owner Occupied,Percent NH White alone,Percent NH Black or African African alone,Percent NH Asian alone,Percent Hispanic,Total number of housing units,Median home value,Median income
0,6001400100,34.5,3.1,0.912439,0.89526,0.76915,0.041435,0.110724,0.007312,1439,1000000.0,186439.0
1,6001400200,37.6,0.0,0.665236,0.657428,0.767823,0.02264,0.069364,0.106936,932,909500.0,122647.0
2,6001400300,32.1,6.9,0.478758,0.405179,0.714142,0.099114,0.075544,0.085214,2801,718100.0,66638.0
3,6001400400,44.0,4.0,0.555446,0.433809,0.698082,0.092588,0.078143,0.062988,2020,790500.0,80391.0
4,6001400500,28.1,6.0,0.44438,0.437722,0.419526,0.346635,0.066241,0.115988,1735,572000.0,50658.0


#### Get only Bay Area counties

In [75]:
county_codes = ['6001', '6013', '6041', '6055', '6075', '6081', '6085', '6097', '6095']


In [95]:
all_demog_vars_bay_rename = all_demog_vars_rename[all_demog_vars_rename["Geoid"].apply(lambda x: str(x)[:4]).isin(county_codes)]
print("num tracts", len(all_demog_vars_bay_rename))
all_demog_vars_bay_rename.head()


num tracts 1405


Unnamed: 0,Geoid,Percent of population 25 years and over with Bachelor's degree,% below poverty level,% Single Family,% Owner Occupied,Percent NH White alone,Percent NH Black or African African alone,Percent NH Asian alone,Percent Hispanic,Total number of housing units,Median home value,Median income
0,6001400100,34.5,3.1,0.912439,0.89526,0.76915,0.041435,0.110724,0.007312,1439,1000000.0,186439.0
1,6001400200,37.6,0.0,0.665236,0.657428,0.767823,0.02264,0.069364,0.106936,932,909500.0,122647.0
2,6001400300,32.1,6.9,0.478758,0.405179,0.714142,0.099114,0.075544,0.085214,2801,718100.0,66638.0
3,6001400400,44.0,4.0,0.555446,0.433809,0.698082,0.092588,0.078143,0.062988,2020,790500.0,80391.0
4,6001400500,28.1,6.0,0.44438,0.437722,0.419526,0.346635,0.066241,0.115988,1735,572000.0,50658.0


#### Missing demog data exploration

In [143]:
# print("number of tracts in the bay area:", len(merged_bay))
# print("number of bay tracts with complete demographic info:", len(merged_bay.dropna())) 
# print("percent of tracts that have at least one nan entry: ", (len(merged_bay)-len(merged_bay.dropna()))/len(merged_bay)) 

# #by column
# for col in merged_bay.columns[1:]:
#     column = merged_bay[col]
#     num_missing = len(column) - len(column.dropna())
#     print(col, ", num missing:", num_missing)
    



In [68]:
# na_free = merged_bay.dropna()
# only_na = merged_bay[~merged_bay.index.isin(na_free.index)]

# #this is the dataframe of the census tracts that have at least one null value in their demographic variables, 
# #merged with the ACS 2009 population estimates. There doesn't seem to be a connection between CTs with very 
# #small populations and those with missing values, though the largest problem seems to be coming from the % poverty level
# #column
# pop_df = pd.read_csv(demog_data_path+"ACS_09_5YR_B01003_totalpop.csv")

# na_ct_pop = pd.merge(only_na, pop_df, how='inner', right_on='Id2', left_on='Id2')
# na_ct_pop

#### Merge Demographic features with neighbors dataframe to contribute CRA eligibility information by tract and exclude CTs without oppositely coded neighbors

Universe: all of the CTs in the 9 county bay area that had loan activity have oppositely coded neighbors. The boolean_tracts_all_2009.csv file includes all the tracts that have oppositely coded neighbors and their CRA eligibility.



In [97]:
data_path = '/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/'
cra_elig_neighbors = pd.read_csv(data_path+ 'boolean_tracts_all_2009.csv')
#cra_elig_neighbors.to_csv('neighbors.csv')
len(cra_elig_neighbors) #801 CTs in the bay with loan activity and oppositely coded neighbors


801

In [102]:
len(all_demog_vars_bay_rename)

1405

In [101]:
len(all_demog_vars_bay_rename[~(all_demog_vars_bay_rename['Geoid'].isin(cra_elig_neighbors["Geoid"]))]) #616 tracts in merged_bay that are not in neighbors, which makes sense
#len(cra_elig_neighbors[~(cra_elig_neighbors['Geoid'].isin(all_demog_vars_bay_rename["Geoid"]))]) #0 tracts that are in neighbors but not in merged_bay, which makes sense




616

In [105]:
demog_with_cra = pd.merge(merged_bay, cra_elig_neighbors, how='right', left_on='Geoid', right_on='Geoid')
demog_with_cra.head()
#i think that this is supposed to be a right merge because the demographics dataframe has all the census tracts, 
#while the boolean neighbors one only has the census tracts in the bay area that have neighbors
# demog_with_cra = demog_with_cra.drop(["Tract", "Year"], axis=1)
print(len(demog_with_cra)) #used to be 1228, now 801
demog_with_cra.head()



801


Unnamed: 0,Geoid,Percent of population 25 years and over with Bachelor's degree,% below poverty level,% Single Family,% Owner Occupied,Percent NH White alone,Percent NH Black or African African alone,Percent NH Asian alone,Percent Hispanic,Total number of housing units,Median home value,Median income,Year_x,Tract_y,type,County,CRA Eligible
0,6001400100,34.5,3.1,0.912439,0.89526,0.76915,0.041435,0.110724,0.007312,1439,1000000.0,186439.0,2009,4001.0,high,Alameda County,not eligible
1,6001400300,32.1,6.9,0.478758,0.405179,0.714142,0.099114,0.075544,0.085214,2801,718100.0,66638.0,2009,4003.0,mid,Alameda County,not eligible
2,6001400400,44.0,4.0,0.555446,0.433809,0.698082,0.092588,0.078143,0.062988,2020,790500.0,80391.0,2009,4004.0,mid,Alameda County,not eligible
3,6001400500,28.1,6.0,0.44438,0.437722,0.419526,0.346635,0.066241,0.115988,1735,572000.0,50658.0,2009,4005.0,mod,Alameda County,eligible
4,6001400600,51.9,26.4,0.632653,0.470756,0.338305,0.47315,0.03222,0.059666,784,586700.0,39802.0,2009,4006.0,mod,Alameda County,eligible


#### Get total number and percentages of loans data from HMDA files
This is currently for 2009, but next step is to do it for the average of the recovery period 2013-2017


Universe: all of the CTs in the 9 county bay area that had loan activity during 2009

In [103]:
filepath = '/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/parsed_data/'
Alameda = pd.read_csv(filepath+'Alameda_2009_parsed.csv')
Alameda["Geoid"] = Alameda["Tract"].apply(lambda x: int(x*100 + 6001000000))

ContraCosta = pd.read_csv(filepath+'ContraCosta_2009_parsed.csv')
ContraCosta["Geoid"] = ContraCosta["Tract"].apply(lambda x: int(x*100 + 6013000000))

Napa = pd.read_csv(filepath+'Napa_2009_parsed.csv')
Napa["Geoid"] = Napa["Tract"].apply(lambda x: int(x*100 + 6055000000))

Marin = pd.read_csv(filepath+'Marin_2009_parsed.csv')
Marin["Geoid"] = Marin["Tract"].apply(lambda x: int(x*100 + 6041000000))

SanMateo = pd.read_csv(filepath+'SanMateo_2009_parsed.csv')
SanMateo["Geoid"] = SanMateo["Tract"].apply(lambda x: int(x*100 + 6081000000))

SanFrancisco = pd.read_csv(filepath+'SanFrancisco_2009_parsed.csv')
SanFrancisco["Geoid"] = SanFrancisco["Tract"].apply(lambda x: int(x*100 + 6075000000))

Solano = pd.read_csv(filepath+'Solano_2009_parsed.csv')
Solano["Geoid"] = Solano["Tract"].apply(lambda x: int(x*100 + 6095000000))

SantaClara = pd.read_csv(filepath+'SantaClara_2009_parsed.csv')
SantaClara["Geoid"] = SantaClara["Tract"].apply(lambda x: int(x*100 + 6085000000))

Sonoma = pd.read_csv(filepath+'Sonoma_2009_parsed.csv')
Sonoma["Geoid"] = Sonoma["Tract"].apply(lambda x: int(x*100 + 6097000000))

counties = [Alameda, ContraCosta, Napa, Marin, SanMateo, SanFrancisco,Solano,SantaClara, Sonoma]

bay_counties_loans = pd.concat(counties).reset_index(drop=True)
bay_counties_loans["Total Loans"] = bay_counties_loans['# HI borrower, LI tract'] + bay_counties_loans['# HI borrower, HI tract']+ bay_counties_loans['# LI borrower, LI tract']
+ bay_counties_loans['# LI borrower, HI tract']
bay_counties_loans = bay_counties_loans.drop(["Tract", "Year"], axis=1)


#### Merge Loans Data with cra_demog features

In [106]:
allvars_2009 = pd.merge(demog_with_cra, bay_counties_loans, how="left", left_on='Geoid', right_on='Geoid').drop(['CRA Eligible_y', "Geoid"], axis=1)
allvars_2009['CRA'] = allvars_2009['CRA Eligible_x'].apply(lambda x: 1 if x == 'eligible' else 0)
allvars_2009.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/regression/all_vars2009.csv")
print(allvars_2009.columns)
allvars_2009.head()

Index(['Percent of population 25 years and over with Bachelor's degree',
       '% below poverty level', '% Single Family', '% Owner Occupied',
       'Percent NH White alone', 'Percent NH Black or African African alone',
       'Percent NH Asian alone', 'Percent Hispanic',
       'Total number of housing units', 'Median home value', 'Median income',
       'Year_x', 'Tract_y', 'type_x', 'County_x', 'CRA Eligible_x', 'County_y',
       'type_y', '% HI borrower, LI tract', '# HI borrower, LI tract',
       '% HI borrower, HI tract', '# HI borrower, HI tract',
       '% LI borrower, LI tract', '# LI borrower, LI tract',
       '%LI borrower, HI tract', '# LI borrower, HI tract', 'Total Loans',
       'CRA'],
      dtype='object')


Unnamed: 0,Percent of population 25 years and over with Bachelor's degree,% below poverty level,% Single Family,% Owner Occupied,Percent NH White alone,Percent NH Black or African African alone,Percent NH Asian alone,Percent Hispanic,Total number of housing units,Median home value,...,"% HI borrower, LI tract","# HI borrower, LI tract","% HI borrower, HI tract","# HI borrower, HI tract","% LI borrower, LI tract","# LI borrower, LI tract","%LI borrower, HI tract","# LI borrower, HI tract",Total Loans,CRA
0,34.5,3.1,0.912439,0.89526,0.76915,0.041435,0.110724,0.007312,1439,1000000.0,...,0.0,0,0.981818,54,0.0,0,0.018182,1,54,0
1,32.1,6.9,0.478758,0.405179,0.714142,0.099114,0.075544,0.085214,2801,718100.0,...,0.0,0,0.90625,58,0.0,0,0.09375,6,58,0
2,44.0,4.0,0.555446,0.433809,0.698082,0.092588,0.078143,0.062988,2020,790500.0,...,0.0,0,0.979167,47,0.0,0,0.020833,1,47,0
3,28.1,6.0,0.44438,0.437722,0.419526,0.346635,0.066241,0.115988,1735,572000.0,...,0.877193,50,0.0,0,0.122807,7,0.0,0,57,1
4,51.9,26.4,0.632653,0.470756,0.338305,0.47315,0.03222,0.059666,784,586700.0,...,0.870968,27,0.0,0,0.129032,4,0.0,0,31,1


#### Regression Data Prep with  CRA variable

In [151]:
allvars_2009.columns
noinc = ['% HI borrower, LI tract',
       '# HI borrower, LI tract', '% HI borrower, HI tract',
       '# HI borrower, HI tract', '% LI borrower, LI tract',
       '# LI borrower, LI tract', '%LI borrower, HI tract',
       '# LI borrower, HI tract', 'Geoid', 'Total Loans', "CRA Eligible_x", "type", "County_x", "County_y", 
         "type_x", "type_y", "Id2", "Year_x", "Tract_y"]

X_vars = allvars_2009.loc[:, ~allvars_2009.columns.isin(noinc)]
y = allvars_2009["Total Loans"]
print(len(y))
print(len(X_vars))
X_vars.head()


801
801


Unnamed: 0,Percent of population 25 years and over with Bachelor's degree,% below poverty level,% Single Family,% Owner Occupied,Percent NH White alone,Percent NH Black or African African alone,Percent NH Asian alone,Percent Hispanic,Total number of housing units,Median home value,Median income,CRA
0,34.5,3.1,0.912439,0.89526,0.76915,0.041435,0.110724,0.007312,1439,1000000.0,186439.0,0
1,32.1,6.9,0.478758,0.405179,0.714142,0.099114,0.075544,0.085214,2801,718100.0,66638.0,0
2,44.0,4.0,0.555446,0.433809,0.698082,0.092588,0.078143,0.062988,2020,790500.0,80391.0,0
3,28.1,6.0,0.44438,0.437722,0.419526,0.346635,0.066241,0.115988,1735,572000.0,50658.0,1
4,51.9,26.4,0.632653,0.470756,0.338305,0.47315,0.03222,0.059666,784,586700.0,39802.0,1


#### Regression Data Prep- not including cra variable this time

In [128]:
allvars_2009.columns
noinc = ['% HI borrower, LI tract',
       '# HI borrower, LI tract', '% HI borrower, HI tract',
       '# HI borrower, HI tract', '% LI borrower, LI tract',
       '# LI borrower, LI tract', '%LI borrower, HI tract',
       '# LI borrower, HI tract', 'Geoid', 'Total Loans', "CRA Eligible_x", "type", "County_x", "County_y", 
         "type_x", "type_y", "Id2", "Year_x", "Tract_y", "CRA"]

X_vars = allvars_2009.loc[:, ~allvars_2009.columns.isin(noinc)]
y = allvars_2009["Total Loans"]
print(len(y))
print(len(X_vars))
X_vars.head()

801
801


Unnamed: 0,Percent of population 25 years and over with Bachelor's degree,% below poverty level,% Single Family,% Owner Occupied,Percent NH White alone,Percent NH Black or African African alone,Percent NH Asian alone,Percent Hispanic,Total number of housing units,Median home value,Median income
0,34.5,3.1,0.912439,0.89526,0.76915,0.041435,0.110724,0.007312,1439,1000000.0,186439.0
1,32.1,6.9,0.478758,0.405179,0.714142,0.099114,0.075544,0.085214,2801,718100.0,66638.0
2,44.0,4.0,0.555446,0.433809,0.698082,0.092588,0.078143,0.062988,2020,790500.0,80391.0
3,28.1,6.0,0.44438,0.437722,0.419526,0.346635,0.066241,0.115988,1735,572000.0,50658.0
4,51.9,26.4,0.632653,0.470756,0.338305,0.47315,0.03222,0.059666,784,586700.0,39802.0


#### Some of the data is Nan, I dealt with this is 3 ways, discuss the best strategy for this with Carolina

Strategy 1: drop all the Nans, lose ~1.5% of the data

In [152]:
#need to concatenate x with y, dropna and then unconcatenate in order to do this correctly
xydf = X_vars.copy()
xydf["y"] = y
dropped = xydf.dropna()

X_drop = dropped.loc[:, dropped.columns != 'y']
y_drop = dropped['y']

print("num tracts", len(X_drop))
print("y", len(y_drop))

print("before drop", len(xydf))
print("pct lost", (len(xydf) - len(X_drop))/len(xydf))
X_drop.head()

num tracts 790
y 790
before drop 801
pct lost 0.01373283395755306


Unnamed: 0,Percent of population 25 years and over with Bachelor's degree,% below poverty level,% Single Family,% Owner Occupied,Percent NH White alone,Percent NH Black or African African alone,Percent NH Asian alone,Percent Hispanic,Total number of housing units,Median home value,Median income,CRA
0,34.5,3.1,0.912439,0.89526,0.76915,0.041435,0.110724,0.007312,1439,1000000.0,186439.0,0
1,32.1,6.9,0.478758,0.405179,0.714142,0.099114,0.075544,0.085214,2801,718100.0,66638.0,0
2,44.0,4.0,0.555446,0.433809,0.698082,0.092588,0.078143,0.062988,2020,790500.0,80391.0,0
3,28.1,6.0,0.44438,0.437722,0.419526,0.346635,0.066241,0.115988,1735,572000.0,50658.0,1
4,51.9,26.4,0.632653,0.470756,0.338305,0.47315,0.03222,0.059666,784,586700.0,39802.0,1


#### Data Normalization

In [31]:
# def normalize_columns(data, mean_df, std_df):
#     '''
#     Input:
#       data (data frame): contains only numeric columns
#     Output:
#       data frame, the same data, except each column is standardized 
#       to have 0-mean and unit variance
#     '''
#     normalized_data=(data-mean_df.mean())/std_df.std()

#     return normalized_data

In [32]:
# def unnormalize_columns(data, mean_df, std_df):
#     unnormalized_data=(data*std_df.std())+mean_df.mean()

#     return unnormalized_data

In [34]:
#normalize the data before perfoming the regression
# normal_x_drop = normalize_columns(X_drop, X_drop, X_drop)
# normal_y_drop = normalize_columns(y_drop, y_drop, y_drop)


#### Poisson Model(s)

WithCRA

In [153]:
import statsmodels.formula.api as smf

data = X_drop.copy()
data = sm.add_constant(data, prepend=False)
data.rename({'Percent NH Black or African African alone': "NHBLK", 'Percent NH White alone': 'NHWHITE', 
             'Percent NH Asian alone': 'NHASIAN', 'Percent Hispanic': 'HISPANIC', 
             'Percent of population 25 years and over with Bachelor\'s degree': 'BACHELORS', 
             '% below poverty level': 'POVERTYRT',
             '% Single Family': 'PCTSINGLE', '% Owner Occupied': 'PCTOWNER',
             'Total number of housing units': 'UNITS',
             'Median home value': 'MEDVAL', 'Median income': 'MEDINC'}, axis=1, inplace=True)

data['y'] = y_drop.copy()
data.head()

#formula = 'y ~ ' + ' + '.join(features)
formula = 'y ~ NHBLK + BACHELORS + PCTSINGLE + PCTOWNER + POVERTYRT + NHWHITE + NHBLK + NHASIAN + UNITS + HISPANIC + MEDVAL + MEDINC + const' 


poisson_model = smf.glm(formula=formula, data=data, family=sm.families.Poisson()).fit()
poisson_model.summary()



0,1,2,3
Dep. Variable:,y,No. Observations:,790
Model:,GLM,Df Residuals:,778
Model Family:,Poisson,Df Model:,11
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-12434.
Date:,"Wed, 23 Jan 2019",Deviance:,20134.
Time:,13:53:37,Pearson chi2:,2.53e+04
No. Iterations:,6,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.7259,0.072,23.946,0.000,1.585,1.867
NHBLK,0.0947,0.157,0.604,0.546,-0.213,0.402
BACHELORS,0.0034,0.001,4.363,0.000,0.002,0.005
PCTSINGLE,0.1025,0.032,3.250,0.001,0.041,0.164
PCTOWNER,0.4824,0.044,10.972,0.000,0.396,0.569
POVERTYRT,0.0008,0.001,1.017,0.309,-0.001,0.002
NHWHITE,-0.6833,0.151,-4.520,0.000,-0.980,-0.387
NHASIAN,-0.5710,0.151,-3.792,0.000,-0.866,-0.276
UNITS,0.0003,1.77e-06,155.718,0.000,0.000,0.000


In [155]:
with open('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/results/poisson_regression_results_origmodel_NEW.html', 'w') as outfile:
    outfile.write(poisson_model.summary().as_html())
      

Without CRA

In [135]:
import statsmodels.formula.api as smf

data_noCRA = X_drop.copy()
data_noCRA = sm.add_constant(data_noCRA, prepend=False)
data_noCRA.rename({'Percent NH Black or African African alone': "NHBLK", 'Percent NH White alone': 'NHWHITE', 
             'Percent NH Asian alone': 'NHASIAN', 'Percent Hispanic': 'HISPANIC', 
             'Percent of population 25 years and over with Bachelor\'s degree': 'BACHELORS', 
             '% below poverty level': 'POVERTYRT',
             '% Single Family': 'PCTSINGLE', '% Owner Occupied': 'PCTOWNER',
             'Total number of housing units': 'UNITS',
             'Median home value': 'MEDVAL', 'Median income': 'MEDINC'}, axis=1, inplace=True)
data_noCRA.head()

data_noCRA['y'] = y_drop.copy()


formula = 'y ~ NHBLK + BACHELORS + PCTSINGLE + PCTOWNER + POVERTYRT + NHWHITE + NHBLK + NHASIAN + UNITS + NHASIAN + HISPANIC + MEDVAL + MEDINC + const' 


poisson_model_noCRA = smf.glm(formula=formula, data=data_noCRA, family=sm.families.Poisson()).fit()
poisson_model_noCRA.summary()

0,1,2,3
Dep. Variable:,y,No. Observations:,790
Model:,GLM,Df Residuals:,778
Model Family:,Poisson,Df Model:,11
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-12434.
Date:,"Wed, 23 Jan 2019",Deviance:,20134.
Time:,13:48:35,Pearson chi2:,2.53e+04
No. Iterations:,6,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.7259,0.072,23.946,0.000,1.585,1.867
NHBLK,0.0947,0.157,0.604,0.546,-0.213,0.402
BACHELORS,0.0034,0.001,4.363,0.000,0.002,0.005
PCTSINGLE,0.1025,0.032,3.250,0.001,0.041,0.164
PCTOWNER,0.4824,0.044,10.972,0.000,0.396,0.569
POVERTYRT,0.0008,0.001,1.017,0.309,-0.001,0.002
NHWHITE,-0.6833,0.151,-4.520,0.000,-0.980,-0.387
NHASIAN,-0.5710,0.151,-3.792,0.000,-0.866,-0.276
UNITS,0.0003,1.77e-06,155.718,0.000,0.000,0.000


In [154]:
with open('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/results/poisson_regression_results_nocra_NEW.html', 'w') as outfile:
    outfile.write(poisson_model_noCRA.summary().as_html())
      
  

2009 Poisson Model with only %NH white for race vars

In [62]:
import statsmodels.formula.api as smf

data_nhw = X_drop.copy()
data_nhw = sm.add_constant(data_nhw, prepend=False)
data_nhw.drop(['Percent NH Black or African African alone', 'Percent NH Asian alone', 'Percent Hispanic'], axis=1, inplace=True)
data_nhw.rename({'Percent NH White alone': 'NHWHITE', 
             'Percent of population 25 years and over with Bachelor\'s degree': 'BACHELORS', 
             'All families - Percent  below poverty level; Estimate; Families': 'POVERTYRT',
             '% Single Family': 'PCTSINGLE', '% Owner Occupied': 'PCTOWNER',
             'Estimate; Total Number of Housing Units': 'UNITS',
             'Estimate; Median value (dollars)': 'MEDVAL', 'Median income (dollars); All households': 'MEDINC',            
            }, axis=1, inplace=True)


data_nhw['y'] = y_drop.copy()

#formula = 'y ~ ' + ' + '.join(features)
formula = 'y ~ BACHELORS + PCTSINGLE + PCTOWNER + POVERTYRT + NHWHITE + UNITS + MEDVAL + MEDINC + const' 


poisson_model_nhw = smf.glm(formula=formula, data=data_nhw, family=sm.families.Poisson()).fit()
poisson_model_nhw.summary()



0,1,2,3
Dep. Variable:,y,No. Observations:,1210
Model:,GLM,Df Residuals:,1201
Model Family:,Poisson,Df Model:,8
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-24180.
Date:,"Mon, 21 Jan 2019",Deviance:,40977.
Time:,17:30:46,Pearson chi2:,5.33e+04
No. Iterations:,6,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.8756,0.010,193.502,0.000,1.857,1.895
BACHELORS,-0.0058,0.001,-11.406,0.000,-0.007,-0.005
PCTSINGLE,0.3903,0.024,16.147,0.000,0.343,0.438
PCTOWNER,0.3654,0.033,11.140,0.000,0.301,0.430
POVERTYRT,0.0042,0.001,6.885,0.000,0.003,0.005
NHWHITE,-0.5120,0.015,-33.497,0.000,-0.542,-0.482
UNITS,0.0003,1.15e-06,224.369,0.000,0.000,0.000
MEDVAL,-8.75e-07,2.53e-08,-34.651,0.000,-9.25e-07,-8.26e-07
MEDINC,7.924e-06,2.11e-07,37.559,0.000,7.51e-06,8.34e-06


In [63]:
with open('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/results/poisson_regression_results_nhw_only.html', 'w') as outfile:
    outfile.write(poisson_model_nhw.summary().as_html())
    
    

2009 Poisson Model with "majority minority" dummy variable

In [64]:
def dummy_var(row):
    dummy = 0
    if (row['Percent NH White alone'] >= 0.50):
        dummy = 1
    else:
        dummy = 0
    return dummy

In [68]:
import statsmodels.formula.api as smf
data_minority = X_drop.copy()
data_minority['MINORITY'] = data_minority.apply(lambda x: dummy_var(x), axis=1)

data_minority = sm.add_constant(data_minority, prepend=False)
data_minority.rename({'Percent NH Black or African African alone': "NHBLK", 'Percent NH White alone': 'NHWHITE', 
             'Percent NH Asian alone': 'NHASIAN', 'Percent Hispanic': 'HISPANIC', 
             'Percent of population 25 years and over with Bachelor\'s degree': 'BACHELORS', 
             'All families - Percent  below poverty level; Estimate; Families': 'POVERTYRT',
             '% Single Family': 'PCTSINGLE', '% Owner Occupied': 'PCTOWNER',
             'Estimate; Total Number of Housing Units': 'UNITS',
             'Estimate; Median value (dollars)': 'MEDVAL', 'Median income (dollars); All households': 'MEDINC',            
            }, axis=1, inplace=True)
data_minority

data_minority['y'] = y_drop.copy()

#formula = 'y ~ ' + ' + '.join(features)
formula = 'y ~ NHBLK + BACHELORS + PCTSINGLE + PCTOWNER + POVERTYRT + NHWHITE + NHBLK + NHASIAN + UNITS + NHASIAN + HISPANIC + MEDVAL + MEDINC + MINORITY + CRA + const' 


poisson_model_minority = smf.glm(formula=formula, data=data_minority, family=sm.families.Poisson()).fit()
poisson_model_minority.summary()



0,1,2,3
Dep. Variable:,y,No. Observations:,1210
Model:,GLM,Df Residuals:,1196
Model Family:,Poisson,Df Model:,13
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-21097.
Date:,"Mon, 21 Jan 2019",Deviance:,34812.
Time:,17:32:21,Pearson chi2:,4.22e+04
No. Iterations:,6,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.4981,0.056,26.895,0.000,1.389,1.607
NHBLK,-0.1111,0.120,-0.925,0.355,-0.346,0.124
BACHELORS,0.0076,0.001,12.550,0.000,0.006,0.009
PCTSINGLE,0.4590,0.025,18.679,0.000,0.411,0.507
PCTOWNER,0.7009,0.033,21.247,0.000,0.636,0.766
POVERTYRT,-0.0047,0.001,-7.237,0.000,-0.006,-0.003
NHWHITE,-0.1298,0.118,-1.104,0.269,-0.360,0.101
NHASIAN,-0.5357,0.114,-4.693,0.000,-0.759,-0.312
UNITS,0.0003,1.19e-06,223.554,0.000,0.000,0.000


In [69]:
with open('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/results/poisson_regression_results_minority.html', 'w') as outfile:
    outfile.write(poisson_model_minority.summary().as_html())
    
    
    