In [1]:
import pandas as pd
import numpy as np
from openpyxl import load_workbook
import os
import matplotlib.pyplot as plt
import urllib.request
import os

In [2]:
#categorized census tracts into 4 categories based on income level
def tract_type(row):
    if (row["tract_to_msamd_income"] < 50):
        return "low"
    if (row["tract_to_msamd_income"] < 80):
        return "mod"
    if (row["tract_to_msamd_income"] <= 120):
        return "mid"
    if (row["tract_to_msamd_income"] > 120):
        return "high"
    else:
        return "not categorized"
    


In [3]:
#codes a tract as CRA eligible or ineligible based on its median income to AMI ratio
def tract_eligibility(row):
    if (row["tract_to_msamd_income"] < 80):
        return "eligible"
    else:
        return "not eligible"

In [4]:
#codes a loan borrower as CRA eligible or ineligible based on their median income to AMI ratio
def borrower_eligibility(row):
    if (row["applicant_income_000s"]*1000 < .8*row["hud_median_family_income"]):
        return "eligible"
    else:
        return "not eligible"


In [5]:
def add_year(df, year):
    df["Year"] = year
    

In [6]:
#calculates the proportion of loans in the dataset that go to each of the 4 person-place categories, returns an array of 
#4 proportions that correspond to each of the 4 categories
def people_vs_place(df):
    hb_lt = 0 
    hb_ht = 0
    lb_lt = 0
    lb_ht = 0
    for row in df.itertuples():
        cra_eligible_tract = row._48 
        cra_eligible_borrower = row._49

        if (cra_eligible_tract == "eligible" and cra_eligible_borrower == "eligible"):
            lb_lt +=1
            
        elif (cra_eligible_tract == "not eligible" and cra_eligible_borrower == "eligible"):
            lb_ht +=1
        elif (cra_eligible_tract == "eligible" and cra_eligible_borrower == "not eligible"):
            hb_lt +=1
        elif (cra_eligible_tract == "not eligible" and cra_eligible_borrower == "not eligible"):
            hb_ht +=1
        
    return [hb_lt/len(df), hb_lt, hb_ht/len(df), hb_ht, lb_lt/len(df), lb_lt, lb_ht/len(df), lb_ht]
    
        

In [7]:
bay_county_names = ["Alameda", "ContraCosta", "Sonoma", "Solano", "SanMateo", "SantaClara", "SanFrancisco", "Marin","Napa"] 
bay_county_codes = ['001', '013', '041', '055', '075', '081', '085', '097', '095']

analysis_years = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017"]



In [8]:
def parse_data(df, year, county):
#this uses the functions above combined to "parse" raw HMDA data, including to generate proportions of 
#loans to combinations of HI/LI borrowers and tracts and append other columns described above. It also adds a column 
#called geoid which is the full 10 digit FIPS for the census tract, which is useful for later processing.
    df["CRA eligible tract"] = df.apply(lambda item: tract_eligibility(item), axis=1)
    df["CRA eligible borrower"] = df.apply(lambda item: borrower_eligibility(item), axis=1)
    df["type"] = df.apply(lambda item: tract_type(item), axis=1)
    df["Year"] = year
    
    df_lol = []
    for tract, tract_df in df.groupby("census_tract_number"):
        row_list = [tract, 
                    tract_df["Year"].iloc[0], tract_df["county_name"].iloc[0],
                    tract_df["type"].iloc[0], tract_df["CRA eligible tract"].iloc[0]]
        row_list.extend(people_vs_place(tract_df))
        df_lol.append(row_list)
    data = pd.DataFrame(df_lol, columns=["Tract","Year", "County", "type", "CRA Eligible",'% HI borrower, LI tract', '# HI borrower, LI tract', 
                                         '% HI borrower, HI tract','# HI borrower, HI tract', "% LI borrower, LI tract", "# LI borrower, LI tract",
                                         "%LI borrower, HI tract", "# LI borrower, HI tract"])
    return data
        
        

In [9]:


#It takes several minutes to run since it goes through all of the years and counties of data. The end result is a
#csv file for each year and county with a line for each tract and information about the type of tract, the CRA eligibiliy
#and the number and percentage of each borrower-tract combination
filenames = []
tables_dict = {}
for county, code in zip(bay_county_names, bay_county_codes): 
    for year in analysis_years:
        filename = "/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/raw_hmda/"+ county + "_"+ str(year)+ ".csv"
        filenames.append(filename)
        df = pd.read_csv(filename)
        new_frame = parse_data(df, year, county)
        new_frame["Geoid"] = '6'+code+((new_frame["Tract"]*100).astype(int)).astype(str)
        

        tables_dict[county+ "_"+ str(year)] = new_frame
        print("writing..", county, year, " to csv")
        new_frame.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/parsed_data/"+county+"_"+str(year)+"_parsed.csv", index=False)
              

  interactivity=interactivity, compiler=compiler, result=result)


writing.. Alameda 2008  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. Alameda 2009  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. Alameda 2010  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. Alameda 2011  to csv
writing.. Alameda 2012  to csv
writing.. Alameda 2013  to csv
writing.. Alameda 2014  to csv
writing.. Alameda 2015  to csv
writing.. Alameda 2016  to csv
writing.. Alameda 2017  to csv
writing.. ContraCosta 2008  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. ContraCosta 2009  to csv
writing.. ContraCosta 2010  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. ContraCosta 2011  to csv
writing.. ContraCosta 2012  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. ContraCosta 2013  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. ContraCosta 2014  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. ContraCosta 2015  to csv
writing.. ContraCosta 2016  to csv
writing.. ContraCosta 2017  to csv
writing.. Sonoma 2008  to csv
writing.. Sonoma 2009  to csv
writing.. Sonoma 2010  to csv
writing.. Sonoma 2011  to csv
writing.. Sonoma 2012  to csv
writing.. Sonoma 2013  to csv
writing.. Sonoma 2014  to csv
writing.. Sonoma 2015  to csv
writing.. Sonoma 2016  to csv
writing.. Sonoma 2017  to csv
writing.. Solano 2008  to csv
writing.. Solano 2009  to csv
writing.. Solano 2010  to csv
writing.. Solano 2011  to csv
writing.. Solano 2012  to csv
writing.. Solano 2013  to csv
writing.. Solano 2014  to csv
writing.. Solano 2015  to csv
writing.. Solano 2016  to csv
writing.. Solano 2017  to csv
writing.. SanMateo 2008  to csv
writing.. SanMateo 2009  to csv
writing.. SanMateo 2010  to csv
writing.. SanMateo 2011  to csv
writing.. SanMateo 2012  to csv
writing.. SanMateo 2013  to csv
writing.. SanMateo 2014  to csv
writing.. SanMateo 2015  to csv
writing.. SanMateo 2016  to csv
writing

  interactivity=interactivity, compiler=compiler, result=result)


writing.. SantaClara 2009  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. SantaClara 2010  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. SantaClara 2011  to csv
writing.. SantaClara 2012  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. SantaClara 2013  to csv
writing.. SantaClara 2014  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. SantaClara 2015  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. SantaClara 2016  to csv


  interactivity=interactivity, compiler=compiler, result=result)


writing.. SantaClara 2017  to csv
writing.. SanFrancisco 2008  to csv
writing.. SanFrancisco 2009  to csv
writing.. SanFrancisco 2010  to csv
writing.. SanFrancisco 2011  to csv
writing.. SanFrancisco 2012  to csv
writing.. SanFrancisco 2013  to csv
writing.. SanFrancisco 2014  to csv
writing.. SanFrancisco 2015  to csv
writing.. SanFrancisco 2016  to csv
writing.. SanFrancisco 2017  to csv
writing.. Marin 2008  to csv
writing.. Marin 2009  to csv
writing.. Marin 2010  to csv
writing.. Marin 2011  to csv
writing.. Marin 2012  to csv
writing.. Marin 2013  to csv
writing.. Marin 2014  to csv
writing.. Marin 2015  to csv
writing.. Marin 2016  to csv
writing.. Marin 2017  to csv
writing.. Napa 2008  to csv
writing.. Napa 2009  to csv
writing.. Napa 2010  to csv
writing.. Napa 2011  to csv
writing.. Napa 2012  to csv
writing.. Napa 2013  to csv
writing.. Napa 2014  to csv
writing.. Napa 2015  to csv
writing.. Napa 2016  to csv
writing.. Napa 2017  to csv


In [10]:
new_frame.dtypes

Tract                      float64
Year                        object
County                      object
type                        object
CRA Eligible                object
% HI borrower, LI tract    float64
# HI borrower, LI tract      int64
% HI borrower, HI tract    float64
# HI borrower, HI tract      int64
% LI borrower, LI tract    float64
# LI borrower, LI tract      int64
%LI borrower, HI tract     float64
# LI borrower, HI tract      int64
Geoid                       object
dtype: object

### Create master CT reference by Type

In [12]:
#create a master reference dataframe that maps all census tracts in the bay to their type
path = "/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/parsed_data/"
files = [f for f in os.listdir(path) if f[-3:] == "csv"]
dfs = []
for file in files:
    #df = pd.read_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/parsed_data/"+file) 
    #dflite = df[["Tract", "Geoid", "Tract type", "Year", "County", "CRA Eligible"]]
    df = pd.read_csv(path+file)
    lite = df[["Geoid", "Tract", "Year", "type", "County", "CRA Eligible"]]
    
    dfs.append(lite)
master = pd.concat(dfs, axis=0)
master

    



Unnamed: 0,Geoid,Tract,Year,type,County,CRA Eligible
0,6013301000,3010.00,2015,mid,Contra Costa County,not eligible
1,6013302005,3020.05,2015,mid,Contra Costa County,not eligible
2,6013302006,3020.06,2015,mid,Contra Costa County,not eligible
3,6013302007,3020.07,2015,mid,Contra Costa County,not eligible
4,6013302008,3020.08,2015,mid,Contra Costa County,not eligible
5,6013302009,3020.09,2015,mid,Contra Costa County,not eligible
6,6013302010,3020.10,2015,mid,Contra Costa County,not eligible
7,6013303102,3031.02,2015,mid,Contra Costa County,not eligible
8,6013303103,3031.03,2015,mid,Contra Costa County,not eligible
9,6013303201,3032.01,2015,mid,Contra Costa County,not eligible


In [13]:
master.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/reference/tracts_type_master.csv", index=0) #exports it to csv