In [1]:
import pandas as pd
import geopandas as gp
import matplotlib.pyplot as plt
import os
import json
import time
import numpy as np


In [2]:
bay_county_names = ["Alameda", "ContraCosta", "Sonoma", "Solano", "SanMateo", "SantaClara", "SanFrancisco", "Marin","Napa"] 
analysis_years = ["2008", "2009","2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017"]

#### Concatenate all CTs in 9 counties for each year and writes the resulting data to csv

I am trying to locate pairs of census tracts that are geographically proximate and demographically similar except for their CRA eligibility threshold as the first step towards establishing a regression discontinuity design study. 

In [3]:
#concatenates all the census tracts in a given year from all the counties in the Bay Area. This is because when selectiing
#neighbors I do not want to eliminate census tracts that border each other but are separated by administrative 
#boundaries (ie: cities, counties), so I consider the congruous set of tracts for this analysis


for year in analysis_years:
    county_files = [f for f in os.listdir("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/CTs_geo_data_1/") 
                    if f[-3:] == "csv"
                    and f.split('_')[2] == (str(year) +".csv")]
    countydfs = []
    for file in county_files:
        df = pd.read_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/CTs_geo_data_1/"+file)
        countydfs.append(df)
        
    to_write = pd.concat(countydfs, axis=0) #concatenates all the dataframes from that year together
    #to_write.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/CTs_geo_data_1/all_bay_concat_"+year+".csv", index=False)


#### Building Geographic Neighbors Dictionary

Takes in a geodataframe which has the HMDA information geolocated to polygons, concatenated for all of the 9 counties. Builds a dictionary where the keys are each unique tract number and the values are an array of geographic neighbors 
of that tract, and the eligibility threshold does not match the tract in question. The first function "find_neighbors_all" finds all of the CTs that are geographic neighbors and are oppositely CRA coded, the second function "find_neighbors_midmod" finds neighbors only for those tracts that are middle or moderate income, by eliminating high (>120% AMI) and low (<50% AMI) tracts.

In [4]:
def find_neighbors_all(geodataframe):
    neighbors_dict = {}
    for df_row in geodataframe.iterrows(): #index, then row
        polygon = df_row[1]["geometry"]
        tract = (df_row[1]["ctidfp00"])

        eligibility = df_row[1]["cra_eligib"]
        neighbors_dict[tract] = list(geodataframe[geodataframe.apply(lambda row: row['geometry'].touches(polygon) and 
                row['cra_eligib'] != eligibility, axis=1)]["ctidfp00"].values)#list so that it is easier to write to json later
         
    return neighbors_dict

    
    


In [27]:
def find_neighbors_midmod(geodataframe):
    neighbors_dict = {}
    geodf_mid_mod = geodataframe[(geodataframe['type'] == "mod") | (geodataframe["type"] == "mid")]
    for df_row in geodf_mid_mod.iterrows(): #index, then row
        polygon = df_row[1]["geometry"]
        tract = (df_row[1]["ctidfp00"])

        eligibility = df_row[1]["cra_eligib"]
        neighbors_dict[tract] = list(geodf_mid_mod[geodf_mid_mod.apply(lambda row: row['geometry'].touches(polygon) and 
                row['cra_eligib'] != eligibility, axis=1)]["ctidfp00"].values)#list so that it is easier to write to json later
         
    return neighbors_dict

#### This takes a very long time to run, but since both the all tracts and mid mod dictionaries are saves as npy files it shouldn't have to be run repeatedly

In [29]:
start = time.time()
neighbors_years = {}
neighbors_years_midmod = {}


for year in analysis_years:
    #reads in a shapefile of all the CTs in the 9 counties of a given year
    geodf = gp.read_file('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/combined_shapefiles/all_bay_concat_'+year+'/all_bay_concat_'+year+'.shp')
    #print("finding all neighbors for "+ str(year))
    #applies the find neighbors function from above
    #neighbors_years[year] = find_neighbors_midmod(geodf) #for only mid mod tracts
    #neighbors_years[year] = find_neighbors_all(geodf) #adds the dictionary of neighbors for a given year to the master
    print("finding mid mod neighbors for "+ str(year))
    neighbors_years_midmod[year] = find_neighbors_midmod(geodf)
    #dictionary of neighbors for all of the years
    

#writes the dictionary to json, if needed
#     with open('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors_all_'+str(year)+'.json', 'w') as fp:
#         json.dump(neighbors, fp)
end =time.time()
elapsed = end-start

elapsed

finding mid mod neighbors for 2008
finding mid mod neighbors for 2009
finding mid mod neighbors for 2010
finding mid mod neighbors for 2011
finding mid mod neighbors for 2012
finding mid mod neighbors for 2013
finding mid mod neighbors for 2014
finding mid mod neighbors for 2015
finding mid mod neighbors for 2016
finding mid mod neighbors for 2017


328.1445622444153

#### Saves the neighbors dictionary as a numpy object, to be read in later as a dictionary


In [40]:
np.save('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/neighbors_all_allyears.npy', neighbors_years)

In [30]:
np.save('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/neighbors_midmod_allyears.npy', neighbors_years_midmod)

In [37]:
neighbors_years = np.load('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/neighbors_all_allyears.npy').item()
neighbors_years_midmod = np.load('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/neighbors_midmod_allyears.npy').item()

print("num mid mod tracts", len(neighbors_years_midmod['2009'].values())) #910 tracts in the midmod dict
print("num all tracts", len(neighbors_years['2009'].values())) # 1397 tracts in the all dict



num mid mod tracts 910
num all tracts 1397


#### Creates a dataframe of all the tracts with neighbors by year (2009 to start), indicating which are CRA eligible. Exports to csv called "boolean_tracts_year"


There are some issues with this code that I am still working on, but for now the regression is using the file "boolean_tracts_2009". I think this is right now? It looks like it's ok for the all but the midmod below is the same which doesnt make sense

In [39]:
ntracts = []
years = []
master = pd.read_csv('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/reference/tracts_type_master_1.csv')
boolean_tracts = pd.DataFrame()
for year in ['2009']:
#for year in analysis_years:
    for tract in neighbors_years[year]:
        if (len(neighbors_years[year][tract]) > 0): #if the tract has neighbors
            ntracts.append(tract) #add it to the list "ntracts"
            years.append(year) #and add the year to the years list
    cra_ref = master[master["Year"] == int(year)] #reference dataframe for just the year in question
            

boolean_tracts["Tract"] = ntracts #add the column 'Tract' to boolean_tracts of all of th tracts with neighbors
boolean_tracts["Year"] = years #add the column 'Year' to boolean_tracts of all of the years


#merge the boolean_tracts_all dataframe (all the tracts with neighbors), with their respective CRA coding
#based on the reference dataframe
boolean_tracts_withcra = pd.merge(boolean_tracts, cra_ref,  how='inner', left_on="Tract", right_on ='Geoid')
boolean_tracts_withcra = boolean_tracts_withcra[["Geoid", "Year_x", "Tract_y", "type", "County", "CRA Eligible"]]

boolean_tracts_withcra.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/boolean_tracts_all_2009.csv", index=False)

print("num all tracts that have at least one oppositely coded neighbor", len(boolean_tracts_withcra))


num all tracts that have at least one oppositely coded neighbor 801


In [42]:
ntracts = []
years = []
master = pd.read_csv('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/reference/tracts_type_master_1.csv')
boolean_tracts_midmod = pd.DataFrame()
for year in ['2009']:
#for year in analysis_years:
    for tract in neighbors_years_midmod[year]:
        if (len(neighbors_years_midmod[year][tract]) > 0): #if the tract has neighbors
            ntracts.append(tract) #add it to the list "ntracts"
            years.append(year) #and add the year to the years list
    cra_ref_midmod = master[master["Year"] == int(year)] #reference dataframe for just the year in question
            

boolean_tracts_midmod["Tract"] = ntracts #add the column 'Tract' to boolean_tracts of all of th tracts with neighbors
boolean_tracts_midmod["Year"] = years #add the column 'Year' to boolean_tracts of all of the years


#merge the boolean_tracts_all dataframe (all the tracts with neighbors), with their respective CRA coding
#based on the reference dataframe
boolean_tracts_midmod_withcra = pd.merge(boolean_tracts_midmod, cra_ref_midmod,  how='inner', left_on="Tract", right_on ='Geoid')
boolean_tracts_midmod_withcra.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/boolean_tracts_midmod_2009.csv", index=False)


print("num mid or mod tracts that have at least one oppositely coded neighbor", len(boolean_tracts_midmod))


num mid or mod tracts that have at least one oppositely coded neighbor 626
