In [62]:
import pandas as pd
import geopandas as gp
import matplotlib.pyplot as plt
import os
import json
import time
import numpy as np


In [37]:
bay_county_names = ["Alameda", "ContraCosta", "Sonoma", "Solano", "SanMateo", "SantaClara", "SanFrancisco", "Marin","Napa"] 
analysis_years = ["2008", "2009","2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017"]

In [7]:
#concatenates all the census tracts in a given year from all the counties in the Bay Area. This is because when selectiing
#neighbors I do not want to eliminate census tracts that border each other but are separated by administrative 
#boundaries (ie: cities, counties), so I consider the congruous set of tracts for this analysis

# I am trying to locate pairs of census tracts that are geographically proximate and demographically similar except
# for their CRA eligibility threshold as the first step towards establishing a regression discontinuity design study.
# As a preliminary measure, I eliminate census tracts categorized as "high" and "low" since I want to look for neighboring
#tracts that are as close to the CRA threshold as possible in order to compare otherwise similar geographies.
for year in analysis_years:
    county_files = [f for f in os.listdir("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/county_geo_data/") 
                    if f[-3:] == "csv"
                    and f.split('_')[2] == (str(year) +".csv")]
    countydfs = []
    for file in county_files:
        df = pd.read_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/county_geo_data/"+file)
        countydfs.append(df)
    to_write = pd.concat(countydfs, axis=0) #concatenates all the dataframes from that year together
    #to_write.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/county_geo_data/" + "all_bay_concat_"+year+".csv")


In [1]:
path = '/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/county_geo_data/'
dfs = []
for year in analysis_years:
    dfs.append(pd.read_csv(path+'all_bay_concat_'+ year+'.csv'))

print(dfs)




In [52]:
#Takes in a geodataframe which has the HMDA information geolocated to polygons, concatenated for all of the 
#9 counties. Builds a dictionary where the keys are each unique tract number and the values are an array of geographic neighbors 
#of that tract, and the eligibility threshold does not match the tract in question. Currently filters out high and low income
#tracts to look at only the center of the distribution, though this might change.



def find_neighbors(geodataframe):
    neighbors_dict = {}
    #geodf_mid_mod = geodataframe[(geodataframe['type'] == "mod") | (geodataframe["type"] == "mid")]
    for df_row in geodataframe.iterrows(): #index, then row
        polygon = df_row[1]["geometry"]
        tract = (df_row[1]["ctidfp00"])

        eligibility = df_row[1]["cra_eligib"]
        neighbors_dict[tract] = list(geodataframe[geodataframe.apply(lambda row: row['geometry'].touches(polygon) and 
                                        row['cra_eligib'] != eligibility, axis=1)]["ctidfp00"].values)
        #list so that it is easier to write to json later
         
    return neighbors_dict

    
    


In [57]:
start = time.time()
neighbors_years = {}

#for year in analysis_years:
for year in ["2009"]:

    geodf = gp.read_file('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/combined_shapefiles/all_bay_concat_'+year+'/all_bay_concat_'+year+'.shp')
    print("finding neighbors for "+ str(year))
    neighbors = find_neighbors(geodf) #save this in json
    neighbors_years[year] = find_neighbors(geodf)
    #print([type(x) for x in neighbors.keys()])
    #print([type(x[0]) for x in neighbors.values()])
    
#     print("writing", str(year), "to json")
#     with open('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors_all_'+str(year)+'.json', 'w') as fp:
#         json.dump(neighbors, fp)
end =time.time()
elapsed = end-start

elapsed

finding neighbors for 2009


157.83533120155334

In [65]:
#type(neighbors_years['2009'][6001433700][0])
np.save('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/neighbors_2009.npy', neighbors_years)

In [83]:
ntracts = []
years = []
master = pd.read_csv('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/tracts_type_master.csv')
boolean_tracts = pd.DataFrame()
#for year in neighbors_years:
for year in ['2009']:
    for tract in neighbors_years[year]:
        #print(tract)
        if (len(neighbors_years[year][tract]) > 0):
            ntracts.append(tract)
            years.append(year)

            

boolean_tracts["Tract"] = ntracts #tracts with neighbors, seems to be 805 which makes sense

#print(len(ntracts.unique()))
boolean_tracts["Year"] = years
cra_ref = pd.read_csv('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/place_parsed/allbay_place_geolocated_2009_.csv')[['CTIDFP00', 'CRA Eligible']]
boolean_tracts_all = pd.merge(boolean_tracts, cra_ref,  how='left', left_on="Tract", right_on ='CTIDFP00').drop("CTIDFP00", axis=1)
#
#boolean_tracts_all = pd.merge(boolean_tracts, master,  how='left', left_on=['Tract','Year'], right_on = ['census_tract_number','as_of_year'])
# boolean_tracts_all[["Tract", "Year", "Tract Type"]]
# boolean_tracts_all["CRA"] = boolean_tracts_all["Tract Type"].isin(['low', 'mod']) # == 'low' | boolean_tracts_all["Tract Type"] == 'mod'
# # boolean_tracts_all["Tract Type"] == 'low' or boolean_tracts_all["Tract Type"] == 'mod'
# print(len(boolean_tracts_all))
boolean_tracts_all.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/boolean_tracts_all_2009.csv", index=False)






In [27]:
# ntracts = []
# years = []
# master = pd.read_csv('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/tracts_type_master.csv')
# boolean_tracts = pd.DataFrame()
# #for year in neighbors_years:
# for year in ['2009']:
#     for tract in neighbors_years[year]:
#         if (len(neighbors_years[year][tract]) > 0):
#             ntracts.append(tract)
#             #years.append(year)
# boolean_tracts["Tract"] = ntracts
# boolean_tracts["Year"] = years
# boolean_tracts["Year"] = boolean_tracts["Year"].astype(int)
# boolean_tracts
# boolean_tracts_all = pd.merge(boolean_tracts, master,  how='left', left_on=['Tract','Year'], right_on = ['census_tract_number','as_of_year'])
# boolean_tracts_all[["Tract", "Year", "Tract Type"]]
# boolean_tracts_all["CRA"] = boolean_tracts_all["Tract Type"].isin(['low', 'mod']) # == 'low' | boolean_tracts_all["Tract Type"] == 'mod'
# # boolean_tracts_all["Tract Type"] == 'low' or boolean_tracts_all["Tract Type"] == 'mod'
# print(len(boolean_tracts_all))
# boolean_tracts_all.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/boolean_tracts_all_2009.csv")


ValueError: Length of values does not match length of index