In [14]:
import pandas as pd
import geopandas as gp
import matplotlib.pyplot as plt
import os
import json
import time



In [23]:
bay_county_names = ["Alameda", "ContraCosta", "Sonoma", "Solano", "SanMateo", "SantaClara", "SanFrancisco", "Marin","Napa"] 
analysis_years = ["2008", "2009","2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017"]

In [25]:
#concatenates all the census tracts in a given year from all the counties in the Bay Area. This is because when selectiing
#neighbors I do not want to eliminate census tracts that border each other but are separated by administrative 
#boundaries (ie: cities, counties), so I consider the congruous set of tracts for this analysis

# I am trying to locate pairs of census tracts that are geographically proximate and demographically similar except
# for their CRA eligibility threshold as the first step towards establishing a regression discontinuity design study.
# As a preliminary measure, I eliminate census tracts categorized as "high" and "low" since I want to look for neighboring
#tracts that are as close to the CRA threshold as possible in order to compare otherwise similar geographies.
for year in analysis_years:
    county_files = [f for f in os.listdir("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/county_geo_data/") 
                    if f[-3:] == "csv"
                    and f.split('_')[2] == (str(year) +".csv")]
    countydfs = []
    for file in county_files:
        df = pd.read_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/county_geo_data/"+file)
        countydfs.append(df)
    to_write = pd.concat(countydfs, axis=0) #concatenates all the dataframes from that year together
    to_write.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/county_geo_data/" + "all_bay_concat_"+year+".csv")


In [1]:
path = '/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/county_geo_data/'
dfs = []
for year in analysis_years:
    dfs.append(pd.read_csv(path+'all_bay_concat_'+ year+'.csv'))

print(dfs)




In [17]:
#Takes in a geodataframe which has the HMDA information geolocated to polygons, concatenated for all of the 
#9 counties. Builds a dictionary where the keys are each unique tract number and the values are an array of geographic neighbors 
#of that tract, and the eligibility threshold does not match the tract in question. Currently filters out high and low income
#tracts to look at only the center of the distribution, though this might change.



def find_neighbors(geodataframe):
    neighbors_dict = {}
    #geodf_mid_mod = geodataframe[(geodataframe['type'] == "mod") | (geodataframe["type"] == "mid")]
    for df_row in geodataframe.iterrows(): #index, then row
        polygon = df_row[1]["geometry"]
        tract = df_row[1]["tract"]
        eligibility = df_row[1]["cra_eligib"]
        neighbors_dict[tract] = list(geodataframe[geodataframe.apply(lambda row: row['geometry'].touches(polygon) and 
                                        row['cra_eligib'] != eligibility, axis=1)]["tract"].values)
        #list so that it is easier to write to json later
         
    return neighbors_dict

    
    


In [24]:
start = time.time()
neighbors_years = {}

for year in analysis_years:
    geodf = gp.read_file('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/combined_shapefiles/all_bay_concat_'+year+'/all_bay_concat_'+year+'.shp')
    print("finding neighbors for "+ str(year))
    neighbors = find_neighbors(geodf) #save this in json
    neighbors_years[year] = find_neighbors(geodf)

    print("writing", str(year), "to json")
    with open('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors_all_'+str(year)+'.json', 'w') as fp:
        json.dump(neighbors, fp)
end =time.time()
elapsed = end-start

elapsed

finding neighbors for 2008
writing 2008 to json
finding neighbors for 2009
writing 2009 to json
finding neighbors for 2010
writing 2010 to json
finding neighbors for 2011
writing 2011 to json
finding neighbors for 2012
writing 2012 to json
finding neighbors for 2013
writing 2013 to json
finding neighbors for 2014
writing 2014 to json
finding neighbors for 2015
writing 2015 to json
finding neighbors for 2016
writing 2016 to json
finding neighbors for 2017
writing 2017 to json


1420.753396987915

In [30]:
neighbors_years

{'2008': {4001.0: [4227.0],
  4364.02: [],
  4401.0: [4402.0],
  2015.0: [],
  6056.0: [],
  6138.0: [],
  5076.0: [],
  1515.02: [],
  1515.03: [],
  1516.0: [],
  4100.0: [4101.0],
  4415.03: [5046.02, 5046.01, 6103.02, 6117.0, 6118.0],
  4507.01: [],
  4507.21: [],
  4511.01: [],
  4512.02: [],
  3032.0: [3031.0],
  3040.0: [3010.0, 3031.0],
  3072.05: [3050.0, 3071.02, 3072.01, 3072.02],
  3131.03: [3072.01, 3131.01],
  3132.02: [3131.01, 3141.04],
  3521.02: [],
  3551.04: [],
  3551.06: [],
  3553.03: [],
  1011.0: [],
  118.0: [],
  451.0: [452.0, 401.0, 154.0, 156.0, 402.0],
  6050.0: [],
  6077.02: [],
  6096.03: [],
  6097.0: [],
  6133.0: [],
  6134.0: [],
  6135.01: [],
  5009.02: [5013.0],
  5033.12: [],
  5119.09: [],
  5120.01: [5120.18],
  5124.01: [5126.02, 5125.05, 5125.06],
  5124.02: [5126.02],
  5125.07: [5126.02, 5125.08, 5126.01, 5125.05, 5125.06],
  2501.02: [2010.02],
  2521.02: [3160.0],
  1502.02: [1503.02],
  1503.02: [1502.02, 1503.03, 1505.0, 2011.0, 1503.

In [63]:
ntracts = []
years = []
master = pd.read_csv('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/tracts_type_master.csv')
boolean_tracts = pd.DataFrame()
for year in neighbors_years:
    for tract in neighbors_years[year]:
        if (len(neighbors_years[year][tract]) > 0):
            ntracts.append(tract)
            years.append(year)
boolean_tracts["Tract"] = ntracts
boolean_tracts["Year"] = years
boolean_tracts["Year"] = boolean_tracts["Year"].astype(int)
#boolean_tracts.dtypes

boolean_tracts_all = pd.merge(boolean_tracts, master,  how='left', left_on=['Tract','Year'], right_on = ['census_tract_number','as_of_year'])
boolean_tracts_all[["Tract", "Year", "Tract Type"]]
boolean_tracts_all["CRA"] = boolean_tracts_all["Tract Type"].isin(['low', 'mod']) # == 'low' | boolean_tracts_all["Tract Type"] == 'mod'
# boolean_tracts_all["Tract Type"] == 'low' or boolean_tracts_all["Tract Type"] == 'mod'
boolean_tracts_all.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/neighbors/boolean_tracts_all.csv")


In [13]:
neighborsfilepath = "/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/"
files = [f for f in os.listdir(neighborsfilepath) if f[:13] == "neighbors_all"]
# for file in files:
#     foo = pd.read_json(neighborsfilepath+file)
    
foo = pd.read_json(neighborsfilepath+'neighbors_all_2013.json')
foo

ValueError: arrays must all be same length