In [1]:
import pandas as pd
import geopandas as gp
import matplotlib.pyplot as plt
import os
import json




In [2]:
bay_county_names = ["Alameda", "ContraCosta", "Sonoma", "Solano", "SanMateo", "SantaClara", "SanFrancisco", "Marin","Napa"] 
analysis_years = ["2008", "2009","2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017"]

In [25]:
#concatenates all the census tracts in a given year from all the counties in the Bay Area. This is because when selectiing
#neighbors I do not want to eliminate census tracts that border each other but are separated by administrative 
#boundaries (ie: cities, counties), so I consider the congruous set of tracts for this analysis

# I am trying to locate pairs of census tracts that are geographically proximate and demographically similar except
# for their CRA eligibility threshold as the first step towards establishing a regression discontinuity design study.
# As a preliminary measure, I eliminate census tracts categorized as "high" and "low" since I want to look for neighboring
#tracts that are as close to the CRA threshold as possible in order to compare otherwise similar geographies.
for year in analysis_years:
    county_files = [f for f in os.listdir("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/county_geo_data/") 
                    if f[-3:] == "csv"
                    and f.split('_')[2] == (str(year) +".csv")]
    countydfs = []
    for file in county_files:
        df = pd.read_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/county_geo_data/"+file)
        countydfs.append(df)
    to_write = pd.concat(countydfs, axis=0) #concatenates all the dataframes from that year together
    to_write.to_csv("/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/county_geo_data/" + "all_bay_concat_"+year+".csv")


In [8]:
path = '/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/county_geo_data/'
dfs = []
for year in analysis_years:
    dfs.append(pd.read_csv(path+'all_bay_concat_'+ year+'.csv'))

print(dfs)
# all_years_all_counties = pd.concat(dfs, axis=0)
# all_years_all_counties.to_csv(path+'all_years_all_counties.csv')



[      Unnamed: 0  Unnamed: 0.1  Unnamed: 0.1.1    Tract  Year  \
0              0             0               0  5001.00  2008   
1              1             1               1  5002.00  2008   
2              2             2               2  5003.00  2008   
3              3             3               3  5004.00  2008   
4              4             4               4  5005.00  2008   
5              5             5               5  5006.00  2008   
6              6             6               6  5008.00  2008   
7              7             7               7  5009.01  2008   
8              8             8               8  5009.02  2008   
9              9             9               9  5010.00  2008   
10            10            10              10  5011.00  2008   
11            11            11              11  5012.00  2008   
12            12            12              12  5013.00  2008   
13            13            13              13  5014.00  2008   
14            14        

In [35]:
#Takes in a geodataframe which has the HMDA information geolocated to polygons, concatenated for all of the 
#9 counties. Builds a dictionary where the keys are each unique tract number and the values are an array of geographic neighbors 
#of that tract, and the eligibility threshold does not match the tract in question. Currently filters out high and low income
#tracts to look at only the center of the distribution, though this might change.



def find_neighbors(geodataframe):
    neighbors_dict = {}
    #geodf_mid_mod = geodataframe[(geodataframe['type'] == "mod") | (geodataframe["type"] == "mid")]
    for df_row in geodataframe.iterrows(): #index, then row
        polygon = df_row[1]["geometry"]
        tract = df_row[1]["tract"]
        eligibility = df_row[1]["cra_eligib"]
        neighbors_dict[tract] = list(geodataframe[geodataframe.apply(lambda row: row['geometry'].touches(polygon) and 
                                        row['cra_eligib'] != eligibility, axis=1)]["tract"].values)
        #list so that it is easier to write to json later
         
    return neighbors_dict

    
    


In [36]:
#neighbors_years = {}
for year in analysis_years:
    geodf = gp.read_file('/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/data/combined_shapefiles/all_bay_concat_'+year+'/all_bay_concat_'+year+'.shp')
    print("finding neighbors for "+ str(year))
    neighbors = find_neighbors(geodf) #save this in json
    #neighbors_years[year] = find_neighbors(geodf)

    print("writing", str(year), "to json")
    with open('data/neighbors_'+str(year)+'.json', 'w') as fp:
        json.dump(neighbors, fp)
    


finding neighbors for 2008
writing 2008 to json
finding neighbors for 2009
writing 2009 to json
finding neighbors for 2010
writing 2010 to json
finding neighbors for 2011
writing 2011 to json
finding neighbors for 2012
writing 2012 to json
finding neighbors for 2013
writing 2013 to json
finding neighbors for 2014
writing 2014 to json
finding neighbors for 2015
writing 2015 to json
finding neighbors for 2016
writing 2016 to json
finding neighbors for 2017
writing 2017 to json
