In [1]:
import numpy as np
import pandas as pd
import geopandas
import shapely
import os

In [2]:
bay_county_names = ["Alameda", "ContraCosta", "Sonoma", "Solano", "SanMateo", "SantaClara", "SanFrancisco", "Marin","Napa"] 
analysis_years = ["2008", "2009", "2010", "2011", "2012", "2013", "2014", "2015", "2016", "2017"] #try to use 2010 census for this


slump_years = ["2008", "2009", "2010", "2011", "2012"]
recovery_years = ["2013", "2014", "2015", "2016", "2017"]
geo_data_path = "/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/CTs_geo_data_1/"
parsed_data_path = "/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/parsed_data_1/"
shapefiles_data_path = "/Users/ameliabaum/Desktop/Amelia/CRA_Thesis/communityreinvestmentact/data/raw_shapefiles/"



## Shapefiles

In [3]:
#2000 census tract shapefiles (from 2008 file, but identical to 2000)
shapefiles_08 = geopandas.read_file(shapefiles_data_path+"alltracts_shapefiles_2008/tl_2008_06_tract00.shp")
bay_shapefiles_08 = shapefiles_08[shapefiles_08["COUNTYFP00"].isin(['001', '013', '041', '055', '075', '081', '085', '097', '095'])]
bay_shapefiles_08["NAME00"] = pd.to_numeric(bay_shapefiles_08["NAME00"])


# Carolina says that 2010 and should be used for all HMDA data 2012-2017, but for right now I'm still using 
#2000 for these because it seems like the only one that conserves all the data in the merge

# shapefiles_10 = geopandas.read_file(shapefiles_data_path+"gz_2010_06_140_00_500k/gz_2010_06_140_00_500k.shp")
# bay_shapefiles_10 = shapefiles_10[shapefiles_10["COUNTY"].isin(['001', '013', '041', '055', '075', '081', '085', '097', '095'])]
# bay_shapefiles_10["NAME"] = pd.to_numeric(bay_shapefiles_10["NAME"])
# bay_shapefiles_10.head()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


## Merging HMDA data with shapefiles for mapping

In [5]:
for county in bay_county_names:
    for year in analysis_years:
        parsed_df = pd.read_csv(parsed_data_path+county+'_'+year+'_parsed.csv')
        with_geo = parsed_df.merge(bay_shapefiles_08, how="left", right_on="NAME00", left_on="Tract")  
        print("writing...", county, year)
        with_geo.to_csv(geo_data_path+county+"_geoparsed_"+year+".csv", index=False)
        
        


writing... Alameda 2008
writing... Alameda 2009
writing... Alameda 2010
writing... Alameda 2011
writing... Alameda 2012
writing... Alameda 2013
writing... Alameda 2014
writing... Alameda 2015
writing... Alameda 2016
writing... Alameda 2017
writing... ContraCosta 2008
writing... ContraCosta 2009
writing... ContraCosta 2010
writing... ContraCosta 2011
writing... ContraCosta 2012
writing... ContraCosta 2013
writing... ContraCosta 2014
writing... ContraCosta 2015
writing... ContraCosta 2016
writing... ContraCosta 2017
writing... Sonoma 2008
writing... Sonoma 2009
writing... Sonoma 2010
writing... Sonoma 2011
writing... Sonoma 2012
writing... Sonoma 2013
writing... Sonoma 2014
writing... Sonoma 2015
writing... Sonoma 2016
writing... Sonoma 2017
writing... Solano 2008
writing... Solano 2009
writing... Solano 2010
writing... Solano 2011
writing... Solano 2012
writing... Solano 2013
writing... Solano 2014
writing... Solano 2015
writing... Solano 2016
writing... Solano 2017
writing... SanMateo 

## Creating the Recovery and Slump dataframes

In [6]:
#creates a combined dataframe of all the years in 2 groups, and averages the 4 proportions in this interval.
to_concat_recovery = []
to_concat_slump = []
all_dfs = []
files = [f for f in os.listdir(geo_data_path) if f[-3:] == "csv"]
for name in files:
    df_year = name.split('_')[2][:4]
    if df_year in slump_years:
        df1 = pd.read_csv(geo_data_path+name)
        all_dfs.append(df1)
        to_concat_slump.append(df1)
        
    if df_year in recovery_years:
        df2 = pd.read_csv(geo_data_path+name)
        all_dfs.append(df2)
        to_concat_recovery.append(df2)

slump = pd.concat(to_concat_slump, axis=0)
#.groupby(by=["Tract", "tract num", "STATEFP", "COUNTYFP", "TRACTCE", "AFFGEOID", "GEOID", 
#                                    "NAME", "LSAD", "ALAND", "AWATER", "geometry"]).mean()

recovery = pd.concat(to_concat_recovery, axis=0)
#.groupby(by=["Tract", "tract num", "STATEFP", "COUNTYFP", "TRACTCE", "AFFGEOID", "GEOID", 
#                                    "NAME", "LSAD", "ALAND", "AWATER", "geometry"]).mean()




            
      
    

In [7]:
print(len(slump))
print(len(slump.dropna()))

7143
6779


In [8]:
slump.to_csv(geo_data_path+"all_tracts_geoparsed_slump.csv", index=False)
recovery.to_csv(geo_data_path+"all_tracts_geoparsed_recovery.csv", index=False)