In [1]:
import sys, os, time

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import rasterio
import rasterio.mask
import rasterio.io
import rasterio.errors
import fiona
import fiona.transform
import shapely
import shapely.geometry

import collections

In [2]:
# NLCD land-cover classes as (raster value, class name) pairs.
# Keeping each value next to its name means the two public lists derived below
# can never drift out of alignment when a class is added or removed.
_NLCD_CLASSES = [
    (11, "Open Water"),
    (12, "Perennial Ice/Snow"),
    (21, "Developed, Open Space"),
    (22, "Developed, Low Intensity"),
    (23, "Developed, Medium Intensity"),
    # Previously spelled without the comma, unlike the other "Developed, ..."
    # classes. The derived "nlcd-..." column name is unaffected because commas
    # are stripped when column names are built.
    (24, "Developed, High Intensity"),
    (31, "Barren Land (Rock/Sand/Clay)"),
    (41, "Deciduous Forest"),
    (42, "Evergreen Forest"),
    (43, "Mixed Forest"),
    (52, "Shrub/Scrub"),
    (71, "Grassland/Herbaceous"),
    (81, "Pasture/Hay"),
    (82, "Cultivated Crops"),
    (90, "Woody Wetlands"),
    (95, "Emergent Herbaceous Wetlands"),
]

# Parallel lists: NLCD_CLASS_NAMES[k] is the name of class value NLCD_CLASS_VALS[k].
NLCD_CLASS_VALS = [class_val for class_val, _ in _NLCD_CLASSES]
NLCD_CLASS_NAMES = [class_name for _, class_name in _NLCD_CLASSES]

In [18]:
# Years for which both an ACS 5-year extract and processed NLCD counts exist.
# All three file lists below are derived from this single tuple so they stay
# index-aligned: entry k of each list refers to ACS_YEARS[k].
ACS_YEARS = (2011, 2013, 2016)

# Raw ACS 5-year tables, one file per year.
acs_fns = [
    "data/raw/acs5yr/acs_5yr_{}_data.csv".format(year)
    for year in ACS_YEARS
]

# These are created by `01 - NLCD + Census Tract intersection.ipynb`
processed_nlcd_fns = [
    "data/processed/nlcd_{}_counts_per_output_tracts.npy".format(year)
    for year in ACS_YEARS
]

# Destination files for the ACS tables joined with the NLCD counts.
output_fns = [
    "data/processed/acs5yr/acs_5yr_{}_nlcd_joined_data.csv".format(year)
    for year in ACS_YEARS
]

In [20]:
# Geoids of the output tracts, in file order (one geoid per line).
with open("data/processed/output_tracts_geoids.txt", "r") as geoid_file:
    geoid_text = geoid_file.read()
output_tracts_geoids = geoid_text.strip().split("\n")

# Geoids flagged as "bad"; held in a set so the membership tests below are cheap.
with open("data/processed/output_tracts_bad_geoids.txt", "r") as bad_geoid_file:
    bad_geoid_text = bad_geoid_file.read()
output_tracts_bad_geoids = set(bad_geoid_text.strip().split("\n"))

# Total number of output tracts.
print(len(output_tracts_geoids))

# Boolean mask aligned with output_tracts_geoids: True where the geoid is NOT
# in the bad set.
output_tracts_geoids_mask = []
for tract_geoid in output_tracts_geoids:
    output_tracts_geoids_mask.append(tract_geoid not in output_tracts_bad_geoids)

# Number of tracts that are not flagged as bad.
print(output_tracts_geoids_mask.count(True))

# Join each ACS table with the per-tract NLCD class counts and write the result.
#
# For every (ACS csv, NLCD counts .npy, output csv) triple this:
#   1. reads the tab-separated ACS table, indexed by Geo_FIPS,
#   2. restricts and reorders it to output_tracts_geoids,
#   3. drops columns that are entirely n/a,
#   4. appends one "nlcd-<class>" column per NLCD class from the counts array,
#   5. writes the joined table as a tab-separated file.

# The three file lists are parallel; zip() stops at the shortest one, so make
# sure none of them is missing an entry before iterating.
assert len(acs_fns) == len(processed_nlcd_fns) == len(output_fns), \
    "acs_fns, processed_nlcd_fns and output_fns must have the same length"

# Output column names derived from the NLCD class names: commas removed, spaces
# replaced by underscores, lower-cased and prefixed with "nlcd-". They do not
# depend on the file being processed, so they are computed once.
nlcd_column_names = [
    "nlcd-" + class_name.replace(",", "").replace(" ", "_").lower()
    for class_name in NLCD_CLASS_NAMES
]

# Geoids every ACS table is expected to contain, as an Index for a vectorized
# membership test inside the loop.
expected_geoids = pd.Index(output_tracts_geoids)

for acs_fn, nlcd_counts_fn, output_fn in zip(acs_fns, processed_nlcd_fns, output_fns):
    print(acs_fn)

    # The identifier columns are read as strings rather than parsed as numbers;
    # Geo_StateCounty is the string concatenation of the state and county codes.
    df = pd.read_csv(acs_fn, sep="\t", encoding="ISO-8859-1", dtype={
        "Geo_FIPS": str,
        "Geo_STATE": str,
        "Geo_COUNTY": str
    })
    df["Geo_StateCounty"] = df["Geo_STATE"] + df["Geo_COUNTY"]
    df = df.set_index("Geo_FIPS")

    # Sanity check: every expected geoid must be present in this ACS table.
    geoid_found = expected_geoids.isin(df.index)
    assert geoid_found.all(), "{} of {} expected geoids are missing from {}".format(
        int((~geoid_found).sum()), len(expected_geoids), acs_fn
    )

    # Only keep the geoids that are in our intersected census tract set, in the
    # same order as output_tracts_geoids.
    df = df.loc[output_tracts_geoids]

    counts = np.load(nlcd_counts_fn)
    assert counts.shape[1] == len(NLCD_CLASS_NAMES)  # one column per NLCD class
    # The counts are attached positionally below, so they need exactly one row
    # per selected tract. Without this check a mismatch only surfaces later as
    # a pandas length error during column assignment.
    assert counts.shape[0] == len(df), "{} has {} rows but {} tracts were selected from {}".format(
        nlcd_counts_fn, counts.shape[0], len(df), acs_fn
    )

    # NOTE(review): dropping the tracts listed in output_tracts_bad_geoids is
    # currently disabled, so those rows are still written out. Confirm that is
    # intended before re-enabling the two lines below.
    #df = df.iloc[output_tracts_geoids_mask] # only keep the rows that have an intersection with NLCD
    #counts = counts[output_tracts_geoids_mask] # same as previous line

    df = df.dropna(axis=1, how='all')  # get rid of columns that are all n/a

    for j, column_name in enumerate(nlcd_column_names):
        df[column_name] = counts[:, j]

    # os.makedirs("") raises, so only create a directory when the output path
    # actually has one.
    output_dir = os.path.dirname(output_fn)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(output_fn, sep="\t", index_label="Geo_FIPS")

72749
72257
data/raw/acs5yr/acs_5yr_2011_data.csv
data/raw/acs5yr/acs_5yr_2013_data.csv
data/raw/acs5yr/acs_5yr_2016_data.csv
