# Clean CalEnviroScreen / LEHD datasets

### CalEnviroScreen 4.0
* [4.0 shapefile](https://oehha.ca.gov/calenviroscreen/report/calenviroscreen-40)

### LEHD
* [Urban Institute assembled datasets for all states](https://datacatalog.urban.org/dataset/longitudinal-employer-household-dynamics-origin-destination-employment-statistics-lodes)
* [Data dictionary](https://datacatalog.urban.org/sites/default/files/data-dictionary-files/LODESTechDoc7.5.pdf)

In [1]:
import geopandas as gpd
import pandas as pd
import os

import utils

# Show at most 20 rows when a DataFrame is displayed.
pd.options.display.max_rows = 20
# Environment values must be strings; 20_000_000_000 is stored as "20000000000".
# NOTE(review): presumably a query-size cap read by Cal-ITP tooling — confirm.
os.environ["CALITP_BQ_MAX_BYTES"] = f"{20_000_000_000}"

In [None]:
# Disabled scratch cell: everything below is wrapped in a triple-quoted string,
# so none of it executes (the cell has no execution count in the saved notebook).
# It sketches downloading the CalEnviroScreen shapefile from cloud storage.
# NOTE(review): before re-enabling, note that bare BUCKET_NAME is not defined in
# the visible cells (other cells use utils.BUCKET_NAME) and `!pip` is
# IPython-only syntax. Consider deleting this cell if it is no longer needed.
'''
import gcsfs
gs = gcsfs.GCSFileSystem()

!pip install google-cloud-storage
from google.cloud import storage
storage_client = storage.Client(project="cal-itp-data-infra")
CALENVIROSCREEN_FILE = 'calenviroscreen40shp_F_2021/'

gs.get(CALENVIROSCREEN_FILE, f"./test")
gs.ls(f'{BUCKET_NAME}')
gs.download(f'{BUCKET_NAME}{CALENVIROSCREEN_FILE}/CES4_final.shp', './test/test.shp')
'''

## Prep CalEnviroScreen data

In [4]:
# CalEnviroScreen 4.0 shapefile, located relative to the notebook's directory
CALENVIROSCREEN_FILE = 'calenviroscreen40shp_F_2021/CES4_final.shp'

gdf = gpd.read_file("./" + CALENVIROSCREEN_FILE)
# Preview the first two rows
gdf.head(n=2)

Unnamed: 0,Tract,ZIP,Population,CIscore,CIscoreP,Ozone,Ozone_Pctl,PM2_5,PM2_5_Pctl,Diesel_PM,...,African_Am,Native_Ame,Asian_Amer,Pacific_Is,Other_Mult,Shape_Leng,Shape_Area,County,City_1,geometry
0,6083002000.0,93454,4495,36.019653,69.162885,0.03419,10.566273,7.567724,10.031114,0.154573,...,0.4004,0.267,8.2091,0.0,1.3126,6999.357622,2847611.0,Santa Barbara,Santa Maria,"POLYGON ((-39795.070 -341919.191, -38126.384 -..."
1,6083002000.0,93455,13173,37.030667,70.637922,0.035217,11.561917,7.624775,10.454263,0.106088,...,2.5051,0.0,4.699,0.0,0.9489,19100.578003,16352920.0,Santa Barbara,Santa Maria,"POLYGON ((-39795.070 -341919.191, -39803.632 -..."


## Prep LEHD data

In [2]:
# Location of the Urban Institute LEHD extracts (base URL + upload-date folder)
URBAN_URL = "https://urban-data-catalog.s3.amazonaws.com/drupal-root-live/"
DATE_DOWNLOAD = "2021/04/19/"
datasets = ["wac_pri_tract_minus_fed", "wac_fed_tract"]

# Pass each dataset's full URL to utils.import_export, reusing the dataset
# name as the output file name.
for dataset in datasets:
    utils.import_export(
        DATASET_NAME=URBAN_URL + DATE_DOWNLOAD + dataset,
        OUTPUT_FILE_NAME=dataset,
        GCS=True,
    )


In [5]:
def process_lehd(df):
    """
    Reduce an LEHD tract-level table to California job counts for the latest year.

    Keeps rows whose `stname` is "California" and whose `year` equals the
    maximum `year` in the input, then keeps only the tract id and `c000`
    (total jobs).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `stname`, `year`, `trct` and `c000`.

    Returns
    -------
    pandas.DataFrame
        Two columns with a fresh 0..n-1 index:
        `Tract` (string: the `trct` value prefixed with "0") and
        `num_jobs` (the `c000` values).
    """
    # Subset to CA, keep maximum year, and only keep total jobs
    keep_cols = ["trct", "c000"]

    # NOTE(review): the latest year is taken over the whole input, not only the
    # California rows — confirm every state shares the same final year.
    is_latest_ca = (df["stname"] == "California") & (df["year"] == df["year"].max())

    return (
        df.loc[is_latest_ca, keep_cols]
        # Build the padded tract id on the filtered rows only, vectorized.
        # The callable receives the filtered frame, so no row-wise apply over
        # the full (all-states, all-years) table is needed.
        .assign(trct=lambda d: "0" + d["trct"].astype(str))
        .rename(columns={"trct": "Tract", "c000": "num_jobs"})
        .reset_index(drop=True)
    )


# Merge and clean up 
def merge_and_process(df1, df2):
    """
    Combine two per-tract job tables into a single total per tract.

    `df2`'s `num_jobs` column is renamed to `fed_jobs`, left-joined onto `df1`
    by `Tract` (one row per tract on each side is enforced), and the two job
    columns are summed row-wise; a missing value contributes nothing to the sum.

    Returns a DataFrame with the columns `Tract` and `wac_num_jobs` (int).

    NOTE(review): because this is a left join, tracts that appear only in
    `df2` are dropped — confirm that is intended.
    """
    fed_renamed = df2.rename(columns={"num_jobs": "fed_jobs"})
    merged = pd.merge(
        df1,
        fed_renamed,
        on="Tract",
        how="left",
        validate="1:1",
    )

    job_cols = ["num_jobs", "fed_jobs"]
    merged["wac_num_jobs"] = merged[job_cols].sum(axis=1).astype(int)

    return merged[["Tract", "wac_num_jobs"]]

In [8]:
# Read each LEHD extract from the bucket and reduce it with process_lehd
primary_nofed = process_lehd(
    pd.read_parquet(f"gs://{utils.BUCKET_NAME}/wac_pri_tract_minus_fed.parquet")
)
fed = process_lehd(
    pd.read_parquet(f"gs://{utils.BUCKET_NAME}/wac_fed_tract.parquet")
)

# Combine the two tables into one job total per tract
lehd = merge_and_process(primary_nofed, fed)

In [9]:
def merge_calenviroscreen_lehd(calenviroscreen, lehd):
    """
    Attach LEHD job counts to the prepared CalEnviroScreen tracts.

    The CalEnviroScreen input is first passed through
    `utils.prep_calenviroscreen`, then left-joined with `lehd` on `Tract`
    (one row per tract on each side is enforced). Tracts with no LEHD match
    get `wac_num_jobs` = 0, and the column is cast to int.
    """
    ces_prepped = utils.prep_calenviroscreen(calenviroscreen)

    # Merge LEHD with CalEnviroScreen
    merged = pd.merge(
        ces_prepped,
        lehd,
        on="Tract",
        how="left",
        validate="1:1",
    )

    # Unmatched tracts have no job count after the left join; treat as zero
    return merged.assign(
        wac_num_jobs=lambda d: d["wac_num_jobs"].fillna(0).astype(int)
    )

In [10]:
# Join the job counts onto the CalEnviroScreen tracts and preview five rows
df = merge_calenviroscreen_lehd(calenviroscreen=gdf, lehd=lehd)
df.head(n=5)

Unnamed: 0,Tract,ZIP,Population,sq_mi,pop_sq_mi,CIscoreP,Pollution_,PopCharP,equity_group,pollution_group,popchar_group,County,City_1,geometry,wac_num_jobs
0,6001400100,94704,3120,2.655917,1174.735658,2.79879,26.621033,1.525466,1,2,1,Alameda,Oakland,"POLYGON ((-197090.096 -12468.283, -196909.112 ...",822
1,6001400200,94618,2007,0.229901,8729.842746,2.874433,24.181705,1.651538,1,2,1,Alameda,Oakland,"POLYGON ((-196982.196 -15963.566, -196992.931 ...",1240
2,6001400300,94618,5051,0.427356,11819.185813,15.935451,33.366521,12.266768,1,2,1,Alameda,Oakland,"POLYGON ((-197350.929 -16712.642, -197950.200 ...",1582
3,6001400400,94609,4007,0.271558,14755.587549,18.973777,26.235221,18.431669,1,2,1,Alameda,Oakland,"POLYGON ((-197953.290 -16012.154, -197963.187 ...",889
4,6001400500,94609,4124,0.227012,18166.435207,29.740292,31.400124,30.156329,2,2,2,Alameda,Oakland,"POLYGON ((-198589.270 -15822.210, -198703.191 ...",291
