## Imports

In [1]:
import json
from ast import literal_eval
import numpy as np
import pandas as pd

## 1. Read NIH data

In [2]:
# Load the raw NIH table; the cleaning cell further down reads its 'State' column.
df = pd.read_csv(filepath_or_buffer='data/raw/incd.csv')

In [3]:
# The first two lines of the file are skipped; everything after them is joined
# back together and parsed as a single Python literal. The result is used
# below as a lookup keyed by state name.
with open('data/raw/state_abbrs.txt') as abbr_file:
    state_abbr = literal_eval(''.join(list(abbr_file)[2:]))

In [4]:
# Clean the NIH data.
# 'State' is split at the first comma: the text before it becomes 'County',
# the text after it becomes 'State2'.
df[['County', 'State2']] = df['State'].str.split(',', n=1, expand=True)

# Drop every row with a missing value in ANY column. This also removes rows
# whose 'State' had no comma, because their 'State2' is empty after the split.
df.dropna(inplace=True)

# Strip leading whitespace and the last three characters from 'State2', look
# the remainder up in state_abbr, and lower-case the result.
# NOTE(review): the three trimmed characters are presumably a trailing marker
# such as "(6)" -- confirm against the raw file.
df['State2'] = (
    df['State2']
    .str.lstrip()
    .str[:-3]
    .map(lambda state_name: state_abbr[state_name].lower())
)

# Lower-case the county names and remove the " county" / " parish" suffix text.
df['County'] = (
    df['County']
    .str.lower()
    .str.replace(' county', '', regex=False)
    .str.replace(' parish', '', regex=False)
)

# Build the "<county>-<state>" key used for the merge later on.
df['County-State'] = df['County'] + '-' + df['State2']

In [None]:
# Remove the helper columns from df in place. Once this has run, the cleaning
# cell above cannot be re-run without reloading df, because it reads 'State'.
df.drop(['County', 'State2', 'State'], axis='columns', inplace=True)

# Write the cleaned NIH table to disk, leaving out the index.
df.to_csv(path_or_buf='data/cleaned/nih_cancer_counties_20221111.csv', index=False)

## 2. Read CDR county data 

In [5]:
# Parse the JSON file, then build one row per entry of its top-level
# 'features' list from that entry's 'properties' dict.
with open('data/raw/cdr-sites.json') as sites_file:
    sites = json.load(sites_file)

counties = pd.DataFrame([feature['properties'] for feature in sites['features']])

In [6]:
# county-state column to merge on later.
# The key is normalised the same way as the 'County-State' keys built for the
# NIH and CDR chemicals frames (lower-cased, " county" / " parish" removed);
# without this it could never match those keys in a merge.
# SITE_COUNTY and SITE_STATE themselves are left untouched.
counties['County-State'] = (
    counties['SITE_COUNTY']
    .str.lower()
    .str.replace(' county', '', regex=False)
    .str.replace(' parish', '', regex=False)
    + '-'
    + counties['SITE_STATE'].str.lower()
)

In [None]:
# Remove the two source columns of the 'County-State' key from counties in place.
counties.drop(['SITE_COUNTY', 'SITE_STATE'], axis='columns', inplace=True)

# Write the sites table to disk, leaving out the index.
counties.to_csv(path_or_buf='data/cleaned/epa_cdr_sites_20221111.csv', index=False)

## 3. Read CDR chemicals data

In [7]:
# Parse the JSON file, then build one row per entry of its top-level
# 'features' list from that entry's 'properties' dict.
with open('data/raw/cdr_chemicals.json') as chems_file:
    chems_j = json.load(chems_file)

chems = pd.DataFrame([feature['properties'] for feature in chems_j['features']])

# Lower-case the county names and remove the " county" / " parish" suffix text.
chems['SITE_COUNTY'] = (
    chems['SITE_COUNTY']
    .str.lower()
    .str.replace(' county', '', regex=False)
    .str.replace(' parish', '', regex=False)
)

# Build the lower-cased "<county>-<state>" key used for the merge later on.
chems['County-State'] = (
    chems['SITE_COUNTY'] + '-' + chems['SITE_STATE']
).str.lower()

## 4. Read CDR 2020 data

In [8]:
chems_2020 = pd.read_csv(filepath_or_buffer='data/raw/cdr_2020_small_cut_v2.csv')

# Lower-case the county names and remove the " county" / " parish" suffix text.
chems_2020['SITE COUNTY / PARISH'] = (
    chems_2020['SITE COUNTY / PARISH']
    .str.lower()
    .str.replace(' county', '', regex=False)
    .str.replace(' parish', '', regex=False)
)

# Build the lower-cased "<county>-<state>" key used for the merge later on.
chems_2020['County-State'] = (
    chems_2020['SITE COUNTY / PARISH'] + '-' + chems_2020['SITE STATE']
).str.lower()

In [9]:
# Write the cleaned CDR 2020 table to disk, leaving out the index.
chems_2020.to_csv(path_or_buf='data/cleaned/cdr_2020_chemicals_20221117.csv', index=False)

## 5. Merge NIH data to CDR data on County-State

In [10]:
# Inner-join the CDR chemicals rows with the NIH rows on the 'County-State' key.
# The key is named explicitly: without `on=`, pandas joins on every column name
# the two frames share, so an unrelated shared column would silently become
# part of the join condition.
# Afterwards, every column containing at least one missing value is dropped.
everything = chems.merge(df, how='inner', on='County-State').dropna(axis=1)

In [11]:
# Number of merged rows for each distinct 'Recent Trend' value, most frequent first.
everything['Recent Trend'].value_counts()

stable                18225
rising                 6182
falling                2703
*                       743
data not available      503
Name: Recent Trend, dtype: int64

In [12]:
# Write the merged EPA/NIH table to disk, leaving out the index.
everything.to_csv(path_or_buf='data/cleaned/epa_nih_county_join_20221111.csv', index=False)