# DNN on EPA CDR Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Tensorflow imports below...
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GRU
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping

## Read in Data

In [2]:
df = pd.read_csv('/content/drive/MyDrive/ga_data/full_county_dataset.csv')

## Columns to drop to focus on CDR data

In [9]:
non_CDR_columns = ['profile_STATECTY',
 'profile_CNTYNAME',
 'profile_ELEVATION',
 'profile_UID',
 'profile_LAT_DD83',
 'profile_LON_DD83',
 'hab_XBKA',
 'hab_MEDBK_A',
 'hab_XUN',
 'hab_MEDBKUN',
 'hab_XCDENMID',
 'hab_XCDENBK',
 'hab_CONPERCENT',
 'hab_PCT_FA',
 'hab_PCT_DR',
 'hab_PCT_FAST',
 'hab_PCT_SLOW',
 'hab_PCT_POOL',
 'hab_XWIDTH',
 'hab_SDWIDTH',
 'hab_XBKF_W',
 'hab_XBKF_H',
 'hab_XINC_H',
 'hab_SDINC_H',
 'hab_BFWD_RAT',
 'hab_XWXD',
 'hab_XWD_RAT',
 'hab_SDWXD',
 'hab_SDWD_RAT',
 'hab_XDEPTH_CM',
 'hab_SDDEPTH_CM',
 'hab_XFC_ALG',
 'hab_XFC_RCK',
 'hab_XFC_BRS',
 'hab_XFC_LVT',
 'hab_XFC_AQM',
 'hab_XFC_OHV',
 'hab_XFC_HUM',
 'hab_XFC_UCB',
 'hab_XFC_LWD',
 'hab_XFC_NAT',
 'hab_XFC_BIG',
 'hab_XFC_ALL',
 'hab_PCT_SIDE',
 'hab_REACHLEN',
 'hab_W1_HAG',
 'hab_W1_HNOAG',
 'hab_W1_HALL',
 'hab_W1H_BLDG',
 'hab_W1H_LDFL',
 'hab_W1H_LOG',
 'hab_W1H_MINE',
 'hab_W1H_PARK',
 'hab_W1H_PSTR',
 'hab_W1H_PVMT',
 'hab_W1H_PIPE',
 'hab_W1H_ROAD',
 'hab_W1H_CROP',
 'hab_W1H_WALL',
 'hab_C1WM100',
 'hab_C2WM100',
 'hab_C4WM100',
 'hab_V1WM100',
 'hab_V1W_MSQ',
 'hab_V2WM100',
 'hab_V2W_MSQ',
 'hab_V4WM100',
 'hab_V4W_MSQ',
 'hab_PCAN_C',
 'hab_PCAN_D',
 'hab_PCAN_E',
 'hab_PCAN_M',
 'hab_PCAN_N',
 'hab_XCL',
 'hab_XCS',
 'hab_XMW',
 'hab_XMH',
 'hab_XGW',
 'hab_XGH',
 'hab_XGB',
 'hab_XC',
 'hab_XM',
 'hab_XCMW',
 'hab_XCM',
 'hab_XG',
 'hab_XCMGW',
 'hab_XCMG',
 'hab_XPCAN',
 'hab_XPMID',
 'hab_XPMGW',
 'hab_XPCM',
 'hab_XPCMG',
 'hab_XSLOPE',
 'hab_XSLOPE_MAP',
 'hab_XSLOPE_FIELD',
 'hab_PCTCLINOMETER',
 'hab_XBEARING',
 'hab_SINU',
 'hab_LSUB_DMM',
 'hab_LSUBD_SD',
 'hab_LSUB_DMM_NOR',
 'hab_PCT_FN',
 'hab_PCT_GC',
 'hab_PCT_GF',
 'hab_PCT_HP',
 'hab_PCT_OM',
 'hab_PCT_OT',
 'hab_PCT_RC',
 'hab_PCT_SA',
 'hab_PCT_WD',
 'hab_PCT_BIGR',
 'hab_PCT_BDRK',
 'hab_PCT_SAFN',
 'hab_PCT_SFGF',
 'hab_PCT_ORG',
 'hab_XEMBED',
 'hab_XCEMBED',
 'hab_RPXDEP_CM',
 'hab_RPMXDEP_CM',
 'hab_RPGT50',
 'hab_RPGT75',
 'hab_RP100',
 'hab_LTEST',
 'hab_LRBS_TST',
 'hab_LDMB_BW5',
 'hab_LRBS_BW5',
 'hab_LDCBF_G08',
 'hab_LRBS_G08',
 'hab_PCT_SFG',
 'hab_PCT_BH',
 'hab_XSHOR2VG',
 'hab_PCT_OVRB',
 'hab_PCT_GL',
 'hab_C1TM100',
 'hab_C2TM100',
 'hab_C4TM100',
 'hab_PCT_GR',
 'hab_RDIST1',
 'hab_QR1',
 'hab_CVWIDTH',
 'hab_CVWXD',
 'bminv_AMPHNTAX',
 'bminv_AMPHPIND',
 'bminv_AMPHPTAX',
 'bminv_BURRNTAX',
 'bminv_BURRPIND',
 'bminv_BURRPTAX',
 'bminv_CHIRDOM1PIND',
 'bminv_CHIRDOM3PIND',
 'bminv_CHIRDOM5PIND',
 'bminv_CHIRNTAX',
 'bminv_CHIRPIND',
 'bminv_CHIRPTAX',
 'bminv_CLMBNTAX',
 'bminv_CLMBPIND',
 'bminv_CLMBPTAX',
 'bminv_CLNGNTAX',
 'bminv_CLNGPIND',
 'bminv_CLNGPTAX',
 'bminv_COFINTAX',
 'bminv_COFIPIND',
 'bminv_COFIPTAX',
 'bminv_COFITRICNTAX',
 'bminv_COFITRICPIND',
 'bminv_COFITRICPTAX',
 'bminv_COGANTAX',
 'bminv_COGAPIND',
 'bminv_COGAPTAX',
 'bminv_CRUSNTAX',
 'bminv_CRUSPIND',
 'bminv_CRUSPTAX',
 'bminv_DIPTNTAX',
 'bminv_DIPTPIND',
 'bminv_DIPTPTAX',
 'bminv_DOM1PIND',
 'bminv_DOM3PIND',
 'bminv_DOM5PIND',
 'bminv_EPHENTAX',
 'bminv_EPHEPIND',
 'bminv_EPHEPTAX',
 'bminv_EPOTNTAX',
 'bminv_EPOTPIND',
 'bminv_EPOTPTAX',
 'bminv_EPT_NTAX',
 'bminv_EPT_PIND',
 'bminv_EPT_PTAX',
 'bminv_FACLNTAX',
 'bminv_FACLPIND',
 'bminv_FACLPTAX',
 'bminv_HEMINTAX',
 'bminv_HEMIPIND',
 'bminv_HEMIPTAX',
 'bminv_HPRIME',
 'bminv_INTLNTAX',
 'bminv_INTLPIND',
 'bminv_INTLPTAX',
 'bminv_MITENTAX',
 'bminv_MITEPIND',
 'bminv_MITEPTAX',
 'bminv_MOLLNTAX',
 'bminv_MOLLPIND',
 'bminv_MOLLPTAX',
 'bminv_NOINNTAX',
 'bminv_NOINPIND',
 'bminv_NOINPTAX',
 'bminv_NTOLNTAX',
 'bminv_NTOLPIND',
 'bminv_NTOLPTAX',
 'bminv_ODONNTAX',
 'bminv_ODONPIND',
 'bminv_ODONPTAX',
 'bminv_OLLENTAX',
 'bminv_OLLEPIND',
 'bminv_OLLEPTAX',
 'bminv_ORTHCHIRPIND',
 'bminv_ORTHNTAX',
 'bminv_ORTHPIND',
 'bminv_ORTHPTAX',
 'bminv_PLECNTAX',
 'bminv_PLECPIND',
 'bminv_PLECPTAX',
 'bminv_PREDNTAX',
 'bminv_PREDPIND',
 'bminv_PREDPTAX',
 'bminv_SCRPNTAX',
 'bminv_SCRPPIND',
 'bminv_SCRPPTAX',
 'bminv_SHRDNTAX',
 'bminv_SHRDPIND',
 'bminv_SHRDPTAX',
 'bminv_SPWLNTAX',
 'bminv_SPWLPIND',
 'bminv_SPWLPTAX',
 'bminv_STOLNTAX',
 'bminv_STOLPIND',
 'bminv_STOLPTAX',
 'bminv_SWIMNTAX',
 'bminv_SWIMPIND',
 'bminv_SWIMPTAX',
 'bminv_TANYNTAX',
 'bminv_TANYPIND',
 'bminv_TANYPTAX',
 'bminv_TL01NTAX',
 'bminv_TL01PIND',
 'bminv_TL01PTAX',
 'bminv_TL23NTAX',
 'bminv_TL23PIND',
 'bminv_TL23PTAX',
 'bminv_TL45NTAX',
 'bminv_TL45PIND',
 'bminv_TL45PTAX',
 'bminv_TL67NTAX',
 'bminv_TL67PIND',
 'bminv_TL67PTAX',
 'bminv_TOLRNTAX',
 'bminv_TOLRPIND',
 'bminv_TOLRPTAX',
 'bminv_TOTLNIND',
 'bminv_TOTLNTAX',
 'bminv_TRICNTAX',
 'bminv_TRICPIND',
 'bminv_TRICPTAX',
 'bminv_TUBINAIDNTAX',
 'bminv_TUBINAIDPIND',
 'bminv_TUBINAIDPTAX',
 'bminv_WTD_TV', 
 'incidence_rate_per_100k', 
 '5yr_trend'
 ]

In [10]:
df_CDR = df.drop(columns=non_CDR_columns).copy()

In [12]:
df_CDR.dropna(inplace=True)

In [33]:
df_CDR['recent_trend_cat'].value_counts()

stable     1145
rising      506
falling      38
Name: recent_trend_cat, dtype: int64

In [30]:
df_duplicates = df_CDR[df_CDR['recent_trend_cat'] == 'rising'].copy()
df_CDR = pd.concat([df_CDR, df_duplicates])
y = df_CDR['recent_trend_cat'].copy()

## Baseline to beat

In [35]:
y.value_counts(normalize=True).max()

0.6779159265837774

In [36]:
X = df_CDR.select_dtypes(exclude=['object'])


In [37]:
oh = OneHotEncoder(
    sparse=False,
    dtype=int,
    categories="auto"
)

y = oh.fit_transform(y.values.reshape(-1, 1))

In [42]:
sc = StandardScaler()
X_sc = sc.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_sc, y, stratify= y, test_size=0.2, random_state=4)

model = Sequential()

model.add(Dense(1000, activation='relu'))
model.add(Dense(500, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['mae', 'acc'])
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.6, patience=5, verbose=1, mode='auto')

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=40,
    batch_size=64,

)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [43]:
preds = model.predict(X_sc)



In [44]:
preds_classes =np.argmax(preds,axis=1)

In [46]:
df_CDR['predictions'] = preds_classes

In [52]:
df_map_matrix = df_CDR[['fips', 'recent_trend_cat', 'predictions']]

In [56]:
df_map_matrix.to_csv('/content/drive/MyDrive/ga_data/map_matrix.csv')

In [47]:
np.unique(preds_classes, return_counts=True)

(array([0, 1, 2]), array([  33,  550, 1106]))

In [31]:
with open('/content/drive/MyDrive/ga_data/cdr_nih_20221121.pkl', 'wb') as pickle_out:
    pickle_out = pickle.dump(model, pickle_out)

In [None]:
with open('/content/drive/MyDrive/ga_data/cdr_nih_20221121.pkl', 'rb') as pickle_out:
    model = pickle.load(pickle_out)