In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

# Tensorflow imports below...
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, GRU
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping


## Import Data and filter to just the Water data (not CDR)
Given the large number of columns in this dataset, we'll focus on the numeric ones that are actually present.

In [3]:
# Load the full county-level dataset from the mounted Drive folder.
county_csv_path = '/content/drive/MyDrive/ga_data/full_county_dataset.csv'
df = pd.read_csv(county_csv_path)

In [5]:
# Column names pasted from a tab-separated header row.  The water list is long
# enough to wrap onto a second physical line, so it is held in a triple-quoted
# literal (a plain "..." string cannot contain a line break).  str.split()
# with no argument splits on any whitespace run, so tabs, the leading space
# and the embedded newline are all treated as separators.
water_string = """ hab_XBKA	hab_MEDBK_A	hab_XUN	hab_MEDBKUN	hab_XCDENMID	hab_XCDENBK	hab_CONPERCENT	hab_PCT_FA	hab_PCT_DR	hab_PCT_FAST	hab_PCT_SLOW	hab_PCT_POOL	hab_XWIDTH	hab_SDWIDTH	hab_XBKF_W	hab_XBKF_H	hab_XINC_H	hab_SDINC_H	hab_BFWD_RAT	hab_XWXD	hab_XWD_RAT	hab_SDWXD	hab_SDWD_RAT	hab_XDEPTH_CM	hab_SDDEPTH_CM	hab_XFC_ALG	hab_XFC_RCK	hab_XFC_BRS	hab_XFC_LVT	hab_XFC_AQM	hab_XFC_OHV	hab_XFC_HUM	hab_XFC_UCB	hab_XFC_LWD	hab_XFC_NAT	hab_XFC_BIG	hab_XFC_ALL	hab_PCT_SIDE	hab_REACHLEN	hab_W1_HAG	hab_W1_HNOAG	hab_W1_HALL	hab_W1H_BLDG	hab_W1H_LDFL	hab_W1H_LOG	hab_W1H_MINE	hab_W1H_PARK	hab_W1H_PSTR	hab_W1H_PVMT	hab_W1H_PIPE	hab_W1H_ROAD	hab_W1H_CROP	hab_W1H_WALL	hab_C1WM100	hab_C2WM100	hab_C4WM100	hab_V1WM100	hab_V1W_MSQ	hab_V2WM100	hab_V2W_MSQ	hab_V4WM100	hab_V4W_MSQ	hab_PCAN_C	hab_PCAN_D	hab_PCAN_E	hab_PCAN_M	hab_PCAN_N	hab_XCL	hab_XCS	hab_XMW	hab_XMH	hab_XGW	hab_XGH	hab_XGB	hab_XC	hab_XM	hab_XCMW	hab_XCM	hab_XG	hab_XCMGW	hab_XCMG	hab_XPCAN	hab_XPMID	hab_XPMGW	hab_XPCM	hab_XPCMG	hab_XSLOPE	hab_XSLOPE_MAP	hab_XSLOPE_FIELD	hab_PCTCLINOMETER	hab_XBEARING	hab_SINU	hab_LSUB_DMM	hab_LSUBD_SD	hab_LSUB_DMM_NOR	hab_PCT_FN	hab_PCT_GC	hab_PCT_GF	hab_PCT_HP	hab_PCT_OM	hab_PCT_OT	hab_PCT_RC	hab_PCT_SA	hab_PCT_WD	hab_PCT_BIGR	hab_PCT_BDRK	hab_PCT_SAFN	hab_PCT_SFGF	hab_PCT_ORG	hab_XEMBED	hab_XCEMBED	hab_RPXDEP_CM	hab_RPMXDEP_CM	hab_RPGT50	hab_RPGT75	hab_RP100	hab_LTEST	hab_LRBS_TST	hab_LDMB_BW5	hab_LRBS_BW5	hab_LDCBF_G08	hab_LRBS_G08	hab_PCT_SFG	hab_PCT_BH	hab_XSHOR2VG	hab_PCT_OVRB	hab_PCT_GL	hab_C1TM100	hab_C2TM100	hab_C4TM100	hab_PCT_GR	hab_RDIST1	hab_QR1	hab_CVWIDTH	hab_CVWXD	bminv_AMPHNTAX	bminv_AMPHPIND	bminv_AMPHPTAX	bminv_BURRNTAX	bminv_BURRPIND	bminv_BURRPTAX	bminv_CHIRDOM1PIND	bminv_CHIRDOM3PIND	bminv_CHIRDOM5PIND	bminv_CHIRNTAX	bminv_CHIRPIND	bminv_CHIRPTAX	bminv_CLMBNTAX	bminv_CLMBPIND	bminv_CLMBPTAX	bminv_CLNGNTAX	bminv_CLNGPIND	bminv_CLNGPTAX	bminv_COFINTAX	bminv_COFIPIND	bminv_COFIPTAX	bminv_COFITRICNTAX	bminv_COFITRICPIND	bminv_COFITRICPTAX	bminv_COGANTAX	bminv_COGAPIND	
bminv_COGAPTAX	bminv_CRUSNTAX	bminv_CRUSPIND	bminv_CRUSPTAX	bminv_DIPTNTAX	bminv_DIPTPIND	bminv_DIPTPTAX	bminv_DOM1PIND	bminv_DOM3PIND	bminv_DOM5PIND	bminv_EPHENTAX	bminv_EPHEPIND	bminv_EPHEPTAX	bminv_EPOTNTAX	bminv_EPOTPIND	bminv_EPOTPTAX	bminv_EPT_NTAX	bminv_EPT_PIND	bminv_EPT_PTAX	bminv_FACLNTAX	bminv_FACLPIND	bminv_FACLPTAX	bminv_HEMINTAX	bminv_HEMIPIND	bminv_HEMIPTAX	bminv_HPRIME	bminv_INTLNTAX	bminv_INTLPIND	bminv_INTLPTAX	bminv_MITENTAX	bminv_MITEPIND	bminv_MITEPTAX	bminv_MOLLNTAX	bminv_MOLLPIND	bminv_MOLLPTAX	bminv_NOINNTAX	bminv_NOINPIND	bminv_NOINPTAX	bminv_NTOLNTAX	bminv_NTOLPIND	bminv_NTOLPTAX	bminv_ODONNTAX	bminv_ODONPIND	bminv_ODONPTAX	bminv_OLLENTAX	bminv_OLLEPIND	bminv_OLLEPTAX	bminv_ORTHCHIRPIND	bminv_ORTHNTAX	bminv_ORTHPIND	bminv_ORTHPTAX	bminv_PLECNTAX	bminv_PLECPIND	bminv_PLECPTAX	bminv_PREDNTAX	bminv_PREDPIND	bminv_PREDPTAX	bminv_SCRPNTAX	bminv_SCRPPIND	bminv_SCRPPTAX	bminv_SHRDNTAX	bminv_SHRDPIND	bminv_SHRDPTAX	bminv_SPWLNTAX	bminv_SPWLPIND	bminv_SPWLPTAX	bminv_STOLNTAX	bminv_STOLPIND	bminv_STOLPTAX	bminv_SWIMNTAX	bminv_SWIMPIND	bminv_SWIMPTAX	bminv_TANYNTAX	bminv_TANYPIND	bminv_TANYPTAX	bminv_TL01NTAX	bminv_TL01PIND	bminv_TL01PTAX	bminv_TL23NTAX	bminv_TL23PIND	bminv_TL23PTAX	bminv_TL45NTAX	bminv_TL45PIND	bminv_TL45PTAX	bminv_TL67NTAX	bminv_TL67PIND	bminv_TL67PTAX	bminv_TOLRNTAX	bminv_TOLRPIND	bminv_TOLRPTAX	bminv_TOTLNIND	bminv_TOTLNTAX	bminv_TRICNTAX	bminv_TRICPIND	bminv_TRICPTAX	bminv_TUBINAIDNTAX	bminv_TUBINAIDPIND	bminv_TUBINAIDPTAX	bminv_WTD_TV"""
water_columns = water_string.split()

# Non-water columns kept alongside the water measurements; this list includes
# the `recent_trend_cat` column that a later cell uses as the target.
demo_string = "name	fips	age_over_65	state	pop2017	poverty	homeownership	multi_unit	unemployment_rate	metro	median_edu	median_hh_income	smoking_ban	incidence_rate_per_100k	recent_trend_cat	5yr_trend"
demo_columns = demo_string.split()

In [6]:
# Restrict the county frame to the water + demographic columns and keep only
# rows that have a value in every one of them.
features = [*water_columns, *demo_columns]
df_water = df[features].dropna().copy()

In [26]:
# Target: an independent copy of the trend-category column.
y = df_water.loc[:, 'recent_trend_cat'].copy()

In [27]:
# Share of rows in the most frequent class, i.e. the accuracy of always
# predicting that class.
class_shares = y.value_counts(normalize=True)
class_shares.max()

0.8159246575342466

In [10]:
# One-hot encode the target so it matches the softmax output layer and the
# categorical_crossentropy loss used by the model.
#
# The encoder is left at its default sparse output and densified with
# .toarray(): the `sparse=` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed `sparse_output`), so passing either spelling ties
# the cell to one range of versions.
oh = OneHotEncoder(
    dtype=int,
    categories="auto"
)
# Encode from the source column rather than from `y`, so re-running this cell
# does not try to re-encode an array that is already one-hot.
target_values = df_water['recent_trend_cat'].to_numpy().reshape(-1, 1)
y = oh.fit_transform(target_values).toarray()

In [14]:
# Identify vexing columns with unexplained infinite values: rank every numeric
# column by the largest entry in its describe() summary.  The maximum is taken
# over all summary rows, so the row count is the floor for each column.
summary_maxima = df_water.describe().max()
summary_maxima.sort_values(ascending=False)

pop2017             4652980.00
median_hh_income     129588.00
fips                  56045.00
hab_XWXD              38050.27
hab_SDWXD             10672.34
                       ...    
hab_PCT_OT             1168.00
hab_PCT_RC             1168.00
hab_PCT_SA             1168.00
hab_PCT_WD             1168.00
5yr_trend              1168.00
Length: 271, dtype: float64

In [12]:
# Remove the two ratio columns.  Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run: an in-place drop raises
# KeyError the second time because the columns are already gone.
df_water = df_water.drop(columns=['hab_XWD_RAT', 'hab_BFWD_RAT'], errors='ignore')

In [17]:
# Feature matrix: every column whose dtype is not `object`.
# NOTE(review): `fips` and `5yr_trend` are numeric and therefore pass this
# filter - confirm both are intended as model inputs.
X = df_water.select_dtypes(exclude='object')

AttributeError: ignored

In [22]:
# Dense neural network on the water data

# Split BEFORE scaling so the scaler's mean/std are learned from the training
# rows only; fitting it on the full matrix leaks test-row statistics into the
# features the model is trained on.  The split depends only on the number of
# rows, `y` and `random_state`, so the row assignment is unaffected.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=4
)

sc = StandardScaler()
X_train = sc.fit_transform(X_train_raw)
X_test = sc.transform(X_test_raw)
# Scaled copy of every row (same train-fitted scaler), kept under its original
# name for later cells that predict on the whole dataset.
X_sc = sc.transform(X)

model = Sequential()

# Wide input layer, then progressively narrower L2-regularised layers with
# increasing dropout.
model.add(Dense(1000, activation='relu', input_shape=(X_train.shape[1],)))
model.add(Dense(500, activation='relu', kernel_regularizer=l2(0.002)))
model.add(Dropout(.05))
model.add(Dense(256, activation='relu', kernel_regularizer=l2(0.002)))
model.add(Dropout(.1))
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.002)))
model.add(Dropout(.3))
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.002)))
model.add(Dropout(.5))
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.002)))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.002)))
# Three softmax outputs, one per one-hot target column.
model.add(Dense(3, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['mae', 'acc'])
# Stop once val_loss has failed to improve by at least 0.1 for 5 epochs.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0.1, patience=5, verbose=1, mode='auto')

# NOTE(review): the test split doubles as the validation set here, so the
# val_* metrics also drive early stopping and are not an untouched hold-out.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop]
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [155]:
# Class probabilities for every row of the scaled feature matrix.
preds = model.predict(x=X_sc)



In [157]:
# Index of the highest-probability class for each row.
preds_classes = np.argmax(a=preds, axis=1)

In [28]:
# Persist the trained model to Drive.
# NOTE(review): the file name says "cdr_nih" while this notebook trains on the
# water data - confirm the intended output name.  Keras also provides
# model.save() as its own serialisation format; consider it over pickle.
with open('/content/drive/MyDrive/ga_data/cdr_nih_20221121.pkl', 'wb') as pickle_out:
    # pickle.dump returns None, so its result must not be assigned back to the
    # open file handle.
    pickle.dump(model, pickle_out)