# Import modules
The CDC NHANES data sets used in this notebook are described [here](https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&CycleBeginYear=2015).

In [1]:
import glob
import os
import pdb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, SelectFromModel, SelectKBest, mutual_info_classif, mutual_info_regression
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import nhanes as nhanes

%matplotlib notebook

## Settings

In [2]:
# Root directory of the NHANES files handed to nhanes.Dataset.
# Set the NHANES_DATA_PATH environment variable to point at your own copy;
# the fallback is the original author's local checkout.
DATA_PATH = os.environ.get(
    'NHANES_DATA_PATH',
    'C:/Users/allen/Documents/Git-Repos/Opportunistic/CDC/NHANES/',
)
# The default ends with a separator; enforce the same for overridden values.
if not DATA_PATH.endswith(('/', '\\')):
    DATA_PATH += '/'

# Name of the dataset/task to analyse.
DATASET = 'cancer'

### Note: 
The code below loads the selected dataset into two variables: `dataset_features` and `dataset_targets`.

Here, all datasets are defined explicitly (see nhanes.py).

In [3]:
# Build the dataset object and run the loader that matches the DATASET setting
# (DATASET = 'cancer' calls ds.load_cancer()), so changing the setting actually
# changes what is loaded instead of being silently ignored.
ds = nhanes.Dataset(DATA_PATH)
loader_name = 'load_' + DATASET
loader = getattr(ds, loader_name, None)
if loader is None:
    raise ValueError(
        "Unknown DATASET %r: nhanes.Dataset has no method %s()" % (DATASET, loader_name)
    )
loader()

# Expose the loaded feature matrix and target vector under notebook-level names.
dataset_features = ds.features
dataset_targets = ds.targets

# Number of feature columns, and the (fixed) number of target classes.
n_fe = dataset_features.shape[1]
n_classes = 2

Processing: Questionnaire\RXQ_RX.XPTT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

Processing: Examination\AUXTYM_C.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

Processing: Laboratory\L28PBE_C.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

Processing: Questionnaire\KIQ_P_C.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   

                                                                                Processing: Questionnaire\KIQ_P_D.XPT                                                                                Processing: Questionnaire\KIQ_P_E.XPT                                                                                Processing: Questionnaire\KIQ_U_B.XPT                                                                                Processing: Questionnaire\KIQ_U_C.XPT                                                                                Processing: Questionnaire\KIQ_U_D.XPT                                                                                Processing: Questionnaire\KIQ_U_E.XPT                                                                                Processing: Questionnaire\KIQ_U_F.XPT                                                                                Processing: Questionnaire\KIQ_U_G.XPT                                                

                                                                                Processing: Questionnaire\BAQ.XPT                                                                                Processing: Questionnaire\BAQ_B.XPT                                                                                Processing: Questionnaire\BAQ_C.XPT                                                                                Processing: Questionnaire\BHQ_D.XPT                                                                                Processing: Questionnaire\BHQ_E.XPT                                                                                Processing: Questionnaire\BHQ_F.XPT                                                                                Processing: Questionnaire\BPQ.XPT                                                                                Processing: Questionnaire\BPQ_B.XPT                                                                    

                                                                                Processing: Questionnaire\RXQ_RX.XPT                                                                                Processing: Questionnaire\RXQ_RX_B.XPT                                                                                Processing: Questionnaire\RXQ_RX_C.XPT                                                                                Processing: Questionnaire\RXQ_RX_D.XPT                                                                                Processing: Questionnaire\RXQ_RX_E.XPT                                                                                Processing: Questionnaire\RXQ_RX_F.XPT                                                                                Processing: Questionnaire\RXQ_RX_G.XPT                                                                                Processing: Questionnaire\RXQ_RX_H.XPT                                          

Processing: Questionnaire\CBQ_F.XPT                                                                                Processing: Questionnaire\CBQ_G.XPT                                                                                Processing: Questionnaire\CBQ_H.XPT                                                                                Processing: Questionnaire\CDQ.XPT                                                                                Processing: Questionnaire\CDQ_B.XPT                                                                                Processing: Questionnaire\CDQ_C.XPT                                                                                Processing: Questionnaire\CDQ_D.XPT                                                                                Processing: Questionnaire\CDQ_E.XPT                                                                                Processing: Questionnaire\CDQ_F.XPT                              

                                                                                Processing: Questionnaire\RXQ_RX_D.XPT                                                                                Processing: Questionnaire\RXQ_RX_E.XPT                                                                                Processing: Questionnaire\RXQ_RX_F.XPT                                                                                Processing: Questionnaire\RXQ_RX_G.XPT                                                                                Processing: Questionnaire\RXQ_RX_H.XPT                                                                                Processing: Questionnaire\SLQ_D.XPT                                                                                Processing: Questionnaire\SLQ_E.XPT                                                                                Processing: Questionnaire\SLQ_F.XPT                                                 

Processing: Questionnaire\FSQ_C.XPT                                                                                Processing: Questionnaire\FSQ_D.XPT                                                                                Processing: Questionnaire\FSQ_E.XPT                                                                                Processing: Questionnaire\FSQ_F.XPT                                                                                Processing: Questionnaire\FSQ_G.XPT                                                                                Processing: Questionnaire\FSQ_H.XPT                                                                                Processing: Questionnaire\HCQ_B.XPT                                                                                Processing: Questionnaire\HCQ_C.XPT                                                                                Processing: Questionnaire\HCQ_D.XPT                            

Processing: Questionnaire\WHQ_H.XPT                                                                                                                                                                                                                                                 

## Preprocessing of Data

In [None]:
# Always start from the freshly loaded data (ds.features / ds.targets) rather than
# the notebook-level dataset_features / dataset_targets, which a later cell
# overwrites with shuffled copies; this keeps the cell re-runnable.
print(ds.features.shape)

# Trim low variance features (87 of 119 columns remain in the recorded run).
sel = VarianceThreshold(threshold=.05)
dataset_features_sel = sel.fit_transform(ds.features)
print("Original Shape: %s, New Shape: %s" % (ds.features.shape, dataset_features_sel.shape))

# The targets are class labels, so use the classification estimator of mutual
# information; mutual_info_regression treats the target as continuous.
# random_state pins the estimator's internal randomness for repeatable scores.
mutual_info = mutual_info_classif(dataset_features_sel, ds.targets, random_state=0)

(49509, 119)
Original Shape: (49509, 119), New Shape: (49509, 87)


### Dropping features with low mutual information

In [None]:
# One row holding the mutual-information score of every remaining feature;
# column label i corresponds to column i of dataset_features_sel.
mi_df = pd.DataFrame(mutual_info).transpose()

# True for features informative enough to keep.
selector = mi_df > .003

# Drop the complement of `selector`: features at or below the threshold.
# Column labels are read straight from the frame, which avoids the off-by-one
# that itertuples() introduces by placing the row index at position 0.
to_drop = selector.columns[~selector.iloc[0].to_numpy()].tolist()

mu_features_sel = pd.DataFrame(dataset_features_sel).drop(columns=to_drop)
print(mu_features_sel.shape)

## Train/Test Separation

In [None]:
# Seed NumPy's global RNG so the shuffle (and any later np.random draws) is repeatable.
np.random.seed(0)
perm = np.random.permutation(ds.targets.shape[0])

# Index both arrays from their unshuffled sources with the SAME permutation.
# Re-indexing the already shuffled dataset_targets on a re-run would stack
# permutations and misalign labels with feature rows.
# NOTE(review): mu_features_sel (the mutual-information selection) is not used
# here; the variance-thresholded matrix is - confirm this is intended.
dataset_features = dataset_features_sel[perm]
dataset_targets = ds.targets[perm]

print("dataset_features Shape: %s, dataset_targets Shape: %s" % (dataset_features.shape, dataset_targets.shape))

def get_batch(n_size, phase):
    """Draw a random, class-balanced batch from one split of the global dataset.

    Rows of the module-level ``dataset_features`` / ``dataset_targets`` are
    partitioned by position: the first 15% form the test split, the next 15%
    the validation split and the remaining 70% the training split.

    Parameters
    ----------
    n_size : int
        Requested batch size. Each class contributes at most
        ``n_size // n_classes`` rows, so the batch can be smaller than
        ``n_size`` when a class is under-represented in the split.
    phase : str
        One of ``'train'``, ``'validation'`` or ``'test'``.

    Returns
    -------
    tuple
        ``(features, targets)`` for the selected rows, in random order.

    Raises
    ------
    NotImplementedError
        If ``phase`` is not one of the supported split names.
    """
    n_samples = dataset_features.shape[0]
    # Classes are assumed to be labelled 0 .. max(target).
    n_classes = int(dataset_targets.max() + 1)

    test_end = int(n_samples * 0.15)
    validation_end = int(n_samples * 0.30)
    split_bounds = {
        'test': (0, test_end),
        'validation': (test_end, validation_end),
        'train': (validation_end, n_samples),
    }
    if phase not in split_bounds:
        raise NotImplementedError("unsupported phase: %r" % (phase,))
    start, stop = split_bounds[phase]

    # Shuffle the split so the per-class slices below are random samples.
    inds_sel = np.random.permutation(np.arange(start, stop, 1))
    targets_sel = dataset_targets[inds_sel]
    per_class = n_size // n_classes

    # Concatenating integer arrays keeps an integer dtype even when nothing is
    # selected, so an empty batch indexes cleanly instead of raising IndexError.
    batch_inds = np.concatenate(
        [inds_sel[targets_sel == cl][:per_class] for cl in range(n_classes)]
    )
    batch_inds = np.random.permutation(batch_inds)

    return dataset_features[batch_inds], dataset_targets[batch_inds]
    
# Sample class-balanced batches: up to 5000 rows from the training split
# and up to 1000 rows from the test split.
features_trn, targets_trn = get_batch(5000, 'train')
features_tst, targets_tst = get_batch(1000, 'test')

## Classification

In [None]:
# Baseline classifiers, each paired with the label printed next to its test accuracy.
baselines = [
    ('accu_tst_RFC', RandomForestClassifier(n_estimators=100)),
    ('accu_tst_SVC', SVC(gamma='auto')),
    ('accu_tst_LR', LogisticRegression(solver='lbfgs', max_iter=200)),
]

# Fit every model on the training batch and score it on the test batch.
# After the loop, clf / preds_tst / accu refer to the last model (logistic regression).
for label, clf in baselines:
    clf.fit(features_trn, targets_trn)
    preds_tst = clf.predict(features_tst)
    accu = np.mean(preds_tst == targets_tst)
    print(label, accu)


### Accuracies from baseline: 
#### Cancer (ds.load_cancer()):
* accu_tst_RFC 0.758
* accu_tst_SVC 0.759
* accu_tst_LR 0.768

#### Arthritis (ds.load_arthiritis()):
* accu_tst_RFC 0.753
* accu_tst_SVC 0.754
* accu_tst_LR 0.773