# Import modules
A description of the CDC NHANES data sets is available [here](https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&CycleBeginYear=2015).

In [1]:
import glob
import os
import pdb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import nhanes as nhanes

%matplotlib notebook

## Settings

In [2]:
# Root directory of the NHANES files. The NHANES_DATA_PATH environment
# variable overrides the default so the notebook can run on machines other
# than the one the default path was written for.
DATA_PATH = os.environ.get(
    'NHANES_DATA_PATH',
    'C:/Users/allen/Documents/Git-Repos/Opportunistic/CDC/NHANES/')

# Name of the task under study.
# NOTE(review): the loading cell calls ds.load_cancer() directly and never
# reads this value -- confirm which task is intended.
DATASET = 'arthritis'

# Seed NumPy's global generator so that the row shuffle and the random batch
# selection further down give the same result on every Restart & Run All.
SEED = 0
np.random.seed(SEED)

### Note: 
The code below loads the selected dataset and exposes it as two variables: `dataset_features` and `dataset_targets`.

Here, all datasets are defined explicitly (see nhanes.py).

In [3]:
# Create the NHANES dataset wrapper for the files under DATA_PATH and run its
# cancer loader.
# NOTE(review): the saved output of this cell ends with
# "Exception: Failed to process MCQ365a" -- confirm the load completes.
# NOTE(review): DATASET is set to 'arthritis' in the settings cell, yet the
# cancer loader is called here -- confirm which one is intended.
ds = nhanes.Dataset(DATA_PATH)
ds.load_cancer()

# Publish the loaded data under the module-level names that get_batch reads.
dataset_features, dataset_targets = ds.features, ds.targets

# Size of the second axis of the feature matrix, and the fixed class count.
n_fe, n_classes = dataset_features.shape[1], 2

Processing: Dietary\FFQRAW_C.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

Processing: Laboratory\WPIN_D.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

                                                                                Processing: Laboratory\ALB_CR_D.XPT                                                                                Processing: Laboratory\ALB_CR_E.XPT                                                                                Processing: Laboratory\ALB_CR_F.XPT                                                                                Processing: Laboratory\ALB_CR_G.XPT                                                                                Processing: Laboratory\ALB_CR_H.XPT                                                                                Processing: Laboratory\ALB_CR_I.XPT                                                                                Processing: Laboratory\ALDUST_D.XPT                                                                                Processing: Laboratory\AL_IGE_D.XPT                                                                

                                                                                Processing: Laboratory\GLU_D.XPT                                                                                Processing: Laboratory\GLU_E.XPT                                                                                Processing: Laboratory\GLU_F.XPT                                                                                Processing: Laboratory\GLU_G.XPT                                                                                Processing: Laboratory\GLU_H.XPT                                                                                Processing: Laboratory\HCAAS_H.XPT                                                                                Processing: Laboratory\HCAA_H.XPT                                                                                Processing: Laboratory\HCY_D.XPT                                                                                Pro

                                                                                Processing: Laboratory\HDL_F.XPT                                                                                Processing: Laboratory\HDL_G.XPT                                                                                Processing: Laboratory\HDL_H.XPT                                                                                Processing: Laboratory\HDL_I.XPT                                                                                Processing: Laboratory\HEPA_D.XPT                                                                                Processing: Laboratory\HEPA_E.XPT                                                                                Processing: Laboratory\HEPA_F.XPT                                                                                Processing: Laboratory\HEPA_G.XPT                                                                                Pr

                                                                                Processing: Laboratory\UM_H.XPT                                                                                Processing: Laboratory\UPHOPM_E.XPT                                                                                Processing: Laboratory\UPHOPM_F.XPT                                                                                Processing: Laboratory\UPP_D.XPT                                                                                Processing: Laboratory\UPP_E.XPT                                                                                Processing: Laboratory\UTASS_H.XPT                                                                                Processing: Laboratory\UTAS_H.XPT                                                                                Processing: Laboratory\UVOCS_G.XPT                                                                              

                                                                                Processing: Laboratory\L39_B.XPT                                                                                Processing: Laboratory\L40FE_B.XPT                                                                                Processing: Laboratory\L40FE_C.XPT                                                                                Processing: Laboratory\L40T4_B.XPT                                                                                Processing: Laboratory\L40_2_B.XPT                                                                                Processing: Laboratory\L40_B.XPT                                                                                Processing: Laboratory\L40_C.XPT                                                                                Processing: Laboratory\L43_C.XPT                                                                                

                                                                                Processing: Laboratory\UIO_H.XPT                                                                                Processing: Laboratory\UMS_H.XPT                                                                                Processing: Laboratory\UM_H.XPT                                                                                Processing: Laboratory\UPHOPM_E.XPT                                                                                Processing: Laboratory\UPHOPM_F.XPT                                                                                Processing: Laboratory\UPP_D.XPT                                                                                Processing: Laboratory\UPP_E.XPT                                                                                Processing: Laboratory\UTASS_H.XPT                                                                                

                                                                                Processing: Laboratory\VID_G.XPT                                                                                Processing: Laboratory\VITAEC_D.XPT                                                                                Processing: Laboratory\VITB12_G.XPT                                                                                Processing: Laboratory\VITB12_H.XPT                                                                                Processing: Laboratory\VIT_2_B.XPT                                                                                Processing: Laboratory\VIT_B6_D.XPT                                                                                Processing: Laboratory\VIT_B6_E.XPT                                                                                Processing: Laboratory\VIT_B6_F.XPT                                                                    

Processing: Questionnaire\WHQ_H.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                

Exception: Failed to process MCQ365a

## Train/Test Separation

In [None]:
# Shuffle the rows once, applying the same permutation to both arrays so each
# feature row stays aligned with its target. get_batch carves its splits out
# of contiguous row ranges, so this is what makes those splits random.
perm = np.random.permutation(dataset_targets.shape[0])
dataset_features, dataset_targets = dataset_features[perm], dataset_targets[perm]

def get_batch(n_size, phase):
    """Draw a random, class-balanced batch from one of the data splits.

    The splits are contiguous row ranges of the module-level
    ``dataset_features`` / ``dataset_targets`` arrays: the first 15% of the
    rows form 'test', the next 15% form 'validation' and the remaining rows
    form 'train'. Class labels are taken to be the integers
    ``0 .. dataset_targets.max()``.

    Args:
        n_size: Requested batch size. Every class contributes at most
            ``n_size // n_classes`` rows, so the batch is smaller than
            ``n_size`` when ``n_size`` is not a multiple of the class count
            or when a class has too few rows in the chosen split.
        phase: One of 'train', 'validation' or 'test'.

    Returns:
        Tuple ``(features, targets)`` with the selected rows in random order.
        Both are empty (zero rows) when no row could be selected.

    Raises:
        NotImplementedError: If ``phase`` is not one of the known splits.
    """
    n_samples = dataset_features.shape[0]
    n_classes = int(dataset_targets.max() + 1)

    # Row boundaries between the test / validation / train ranges.
    test_end = int(n_samples * 0.15)
    validation_end = int(n_samples * 0.30)
    if phase == 'test':
        inds_sel = np.arange(0, test_end, 1)
    elif phase == 'validation':
        inds_sel = np.arange(test_end, validation_end, 1)
    elif phase == 'train':
        inds_sel = np.arange(validation_end, n_samples, 1)
    else:
        raise NotImplementedError('unknown phase: {!r}'.format(phase))

    # Randomise the order inside the split so that the per-class head taken
    # below is a random sample of that class.
    inds_sel = np.random.permutation(inds_sel)
    batch_inds = []
    for cl in range(n_classes):
        inds_cl = inds_sel[dataset_targets[inds_sel] == cl]
        batch_inds.extend(inds_cl[:n_size // n_classes])

    # Force an integer dtype: an empty Python list would otherwise become a
    # float64 array, which NumPy refuses to use as an index.
    batch_inds = np.random.permutation(np.asarray(batch_inds, dtype=np.intp))

    return dataset_features[batch_inds], dataset_targets[batch_inds]
    
# Class-balanced batches: up to 5000 rows from the training split for fitting
# and up to 1000 rows from the test split for evaluation.
features_trn, targets_trn = get_batch(phase='train', n_size=5000)
features_tst, targets_tst = get_batch(phase='test', n_size=1000)

## Classification

In [None]:
# Baseline models, listed as (label printed with the score, estimator class,
# constructor keyword arguments). Each estimator is built only when its turn
# comes, then fitted on the training batch and scored on the test batch.
baseline_specs = (
    ('accu_tst_RFC', RandomForestClassifier, {'n_estimators': 100}),
    ('accu_tst_SVC', SVC, {'gamma': 'auto'}),
    ('accu_tst_LR', LogisticRegression, {'solver': 'lbfgs', 'max_iter': 200}),
)

for score_label, estimator_cls, estimator_kwargs in baseline_specs:
    clf = estimator_cls(**estimator_kwargs)
    clf.fit(features_trn, targets_trn)
    preds_tst = clf.predict(features_tst)
    # Accuracy = fraction of test rows whose prediction equals the target.
    accu = np.mean(preds_tst == targets_tst)
    print(score_label, accu)


### Accuracies from baseline: 
#### Cancer (ds.load_cancer()):
* accu_tst_RFC 0.758
* accu_tst_SVC 0.759
* accu_tst_LR 0.768

#### Arthritis (ds.load_arthiritis()):
* accu_tst_RFC 0.753
* accu_tst_SVC 0.754
* accu_tst_LR 0.773