# Import modules
CDC data set description link [here](https://wwwn.cdc.gov/nchs/nhanes/search/datapage.aspx?Component=Questionnaire&CycleBeginYear=2015)

In [10]:
import pdb
import glob

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold, SelectFromModel, SelectKBest, mutual_info_classif, mutual_info_regression
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

import nhanes as nhanes

%matplotlib notebook

## Settings

In [2]:
DATA_PATH = 'C:/Users/allen/Documents/Git-Repos/Opportunistic/CDC/NHANES/'
DATASET = 'cancer'

### Note: 
The code below loads each dataset: dataset_features, dataset_targets

Here, all datasets are defined explicitly (see nhanes.py).
*Caution* Very long computation, proceed with care

In [3]:
ds = nhanes.Dataset(DATA_PATH)
ds.load_cancer()
n_fe = ds.features.shape[1]
n_classes = 2

Processing: Questionnaire\RHQ_B.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

Processing: Examination\AUXTYM_F.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

Processing: Laboratory\L13_2_B.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

Processing: Questionnaire\ECQ_I.XPT                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

                                                                                Processing: Questionnaire\FSQ.XPT                                                                                Processing: Questionnaire\FSQ_B.XPT                                                                                Processing: Questionnaire\FSQ_C.XPT                                                                                Processing: Questionnaire\FSQ_D.XPT                                                                                Processing: Questionnaire\FSQ_E.XPT                                                                                Processing: Questionnaire\FSQ_F.XPT                                                                                Processing: Questionnaire\FSQ_G.XPT                                                                                Processing: Questionnaire\FSQ_H.XPT                                                                  

                                                                                Processing: Questionnaire\ALQ_H.XPT                                                                                Processing: Questionnaire\AQQ_E.XPT                                                                                Processing: Questionnaire\AQQ_F.XPT                                                                                Processing: Questionnaire\ARQ_F.XPT                                                                                Processing: Questionnaire\AUQ.XPT                                                                                Processing: Questionnaire\AUQ_B.XPT                                                                                Processing: Questionnaire\AUQ_C.XPT                                                                                Processing: Questionnaire\AUQ_D.XPT                                                                  

                                                                                Processing: Questionnaire\OCQ_E.XPT                                                                                Processing: Questionnaire\OCQ_F.XPT                                                                                Processing: Questionnaire\OCQ_G.XPT                                                                                Processing: Questionnaire\OCQ_H.XPT                                                                                Processing: Questionnaire\OHQ.XPT                                                                                Processing: Questionnaire\OHQ_B.XPT                                                                                Processing: Questionnaire\OHQ_C.XPT                                                                                Processing: Questionnaire\OHQ_D.XPT                                                                  

                                                                                Processing: Questionnaire\DUQ_B.XPT                                                                                Processing: Questionnaire\DUQ_C.XPT                                                                                Processing: Questionnaire\DUQ_D.XPT                                                                                Processing: Questionnaire\DUQ_E.XPT                                                                                Processing: Questionnaire\DUQ_F.XPT                                                                                Processing: Questionnaire\DUQ_G.XPT                                                                                Processing: Questionnaire\DUQ_H.XPT                                                                                Processing: Questionnaire\ECQ.XPT                                                                  

                                                                                Processing: Questionnaire\PFQ_B.XPT                                                                                Processing: Questionnaire\PFQ_C.XPT                                                                                Processing: Questionnaire\PFQ_D.XPT                                                                                Processing: Questionnaire\PFQ_E.XPT                                                                                Processing: Questionnaire\PFQ_F.XPT                                                                                Processing: Questionnaire\PFQ_G.XPT                                                                                Processing: Questionnaire\PFQ_H.XPT                                                                                Processing: Questionnaire\PFQ_I.XPT                                                                

Processing: Questionnaire\DUQ_H.XPT                                                                                Processing: Questionnaire\ECQ.XPT                                                                                Processing: Questionnaire\ECQ_B.XPT                                                                                Processing: Questionnaire\ECQ_C.XPT                                                                                Processing: Questionnaire\ECQ_D.XPT                                                                                Processing: Questionnaire\ECQ_E.XPT                                                                                Processing: Questionnaire\ECQ_F.XPT                                                                                Processing: Questionnaire\ECQ_G.XPT                                                                                Processing: Questionnaire\ECQ_H.XPT                             

Processing: Questionnaire\SXQ_F.XPT                                                                                Processing: Questionnaire\SXQ_G.XPT                                                                                Processing: Questionnaire\SXQ_H.XPT                                                                                Processing: Questionnaire\SXQ_I.XPT                                                                                Processing: Questionnaire\TBQ.XPT                                                                                Processing: Questionnaire\TBQ_G.XPT                                                                                Processing: Questionnaire\VIQ.XPT                                                                                Processing: Questionnaire\VIQ_B.XPT                                                                                Processing: Questionnaire\VIQ_C.XPT                               

                                                                                Processing: Questionnaire\PAQ_D.XPT                                                                                Processing: Questionnaire\PAQ_E.XPT                                                                                Processing: Questionnaire\PAQ_F.XPT                                                                                Processing: Questionnaire\PAQ_G.XPT                                                                                Processing: Questionnaire\PAQ_H.XPT                                                                                Processing: Questionnaire\PAQ_I.XPT                                                                                Processing: Questionnaire\PFQ.XPT                                                                                Processing: Questionnaire\PFQ_B.XPT                                                                  

Processing: Questionnaire\CBQPFC_E.XPT                                                                                Processing: Questionnaire\CBQPFC_F.XPT                                                                                Processing: Questionnaire\CBQ_E.XPT                                                                                Processing: Questionnaire\CBQ_F.XPT                                                                                Processing: Questionnaire\CBQ_G.XPT                                                                                Processing: Questionnaire\CBQ_H.XPT                                                                                Processing: Questionnaire\CDQ.XPT                                                                                Processing: Questionnaire\CDQ_B.XPT                                                                                Processing: Questionnaire\CDQ_C.XPT                       

                                                                                Processing: Questionnaire\SMQFAM_B.XPT                                                                                Processing: Questionnaire\SMQFAM_C.XPT                                                                                Processing: Questionnaire\SMQFAM_D.XPT                                                                                Processing: Questionnaire\SMQFAM_E.XPT                                                                                Processing: Questionnaire\SMQFAM_F.XPT                                                                                Processing: Questionnaire\SMQFAM_G.XPT                                                                                Processing: Questionnaire\SMQFAM_H.XPT                                                                                Processing: Questionnaire\SMQFAM_I.XPT                                        

Processing: Questionnaire\CSQ_G.XPT                                                                                Processing: Questionnaire\CSQ_H.XPT                                                                                Processing: Questionnaire\DBQ.XPT                                                                                Processing: Questionnaire\DBQ_B.XPT                                                                                Processing: Questionnaire\DBQ_C.XPT                                                                                Processing: Questionnaire\DBQ_D.XPT                                                                                Processing: Questionnaire\DBQ_E.XPT                                                                                Processing: Questionnaire\DBQ_F.XPT                                                                                Processing: Questionnaire\DBQ_G.XPT                             

Processing: Questionnaire\SMQ_G.XPT                                                                                Processing: Questionnaire\SMQ_H.XPT                                                                                Processing: Questionnaire\SMQ_I.XPT                                                                                Processing: Questionnaire\SSQ.XPT                                                                                Processing: Questionnaire\SSQ_B.XPT                                                                                Processing: Questionnaire\SSQ_C.XPT                                                                                Processing: Questionnaire\SSQ_D.XPT                                                                                Processing: Questionnaire\SSQ_E.XPT                                                                                Processing: Questionnaire\SXQ.XPT                                

Processing: Questionnaire\INQ_E.XPT                                                                                Processing: Questionnaire\INQ_F.XPT                                                                                Processing: Questionnaire\INQ_G.XPT                                                                                Processing: Questionnaire\INQ_H.XPT                                                                                Processing: Questionnaire\INQ_I.XPT                                                                                Processing: Questionnaire\KIQ.XPT                                                                                Processing: Questionnaire\KIQ_P_B.XPT                                                                                Processing: Questionnaire\KIQ_P_C.XPT                                                                                Processing: Questionnaire\KIQ_P_D.XPT                       

Processing: Examination\BMX_C.XPT                                                                                Processing: Examination\BMX_D.XPT                                                                                Processing: Examination\BMX_E.XPT                                                                                Processing: Examination\BMX_F.XPT                                                                                Processing: Examination\BMX_G.XPT                                                                                Processing: Examination\BMX_H.XPT                                                                                Processing: Examination\BMX_I.XPT                                                                                Processing: Examination\BPX.XPT                                                                                Processing: Examination\BPX_B.XPT                                               

Processing: Examination\BMX_B.XPT                                                                                Processing: Examination\BMX_C.XPT                                                                                Processing: Examination\BMX_D.XPT                                                                                Processing: Examination\BMX_E.XPT                                                                                Processing: Examination\BMX_F.XPT                                                                                Processing: Examination\BMX_G.XPT                                                                                Processing: Examination\BMX_H.XPT                                                                                Processing: Examination\BMX_I.XPT                                                                                Processing: Examination\BPX.XPT                                               

Processing: Examination\BPX.XPT                                                                                Processing: Examination\BPX_B.XPT                                                                                Processing: Examination\BPX_C.XPT                                                                                Processing: Examination\BPX_D.XPT                                                                                Processing: Examination\BPX_E.XPT                                                                                Processing: Examination\BPX_F.XPT                                                                                Processing: Examination\BPX_G.XPT                                                                                Processing: Examination\BPX_H.XPT                                                                                Processing: Examination\BPX_I.XPT                                               

Processing: Examination\ARX_F.XPT                                                                                Processing: Examination\AUX1.XPT                                                                                Processing: Examination\AUXAR.XPT                                                                                Processing: Examination\AUXAR_B.XPT                                                                                Processing: Examination\AUXAR_C.XPT                                                                                Processing: Examination\AUXAR_D.XPT                                                                                Processing: Examination\AUXAR_E.XPT                                                                                Processing: Examination\AUXAR_F.XPT                                                                                Processing: Examination\AUXAR_G.XPT                                  

                                                                                Processing: Laboratory\DOXPOL_F.XPT                                                                                Processing: Laboratory\EPHPP_H.XPT                                                                                Processing: Laboratory\EPH_D.XPT                                                                                Processing: Laboratory\EPH_E.XPT                                                                                Processing: Laboratory\EPH_F.XPT                                                                                Processing: Laboratory\EPH_G.XPT                                                                                Processing: Laboratory\EPP_D.XPT                                                                                Processing: Laboratory\FASTQX_D.XPT                                                                                

                                                                                Processing: Laboratory\SSFLRT_H.XPT                                                                                Processing: Laboratory\SSFOL_A.XPT                                                                                Processing: Laboratory\SSHCVR_C.XPT                                                                                Processing: Laboratory\SSHCV_E.XPT                                                                                Processing: Laboratory\SSHEPC_H.XPT                                                                                Processing: Laboratory\SSHPV_F.XPT                                                                                Processing: Laboratory\SSHSV1_A.XPT                                                                                Processing: Laboratory\SSHSV1_B.XPT                                                                   

                                                                                Processing: Questionnaire\BHQ_F.XPT                                                                                Processing: Questionnaire\BPQ.XPT                                                                                Processing: Questionnaire\BPQ_B.XPT                                                                                Processing: Questionnaire\BPQ_C.XPT                                                                                Processing: Questionnaire\BPQ_D.XPT                                                                                Processing: Questionnaire\BPQ_E.XPT                                                                                Processing: Questionnaire\BPQ_F.XPT                                                                                Processing: Questionnaire\BPQ_G.XPT                                                                  

Processing: Questionnaire\SMQ_C.XPT                                                                                Processing: Questionnaire\SMQ_D.XPT                                                                                Processing: Questionnaire\SMQ_E.XPT                                                                                Processing: Questionnaire\SMQ_F.XPT                                                                                Processing: Questionnaire\SMQ_G.XPT                                                                                Processing: Questionnaire\SMQ_H.XPT                                                                                Processing: Questionnaire\SMQ_I.XPT                                                                                Processing: Questionnaire\SSQ.XPT                                                                                Processing: Questionnaire\SSQ_B.XPT                             

                                                                                Processing: Questionnaire\KIQ_P_B.XPT                                                                                Processing: Questionnaire\KIQ_P_C.XPT                                                                                Processing: Questionnaire\KIQ_P_D.XPT                                                                                Processing: Questionnaire\KIQ_P_E.XPT                                                                                Processing: Questionnaire\KIQ_U_B.XPT                                                                                Processing: Questionnaire\KIQ_U_C.XPT                                                                                Processing: Questionnaire\KIQ_U_D.XPT                                                                                Processing: Questionnaire\KIQ_U_E.XPT                                                

Processing: Questionnaire\DIQ_C.XPT                                                                                Processing: Questionnaire\DIQ_D.XPT                                                                                Processing: Questionnaire\DIQ_E.XPT                                                                                Processing: Questionnaire\DIQ_F.XPT                                                                                Processing: Questionnaire\DIQ_G.XPT                                                                                Processing: Questionnaire\DIQ_H.XPT                                                                                Processing: Questionnaire\DIQ_I.XPT                                                                                Processing: Questionnaire\DLQ_H.XPT                                                                                Processing: Questionnaire\DLQ_I.XPT                            

                                                                                Processing: Questionnaire\ACQ_F.XPT                                                                                Processing: Questionnaire\ACQ_G.XPT                                                                                Processing: Questionnaire\ACQ_H.XPT                                                                                Processing: Questionnaire\ACQ_I.XPT                                                                                Processing: Questionnaire\AGQ_D.XPT                                                                                Processing: Questionnaire\ALQ.XPT                                                                                Processing: Questionnaire\ALQY_F.XPT                                                                                Processing: Questionnaire\ALQ_B.XPT                                                                 

                                                                                Processing: Questionnaire\SLQ_H.XPT                                                                                Processing: Questionnaire\SMQ.XPT                                                                                Processing: Questionnaire\SMQFAM.XPT                                                                                Processing: Questionnaire\SMQFAM_B.XPT                                                                                Processing: Questionnaire\SMQFAM_C.XPT                                                                                Processing: Questionnaire\SMQFAM_D.XPT                                                                                Processing: Questionnaire\SMQFAM_E.XPT                                                                                Processing: Questionnaire\SMQFAM_F.XPT                                                  

Processing: Questionnaire\HSQ.XPT                                                                                Processing: Questionnaire\HSQ_B.XPT                                                                                Processing: Questionnaire\HSQ_C.XPT                                                                                Processing: Questionnaire\HSQ_D.XPT                                                                                Processing: Questionnaire\HSQ_E.XPT                                                                                Processing: Questionnaire\HSQ_F.XPT                                                                                Processing: Questionnaire\HSQ_G.XPT                                                                                Processing: Questionnaire\HSQ_H.XPT                                                                                Processing: Questionnaire\HSQ_I.XPT                             

                                                                                Processing: Questionnaire\SMQ_I.XPT                                                                                Processing: Questionnaire\SSQ.XPT                                                                                Processing: Questionnaire\SSQ_B.XPT                                                                                Processing: Questionnaire\SSQ_C.XPT                                                                                Processing: Questionnaire\SSQ_D.XPT                                                                                Processing: Questionnaire\SSQ_E.XPT                                                                                Processing: Questionnaire\SXQ.XPT                                                                                Processing: Questionnaire\SXQ_B.XPT                                                                    

                                                                                Processing: Questionnaire\HUQ_E.XPT                                                                                Processing: Questionnaire\HUQ_F.XPT                                                                                Processing: Questionnaire\HUQ_G.XPT                                                                                Processing: Questionnaire\HUQ_H.XPT                                                                                Processing: Questionnaire\HUQ_I.XPT                                                                                Processing: Questionnaire\IMQ.XPT                                                                                Processing: Questionnaire\IMQ_B.XPT                                                                                Processing: Questionnaire\IMQ_C.XPT                                                                  

Processing: Questionnaire\SMQ_G.XPT                                                                                Processing: Questionnaire\SMQ_H.XPT                                                                                Processing: Questionnaire\SMQ_I.XPT                                                                                Processing: Questionnaire\SSQ.XPT                                                                                Processing: Questionnaire\SSQ_B.XPT                                                                                Processing: Questionnaire\SSQ_C.XPT                                                                                Processing: Questionnaire\SSQ_D.XPT                                                                                Processing: Questionnaire\SSQ_E.XPT                                                                                Processing: Questionnaire\SXQ.XPT                               

Extract the features and targets.

In [24]:
features = ds.features.values
targets = ds.targets.values

indx = np.argwhere(targets != 3)
dataset_features = features[indx.flatten()]
dataset_targets = targets[indx.flatten()]

dataset_features = pd.DataFrame(dataset_features, columns=ds.features.columns)
dataset_targets = pd.DataFrame(dataset_targets, columns=ds.targets.columns, dtype='float64')
print(dataset_targets.shape)
# Pre-fix
# dataset_features = ds.features
# dataset_targets = ds.targets

(98908, 1)


## Preprocessing of Data
### Drop features with too-low variance

In [25]:
dataset_features_sel = dataset_features.copy()

# var_thresh = .001
# dropped_keys = []
# for key, value in dataset_features_sel.iteritems():
#     if(value.var() < var_thresh):
#         dataset_features_sel.drop(key, axis=1, inplace=True)
#         dropped_keys.append(key)

# print("Dropped %i keys, they were:\n %s" %(len(dropped_keys), dropped_keys))

### Calculate feature importance

In [33]:
importances = []
num_trees = 2
for i in range(num_trees):
    clf = ExtraTreesClassifier(n_estimators=50)
    clf = clf.fit(dataset_features_sel.values, dataset_targets.values.ravel())
    importances.append(clf.feature_importances_)

# print(importances)

#### Drop Threshhold helper function

In [47]:
def drop_threshold(threshold, df_data, df_threshold):
    mi = df_threshold # mutual information dataframe
    df = df_data # data containing dataframe
    to_keep = []
    # mi = mi.sort_values(by =0, axis=1) #sorts the values ascending
    # print(mi)
    var_counter = 0
    for col in mi:
        var_counter += mi[col].iloc[0]
        to_keep.append(col)
        if(var_counter > threshold):
            break
    if(len(to_keep) > 0):
        return pd.DataFrame(df[to_keep].copy(), dtype='float64')
    else:
        raise NotImplementedError


### Drop low importance features

In [50]:
importances_mean = np.array(importances)
importances_mean = np.mean(importances_mean, axis=0)
importances_mean = importances_mean.reshape((1, len(importances_mean)))
importances_mean = pd.DataFrame(importances_mean, columns=ds.features.columns, dtype='float64')
importances_mean = importances_mean.sort_values(by=0, axis=1, ascending=False) #sorts the values ascending

df_importances = drop_threshold(0.90, dataset_features_sel ,importances_mean)
print(df_importances.shape)

(98908, 39)


### Seperate one-hot encoded, discrete, and continuous features

In [6]:
# onehot encoded
dataset_features_onehot = dataset_features_sel.filter(regex=(".*#.*"))
# Continuous 
dataset_features_cont = dataset_features_sel.drop(columns=dataset_features_sel.filter(regex=(".*#.*")))

dataset_features_disc = []
for key, value in dataset_features_cont.iteritems():
    # discrete were normalized so they should have a mean of .5, and we expect 3 of them
    if(value.mean() >= .5):
        dataset_features_disc.append(dataset_features_cont[key])
        dataset_features_cont.drop(key, axis=1, inplace=True)
# discrete
dataset_features_disc = pd.concat(dataset_features_disc, axis=1)
# print(dataset_features_sel.shape)
# print(dataset_features_cont.shape)
# print(dataset_features_onehot.shape)
# print(dataset_features_disc.shape)

### Calculate mutual information
*Caution* Very long computation, proceed with care

In [7]:
dataset_features_sel_vals = dataset_features_sel.values
dataset_targets_vals = np.ravel(dataset_targets.values) # unroll the column to the correct vector shape
# dfs = [dataset_features_cont.copy(), dataset_features_disc.copy(), dataset_features_onehot.copy()]
# retrieve values from dfs_out for subsequent steps if this comes after PCA
# dataset_features_cont = dfs_out[0]
# dataset_features_disc = dfs_out[1]
# dataset_features_onehot = dfs_out[2]

print('0% [---] 100%', end='\r')
mutual_info_cont = mutual_info_regression(dataset_features_cont.values, dataset_targets_vals)
print('0% [+--] 100%', end='\r')
mutual_info_disc = mutual_info_regression(dataset_features_disc.values, dataset_targets_vals)
print('0% [++-] 100%', end='\r')
mutual_info_onehot = mutual_info_classif(dataset_features_onehot.values, dataset_targets_vals)
print('0% [+++] 100%', end='\r')

0% [---] 100%

MemoryError: 

### Convert mutual info into dataframes and inspect

In [None]:
if(mutual_info_cont.shape[0] != 1): # we only need to run this portion of the block once
    mutual_info_cont = mutual_info_cont.reshape((1, len(mutual_info_cont)))
    mutual_info_disc = mutual_info_disc.reshape((1, len(mutual_info_disc))) 
    mutual_info_onehot = mutual_info_onehot.reshape((1, len(mutual_info_onehot))) 


mutual_info_cont_df = pd.DataFrame(data=mutual_info_cont, columns=dataset_features_cont.columns)
mutual_info_disc_df = pd.DataFrame(data=mutual_info_disc, columns=dataset_features_disc.columns)
mutual_info_onehot_df = pd.DataFrame(data=mutual_info_onehot, columns=dataset_features_onehot.columns)

### Dropping features with low mutual information
    __Conditioned on target variable__

In [None]:
mi_dfs = [mutual_info_cont_df, mutual_info_disc_df, mutual_info_onehot_df]
dfs = [dataset_features_cont, dataset_features_disc, dataset_features_onehot]
dfs_out = []

for i in range(len(dfs)):
    mi = mi_dfs[i] # mutual information dataframe
    df = dfs[i] # data containing dataframe
    to_drop = []
    mean = 0.25*mi.mean(axis=1)
    
    # mi = mi.sort_values(by =0, axis=1) #sorts the values ascending
    # print(mi)
    for col in mi:
        if(mi[col].iloc[0] < mean).all():
            to_drop.append(col)
    if(len(to_drop) > 0):
        dfs_out.append(pd.DataFrame(df.drop(columns=to_drop)))
    
dfs_out = pd.concat(dfs_out, axis=1)
# Retained columns
print(dfs_out.columns)
# output shape
print(dfs_out.shape)

### Calculate PCA to drop features

In [None]:
# dfs = [dataset_features_cont.copy(), dataset_features_disc.copy(), dataset_features_onehot.copy()]
pca_dfs = []
gfs = 15 # Number of features to graph
gfs -= 1
for df in dfs_out:    
    pca = PCA() # no args, so keep all columns, n_components=2
    pca.fit(df)
    var_ratio = pca.explained_variance_ratio_
    var_ratio = var_ratio.reshape((1, len(var_ratio)))
    pca_dfs.append(pd.DataFrame(data=var_ratio, columns=df.columns))
    # Visualize PCA:
    idx = [i for i in range(len(df.columns))]
    cols = [i for i in df.columns]
    plt.bar(idx[0:gfs], pca.explained_variance_ratio_[0:gfs])
    plt.xticks(idx[0:gfs], df.columns[0:gfs], rotation='vertical')
    plt.xlabel('Features')
    plt.ylabel('Variance')
    plt.show()


### Remove columns that account for too little of the variance

In [None]:
dfs_out = []

for i in range(len(dfs)):
    mi = pca_dfs[i] # mutual information dataframe
    df = dfs[i] # data containing dataframe
    to_keep = []
    var_threshold = 0.80
    # mi = mi.sort_values(by =0, axis=1) #sorts the values ascending
    # print(mi)
    var_counter = 0
    for col in mi:
        var_counter += mi[col].iloc[0]
        to_keep.append(col)
        if(var_counter > var_threshold):
            break
    if(len(to_keep) > 0):
        dfs_out.append(pd.DataFrame(df[to_keep].copy(), dtype='float64'))

for df in dfs_out:
    print(df.shape)

## Train/Test Separation

In [None]:
perm = np.random.permutation(dataset_targets.shape[0])
dataset_features = dfs_out.values[perm]
dataset_targets = dataset_targets_vals[perm]

print("dataset_features Shape: %s, dataset_targets Shape: %s" % (dataset_features.shape, dataset_targets.shape))

def get_batch(n_size, phase):
    # select indices
    n_samples = dataset_features.shape[0]
    n_classes = int(dataset_targets.max() + 1)
    if phase == 'test':
        inds_sel = np.arange(0, int(n_samples*0.15), 1)
    elif phase == 'validation':
        n_samples = dataset_features.shape[0]
        inds_sel = np.arange(int(n_samples*0.15), int(n_samples*0.30), 1)
    elif phase == 'train':
        n_samples = dataset_features.shape[0]
        inds_sel = np.arange(int(n_samples*0.30), n_samples, 1)
    else:
        raise NotImplementedError
    inds_sel = np.random.permutation(inds_sel)
    batch_inds = []
    for cl in range(n_classes):
        inds_cl = inds_sel[dataset_targets[inds_sel] == cl]
        batch_inds.extend(inds_cl[:n_size//n_classes])
    batch_inds = np.random.permutation(batch_inds)
    
    return dataset_features[batch_inds], dataset_targets[batch_inds]

## Classification

In [None]:
trials = 30
accu_tst_RFC = []
accu_tst_SVC = []
accu_tst_LR = []

for i in range(trials):
    print('0% [',i*'.',(trials-i)*'-','] 100%', end='\r')
    features_trn, targets_trn = get_batch(n_size=5000, phase='train')
    features_tst, targets_tst = get_batch(n_size=1000, phase='test')

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(features_trn, targets_trn)
    preds_tst = clf.predict(features_tst)
    accu = np.mean(preds_tst==targets_tst)
    # print('accu_tst_RFC', accu)
    accu_tst_RFC.append(accu)

    clf = SVC(gamma='auto')
    clf.fit(features_trn, targets_trn)
    preds_tst = clf.predict(features_tst)
    accu = np.mean(preds_tst==targets_tst)
    # print('accu_tst_SVC', accu)
    accu_tst_SVC.append(accu)

    clf = LogisticRegression(solver='lbfgs', max_iter=200, multi_class='auto')
    clf.fit(features_trn, targets_trn)
    preds_tst = clf.predict(features_tst)
    accu = np.mean(preds_tst==targets_tst)
    # print('accu_tst_LR', accu)
    accu_tst_LR.append(accu)
    
accu_tst_RFC = np.array(accu_tst_RFC)
accu_tst_SVC = np.array(accu_tst_SVC)
accu_tst_LR = np.array(accu_tst_LR)

print('accu_tst_RFC %s, accu_tst_SVC %s, accu_tst_LR %s'
      % (np.mean(accu_tst_RFC), np.mean(accu_tst_SVC), np.mean(accu_tst_LR)))

In [None]:
print(classification_report(targets_tst, preds_tst))

### Accuracies from baseline: 
#### Cancer (ds.load_cancer()):
* accu_tst_RFC 0.758
* accu_tst_SVC 0.759
* accu_tst_LR 0.768