In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 27 10:52:36 2017

@author: jh2475
"""

import os
import numpy as np
import pandas as pd
from sklearn.neural_network import MLPClassifier
import sklearn.grid_search
from sklearn.metrics import roc_auc_score, f1_score, confusion_matrix
import datetime

In [2]:
# Run configuration for the notebook.
# (Historical alternative, kept disabled: change the working directory instead
#  of using absolute fold paths.)
#os.chdir('/data/Projects/ACC_NCDR/NCDR/BJMDATA/ACTION/NCHData')

# One directory per multiply-imputed fold, numbered 1 through 5.
folds = [
    f'/data/Projects/ACC_NCDR/NCDR/BJMDATA/ACTION/NCHData/multiple_imputed/fold_{fold}'
    for fold in (1, 2, 3, 4, 5)
]

# File-name suffixes of the data sets that are read from every fold directory.
modes = [
    'mcnamara',
    'expanded',
]

# Switch for the optional F1-based decision-threshold search in the main loop.
calc_best_thres = False

In [2]:
def _fold_number(fold_dir):
    """Return the integer formed by the trailing digits of a fold directory path.

    The whole run of trailing digits is used (after ignoring trailing slashes),
    so 'fold_3' gives 3 and 'fold_12' gives 12.

    Raises
    ------
    ValueError
        If the path does not end in at least one digit.
    """
    trimmed = fold_dir.rstrip('/')
    digits = trimmed[len(trimmed.rstrip('0123456789')):]
    return int(digits)


def _split_features(frame):
    """Split a loaded table into predictors, outcome labels and row identifiers.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing the columns 'DCStatus' and 'IDX' plus the predictors.

    Returns
    -------
    tuple
        ``(features, labels, ids)``: ``frame`` without 'DCStatus' and 'IDX',
        the 'DCStatus' column, and the 'IDX' column.
    """
    labels = frame['DCStatus']
    ids = frame['IDX']
    return frame.drop(['DCStatus', 'IDX'], axis = 1), labels, ids


# For every fold and every data set variant: fit an MLP on set A, score sets B
# and C, and write the B and C tables back to the fold directory with the
# predicted probability appended as column 'NN'.
for fold in folds:
    print(f'Starting fold {fold} at {datetime.datetime.now()}')

    # Seed is a function of the fold number: (n + 2) * n. The number is parsed
    # from all trailing digits of the path, not just its last character.
    fold_seed = (_fold_number(fold) + 2) * _fold_number(fold)

    for mode in modes:
        print(f'Loading data for {mode} dataset fold {fold}.')
        trainA = pd.read_csv(f'{fold}/trainA_{mode}.csv')
        trainB = pd.read_csv(f'{fold}/trainB_{mode}.csv')
        trainC = pd.read_csv(f'{fold}/trainC_{mode}.csv')

        # Seeds NumPy's global RNG. It was introduced for the (currently
        # disabled) sub-sampling below; the classifier is built without a
        # random_state, which per the scikit-learn docs means it also draws
        # from this global RNG.
        np.random.seed(fold_seed)
        #indicesA = np.random.rand(len(trainA)) <= 0.33
        #indicesB = np.random.rand(len(trainB)) <= 0.33
        #indicesC = np.random.rand(len(trainC)) <= 0.33
        #trainA = trainA[indicesA]
        #trainB = trainB[indicesB]
        #trainC = trainC[indicesC]

        trainA, labelsA, IdA = _split_features(trainA)
        trainB, labelsB, IdB = _split_features(trainB)
        trainC, labelsC, IdC = _split_features(trainC)

        clf = MLPClassifier(activation = 'relu',
                            solver='adam',
                            early_stopping = True,
                            alpha = 0.1,
                            hidden_layer_sizes=(100,100,100,100,100)
                            )

        clf.fit(trainA, labelsA)
        print(f'Fit for {mode} dataset fold {fold} complete.')

        # Column 1 of predict_proba is the probability of the second class in
        # clf.classes_.
        print("\tPredicting test set B")
        probasB = clf.predict_proba(trainB)[:,1]
        clf_auc = roc_auc_score(labelsB, probasB)

        print(f"\tPrediction complete with AUC {clf_auc}")

        # Set C is scored exactly once; these values are reused for the
        # threshold search and for the exported CSV below.
        print("\tPredicting test set C")
        probasC = clf.predict_proba(trainC)[:,1]
        clf_auc = roc_auc_score(labelsC, probasC)

        print(f"\tPrediction complete with AUC {clf_auc}")

        #Note- not yet built for Mult. Impute folds
        if calc_best_thres:
            # NOTE(review): the threshold is selected and then evaluated on the
            # same set C, so the confusion matrix printed here is optimistic.
            print("\tCalculating f-score for best K")
            Candidates = np.arange(0.01,0.3,0.01)
            f_scores = [f1_score(labelsC, (probasC >= thresh).astype(int))
                        for thresh in Candidates]

            # np.argmax returns the first maximum, i.e. the lowest best threshold.
            BestThresh = Candidates[np.argmax(f_scores)]
            finalClass = (probasC >= BestThresh).astype(int)

            conf = confusion_matrix(labelsC, finalClass)
            print("Optimal decision threshold and corresponding confusion matrix")
            print(BestThresh)
            print(conf)

            print("______________________________")

        # Re-attach labels and IDs after the predictors and append the
        # prediction, so the exported column order is: predictors, DCStatus,
        # IDX, NN.
        trainB['DCStatus'] = labelsB
        trainB['IDX'] = IdB
        trainB['NN'] = probasB

        trainB.to_csv(f'{fold}/nn_preds_trainB_{mode}.csv', index = False)

        trainC['DCStatus'] = labelsC
        trainC['IDX'] = IdC
        trainC['NN'] = probasC

        trainC.to_csv(f'{fold}/nn_preds_trainC_{mode}.csv', index = False)
    print(f'Fold {fold} completed at {datetime.datetime.now()}\n\n')