In [23]:
import pandas as pd
import numpy as np
import os
import random
import progressbar
import pickle
import matplotlib.pyplot as plt

from sklearn.naive_bayes import (ComplementNB, GaussianNB, MultinomialNB)
from sklearn.metrics import (plot_confusion_matrix, plot_precision_recall_curve,
                             plot_roc_curve, auc)
from sklearn.model_selection import cross_validate, StratifiedKFold
!pip install delayed
!pip install scikit-learn
!pip install -U imbalanced-learn
from imblearn.over_sampling import (ADASYN, BorderlineSMOTE, KMeansSMOTE,
                                    RandomOverSampler, SMOTE, SMOTEN, SMOTENC,
                                    SVMSMOTE)
from imblearn.under_sampling import (AllKNN, ClusterCentroids,
                                     CondensedNearestNeighbour,
                                     EditedNearestNeighbours,
                                     InstanceHardnessThreshold,
                                     NearMiss, NeighbourhoodCleaningRule,
                                     OneSidedSelection, RandomUnderSampler,
                                     RepeatedEditedNearestNeighbours,
                                     TomekLinks)
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline

!pip install proc

import seaborn as sb
from statistics import mean, stdev

from google.colab import drive
drive.mount('/content/gdrive')

over_samplers = {
    1: ADASYN,
    2: BorderlineSMOTE,
    3: KMeansSMOTE,
    4: RandomOverSampler,
    5: SMOTE,
    6: SMOTEN,
    7: SMOTENC,
    8: SVMSMOTE
}

under_samplers = {
    1: AllKNN,
    2: ClusterCentroids,
    3: CondensedNearestNeighbour,
    4: EditedNearestNeighbours,
    5: InstanceHardnessThreshold,
    6: NearMiss,
    7: NeighbourhoodCleaningRule,
    8: OneSidedSelection,
    9: RandomUnderSampler,
    10: RepeatedEditedNearestNeighbours,
    11: TomekLinks
}

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Total number of data

In [24]:
data_per_csv = 512
def total():
  total_files = 0
  for files in os.listdir('gdrive/My Drive/Summer Research/HRV/Outlier Free/All/'):
    total_files += 1
  return total_files

Load data

In [25]:
def loadHRVData(c):
  hrv_and_labels = list()
  if c == 'wt':
    with open('gdrive/My Drive/Summer Research/Variables/wt_pseudoimage_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'wt denoised':
    with open('gdrive/My Drive/Summer Research/Variables/wt_denoised_pseudoimage_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'denoised':
    with open('gdrive/My Drive/Summer Research/Variables/wt_denoised_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'normal':
    size = (163, 223, 4)
    with open('gdrive/My Drive/Summer Research/Variables/normal_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'array':
    with open('gdrive/My Drive/Summer Research/Variables/array_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'wt a1d1d2d3 coords':
    with open('gdrive/My Drive/Summer Research/Variables/wt_a1d1d2d3_coords_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'wt a1d1d2d3 denoised coords':
    with open('gdrive/My Drive/Summer Research/Variables/wt_a1d1d2d3_denoised_coords_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'a1':
    with open('gdrive/My Drive/Summer Research/Variables/wt_a1_coord_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'd1':
    with open('gdrive/My Drive/Summer Research/Variables/wt_d1_coord_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'd2':
    with open('gdrive/My Drive/Summer Research/Variables/wt_d2_coord_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'd3':
    with open('gdrive/My Drive/Summer Research/Variables/wt_d3_coord_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'd1 denoised':
    with open('gdrive/My Drive/Summer Research/Variables/wt_d1_denoised_coord_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'd2 denoised':
    with open('gdrive/My Drive/Summer Research/Variables/wt_d2_denoised_coord_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  elif c == 'd3 denoised':
    with open('gdrive/My Drive/Summer Research/Variables/wt_d3_denoised_coord_hrv_and_labels.pkl', 'rb') as file:
      #load data from file
      hrv_and_labels = pickle.load(file)
  return hrv_and_labels

Oversampling and undersampling

In [26]:
def resampling(o_s, u_s, kn, args):
  if args == 'SMOTEENN' or args == 'SMOTETomek':
    if args == 'SMOTEENN':
      resampler = SMOTEENN(enn=EditedNearestNeighbours(sampling_strategy='majority'),
                          n_jobs=-1)
    elif args == 'SMOTETomek':
      resampler = SMOTETomek(n_jobs=-1)
    return resampler
  o = over_samplers[o_s]
  u = under_samplers[u_s]
  if o_s in [1]:
    over = o(sampling_strategy='auto', n_neighbors=kn, n_jobs=-1)
  elif o_s in [4]:
    over = o(sampling_strategy='auto', n_jobs=-1)
  else:
    over = o(sampling_strategy='auto', k_neighbors=kn, n_jobs=-1)
  
  if u_s in [2, 5, 11]:
    under = u(sampling_strategy='auto', n_jobs=-1)
  elif u_s in [11]:
    under = u(sampling_strategy='auto')
  else:
    under = u(sampling_strategy='auto', n_neighbors=kn, n_jobs=-1)
  steps = [('over', over), ('under', under)]
  resampler = Pipeline(steps=steps)
  return resampler

Naive Bayes model

In [35]:
def NBModel(X, y, cv):
  model = GaussianNB()
  #K-fold Cross Validation
  scores = cross_validate(model, X, y, cv=cv, scoring=('accuracy', 'balanced_accuracy', 'precision', 'recall', 'roc_auc', 'f1'), n_jobs=-1, verbose=0, return_estimator=True)
  return scores

In [31]:
def metrics(scores, X, y, cv, resampling_method, data_choice):
  dir = 'gdrive/My Drive/Summer Research/Figures/Naive Bayes/'
  file_name = resampling_method+'-resampled '+data_choice
  rem_list = ['estimator', 'fit_time', 'score_time']
  csv_scores = dict([(key, val) for key, val in 
           scores.items() if key not in rem_list])
  df = pd.DataFrame.from_dict(csv_scores)
  df.to_csv(dir+file_name+'.csv', index=False)

  #TODO: generate PR, ROC, Confusion matrix graphs
  tprs = []
  aucs = []
  mean_fpr = np.linspace(0, 1, 100)

  cm = np.zeros((4,10))

  fig, ax = plt.subplots(figsize=(10,10))
  fig2, ax2 = plt.subplots(figsize=(10,10))
  fig3, ax3 = plt.subplots(figsize=(10,10))
  fig4, ax4 = plt.subplots(figsize=(10,10))
  for i, (train, test) in enumerate(cv.split(X, y)):
    viz = plot_roc_curve(scores['estimator'][i], X[test], y[test],
                         name='ROC fold {}'.format(i),
                         alpha=0.3, lw=1, ax=ax)
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

    p = plot_precision_recall_curve(scores['estimator'][i], X[test],
                                     y[test], name='P v. R fold {}'.format(i),
                                     alpha=0.5, lw=1.5, ax=ax2)
    
    c = plot_confusion_matrix(scores['estimator'][i], X[test], y[test],
                              normalize='all', ax=ax4)
    cm[:,i] = np.array(c.confusion_matrix).reshape(4,)
  plt.close(fig=fig4)
  #ROC Curve
  ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
        label='Chance', alpha=.8)
  mean_tpr = np.mean(tprs, axis=0)
  mean_tpr[-1] = 1.0
  mean_auc = auc(mean_fpr, mean_tpr)
  std_auc = np.std(aucs)
  ax.plot(mean_fpr, mean_tpr, color='b',
          label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
          lw=2, alpha=.8)

  std_tpr = np.std(tprs, axis=0)
  tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
  tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
  ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                  label=r'$\pm$ 1 std. dev.')

  ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
        title="ROC Curve")
  ax.legend(loc="lower right")
  fig.savefig(dir+file_name+' ROC.png', bbox_inches='tight')
  plt.close(fig=fig)
  #PR Curve
  ax2.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
        title="Precision v. Recall Curve")
  ax2.legend(loc="lower left")
  fig2.savefig(dir+file_name+' PR.png', bbox_inches='tight')
  plt.close(fig=fig2)
  #Confusion Matrix
  c1, c2, c3, c4 = cm[0,:], cm[1,:], cm[2,:], cm[3,:]
  means = np.array([[mean(c1), mean(c2)],[mean(c3), mean(c4)]])
  stds = np.array([[stdev(c1), stdev(c2)],[stdev(c3), stdev(c4)]])
  labels = np.array([["{:.2%} $\pm$ {:.2%}".format(mean(c1), stdev(c1)),
                      "{:.2%} $\pm$ {:.2%}".format(mean(c2), stdev(c2))],
                     ["{:.2%} $\pm$ {:.2%}".format(mean(c3), stdev(c3)),
                      "{:.2%} $\pm$ {:.2%}".format(mean(c4), stdev(c4))]])
  plt.figure(figsize=(12,8))
  g = sb.heatmap(100*means, fmt='', annot=labels, cmap='Greens',
                 xticklabels=['Predicted Healthy', 'Predicted Diabetes'],
                 yticklabels=['Healthy', 'Diabetes'], ax=ax3, cbar_kws={'format': '%.0f%%'})
  g.set_yticklabels(labels=g.get_yticklabels(), va='center')
  g.set_title('Confusion Matrix')
  fig3.savefig(dir+file_name+' Confusion Matrix.png', bbox_inches='tight')
  plt.close(fig=fig3)

In [36]:
data_choices = {
    1:'a1',
    2:'d1',
    3:'d2',
    4:'d3',
    5:'d1 denoised',
    6:'d2 denoised',
    7:'d3 denoised',
    8:'denoised',
    9:'array',
    10:'wt a1d1d2d3 coords',
    11:'wt a1d1d2d3 denoised coords'
}

all_total = total()
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)

widgets = [' [',
      progressbar.Timer(format= 'elapsed time: %(elapsed)s'),
      '] ',
        progressbar.Bar('#'),' (',
        progressbar.ETA(), ') ',
        ]
bar = progressbar.ProgressBar(max_value=len(data_choices)+1, widgets=widgets).start()
count = 0

resampling_method = 'SMOTETomek'
for i in range(len(data_choices)):
  count += 1
  bar.update(count)
  data_choice = data_choices[i+1]
  hrv_and_labels = loadHRVData(data_choice)
  random.shuffle(hrv_and_labels)
  X = np.array([item[0] for item in hrv_and_labels]).reshape(total(),-1)
  y = np.array([item[1] for item in hrv_and_labels])
  X_resampled, y_resampled = resampling(0,0,0,resampling_method).fit_resample(X, y)
  scores = NBModel(X_resampled, y_resampled, cv)
  metrics(scores, X_resampled, y_resampled, cv, resampling_method, data_choice)

 [elapsed time: 0:00:30] |###############################   | (ETA:   0:00:02) 

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>

<Figure size 864x576 with 0 Axes>