<a href="https://colab.research.google.com/github/byekelchik/ml2_project_2/blob/main/cse_447_final_model_byekel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
# from sklearn.metrics
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score, accuracy_score

import matplotlib.pyplot as plt

from keras.datasets import mnist
from keras.utils import np_utils

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Helper Functions

In [3]:
def get_mnist():
  """
  Uses the paths from Keras to load MNIST data, creates train and test split of 90% and 10% of the training data

  ##Returns:
  - train_x: 90% of training data features
  - test_x: 10% of training data features
  - train_y: respective 90% of labels
  - test_y respective 10% of labels
  """
  
  (train_data_complete, train_target_complete), (test_data_complete, test_target_complete) = mnist.load_data()
  train_x, test_x, train_y, test_y = train_test_split(train_data_complete, train_target_complete, test_size = 0.1)
  return train_x, test_x, train_y, test_y

In [4]:
def get_pie():
  """
  Uses the paths on drive to get pie train and test

  ##Returns:
  - train_x: 90% of training data features
  - test_x: 10% of training data features
  - train_y: respective 90% of labels
  - test_y respective 10% of labels
  """
  path_pie_train = "/content/drive/MyDrive/PIE_32x32/StTrainFile.txt"
  pie_train = np.loadtxt(path_pie_train, delimiter= " ", dtype= np.float32)
  pie_train_features = pie_train[:, 0:1024]
  pie_train_features = np.reshape(pie_train_features, (pie_train_features.shape[0], 32, 32))
  pie_train_labels = pie_train[:, 1024]

  train_x, test_x, train_y, test_y = train_test_split(pie_train_features, pie_train_labels, test_size = 0.1)
  return train_x, test_x, train_y, test_y

In [5]:
def get_yale():
  """
  Uses the paths on drive to get YaleB train and test

  ##Returns:
  - train1_x: first set training data features
  - train1_y: first set training data labels
  - test1_x: first set testing data features
  - test1_y: first set testing data labels

  - train2_x: second set training data features
  - train2_y: second set training data labels
  - test2_x: second set testing data features
  - test2_y: second set testing data labels

  - train3_x: third set training data features
  - train3_y: third set training data labels
  - test3_x: third set testing data features
  - test3_y: third set testing data labels
  """
  path_yale_train_1 = "/content/drive/MyDrive/YaleB_32x32/StTrainFile1.txt"
  path_yale_train_2 = "/content/drive/MyDrive/YaleB_32x32/StTrainFile2.txt"
  path_yale_train_3 = "/content/drive/MyDrive/YaleB_32x32/StTrainFile3.txt"

  path_yale_test_1 = "/content/drive/MyDrive/YaleB_32x32/StTestFile1.txt"
  path_yale_test_2 = "/content/drive/MyDrive/YaleB_32x32/StTestFile2.txt"
  path_yale_test_3 = "/content/drive/MyDrive/YaleB_32x32/StTestFile3.txt" 


  yale_train_1 = np.loadtxt(path_yale_train_1, delimiter= " ", dtype= np.float32)
  yale_train_2 = np.loadtxt(path_yale_train_2, delimiter= " ", dtype= np.float32)
  yale_train_3 = np.loadtxt(path_yale_train_3, delimiter= " ", dtype= np.float32)

  yale_test_1 = np.loadtxt(path_yale_test_1, delimiter= " ", dtype= np.float32)
  yale_test_2 = np.loadtxt(path_yale_test_2, delimiter= " ", dtype= np.float32)
  yale_test_3 = np.loadtxt(path_yale_test_3, delimiter= " ", dtype= np.float32)

  train1_x = yale_train_1[:, 0:1024]
  train1_y = yale_train_1[:, 1024]

  train2_x = yale_train_2[:, 0:1024]
  train2_y = yale_train_2[:, 1024]

  train3_x = yale_train_3[:, 0:1024]
  train3_y = yale_train_3[:, 1024]

  test1_x = yale_test_1[:, 0:1024]
  test1_y = yale_test_1[:, 1024]

  test2_x = yale_test_2[:, 0:1024]
  test2_y = yale_test_2[:, 1024]

  test3_x = yale_test_3[:, 0:1024]
  test3_y = yale_test_3[:, 1024]

  return train1_x, train1_y, train2_x, train2_y, train3_x, train3_y, test1_x, test1_y, test2_x, test2_y, test3_x, test3_y

In [6]:
def model_eval(test, pred,d_set:str):
  """
  Take test and pred array and returns model evaluation metrics
  ### Args:
  - test: test array
  - pred: pred array 
  -------------------------
  ### Return:
  - f1: F1 score
  - roc: Multi-classification weighted ROC
  - acc: Accuracy
  """
  if d_set.lower() == 'pie' or 'yale':
    pred_cat = np_utils.to_categorical(pred-1)
  elif d_set.lower() == 'mnst':
    print('mnst here')
    pred_cat = np_utils.to_categorical(pred)
  f1 = f1_score(test, pred, average = 'weighted')
  roc = roc_auc_score(test, pred_cat, multi_class = 'ovr', average = 'weighted')
  acc = accuracy_score(test,pred)
  return round(f1,4), round(acc,4), round(roc,4)


### Data import and preprocessing

In [7]:
yale_train1_x, yale_train1_y, yale_train2_x, yale_train2_y, yale_train3_x, yale_train3_y, yale_test1_x, yale_test1_y, yale_test2_x, yale_test2_y, yale_test3_x, yale_test3_y = get_yale()
pie_train_x, pie_test_x, pie_train_y, pie_test_y = get_pie()
mnist_train_x, mnist_test_x, mnist_train_y, mnist_test_y = get_mnist()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


#### Make Valadation Set

In [8]:
from sklearn import model_selection
val_size = .1
pie_train_x, pie_val_x, pie_train_y, pie_val_y = train_test_split(pie_train_x,pie_train_y, test_size = val_size)
mnist_train_x, mnist_val_x, mnist_train_y, mnist_val_y = train_test_split(mnist_train_x,mnist_train_y,test_size= val_size)

#### Reshape

In [9]:
"""Reshape train x data"""
mnist_train_x_reshape = mnist_train_x.reshape(mnist_train_x.shape[0],mnist_train_x.shape[1]**2)
pie_train_x_reshape = pie_train_x.reshape(pie_train_x.shape[0],pie_train_x.shape[1]**2)

In [10]:
"""Reshape testing x data"""
mnist_test_x_reshape = mnist_test_x.reshape(mnist_test_x.shape[0],mnist_test_x.shape[1]**2)
pie_test_x_reshape = pie_test_x.reshape(pie_test_x.shape[0],pie_test_x.shape[1]**2)

### Model Prototyping - Baseline

#### PIE

In [58]:
pipe_pie = Pipeline(
    [('scaled',StandardScaler()),
     ('svc',SVC())
     ])
pipe_pie.fit(pie_train_x_reshape,pie_train_y)
pie_pred = pipe_pie.predict(pie_test_x_reshape)

#### Yale

In [59]:
yale_data_call = np.array([yale_train1_x, yale_test1_x, yale_train1_y, yale_test1_y, yale_train2_x, yale_test2_x, yale_train2_y, yale_test2_y, yale_train3_x, yale_test3_x, yale_train3_y, yale_test3_y],dtype='object')
acc_yale = np.zeros(3)
f1_yale = np.empty_like(acc_yale)
roc_yale = np.empty_like(acc_yale)
model_yale_dstructre = {}
pred_yale_dstructre = {}
for i in range(0,3):
  train_data, test_data, train_target, test_target = yale_data_call[0 + i*4], yale_data_call[1 + i*4], yale_data_call[2 + i*4], yale_data_call[3 + i*4]

  ### Model Training ###
  pipe_yale = Pipeline(
    [('scaled',StandardScaler()),
     ('svc',SVC())
     ])
  pipe_yale.fit(train_data,train_target) 

  ###Model Pred####
  yale_pred = pipe_yale.predict(test_data)
  
  ###Model Eval Metrics###
  f1_yale[i], acc_yale[i],roc_yale[i] = model_eval(test_target,yale_pred,'yale')
  ###Saving model, pred, eval metrics###
  pred_yale_dstructre[f'pred_{i}'] = yale_pred
  model_yale_dstructre[f'model_{i}'] =pipe_yale 

#### MNST

In [None]:
mnst_pie = Pipeline(
    [('scaled',StandardScaler()),
     ('svc',SVC())
     ])

mnst_pie.fit(mnist_train_x_reshape, mnist_train_y)
mnst_pred = mnst_pie.predict(mnist_test_x_reshape)

### Model Testing - Baseline

#### PIE

In [68]:
pie_f1,pie_acc,pie_roc = model_eval(pie_test_y, pie_pred,'pie')
print(f'Pie accuracy: {pie_acc}')
print(f'Pie F1: {pie_f1}')
print(f'Pie ROC: {pie_roc}')

Pie accuracy: 0.9615
Pie F1: 0.9643
Pie ROC: 0.9805


#### MNIST


In [91]:
mnst_f1, mnst_acc, mnst_roc = model_eval(mnist_test_y, mnst_pred,'mnst')
print(f'MNST accuracy: {mnst_acc}')
print(f'MNST F1: {mnst_f1}')
print(f'MNST ROC:  {mnst_roc}')

ValueError: ignored

#### Yale

In [92]:
print(f'Yale accuracy: {round(acc_yale.mean(),4)}')
print(f'Yale F1: {round(f1_yale.mean(),4)}')
print(f'Yale ROC: {round(roc_yale.mean(),4)}')

Yale accuracy: 0.9284
Yale F1: 0.9413
Yale ROC: 0.9632


### HP Tuning

In [93]:
kernel_tosearch = ["poly","rbf","sigmoid"]

#### KFold for YaleB

In [94]:
n_folds = 3

#### Val Set for Pie and MNIST

##### Pie

In [95]:
pie_val_f1 = np.zeros(len(kernel_tosearch)) #init eval metric arrays len of possible kernals
pie_val_acc = np.empty_like(pie_val_f1)
pie_val_roc = np.empty_like(pie_val_f1)

In [97]:
for i in enumerate(kernel_tosearch): #for each possible kernel
  print({"kernel":i[1]})
  pipe_pie[1].set_params(**{"kernel":i[1]}) #set the kernel to new option
  pipe_pie.fit(pie_train_x_reshape,pie_train_y) #fit on test
  pie_val_pred = pipe_pie.predict(pie_val_x.reshape(pie_val_x.shape[0],pie_val_x.shape[1]**2)) #pred on val
  pie_val_f1[i[0]], pie_val_acc[i[0]], pie_val_roc[i[0]] = model_eval(pie_val_y, pie_val_pred,'pie') #model eval pred val vs val_y

{'kernel': 'poly'}
{'kernel': 'rbf'}
{'kernel': 'sigmoid'}


In [122]:
pie_val_data = np.vstack((pie_val_f1, pie_val_acc, pie_val_roc))
pie_val_data_pd = pd.DataFrame(pie_val_data,columns = kernel_tosearch)
pie_val_data_pd['eval_params']=['f1','acc','roc']
pie_val_data_pd.set_index("eval_params",inplace=True)

In [124]:
print('Validation Performance')
pie_val_data_pd

Validation Performance


Unnamed: 0_level_0,poly,rbf,sigmoid
eval_params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f1,0.8851,0.9602,0.3899
acc,0.8686,0.9562,0.3323
roc,0.9335,0.9778,0.6625


##### MNIST

In [127]:
mnist_val_f1 = np.zeros(len(kernel_tosearch)) #init eval metric arrays len of possible kernals
mnist_val_acc = np.empty_like(mnist_val_f1)
mnist_val_roc = np.empty_like(mnist_val_f1)
for i in enumerate(kernel_tosearch): #for each possible kernel
  print({"kernel":i[1]})
  mnst_pie[1].set_params(**{"kernel":i[1]}) #set the kernel to new option
  mnst_pie.fit(mnist_train_x_reshape,mnist_train_y) #fit on test
  mnist_val_pred = mnst_pie.predict(mnist_val_x.reshape(mnist_val_x.shape[0],mnist_val_x.shape[1]**2)) #pred on val
  mnist_val_f1[i[0]], mnist_val_acc[i[0]], mnist_val_roc[i[0]] = model_eval(mnist_val_y, mnist_val_pred,'pie') #model eval pred val vs val_y

{'kernel': 'poly'}


ValueError: ignored

In [126]:
mnist_val_data = np.vstack((mnist_val_f1, mnist_val_acc, mnist_val_roc))
mnst_val_data_pd = pd.DataFrame(mnist_val_data,columns = kernel_tosearch)
mnst_val_data_pd['eval_params']=['f1','acc','roc']
mnst_val_data_pd.set_index("eval_params",inplace=True)
print('Validation Performance')
mnst_val_data_pd

Validation Performance


Unnamed: 0_level_0,poly,rbf,sigmoid
eval_params,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f1,0.8851,0.9602,0.3899
acc,0.8686,0.9562,0.3323
roc,0.9335,0.9778,0.6625
