In [9]:
import numpy as np
import pandas as pd
import random
from pathlib import Path
import tensorflow as tf
from tensorflow import keras
from matplotlib import pyplot as plt
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, accuracy_score
import seaborn as sns

In [10]:
datapath = Path(r"C:\Users\aryav\OneDrive\codeshenanigans\ICJ 2021\ICJ code\data")
modelpath = Path(r'C:\Users\aryav\OneDrive\codeshenanigans\ICJ 2021\ICJ code\models\colortesting')

M1 = keras.models.load_model(modelpath/'M1.h5')
M2 = keras.models.load_model(modelpath/'M2.h5')
M3 = keras.models.load_model(modelpath/'M3.h5')
M4 = keras.models.load_model(modelpath/'M4.h5')
M5 = keras.models.load_model(modelpath/'M5.h5')

dataset = pd.read_csv(datapath/'RW1.csv')

In [None]:
dataset_small = dataset.sample(frac=0.1, random_state=42)

del dataset

In [11]:
del dataset['Unnamed: 14701']

num_letters = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F',
               6: 'G', 7: 'H', 8: 'I', 9: 'K',
               10: 'L', 11: 'M', 12: 'N', 13: 'O', 14: 'P',
               15: 'Q', 16: 'R', 17: 'S', 18: 'T', 19: 'U',
               20: 'V', 21: 'W', 22: 'X', 23: 'Y'}

y_test = dataset['label']
del dataset['label']
x_test = dataset.values
x_test = x_test / 255
x_test = x_test.reshape(-1, 70, 70, 3)
lb = LabelBinarizer()
y_test = lb.fit_transform(y_test)

In [19]:
def test_accuracy(model):
	y_pred = model.predict(x_test)
	print('Accuracy: ' + str(round(accuracy_score(y_test.argmax(axis=1), y_pred.argmax(axis=1)) * 100, 2)) + '%')
	del y_pred

def test_precision(model):
	y_pred = model.predict(x_test)
	print('Precision: ' + str(round(precision_score(y_test.argmax(axis=1), y_pred.argmax(axis=1), average='weighted'), 2)))
	del y_pred

def test_recall(model):
	y_pred = model.predict(x_test)
	print('Recall: ' + str(round(recall_score(y_test.argmax(axis=1), y_pred.argmax(axis=1), average='weighted'), 2)))
	del y_pred

def test_f1(model):	
	y_pred = model.predict(x_test)
	print('F1: ' + str(round(f1_score(y_test.argmax(axis=1), y_pred.argmax(axis=1), average='weighted'), 2)))
	del y_pred

def test_roc_auc(model):
	y_pred = model.predict(x_test)
	print('ROC AUC: ' + str(round(roc_auc_score(y_test, y_pred, average='weighted', multi_class='ovo'), 2)))
	del y_pred

def test_mean_and_std(model):
	#create multiple random samples of the test set
	#then compute the mean and standard deviation of all of the predictions' accuracies

	accuracies = []

	for i in range(200):
		#sample 50 images from the test set	
		sample = random.sample(range(0, len(x_test)), 50)

		#predict the labels of the sampled images
		y_pred = model.predict(x_test[sample])

		#compute the accuracy of the predictions
		accuracy = accuracy_score(y_test[sample].argmax(axis=1), y_pred.argmax(axis=1))
		accuracies.append(accuracy)

		del y_pred
		del sample

	#compute and print the mean and standard deviation of the accuracies
	print('Mean: ' + str(round(np.mean(accuracies) * 100, 2)) + '%')
	print('Standard Deviation: ' + str(round(np.std(accuracies) * 100, 2)) + '%')


def make_cm(model):
	y_pred = model.predict(x_test)
	cm = confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))
	cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
	cm = pd.DataFrame(cm, index=num_letters.values(), columns=num_letters.values())
	plt.figure(figsize=(10, 10))
	sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues')
	plt.ylabel('Actual')
	plt.xlabel('Predicted')
	plt.show()	

In [13]:
print("Running accuracy tests...")

print("M1:")
test_accuracy(M1)
print("M2:")
test_accuracy(M2)
print("M3:")
test_accuracy(M3)
print("M4:")
test_accuracy(M4)
print("M5:")
test_accuracy(M5)

Running accuracy tests...
M1:
Accuracy: 10.4%
M2:
Accuracy: 16.09%
M3:
Accuracy: 11.47%
M4:
Accuracy: 20.71%
M5:
Accuracy: 14.75%


In [14]:
print("Running precision tests...")

print("M1:")
test_precision(M1)
print("M2:")
test_precision(M2)
print("M3:")
test_precision(M3)
print("M4:")
test_precision(M4)
print("M5:")
test_precision(M5)

Running precision tests...
M1:


  _warn_prf(average, modifier, msg_start, len(result))


Precision: 0.21
M2:
Precision: 0.33
M3:
Precision: 0.28
M4:
Precision: 0.34
M5:
Precision: 0.22


In [20]:
print("Running recall tests...")

print("M1:")
test_recall(M1)
print("M2:")
test_recall(M2)
print("M3:")
test_recall(M3)
print("M4:")
test_recall(M4)
print("M5:")
test_recall(M5)

Running recall tests...
M1:
Recall: 0.1
M2:
Recall: 0.16
M3:
Recall: 0.11
M4:
Recall: 0.21
M5:
Recall: 0.15


In [16]:
print("Running F1 score tests...")

print("M1:")
test_f1(M1)
print("M2:")
test_f1(M2)
print("M3:")
test_f1(M3)
print("M4:")
test_f1(M4)
print("M5:")
test_f1(M5)

Running F1 score tests...
M1:
F1: 0.11
M2:
F1: 0.16
M3:
F1: 0.12
M4:
F1: 0.19
M5:
F1: 0.14


In [21]:
print("Running AUC tests...")

print("M1:")
test_roc_auc(M1)
print("M2:")
test_roc_auc(M2)
print("M3:")
test_roc_auc(M3)
print("M4:")
test_roc_auc(M4)
print("M5:")
test_roc_auc(M5)

Running AUC tests...
M1:
ROC AUC: 0.65
M2:
ROC AUC: 0.73
M3:
ROC AUC: 0.64
M4:
ROC AUC: 0.75
M5:
ROC AUC: 0.72


In [22]:
print ("Running mean and standard deviation tests...")

print("M1:")
test_mean_and_std(M1)
print("M2:")
test_mean_and_std(M2)
print("M3:")
test_mean_and_std(M3)
print("M4:")
test_mean_and_std(M4)
print("M5:")
test_mean_and_std(M5)

Running mean and standard deviation tests...
M1:
Mean: 10.24%
Standard Deviation: 4.36%
M2:
Mean: 15.63%
Standard Deviation: 4.94%
M3:
Mean: 11.71%
Standard Deviation: 4.17%
M4:
Mean: 20.08%
Standard Deviation: 6.0%
M5:
Mean: 14.55%
Standard Deviation: 4.49%
