## Import statements

In [1]:
import os
import cv2
import copy
import random
import numpy as np
import pandas as pd
import itertools
from itertools import cycle
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.naive_bayes import GaussianNB
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn import metrics
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.decomposition import PCA
from my_ml_lib import DataManipulationTools, MetricTools, PlotTools

## Function definitions

### Read dataset

In [2]:
def read_images(path, height=32, width=32):
    """Load every PNG under ``path`` into one labelled 2-D array.

    ``path`` is expected to contain one sub-directory per class. Each PNG in a
    sub-directory is read with OpenCV, resized to ``width`` x ``height``,
    flattened, and given a trailing label column: 0 when the sub-directory is
    named 'Parasitized', 1 for any other sub-directory.

    Parameters
    ----------
    path : str
        Root directory of the dataset.
    height, width : int, optional
        Size each image is resized to (defaults keep the original 32 x 32).

    Returns
    -------
    numpy.ndarray
        One row per image: flattened pixels followed by the label. An empty
        array is returned when no image could be loaded.
    """
    images = []

    # Sort directory listings: os.listdir order is arbitrary, and a stable row
    # order is needed for any seeded split applied to the result.
    for class_name in sorted(os.listdir(path)):
        class_dir = os.path.join(path, class_name)
        # Skip hidden entries and stray files (e.g. '.DS_Store') in the root;
        # listing a non-directory would raise.
        if class_name.startswith('.') or not os.path.isdir(class_dir):
            continue

        label = 0 if class_name == 'Parasitized' else 1

        for img_name in sorted(os.listdir(class_dir)):
            if img_name.startswith('.') or not img_name.endswith('.png'):
                continue

            img = cv2.imread(os.path.join(class_dir, img_name))
            # cv2.imread signals an unreadable/corrupt file by returning None.
            if img is None:
                continue

            # cv2.resize takes the target size as (width, height).
            flat_img = cv2.resize(img, (width, height)).ravel()
            images.append(np.append(flat_img, label))

    return np.array(images)

### Naive bayes

In [3]:
def model(data, label, test_data):
    """Train a Gaussian Naive Bayes classifier and score ``test_data``.

    Returns a pair ``(predicted_labels, class_probabilities)`` obtained from
    the fitted classifier's ``predict`` and ``predict_proba`` on ``test_data``.
    """
    classifier = GaussianNB()
    classifier.fit(data, label)

    predicted = classifier.predict(test_data)
    probabilities = classifier.predict_proba(test_data)
    return predicted, probabilities

### PCA

In [4]:
def pca(train_data, test_data, n_components=39, random_state=1):
    """Project train and test features onto principal components.

    The PCA is fitted on ``train_data`` only and the same projection is then
    applied to ``test_data``.

    Parameters
    ----------
    train_data, test_data : array-like
        Feature matrices (labels must already be removed).
    n_components : int, optional
        Number of components to keep (default 39, the value previously
        hard-coded).
    random_state : int, optional
        Seed forwarded to ``PCA`` (default 1, the value previously hard-coded).

    Returns
    -------
    tuple
        ``(new_train_data, new_test_data)``.
    """
    reducer = PCA(n_components=n_components, random_state=random_state)
    new_train_data = reducer.fit_transform(train_data)
    new_test_data = reducer.transform(test_data)
    return new_train_data, new_test_data

### LDA

In [5]:
def lda(train_data, train_label, test_data):
    """Project features with Linear Discriminant Analysis (eigen solver).

    The discriminant is fitted on the labelled training data; the fitted
    projection is then applied to ``test_data``. Returns the pair
    ``(projected_train, projected_test)``.
    """
    projector = LDA(solver='eigen')
    projected_train = projector.fit_transform(train_data, train_label)
    return projected_train, projector.transform(test_data)

### ROC

In [6]:
def plot_roc(fpr, tpr, class_name, area):
    """Plot a single ROC curve, labelled with its class name and AUC.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates of the curve.
    class_name : str
        Name of the class the curve belongs to; shown in the legend.
    area : float
        Area under the curve; shown in the legend with two decimals.
    """
    figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')

    # Same legend wording as the combined ROC plot, so figures are comparable.
    plt.plot(fpr, tpr,
             label='ROC curve for {0} class (area = {1:0.2f})'.format(class_name, area))

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")

    plt.show()

In [7]:
def plot_combine_roc(test_img, prob):
    """Plot one-vs-rest ROC curves for both classes on a single figure.

    Parameters
    ----------
    test_img : numpy.ndarray
        Test rows whose last column is the true label (0 = Parasitized,
        1 = Uninfected).
    prob : numpy.ndarray
        Class probabilities; column ``i`` is the score for label ``i``.

    Notes
    -----
    Each curve is computed against a binarised target in which the class being
    plotted is the positive one. Scoring ``prob[:, 0]`` directly against the
    raw labels would treat label 1 as positive and produce a mirrored curve
    (AUC equal to one minus the real value) for class 0.
    """
    name_arr = ['Parasitized', 'Uninfected']
    n_classes = len(name_arr)
    y_true = test_img[:, -1]

    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        # 1 where the sample belongs to class i, 0 otherwise (one-vs-rest).
        is_class_i = (y_true == i).astype(int)
        fpr[i], tpr[i], _ = metrics.roc_curve(is_class_i, prob[:, i])
        roc_auc[i] = metrics.roc_auc_score(is_class_i, prob[:, i])

    figure(num=None, figsize=(12, 6), dpi=80, facecolor='w', edgecolor='k')
    colors = cycle(['darkorange', 'cornflowerblue'])
    for i, color in zip(range(n_classes), colors):
        plt.plot(fpr[i], tpr[i], color=color,
                 label='ROC curve for {0} class (area = {1:0.2f})'.format(name_arr[i], roc_auc[i]))

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

## Main calls

### Read and split data

In [None]:
# Load every image under 'cell_images' into one array; read_images stores the
# class label as the last column of each row.
images = read_images('cell_images')

In [None]:
# Sanity check: one row per image, flattened pixels plus one label column.
images.shape

In [None]:
# Split the labelled array into training and test partitions.
# NOTE(review): 0.9 is presumably the training fraction and random_state the
# shuffle seed — confirm against DataManipulationTools.split_data.
train_img, test_img = DataManipulationTools.split_data(images, 0.9, random_state=10)

In [None]:
# Size of the training partition (rows, features + label column).
train_img.shape

In [None]:
# Size of the test partition (rows, features + label column).
test_img.shape

### Normal naive bayes

In [None]:
# Fit Gaussian Naive Bayes on the raw pixel columns (everything except the
# trailing label column) and predict labels and class probabilities for the test rows.
labels_predicted, prob = model(train_img[:, :-1], train_img[:, -1], test_img[:, :-1])

In [None]:
# Overall accuracy of the raw-pixel Naive Bayes predictions on the test split.
acc = metrics.accuracy_score(test_img[:, -1], labels_predicted)
print('Accuracy: ', acc)

# Confusion matrix with both axes ordered [0, 1]; read_images assigns label 0
# to 'Parasitized' and label 1 to every other folder.
cm = metrics.confusion_matrix(test_img[:, -1], labels_predicted, labels=[0, 1])
print('Confusion Matrix:')
print(cm)

# Render the matrix with the project helper.
# NOTE(review): presumably writes the figure under 'output/' — confirm in PlotTools.
PlotTools.confusion_matrix(cm, ['Parasitized', 'Uninfected'], title='', filename='Confusion Matrix NB', 
                           path='output/', figsize=(6,6))

In [None]:
# 'Parasitized' (label 0) is the positive class: row 0 of the 2x2 matrix holds
# its true samples, column 0 the samples predicted as it.
recall_p = cm[0, 0] / cm[0].sum()
precision_p = cm[0, 0] / cm[:, 0].sum()
print('precision', precision_p, 'recall', recall_p)

In [None]:
# F1 is the harmonic mean of precision and recall for the positive class.
f1_p = 2 / ((1 / precision_p) + (1 / recall_p))
print('f1', f1_p)

In [None]:
# ROC for the 'Parasitized' class (label 0). prob[:, 0] scores label 0, so that
# label must be the positive one; with the default (label 1 positive) the curve
# is mirrored and the AUC becomes one minus the real value.
fpr0, tpr0, thresholds0 = metrics.roc_curve(test_img[:, -1], prob[:, 0], pos_label=0)
area0 = metrics.roc_auc_score((test_img[:, -1] == 0).astype(int), prob[:, 0])
plot_roc(fpr0, tpr0, 'Parasitized', area0)

In [None]:
# ROC for the 'Uninfected' class (label 1), scored with the label-1
# probability column; label 1 is the default positive label here.
fpr1, tpr1, thresholds1 = metrics.roc_curve(test_img[:, -1], prob[:, 1])
area1 = metrics.roc_auc_score(test_img[:, -1], prob[:, 1])
plot_roc(fpr1, tpr1, 'Uninfected', area1)

In [None]:
# Both class-wise ROC curves of the raw-pixel model on one figure.
plot_combine_roc(test_img, prob)

### Normal naive bayes K-fold

In [None]:
# 5-fold cross-validation of Gaussian Naive Bayes on the training partition.
# Records the validation accuracy of every fold and keeps the classifier with
# the highest one.
# NOTE(review): k_folds is assumed to yield (train, validation) array pairs
# with the label in the last column — confirm in DataManipulationTools.
val_accuracies, best_model, max_acc = [], None, -np.inf

for i, (train_fold, val_fold) in enumerate(DataManipulationTools.k_folds(train_img, k=5), start=1):
    clf = GaussianNB()
    clf.fit(train_fold[:, :-1], train_fold[:, -1])

    val_y_hat = clf.predict(val_fold[:, :-1])
    val_accuracies.append(MetricTools.accuracy(val_fold[:, -1], val_y_hat))
    print(f'{i}-Fold Accuracy: {np.around(val_accuracies[-1] * 100, 2)}%')

    # Skip the update unless this fold strictly beats the best so far.
    if not val_accuracies[-1] > max_acc:
        continue
    print('Validation accuracy increased, Updating best model...')
    best_model, max_acc = clf, val_accuracies[-1]

In [None]:
# Per-fold validation accuracies (as percentages, two decimals) for the
# complete-feature model, rendered as a table.
# NOTE(review): presumably saved under 'output' with the given filename — confirm in PlotTools.table.
PlotTools.table(np.around(np.array(val_accuracies) * 100, 2), 
                     [f'Fold {i}' for i in range(1,6)], ['Accuracy %'], 'Complete: 5-Fold Cross Validation Accuracies',
                     path='output', figsize=(5,2), filename='1 - Complete: 5-Fold Cross Validation Accuracies')

In [None]:
# Mean and standard deviation of the 5-fold validation accuracies computed
# above. These folds were trained on the complete feature set, not on PCA
# features, so the title and filename use 'Complete' like the per-fold table.
# Values are fractions in [0, 1], not percentages.
PlotTools.table([np.around(np.mean(val_accuracies), 2), np.around(np.std(val_accuracies), 4)],
                     ['Mean Accuracy', 'Standard Deviation'], ['Statistics'], 'Complete: 5-Fold Mean & Std Dev',
                     path='output', figsize=(5,2), filename='2 - Complete: 5-Fold Mean & Std Dev')

### PCA

In [None]:
# Reduce the pixel columns with PCA; the projection is fitted on the training
# rows only and then applied to the test rows.
pca_train_images, pca_test_images = pca(train_img[:, :-1], test_img[:, :-1])

In [None]:
# Re-attach the label column (kept 2-D via the -1: slice) to the PCA features
# so the arrays keep the "features + trailing label" layout.
pca_train_img = np.hstack((pca_train_images, train_img[:, -1:]))
pca_test_img = np.hstack((pca_test_images, test_img[:, -1:]))

In [None]:
# Sanity check: PCA components plus one label column.
pca_train_img.shape

In [None]:
# Gaussian Naive Bayes trained and evaluated on the PCA-reduced features.
pca_labels_predicted, pca_prob = model(pca_train_img[:, :-1], pca_train_img[:, -1], pca_test_img[:, :-1])

In [None]:
# Overall test accuracy of the PCA + Naive Bayes pipeline.
pca_acc = metrics.accuracy_score(pca_test_img[:, -1], pca_labels_predicted)
print('Accuracy: ', pca_acc)

# Confusion matrix with both axes ordered [0, 1] (0 = 'Parasitized' per read_images).
pca_cm = metrics.confusion_matrix(pca_test_img[:, -1], pca_labels_predicted, labels=[0, 1])
print('Confusion Matrix:')
print(pca_cm)

# Render the matrix with the project helper.
# NOTE(review): presumably writes the figure under 'output/' — confirm in PlotTools.
PlotTools.confusion_matrix(pca_cm, ['Parasitized', 'Uninfected'], title='', filename='Confusion Matrix PCA NB', 
                           path='output/', figsize=(6,6))

In [None]:
# 'Parasitized' (label 0) is the positive class: row 0 of the 2x2 matrix holds
# its true samples, column 0 the samples predicted as it.
recall_pp = pca_cm[0, 0] / pca_cm[0].sum()
precision_pp = pca_cm[0, 0] / pca_cm[:, 0].sum()
print('precision', precision_pp, 'recall', recall_pp)

In [None]:
# F1 is the harmonic mean of precision and recall for the positive class.
f1_pp = 2 / ((1 / precision_pp) + (1 / recall_pp))
print('f1', f1_pp)

In [None]:
# ROC for the 'Parasitized' class (label 0). pca_prob[:, 0] scores label 0, so
# that label must be the positive one; with the default (label 1 positive) the
# curve is mirrored and the AUC becomes one minus the real value.
pca_fpr0, pca_tpr0, pca_thresholds0 = metrics.roc_curve(pca_test_img[:, -1], pca_prob[:, 0], pos_label=0)
pca_area0 = metrics.roc_auc_score((pca_test_img[:, -1] == 0).astype(int), pca_prob[:, 0])
plot_roc(pca_fpr0, pca_tpr0, 'Parasitized', pca_area0)

In [None]:
# ROC for the 'Uninfected' class (label 1), scored with the label-1
# probability column; label 1 is the default positive label here.
pca_fpr1, pca_tpr1, pca_thresholds1 = metrics.roc_curve(pca_test_img[:, -1], pca_prob[:, 1])
pca_area1 = metrics.roc_auc_score(pca_test_img[:, -1], pca_prob[:, 1])
plot_roc(pca_fpr1, pca_tpr1, 'Uninfected', pca_area1)

In [None]:
# Both class-wise ROC curves of the PCA model on one figure.
plot_combine_roc(pca_test_img, pca_prob)

### LDA

In [None]:
# Project the pixel columns with LDA; the discriminant is fitted on the
# labelled training rows and then applied to the test rows.
lda_train_images, lda_test_images = lda(train_img[:, :-1], train_img[:, -1], test_img[:, :-1])

In [None]:
# Re-attach the label column (kept 2-D via the -1: slice) to the LDA features
# so the arrays keep the "features + trailing label" layout.
lda_train_img = np.hstack((lda_train_images, train_img[:, -1:]))
lda_test_img = np.hstack((lda_test_images, test_img[:, -1:]))

In [None]:
# Sanity check: LDA component(s) plus one label column.
lda_train_img.shape

In [None]:
# Visualise the 1-D LDA projection for a sample of training rows, per class.
# read_images assigns label 0 to 'Parasitized' and label 1 to 'Uninfected', so
# l1 (label 0) is the Parasitized group and l2 (label 1) the Uninfected group.
# The legend order below follows the plotting order; it was previously swapped.
l1 = []  # projections of label-0 (Parasitized) rows
l2 = []  # projections of label-1 (Uninfected) rows
arr = ['Parasitized', 'Uninfected']

n_rows = lda_train_img.shape[0]
tenth = n_rows // 10

# Same rows as before: the first tenth, then backwards from the last row,
# stopping just before index n_rows - tenth.
for i in itertools.chain(range(tenth), range(n_rows - 1, n_rows - tenth, -1)):
    if int(lda_train_img[i][-1]) == 0:
        l1.append(lda_train_img[i][0])
    else:
        l2.append(lda_train_img[i][0])

# Constant y so that only the position along the discriminant axis matters.
l11 = np.ones((len(l1)))
l22 = np.ones((len(l2)))

# Parasitized in red, Uninfected in green.
plt.plot(l1, l11, c='r')
plt.plot(l2, l22, c='g')
plt.legend(arr)
plt.show()

In [None]:
# Range of the LDA projection per class. l1 holds the label-0 rows, which
# read_images defines as 'Parasitized' (p); l2 holds the label-1 rows,
# 'Uninfected' (u). The two lists were previously assigned the other way round.
max_p = np.max(l1)
max_u = np.max(l2)
min_p = np.min(l1)
min_u = np.min(l2)
print('p max', max_p)
print('p min', min_p)
print('u max', max_u)
print('u min', min_u)

In [None]:
# Mean and standard deviation of the LDA projection per class. l1 holds the
# label-0 rows, which read_images defines as 'Parasitized' (p); l2 holds the
# label-1 rows, 'Uninfected' (u). The two lists were previously assigned the
# other way round.
mean_p = np.mean(l1)
mean_u = np.mean(l2)
sd_p = np.std(l1)
sd_u = np.std(l2)
print('p mean', mean_p)
print('p sd', sd_p)
print('u mean', mean_u)
print('u sd', sd_u)

In [None]:
# Gaussian Naive Bayes trained and evaluated on the LDA-projected features.
lda_labels_predicted, lda_prob = model(lda_train_img[:, :-1], lda_train_img[:, -1], lda_test_img[:, :-1])

In [None]:
# Overall test accuracy of the LDA + Naive Bayes pipeline.
lda_acc = metrics.accuracy_score(lda_test_img[:, -1], lda_labels_predicted)
print('Accuracy: ', lda_acc)

# Confusion matrix with both axes ordered [0, 1] (0 = 'Parasitized' per read_images).
lda_cm = metrics.confusion_matrix(lda_test_img[:, -1], lda_labels_predicted, labels=[0, 1])
print('Confusion Matrix:')
print(lda_cm)

# Render the matrix with the project helper.
# NOTE(review): presumably writes the figure under 'output/' — confirm in PlotTools.
PlotTools.confusion_matrix(lda_cm, ['Parasitized', 'Uninfected'], title='', filename='Confusion Matrix LDA NB', 
                           path='output/', figsize=(6,6))

In [None]:
# 'Parasitized' (label 0) is the positive class: row 0 of the 2x2 matrix holds
# its true samples, column 0 the samples predicted as it.
recall_pl = lda_cm[0, 0] / lda_cm[0].sum()
precision_pl = lda_cm[0, 0] / lda_cm[:, 0].sum()
print('precision', precision_pl, 'recall', recall_pl)

In [None]:
# F1 is the harmonic mean of precision and recall for the positive class.
f1_pl = 2 / ((1 / precision_pl) + (1 / recall_pl))
print('f1', f1_pl)

In [None]:
# ROC for the 'Parasitized' class (label 0). lda_prob[:, 0] scores label 0, so
# that label must be the positive one; with the default (label 1 positive) the
# curve is mirrored and the AUC becomes one minus the real value.
lda_fpr0, lda_tpr0, lda_thresholds0 = metrics.roc_curve(lda_test_img[:, -1], lda_prob[:, 0], pos_label=0)
lda_area0 = metrics.roc_auc_score((lda_test_img[:, -1] == 0).astype(int), lda_prob[:, 0])
plot_roc(lda_fpr0, lda_tpr0, 'Parasitized', lda_area0)

In [None]:
# ROC for the 'Uninfected' class (label 1), scored with the label-1
# probability column; label 1 is the default positive label here.
lda_fpr1, lda_tpr1, lda_thresholds1 = metrics.roc_curve(lda_test_img[:, -1], lda_prob[:, 1])
lda_area1 = metrics.roc_auc_score(lda_test_img[:, -1], lda_prob[:, 1])
plot_roc(lda_fpr1, lda_tpr1, 'Uninfected', lda_area1)

In [None]:
# Both class-wise ROC curves of the LDA model on one figure.
plot_combine_roc(lda_test_img, lda_prob)