In [None]:
%matplotlib inline
import matplotlib.image as mpimg
from statsmodels.api import Logit
import glob
import numpy as np
from numpy.random import default_rng
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.metrics import roc_curve, precision_recall_curve, auc
import matplotlib.pyplot as plt
import cv2
import joblib

In [None]:
# Collect the image file paths of every split/class folder in one pass.
# The patterns are listed in the same order as the names they unpack into.
(
    paraTrain, noParaTrain,
    paraTest, noParaTest,
    paraValid, noParaValid,
) = (
    glob.glob(folderPattern)
    for folderPattern in (
        "Data/training_set/Parasitized/*",
        "Data/training_set/Uninfected/*",
        "Data/testing_set/Parasitized/*",
        "Data/testing_set/Uninfected/*",
        "Data/validation_set/Parasitized/*",
        "Data/validation_set/Uninfected/*",
    )
)

In [None]:
# Sampling configuration: number of images drawn PER CLASS for each split.
SEED = 42  # fixed seed so the same images are drawn on every Restart & Run All
trainSize = 400
validSize = 80
testSize = 80
rng = default_rng(SEED)


def _sample_images(paths, size):
    """Draw `size` distinct files from `paths` and read them as images.

    Args:
        paths: list of image file paths for one split/class.
        size: number of images to draw (without replacement).

    Returns:
        (indexes, images): the drawn positions within the sorted path list
        and the arrays returned by ``mpimg.imread`` for those files.

    Raises:
        ValueError: raised by ``rng.choice`` when ``size`` exceeds
            ``len(paths)``.
    """
    # glob returns files in arbitrary order; sort so that a given seed
    # always selects the same files.
    ordered = sorted(paths)
    indexes = rng.choice(len(ordered), size, replace=False)
    return indexes, [mpimg.imread(ordered[i]) for i in indexes]


# NOTE: as before, the path lists (paraTrain, noParaTrain, ...) are rebound to
# image lists below, so the glob cell must be re-run before this cell is
# executed a second time.

# Training split: positives (parasitized) first, then negatives.
paraTrainIndexes, paraTrain = _sample_images(paraTrain, trainSize)
noParaTrainIndexes, noParaTrain = _sample_images(noParaTrain, trainSize)
trainData = paraTrain + noParaTrain
trainLabels = [True]*trainSize + [False]*trainSize

# Validation split.
paraValidIndexes, paraValid = _sample_images(paraValid, validSize)
noParaValidIndexes, noParaValid = _sample_images(noParaValid, validSize)
validData = paraValid + noParaValid
validLabels = [True]*validSize + [False]*validSize

# Test split.
paraTestIndexes, paraTest = _sample_images(paraTest, testSize)
noParaTestIndexes, noParaTest = _sample_images(noParaTest, testSize)
testData = paraTest + noParaTest
testLabels = [True]*testSize + [False]*testSize

In [None]:
def _intensity_histogram(image, bins=20):
    """Return the normalised intensity histogram of one image.

    The image array is flattened (all values pooled together), exact
    zeros are discarded, and the bin counts are divided by the number of
    retained values, so the returned vector sums to 1.

    Args:
        image: array as loaded for one image.
        bins: number of histogram bins (20 reproduces the original features).

    Returns:
        1-D numpy array of length `bins` holding the fraction of retained
        values that fall into each bin.
    """
    values = image.flatten()
    # Zeros are excluded — presumably the black background around the cell;
    # confirm against the dataset.
    values = values[values != 0]
    # NOTE(review): no `range=` is passed, so np.histogram derives the bin
    # edges from each image's own min/max; bins are therefore not aligned
    # across images. Confirm this is intended before changing it.
    counts, _ = np.histogram(values, bins=bins)
    return counts / float(len(values))


# One feature vector per image, in the same order as the *Data lists.
trainHist = [_intensity_histogram(image) for image in trainData]
validHist = [_intensity_histogram(image) for image in validData]
testHist = [_intensity_histogram(image) for image in testData]

In [None]:
# Build the statsmodels logistic regression on the training histograms and
# fit it; `logitModel` holds the object returned by .fit().
logitSpec = Logit(trainLabels, trainHist)
logitModel = logitSpec.fit()

# Predicted scores for the validation histograms.
pscore_logit = logitModel.predict(validHist)

# Persist the fitted object to disk.
joblib.dump(logitModel, 'Models/logit_res.pkl')

# Last expression of the cell: display the fit summary.
logitModel.summary()

In [None]:
# Model selection for KNN: try every odd k in 1..21 and keep the classifier
# with the highest F1 score on the validation split.
pscore_knn = None
knnModel = None
# Start below any attainable F1 (F1 >= 0) so the first candidate is always
# accepted and knnModel is never left as None.
max_fscore_knn = -1.0
for k in range(1, 22, 2):
    knnClas = KNeighborsClassifier(n_neighbors=k)
    knnClas.fit(trainHist, trainLabels)
    temp_pscore_knn = knnClas.predict_proba(validHist)
    # The probabilities are for validHist, so they must be scored against
    # validLabels. Column 1 is the probability of the True class.
    fscore_knn = f1_score(validLabels, temp_pscore_knn[:, 1] > 0.5)
    # Compare against the running best (the same variable that is updated);
    # on ties the smaller k is kept.
    if fscore_knn > max_fscore_knn:
        max_fscore_knn = fscore_knn
        pscore_knn = temp_pscore_knn
        knnModel = knnClas
joblib.dump(knnModel, 'Models/knnClas.pkl')

In [None]:
# Random forest hyperparameters (edit here to experiment).
criterion = 'gini'  # 'gini' or 'entropy'
max_features = 'sqrt'  # 'sqrt', 'log2' or None
max_depth = None  # None: depth is not capped by this parameter
random_state = 42  # fixed so the fitted forest and its scores are repeatable

randomForestClas = RandomForestClassifier(
    n_estimators=100,
    criterion=criterion,
    max_features=max_features,
    max_depth=max_depth,
    random_state=random_state,
)
randomForestClas.fit(trainHist, trainLabels)

# Class probabilities for the validation histograms (column 1 = True class).
pscore_forest = randomForestClas.predict_proba(validHist)
joblib.dump(randomForestClas, 'Models/randomForestClas.pkl')

In [None]:
# ROC and precision-recall evaluation of the three models.
# pscore_logit, pscore_forest and pscore_knn were all predicted from
# validHist, so they are scored against validLabels (not testLabels).
evalLabels = validLabels

fig, axs = plt.subplots(3, 2, figsize=(10, 15))

# generate a no skill prediction
ns_probs = [0] * len(evalLabels)
# Fraction of positives: the precision of a classifier with no skill.
no_skill = np.count_nonzero(evalLabels) / len(evalLabels)

# calculate curves
ns_fpr, ns_tpr, _ = roc_curve(evalLabels, ns_probs)
logit_fpr, logit_tpr, _ = roc_curve(evalLabels, pscore_logit)
forest_fpr, forest_tpr, _ = roc_curve(evalLabels, pscore_forest[:, 1])
knn_fpr, knn_tpr, _ = roc_curve(evalLabels, pscore_knn[:, 1])

ns_precision, ns_recall, _ = precision_recall_curve(evalLabels, ns_probs)
logit_precision, logit_recall, _ = precision_recall_curve(
    evalLabels, pscore_logit)
forest_precision, forest_recall, _ = precision_recall_curve(
    evalLabels, pscore_forest[:, 1])
knn_precision, knn_recall, _ = precision_recall_curve(
    evalLabels, pscore_knn[:, 1])

# calculate scores
ns_auc_roc = auc(ns_fpr, ns_tpr)
logit_auc_roc = auc(logit_fpr, logit_tpr)
forest_auc_roc = auc(forest_fpr, forest_tpr)
knn_auc_roc = auc(knn_fpr, knn_tpr)

# The no-skill PR curve is the horizontal line at the positive rate.
ns_auc_pr = auc([0, 1], [no_skill, no_skill])
logit_auc_pr = auc(logit_recall, logit_precision)
forest_auc_pr = auc(forest_recall, forest_precision)
knn_auc_pr = auc(knn_recall, knn_precision)

# summarize scores
print("-------------------->ROC<--------------------")
print('No Skill: ROC AUC=%.3f' % (ns_auc_roc))
print('Logistic: ROC AUC=%.3f' % (logit_auc_roc))
print('Random Forest: ROC AUC=%.3f' % (forest_auc_roc))
print('KNN: ROC AUC=%.3f' % (knn_auc_roc))

print("-------------------->Precision-Recall<--------------------")
print('No Skill: Precision-Recall AUC=%.3f' % (ns_auc_pr))
print('Logistic: Precision-Recall AUC=%.3f' % (logit_auc_pr))
print('Random Forest: Precision-Recall AUC=%.3f' % (forest_auc_pr))
print('KNN: Precision-Recall AUC=%.3f' % (knn_auc_pr))

# One figure row per model: ROC on the left, precision-recall on the right.
# Each entry: (legend label, title suffix, fpr, tpr, recall, precision).
model_curves = [
    ('Logistic', "Logit",
     logit_fpr, logit_tpr, logit_recall, logit_precision),
    ('Forest', "Random Forest",
     forest_fpr, forest_tpr, forest_recall, forest_precision),
    ('KNN', "KNN",
     knn_fpr, knn_tpr, knn_recall, knn_precision),
]
for (roc_ax, pr_ax), curve in zip(axs, model_curves):
    legend_label, title_suffix, fpr, tpr, recall, precision = curve

    # plot the curves for the model next to the no-skill baseline
    roc_ax.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    roc_ax.plot(fpr, tpr, marker='.', label=legend_label)
    pr_ax.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
    pr_ax.plot(recall, precision, marker='.', label=legend_label)

    # axis labels
    plt.setp(roc_ax, xlabel='False Positive Rate',
             ylabel='True Positive Rate', title="ROC Curve " + title_suffix)
    plt.setp(pr_ax, xlabel='Recall',
             ylabel='Precision', title="PR Curve " + title_suffix)

    # show the legend
    roc_ax.legend()
    pr_ax.legend()
