# Imports

In [1]:
import os
import pandas as pd
import keras
import tensorflow as tf
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix

# Modify sys.path 
project_root = '/Users/carlesferreres/Desktop/Carles/Empresas/KOA/Repos/aquagen-experimentation/'
os.chdir(project_root)

# Config

In [8]:
# Model config
exp_id = 'mult_rep2'
model_name = 'old/nn_model_mult_rep2_20240512.keras'
dir = f'data/input_data/exp_{exp_id}'

retrain = False

# Load data and model

In [3]:
# Load data
X_test = pd.read_csv(os.path.join(dir, 'X_test.csv'))
y_test = pd.read_csv(os.path.join(dir, 'y_test.csv'))['Class']

FileNotFoundError: [Errno 2] No such file or directory: 'data/input_data/exp_mult_rep2/X_test.csv'

In [4]:
# Test set distribution
y_test.value_counts()

Class
vharveyi        86
control         44
vangil          35
ahydrophila     18
asalmonicida    14
pdpiscicida     12
tmaritimum       4
Name: count, dtype: int64

In [5]:
# Get number of classes
target_names = np.unique(y_test)
num_classes = len(target_names)

# Standardize
scaler = StandardScaler()
X_test = scaler.fit_transform(X_test)

# Encode
encoder = LabelEncoder()
y_test = encoder.fit_transform(y_test)
y_test = to_categorical(y_test, num_classes=num_classes)

In [9]:
# Read model
model = keras.models.load_model(os.path.join('models', model_name))

# Model evaluation

## Metrics

In [7]:
# Make predictions and evaluate main metrics
y_pred = model.predict(X_test)
auc_prc = keras.metrics.AUC(curve='PR', name='auc_prc')(y_test, y_pred) 
roc_auc = keras.metrics.AUC(name='roc_auc')(y_test, y_pred) 

print("AUC-PRC:", auc_prc.numpy())
print("ROC-AUC:", roc_auc.numpy())

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
AUC-PRC: 0.5274954
ROC-AUC: 0.8205879


In [8]:
# Convert one-hot encoded test labels back to class indices for comparison
y_test_classes = np.argmax(y_test, axis=1)
y_pred_classes = np.argmax(y_pred, axis=1)

# Compute confusion matrix
cm = confusion_matrix(y_test_classes, y_pred_classes)
cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)
cm_df.columns = pd.MultiIndex.from_product([['predicted'], cm_df.columns])
cm_df.index = pd.MultiIndex.from_product([['actual'], cm_df.index])
cm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted,predicted,predicted,predicted,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ahydrophila,asalmonicida,control,pdpiscicida,tmaritimum,vangil,vharveyi
actual,ahydrophila,16,0,0,0,1,0,1
actual,asalmonicida,0,4,0,0,0,1,9
actual,control,0,0,34,0,0,0,10
actual,pdpiscicida,4,0,0,5,0,2,1
actual,tmaritimum,1,0,0,2,1,0,0
actual,vangil,5,2,0,5,0,15,8
actual,vharveyi,13,7,6,2,2,4,52


In [9]:
# Overall accuracy
n = cm_df.sum().sum()
TP = 0
for label in target_names:
    TP += cm_df.loc[[('actual', label)]][('predicted', label)].sum()
acc = 100*TP/n 
acc

59.624413145539904

In [10]:
# Precisions
precisions = []
for label in target_names:
    p = cm_df.loc[('actual', label), ('predicted', label)].sum() / cm_df.loc[:, ('predicted', label)].sum() * 100
    precisions.append(p)
precisions

[41.02564102564102,
 30.76923076923077,
 85.0,
 35.714285714285715,
 25.0,
 68.18181818181817,
 64.19753086419753]

In [11]:
# Fallouts
fallouts = []
for label in target_names:
    f = cm_df.loc[cm_df.index.get_level_values(1) != label].loc[:, ('predicted', label)].sum() / cm_df.loc[cm_df.index.get_level_values(1) != label].sum().sum() * 100
    fallouts.append(f)
fallouts

[11.794871794871794,
 4.522613065326634,
 3.5502958579881656,
 4.477611940298507,
 1.4354066985645932,
 3.932584269662921,
 22.83464566929134]

In [12]:
# Recalls
recalls = []
for label in target_names:
    r = cm_df.loc[('actual', label), ('predicted', label)].sum() / cm_df.loc[('actual', label), :].sum() * 100
    recalls.append(r)
recalls

[88.88888888888889,
 28.57142857142857,
 77.27272727272727,
 41.66666666666667,
 25.0,
 42.857142857142854,
 60.46511627906976]

In [13]:
# False Omission Rates
fors = []
for label in target_names:
    f = cm_df.loc[:, cm_df.columns.get_level_values(1) != label].loc[('actual', label), :].sum() / cm_df.loc[:, cm_df.columns.get_level_values(1) != label].sum().sum() * 100
    fors.append(f)
fors

[1.1494252873563218,
 5.0,
 5.780346820809249,
 3.5175879396984926,
 1.4354066985645932,
 10.471204188481675,
 25.757575757575758]

## Insights

In [14]:
for a in zip(range(1,len(target_names)+1), target_names):
    print(a[0],'-',a[1])

1 - ahydrophila
2 - asalmonicida
3 - control
4 - pdpiscicida
5 - tmaritimum
6 - vangil
7 - vharveyi


In [15]:
# Select pathogen
## USER INPUT ##
selection = 7

###
pathogen = target_names[selection - 1]
recall = recalls[selection-1]
fallout = fallouts[selection-1]
precision = precisions[selection-1]
_for = fors[selection-1]

In [16]:
print(f'The overall model accuracy is {acc:.2f}%')

The overall model accuracy is 59.62%


In [17]:
print(f'Given that there is {pathogen} in the sample, the model is {recall:.2f}% likely to detect it.')
print(f'Given that there is no {pathogen} in the sample, the model is {fallout:.2f}% likely to wrongly detect it.')

Given that there is vharveyi in the sample, the model is 60.47% likely to detect it.
Given that there is no vharveyi in the sample, the model is 22.83% likely to wrongly detect it.


In [18]:
print(f'Given that the model detected {pathogen}, the sample is {precision:.2f}% likely to have a {pathogen}.')
print(f'Given that the model did not detect {pathogen}, the sample is still {_for:.2f}% likely to have {pathogen}.')

Given that the model detected vharveyi, the sample is 64.20% likely to have a vharveyi.
Given that the model did not detect vharveyi, the sample is still 25.76% likely to have vharveyi.


In [19]:
lights = [[1,1,3],[2,2,4],[3,3,5]]

In [20]:
stage = [[0] * n for _ in range(3)]

In [21]:
print(stage)

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 