## Import libraries for ensemble techniques

In [59]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    fbeta_score,
    log_loss,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split

## Load valid_df and test_df

In [60]:
# Validation split: maps each image_id to its label.
VALID_CSV_PATH = "/home/samic_yongjian/temp/SC4000_Machine_Learning/data/valid_df.csv"
valid_df = pd.read_csv(VALID_CSV_PATH)
# Preview the first rows to confirm the expected columns are present.
valid_df.head(5)

Unnamed: 0,image_id,labels
0,1611662564_noise_4.jpg,4
1,801551318_hue3_4.jpg,4
2,train-cbb-434.jpg,0
3,3078964330_hue2_4.jpg,4
4,175989862_hue4_2.jpg,2


### TEST SET !!

In [61]:
# Test split; loaded here but not used by the cells visible in this notebook section.
TEST_CSV_PATH = "/home/samic_yongjian/temp/SC4000_Machine_Learning/data/test_df.csv"
test_df = pd.read_csv(TEST_CSV_PATH)

## Load csv files

In [62]:
# Per-model probability files that make up the ensemble.
# Order matters: the first file defines the row order of the merged frame and keeps the
# plain `prob_class_<k>` column names; the file at position N (1-based) gets the column
# suffix `_model<N>` when it is merged in.
# Commented-out entries are models currently excluded from the ensemble.
csv_files = [
    "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/resnext/20241103_232814/best_validation_probabilities.csv",
    "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/vit/20241103_190631/validation_probabilities.csv",
    # "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/vit_v2/20241104_164221/validation_probabilities.csv",
    # "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/resnet/20241104_010113/best_validation_probabilities.csv",
    # "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/inception/20241104_124742/best_validation_probabilities.csv",
    # "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/efficientnetb4/20241103_215449/best_validation_probabilities.csv",
    # "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/efficientnetb4_v2/20241104_004159/best_validation_probabilities.csv",
    # "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/efficientnetb0/20241104_125022/best_validation_probabilities.csv",
    # "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/cnn/20241104_143543/best_validation_probabilities.csv",
    # "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/alexnet/20241104_143840/best_validation_probabilities.csv",
    "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/cropnet/20241109_181431/validation_inference_results.csv",
]

In [63]:
# Number of base models in the ensemble: one per active entry in csv_files.
model_count = len(csv_files)

In [64]:
# Merge every model's per-class probabilities into one wide frame keyed by image_id.
# The first file fixes the row order; the true labels are aligned to that order so that
# true_val_labels[k] always belongs to merged_df.iloc[k].
if not csv_files:
    raise ValueError("csv_files is empty: at least one probability file is required")

first_file, *other_files = csv_files

merged_df = pd.read_csv(first_file)
# Move the image_id column (expected to be the last column of the first file) to the front.
image_name = merged_df.columns[-1]
merged_df = merged_df[[image_name] + merged_df.columns[:-1].tolist()]

# Look up the true label of every image, in the row order of the first file.
# validate='many_to_one' fails fast if valid_df lists an image_id more than once, which
# would otherwise silently duplicate rows and misalign labels with probabilities.
valid_df = merged_df[['image_id']].merge(
    valid_df[['image_id', 'labels']],
    on='image_id',
    how='left',
    validate='many_to_one',
)
if valid_df['labels'].isna().any():
    raise ValueError("Some images in the first probability file have no label in valid_df")
true_val_labels = valid_df['labels'].values

# Remaining models: the file at 1-based position N contributes columns suffixed `_model<N>`
# (the suffix is applied to column names that collide with those already in merged_df).
for model_number, file in enumerate(other_files, start=2):
    df = pd.read_csv(file)
    merged_df = merged_df.merge(
        df,
        on='image_id',
        how='left',
        suffixes=('', '_model' + str(model_number)),
        validate='many_to_one',  # a repeated image_id in `df` would duplicate rows
    )

# A left merge leaves NaN where a model has no prediction for an image; surface that here
# instead of letting it be skipped by the averaging or rejected later by the stacker.
if merged_df.isna().any().any():
    raise ValueError("merged_df contains missing probabilities: a model file lacks some image_ids")

## Initialize Eval Table for Ensembles

In [65]:
# Initialize an empty DataFrame with specific column names
columns = ['Method','Log Loss', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score']
val_ensemble_df = pd.DataFrame(columns=columns)
test_ensemble_df = pd.DataFrame(columns=columns)

In [66]:
def add_ensemble (ensemble_df, true_labels, pred_labels, pred_probs, method, mode, labels=None):
    """Compute classification metrics for one ensemble, print them and record them.

    Args:
        ensemble_df: Results table with the columns
            [Method, Log Loss, Accuracy, Precision, Recall, F1 Score, F2 Score].
            It is modified in place: one row is written at label ``len(ensemble_df)``.
        true_labels: Ground-truth class labels, one per sample.
        pred_labels: Predicted class labels, one per sample.
        pred_probs: Predicted class probabilities, one row per sample, used for the log loss.
        method: Name of the ensemble method; stored in the ``Method`` column.
        mode: Name of the evaluated split (e.g. "Val"); only used in the printed header.
        labels: Optional full list of class labels, forwarded to ``log_loss``. Pass it when
            ``true_labels`` may not contain every class, otherwise ``log_loss`` cannot match
            the probability columns to classes. Defaults to None (infer from ``true_labels``).

    Returns:
        The same ``ensemble_df`` object, with the new metrics row added.
    """
    logloss = log_loss(true_labels, pred_probs, labels=labels)
    accuracy = accuracy_score(true_labels, pred_labels)
    # Weighted averaging: per-class scores are weighted by class support.
    precision = precision_score(true_labels, pred_labels, average="weighted")
    recall = recall_score(true_labels, pred_labels, average="weighted")
    f1 = f1_score(true_labels, pred_labels, average="weighted")
    f2 = fbeta_score(true_labels, pred_labels, beta=2, average="weighted")

    print(f"Metrics for {method} on {mode} set")
    print(f"Log Loss: {logloss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"F2 Score: {f2:.4f}")

    # Value order must match the table's column order.
    ensemble_df.loc[len(ensemble_df)] = [method, logloss, accuracy, precision, recall, f1, f2]
    return ensemble_df

## Load softmax probabilities
Note: Every CSV file is assumed to contain an `image_id` column, and in the first file it must be the last column. The true labels are looked up only once, in the row order of the first file.

## Ensemble 1 - Soft Voting: Averaging

In [67]:
# Soft voting: for every class, take the unweighted mean of that class's probability
# across all models in merged_df.
average_df = pd.DataFrame()
average_df['image_id'] = merged_df['image_id']

# Column suffix per model: none for the first model, `_model2`, `_model3`, ... for the rest.
model_suffixes = [''] + [f'_model{model_number}' for model_number in range(2, model_count + 1)]
for class_idx in range(5):
    member_columns = [f'prob_class_{class_idx}{suffix}' for suffix in model_suffixes]
    average_df[f'avg_prob_class{class_idx}'] = merged_df[member_columns].mean(axis=1)

# NOTE(review): the file name says "test", but merged_df is built from the validation
# probability files listed in csv_files - confirm the intended name.
average_df.to_csv('test_average.csv', index=False)

In [68]:
# Get predictions for logloss 
pred_val_probs = np.array([average_df.iloc[i, 1:].values for i in range(len(average_df))])

In [69]:
# Get the column index (integer) of the maximum value in each row, starting from the second column
pred_val_labels = average_df.values[:, 1:].argmax(axis=1) 

In [70]:
# Record the soft-voting (averaging) ensemble's metrics in the validation results table.
val_ensemble_df = add_ensemble(
    ensemble_df=val_ensemble_df,
    true_labels=true_val_labels,
    pred_labels=pred_val_labels,
    pred_probs=pred_val_probs,
    method="Averaging",
    mode="Val",
)

Metrics for Averaging on Val set
Log Loss: 0.4024
Accuracy: 0.8788
Precision: 0.8800
Recall: 0.8788
F1 Score: 0.8790
F2 Score: 0.8788




In [71]:
# Display the validation results table collected so far (one row per ensemble method).
val_ensemble_df

Unnamed: 0,Method,Log Loss,Accuracy,Precision,Recall,F1 Score,F2 Score
0,Averaging,0.402409,0.878784,0.880021,0.878784,0.879016,0.878783


**TODO:** Remember to add the same evaluation for the test set.

## Ensemble 2 - Stacking Classifier

In [72]:
# Inspect the merged frame: image_id followed by five prob_class_* columns per model
# (the first model's columns are unsuffixed, later ones carry `_model<N>`).
merged_df

Unnamed: 0,image_id,prob_class_0,prob_class_1,prob_class_2,prob_class_3,prob_class_4,prob_class_0_model2,prob_class_1_model2,prob_class_2_model2,prob_class_3_model2,prob_class_4_model2,prob_class_0_model3,prob_class_1_model3,prob_class_2_model3,prob_class_3_model3,prob_class_4_model3
0,2636545595_shadow_2.jpg,2.518566e-08,2.161615e-06,9.999973e-01,4.480139e-07,6.602512e-08,5.003601e-04,0.000053,0.136985,0.862447,0.000014,0.000403,0.072613,0.825092,0.057889,0.000729
1,1539638666_shear_3.jpg,9.983594e-07,3.671242e-05,3.376180e-03,9.965398e-01,4.627433e-05,2.218946e-05,0.000015,0.000021,0.999898,0.000044,0.000018,0.000169,0.003188,0.995700,0.000925
2,498197766_new_shadow_3.jpg,1.807987e-06,1.445904e-06,3.014859e-02,9.698479e-01,2.539879e-07,3.818872e-04,0.000247,0.000445,0.998914,0.000012,0.007025,0.335387,0.325038,0.326894,0.005449
3,849182498_hue1_4.jpg,2.336958e-06,1.141647e-05,1.462430e-02,3.538480e-03,9.818234e-01,9.783087e-07,0.000004,0.000007,0.000152,0.999836,0.000066,0.000061,0.187455,0.464244,0.347466
4,2270851426_hue1_0.jpg,9.999937e-01,1.020900e-07,2.840122e-08,6.042436e-06,1.460095e-07,3.407007e-01,0.001200,0.001126,0.656442,0.000530,0.002040,0.041629,0.016943,0.854036,0.084875
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14795,1080407872_new_shadow_0.jpg,9.464185e-01,5.353435e-06,9.840139e-06,3.887190e-06,5.356236e-02,5.896605e-01,0.000046,0.000166,0.000446,0.409682,0.174845,0.004152,0.000912,0.019724,0.798015
14796,3009386931_hue3_4.jpg,1.065973e-05,7.202194e-05,7.557277e-03,3.231623e-05,9.923277e-01,2.561129e-05,0.000044,0.006516,0.002270,0.991145,0.000598,0.000613,0.001398,0.002532,0.994605
14797,2086436188_hue4_3.jpg,1.675933e-06,1.938961e-06,2.248561e-06,9.999917e-01,2.480941e-06,1.498413e-05,0.000040,0.000003,0.999925,0.000017,0.000124,0.000053,0.012093,0.936466,0.051194
14798,train-cbsd-967.jpg,4.106674e-02,9.497469e-01,2.603798e-05,6.029042e-03,3.131305e-03,1.727101e-03,0.993556,0.000036,0.000599,0.004082,0.296304,0.077822,0.001495,0.620820,0.003486


In [73]:
#load the arrays of softmax probabilities 
model_probs = []
for j in range (model_count): 
    if j == 0: 
        column_names = [f'prob_class_{i}' for i in range(5)]
    else: 
        column_names = [f'prob_class_{i}_model{j+1}' for i in range(5)]
    model_prob = merged_df[column_names].values
    model_probs.append(model_prob)

In [74]:
# Stack all model probabilities horizontally
X_stacked = np.hstack(model_probs)

In [75]:
# Initialize and fit the logistic regression model
logistic_classifier = LogisticRegression(solver='lbfgs', max_iter=1000)
logistic_classifier.fit(X_stacked, true_val_labels)
# Make predictions on the validation set
log_val_predlabel = logistic_classifier.predict(X_stacked)
log_val_predprob = logistic_classifier.predict_proba(X_stacked)


In [76]:
# Record the logistic-regression stacking ensemble's metrics in the validation results table.
val_ensemble_df = add_ensemble(
    ensemble_df=val_ensemble_df,
    true_labels=true_val_labels,
    pred_labels=log_val_predlabel,
    pred_probs=log_val_predprob,
    method="LogReg",
    mode="Val",
)

Metrics for LogReg on Val set
Log Loss: 0.3384
Accuracy: 0.8960
Precision: 0.8960
Recall: 0.8960
F1 Score: 0.8959
F2 Score: 0.8960


In [77]:
# Display the validation results table to compare the ensemble methods side by side.
val_ensemble_df

Unnamed: 0,Method,Log Loss,Accuracy,Precision,Recall,F1 Score,F2 Score
0,Averaging,0.402409,0.878784,0.880021,0.878784,0.879016,0.878783
1,LogReg,0.338423,0.896014,0.895976,0.896014,0.89594,0.895971


**TODO:** Remember to add the same evaluation for the test set.