## Import libraries for ensemble techniques

In [1]:
import pandas as pd
import numpy as np 

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    fbeta_score,
    log_loss,
    precision_score,
    recall_score,
    f1_score,
)

## Load valid_df and test_df

In [2]:
valid_df = pd.read_csv(
    "/home/samic_yongjian/temp/SC4000_Machine_Learning/data/valid_df.csv"
)
valid_df.head(5)

Unnamed: 0,image_id,labels
0,1611662564_noise_4.jpg,4
1,801551318_hue3_4.jpg,4
2,train-cbb-434.jpg,0
3,3078964330_hue2_4.jpg,4
4,175989862_hue4_2.jpg,2


### TEST SET !!

In [3]:
test_df = pd.read_csv(
    "/home/samic_yongjian/temp/SC4000_Machine_Learning/data/test_df.csv"
)

## Load csv files

In [20]:
csv_files = [
    # "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/vit_v2/20241104_164221/validation_probabilities.csv",
    "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/vit/20241103_190631/validation_probabilities.csv",
    "/home/samic_yongjian/temp/SC4000_Machine_Learning/output/resnext/20241103_232814/best_validation_probabilities.csv",
]

In [21]:
model_count = len(csv_files)

In [22]:
for i,file in enumerate(csv_files): 
    if i == 0: 
        merged_df = pd.read_csv(file)
        #move the image_id column to the first
        image_name = merged_df.columns[-1]
        merged_df = merged_df[[image_name] + merged_df.columns[:-1].tolist()]
        #sort the order of validation true labels and get the labels
        valid_df = merged_df.merge(valid_df, on = 'image_id', how = 'left')
        valid_df = valid_df[['image_id', 'labels']]
        true_val_labels = valid_df['labels'].values
    else: 
        df = pd.read_csv(file)
        merged_df = merged_df.merge(df, on = 'image_id', how = 'left', suffixes = ('', '_model'+str(i+1)))

## Initialize Eval Table for Ensembles

In [23]:
# Initialize an empty DataFrame with specific column names
columns = ['Method','Log Loss', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'F2 Score']
val_ensemble_df = pd.DataFrame(columns=columns)
test_ensemble_df = pd.DataFrame(columns=columns)

In [24]:
def add_ensemble (ensemble_df, true_labels, pred_labels, pred_probs, method, mode):
    logloss = log_loss(true_labels, pred_probs)
    accuracy = accuracy_score(true_labels, pred_labels)
    precision = precision_score(true_labels, pred_labels, average="weighted")
    recall = recall_score(true_labels, pred_labels, average="weighted")
    f1 = f1_score(true_labels, pred_labels, average="weighted")
    f2 = fbeta_score(true_labels, pred_labels, beta=2, average="weighted")

    print(f"Metrics for {method} on {mode} set")
    print(f"Log Loss: {logloss:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"F2 Score: {f2:.4f}")
    
    ensemble_df.loc[len(ensemble_df)] = [method, logloss, accuracy, precision, recall, f1, f2]
    return ensemble_df

## Load softmax probabilities
Note: Assuming that all csv files contain a last column stating the image_ids. Here, we only extract the true labels from the first file

## Ensemble 1 - Soft Voting: Averaging

In [25]:
average_df = pd.DataFrame()

average_df['image_id'] = merged_df['image_id']
average_df['avg_prob_class0'] = merged_df[['prob_class_0'] + [f'prob_class_0_model{i+2}' for i in range(model_count - 1)]].mean(axis=1)
average_df['avg_prob_class1'] = merged_df[['prob_class_1'] + [f'prob_class_1_model{i+2}' for i in range(model_count - 1)]].mean(axis=1)
average_df['avg_prob_class2'] = merged_df[['prob_class_2'] + [f'prob_class_2_model{i+2}' for i in range(model_count - 1)]].mean(axis=1)
average_df['avg_prob_class3'] = merged_df[['prob_class_3'] + [f'prob_class_3_model{i+2}' for i in range(model_count - 1)]].mean(axis=1)
average_df['avg_prob_class4'] = merged_df[['prob_class_4'] + [f'prob_class_4_model{i+2}' for i in range(model_count - 1)]].mean(axis=1)

average_df.to_csv('test_average.csv', index=False)

In [26]:
# Get predictions for logloss 
pred_val_probs = np.array([average_df.iloc[i, 1:].values for i in range(len(average_df))])

In [27]:
# Get the column index (integer) of the maximum value in each row, starting from the second column
pred_val_labels = average_df.values[:, 1:].argmax(axis=1) 

In [28]:
val_ensemble_df = add_ensemble(val_ensemble_df, true_val_labels, pred_val_labels, pred_val_probs, "Averaging", "Val")

Metrics for Averaging on Val set
Log Loss: 0.3616
Accuracy: 0.8895
Precision: 0.8896
Recall: 0.8895
F1 Score: 0.8894
F2 Score: 0.8894




In [29]:
val_ensemble_df

Unnamed: 0,Method,Log Loss,Accuracy,Precision,Recall,F1 Score,F2 Score
0,Averaging,0.361553,0.889527,0.889641,0.889527,0.889418,0.889443


Remember add for test set. 

## Ensemble 2 - Stacking Classifier

In [30]:
merged_df

Unnamed: 0,image_id,prob_class_0,prob_class_1,prob_class_2,prob_class_3,prob_class_4,prob_class_0_model2,prob_class_1_model2,prob_class_2_model2,prob_class_3_model2,prob_class_4_model2
0,1611662564_noise_4.jpg,0.010940,0.000916,0.000009,0.000818,9.873169e-01,0.000495,0.001323,0.000080,0.000052,0.998050
1,801551318_hue3_4.jpg,0.005787,0.020777,0.001658,0.000371,9.714059e-01,0.000383,0.015886,0.036767,0.052547,0.894417
2,train-cbb-434.jpg,0.998480,0.001054,0.000334,0.000131,7.728260e-07,0.961509,0.037866,0.000372,0.000157,0.000096
3,3078964330_hue2_4.jpg,0.000208,0.007819,0.007128,0.001396,9.834493e-01,0.014867,0.020336,0.000385,0.000038,0.964374
4,175989862_hue4_2.jpg,0.001751,0.002175,0.980255,0.012771,3.048500e-03,0.000245,0.000017,0.999523,0.000211,0.000005
...,...,...,...,...,...,...,...,...,...,...,...
14795,3353595234_hue2_2.jpg,0.000005,0.000011,0.999971,0.000005,8.161075e-06,0.000004,0.000181,0.999460,0.000265,0.000091
14796,train-cmd-928_shear_3.jpg,0.000020,0.000865,0.000025,0.999082,8.746736e-06,0.000001,0.000041,0.000048,0.999823,0.000088
14797,471996434_hue4_4.jpg,0.004226,0.985866,0.000853,0.003628,5.427785e-03,0.000265,0.004871,0.982910,0.002376,0.009577
14798,3939876859_new_shadow_0.jpg,0.999825,0.000009,0.000017,0.000116,3.443130e-05,0.546116,0.006218,0.033348,0.016470,0.397849


In [31]:
#load the arrays of softmax probabilities 
model_probs = []
for j in range (model_count): 
    if j == 0: 
        column_names = [f'prob_class_{i}' for i in range(5)]
    else: 
        column_names = [f'prob_class_{i}_model{j+1}' for i in range(5)]
    model_prob = merged_df[column_names].values
    model_probs.append(model_prob)

In [32]:
# Stack all model probabilities horizontally
X_stacked = np.hstack(model_probs)

In [33]:
# Initialize and fit the logistic regression model
logistic_classifier = LogisticRegression(solver='lbfgs', max_iter=1000)
logistic_classifier.fit(X_stacked, true_val_labels)
# Make predictions on the validation set
log_val_predlabel = logistic_classifier.predict(X_stacked)
log_val_predprob = logistic_classifier.predict_proba(X_stacked)


In [34]:
val_ensemble_df = add_ensemble(val_ensemble_df, true_val_labels, log_val_predlabel, log_val_predprob, "LogReg", "Val")

Metrics for LogReg on Val set
Log Loss: 0.3507
Accuracy: 0.8936
Precision: 0.8936
Recall: 0.8936
F1 Score: 0.8935
F2 Score: 0.8936


In [35]:
val_ensemble_df

Unnamed: 0,Method,Log Loss,Accuracy,Precision,Recall,F1 Score,F2 Score
0,Averaging,0.361553,0.889527,0.889641,0.889527,0.889418,0.889443
1,LogReg,0.350727,0.893649,0.893632,0.893649,0.893544,0.893584


Remember add for test set. 