# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, matthews_corrcoef

# Building the ensemble

In [2]:
predictions_model1 = pd.read_csv('../data/predictions_model_visit1.csv')
predictions_model2 = pd.read_csv('../data/predictions_model_visit12.csv')
predictions_model3 = pd.read_csv('../data/predictions_model_visit123.csv')

In [4]:
# Merge predictions from all three models on ID
ensemble_df = predictions_model1.merge(predictions_model2, on='ID', suffixes=('_model1', '_model2')) \
                                .merge(predictions_model3, on='ID')

# Rename columns for clarity 
ensemble_df.rename(columns={
    'Predicted_Label_model1': 'Pred_Label_Model1',
    'Predicted_Label_model2': 'Pred_Label_Model2',
    'Predicted_Label': 'Pred_Label_Model3',
    'True_Label': 'True_Label'
}, inplace=True)


In [6]:
# Perform majority voting
ensemble_df['Ensemble_Prediction'] = np.apply_along_axis(
    lambda x: np.bincount(x).argmax(), axis=1,
    arr=ensemble_df[['Pred_Label_Model1', 'Pred_Label_Model2', 'Pred_Label_Model3']].values
)

In [8]:
# Keep only the necessary columns
ensemble_df_cleaned = ensemble_df[['ID', 'Pred_Label_Model1', 'Pred_Label_Model2', 'Pred_Label_Model3', 'Ensemble_Prediction', 'True_Label']]

In [10]:
# Generate the classification report
classification_report_ensemble = classification_report(
    ensemble_df_cleaned['True_Label'], 
    ensemble_df_cleaned['Ensemble_Prediction']
)
print("Classification Report for Ensemble:")
print(classification_report_ensemble)

# Calculate the Matthews Correlation Coefficient (MCC)
mcc_ensemble = matthews_corrcoef(
    ensemble_df_cleaned['True_Label'], 
    ensemble_df_cleaned['Ensemble_Prediction']
)
print(f"Matthews Correlation Coefficient (MCC): {mcc_ensemble}")


Classification Report for Ensemble:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94        31
           1       0.88      0.88      0.88        17

    accuracy                           0.92        48
   macro avg       0.91      0.91      0.91        48
weighted avg       0.92      0.92      0.92        48

Matthews Correlation Coefficient (MCC): 0.8178368121442126


# High-certainty ensemble

In [11]:
# Add a new column for certainty-based predictions
def calculate_certainty(row):
    # Check if all predictions agree
    if row['Pred_Label_Model1'] == row['Pred_Label_Model2'] == row['Pred_Label_Model3']:
        return row['Pred_Label_Model1']  # Return the agreed prediction
    else:
        return -1  # Use -1 to indicate uncertainty

# Apply the function to calculate certainty-based predictions
ensemble_df_cleaned['Certainty_Prediction'] = ensemble_df_cleaned.apply(calculate_certainty, axis=1)

# Count certain and uncertain predictions
certain_count = (ensemble_df_cleaned['Certainty_Prediction'] != -1).sum()
uncertain_count = (ensemble_df_cleaned['Certainty_Prediction'] == -1).sum()

print(f"Number of Certain Predictions: {certain_count}")
print(f"Number of Uncertain Predictions: {uncertain_count}")


Number of Certain Predictions: 41
Number of Uncertain Predictions: 7


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ensemble_df_cleaned['Certainty_Prediction'] = ensemble_df_cleaned.apply(calculate_certainty, axis=1)


In [12]:
# Filter for certain predictions
certain_predictions = ensemble_df_cleaned[ensemble_df_cleaned['Certainty_Prediction'] != -1]

# Generate classification report for certain predictions
classification_report_certain = classification_report(
    certain_predictions['True_Label'], 
    certain_predictions['Certainty_Prediction']
)
print("Classification Report for Certain Predictions:")
print(classification_report_certain)

# Calculate MCC for certain predictions
mcc_certain = matthews_corrcoef(
    certain_predictions['True_Label'], 
    certain_predictions['Certainty_Prediction']
)
print(f"Matthews Correlation Coefficient (MCC) for Certain Predictions: {mcc_certain}")


Classification Report for Certain Predictions:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        29
           1       0.92      1.00      0.96        12

    accuracy                           0.98        41
   macro avg       0.96      0.98      0.97        41
weighted avg       0.98      0.98      0.98        41

Matthews Correlation Coefficient (MCC) for Certain Predictions: 0.9440586233651661


# Further investigating of the first-visit model (against the high-certainty ensemble)

How is the first model performing on the cases, what the ensemble considers as high certainty cases?

In [48]:
# Evaluate the performance of the first model on high-certainty predictions
classification_report_model1_high_certainty = classification_report(
    certain_predictions['True_Label'], 
    certain_predictions['Pred_Label_Model1']
)
print("Classification Report for Model 1 on High-Certainty Predictions:")
print(classification_report_model1_high_certainty)

# Calculate Matthews Correlation Coefficient (MCC) for Model 1 on high-certainty predictions
mcc_model1_high_certainty = matthews_corrcoef(
    certain_predictions['True_Label'], 
    certain_predictions['Pred_Label_Model1']
)
print(f"Matthews Correlation Coefficient (MCC) for Model 1 on High-Certainty Predictions: {mcc_model1_high_certainty}")


Classification Report for Model 1 on High-Certainty Predictions:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98        29
           1       0.92      1.00      0.96        12

    accuracy                           0.98        41
   macro avg       0.96      0.98      0.97        41
weighted avg       0.98      0.98      0.98        41

Matthews Correlation Coefficient (MCC) for Model 1 on High-Certainty Predictions: 0.9440586233651661
