# DistilBERT Per-Class Performance Analysis

In [1]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split

print("Imports complete!")

Imports complete!


In [2]:
# Configuration
# Single project root from which every artifact path below is derived, so the
# project can be relocated by changing one value. Set the MDM_PROJECT_DIR
# environment variable to override it without editing the notebook; the default
# is the original machine-specific location.
PROJECT_DIR = os.environ.get("MDM_PROJECT_DIR", r"C:\Users\apara\Desktop\MDM")

# Paths are joined with an explicit backslash, as the rest of the notebook does
# when it builds file names from BERT_DIR.
BASE_DIR = f"{PROJECT_DIR}\\saved_models"
BERT_DIR = f"{BASE_DIR}\\distilBERT_4_epochs"
LABEL_ENCODER_PATH = f"{BASE_DIR}\\label_encoder.pkl"
FEATURES_PATH = f"{BASE_DIR}\\extracted_features.csv"
TRAIN_CSV_PATH = f"{PROJECT_DIR}\\train_none.csv"

# NOTE: this list is not in the same order as the loaded label encoder's
# `classes_` (printed when the encoder is loaded), so encoded label ids must be
# mapped through `le.classes_`, never through positions in this list.
MODELS = ["cohere-chat", "gpt4", "mistral-chat", "mpt-chat", "llama-chat"]
RANDOM_STATE = 5

In [3]:
# Restore the fitted label encoder used to turn class names into integer ids.
# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code; only load encoder files that come from a trusted source.
print("Loading label encoder...")
le = joblib.load(LABEL_ENCODER_PATH)
print(f"Classes: {list(le.classes_)}")

# Read the extracted-features table; only its 'split' and 'label' columns are
# needed here, to recover the ground-truth labels of the test rows.
print("\nLoading extracted features...")
features_df = pd.read_csv(FEATURES_PATH)

# Select the label column for the rows flagged as belonging to the test split.
is_test_row = features_df['split'] == 'test'
test_labels = features_df.loc[is_test_row, 'label']

# Encode the class names with the loaded encoder so they are comparable with
# the integer predictions used in the rest of the notebook.
y_test = le.transform(test_labels)
print(f"Test samples: {len(y_test)}")

Loading label encoder...
Classes: ['cohere-chat', 'gpt4', 'llama-chat', 'mistral-chat', 'mpt-chat']

Loading extracted features...


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Test samples: 42787


In [4]:
# Load DistilBERT predictions
print("\nLoading DistilBERT predictions...")
bert_preds = np.load(f"{BERT_DIR}\\bert_preds.npy")
# bert_probs is loaded alongside the predictions but is not used by the
# analysis cells in this notebook — presumably per-class probabilities; TODO confirm.
bert_probs = np.load(f"{BERT_DIR}\\bert_probs.npy")
print(f"Predictions loaded: {len(bert_preds)}")

# Fail fast with a clear message if the saved predictions do not line up with
# the test labels: every metric below compares the two arrays element by element.
# NOTE(review): this only checks the counts; it assumes the predictions were
# saved in the same row order as the test rows of the features file — confirm.
if len(bert_preds) != len(y_test):
    raise ValueError(
        f"Prediction/label count mismatch: {len(bert_preds)} predictions "
        f"vs {len(y_test)} test labels"
    )


Loading DistilBERT predictions...
Predictions loaded: 42787


In [5]:
# Overall Performance
# Headline metrics over the whole test set: plain accuracy plus F1 averaged
# two ways (unweighted across classes, and weighted by class support).
banner = "=" * 60
print(banner, "DISTILBERT (4 EPOCHS) - OVERALL PERFORMANCE", banner, sep="\n")

overall_accuracy = accuracy_score(y_test, bert_preds)
print(f"Accuracy: {overall_accuracy:.4f}")

macro_f1 = f1_score(y_test, bert_preds, average='macro')
print(f"Macro F1: {macro_f1:.4f}")

weighted_f1 = f1_score(y_test, bert_preds, average='weighted')
print(f"Weighted F1: {weighted_f1:.4f}")

DISTILBERT (4 EPOCHS) - OVERALL PERFORMANCE
Accuracy: 0.8978
Macro F1: 0.9017
Weighted F1: 0.8981


In [6]:
# Per-Class Performance
print("\n" + "="*60)
print("PER-CLASS PERFORMANCE")
print("="*60)

# Pass every encoded label id explicitly. Without `labels`, scikit-learn infers
# the label set from the data, and if any class is absent from both y_test and
# bert_preds the inferred set no longer matches `target_names` and the call
# raises. Encoded ids are the positions in `le.classes_`.
all_label_ids = np.arange(len(le.classes_))
print(
    classification_report(
        y_test,
        bert_preds,
        labels=all_label_ids,
        target_names=le.classes_,
        digits=4,
    )
)


PER-CLASS PERFORMANCE
              precision    recall  f1-score   support

 cohere-chat     0.9393    0.8326    0.8827      5348
        gpt4     0.9731    0.9274    0.9497      5348
  llama-chat     0.9242    0.9492    0.9365     10697
mistral-chat     0.8172    0.9050    0.8589     10697
    mpt-chat     0.9060    0.8568    0.8807     10697

    accuracy                         0.8978     42787
   macro avg     0.9120    0.8942    0.9017     42787
weighted avg     0.9009    0.8978    0.8981     42787



In [7]:
# Confusion Matrix
print("\n" + "="*60)
print("CONFUSION MATRIX")
print("="*60)

# Force one row/column per known class. Without `labels`, a class missing from
# both y_test and bert_preds would shrink the matrix, and building the labelled
# DataFrame below would fail on the shape mismatch with `le.classes_`.
cm = confusion_matrix(y_test, bert_preds, labels=np.arange(len(le.classes_)))

# Rows are the true class, columns the predicted class (scikit-learn convention).
cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
print(cm_df)


CONFUSION MATRIX
              cohere-chat  gpt4  llama-chat  mistral-chat  mpt-chat
cohere-chat          4453    49          90           480       276
gpt4                   18  4960         164           137        69
llama-chat             19    21       10154           394       109
mistral-chat          130    35         354          9681       497
mpt-chat              121    32         225          1154      9165


In [8]:
# Per-Class Accuracy (Recall)
# For each class: of the test samples whose true label is that class, the
# fraction that the model also predicted as that class.
print("\n" + "="*60, "PER-CLASS ACCURACY (RECALL)", "="*60, sep="\n")

for class_id, class_name in enumerate(le.classes_):
    # Boolean selector for the samples that truly belong to this class.
    in_class = (y_test == class_id)
    preds_for_class = bert_preds[in_class]

    n_total = in_class.sum()
    n_correct = (preds_for_class == class_id).sum()
    recall = n_correct / n_total
    print(f"  {class_name:15s}: {recall:.4f} ({n_correct}/{n_total})")


PER-CLASS ACCURACY (RECALL)
  cohere-chat    : 0.8326 (4453/5348)
  gpt4           : 0.9274 (4960/5348)
  llama-chat     : 0.9492 (10154/10697)
  mistral-chat   : 0.9050 (9681/10697)
  mpt-chat       : 0.8568 (9165/10697)


In [9]:
# Most Common Errors
print("\n" + "="*60)
print("MOST COMMON MISCLASSIFICATIONS")
print("="*60)

# Number of test samples per true class (row sums of the confusion matrix),
# computed once instead of once per off-diagonal cell.
row_totals = cm.sum(axis=1)

# One record per non-empty off-diagonal cell: how often `True` was predicted as
# `Predicted`, and what share of all `True` samples that represents.
errors = []
for i, true_class in enumerate(le.classes_):
    for j, pred_class in enumerate(le.classes_):
        if i != j and cm[i, j] > 0:
            errors.append({
                'True': true_class,
                'Predicted': pred_class,
                'Count': cm[i, j],
                'Rate': cm[i, j] / row_totals[i]
            })

# Fix the column schema explicitly so that an empty error list (no
# misclassifications at all) still yields a frame with a 'Count' column;
# otherwise sort_values('Count') raises KeyError on the column-less frame.
errors_df = (
    pd.DataFrame(errors, columns=['True', 'Predicted', 'Count', 'Rate'])
    .sort_values('Count', ascending=False)
    .head(10)
)

if errors_df.empty:
    print("No misclassifications found.")
else:
    print(errors_df.to_string(index=False))


MOST COMMON MISCLASSIFICATIONS
        True    Predicted  Count     Rate
    mpt-chat mistral-chat   1154 0.107881
mistral-chat     mpt-chat    497 0.046462
 cohere-chat mistral-chat    480 0.089753
  llama-chat mistral-chat    394 0.036833
mistral-chat   llama-chat    354 0.033093
 cohere-chat     mpt-chat    276 0.051608
    mpt-chat   llama-chat    225 0.021034
        gpt4   llama-chat    164 0.030666
        gpt4 mistral-chat    137 0.025617
mistral-chat  cohere-chat    130 0.012153


In [None]:
# Table for report
print("\n" + "="*60)
print("TABLE FOR REPORT: Per-Class Metrics for DistilBERT")
print("="*60)

# Request the report as a nested dict keyed by class name / average name.
# `labels` lists every encoded id (positions in `le.classes_`) so the call does
# not fail on a target_names size mismatch when a class is absent from both
# y_test and bert_preds.
report = classification_report(
    y_test,
    bert_preds,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    output_dict=True,
)

# (key in the report dict, name shown in the table): one row per class,
# followed by the two averaged rows.
table_rows = [(class_name, class_name) for class_name in le.classes_]
table_rows += [
    ('macro avg', 'Macro Avg'),
    ('weighted avg', 'Weighted Avg'),
]

# Every row is formatted the same way: metrics rounded to two decimals as
# strings, support as an integer.
table_data = [
    {
        'Model': display_name,
        'Precision': f"{report[report_key]['precision']:.2f}",
        'Recall': f"{report[report_key]['recall']:.2f}",
        'F1-Score': f"{report[report_key]['f1-score']:.2f}",
        'Support': int(report[report_key]['support'])
    }
    for report_key, display_name in table_rows
]

table_df = pd.DataFrame(table_data)
print(table_df.to_string(index=False))


TABLE FOR REPORT: Per-Class Metrics for DistilBERT
       Model Precision Recall F1-Score  Support
 cohere-chat      0.94   0.83     0.88     5348
        gpt4      0.97   0.93     0.95     5348
  llama-chat      0.92   0.95     0.94    10697
mistral-chat      0.82   0.91     0.86    10697
    mpt-chat      0.91   0.86     0.88    10697
   Macro Avg      0.91   0.89     0.90    42787
Weighted Avg      0.90   0.90     0.90    42787
