In [14]:
from scipy.sparse import load_npz
import pandas as pd
import numpy as np
import joblib
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier,
    VotingClassifier,
)
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report,
)

X_final = load_npz("preprocessed/X_final.npz")
y = pd.read_csv("preprocessed/y.csv")['problem_class']


In [15]:
# Test train split
X_train, X_test, y_train, y_test = train_test_split(X_final, y, test_size=0.2, random_state=42)

In [16]:
# Trying out different moedels

print("Training Logistic...")
log_clf = LogisticRegression(
    penalty="l2",
    C=1.0,
    solver="saga",
    max_iter=3000,          # Underfit at 300 so increased 
    tol=1e-3,    
    class_weight="balanced",        # Because 'Easy' class is imbalanced
    multi_class="multinomial",
    n_jobs=-1,
    random_state=42
)

log_clf.fit(X_train, y_train)
print("Logistic trained")

print("Training RF...")
rf_clf = RandomForestClassifier(             # Accuracy : 0.5151883353584447
    n_estimators=500,
    n_jobs=-1,
    random_state=42
)

rf_clf.fit(X_train, y_train)
print("RF Trained...")



print("Training SVC...")
svm_base = LinearSVC(
    C=1.0,
    max_iter=5000,          # increase
    tol=1e-3,               # relaxed tolerance
    class_weight="balanced",
    random_state=42
)

svm_clf = CalibratedClassifierCV(
    svm_base,
    method="sigmoid",
    cv=3
)

svm_clf.fit(X_train, y_train)
print("SVM Trained...")

print("Training an ensemble...")
difficulty_classifier = VotingClassifier(
    estimators=[
        ("lr", log_clf),
        ("rf", rf_clf),
        ("svc", svm_clf)
    ],
    voting="soft",
    weights=[1, 6, 1],      # Logistic:Stable baseline.	Random Forest:Non-linear correction(More important).	SVM:Margin sharpener
    n_jobs=-1
)

difficulty_classifier.fit(X_train, y_train)
print("Ensemble Trained...")

# Evaluation
count=1
for model in [log_clf, rf_clf, svm_clf, difficulty_classifier]:
    print(f"For model {count}")
    count +=1
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)

    print(f"\nModel Accuracy: {acc:.4f}")
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n")

    ConfusionMatrixDisplay.from_predictions(y_test, y_pred)
    if count==2:
        title = 'Logistic Regression'
    if count==3:
        title = 'Random Forest'
    if count==4:
        title = 'SVM classifier'
    if count==5:
        title = 'Ensemble of all'
    plt.title(title)
    plt.tight_layout()
    plt.savefig(f"figures/confusion_matrix_classifier{count-1}.png", dpi=300)
    plt.close()

Training Logistic...




Logistic trained
Training RF...
RF Trained...
Training SVC...
SVM Trained...
Training an ensemble...
Ensemble Trained...
For model 1

Model Accuracy: 0.4581
Confusion Matrix:
 [[ 76  23  37]
 [ 76 203 146]
 [ 56 108  98]]


For model 2

Model Accuracy: 0.5808
Confusion Matrix:
 [[ 35  65  36]
 [ 16 369  40]
 [ 14 174  74]]


For model 3

Model Accuracy: 0.5334
Confusion Matrix:
 [[ 25 104   7]
 [  6 406  13]
 [ 12 242   8]]


For model 4

Model Accuracy: 0.5674
Confusion Matrix:
 [[ 42  58  36]
 [ 19 360  46]
 [ 19 178  65]]




In [17]:
# Final Model Training

print("Training Model finalised...")

# Random Forest is robust and handles mixed data (text + numbers) well
rf_model = RandomForestClassifier(n_estimators=500, random_state=42)
rf_model.fit(X_train, y_train)
print("Training completed")

#  Final Evaluation score
y_pred = rf_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f"\nModel Accuracy: {acc:.4f}")


print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Training Model finalised...
Training completed

Model Accuracy: 0.5808
Confusion Matrix:
 [[ 35  65  36]
 [ 16 369  40]
 [ 14 174  74]]

Classification Report:
              precision    recall  f1-score   support

        easy       0.54      0.26      0.35       136
        hard       0.61      0.87      0.71       425
      medium       0.49      0.28      0.36       262

    accuracy                           0.58       823
   macro avg       0.55      0.47      0.47       823
weighted avg       0.56      0.58      0.54       823



In [18]:
joblib.dump(rf_model, "pickle/final_classifier_58.pkl")

['pickle/final_classifier_58.pkl']