In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    classification_report
)

import joblib


In [2]:
# Read the cleaned student dataset and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="../data/cleaned_student_data.csv")
df.head(n=5)


Unnamed: 0,Study Hours per Week,Attendance Rate,Previous Grades,Passed,Participation in Extracurricular Activities_Yes,Parent Education Level_Bachelor,Parent Education Level_Doctorate,Parent Education Level_High School,Parent Education Level_Master
0,12.5,75.276323,75.0,1,True,False,False,False,True
1,9.3,95.3,60.6,0,False,False,False,True,False
2,13.2,75.276323,64.0,0,False,False,False,False,False
3,17.6,76.8,62.4,0,True,True,False,False,False
4,8.8,89.3,72.7,0,False,False,False,False,True


In [3]:
# Target: the binary pass/fail label.
y = df["Passed"]

# Predictors: every column except the target.
# "Unnamed: 0" is the name pandas assigns to a header-less first column,
# which is usually a row index written out by an earlier export. A row
# number says nothing about a student, so it must not be used as a feature
# (it would also end up in the saved feature list).
# NOTE(review): the preview above suggests this column is present — confirm.
# errors="ignore" keeps this cell working if the CSV does not contain it.
X = df.drop(columns=["Passed"]).drop(columns=["Unnamed: 0"], errors="ignore")


The dataset is split into input features (X) and the target variable (y).


In [4]:
# Hold out 20% of the rows for evaluation. The split is stratified on y and
# uses a fixed seed so it is repeatable across runs.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)


In [5]:
# Fit the scaler on the training rows only, then apply that same fitted
# transform to both partitions so no test-set statistics reach training.
scaler = StandardScaler().fit(X_train)

X_train_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_train, X_test)
)


In [6]:
# Baseline linear classifier, trained on the standardized features with the
# solver's iteration cap set to 1000.
lr_settings = {"max_iter": 1000}
log_reg = LogisticRegression(**lr_settings)
log_reg.fit(X=X_train_scaled, y=y_train)


In [7]:
# Predicted class labels for the held-out rows, using the scaled test features.
y_pred_lr = log_reg.predict(X=X_test_scaled)


In [8]:
# Overall accuracy first, then the confusion matrix and per-class metrics.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)

for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred_lr))


Logistic Regression Accuracy: 0.525875
[[   8 3790]
 [   3 4199]]
              precision    recall  f1-score   support

           0       0.73      0.00      0.00      3798
           1       0.53      1.00      0.69      4202

    accuracy                           0.53      8000
   macro avg       0.63      0.50      0.35      8000
weighted avg       0.62      0.53      0.36      8000



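Logistic Regression serves as a simple, interpretable baseline.
Its performance here is poor, however: accuracy is 0.526, essentially the majority-class rate (4202/8000 ≈ 0.525), because the model predicts class 1 for almost every sample (recall for class 0 is 0.00).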


In [9]:
# Ensemble of 100 trees with a fixed seed so training is repeatable.
# Note that it is fitted on the unscaled training features.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X=X_train, y=y_train)


In [10]:
# Predicted class labels for the held-out rows, using the unscaled test
# features to match how the forest was fitted.
y_pred_rf = rf.predict(X=X_test)


In [11]:
# Overall accuracy first, then the confusion matrix and per-class metrics.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)

print(
    confusion_matrix(y_test, y_pred_rf),
    classification_report(y_test, y_pred_rf),
    sep="\n",
)


Random Forest Accuracy: 0.49925
[[1704 2094]
 [1912 2290]]
              precision    recall  f1-score   support

           0       0.47      0.45      0.46      3798
           1       0.52      0.54      0.53      4202

    accuracy                           0.50      8000
   macro avg       0.50      0.50      0.50      8000
weighted avg       0.50      0.50      0.50      8000



Model Comparison:

- Logistic Regression is simple and explainable.
- Random Forest handles feature interactions better.

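Neither model beats the majority-class rate (4202/8000 ≈ 0.525).
Logistic Regression has the higher accuracy (0.526 vs 0.499) but predicts class 1 for almost every sample;
Random Forest gives more balanced predictions (macro F1 0.50 vs 0.35).
The Random Forest model is chosen as the final model for deployment.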


In [12]:
# Persist the trained forest and the fitted scaler under ../backend/.
# NOTE(review): `rf` was fitted on the unscaled X_train, so inputs must not
# be passed through this scaler before rf.predict — confirm how scaler.pkl
# is used on the serving side.
joblib.dump(value=rf, filename="../backend/model.pkl")
joblib.dump(value=scaler, filename="../backend/scaler.pkl")


['../backend/scaler.pkl']

### ML Phase Summary

- Data split into training and testing sets
- Two classification models were trained
- Models evaluated using accuracy, confusion matrix, and classification report
- Best model saved for backend deployment


In [14]:
# Save the feature column names, in the order they appear in X, next to the
# other backend artifacts.
feature_names = list(X.columns)
joblib.dump(value=feature_names, filename="../backend/feature_names.pkl")


['../backend/feature_names.pkl']