# Model Development for Student Performance Analysis
This notebook trains and compares different machine learning models to predict students' performance.


In [46]:
# ----------------------------
# 1. IMPORT LIBRARIES
# ----------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
import joblib
from google.colab import files

# ----------------------------
# 2. LOAD CLEANED DATA
# ----------------------------
uploaded = files.upload()
df = pd.read_csv(list(uploaded.keys())[0])

# ----------------------------
# 3. FEATURE / TARGET SPLIT
# ----------------------------
X = df.drop(columns=['Exam_Score', 'GradeCategory'])
y_reg = df['Exam_Score']
y_clf = df['GradeCategory']

# ----------------------------
# 4. ENCODE FEATURES
# ----------------------------
categorical_cols = X.select_dtypes(include='object').columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
all_features = X_encoded.columns.tolist()

# Encode classification target
le = LabelEncoder()
y_clf_enc = le.fit_transform(y_clf)

# ----------------------------
# 5. TRAIN-TEST SPLIT
# ----------------------------
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_encoded, y_reg, test_size=0.2, random_state=42
)
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_encoded, y_clf_enc, test_size=0.2, random_state=42, stratify=y_clf_enc
)

# ----------------------------
# 6. SCALE FEATURES
# ----------------------------
scaler_reg = StandardScaler()
X_train_scaled_reg = scaler_reg.fit_transform(X_train_reg)
X_test_scaled_reg = scaler_reg.transform(X_test_reg)

scaler_clf = StandardScaler()
X_train_scaled_clf = scaler_clf.fit_transform(X_train_clf)
X_test_scaled_clf = scaler_clf.transform(X_test_clf)

# ----------------------------
# 7. DEFINE MODELS
# ----------------------------
reg_models = {
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'XGBoost': XGBRegressor(eval_metric='rmse', use_label_encoder=False),
    'CatBoost': CatBoostRegressor(verbose=0)
}
clf_models = {
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='mlogloss'),
    'CatBoost': CatBoostClassifier(verbose=0)
}

# ----------------------------
# 8. TRAIN MODELS AND SELECT BEST
# ----------------------------
# Regression
reg_results = []
for name, model in reg_models.items():
    model.fit(X_train_scaled_reg, y_train_reg)
    y_pred = model.predict(X_test_scaled_reg)
    reg_results.append({
        'Model': name,
        'MSE': mean_squared_error(y_test_reg, y_pred),
        'R2': r2_score(y_test_reg, y_pred)
    })
best_reg_model_name = min(reg_results, key=lambda x: x['MSE'])['Model']
best_reg_model = reg_models[best_reg_model_name]

# Classification
clf_results = []
for name, model in clf_models.items():
    model.fit(X_train_scaled_clf, y_train_clf)
    y_pred = model.predict(X_test_scaled_clf)
    clf_results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test_clf, y_pred),
        'F1': f1_score(y_test_clf, y_pred, average='weighted')
    })
best_clf_model_name = max(clf_results, key=lambda x: x['Accuracy'])['Model']
best_clf_model = clf_models[best_clf_model_name]

# ----------------------------
# 9. SAVE MODELS + SCALERS + FEATURES
# ----------------------------
import os
os.makedirs('models', exist_ok=True)

joblib.dump(best_reg_model, 'models/best_reg_model.pkl')
joblib.dump(best_clf_model, 'models/best_clf_model.pkl')
joblib.dump(scaler_reg, 'models/scaler_reg.pkl')
joblib.dump(scaler_clf, 'models/scaler_clf.pkl')
joblib.dump(le, 'models/label_encoder.pkl')
joblib.dump(all_features, 'models/all_training_columns.pkl')

# Optional: download
files.download('models/best_reg_model.pkl')
files.download('models/best_clf_model.pkl')
files.download('models/scaler_reg.pkl')
files.download('models/scaler_clf.pkl')
files.download('models/label_encoder.pkl')
files.download('models/all_training_columns.pkl')


Saving StudentPerformanceFactors_Cleaned.csv to StudentPerformanceFactors_Cleaned (3).csv


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>