# Model Development for Student Performance Analysis
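This notebook trains and compares several machine learning models that predict student performance: regression models for the numeric `Exam_Score` and classification models for the letter-grade `GradeCategory`.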


In [None]:
# 1. IMPORTS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
import joblib
from google.colab import files


In [None]:
# 2. LOAD DATA
# Ask for a file upload, then read the first uploaded entry as a CSV.
# The keys of the upload result are used as the paths handed to pandas.
uploaded_files = files.upload()
first_uploaded_name = list(uploaded_files)[0]
df = pd.read_csv(first_uploaded_name)
print("Data preview:\n", df.head(), sep="\n")

In [None]:
# 3. CREATE GRADE CATEGORY IF MISSING
# Only derive the label column when the uploaded data does not already carry it.
if 'GradeCategory' not in df.columns:
    def score_to_grade(score):
        """Map a numeric exam score to its letter grade.

        Cutoffs are checked from highest to lowest; any score that reaches
        none of them (below 60, or not comparable such as NaN) maps to 'D'.
        """
        grade_cutoffs = ((90, 'A+'), (80, 'A'), (70, 'B'), (60, 'C'))
        for minimum, letter in grade_cutoffs:
            if score >= minimum:
                return letter
        return 'D'

    df['GradeCategory'] = df['Exam_Score'].apply(score_to_grade)
    print("GradeCategory column created from Exam_Score.")

In [None]:
# 4. FEATURE / TARGET SPLIT
# Features are every column except the two targets, so neither target can
# leak into the model inputs.
X = df.drop(['Exam_Score', 'GradeCategory'], axis=1)

# Regression predicts the numeric score; classification predicts the grade label.
y_reg, y_clf = df['Exam_Score'], df['GradeCategory']

In [162]:
# 5. ENCODE CATEGORICAL FEATURES
# One-hot encode every object-typed feature column; the first level of each
# column is dropped so the dummy columns are not perfectly collinear.
categorical_cols = X.select_dtypes(include=['object']).columns
X_encoded = pd.get_dummies(
    data=X,
    columns=categorical_cols,
    drop_first=True,
)

# Encode classification target
# `le` is kept so the integer codes can be mapped back to grade labels.
le = LabelEncoder()
y_clf_enc = le.fit_transform(y_clf)

In [None]:
# 6. TRAIN-TEST SPLIT
# Both tasks hold out 20% of the rows and use the same seed.
split_options = {'test_size': 0.2, 'random_state': 42}

# Regression: plain random split against the numeric target.
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_encoded, y_reg, **split_options
)

# Classification: stratified on the encoded grade so both partitions keep
# the same class proportions.
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_encoded, y_clf_enc, stratify=y_clf_enc, **split_options
)

In [None]:
# 7. FEATURE SCALING
# The regression and classification tasks use different train/test splits, so
# each task gets its own scaler, fitted on that task's training rows only
# (test rows are transformed, never fitted on).
# Two separate objects are used because calling fit/fit_transform again on one
# scaler replaces its learned statistics, which would leave no scaler matching
# the regression features once this cell has run.
scaler_reg = StandardScaler()
X_train_scaled = scaler_reg.fit_transform(X_train_reg)
X_test_scaled = scaler_reg.transform(X_test_reg)

scaler_clf = StandardScaler()
X_train_scaled_clf = scaler_clf.fit_transform(X_train_clf)
X_test_scaled_clf = scaler_clf.transform(X_test_clf)

# Backward-compatible alias: `scaler` used to end this cell fitted on the
# classification training split, so it keeps pointing at that scaler.
scaler = scaler_clf

In [None]:
# 8. DEFINE MODELS
# Candidate estimators, keyed by the display name used in the results tables.
#
# `use_label_encoder` is deliberately not passed to the XGBoost estimators:
# it was a classifier-only option that XGBoost has deprecated, it never had a
# meaning for a regressor, and the classification target is already
# integer-encoded before training. Passing it only triggers warnings on
# current XGBoost releases.

# Regression models
reg_models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'XGBoost': XGBRegressor(eval_metric='rmse'),
    'CatBoost': CatBoostRegressor(verbose=0),
}

# Classification models
clf_models = {
    'XGBoost': XGBClassifier(eval_metric='mlogloss'),
    'CatBoost': CatBoostClassifier(verbose=0),
}

In [None]:
# 9. TRAIN MODELS & STORE RESULTS
# Regression: fit each candidate on the scaled training split, predict the
# held-out split, and record MSE and R2 as one row per model.
reg_results = []
for name in reg_models:
    model = reg_models[name]
    model.fit(X_train_scaled, y_train_reg)
    y_pred = model.predict(X_test_scaled)
    mse, r2 = mean_squared_error(y_test_reg, y_pred), r2_score(y_test_reg, y_pred)
    reg_results.append(dict(Model=name, MSE=mse, R2=r2))
reg_df = pd.DataFrame(reg_results)

# Classification: same procedure on the stratified split, scored with
# accuracy and the support-weighted F1 across classes.
clf_results = []
for name in clf_models:
    model = clf_models[name]
    model.fit(X_train_scaled_clf, y_train_clf)
    y_pred = model.predict(X_test_scaled_clf)
    acc, f1 = (
        accuracy_score(y_test_clf, y_pred),
        f1_score(y_test_clf, y_pred, average='weighted'),
    )
    clf_results.append(dict(Model=name, Accuracy=acc, F1=f1))
clf_df = pd.DataFrame(clf_results)

In [None]:
# 10. SELECT BEST MODELS & SAVE
# NOTE(review): only the fitted estimators are pickled in this cell. The
# scaler, label encoder and dummy-column layout used to prepare their inputs
# are not saved here; confirm they are persisted elsewhere before relying on
# these .pkl files for inference.

# Regression: lowest test MSE wins.
reg_ranking = reg_df.sort_values(by='MSE', ascending=True)
best_reg_model_name = reg_ranking['Model'].iloc[0]
best_reg_model = reg_models[best_reg_model_name]
reg_model_path = 'best_reg_model.pkl'
joblib.dump(best_reg_model, reg_model_path)
files.download(reg_model_path)

# Classification: highest test accuracy wins.
clf_ranking = clf_df.sort_values(by='Accuracy', ascending=False)
best_clf_model_name = clf_ranking['Model'].iloc[0]
best_clf_model = clf_models[best_clf_model_name]
clf_model_path = 'best_clf_model.pkl'
joblib.dump(best_clf_model, clf_model_path)
files.download(clf_model_path)

In [None]:
# 11. SHOW RESULTS
# Print each task's metrics table followed by the name of the model saved for it.
print(
    "=== REGRESSION RESULTS ===",
    reg_df,
    f"Best regression model saved: {best_reg_model_name}",
    sep="\n",
)

print(
    "=== CLASSIFICATION RESULTS ===",
    clf_df,
    f"Best classification model saved: {best_clf_model_name}",
    sep="\n",
)