# Model Development for Student Performance Analysis
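This notebook trains and compares several machine learning models that predict student performance: regression models for the numeric exam score and classification models for the derived grade category.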


In [20]:
# Install CatBoost if not already installed
!pip install catboost --quiet

# Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor, XGBClassifier
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, f1_score
import joblib
from google.colab import files


In [21]:
# Upload the cleaned CSV from Notebook 1
uploaded_files = files.upload()

# Fail with an explicit message instead of a bare IndexError when the upload
# result is empty (nothing was selected).
if not uploaded_files:
    raise ValueError("No file was uploaded; please upload the cleaned CSV from Notebook 1.")

# Only the first uploaded file is used; it is read back from disk by the name
# that the upload result reports for it.
csv_name = list(uploaded_files.keys())[0]
df = pd.read_csv(csv_name)

print("Data preview:")
display(df.head())
df.info()


Saving StudentPerformanceFactors_Cleaned.csv to StudentPerformanceFactors_Cleaned (1).csv
Data preview:


Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6607 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

In [22]:
# Derive the letter-grade label from Exam_Score unless the CSV already has one
if 'GradeCategory' not in df.columns:
    def score_to_grade(score):
        """Return the letter grade ('A+', 'A', 'B', 'C' or 'D') for a numeric score."""
        # Lower bounds are checked from highest to lowest; the first match wins.
        for lower_bound, grade in ((90, 'A+'), (80, 'A'), (70, 'B'), (60, 'C')):
            if score >= lower_bound:
                return grade
        # Anything that did not reach the lowest bound falls through to 'D'.
        return 'D'

    df['GradeCategory'] = df['Exam_Score'].apply(score_to_grade)
    print("GradeCategory column created.")


GradeCategory column created.


In [23]:
# Feature matrix: every column except the two prediction targets
X = df.drop(['Exam_Score', 'GradeCategory'], axis=1)

# Target for the regression task (numeric exam score)
y_reg = df['Exam_Score']

# Target for the classification task (letter-grade label)
y_clf = df['GradeCategory']


In [24]:
# Names of the object-dtype (string) feature columns
categorical_cols = list(X.select_dtypes(include='object').columns)

# Expand those columns into indicator columns, dropping the first level of each
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

# Map the grade labels to integer class ids; `le` keeps the label <-> id mapping
le = LabelEncoder()
le.fit(y_clf)
y_clf_enc = le.transform(y_clf)

print("Features encoded. Encoded shape:", X_encoded.shape)


Features encoded. Encoded shape: (6607, 27)


In [25]:
# Regression: hold out 20% of the rows for evaluation (fixed seed)
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_encoded,
    y_reg,
    test_size=0.2,
    random_state=42,
)

# Classification: separate 20% hold-out, stratified on the encoded grade so
# both partitions keep the same class proportions
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_encoded,
    y_clf_enc,
    test_size=0.2,
    random_state=42,
    stratify=y_clf_enc,
)


In [26]:
# Regression: fit the scaler on the training split only, then apply it to both
# splits. All encoded feature columns are standardized (indicator columns too).
scaler_reg = StandardScaler().fit(X_train_reg)
X_train_scaled = scaler_reg.transform(X_train_reg)
X_test_scaled = scaler_reg.transform(X_test_reg)

# Classification: same procedure with its own scaler
# (optional step -- CatBoost does not require scaled inputs)
scaler_clf = StandardScaler().fit(X_train_clf)
X_train_scaled_clf = scaler_clf.transform(X_train_clf)
X_test_scaled_clf = scaler_clf.transform(X_test_clf)


In [27]:
# Candidate models, keyed by the display name used in the result tables.
#
# NOTE: `use_label_encoder` is intentionally not passed to the XGBoost
# estimators. Current XGBoost releases ignore it and emit
# 'Parameters: { "use_label_encoder" } are not used.' on every fit.

# Regression models
reg_models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'XGBoost': XGBRegressor(eval_metric='rmse'),
    'CatBoost': CatBoostRegressor(verbose=0)
}

# Classification models
clf_models = {
    'XGBoost': XGBClassifier(eval_metric='mlogloss'),
    'CatBoost': CatBoostClassifier(verbose=0)
}


In [28]:
# Fit every candidate regressor on the scaled training split and score it on
# the scaled hold-out split; one result record per model.
reg_results = []
for name, model in reg_models.items():
    model.fit(X_train_scaled, y_train_reg)
    y_pred = model.predict(X_test_scaled)
    reg_results.append({
        'Model': name,
        'MSE': mean_squared_error(y_test_reg, y_pred),
        'R2': r2_score(y_test_reg, y_pred),
    })

reg_df = pd.DataFrame(reg_results)
print("=== REGRESSION RESULTS ===")
display(reg_df)

# The winner is the model with the smallest hold-out MSE
best_reg_model_name = reg_df.sort_values('MSE')['Model'].iloc[0]
best_reg_model = reg_models[best_reg_model_name]
print(f"Best regression model: {best_reg_model_name}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== REGRESSION RESULTS ===


Unnamed: 0,Model,MSE,R2
0,Linear Regression,3.25602,0.76965
1,SVR,3.38034,0.760854
2,XGBoost,4.614062,0.673573
3,CatBoost,3.734057,0.73583


Best regression model: Linear Regression


In [29]:
# Fit every candidate classifier on the scaled training split and score it on
# the scaled hold-out split; one result record per model.
clf_results = []
for name, model in clf_models.items():
    model.fit(X_train_scaled_clf, y_train_clf)
    y_pred = model.predict(X_test_scaled_clf)
    clf_results.append({
        'Model': name,
        'Accuracy': accuracy_score(y_test_clf, y_pred),
        'F1': f1_score(y_test_clf, y_pred, average='weighted'),
    })

clf_df = pd.DataFrame(clf_results)
print("=== CLASSIFICATION RESULTS ===")
display(clf_df)

# The winner is the model with the largest hold-out accuracy
best_clf_model_name = clf_df.sort_values('Accuracy', ascending=False)['Model'].iloc[0]
best_clf_model = clf_models[best_clf_model_name]
print(f"Best classification model: {best_clf_model_name}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


=== CLASSIFICATION RESULTS ===


Unnamed: 0,Model,Accuracy,F1
0,XGBoost,0.934947,0.92975
1,CatBoost,0.953101,0.947757


Best classification model: CatBoost


In [30]:
# Save best models
joblib.dump(best_reg_model, 'best_reg_model.pkl')
joblib.dump(best_clf_model, 'best_clf_model.pkl')
files.download('best_reg_model.pkl')
files.download('best_clf_model.pkl')

# Save feature columns for Streamlit input alignment
all_features = X_encoded.columns.tolist()
joblib.dump(all_features, 'all_features.pkl')
files.download('all_features.pkl')

# Save the fitted preprocessing objects as well. Both models were trained on
# StandardScaler-transformed features, and the classifier on label-encoded
# grades, so a consumer needs the same fitted scalers to prepare inputs and the
# LabelEncoder to turn predicted class ids back into grade labels.
preprocessing_artifacts = {
    'scaler_reg.pkl': scaler_reg,
    'scaler_clf.pkl': scaler_clf,
    'label_encoder.pkl': le,
}
for artifact_path, artifact in preprocessing_artifacts.items():
    joblib.dump(artifact, artifact_path)
    files.download(artifact_path)

print("Models and feature list saved successfully!")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Models and feature list saved successfully!
