# Model Development for Student Performance Analysis
This notebook trains and compares different machine learning models to predict students' performance.


In [24]:
# IMPORTS

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from xgboost import XGBRegressor
!pip install catboost --quiet
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from google.colab import files

In [25]:
# LOAD DATA

uploaded_files = files.upload()  # Upload cleaned CSV
df = pd.read_csv(list(uploaded_files.keys())[0])
print("Data preview:\n")
print(df.head())

Saving StudentPerformanceFactors_Cleaned.csv to StudentPerformanceFactors_Cleaned (2).csv
Data preview:

   Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0             23          84                  Low                High   
1             19          64                  Low              Medium   
2             24          98               Medium              Medium   
3             29          89                  Low              Medium   
4             19          92               Medium              Medium   

  Extracurricular_Activities  Sleep_Hours  Previous_Scores Motivation_Level  \
0                         No            7               73              Low   
1                         No            8               59              Low   
2                        Yes            7               91           Medium   
3                        Yes            8               98           Medium   
4                        Yes            6               65   

In [28]:
# FEATURE / TARGET SPLIT

X = df.drop(columns=['Exam_Score'])
y_reg = df['Exam_Score']

In [29]:
# ENCODE CATEGORICAL FEATURES

categorical_cols = X.select_dtypes(include='object').columns
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)


In [30]:
# TRAIN-TEST SPLIT

X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_encoded, y_reg, test_size=0.2, random_state=42
)

In [31]:
# FEATURE SCALING

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_reg)
X_test_scaled = scaler.transform(X_test_reg)

In [32]:
# DEFINE REGRESSION MODELS

reg_models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'XGBoost': XGBRegressor(eval_metric='rmse', use_label_encoder=False),
    'CatBoost': CatBoostRegressor(verbose=0)
}

In [33]:
# TRAIN MODELS & STORE RESULTS

reg_results = []
for name, model in reg_models.items():
    model.fit(X_train_scaled, y_train_reg)
    y_pred = model.predict(X_test_scaled)
    mse = mean_squared_error(y_test_reg, y_pred)
    r2 = r2_score(y_test_reg, y_pred)
    reg_results.append({'Model': name, 'MSE': mse, 'R2': r2})
reg_df = pd.DataFrame(reg_results)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [34]:
# SELECT BEST MODEL & SAVE

best_reg_model_name = reg_df.sort_values('MSE').iloc[0]['Model']
best_reg_model = reg_models[best_reg_model_name]
joblib.dump(best_reg_model, 'best_reg_model.pkl')
files.download('best_reg_model.pkl')

# Save scaler
joblib.dump(scaler, 'scaler_reg.pkl')
files.download('scaler_reg.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
# SAVE TRAINING FEATURE COLUMNS

all_training_columns = X_encoded.columns.tolist()
joblib.dump(all_training_columns, 'all_training_columns.pkl')
files.download('all_training_columns.pkl')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [36]:
# SHOW RESULTS

print("=== REGRESSION RESULTS ===")
print(reg_df)
print(f"Best regression model saved: {best_reg_model_name}")

=== REGRESSION RESULTS ===
               Model       MSE        R2
0  Linear Regression  3.256020  0.769650
1                SVR  3.380340  0.760854
2            XGBoost  4.614062  0.673573
3           CatBoost  3.734057  0.735830
Best regression model saved: Linear Regression


In [41]:
# FEATURE IMPORTANCE

print("=== FEATURE IMPORTANCE ===")
if best_reg_model_name in ['XGBoost', 'CatBoost']:
    importances = best_reg_model.feature_importances_
elif best_reg_model_name == 'Linear Regression':
    importances = best_reg_model.coef_
else:
    importances = None

if importances is not None:
    feature_importance_df = pd.DataFrame({
        'Feature': X_encoded.columns,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)
    print(feature_importance_df)
else:
    print(f"Feature importance not available for {best_reg_model_name}.")


=== FEATURE IMPORTANCE ===
                                  Feature  Importance
1                              Attendance    2.290405
0                           Hours_Studied    1.756881
3                         Previous_Scores    0.705908
4                       Tutoring_Sessions    0.626224
20                Peer_Influence_Positive    0.516698
25                Distance_from_Home_Near    0.419498
10         Extracurricular_Activities_Yes    0.285924
19                 Peer_Influence_Neutral    0.275653
13                    Internet_Access_Yes    0.257021
23  Parental_Education_Level_Postgraduate    0.204870
5                       Physical_Activity    0.199011
24            Distance_from_Home_Moderate    0.146212
18                     School_Type_Public    0.007896
26                            Gender_Male   -0.012713
2                             Sleep_Hours   -0.018250
22   Parental_Education_Level_High School   -0.229445
21              Learning_Disabilities_Yes   -0.268142
1