In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [63]:
# Read the student-performance CSV from the working directory and preview the first rows.
csv_path = "StudentPerformance.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70


In [64]:
# Separate the regression target from the predictor columns.
target_column = "Exam_Score"
y = df[target_column]
x = df.drop(columns=[target_column])

In [65]:
# Column labels of the object-dtype (string-valued) features.
object_features = x.select_dtypes(include=['object'])
categorical_columns = object_features.columns

In [94]:
# Show the categorical column names as a plain Python list.
categorical_names = list(categorical_columns)
print("Categorical columns:", categorical_names)

Categorical columns: ['Parental_Involvement', 'Access_to_Resources', 'Motivation_Level', 'Family_Income', 'Teacher_Quality', 'Peer_Influence', 'Parental_Education_Level', 'Distance_from_Home']


In [66]:
# For each categorical column, report how many distinct values it has and list them.
for col in categorical_columns:
    column_values = x[col]
    distinct_count = column_values.nunique()
    distinct_values = column_values.unique()
    print(f"Column: {col}", "have", distinct_count, "values:", distinct_values)

Column: Parental_Involvement have 3 values: ['Low' 'Medium' 'High']
Column: Access_to_Resources have 3 values: ['High' 'Medium' 'Low']
Column: Extracurricular_Activities have 2 values: ['No' 'Yes']
Column: Motivation_Level have 3 values: ['Low' 'Medium' 'High']
Column: Internet_Access have 2 values: ['Yes' 'No']
Column: Family_Income have 3 values: ['Low' 'Medium' 'High']
Column: Teacher_Quality have 3 values: ['Medium' 'High' 'Low' nan]
Column: School_Type have 2 values: ['Public' 'Private']
Column: Peer_Influence have 3 values: ['Positive' 'Negative' 'Neutral']
Column: Learning_Disabilities have 2 values: ['No' 'Yes']
Column: Parental_Education_Level have 3 values: ['High School' 'College' 'Postgraduate' nan]
Column: Distance_from_Home have 3 values: ['Near' 'Moderate' 'Far' nan]
Column: Gender have 2 values: ['Male' 'Female']


In [67]:
# Column labels of every feature that is not object-dtype at this point in the notebook.
non_object_features = x.select_dtypes(exclude=['object'])
num_columns = non_object_features.columns

In [92]:
# Show the numeric column names as a plain Python list.
numeric_names = list(num_columns)
print("Numerical columns:", numeric_names)

Numerical columns: ['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 'Tutoring_Sessions', 'Physical_Activity']


In [68]:
# Count the missing values in each feature column before imputation.
x.isnull().sum()

Hours_Studied                  0
Attendance                     0
Parental_Involvement           0
Access_to_Resources            0
Extracurricular_Activities     0
Sleep_Hours                    0
Previous_Scores                0
Motivation_Level               0
Internet_Access                0
Tutoring_Sessions              0
Family_Income                  0
Teacher_Quality               78
School_Type                    0
Peer_Influence                 0
Physical_Activity              0
Learning_Disabilities          0
Parental_Education_Level      90
Distance_from_Home            67
Gender                         0
dtype: int64

In [69]:
# Fill the gaps in the three incomplete categorical columns with each column's most frequent value.
# The mode is taken over every row of x; the train/test split happens in a later cell.
for col in ('Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home'):
    most_frequent = x[col].mode()[0]
    x[col] = x[col].fillna(most_frequent)

In [70]:
# Re-count the missing values per column to confirm the imputation left none.
x.isnull().sum()

Hours_Studied                 0
Attendance                    0
Parental_Involvement          0
Access_to_Resources           0
Extracurricular_Activities    0
Sleep_Hours                   0
Previous_Scores               0
Motivation_Level              0
Internet_Access               0
Tutoring_Sessions             0
Family_Income                 0
Teacher_Quality               0
School_Type                   0
Peer_Influence                0
Physical_Activity             0
Learning_Disabilities         0
Parental_Education_Level      0
Distance_from_Home            0
Gender                        0
dtype: int64

In [71]:
# Encode the binary Yes/No features as 1/0.
yes_no_columns = ['Extracurricular_Activities', 'Internet_Access','Learning_Disabilities']

# The raw values are capitalised ('Yes' / 'No'). Series.map turns any value that is
# not a key of the dict into NaN, so the keys must match the data's case exactly.
yes_no_mapping = {'Yes': 1, 'No': 0}

for col in yes_no_columns:
    x[col] = x[col].map(yes_no_mapping)

In [72]:
# Encode Gender as a 0/1 indicator (Male -> 1, Female -> 0).
gender_codes = {'Male': 1, 'Female': 0}
x["Gender"] = x["Gender"].map(gender_codes)

In [73]:
# Encode School_Type as a 0/1 indicator (Public -> 1, Private -> 0).
school_type_codes = {'Public': 1, 'Private': 0}
x["School_Type"] = x["School_Type"].map(school_type_codes)

In [74]:
# Refresh the categorical column list: only columns that are still object-dtype
# after the manual 0/1 encodings above remain in it.
remaining_object_features = x.select_dtypes(include=['object'])
categorical_columns = remaining_object_features.columns

In [75]:
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer

# Preprocessing: one-hot encode the remaining string-valued columns and
# robust-scale the numeric columns.
numeric_transformer = RobustScaler()
oh_transformer = OneHotEncoder()

# NOTE(review): no `remainder` is passed, so columns that appear in neither list are
# not part of the output. That includes Gender, School_Type and the three Yes/No
# columns: they were still strings when `num_columns` was computed and are no longer
# object-dtype now. The resulting 30 output columns (8 categoricals x 3 levels + 6
# numerics) match this. Add them explicitly (or use remainder="passthrough") once
# their encoding is confirmed NaN-free, if they are meant to be model features.
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, categorical_columns),
        # Step name now matches the transformer actually used (was "StandardScaler").
        ("RobustScaler", numeric_transformer, num_columns),
    ]
)

In [76]:
# Fit the preprocessor on the feature table and replace `x` with the transformed matrix.
# NOTE(review): `x` is rebound from the DataFrame to the transformer output, so the
# original frame is no longer reachable under this name and this cell is presumably
# not safe to run twice (the column names used by the preprocessor are gone) — confirm.
# NOTE(review): the fit uses every row, i.e. it happens before the train/test split
# in the following cells, so held-out rows contribute to the fitted scaling statistics.
x = preprocessor.fit_transform(x)

In [77]:
# Display the (rows, columns) shape of the transformed feature matrix.
x.shape

(6607, 30)

In [78]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the fixed random_state keeps the split repeatable.
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train.shape, X_test.shape

((5285, 30), (1322, 30))

In [79]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [80]:
def evaluate_model(true, predicted):
    """Score a set of regression predictions.

    Args:
        true: Ground-truth target values.
        predicted: Model predictions aligned with ``true``.

    Returns:
        A ``(mae, rmse, r2_square)`` tuple: mean absolute error, root mean
        squared error, and the R2 score.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of recomputing it.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [81]:
# Candidate regressors, all with default hyperparameters. The scikit-learn estimators
# that draw random numbers get a fixed seed so repeated runs give the same scores.
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest Regressor": RandomForestRegressor(random_state=42),
    "XGBRegressor": XGBRegressor(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor(random_state=42)
}
# Parallel lists: model name and its test-set R2, consumed by the summary table cell.
model_list = []
r2_list =[]


def _print_metrics(header, rmse, mae, r2):
    """Print one train/test metrics section in the fixed report layout."""
    print(header)
    print("- Root Mean Squared Error: {:.4f}".format(rmse))
    print("- Mean Absolute Error: {:.4f}".format(mae))
    print("- R2 Score: {:.4f}".format(r2))


for model_name, model in models.items():
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    _print_metrics('Model performance for Training set',
                   model_train_rmse, model_train_mae, model_train_r2)

    print('----------------------------------')

    _print_metrics('Model performance for Test set',
                   model_test_rmse, model_test_mae, model_test_r2)
    # Only the held-out R2 is kept for the ranking.
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 2.1376
- Mean Absolute Error: 0.6490
- R2 Score: 0.7029
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.8407
- Mean Absolute Error: 0.5675
- R2 Score: 0.7603


Lasso
Model performance for Training set
- Root Mean Squared Error: 3.4733
- Mean Absolute Error: 2.4109
- R2 Score: 0.2157
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 3.2832
- Mean Absolute Error: 2.3711
- R2 Score: 0.2374


Ridge
Model performance for Training set
- Root Mean Squared Error: 2.1376
- Mean Absolute Error: 0.6490
- R2 Score: 0.7029
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.8407
- Mean Absolute Error: 0.5675
- R2 Score: 0.7603


K-Neighbors Regressor
Model performance for Training set
- Root Mean Squared Error: 2.4201
- Mean Absolute Error: 1.4949
- R2 Score: 0.6192
-----------------------

In [25]:
# Rank the models by their test-set R2, best first.
score_rows = list(zip(model_list, r2_list))
score_table = pd.DataFrame(score_rows, columns=['Model Name', 'R2_Score'])
score_table.sort_values(by=["R2_Score"], ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.769658
0,Linear Regression,0.76965
7,CatBoosting Regressor,0.744098
6,XGBRegressor,0.671774
5,Random Forest Regressor,0.66674
3,K-Neighbors Regressor,0.548277
1,Lasso,0.437564
4,Decision Tree,0.155597
8,AdaBoost Regressor,-1.53282


In [26]:
from sklearn.model_selection import GridSearchCV

def _fit_grid_search(estimator, param_grid, **search_options):
    """Fit a 5-fold, R2-scored grid search on the training split and return it."""
    search = GridSearchCV(estimator, param_grid, cv=5, scoring='r2', **search_options)
    search.fit(X_train, y_train)
    return search


# Linear Regression: nothing to tune, so the empty grid just yields a
# cross-validated baseline score.
lr = LinearRegression()
lr_grid = {}
lr_search = _fit_grid_search(lr, lr_grid)
print("Best Linear Regression R2:", lr_search.best_score_)

# Ridge Regression: sweep the regularisation strength.
ridge = Ridge()
ridge_grid = {'alpha': [0.01, 0.1, 1, 10, 100]}
ridge_search = _fit_grid_search(ridge, ridge_grid)
print("Best Ridge R2:", ridge_search.best_score_)
print("Best Ridge Params:", ridge_search.best_params_)

# Random Forest: forest size, tree depth and split threshold (run on all cores).
rf = RandomForestRegressor()
rf_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 5, 10]
}
rf_search = _fit_grid_search(rf, rf_grid, n_jobs=-1)
print("Best RF R2:", rf_search.best_score_)
print("Best RF Params:", rf_search.best_params_)

# XGBRegressor: number of rounds, tree depth and learning rate (run on all cores).
xgb = XGBRegressor()
xgb_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2]
}
xgb_search = _fit_grid_search(xgb, xgb_grid, n_jobs=-1)
print("Best XGB R2:", xgb_search.best_score_)
print("Best XGB Params:", xgb_search.best_params_)

Best Linear Regression R2: 0.7231933407844309
Best Ridge R2: 0.7232285212384781
Best Ridge Params: {'alpha': 10}
Best RF R2: 0.6337662703262659
Best RF Params: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best XGB R2: 0.6997561454772949
Best XGB Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}


In [82]:
from sklearn.linear_model import Lasso

# Fit an L1-regularised model on the training split; a feature counts as selected
# when its fitted coefficient is not exactly zero.
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
zero_coefficients = lasso.coef_ == 0
selected = ~zero_coefficients
print(selected)

[ True  True False  True  True False  True  True False  True  True False
  True  True False  True False  True False  True  True  True False  True
  True  True  True  True  True  True]


In [83]:
# Keep only the Lasso-selected feature columns.
# X_train / X_test are overwritten here, so the mask is applied only while the arrays
# still have one column per mask entry. Re-running this cell is then a no-op instead
# of failing because the mask length no longer matches the column count.
if X_train.shape[1] == selected.shape[0]:
    X_train = X_train[:, selected]
    X_test = X_test[:, selected]
elif X_train.shape[1] != int(selected.sum()):
    raise ValueError(
        "X_train has {} columns, which matches neither the mask length ({}) nor the "
        "number of selected features ({})".format(
            X_train.shape[1], selected.shape[0], int(selected.sum())
        )
    )

In [91]:
from sklearn.model_selection import GridSearchCV


# Ridge regression: tune alpha again on the current X_train.
ridge_grid = {'alpha': [10, 8, 50, 100]}
ridge = Ridge()
ridge_search = GridSearchCV(estimator=ridge, param_grid=ridge_grid, scoring='r2', cv=5)
ridge_search.fit(X_train, y_train)
print("Best Ridge R2:", ridge_search.best_score_)
print("Best Ridge Params:", ridge_search.best_params_)

#

Best Ridge R2: 0.7093836089835068
Best Ridge Params: {'alpha': 10}


In [37]:
# Boolean mask over the features the fitted Lasso was trained on:
# True where the coefficient is not exactly zero.
selected = ~(lasso.coef_ == 0)

In [38]:
def _select_columns(features, mask):
    """Return ``features`` restricted to the columns flagged in ``mask``.

    An earlier cell may already have overwritten X_train / X_test with the reduced
    matrices. In that case the column count equals the number of selected features
    rather than the mask length, and the array is returned unchanged instead of
    being indexed with a mask of the wrong length.

    Raises:
        ValueError: if the column count matches neither the mask length nor the
            number of selected features.
    """
    n_columns = features.shape[1]
    if n_columns == mask.shape[0]:
        return features[:, mask]
    if n_columns == int(mask.sum()):
        return features
    raise ValueError(
        "feature matrix has {} columns, which matches neither the mask length ({}) "
        "nor the number of selected features ({})".format(
            n_columns, mask.shape[0], int(mask.sum())
        )
    )


X_train_selected = _select_columns(X_train, selected)
X_test_selected = _select_columns(X_test, selected)

In [39]:
# Train the final estimator on the selected features and predict the held-out rows.
# LinearRegression can be swapped for any other regressor here.
model = LinearRegression()
model.fit(X=X_train_selected, y=y_train)
y_pred = model.predict(X_test_selected)

In [40]:
# Score the final model's test-set predictions and print each metric on its own line.
mae, rmse, r2 = evaluate_model(y_test, y_pred)
for metric_label, metric_value in (("MAE:", mae), ("RMSE:", rmse), ("R2:", r2)):
    print(metric_label, metric_value)

MAE: 0.4524841436015461
RMSE: 1.8048935143428602
R2: 0.7695349207210204
