In [1]:
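# One-time dataset download; uncomment and run if jamb_exam_results.csv is not already in the working directory.
#!wget https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv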

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

# Read the exam results from the local CSV, print the number of missing
# values in each column, then preview the first five rows.
df = pd.read_csv("jamb_exam_results.csv")
print(df.isna().sum())
df.head(5)

JAMB_Score                        0
Study_Hours_Per_Week              0
Attendance_Rate                   0
Teacher_Quality                   0
Distance_To_School                0
School_Type                       0
School_Location                   0
Extra_Tutorials                   0
Access_To_Learning_Materials      0
Parent_Involvement                0
IT_Knowledge                      0
Student_ID                        0
Age                               0
Gender                            0
Socioeconomic_Status              0
Parent_Education_Level          891
Assignments_Completed             0
dtype: int64


Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [3]:
# Data Consistency and Filling Missing Values
# Column names become lower-case with spaces turned into underscores, and every
# missing entry is replaced by 0 (text columns with gaps receive the integer 0 too).
df = df.rename(columns=lambda name: name.lower().replace(' ', '_')).fillna(0)
df.isna().sum()

jamb_score                      0
study_hours_per_week            0
attendance_rate                 0
teacher_quality                 0
distance_to_school              0
school_type                     0
school_location                 0
extra_tutorials                 0
access_to_learning_materials    0
parent_involvement              0
it_knowledge                    0
student_id                      0
age                             0
gender                          0
socioeconomic_status            0
parent_education_level          0
assignments_completed           0
dtype: int64

In [4]:
#Data Splitting

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)

# Remove the identifier column. Using drop(..., errors='ignore') instead of `del`
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the column is already gone.
df = df.drop(columns=['student_id'], errors='ignore')
X = df.copy()

# Two-stage split: 20% is held out for test, then 25% of the remaining 80%
# becomes validation, giving 60/20/20 train/validation/test.
X_train_full, X_test = train_test_split(X, test_size=0.2, random_state=42, shuffle=True)
X_train, X_val = train_test_split(X_train_full, test_size=0.25, random_state=42, shuffle=True)

df_train_full = X_train_full.reset_index(drop=True)
df_train = X_train.reset_index(drop=True)
df_val = X_val.reset_index(drop=True)
df_test = X_test.reset_index(drop=True)

# pop() returns the target column and removes it from the feature frame in one
# step, so targets and features cannot get out of sync.
y_train = df_train.pop('jamb_score').values
y_val = df_val.pop('jamb_score').values
y_test = df_test.pop('jamb_score').values


In [5]:
#Implementing DictVectorizer
# Convert each split into a list of per-row dictionaries.
train_dict, val_dict, test_dict = (
    frame.to_dict(orient='records') for frame in (df_train, df_val, df_test)
)

# The feature mapping is fitted on the training rows only and then reused,
# unchanged, to encode the validation rows.
Xtrain = dv.fit_transform(train_dict)
Xval = dv.transform(val_dict)

In [6]:
#DecisionTree Model Training
from sklearn import tree
from sklearn.tree import export_text

# A depth-1 tree makes exactly one split. random_state pins the tie-breaking
# between equally good candidate splits so the printed tree is reproducible,
# matching the seeded forests trained later in this notebook.
dt = tree.DecisionTreeRegressor(max_depth=1, random_state=1)

model = dt.fit(Xtrain, y_train)

y_pred = model.predict(Xval)

# get_feature_names_out() returns a NumPy array; convert it to a plain list
# because older scikit-learn releases only accept a list for feature_names.
print(export_text(model, feature_names=list(dv.get_feature_names_out())))

|--- study_hours_per_week <= 21.50
|   |--- value: [159.80]
|--- study_hours_per_week >  21.50
|   |--- value: [194.00]



In [7]:
#Random Forest Model Training
from sklearn import ensemble
from sklearn.metrics import mean_squared_error

# Baseline forest: 10 trees, fixed seed, fitted using all available cores.
rf = ensemble.RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)
model_rf = rf.fit(Xtrain, y_train)
y_predrf = model_rf.predict(Xval)

# Validation RMSE is the square root of the mean squared error.
err = mean_squared_error(y_val, y_predrf)
rmse = np.sqrt(err)
rmse


42.575814143713096

In [8]:
#Parameter Tuning

# Forest sizes to try: 10, 20, ..., 200.
num = np.arange(10, 201, 10)

# Maps n_estimators -> validation RMSE rounded to 3 decimals.
scores = {}

for n_trees in num:
    rf = ensemble.RandomForestRegressor(
        n_estimators=n_trees,
        random_state=1,
        n_jobs=-1,
    )
    model_rf = rf.fit(Xtrain, y_train)
    y_predrf = model_rf.predict(Xval)
    err = mean_squared_error(y_val, y_predrf)
    rmse = np.sqrt(err).round(3)
    scores[n_trees] = rmse

scores

{10: 42.576,
 20: 41.662,
 30: 41.149,
 40: 41.239,
 50: 41.134,
 60: 41.046,
 70: 41.048,
 80: 40.959,
 90: 40.864,
 100: 40.733,
 110: 40.696,
 120: 40.563,
 130: 40.483,
 140: 40.47,
 150: 40.482,
 160: 40.467,
 170: 40.405,
 180: 40.428,
 190: 40.399,
 200: 40.398}

In [9]:
#Parameter Tuning

depth = [10, 15, 20, 25]

# Maps (max_depth, n_estimators) -> validation RMSE rounded to 3 decimals.
scores = {}

for n_trees in num:
    for tree_depth in depth:
        rf = ensemble.RandomForestRegressor(
            n_estimators=n_trees,
            max_depth=tree_depth,
            random_state=1,
            n_jobs=-1,
        )
        model_rf = rf.fit(Xtrain, y_train)
        y_predrf = model_rf.predict(Xval)
        err = mean_squared_error(y_val, y_predrf)
        rmse = np.sqrt(err).round(3)
        scores[(tree_depth, n_trees)] = rmse

# Key with the lowest RMSE; keys are ordered (max_depth, n_estimators).
min(scores, key=scores.get)

(10, 200)

In [10]:
#Feature Importance
rf = ensemble.RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1,
)
model_rf = rf.fit(Xtrain, y_train)

importances = model_rf.feature_importances_
feature = dv.get_feature_names_out()

# One row per encoded feature, ordered from most to least important.
feature_imp_df = pd.DataFrame(
    {'Feature': feature, 'Gini Importance': importances}
).sort_values(by='Gini Importance', ascending=False)

print(feature_imp_df)

                             Feature  Gini Importance
27              study_hours_per_week         0.258953
4                    attendance_rate         0.148003
5                 distance_to_school         0.139021
28                   teacher_quality         0.081668
2                                age         0.058906
3              assignments_completed         0.035779
24         socioeconomic_status=High         0.024400
25          socioeconomic_status=Low         0.019753
17           parent_involvement=High         0.019039
10                 it_knowledge=High         0.016490
11                  it_knowledge=Low         0.015029
15  parent_education_level=Secondary         0.014863
16   parent_education_level=Tertiary         0.014074
14    parent_education_level=Primary         0.013449
18            parent_involvement=Low         0.012350
12               it_knowledge=Medium         0.011904
1   access_to_learning_materials=Yes         0.011124
6                 extra_tuto

In [12]:
#XGBoost
#%pip install xgboost
import xgboost as xgb

# Both matrices share the vectorizer's feature names, passed as a plain list.
xgb_feature_names = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(Xtrain, label=y_train, feature_names=xgb_feature_names)
dval = xgb.DMatrix(Xval, label=y_val, feature_names=xgb_feature_names)

# Booster settings: learning rate 0.1, trees up to depth 6, squared-error objective.
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train 200 boosting rounds, then score on the validation matrix.
modelx = xgb.train(xgb_params, dtrain, num_boost_round=200)
y_predx = modelx.predict(dval)

err = mean_squared_error(y_val, y_predx)
rmse = np.sqrt(err).round(3)
rmse

41.759