In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns 
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [7]:
df = pd.read_csv('jamb_exam_results.csv')
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [10]:
df.columns = df.columns.str.lower().str.replace('_',' ')
df.head()

Unnamed: 0,jamb score,study hours per week,attendance rate,teacher quality,distance to school,school type,school location,extra tutorials,access to learning materials,parent involvement,it knowledge,student id,age,gender,socioeconomic status,parent education level,assignments completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [24]:
df['parent education level'] = df['parent education level'].fillna(0)

In [25]:
df.isnull().sum()

jamb score                      0
study hours per week            0
attendance rate                 0
teacher quality                 0
distance to school              0
school type                     0
school location                 0
extra tutorials                 0
access to learning materials    0
parent involvement              0
it knowledge                    0
age                             0
gender                          0
socioeconomic status            0
parent education level          0
assignments completed           0
dtype: int64

In [26]:
df_full_train , df_test = train_test_split(df ,test_size=0.2 , random_state=1 )
df_train , df_val = train_test_split(df_full_train , test_size= 0.25 , random_state=1)

In [29]:
y_train = df_train['jamb score'].values
y_test = df_test['jamb score'].values
y_val = df_val['jamb score'].values

del df_train['jamb score']
del df_test['jamb score']
del df_val['jamb score']

In [33]:
dv = DictVectorizer(sparse=False)

dicts_train = df_train.to_dict(orient='records')
X_train = dv.fit_transform(dicts_train)

dicts_test = df_test.to_dict(orient='records')
X_test = dv.fit_transform(dicts_test)

dicts_val = df_val.to_dict(orient='records')
X_val = dv.fit_transform(dicts_val)

In [36]:
from sklearn.tree import DecisionTreeRegressor 
from sklearn.tree import export_text

In [35]:
dr = DecisionTreeRegressor(max_depth=1)

dr.fit(X_train , y_train)

In [39]:
features = list(dv.get_feature_names_out())
print(export_text(dr , feature_names=features))

|--- study hours per week <= 18.50
|   |--- value: [155.24]
|--- study hours per week >  18.50
|   |--- value: [188.59]



In [42]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [41]:
rf = RandomForestRegressor(n_estimators = 10 , random_state = 1,
                          n_jobs=-1)
rf.fit(X_train , y_train)

In [44]:
y_pred = rf.predict(X_val)

mse =  mean_squared_error(y_val , y_pred)
rmse = np.sqrt(mse)
rmse

42.13724207871227

In [47]:
scores = []
for n in range(10,201,10):
    rf = RandomForestRegressor(n_estimators = n , random_state = 1,
                          n_jobs=-1)
    rf.fit(X_train , y_train)

    y_pred = rf.predict(X_val)

    mse =  mean_squared_error(y_val , y_pred)
    rmse = np.sqrt(mse)
    scores.append((n,round(rmse,3)))

columns = ['n_estimators' , 'rmse']
df_scores = pd.DataFrame(scores , columns=columns)
df_scores

Unnamed: 0,n_estimators,rmse
0,10,42.137
1,20,41.461
2,30,41.106
3,40,40.917
4,50,40.852
5,60,40.784
6,70,40.677
7,80,40.539
8,90,40.504
9,100,40.517


In [78]:
scores = []
for d in [10, 15, 20, 25] :
    for n in range(10,201,10):
        m = []
        rf = RandomForestRegressor(max_depth=d,n_estimators = n , random_state = 1,
                              n_jobs=-1)
        rf.fit(X_train , y_train)
    
        y_pred = rf.predict(X_val)
    
        mse =  mean_squared_error(y_val , y_pred)
        rmse = np.sqrt(mse)
        m.append(rmse)
    avg=np.mean(m)
    scores.append((d,avg ))
    
    
columns = ['max_depth' , 'mean_rmse']
df_scores = pd.DataFrame(scores , columns=columns)
df_scores

Unnamed: 0,max_depth,mean_rmse
0,10,40.325012
1,15,40.543618
2,20,40.547698
3,25,40.598527


In [97]:
rf = RandomForestRegressor(n_estimators = 10, max_depth=20 , random_state = 1,
                          n_jobs=-1)
rf.fit(X_train , y_train)
f=dict(zip(features , rf.feature_importances_))

In [105]:
max(f , key=f.get) , max(f.values())


('study hours per week', 0.24835361199396536)

In [103]:
f

{'access to learning materials=No': 0.012325395818561702,
 'access to learning materials=Yes': 0.010261891957053597,
 'age': 0.06931145469695398,
 'assignments completed': 0.03151678945460071,
 'attendance rate': 0.1497290297846747,
 'distance to school': 0.1364858048632369,
 'extra tutorials=No': 0.013459336291731934,
 'extra tutorials=Yes': 0.009131355726749437,
 'gender=Female': 0.009288710588929935,
 'gender=Male': 0.010382634572648446,
 'it knowledge=High': 0.0177193420394119,
 'it knowledge=Low': 0.012404050525957137,
 'it knowledge=Medium': 0.009141479881545294,
 'parent education level': 0.0,
 'parent education level=Primary': 0.01545036018792814,
 'parent education level=Secondary': 0.016956919815100068,
 'parent education level=Tertiary': 0.014488617277194148,
 'parent involvement=High': 0.022918852929064194,
 'parent involvement=Low': 0.013357613537934167,
 'parent involvement=Medium': 0.011492011642547156,
 'school location=Rural': 0.009559093298481168,
 'school location=Ur

In [106]:
import xgboost as xgb

In [127]:
def parse_output_xgb(output):
    results=[]
    for line in output.stdout.strip().split('\n'):
        num_iter, train_auc, val_auc = line.split('\t')
        val_rmse = float(val_auc.split(':')[1])
        results.append(val_rmse)
    df_result = pd.DataFrame(results , columns=['val rmse'])
    return df_result

In [107]:
dtrain = xgb.DMatrix(X_train , label = y_train , feature_names=features)
dval = xgb.DMatrix(X_val , label = y_val , feature_names=features)

In [108]:
watchlist = [(dtrain ,'train'),(dval,'val')]

In [125]:
%%capture output 
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric' : 'rmse',
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params , dtrain , num_boost_round=100,
                 verbose_eval=5,
                 evals=watchlist)

In [134]:

df_r1= parse_output_xgb(output)
df_r1.mean()

val rmse    41.034721
dtype: float64

In [130]:
%%capture output 
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric' : 'rmse',
    
    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params , dtrain , num_boost_round=100,
                 verbose_eval=5,
                 evals=watchlist)

In [133]:
df_r2=parse_output_xgb(output)
df_r2.mean()

val rmse    41.034721
dtype: float64