# Homework 6: Decision Trees and Ensemble Learning for Machine Learning Zoomcamp 2024

https://github.com/DataTalksClub/machine-learning-zoomcamp/blob/master/cohorts/2024/06-trees/homework.md

Dataset: https://www.kaggle.com/datasets/idowuadamo/students-performance-in-2024-jamb

In [1]:
import pandas as pd

In [2]:
!wget https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv

--2024-11-04 12:42:59--  https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv
Resolving github.com (github.com)... 140.82.114.4
Connecting to github.com (github.com)|140.82.114.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv [following]
--2024-11-04 12:42:59--  https://raw.githubusercontent.com/alexeygrigorev/datasets/refs/heads/master/jamb_exam_results.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 391501 (382K) [text/plain]
Saving to: ‘jamb_exam_results.csv’


2024-11-04 12:43:00 (7.82 MB/s) - ‘jamb_exam_results.csv’ saved [391501/391501]



In [3]:
df = pd.read_csv('jamb_exam_results.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')
df.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [4]:
# Remove the student_id column.
df = df.drop(columns=['student_id'])

In [5]:
# Fill missing values with zeros.
df = df.fillna(0)

In [6]:
# Do train/validation/test split with 60%/20%/20% distribution.
from sklearn.model_selection import train_test_split
X = df.drop('jamb_score', axis=1)
y = df['jamb_score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=1)

In [7]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)

train_dict = X_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = X_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = X_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor

dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

importances = dt.feature_importances_

feature_names = dv.get_feature_names_out()

most_important_feature_index = importances.argmax()
most_important_feature = feature_names[most_important_feature_index]

print(f"The most important feature for splitting is: {most_important_feature}")

The most important feature for splitting is: study_hours_per_week


In [9]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
rmse

42.13724207871227

In [10]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

rmse_values = []
for n in range(10, 201, 10):
    model = RandomForestRegressor(n_estimators=n, random_state=1)
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    rmse_values.append((n, rmse))

for i in range(1, len(rmse_values)):
    if round(rmse_values[i-1][1], 3) == round(rmse_values[i][1], 3):
        print(f"RMSE stops improving significantly after n_estimators = {rmse_values[i][0]}")
        break


In [11]:
rmse_values

[(10, 42.13724207871227),
 (20, 41.46121464694444),
 (30, 41.106170947924596),
 (40, 40.917193933296545),
 (50, 40.852278663496854),
 (60, 40.78428140159447),
 (70, 40.677098222414024),
 (80, 40.53933283129176),
 (90, 40.50434592594835),
 (100, 40.51680451861919),
 (110, 40.59335280539747),
 (120, 40.6248503681005),
 (130, 40.650840905587195),
 (140, 40.5948515491302),
 (150, 40.596715029667116),
 (160, 40.60350763548252),
 (170, 40.62754627591216),
 (180, 40.641313925139386),
 (190, 40.63135509073867),
 (200, 40.60101912236933)]

In [12]:
scores = []
for depth in [10, 15, 20, 25]:
    rmse_values = []
    for n in range(10, 201, 10):
        model = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        model.fit(X_train, y_train)
        predictions = model.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, predictions))
        rmse_values.append(rmse)
    scores.append((depth, np.mean(rmse_values)))

best_depth = min(scores, key=lambda x: x[1])[0]
print(f"The best max_depth is: {best_depth}")

The best max_depth is: 10


In [13]:
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

importances = rf.feature_importances_
feature_names = dv.get_feature_names_out()

most_important_feature_index = importances.argmax()
most_important_feature = feature_names[most_important_feature_index]

print(f"The most important feature is: {most_important_feature}")

The most important feature is: study_hours_per_week


In [15]:
import xgboost as xgb
import numpy as np

dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

watchlist = [(dtrain, 'train'), (dval, 'val')]

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}
model_03 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)
y_pred_03 = model_03.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))

xgb_params['eta'] = 0.1
model_01 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, verbose_eval=False)
y_pred_01 = model_01.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))


print(f"RMSE with eta=0.3: {rmse_03}")
print(f"RMSE with eta=0.1: {rmse_01}")

if rmse_03 < rmse_01:
    print("0.3")
elif rmse_01 < rmse_03:
    print("0.1")
else:
    print("Both give equal value")

RMSE with eta=0.3: 43.418817345871766
RMSE with eta=0.1: 41.05034017683498
0.1
