In [8]:
import requests

url = "https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv"
# Bound the wait so a stalled connection cannot hang the notebook indefinitely.
response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the CSV.
response.raise_for_status()

# Save the file locally
with open("jamb_exam_results.csv", "wb") as file:
    file.write(response.content)

In [5]:
!python -m pip install xgboost




[notice] A new release of pip is available: 24.1 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Read the downloaded CSV, rewrite every header in lowercase with underscores
# in place of spaces, and discard the student_id column.
df = (
    pd.read_csv("jamb_exam_results.csv")
    .rename(columns=lambda name: name.lower().replace(' ', '_'))
    .drop(columns=['student_id'])
)

# Preview the first rows of the prepared frame
print(df.head())




   jamb_score  study_hours_per_week  attendance_rate  teacher_quality  \
0         192                    22               78                4   
1         207                    14               88                4   
2         182                    29               87                2   
3         210                    29               99                2   
4         199                    12               98                3   

   distance_to_school school_type school_location extra_tutorials  \
0                12.4      Public           Urban             Yes   
1                 2.7      Public           Rural              No   
2                 9.6      Public           Rural             Yes   
3                 2.6      Public           Urban              No   
4                 8.8      Public           Urban              No   

  access_to_learning_materials parent_involvement it_knowledge  age  gender  \
0                          Yes               High       Medium   17

In [11]:
# Replace every missing value with 0 before splitting
df = df.fillna(0)

# 60/20/20 train/validation/test split: hold out 20% for test, then 25% of the
# remaining 80% (= 20% of the total) for validation
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Extract the target arrays, then remove the target column from each feature frame
y_train, y_val, y_test = (part.jamb_score.values for part in (df_train, df_val, df_test))
df_train, df_val, df_test = (
    part.drop(columns=['jamb_score']) for part in (df_train, df_val, df_test)
)

# One dict per row, which is the input format DictVectorizer consumes
train_dict = df_train.to_dict(orient='records')
val_dict = df_val.to_dict(orient='records')

# Fit the vectorizer on the training rows only; validation rows reuse the fitted mapping
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [None]:
#Q1
# Fit a depth-1 tree (a single split) and report which feature it splits on.
# random_state is fixed, matching the other models in this notebook, because
# scikit-learn permutes the candidate features at every split; without a seed,
# equally good splits could yield a different feature from run to run.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

# tree_.feature[0] is the feature index used by the root node; the vectorizer
# maps that index back to the feature name.
print("Feature:", dv.feature_names_[dt.tree_.feature[0]])

Feature: study_hours_per_week


In [13]:
#Q2
# Train a 10-tree random forest (fixed seed, all cores) on the training matrix
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Score it on the validation split: RMSE is the square root of the MSE
y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("RMSE:", rmse)

RMSE: 41.82546234054084


In [14]:
#Q3
# Sweep the forest size from 10 to 200 trees in steps of 10 and collect the
# validation RMSE obtained for each size as (n_estimators, rmse) pairs.
rmse_values = []

for n in range(10, 201, 10):
    # A fresh forest is trained for every size, always with the same seed
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1).fit(X_train, y_train)

    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_values += [(n, rmse)]

print(rmse_values)

[(10, 41.82546234054084), (20, 41.19602596125019), (30, 40.856505465946164), (40, 40.66637198749109), (50, 40.678118442228865), (60, 40.6039010304292), (70, 40.53460449582126), (80, 40.403939657461), (90, 40.376522525281544), (100, 40.43028290774132), (110, 40.47245412796827), (120, 40.52150253517672), (130, 40.529568505653224), (140, 40.4652468105506), (150, 40.43557058443579), (160, 40.46153425052305), (170, 40.47871275067374), (180, 40.4824103492515), (190, 40.46830275530142), (200, 40.454002673715735)]


In [15]:
#Q4
# Grid over max_depth x n_estimators, remembering the depth of the forest with
# the lowest validation RMSE seen so far.
# NOTE(review): the winner is the depth of the single best (depth, n) run, not
# the depth with the best RMSE averaged over n - confirm this is the intended
# selection criterion.
best_rmse, best_depth = float('inf'), None

for depth in (10, 15, 20, 25):
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=depth,
            random_state=1,
            n_jobs=-1,
        ).fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))

        # Strict comparison: on a tie the earlier configuration is kept
        if rmse < best_rmse:
            best_rmse, best_depth = rmse, depth

print("Best max_depth:", best_depth)

Best max_depth: 10


In [16]:
#Q6
# Train a 10-tree forest capped at depth 20 and rank its features by importance
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1).fit(X_train, y_train)

importances = rf.feature_importances_
# Pair each feature name with its importance, ordered from highest to lowest
sorted_importances = sorted(
    zip(dv.feature_names_, importances),
    key=lambda pair: pair[1],
    reverse=True,
)
top_feature = sorted_importances[0][0]
print("Most important feature:", top_feature)

Most important feature: study_hours_per_week


In [17]:
#q7
# Prepare data for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)
# The last entry ('val') is the one early stopping monitors
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Settings shared by both runs; eta is overridden per run below
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'seed': 1,
    'verbosity': 1,
}


def train_with_eta(eta):
    """Train a booster with learning rate `eta` and return (model, best_val_rmse).

    Training runs for up to 100 rounds and stops once the validation RMSE has
    not improved for 10 consecutive rounds.

    The score is the minimum of the recorded validation RMSE history rather
    than `model.eval(dval)`: `Booster.eval` returns a formatted string, and the
    booster returned after early stopping is the last-iteration model, so
    evaluating it would not report the best validation score.
    """
    history = {}
    model = xgb.train(
        {**xgb_params, 'eta': eta},  # copy, so the shared settings are not mutated
        dtrain,
        num_boost_round=100,
        evals=watchlist,
        early_stopping_rounds=10,
        evals_result=history,
    )
    return model, min(history['val']['rmse'])


# Train model with eta=0.3
model_03, rmse_03 = train_with_eta(0.3)

# Train model with eta=0.1
model_01, rmse_01 = train_with_eta(0.1)

print("RMSE with eta=0.3:", rmse_03)
print("RMSE with eta=0.1:", rmse_01)

[0]	train-rmse:42.69384	val-rmse:44.89114
[1]	train-rmse:39.83326	val-rmse:43.07010
[2]	train-rmse:37.94542	val-rmse:42.00332
[3]	train-rmse:36.56125	val-rmse:41.46452
[4]	train-rmse:35.44252	val-rmse:40.88896
[5]	train-rmse:34.57756	val-rmse:40.69096
[6]	train-rmse:33.84230	val-rmse:40.59315
[7]	train-rmse:33.25929	val-rmse:40.47993
[8]	train-rmse:32.79415	val-rmse:40.45326
[9]	train-rmse:32.16019	val-rmse:40.43929
[10]	train-rmse:31.63404	val-rmse:40.48319
[11]	train-rmse:31.17673	val-rmse:40.68201
[12]	train-rmse:30.87313	val-rmse:40.63522
[13]	train-rmse:30.30310	val-rmse:40.70983
[14]	train-rmse:30.00098	val-rmse:40.78133
[15]	train-rmse:29.41497	val-rmse:40.86107
[16]	train-rmse:29.25816	val-rmse:40.96580
[17]	train-rmse:28.59378	val-rmse:41.12190
[18]	train-rmse:28.27990	val-rmse:41.14360
[19]	train-rmse:27.94572	val-rmse:41.22835
[0]	train-rmse:45.49999	val-rmse:47.00533
[1]	train-rmse:44.12948	val-rmse:45.92344
[2]	train-rmse:42.94858	val-rmse:44.98366
[3]	train-rmse:41.90896	

