# Homework 06

## 1. Import

### 1.1 Import packages 

In [1]:
# import standard libraries
import pandas as pd
import numpy as np

# import from scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb



### 1.2 Import data

In [2]:
# variable data
df = pd.read_csv("data/jamb_exam_results.csv", delimiter=',')

In [3]:
# display the first 5 rows of the data
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


### 1.3 Preparing the dataset

In [4]:
# Making the column names to be in lower case
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [5]:
# removing the column student_id
df = df.drop('student_id', axis=1)

In [6]:
# filling missing values with 0
df = df.fillna(0)

In [7]:
# train/validation/test split with 60%/20%/20% distribution.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Separate the target variable from the train/val/test sets
y_train = df_train['jamb_score'].values
y_val = df_val['jamb_score'].values
y_test = df_test['jamb_score'].values

del df_train['jamb_score']
del df_val['jamb_score']


In [8]:
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
dv = DictVectorizer(sparse=False)
# convert the dataframes to dictionaries
df_train_dict = df_train.to_dict(orient='records')
# fit the DictVectorizer on the training data
X_train = dv.fit_transform(df_train_dict)
# convert the validation data to dictionaries
df_val_dict = df_val.to_dict(orient='records')
# transform the validation data
X_val = dv.transform(df_val_dict)
# convert test data to dictionaries
df_test_dict = df_test.to_dict(orient='records')
# transform the test data
X_test = dv.transform(df_test_dict)

## 2. Questions

### 2.1 Question 1

In [9]:
# Train a DecisionTreeRegressor model
dt_model = DecisionTreeRegressor(max_depth=1)
dt_model.fit(X_train, y_train)

In [10]:
# Identify the feature used for splitting the data
feature_names = dv.get_feature_names_out(input_features=df_train.columns)
split_feature_idx = dt_model.tree_.feature[0]
split_feature = feature_names[split_feature_idx]

print(f"Feature used for splitting: {split_feature}")

Feature used for splitting: study_hours_per_week


### 2.2 Question 2

In [11]:
# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=10, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)

In [12]:
# Make predictions
y_pred = rf_model.predict(X_test)

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)

# Calculate RMSE
rmse = np.sqrt(mse)

print(f"RMSE: {rmse}")

RMSE: 68.14409732324583


### 2.2 Question 3

In [13]:
# List to store RMSE values for different n_estimators
rmse_values = []

# Loop over different values of n_estimators
for n in range(10, 201, 10):
    # Initialize the model with the current n_estimators and random_state
    rf_model = RandomForestRegressor(n_estimators=n, random_state=1)
    
    # Fit the model on the training data
    rf_model.fit(X_train, y_train)
    
    # Make predictions on the validation data
    y_pred = rf_model.predict(X_val)
    
    # Calculate MSE
    mse = mean_squared_error(y_val, y_pred)
    
    # Calculate RMSE
    rmse = np.sqrt(mse)
    
    # Store the RMSE value
    rmse_values.append((n, rmse))

# Print the RMSE values for different n_estimators
for n, rmse in rmse_values:
    print(f"n_estimators: {n}, RMSE: {rmse}")

n_estimators: 10, RMSE: 42.13724207871227
n_estimators: 20, RMSE: 41.46121464694444
n_estimators: 30, RMSE: 41.106170947924596
n_estimators: 40, RMSE: 40.917193933296545
n_estimators: 50, RMSE: 40.852278663496854
n_estimators: 60, RMSE: 40.78428140159447
n_estimators: 70, RMSE: 40.677098222414024
n_estimators: 80, RMSE: 40.53933283129176
n_estimators: 90, RMSE: 40.50434592594835
n_estimators: 100, RMSE: 40.51680451861919
n_estimators: 110, RMSE: 40.59335280539747
n_estimators: 120, RMSE: 40.6248503681005
n_estimators: 130, RMSE: 40.650840905587195
n_estimators: 140, RMSE: 40.5948515491302
n_estimators: 150, RMSE: 40.596715029667116
n_estimators: 160, RMSE: 40.60350763548252
n_estimators: 170, RMSE: 40.62754627591216
n_estimators: 180, RMSE: 40.641313925139386
n_estimators: 190, RMSE: 40.63135509073867
n_estimators: 200, RMSE: 40.60101912236933


### 2.2 Question 4

In [14]:
# List to store RMSE values for different combinations of max_depth and n_estimators
results = []

# Loop over different values of max_depth
for max_depth in [10, 15, 20, 25]:
    # List to store RMSE values for different n_estimators for the current max_depth
    rmse_values = []
    
    # Loop over different values of n_estimators
    for n in range(10, 201, 10):
        # Initialize the model with the current max_depth, n_estimators, and random_state
        rf_model = RandomForestRegressor(max_depth=max_depth, n_estimators=n, random_state=1)
        
        # Fit the model on the training data
        rf_model.fit(X_train, y_train)
        
        # Make predictions on the validation data
        y_pred = rf_model.predict(X_val)
        
        # Calculate MSE
        mse = mean_squared_error(y_val, y_pred)
        
        # Calculate RMSE
        rmse = np.sqrt(mse)
        
        # Store the RMSE value
        rmse_values.append(rmse)
    
    # Calculate the mean RMSE for the current max_depth
    mean_rmse = np.mean(rmse_values)
    
    # Store the results
    results.append((max_depth, mean_rmse))

# Print the mean RMSE values for different max_depth
for max_depth, mean_rmse in results:
    print(f"max_depth: {max_depth}, Mean RMSE: {mean_rmse}")

max_depth: 10, Mean RMSE: 40.39249798892396
max_depth: 15, Mean RMSE: 40.73528172486332
max_depth: 20, Mean RMSE: 40.739734321829275
max_depth: 25, Mean RMSE: 40.78786565962805


### 2.2 Question 5

In [16]:
# Initialize the model with the specified parameters
rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)

# Fit the model on the training data
rf_model.fit(X_train, y_train)

# Get the feature importance information
feature_importances = rf_model.feature_importances_
feature_names = df_train.columns

# Combine feature names and their importances
feature_importance_info = list(zip(feature_names, feature_importances))

# Sort the features by their importance
sorted_feature_importance = sorted(feature_importance_info, key=lambda x: x[1], reverse=True)

# Print the sorted feature importances
for feature, importance in sorted_feature_importance:
    print(f"Feature: {feature}, Importance: {importance}")

Feature: school_type, Importance: 0.1497290297846747
Feature: school_location, Importance: 0.13648580486323686
Feature: teacher_quality, Importance: 0.06931145469695398
Feature: distance_to_school, Importance: 0.0315167894546007
Feature: age, Importance: 0.0177193420394119
Feature: assignments_completed, Importance: 0.015450360187928142
Feature: extra_tutorials, Importance: 0.013459336291731936
Feature: gender, Importance: 0.012404050525957139
Feature: study_hours_per_week, Importance: 0.012325395818561706
Feature: it_knowledge, Importance: 0.010382634572648446
Feature: attendance_rate, Importance: 0.010261891957053597
Feature: parent_involvement, Importance: 0.009288710588929933
Feature: socioeconomic_status, Importance: 0.009141479881545294
Feature: access_to_learning_materials, Importance: 0.009131355726749439
Feature: parent_education_level, Importance: 0.0


### 2.2 Question 6

In [17]:
# Create DMatrix for train and validation
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Create a watchlist
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Define the parameters
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

# Train the model with eta=0.3
model_1 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)

# Predict on the validation set
y_pred_1 = model_1.predict(dval)

# Calculate RMSE
rmse_1 = np.sqrt(mean_squared_error(y_val, y_pred_1))
print(f"RMSE with eta=0.3: {rmse_1}")

# Change eta to 0.1
xgb_params['eta'] = 0.1

# Train the model with eta=0.1
model_2 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)

# Predict on the validation set
y_pred_2 = model_2.predict(dval)

# Calculate RMSE
rmse_2 = np.sqrt(mean_squared_error(y_val, y_pred_2))
print(f"RMSE with eta=0.1: {rmse_2}")

[0]	train-rmse:42.69552	eval-rmse:44.86028
[1]	train-rmse:39.85005	eval-rmse:43.06070
[2]	train-rmse:37.95285	eval-rmse:41.83772
[3]	train-rmse:36.52203	eval-rmse:41.26987
[4]	train-rmse:35.36066	eval-rmse:41.02965
[5]	train-rmse:34.43646	eval-rmse:40.87186
[6]	train-rmse:33.71528	eval-rmse:40.69132
[7]	train-rmse:33.07449	eval-rmse:40.65074
[8]	train-rmse:32.41772	eval-rmse:40.72710
[9]	train-rmse:31.91000	eval-rmse:40.82972
[10]	train-rmse:31.55119	eval-rmse:40.83684
[11]	train-rmse:30.99534	eval-rmse:40.98243
[12]	train-rmse:30.33163	eval-rmse:41.05515
[13]	train-rmse:30.04347	eval-rmse:41.18635
[14]	train-rmse:29.44279	eval-rmse:41.32160
[15]	train-rmse:29.08791	eval-rmse:41.37832
[16]	train-rmse:28.84106	eval-rmse:41.45737
[17]	train-rmse:28.28883	eval-rmse:41.51113
[18]	train-rmse:27.90071	eval-rmse:41.55139
[19]	train-rmse:27.60694	eval-rmse:41.66496
[20]	train-rmse:27.24424	eval-rmse:41.80313
[21]	train-rmse:26.79013	eval-rmse:41.88767
[22]	train-rmse:26.64232	eval-rmse:41.9048