In [1]:
import numpy as np
import pandas as pd

import xgboost as xgb

from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit
from sklearn.tree import export_text, DecisionTreeRegressor

**Question 01**

In [2]:
# Location of the housing CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/ml-learning-github/06-trees/housing.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Keep only the '<1H OCEAN' and 'INLAND' rows, replace missing values with 0,
# log-transform the target with log1p and renumber the index.
# Written as one chain that builds a new frame: nothing is mutated in place on
# a filtered selection, which is the pattern that can raise pandas'
# SettingWithCopyWarning.
# NOTE: `df` is still rebound here, so re-running this cell without re-running
# the load cell would apply log1p a second time.
df = (
    df.query("ocean_proximity == '<1H OCEAN' | ocean_proximity == 'INLAND'")
    .fillna(0)
    .assign(median_house_value=lambda frame: np.log1p(frame['median_house_value']))
    .reset_index(drop=True)
)

In [4]:
def split_data(data, labels, test_size=None, val_size=None, random_state=None):
    """Split ``data``/``labels`` into train, validation and test parts.

    The split is done in two steps: first a hold-out of size
    ``test_size + val_size`` is separated from the training part, then the
    hold-out is divided into validation and test parts.

    Parameters
    ----------
    data, labels
        Indexable collections of equal length, forwarded to
        ``train_test_split``.
    test_size, val_size
        Size of the test and validation parts. Both are required and must be
        expressed in the same unit, because they are added together.
    random_state
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(train_data, val_data, test_data, train_labels, val_labels,
        test_labels)``.

    Raises
    ------
    TypeError
        If ``test_size`` or ``val_size`` is omitted.
    """
    if test_size is None or val_size is None:
        raise TypeError("split_data requires both test_size and val_size")

    holdout_size = test_size + val_size
    train_data, temp_data, train_labels, temp_labels = train_test_split(
        data, labels, test_size=holdout_size,
        random_state=random_state
    )

    # The second output of train_test_split is the "test" part, so the fraction
    # passed here must be the share of the hold-out that belongs to the test
    # set. Passing the validation share instead would swap the two sizes
    # whenever val_size != test_size.
    test_ratio = test_size / holdout_size
    val_data, test_data, val_labels, test_labels = train_test_split(
        temp_data, temp_labels, test_size=test_ratio,
        random_state=random_state
    )
    return train_data, val_data, test_data, train_labels, val_labels, test_labels

In [5]:
# Separate the feature columns from the target column.
X = df.drop(columns='median_house_value')
y = df['median_house_value']

# 60/20/20 train/validation/test split with a fixed seed.
Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest = split_data(
    X, y, test_size=0.2, val_size=0.2, random_state=1
)

# One dict per row, as expected by DictVectorizer.
train_dict = Xtrain.to_dict(orient='records')
valid_dict = Xvalid.to_dict(orient='records')

# Fit the vectorizer on the training rows only; from here on Xtrain / Xvalid
# hold the vectorized matrices rather than DataFrames.
dv = DictVectorizer(sparse=True)
Xtrain = dv.fit_transform(train_dict)
Xvalid = dv.transform(valid_dict)

# Depth-1 tree: a single split.
model = DecisionTreeRegressor(max_depth=1).fit(Xtrain, ytrain)
model

In [6]:
# Render the fitted tree as text, labelled with the vectorizer's feature names.
tree_rules = export_text(model, feature_names=list(dv.get_feature_names_out()))
print(tree_rules)

|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]



**Question 02**

In [49]:
def train(data, labels, estimators=10, depth=None, seed=1):
    """Fit a random forest on the training split and return the validation RMSE.

    Parameters
    ----------
    data
        Feature table; must support ``to_dict(orient='records')`` after
        splitting (e.g. a pandas DataFrame).
    labels
        Target values aligned with ``data``.
    estimators
        Number of trees (``n_estimators``) of the forest.
    depth
        Maximum tree depth (``max_depth``); ``None`` leaves it unlimited.
    seed
        Seed used both for the data split and for the forest.

    Returns
    -------
    float
        Root mean squared error of the forest on the validation split.
    """
    train_data, val_data, _, train_labels, val_labels, _ = split_data(
        data, labels, test_size=0.2, val_size=0.2,
        random_state=seed
    )

    # Use a vectorizer local to this call. The previous version created one
    # but then refitted the notebook-level `dv`, silently mutating global state.
    vectorizer = DictVectorizer(sparse=True)
    x_train = vectorizer.fit_transform(train_data.to_dict(orient='records'))
    x_val = vectorizer.transform(val_data.to_dict(orient='records'))

    forest = RandomForestRegressor(
        n_estimators=estimators, max_depth=depth, n_jobs=-1, random_state=seed
    )
    forest.fit(x_train, train_labels)
    val_pred = forest.predict(x_val)

    # sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Arguments follow the documented
    # (y_true, y_pred) order.
    return np.sqrt(mean_squared_error(val_labels, val_pred))

In [51]:
# Validation RMSE of the forest with train()'s default settings, to 3 decimals.
rmse = train(X, y)
np.round(rmse, decimals=3)

0.235

**Question 03**

In [9]:
# Validation RMSE (rounded to 3 decimals) for n_estimators = 10, 20, ..., 190.
n_estimators = [
    {'n_estimator': parm, 'RMSE': np.round(train(X, y, estimators=parm), 3)}
    for parm in tqdm(range(10, 200, 10))
]

100%|██████████| 19/19 [04:58<00:00, 15.73s/it]


In [48]:
# Ten settings with the lowest rounded RMSE.
(
    pd.DataFrame(n_estimators)
    .sort_values(by='RMSE')
    .head(n=10)
)

Unnamed: 0,n_estimator,RMSE
18,190,0.22
16,170,0.22
15,160,0.22
14,150,0.22
13,140,0.22
12,130,0.22
11,120,0.22
10,110,0.22
17,180,0.22
9,100,0.221


**Question 04**

In [11]:
# Search space: number of trees x maximum depth.
param_grid = dict(
    n_estimators=list(range(10, 200, 10)),
    max_depth=[10, 15, 20, 25],
)

# No `cv` argument is given, so GridSearchCV's default cross-validation is run
# on Xtrain; Xvalid is not used in this cell.
base_forest = RandomForestRegressor(random_state=1)
grid_search = GridSearchCV(
    base_forest,
    param_grid,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(Xtrain, ytrain)

# One row per parameter combination.
results = pd.DataFrame(grid_search.cv_results_)

In [45]:
# Best parameter combination; the scorer is the negated RMSE, so the sign of
# the best score is flipped back.
best_params, best_rmse = grid_search.best_params_, -grid_search.best_score_

print("Best Parameters:", best_params)
print("Best RMSE:", best_rmse)

Best Parameters: {'max_depth': 20, 'n_estimators': 180}
Best RMSE: 0.23149441416413294


In [47]:
# Collect the RMSE of every parameter combination: take the relevant columns,
# flip the sign of the negated score, round to 3 decimals and rename it.
rmse_values = (
    results[['param_n_estimators', 'param_max_depth', 'mean_test_score']]
    .assign(mean_test_score=lambda frame: np.round(-frame['mean_test_score'], 3))
    .rename(columns={'mean_test_score': 'RMSE'})
)

rmse_values.sort_values(by='RMSE')

Unnamed: 0,param_n_estimators,param_max_depth,RMSE
55,180,20,0.231
37,190,15,0.232
32,140,15,0.232
33,150,15,0.232
34,160,15,0.232
...,...,...,...
1,20,10,0.246
57,10,25,0.246
19,10,15,0.246
38,10,20,0.246


In [55]:
# For each max_depth, average the validation RMSE over n_estimators = 10..190.
scores = [
    {
        'max_depth': depth,
        'rmse_mean': np.mean([
            train(X, y, estimators=parm, depth=depth)
            for parm in tqdm(range(10, 200, 10))
        ]),
    }
    for depth in [10, 15, 20, 25]
]

100%|██████████| 19/19 [01:34<00:00,  4.99s/it]
100%|██████████| 19/19 [03:14<00:00, 10.23s/it]
100%|██████████| 19/19 [04:33<00:00, 14.40s/it]
100%|██████████| 19/19 [04:53<00:00, 15.42s/it]


In [56]:
# Depths ordered from lowest to highest mean RMSE.
(
    pd.DataFrame(scores)
    .sort_values(by='rmse_mean')
)

Unnamed: 0,max_depth,rmse_mean
3,25,0.22167
2,20,0.221962
1,15,0.223126
0,10,0.232989


**Question 05**

In [63]:
# Forest with 10 trees of depth <= 20, fitted on the vectorized training matrix.
model = RandomForestRegressor(
    n_estimators=10, max_depth=20, n_jobs=-1, random_state=1
).fit(Xtrain, ytrain)

# Pair each vectorizer feature name with its importance, most important first.
importances = pd.DataFrame(
    {'feature': dv.feature_names_, 'score': model.feature_importances_}
)
importances.sort_values(by='score', ascending=False)

Unnamed: 0,feature,score
4,median_income,0.33796
5,ocean_proximity=<1H OCEAN,0.250824
6,ocean_proximity=INLAND,0.108599
2,latitude,0.096263
3,longitude,0.08551
1,housing_median_age,0.032818
7,population,0.028586
9,total_rooms,0.023545
0,households,0.018616
8,total_bedrooms,0.017279


**Question 06**

In [16]:
# NOTE: `features` is collected here but is not passed to the DMatrix objects below.
features = dv.get_feature_names_out()

# Wrap the vectorized train/validation matrices and their targets for xgboost.
dtrain = xgb.DMatrix(Xtrain, label=ytrain)
dvalid = xgb.DMatrix(Xvalid, label=yvalid)

In [17]:
# Booster configuration with learning rate eta = 0.3.
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# 100 boosting rounds on the training DMatrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100)

In [18]:
# Predict on the validation DMatrix with the booster trained above.
ypred = model.predict(dvalid)

# RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6. Arguments follow the documented
# (y_true, y_pred) order; the value is the same as before.
rmse = np.sqrt(mean_squared_error(yvalid, ypred))

np.round(rmse, 3)

0.215

In [19]:
# Same booster configuration as the eta = 0.3 run, with eta lowered to 0.1.
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# 100 boosting rounds on the training DMatrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100)

In [20]:
# Predict on the validation DMatrix with the booster trained above.
ypred = model.predict(dvalid)

# RMSE as sqrt(MSE): the `squared=False` keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6. Arguments follow the documented
# (y_true, y_pred) order; the value is the same as before.
rmse = np.sqrt(mean_squared_error(yvalid, ypred))

np.round(rmse, 3)

0.218