In [18]:
import numpy as np
import pandas as pd

import xgboost as xgb

from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, PredefinedSplit
from sklearn.tree import export_text, DecisionTreeRegressor

**Question 01**

In [2]:
# Load the raw housing table; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='housing.csv')

In [3]:
# Keep only the '<1H OCEAN' and 'INLAND' rows, replace missing values with 0,
# log-transform the target with log1p, and renumber the index from 0.
# Written as a single chain that returns new frames, instead of mutating the
# filtered result in place (the pattern that triggers SettingWithCopyWarning).
# NOTE: `df` is rebound, so re-running this cell without reloading the CSV
# would apply log1p to the target a second time.
df = (
    df[df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])]
    .fillna(0)
    .assign(median_house_value=lambda frame: np.log1p(frame['median_house_value']))
    .reset_index(drop=True)
)

In [4]:
def split_data(data, labels, test_size=None, val_size=None, random_state=None):
    """Split ``data``/``labels`` into train, validation and test partitions.

    Two chained ``train_test_split`` calls are used: the first carves off a
    hold-out of ``test_size + val_size`` of the rows, the second divides that
    hold-out into the validation and test parts.

    Parameters
    ----------
    data, labels
        Inputs forwarded to ``train_test_split``.
    test_size, val_size : float
        Fractions of the full dataset for the test and validation partitions.
        Both are required despite the ``None`` defaults.
    random_state
        Seed forwarded to both ``train_test_split`` calls.

    Returns
    -------
    tuple
        ``(train_data, val_data, test_data, train_labels, val_labels, test_labels)``.

    Raises
    ------
    TypeError
        If ``test_size`` or ``val_size`` is not provided.
    """
    if test_size is None or val_size is None:
        raise TypeError("split_data requires both test_size and val_size")

    holdout_size = test_size + val_size
    # Share of the hold-out that belongs to the *test* partition. The second
    # split's `test_size` argument sizes its second output (the test rows), so
    # it must be test_size / holdout, not val_size / holdout; the two only
    # coincide when test_size == val_size.
    test_ratio = test_size / holdout_size

    train_data, temp_data, train_labels, temp_labels = train_test_split(
        data, labels, test_size=holdout_size,
        random_state=random_state
    )
    val_data, test_data, val_labels, test_labels = train_test_split(
        temp_data, temp_labels, test_size=test_ratio,
        random_state=random_state
    )
    return train_data, val_data, test_data, train_labels, val_labels, test_labels

In [5]:
# Target is the (already log-transformed) house value; features are all other columns.
y = df['median_house_value']
X = df.drop(columns='median_house_value')

# 60/20/20 train/validation/test split with a fixed seed.
Xtrain, Xvalid, Xtest, ytrain, yvalid, ytest = split_data(
    X, y, test_size=0.2, val_size=0.2, random_state=1
)

# Turn each row into a record dict so DictVectorizer can encode it.
train_dict = Xtrain.to_dict(orient='records')
valid_dict = Xvalid.to_dict(orient='records')

# The vectorizer is fitted on the training records only and then reused,
# unchanged, for the validation records.
dv = DictVectorizer(sparse=True)
Xtrain = dv.fit_transform(train_dict)
Xvalid = dv.transform(valid_dict)

# A depth-1 tree, i.e. a single split.
model = DecisionTreeRegressor(max_depth=1).fit(Xtrain, ytrain)
model

In [6]:
# Print the fitted tree's split rule. The feature names are converted to a
# plain list (as done for the XGBoost cell further down), because some
# scikit-learn releases reject a NumPy array for `feature_names`.
print(export_text(model, feature_names=list(dv.get_feature_names_out())))

|--- ocean_proximity=<1H OCEAN <= 0.50
|   |--- value: [11.61]
|--- ocean_proximity=<1H OCEAN >  0.50
|   |--- value: [12.30]


**Question 02**

In [9]:
def train(data, labels, estimators=10, seed=1):
    """Fit a random forest on a fresh train split and return its validation RMSE.

    Parameters
    ----------
    data
        Feature table; after splitting, each partition must support
        ``to_dict(orient='records')``.
    labels
        Target values aligned with ``data``.
    estimators : int, default 10
        Number of trees (``n_estimators``) in the forest.
    seed : int, default 1
        Seed used for both the data split and the forest.

    Returns
    -------
    float
        Root mean squared error of the forest on the validation partition.
    """
    train_data, val_data, _, train_labels, val_labels, _ = split_data(
        data, labels, test_size=0.2, val_size=0.2,
        random_state=seed
    )

    train_records = train_data.to_dict(orient='records')
    val_records = val_data.to_dict(orient='records')

    # Use a vectorizer that is local to this call. The previous code created
    # one but then fitted the notebook-level `dv`, silently overwriting state
    # that other cells rely on.
    vectorizer = DictVectorizer(sparse=True)
    train_matrix = vectorizer.fit_transform(train_records)
    val_matrix = vectorizer.transform(val_records)

    clf = RandomForestRegressor(n_estimators=estimators, n_jobs=-1, random_state=seed)
    clf.fit(train_matrix, train_labels)
    val_pred = clf.predict(val_matrix)

    # RMSE computed as sqrt(MSE): the `squared=False` keyword was deprecated
    # and later removed from `mean_squared_error` in newer scikit-learn
    # releases, whereas this form works on every version.
    return np.sqrt(mean_squared_error(val_labels, val_pred))

In [8]:
# Random forest with 10 trees, scored on the validation partition.
model = RandomForestRegressor(n_estimators=10, n_jobs=-1, random_state=1)
model.fit(Xtrain, ytrain)

ypred = model.predict(Xvalid)

# RMSE as sqrt(MSE), with arguments in (y_true, y_pred) order. The
# `squared=False` keyword was deprecated and later removed from
# `mean_squared_error` in newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(yvalid, ypred))

np.round(rmse, 3)

0.235

**Question 03**

In [10]:
# Validation RMSE (rounded to 3 decimals) for forests of 10, 20, ..., 190 trees.
n_estimators = [
    {'n_estimator': tree_count, 'RMSE': np.round(train(X, y, estimators=tree_count), 3)}
    for tree_count in tqdm(range(10, 200, 10))
]

In [11]:
# Tabulate the sweep: one row per forest size with its validation RMSE.
pd.DataFrame(data=n_estimators)

Unnamed: 0,n_estimator,RMSE
0,10,0.234947
1,20,0.226048
2,30,0.223287
3,40,0.223104
4,50,0.221692
5,60,0.22163
6,70,0.221184
7,80,0.221054
8,90,0.221393
9,100,0.220897


**Question 04**

In [None]:
# Hyper-parameter grid: forest size crossed with maximum tree depth.
param_grid = {
    'n_estimators': list(range(10, 200, 10)),
    'max_depth': [10, 15, 20, 25]
}

# NOTE(review): no `cv` argument is given, so every candidate is scored with
# GridSearchCV's default cross-validation over the training rows rather than
# on Xvalid/yvalid — confirm this is the intended evaluation.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(Xtrain, ytrain)

results = pd.DataFrame(grid_search.cv_results_)

# Display the best parameters and corresponding RMSE
best_params = grid_search.best_params_
best_rmse = -grid_search.best_score_  # Negate to get the actual RMSE

print("Best Parameters:", best_params)
print("Best RMSE:", best_rmse)

# Store the RMSE values in a DataFrame. `mean_test_score` holds the *negated*
# RMSE (the scorer is 'neg_root_mean_squared_error'), so the sign is flipped to
# make the 'RMSE' column a true RMSE. A non-mutating chain is used instead of
# an in-place rename on a column subset, which raises SettingWithCopyWarning.
rmse_values = (
    results[['param_n_estimators', 'param_max_depth', 'mean_test_score']]
    .rename(columns={'mean_test_score': 'RMSE'})
    .assign(RMSE=lambda frame: -frame['RMSE'])
)

rmse_values

Fitting 5 folds for each of 76 candidates, totalling 380 fits


**Question 05**

In [None]:
# Refit a 10-tree forest capped at depth 20 and show its per-feature importances.
model = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    n_jobs=-1,
    random_state=1,
).fit(Xtrain, ytrain)

model.feature_importances_

**Question 06**

In [None]:
# DictVectorizer produces names such as 'ocean_proximity=<1H OCEAN'. XGBoost's
# feature-name validation rejects names containing '[', ']' or '<', so the
# '=<' / '=' separators are replaced with '_' before building the matrices.
features = [
    name.replace('=<', '_').replace('=', '_')
    for name in dv.get_feature_names_out()
]

dtrain = xgb.DMatrix(data=Xtrain, label=ytrain, feature_names=features)
dvalid = xgb.DMatrix(data=Xvalid, label=yvalid, feature_names=features)

In [None]:
# Booster configuration with learning rate (eta) 0.3.
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train for 100 boosting rounds on the training matrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100)

In [None]:
# Score the booster on the validation matrix.
ypred = model.predict(dvalid)

# RMSE as sqrt(MSE), with arguments in (y_true, y_pred) order. The
# `squared=False` keyword was deprecated and later removed from
# `mean_squared_error` in newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(yvalid, ypred))

np.round(rmse, 3)

In [None]:
# Same booster configuration as the previous run, but with eta lowered to 0.1.
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# Train for 100 boosting rounds on the training matrix.
model = xgb.train(params=xgb_params, dtrain=dtrain, num_boost_round=100)

In [None]:
# Score the eta=0.1 booster on the validation matrix.
ypred = model.predict(dvalid)

# RMSE as sqrt(MSE), with arguments in (y_true, y_pred) order. The
# `squared=False` keyword was deprecated and later removed from
# `mean_squared_error` in newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(yvalid, ypred))

np.round(rmse, 3)