<a href="https://colab.research.google.com/github/ayamlearning/ML_Zoom_Camp/blob/main/L6_gradient_boosting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#installation
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [None]:
# Load the raw housing data downloaded above.
data = pd.read_csv("housing.csv")
# Seed the preview so the displayed rows are the same on every run.
data.sample(5, random_state=1)

# Preparing the dataset

In [None]:
def apply_log_transform(y):
  """Return log(1 + y), computed with ``np.log1p``."""
  transformed = np.log1p(y)
  return transformed

apply_log_transform(2000)

In [None]:
# Work on an independent (deep) copy so the raw `data` frame stays untouched.
df = data.copy(deep=True)

In [None]:
# Keep only the records whose ocean_proximity is '<1H OCEAN' or 'INLAND'.
keep_rows = df['ocean_proximity'].isin(['<1H OCEAN', 'INLAND'])
df = df.loc[keep_rows]

In [None]:
# Fill missing values with zeros
df = df.fillna(0)

# Apply the log transform to median_house_value.
# The values are read from the untouched `data` frame (rows matched on df's
# index, which the copy/filter/fillna steps preserve) rather than from `df`
# itself, so re-running this cell cannot apply log1p a second time.
# np.log1p is applied to the whole column at once instead of per element.
raw_target = data.loc[df.index, 'median_house_value'].fillna(0)
df['median_house_value'] = apply_log_transform(raw_target)

In [None]:
# 60% / 20% / 20% train / validation / test split via train_test_split,
# with random_state=1 for both cuts.
# Step 1 holds out 20% for test; step 2 takes 25% of the remaining 80%
# (i.e. 20% of the whole) for validation.
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=1)

# Renumber rows from zero in each part.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# pop() hands back the target column and removes it from the feature frame
# in a single step.
y_train = df_train.pop('median_house_value').values
y_val = df_val.pop('median_house_value').values
y_test = df_test.pop('median_house_value').values

print(len(df_train), len(df_val), len(df_test))

In [None]:
# Use DictVectorizer(sparse=True) to turn the dataframes into matrices.
# (The code previously passed sparse=False, contradicting the instruction
# above; the tree, forest and DMatrix steps all accept SciPy sparse input.)
train_dict = df_train.to_dict(orient='records')

dv = DictVectorizer(sparse=True)
# Fit the vocabulary on the training records only ...
X_train = dv.fit_transform(train_dict)

# ... and reuse it unchanged for the validation records.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [None]:
# Show the feature names held by the fitted vectorizer `dv`.
dv.get_feature_names_out()

# Training and evaluating the models

In [None]:
#Let's train a decision tree regressor to predict the median_house_value variable.
#Train a model with max_depth=1.

In [None]:
from sklearn.tree import DecisionTreeRegressor,export_text

In [None]:
# Decision tree with max_depth=1, scored on the validation set.
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_val)
# RMSE = sqrt(MSE). The `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the root here.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(round(rmse, 3))

In [None]:
# Render the fitted tree as text, labelling splits with the vectorizer's feature names.
tree_rules = export_text(dt, feature_names=dv.get_feature_names_out().tolist())
print(tree_rules)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
'''
Train a random forest model with these parameters:
  * n_estimators=10
  * random_state=1
'''

# Random forest with 10 trees and a fixed seed, scored on the validation set.
dt = RandomForestRegressor(n_estimators=10, random_state=1)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_val)
# RMSE = sqrt(MSE). The `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so take the root here.
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(round(rmse, 3))


In [None]:
'''
Now let's experiment with the n_estimators parameter
  * Try different values of this parameter from 10 to 200 with step 10.
  * Set random_state to 1.
  * Evaluate the model on the validation dataset.
'''

# (n_estimators, validation RMSE rounded to 3 decimals) for each forest size.
lst_estimators_acc = []

for i in range(10, 201, 10):
  dt = RandomForestRegressor(n_estimators=i, random_state=1)
  dt.fit(X_train, y_train)

  y_pred = dt.predict(X_val)
  # RMSE = sqrt(MSE). The `squared=False` keyword of mean_squared_error was
  # deprecated in scikit-learn 1.4 and removed in 1.6, so take the root here.
  rmse = np.sqrt(mean_squared_error(y_val, y_pred))
  lst_estimators_acc.append((i, round(rmse, 3)))

In [None]:
df_est = pd.DataFrame(lst_estimators_acc, columns=['estimator', 'rmse'])
# sort_values returns a new frame, so its result has to be the displayed
# expression (it was previously computed and thrown away). df_est itself
# stays in n_estimators order. Lowest RMSE comes first; the stable sort keeps
# ties in increasing n_estimators order, so the smallest forest reaching a
# given RMSE is listed first.
df_est.sort_values(by='rmse', kind='stable').head()

In [None]:
# Validation RMSE as a function of forest size, with labelled axes so the
# figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(df_est.estimator, df_est.rmse)
ax.set(xlabel='n_estimators', ylabel='validation RMSE',
       title='Random forest: validation RMSE vs. n_estimators')
plt.show()

In [None]:
'''
Let's select the best max_depth:
  * Try different values of max_depth: [10, 15, 20, 25]
  * For each of these values,
  * try different values of n_estimators from 10 till 200 (with step 10)
  * calculate the mean RMSE
  * Fix the random seed: random_state=1

'''
# (n_estimators, max_depth, validation RMSE rounded to 3 decimals) for every
# combination in the grid.
lst_estimators_acc = []

for e in range(10, 201, 10):
  for d in [10, 15, 20, 25]:
    dt = RandomForestRegressor(n_estimators=e, max_depth=d, random_state=1)
    dt.fit(X_train, y_train)

    y_pred = dt.predict(X_val)
    # RMSE = sqrt(MSE). The `squared=False` keyword of mean_squared_error was
    # deprecated in scikit-learn 1.4 and removed in 1.6, so take the root here.
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    lst_estimators_acc.append((e, d, round(rmse, 3)))

In [None]:
columns = ['estimator', 'max_depth', 'rmse']
# Keep every grid result: calling .head() on the frame here would leave only
# the first five rows, so the comparison would ignore most of the grid.
df_est = pd.DataFrame(lst_estimators_acc, columns=columns)

# Mean RMSE over all n_estimators values for each max_depth, best (lowest) first.
df_est.groupby('max_depth')['rmse'].mean().sort_values()

In [None]:
'''
We can extract feature importance information from tree-based models.
At each step of the decision tree learning algorithm, it finds the best split.
When doing it, we can calculate "gain" - the reduction in impurity before and
after the split. This gain is quite useful in understanding what are the
important features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

For this homework question, we'll find the most important feature:

Train the model with these parameters:
n_estimators=10,
max_depth=20,
random_state=1,
n_jobs=-1 (optional)
Get the feature importance information from this model
What's the most important feature (among these 4)?

total_rooms
median_income
total_bedrooms
longitude
'''

# Forest whose feature_importances_ are inspected in the next cell.
rf_params = {'n_estimators': 10, 'max_depth': 20, 'random_state': 1}
dt = RandomForestRegressor(**rf_params)
dt.fit(X_train, y_train)

In [None]:
# Pair each feature name with its importance and rank from most to least important.
lst_feat_imp = sorted(
    zip(dv.get_feature_names_out(), dt.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
lst_feat_imp

In [None]:
# Number of feature names produced by the fitted vectorizer `dv`.
len(dv.get_feature_names_out())

In [None]:
'''
Now let's train an XGBoost model! For this question, we'll tune the eta parameter:
  * Install XGBoost
  * Create DMatrix for train and validation
  * Create a watchlist
  * Train a model with these parameters for 100 rounds:
'''

In [None]:
!pip install XGBoost

In [None]:
import xgboost as xgb
import re

In [None]:
def parse_xgb_output(data_str):
  """Parse captured xgboost evaluation text into a DataFrame of RMSE values.

  A usable line carries an iteration tag and both metrics, for example
  ``[12]  train-rmse:0.23456  val-rmse:0.34567``. Lines missing any of the
  three pieces (blank lines, warnings, other log text) are skipped instead of
  raising, so empty or noisy captures are handled.

  Parameters
  ----------
  data_str : str
      Text captured while training.

  Returns
  -------
  pandas.DataFrame
      Columns ``train_rmse`` and ``Validation_rmse`` rounded to 3 decimals,
      indexed by the parsed iteration number (index name ``iteration``).
      The frame is empty when no line matches.
  """
  # Accept plain decimals, integers and scientific notation (e.g. 1e-05).
  number = r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
  iteration_re = re.compile(r'\[(\d+)\]')
  train_re = re.compile(r'train-rmse:' + number)
  val_re = re.compile(r'val-rmse:' + number)

  indices, train_rmse, val_rmse = [], [], []
  for line in data_str.splitlines():
    found_iter = iteration_re.search(line)
    found_train = train_re.search(line)
    found_val = val_re.search(line)
    # Skip anything that is not a complete evaluation line.
    if not (found_iter and found_train and found_val):
      continue
    indices.append(int(found_iter.group(1)))
    train_rmse.append(float(found_train.group(1)))
    val_rmse.append(float(found_val.group(1)))

  # Explicit dtypes keep the empty case numeric. The iteration numbers become
  # the index, so a sorted view still shows which round each row came from.
  df = pd.DataFrame(
      {'train_rmse': train_rmse, 'Validation_rmse': val_rmse},
      index=pd.Index(indices, dtype='int64', name='iteration'),
      dtype='float64',
  )

  return df.round(3)

In [None]:
# Rewrite the vectorizer's feature names before handing them to xgboost:
# '<' is dropped, '=' and ' ' each become '_'.
name_cleanup = str.maketrans({"<": "", "=": "_", " ": "_"})
features = [
    str(name).translate(name_cleanup)
    for name in dv.get_feature_names_out().tolist()
]

dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [None]:
from contextlib import redirect_stdout, redirect_stderr
from io import StringIO

# In-memory sinks that collect whatever is written to stdout / stderr during training.
stdout_buffer, stderr_buffer = StringIO(), StringIO()

xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)
# Datasets evaluated during training, with the labels used in the log lines.
watchlist = [(dtrain, 'train'), (dval, 'val')]

with redirect_stdout(stdout_buffer):
  with redirect_stderr(stderr_buffer):
    model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)

captured_stdout, captured_stderr = stdout_buffer.getvalue(), stderr_buffer.getvalue()

# Turn the captured log into a table and show the rows with the lowest validation RMSE.
df_xgb = parse_xgb_output(captured_stdout)
df_xgb.sort_values(by='Validation_rmse').head()

In [None]:
# In-memory sinks that collect whatever is written to stdout / stderr during training.
stdout_buffer, stderr_buffer = StringIO(), StringIO()

# Same settings as the previous run except for eta.
xgb_params = dict(
    eta=0.1,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

with redirect_stdout(stdout_buffer):
  with redirect_stderr(stderr_buffer):
    model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)

captured_stdout, captured_stderr = stdout_buffer.getvalue(), stderr_buffer.getvalue()

# Turn the captured log into a table and show the rows with the lowest validation RMSE.
df_xgb = parse_xgb_output(captured_stdout)
df_xgb.sort_values(by='Validation_rmse').head()
