# Boosting

In [None]:
# When running, you will need to have xgboost installed, for example with:
# !pip install xgboost
# WARNING: you may also need to make sure your xgboost and scikit-learn
# versions are compatible; see, for example:
# https://stackoverflow.com/questions/79290968/super-object-has-no-attribute-sklearn-tags

In [None]:
# The following will import our libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import DecisionBoundaryDisplay

In [None]:
# Load the California housing dataset. as_frame=True asks for pandas objects,
# which is what makes the `.frame` attribute (a single DataFrame) available.
california_housing = fetch_california_housing(as_frame=True)
ca_housing_df = california_housing.frame

## Train/test split

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    california_housing.data,
    california_housing.target,
    test_size=0.2,
    random_state=42,
)

## Linear Regression

In [None]:
# Baseline model: ordinary least squares on the raw features.
lin_reg = LinearRegression().fit(x_train, y_train)

# R2 on the held-out test set.
test_score = lin_reg.score(x_test, y_test)
print(f"R2 of Linear Regression: {test_score:.2f}")

# RMSE on the same test set, in the units of the target.
preds = lin_reg.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % rmse)

## Random Forest

In [None]:
# Random forest with hand-picked settings: 200 trees, depth capped at 20,
# all CPU cores (n_jobs=-1), seeded for reproducibility.
rf_reg = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    n_jobs=-1,
    random_state=42,
).fit(x_train, y_train)

# R2 on the held-out test set.
test_score = rf_reg.score(x_test, y_test)
print(f"R2 of Random Forest: {test_score:.2f}")

# RMSE on the same test set.
preds = rf_reg.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % rmse)

In [None]:
# Candidate settings for the cross-validated search: 2 depths x 1 forest size
# x 2 leaf-count caps = 4 combinations.
rf_param_grid = {
    'max_depth': [10, 20],
    'n_estimators': [200],
    'max_leaf_nodes': [8, 16],
}

cv_grid = GridSearchCV(
    RandomForestRegressor(n_jobs=-1, random_state=42),
    param_grid=rf_param_grid,
)
cv_grid.fit(x_train, y_train)

# Show the winning combination.
cv_grid.best_params_

In [None]:
# Predict on the held-out test set through the fitted grid search and report
# the resulting R2.
y_predict = cv_grid.predict(x_test)
r2score = r2_score(y_test,y_predict)
print('R2 of the best Random Forest regressor after CV is %.2f' % (r2score))

In [None]:
# Feature importances of the hand-tuned forest (rf_reg, not the grid-searched
# one), drawn as one horizontal bar per input column. Labels and a title make
# the figure readable on its own; plt.show() keeps the BarContainer repr out of
# the cell output.
fig, ax = plt.subplots()
ax.barh(california_housing.data.columns, rf_reg.feature_importances_)
ax.set(xlabel="Feature importance",
       ylabel="Feature",
       title="Random Forest feature importances")
plt.show()

## XGBoost

[XGBoost documentation](https://xgboost.readthedocs.io/en/stable/)
* *"XGBoost is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable. It implements machine learning algorithms under the Gradient Boosting framework. XGBoost provides a parallel tree boosting (also known as GBDT, GBM) that solve many data science problems in a fast and accurate way. The same code runs on major distributed environment (Hadoop, SGE, MPI) and can solve problems beyond billions of examples."*

In [None]:
# First attempt at an XGBoost regressor with hand-picked hyperparameters
# (their meaning is covered by the parameter documentation linked below).
# Note that n_estimators is set to only 10 here.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', 
                          colsample_bytree = 0.3, 
                          learning_rate = 0.1,
                          max_depth = 5, 
                          alpha = 10, 
                          n_estimators = 10)

What are the above input parameters? [documentation on parameters](https://xgboost.readthedocs.io/en/stable/parameter.html)

In [None]:
# Fit the regressor on the training split.
xg_reg.fit(x_train,y_train)

In [None]:
# Predictions on the held-out test set; reused for the RMSE below.
preds = xg_reg.predict(x_test)

In [None]:
# R2 on the held-out test set.
test_score = xg_reg.score(x_test, y_test)
print(f"R2 of XGBoost: {test_score:.2f}")

# RMSE computed from the `preds` produced in the previous cell.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

Well... it doesn't look like we picked a good set of parameters.

We can improve by:
* doing cross validation and searching for a better set of hyperparameters
* the performance will also benefit from using the DMatrices of XGBoost:
  * *"DMatrix is the baisc data storage for XGBoost used by all XGBoost algorithms including both training, prediction and explanation. There are a few variants of DMatrix including normal DMatrix, which is a CSR matrix, QuantileDMatrix, which is used by histogram-based tree methods for saving memory, and lastly the experimental external-memory-based DMatrix, which reads data in batches during training."* -- [documentation](https://xgboost.readthedocs.io/en/stable/c.html#dmatrix)

In [None]:
# Wrap both splits in XGBoost's native DMatrix container (features + labels).
data_dmatrix_train = xgb.DMatrix(x_train, label=y_train)
data_dmatrix_test = xgb.DMatrix(x_test, label=y_test)

In [None]:
# Same hyperparameters as the first XGBRegressor, expressed as the plain dict
# that the native XGBoost API expects.
params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}

# 3-fold cross-validation for up to 1000 boosting rounds, with early stopping
# (patience of 10 rounds) on RMSE; the result comes back as a DataFrame.
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix_train,
    num_boost_round=1000,
    early_stopping_rounds=10,
    nfold=3,
    metrics="rmse",
    as_pandas=True,
    seed=123,
)

In [None]:
# Peek at the first rows of the cross-validation history.
cv_results.head()

In [None]:
# Mean test RMSE in the last row of the cross-validation history.
print((cv_results["test-rmse-mean"]).tail(1))

In [None]:
# Learning curves: mean test and train RMSE from the cross-validation history,
# one point per row of cv_results (i.e. per boosting round). Both series share
# one axes; labels and a title make the figure self-explanatory, and plt.show()
# keeps the Axes repr out of the cell output.
ax = cv_results.plot(y=['test-rmse-mean', 'train-rmse-mean'])
ax.set(xlabel='Boosting round',
       ylabel='RMSE',
       title='XGBoost cross-validation learning curves')
plt.show()


In [None]:
# Manual grid search over learning rate and tree depth. Every combination is
# scored with the same early-stopped 3-fold CV used above, and the outcome is
# collected into one table instead of printing a raw one-row Series per
# combination.
search_records = []
for lr in [0.1, 0.2, 0.3]:
    for md in [5, 10]:

        params = {"objective":"reg:squarederror",
                  'colsample_bytree': 0.3,
                  'learning_rate': lr,
                  'max_depth': md,
                  'alpha': 10}

        cv_results = xgb.cv(dtrain=data_dmatrix_train,
                            params=params,
                            nfold=3,
                            num_boost_round=1000,
                            early_stopping_rounds=10,
                            metrics="rmse",
                            as_pandas=True,
                            seed=123)

        # Per the xgboost.cv docs, with early stopping the last entry of the
        # history is the best iteration, so the final row is this
        # combination's score and len(cv_results) its number of rounds.
        search_records.append({
            'learning_rate': lr,
            'max_depth': md,
            'boosting_rounds': len(cv_results),
            'test-rmse-mean': cv_results['test-rmse-mean'].iloc[-1],
        })

# One tidy summary, lowest cross-validated RMSE first.
search_summary = pd.DataFrame(search_records).sort_values('test-rmse-mean')
search_summary

In [None]:
# Rebuild the regressor with learning_rate=0.2 and max_depth=10, one of the
# combinations tried in the manual search above (presumably the best one —
# confirm against that cell's output).
# NOTE(review): n_estimators is not passed, so the library default is used
# rather than the round count found by early stopping — confirm this is intended.
xg_reg = xgb.XGBRegressor(objective ='reg:squarederror', 
                          colsample_bytree = 0.3, 
                          learning_rate = 0.2,
                          max_depth = 10,
                          alpha = 10)

In [None]:
# Fit the re-configured regressor on the training split.
xg_reg.fit(x_train,y_train)

In [None]:
# Predictions on the held-out test set; reused for the RMSE below.
preds = xg_reg.predict(x_test)

In [None]:
# R2 on the held-out test set for the re-configured model.
test_score = xg_reg.score(x_test, y_test)
print(f"R2 of XGBoost: {test_score:.2f}")

# RMSE computed from the `preds` produced in the previous cell.
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

In [None]:
# Bar chart of feature importances for the fitted model, using the default
# importance type.
xgb.plot_importance(xg_reg)
# Alternative importance types — uncomment one of these to compare rankings:
# xgb.plot_importance(xg_reg, importance_type='gain')
# xgb.plot_importance(xg_reg, importance_type='cover')
plt.show()

In [None]:
# Draw the first tree (index 0) of the fitted ensemble on a wide figure.
# `plt` is already imported in the notebook's import cell, so it is not
# re-imported here; the figure and axes are created together and the axes is
# passed explicitly rather than relying on pyplot's "current axes".
# NOTE(review): recent XGBoost releases appear to rename `num_trees` to
# `tree_idx` — confirm against the installed version.
fig, ax = plt.subplots(figsize=(12, 7))
xgb.plot_tree(xg_reg, num_trees=0, ax=ax)
plt.show()

In [None]:
# Text dump of the first tree (index 0) of the underlying booster.
print(xg_reg.get_booster().get_dump()[0])

Rather than doing a manual search across parameter space, we can also use the GridSearchCV we looked at last time.

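Also, we are going to max out the memory here soon. Let's restart the kernel and reset. (After restarting, re-run the cells that the rest of this section depends on: the imports, the data loading, the train/test split, the linear regression fit, and the DMatrix construction.)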

In [None]:
# Fresh regressor with no arguments; the grid search below supplies the
# hyperparameters.
xg_reg = xgb.XGBRegressor()

In [None]:
# Search space for GridSearchCV: every value is a list of candidates, giving
# 1 x 1 x 3 x 3 x 2 = 18 combinations.
params = {
    'objective': ['reg:squarederror'],
    'colsample_bytree': [0.3],
    'learning_rate': [0.1, 0.3, 0.5],
    'max_depth': [5, 10, 20],
    'alpha': [5, 10],
}

# n_jobs=-1 runs the search on all available cores.
xg_reg_best = GridSearchCV(estimator=xg_reg, param_grid=params, n_jobs=-1)

In [None]:
# Run the cross-validated grid search on the training split.
xg_reg_best.fit(x_train, y_train)

In [None]:
# Print the R2 score of the grid-searched XGBoost model on the test set.
# (The label previously said "Linear Regression", which mislabelled this
# number; it now matches the other XGBoost score cells.)

test_score = xg_reg_best.score(x_test, y_test)
print(f"R2 of XGBoost: {test_score:.2f}")

In [None]:
# Hyperparameter combination selected by the grid search.
xg_reg_best.best_params_

In [None]:
# Test-set predictions from the fitted grid search; used for the MSE below.
y_predict = xg_reg_best.predict(x_test)

In [None]:
# Mean squared error on the test set (not the root — compare with care
# against the RMSE values printed earlier).
print('MSE = %.2f' % mean_squared_error(y_test, y_predict))

In [None]:
# Recompute the test-set predictions and report R2 via r2_score.
y_predict = xg_reg_best.predict(x_test)
r2score = r2_score(y_test,y_predict)
print('R2 of the best regressor after CV is %.2f' % (r2score))

In [None]:
# Print the test-set score of the grid-searched XGBoost model, for
# side-by-side comparison with the linear regression score in the next cell.

test_score = xg_reg_best.score(x_test, y_test)
print(f"R2 of XGBoost: {test_score:.2f}")

In [None]:
# Linear regression baseline on the same test set, for comparison.
test_score = lin_reg.score(x_test, y_test)
print(f"R2 of Linear Regression: {test_score:.2f}")

In [None]:
# Text representation of the GridSearchCV wrapper.
print(xg_reg_best)

In [None]:
# Text representation of the XGBRegressor that was handed to GridSearchCV.
# NOTE(review): presumably still unfitted/default here, since scikit-learn
# searches work on clones of the estimator — confirm.
print(xg_reg)

In [None]:
# Retrain through the native XGBoost API on the training DMatrix, using the
# hyperparameters selected by the grid search.
# NOTE(review): this rebinds `xg_reg` from the scikit-learn wrapper to the
# object returned by xgb.train, which is why the next cell predicts on a
# DMatrix rather than on x_test directly.
# NOTE(review): 1000 boosting rounds with no early stopping differs from the
# grid search, which never set n_estimators — confirm this is intended.
xg_reg = xgb.train(params=xg_reg_best.best_params_, 
                   dtrain=data_dmatrix_train, 
                   num_boost_round=1000)

In [None]:
# Predict on the test DMatrix with the natively trained model and report R2.
y_predict = xg_reg.predict(data_dmatrix_test)
r2score = r2_score(y_test,y_predict)
print('R2 of XGBoost after CV is %.2f' % (r2score))

# Classification Example with Moons Dataset

In [None]:
import seaborn as sns
from sklearn.datasets import make_moons
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
%matplotlib inline

In [None]:
# Synthetic two-class "moons" data: 400 points, noise level 0.3, fixed seed.
x, y = make_moons(n_samples=400, noise=0.3, random_state=42)

In [None]:
# Scatter the two feature columns against each other, coloured by class label.
sns.scatterplot(x=x[:, 0], y=x[:, 1], hue=y)

In [None]:
# Split the moons data into training/test sets (default test fraction, fixed seed).
# NOTE: this rebinds x_train/x_test/y_train/y_test, replacing the California
# housing split used in the regression section above.

x_train, x_test, y_train, y_test = train_test_split(x, 
                                                    y, 
                                                    random_state=0)

In [None]:
# Three different kinds of base learner for the voting ensembles below.
knn_clf = KNeighborsClassifier()
logreg_clf = LogisticRegression()
# Seeded so the tree (and every accuracy that depends on it) is the same on
# each run; an unseeded decision tree can differ between runs because split
# candidates are considered in random order.
tree_clf = DecisionTreeClassifier(random_state=42)

In [None]:
# Hard-voting ensemble over the three base learners.
voting_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf)],
    voting='hard')

# The loop below fits every model, the ensemble included, so a separate
# voting_clf.fit(...) beforehand would only train it twice.
classifiers = [knn_clf, logreg_clf, tree_clf, voting_clf]
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [None]:
# Soft-voting ensemble over the same three base learners.
votingsoft_clf = VotingClassifier(
    estimators=[('lr', logreg_clf), ('knn', knn_clf), ('tr', tree_clf)],
    voting='soft')

# The loop below fits every model, so no separate votingsoft_clf.fit(...) is
# needed beforehand.
classifiers = [knn_clf, logreg_clf, tree_clf, voting_clf, votingsoft_clf]

for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Both ensembles share the class name VotingClassifier; append the voting
    # mode so the two printed lines can be told apart.
    label = clf.__class__.__name__
    if isinstance(clf, VotingClassifier):
        label += f" ({clf.voting})"
    print(label, accuracy_score(y_test, y_pred))

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Bagging: 500 decision trees, each trained on a bootstrap sample of 100
# training points, fitted in parallel on all cores. random_state makes the
# resampling (and therefore the reported accuracy) reproducible.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1,
    random_state=42)
bag_clf.fit(x_train, y_train)
y_pred = bag_clf.predict(x_test)
print(bag_clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest classifier: 500 trees, at most 16 leaves each, all cores, seeded.
rnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
).fit(x_train, y_train)

# Test-set accuracy, shown as the cell's output.
y_pred_rf = rnd_clf.predict(x_test)
accuracy_score(y_test, y_pred_rf)

In [None]:
# All models so far, now including the random forest, for one combined comparison.
classifiers = [knn_clf, logreg_clf, tree_clf, voting_clf, votingsoft_clf, rnd_clf]

In [None]:
# Refit each model on the training split and print its test-set accuracy.
for clf in classifiers:
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # The hard- and soft-voting ensembles share one class name; append the
    # voting mode so their two lines are distinguishable in the output.
    label = clf.__class__.__name__
    if isinstance(clf, VotingClassifier):
        label += f" ({clf.voting})"
    print(label, accuracy_score(y_test, y_pred))

In [None]:
# XGBoost classifier with no arguments; the grid search below supplies the
# hyperparameters.
xg_clf = xgb.XGBClassifier()

In [None]:
# Search space for the classifier: 1 x 3 x 3 x 4 x 3 = 108 combinations.
params = {
    'objective': ['binary:logistic'],
    'colsample_bytree': [0.3, 0.5, 1.0],
    'learning_rate': [0.1, 0.3, 0.5],
    'max_depth': [2, 3, 5, 10],
    'alpha': [3, 5, 10],
}

# Cross-validated search scored by accuracy, run on all cores. Note that `clf`
# is rebound here from the earlier loop variable to the search object.
clf = GridSearchCV(xg_clf, params, scoring='accuracy', n_jobs=-1)

In [None]:
# Run the cross-validated grid search on the moons training split.
clf.fit(x_train, y_train)

In [None]:
# Hyperparameter combination selected by the grid search.
clf.best_params_

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Classification report for the grid-searched model's test-set predictions.
print(classification_report(y_test, clf.predict(x_test)))

In [None]:
# Plot the decision boundary with the data: shade the plane by the
# grid-searched model's predictions, then overlay every moons point on the
# same axes.
boundary = DecisionBoundaryDisplay.from_estimator(
    clf,
    x,
    response_method="predict",
    cmap="RdBu",
    alpha=0.5,
)

sns.scatterplot(
    x=x[:, 0],
    y=x[:, 1],
    hue=y,
    palette=['red', 'green'],
    ax=boundary.ax_,
)

# Park the legend outside the plotting area so it does not cover any points.
boundary.ax_.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

In [None]:
import ipywidgets

In [None]:
# plot the decision boundary with the data

def modelbnd(model=clf):
    """Plot a fitted classifier's decision regions with the moons data on top.

    Parameters
    ----------
    model : fitted classifier, optional
        Estimator whose ``predict`` output colours the background. Defaults to
        whatever ``clf`` is bound to at the moment this cell is executed.

    Notes
    -----
    Reads the notebook-level ``x`` (feature array, first two columns plotted)
    and ``y`` (class labels used for the point colours).
    """
    boundary = DecisionBoundaryDisplay.from_estimator(model,
                                                      x,
                                                      response_method="predict",
                                                      cmap="RdBu",
                                                      alpha=0.5)
    ax = boundary.ax_

    # Draw on the boundary's own axes explicitly instead of relying on
    # pyplot's "current axes" state.
    sns.scatterplot(x=x[:,0],
                    y=x[:,1],
                    hue=y,
                    palette=['red','green'],
                    ax=ax)

    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Render immediately so each widget update shows one complete figure.
    plt.show()
    
# Dropdown for switching between the fitted models. Passing (label, estimator)
# pairs gives short, readable dropdown entries instead of each estimator's
# full repr string.
ipywidgets.interactive(modelbnd, model=[
    ('XGBoost (grid search)', clf),
    ('k-nearest neighbours', knn_clf),
    ('Logistic regression', logreg_clf),
    ('Decision tree', tree_clf),
    ('Voting (hard)', voting_clf),
    ('Voting (soft)', votingsoft_clf),
    ('Random forest', rnd_clf),
])