<a href="https://colab.research.google.com/github/belalabouzaid/siads699_team13_collab/blob/main/Notebooks/5a-XGB_Model_Tuning_DASK_with_Kfold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Notebook Objectives
The aim of this notebook is to tune the XGBoost (XGB) regression model by optimizing its hyperparameters. The tuning is done with K-fold grid-search cross-validation (GridSearchCV). As this is a computationally expensive step, we use Dask to access additional computational resources.

In [None]:
# Import libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Import libraries
import os
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# To work with numpy arrays
import numpy as np
# ML model building, training and testing
import sklearn
# Kfold split
from sklearn.model_selection import KFold
# Build ML pipeline
from sklearn.pipeline import Pipeline
# Linear regression model building
from sklearn.linear_model import LinearRegression
# Polynomial regression
from sklearn.preprocessing import PolynomialFeatures
# sklearn included transformer
from sklearn.preprocessing import QuantileTransformer
# to split data into test and train
from sklearn.model_selection import train_test_split
# Scaling transform for PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
# To run PCA feature reduction
from sklearn.decomposition import PCA
# XGBoost model
from xgboost import XGBRegressor,XGBClassifier
# Random forest regressor
from sklearn.ensemble import RandomForestRegressor
# for regression error calculation
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
# for regression r2 score calculation
from sklearn.metrics import r2_score
# for regression mse score calculation
from sklearn.metrics import mean_squared_error
# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# To transform the target variable into a normal distribution
from sklearn.compose import TransformedTargetRegressor
# visualisation
from matplotlib import pyplot

# Gridsearch (XGB hyperparameter tuning) using DASK

In [None]:
pip install "dask[complete]"

Collecting lz4>=4.3.2 (from dask[complete])
  Downloading lz4-4.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: lz4
Successfully installed lz4-4.3.2


In [None]:
pip install dask-ml

Collecting dask-ml
  Downloading dask_ml-2023.3.24-py3-none-any.whl (148 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m148.7/148.7 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting dask-glm>=0.2.0 (from dask-ml)
  Downloading dask_glm-0.3.2-py2.py3-none-any.whl (13 kB)
Collecting sparse>=0.7.0 (from dask-glm>=0.2.0->dask-ml)
  Downloading sparse-0.14.0-py2.py3-none-any.whl (80 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sparse, dask-glm, dask-ml
Successfully installed dask-glm-0.3.2 dask-ml-2023.3.24 sparse-0.14.0


In [None]:
import joblib
from dask.distributed import Client

In [None]:
# Instantiate pipeline
pipe = Pipeline([
               # ('standard_scaler', StandardScaler())
              #, ('PCA', PCA(n_components= 6)) ,
              ('xgb', XGBRegressor(random_state= 42))
              ])

In [None]:
# Hyperparameter search space for the 'xgb' pipeline step.
# Keys follow the '<step>__<parameter>' convention: 4 x 3 x 3 x 1 = 36 candidates.
# (A 'PCA__n_components' entry is not searched because the PCA step is disabled.)
param_grid = dict(
    xgb__max_depth=[5, 10, 15, 20],
    xgb__colsample_bylevel=[0.5, 0.7, 0.8],
    xgb__learning_rate=[0.001, 0.01, 0.1],
    xgb__n_estimators=[100],
)

In [None]:
import dask_ml.model_selection as dcv

In [None]:
# Start a Dask distributed client. Called with no arguments, it launches a local
# scheduler and worker processes on this machine (see the log output below).
client = Client()

INFO:distributed.http.proxy:To route to workers diagnostics web server please install jupyter-server-proxy: python -m pip install jupyter-server-proxy
INFO:distributed.scheduler:State start
INFO:distributed.scheduler:  Scheduler at:     tcp://127.0.0.1:39185
INFO:distributed.scheduler:  dashboard at:  http://127.0.0.1:8787/status
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:42333'
INFO:distributed.nanny:        Start Nanny at: 'tcp://127.0.0.1:44731'
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:40265', name: 0, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:40265
INFO:distributed.core:Starting established connection to tcp://127.0.0.1:42626
INFO:distributed.scheduler:Register worker <WorkerState 'tcp://127.0.0.1:35531', name: 1, status: init, memory: 0, processing: 0>
INFO:distributed.scheduler:Starting worker compute stream, tcp://127.0.0.1:35531
INFO:distributed.core:Sta

## Model Tuning using Three Rivers 22 data

In [None]:
# Load the Three Rivers 2022 dataset directly from the project's GitHub repository
threerivers_22 = pd.read_csv(
    'https://github.com/belalabouzaid/siads699_team13_collab/raw/main/Data/threerivers_2022.csv'
)

In [None]:
# 'smi' is the regression target; every remaining column is a candidate feature
y = threerivers_22['smi']
X = threerivers_22.drop(columns='smi')

In [None]:
# Keep only the 13 predictor columns used for model tuning
X = X[[
    'aspect', 'slope', 'NDMI', 'SR_B5', 'MSI', 'SAVI', 'EVI',
    'NDVI', 'ST_B10', 'MNDWI', 'SR_B7', 'SR_B6', 'MSAVI',
]]

In [None]:
#Splitting the data into test (25%) and train (75%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
%%time
# Quick test K = 2
with joblib.parallel_backend('dask'):
  grid = dcv.GridSearchCV(pipe, param_grid, cv = 2, n_jobs = -1, scoring = 'r2')
  grid.fit(X_train, y_train)

CPU times: user 30.4 s, sys: 3.91 s, total: 34.3 s
Wall time: 7min 33s


In [None]:
# Hyperparameter combination selected by the most recently fitted grid search
grid.best_params_

{'xgb__colsample_bylevel': 0.8,
 'xgb__learning_rate': 0.1,
 'xgb__max_depth': 10,
 'xgb__n_estimators': 100}

In [None]:
# Cross-validated R^2 (scoring='r2') achieved by the selected combination
grid.best_score_

0.9815606027732697

In [None]:
%%time
# K = 10
with joblib.parallel_backend('dask'):
  grid = dcv.GridSearchCV(pipe, param_grid, cv = 10, n_jobs = -1, scoring = 'r2')
  grid.fit(X_train, y_train)

CPU times: user 4min 21s, sys: 29 s, total: 4min 50s
Wall time: 56min 34s


In [None]:
# Hyperparameter combination selected by the most recently fitted grid search
grid.best_params_

{'xgb__colsample_bylevel': 0.7,
 'xgb__learning_rate': 0.1,
 'xgb__max_depth': 10,
 'xgb__n_estimators': 100}

In [None]:
# Cross-validated R^2 (scoring='r2') achieved by the selected combination
grid.best_score_

0.9828189286623382

## Model Tuning using Mariposa 21 data

In [None]:
# Load the Mariposa 2021 dataset directly from the project's GitHub repository
mariposa_21 = pd.read_csv(
    'https://github.com/belalabouzaid/siads699_team13_collab/raw/main/Data/mariposa_2021.csv'
)

In [None]:
# Separate the regression target ('smi') from the candidate feature columns.
# `columns=` replaces the positional axis argument (`drop('smi', 1)`), which pandas
# deprecated (it emits a FutureWarning) and no longer accepts from 2.0 onwards.
X = mariposa_21.drop(columns = 'smi')
y = mariposa_21['smi']

  X = mariposa_21.drop('smi',1)


In [None]:
# Restrict the Mariposa features to the same 13 predictor columns used for Three Rivers
X = X[[
    'aspect', 'slope', 'NDMI', 'SR_B5', 'MSI', 'SAVI', 'EVI',
    'NDVI', 'ST_B10', 'MNDWI', 'SR_B7', 'SR_B6', 'MSAVI',
]]

In [None]:
#Splitting the data into test (25%) and train (75%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
%%time
# Quick test K = 2
with joblib.parallel_backend('dask'):
  grid = dcv.GridSearchCV(pipe, param_grid, cv = 2, n_jobs = -1, scoring = 'r2')
  grid.fit(X_train, y_train)

CPU times: user 1min 7s, sys: 9.02 s, total: 1min 16s
Wall time: 17min 20s


In [None]:
# Hyperparameter combination selected by the most recently fitted grid search
grid.best_params_

{'xgb__colsample_bylevel': 0.8,
 'xgb__learning_rate': 0.1,
 'xgb__max_depth': 10,
 'xgb__n_estimators': 100}

In [None]:
# Cross-validated R^2 (scoring='r2') achieved by the selected combination
grid.best_score_

0.8245801889809902

In [None]:
%%time
# K = 10
with joblib.parallel_backend('dask'):
  grid = dcv.GridSearchCV(pipe, param_grid, cv = 10, n_jobs = -1, scoring = 'r2')
  grid.fit(X_train, y_train)

CPU times: user 9min 1s, sys: 1min 2s, total: 10min 3s
Wall time: 2h 2min 9s


In [None]:
# Hyperparameter combination selected by the most recently fitted grid search
grid.best_params_

{'xgb__colsample_bylevel': 0.7,
 'xgb__learning_rate': 0.1,
 'xgb__max_depth': 10,
 'xgb__n_estimators': 100}

In [None]:
# Cross-validated R^2 (scoring='r2') achieved by the selected combination
grid.best_score_

0.8343250412675868