<a href="https://colab.research.google.com/github/aslan-wong/7days-intro-2-DS/blob/main/day6_Models_Optimization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CODING TASK #1: IMPORT DATASET AND LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read 'bike_sharing_daily.csv' (expected in the working directory) into a DataFrame
bike_df = pd.read_csv('bike_sharing_daily.csv')

In [None]:
# Display the dataframe
bike_df

In [None]:
# First 5 rows
bike_df.head(5)

In [None]:
# Last 10 rows
bike_df.tail(10)

In [None]:
# Print the dataframe summary produced by DataFrame.info()
bike_df.info()

**PRACTICE OPPORTUNITY #1 [OPTIONAL]:**
- **Compare the Average casual, registered and total bike sharing demand**
- **Does the average value of casual and registered sum up to the overall average total demand?**

# CODING TASK #2: PERFORM DATA CLEANING

In [None]:
# Heatmap of the boolean null-mask: cells where the mask is True are missing values
sns.heatmap(bike_df.isnull())

In [None]:
# Display the dataframe before any columns are dropped
bike_df

In [None]:
bike_df = bike_df.drop(labels = ['instant'], axis = 1)
bike_df

In [None]:
bike_df = bike_df.drop(labels = ['casual', 'registered'], axis = 1)
bike_df

In [None]:
bike_df['dteday'] = pd.to_datetime(bike_df['dteday'], format = '%m/%d/%Y')
bike_df

In [None]:
bike_df.index = pd.DatetimeIndex(bike_df['dteday'])
bike_df

In [None]:
bike_df = bike_df.drop(labels = ['dteday'], axis = 1)
bike_df

# CODING TASK #3: PERFORM DATA VISUALIZATION

In [None]:
# Weekly rental totals. resample('W').sum() aggregates every daily count that falls
# in each week, whereas asfreq('W') only keeps the single daily count recorded on
# each week's end date and therefore does not show usage *per week*.
plt.figure(figsize = (12, 7))
bike_df['cnt'].resample('W').sum().plot(linewidth = 5)
plt.title('Bike Rental Usage Per Week')
plt.xlabel('Week')
plt.ylabel('Bike Rental')
plt.grid()

In [None]:
# Monthly rental totals. resample('M').sum() aggregates every daily count in the
# month, whereas asfreq('M') only keeps the daily count recorded on each month-end date.
# NOTE(review): pandas >= 2.2 prefers the alias 'ME' for month-end; switch if the
# environment's pandas warns about or rejects 'M'.
plt.figure(figsize = (12, 7))
bike_df['cnt'].resample('M').sum().plot(linewidth = 5)
plt.title('Bike Rental Usage Per Month')
plt.xlabel('Month')
plt.ylabel('Bike Rental')
plt.grid()

In [None]:
X_numerical = bike_df[['temp', 'hum', 'windspeed', 'cnt']]
X_numerical

In [None]:
sns.pairplot(X_numerical);

In [None]:
plt.figure(figsize = (8, 6))
sns.heatmap(X_numerical.corr(), annot = True);

**PRACTICE OPPORTUNITY #2 [OPTIONAL]:**
- **Plot the rental usage per quarter**
- **Set the line width to 6 and enable the grid**

# CODING TASK #4: CREATE TRAINING AND TESTING DATASET

In [None]:
X_cat = bike_df[['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']]
X_cat

In [None]:
from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
X_cat = onehotencoder.fit_transform(X_cat).toarray()
X_cat

In [None]:
X_cat.shape

In [None]:
X_cat = pd.DataFrame(X_cat)

In [None]:
X_numerical

In [None]:
X_numerical = X_numerical.reset_index()
X_numerical

In [None]:
X_all = pd.concat([X_cat, X_numerical], axis = 1)
X_all

In [None]:
X_all = X_all.drop('dteday', axis = 1)
X_all

In [None]:
X = X_all.iloc[:, :-1].values
y = X_all.iloc[:, -1:].values

In [None]:
X.shape

In [None]:
y.shape

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [None]:
X_train.shape

In [None]:
X_test.shape

# CODING TASK #5: TRAIN AN XG-BOOST ALGORITHM (WITHOUT OPTIMIZATION)

In [None]:
!pip install xgboost

In [None]:
# Train an XGBoost regressor model

import xgboost as xgb
model = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 1, max_depth = 20, n_estimators = 500)
model.fit(X_train, y_train)

In [None]:
# Score the trained model on the testing dataset.
# For a regressor, score() returns the coefficient of determination (R2),
# not a classification accuracy, so the printed label says so.

result = model.score(X_test, y_test)
print("R2 score : {}".format(result))

In [None]:
# make predictions on the test data
y_predict = model.predict(X_test)


In [None]:
y_test

In [None]:
y_predict

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
MSE = mean_squared_error(y_test, y_predict)
MAE = mean_absolute_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)

**PRACTICE OPPORTUNITY #3 [OPTIONAL]:**
- **Retrain the model with less 'max_depth'**
- **Comment on the results**

# CODING TASK #6: MODEL OPTIMIZATION USING GRIDSEARCH

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Candidate values per hyperparameter for the exhaustive grid search
# (3 * 3 * 3 * 2 = 54 combinations)
parameters_grid = dict(
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 500, 1000],
    colsample_bytree=[0.3, 0.7],
)

In [None]:
import xgboost as xgb

In [None]:
model = xgb.XGBRegressor(objective ='reg:squarederror')

In [None]:
# Note that we used the "neg_mean_squared_error" since GridSearchCV() ranks all the algorithms (estimators)
# and specifies which one is the best. We are trying to minimize the error.
xgb_gridsearch = GridSearchCV(estimator = model,
                              param_grid = parameters_grid,
                              scoring = 'neg_mean_squared_error',
                              cv = 5,
                              verbose = 5)

In [None]:
xgb_gridsearch.fit(X_train, y_train)

In [None]:
xgb_gridsearch.best_params_

In [None]:
xgb_gridsearch.best_estimator_

In [None]:
y_predict = xgb_gridsearch.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
MSE = mean_squared_error(y_test, y_predict)
MAE = mean_absolute_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)

**PRACTICE OPPORTUNITY #4 [OPTIONAL]:**
- **Expand on the parameter grid to include an additional hyperparameter "gamma"**
- **Try any three reasonable values for gamma. How many fits are run this time? Comment on the results**


# CODING TASK #7: MODEL OPTIMIZATION USING RANDOM SEARCH

In [None]:
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Hyperparameter values the random search samples from.
#
# A 'booster' entry can also be searched over. Two options are available:
#   - gbtree   uses tree based models
#   - gblinear uses linear functions

grid = dict(
    n_estimators=[100, 500, 700],
    max_depth=[2, 3, 5],
    learning_rate=[0.1, 0.5, 1],
    min_child_weight=[1, 2, 3],
)

# grid = {
#    'n_estimators': [100, 500, 700],
#     'max_depth': [2, 3, 5],
#     'learning_rate': [0.1, 0.5, 1],
#      'min_child_weight': [1, 2, 3],
#     'booster': ['gbtree','gblinear']}

import xgboost as xgb

# Base regressor; the hyperparameters listed in `grid` are set by the search
model = xgb.XGBRegressor(objective='reg:squarederror')

# Random search: evaluate 50 sampled combinations with 5-fold cross-validation,
# ranked by negated mean absolute error
random_cv = RandomizedSearchCV(
    model,
    grid,
    n_iter=50,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=5,
    return_train_score=True,
)
random_cv.fit(X_train, y_train)

random_cv.best_estimator_


In [None]:
y_predict = random_cv.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
MSE = mean_squared_error(y_test, y_predict)
MAE = mean_absolute_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)

# CODING TASK #8: MODEL OPTIMIZATION USING BAYESIAN OPTIMIZATION

In [None]:
# Let's install a library called Scikit-Optimize (Skopt) which is used to perform bayesian optimization
# BayesSearchCV class is used in a similar fashion to GridSearchCV
# You secify the search space as a distribution instead of discrete values

! pip install scikit-optimize
from skopt import BayesSearchCV
import xgboost as xgb


In [None]:
# Base regressor that is handed to the Bayesian search in the cells below
model = xgb.XGBRegressor(objective ='reg:squarederror')

In [None]:
# Ranges explored by the Bayesian search. Each entry is a (low, high) tuple,
# optionally followed by the name of the prior to sample with.
search_space = dict(
    learning_rate=(0.01, 1.0, "log-uniform"),
    max_depth=(1, 50),
    n_estimators=(5, 500),
)

In [None]:
xgb_bayes_search = BayesSearchCV(model,
                               search_space,
                               n_iter = 50,
                               scoring = 'neg_mean_absolute_error',
                               cv = 5)

In [None]:
result = xgb_bayes_search.fit(X_train, y_train)

In [None]:
y_predict = xgb_bayes_search.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
MSE = mean_squared_error(y_test, y_predict)
MAE = mean_absolute_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)

# FINAL CAPSTONE PROJECT

Using the used car prices dataset included in the course package, perform the following:
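1. Load the “used_car_price.csv” dataset
2. Perform one-hot encoding on the categorical columns ("Make", "Model", "Type", "Origin", and "DriveTrain")
3. Split the data into 75% for training and 25% for testing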
4. Train an XG-Boost model in Scikit-Learn
5. Assess trained XG-Boost model performance using RMSE and R2
6. Perform hyperparameters optimization using GridSearch, choose any reasonable values for max_depth, learning_rate, n_estimators, and colsample_bytree. Use 5 cross validation folds.  
7. Perform hyperparameters optimization using RandomSearch, choose any reasonable values for max_depth, learning_rate, n_estimators, and colsample_bytree. Use 5 cross validation folds and 100 iterations.  
8. Perform hyperparameters optimization using Bayesian optimization, choose any reasonable values for max_depth, learning_rate, n_estimators. Use 5 cross validation folds and 100 iterations.  
9. Compare the 3 optimization strategies using RMSE and R2.

# FINAL CAPSTONE PROJECT SOLUTION

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read "used_car_price.csv" (expected in the working directory) into a DataFrame
car_df = pd.read_csv("used_car_price.csv")

In [None]:
# Display the first 6 rows
car_df.head(6)

In [None]:
# Perform One-Hot Encoding for "Make", "Model", "Type", "Origin", and "DriveTrain".
# The dtype is pinned explicitly: since pandas 2.0 get_dummies emits bool columns by
# default, and a frame that mixes bool with numeric columns turns into an object-dtype
# array when it is later converted with np.array(X). uint8 matches the pre-2.0 default,
# so the resulting feature matrix stays numeric on every pandas version.
car_df = pd.get_dummies(car_df, columns=["Make", "Model", "Type", "Origin", "DriveTrain"], dtype=np.uint8)

In [None]:
# Feeding input features to X and output (MSRP) to y
X = car_df.drop("MSRP", axis = 1)
y = car_df["MSRP"]

In [None]:
X = np.array(X)

In [None]:
y = np.array(y)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size = 0.25)

In [None]:
X_train.shape

In [None]:
X_test.shape

# 1. XG-BOOST WITHOUT OPTIMIZATION

In [None]:
!pip install xgboost
import xgboost as xgb
model = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 1, max_depth = 3, n_estimators = 500)
model.fit(X_train, y_train)

# predict the score of the trained model using the testing dataset

result = model.score(X_test, y_test)
# make predictions on the test data
y_predict = model.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt

RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
r2 = r2_score(y_test, y_predict)

print('RMSE =',RMSE,'\nR2 =', r2)



# 2. XG-BOOST WITH GRIDSEARCH

In [None]:
from sklearn.model_selection import GridSearchCV
# Candidate values per hyperparameter for the exhaustive grid search
# (3 * 2 * 2 * 2 = 24 combinations)
parameters_grid = dict(
    max_depth=[3, 10, 20],
    learning_rate=[0.1, 0.5],
    n_estimators=[100, 500],
    colsample_bytree=[0.3, 0.7],
)

# Base regressor; the hyperparameters listed in parameters_grid are set by the search
model = xgb.XGBRegressor(objective='reg:squarederror')

# GridSearchCV ranks the candidates and keeps the one with the highest score.
# We want the smallest error, so the scorer is the *negated* mean squared error.
xgb_gridsearch = GridSearchCV(
    model,
    parameters_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=5,
)

xgb_gridsearch.fit(X_train, y_train)
y_predict = xgb_gridsearch.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

# RMSE rounded to 3 decimals, and R2, both on the test set
RMSE = float(f"{sqrt(mean_squared_error(y_test, y_predict)):.3f}")
r2 = r2_score(y_test, y_predict)

print('RMSE =', RMSE, '\nR2 =', r2)


# 3. XG-BOOST WITH RANDOMSEARCH

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter values the random search samples from.
#
# A 'booster' entry can also be searched over. Two options are available:
#   - gbtree   uses tree based models
#   - gblinear uses linear functions

grid = dict(
    n_estimators=[100, 500],
    max_depth=[3, 10, 20],
    learning_rate=[0.1, 0.5],
    colsample_bytree=[0.3, 0.7],
)


import xgboost as xgb

# Base regressor; the hyperparameters listed in `grid` are set by the search
model = xgb.XGBRegressor(objective='reg:squarederror')

# Random search with 5-fold cross-validation, ranked by negated mean absolute error.
# NOTE(review): `grid` only holds 2 * 3 * 2 * 2 = 24 combinations, fewer than
# n_iter = 100. With list-only grids scikit-learn presumably samples without
# replacement and stops at 24 candidates - confirm, or widen the grid.
random_cv = RandomizedSearchCV(
    model,
    grid,
    n_iter=100,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=5,
    return_train_score=True,
)
random_cv.fit(X_train, y_train)

# Not echoed by the notebook: only the last expression of a cell is displayed
random_cv.best_estimator_
y_predict = random_cv.predict(X_test)


from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

# RMSE rounded to 3 decimals, and R2, both on the test set
RMSE = float(f"{sqrt(mean_squared_error(y_test, y_predict)):.3f}")
r2 = r2_score(y_test, y_predict)

print('RMSE =', RMSE, '\nR2 =', r2)


# 4. XG-BOOST WITH BAYESIAN OPTIMIZATION

In [None]:
! pip install scikit-optimize
from skopt import BayesSearchCV
# from skopt.space import Real, Categorical, Integer

# Ranges explored by the Bayesian search. Each entry is a (low, high, prior) tuple.
search_space = dict(
    max_depth=(4, 20, 'log-uniform'),
    n_estimators=(2, 100, 'log-uniform'),
    learning_rate=(0.01, 1.0, 'log-uniform'),
)

import xgboost as xgb

# Base regressor; the hyperparameters in search_space are set by the search
model = xgb.XGBRegressor(objective='reg:squarederror')

# Bayesian search: 100 iterations, 5-fold cross-validation,
# ranked by negated mean absolute error
xgb_bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces=search_space,
    n_iter=100,
    scoring='neg_mean_absolute_error',
    cv=5,
)

xgb_bayes_search.fit(X_train, y_train)

y_predict = xgb_bayes_search.predict(X_test)

from sklearn.metrics import r2_score, mean_squared_error
from math import sqrt

# RMSE rounded to 3 decimals, and R2, both on the test set
RMSE = float(f"{sqrt(mean_squared_error(y_test, y_predict)):.3f}")
r2 = r2_score(y_test, y_predict)

print('RMSE =', RMSE, '\nR2 =', r2)


# PRACTICE OPPORTUNITIES SOLUTION

**PRACTICE OPPORTUNITY #1 SOLUTION:**
- **What is the average, minimum and maximum registered bike rental usage?**

In [None]:
# 'casual' and 'registered' were dropped from bike_df during data cleaning, so
# bike_df.describe() can no longer report them. Summarise the three demand
# columns (mean, min, max, ...) from a fresh read of the raw file instead.
pd.read_csv('bike_sharing_daily.csv')[['casual', 'registered', 'cnt']].describe()

**PRACTICE OPPORTUNITY #2 SOLUTION:**
- **Plot the rental usage per quarter**
- **Set the line width to 6 and enable the grid**

In [None]:
# Quarterly rental totals. resample('Q').sum() aggregates every daily count in the
# quarter, whereas asfreq('Q') only keeps the daily count recorded on each quarter-end date.
# NOTE(review): pandas >= 2.2 prefers the alias 'QE' for quarter-end; switch if the
# environment's pandas warns about or rejects 'Q'.
bike_df['cnt'].resample('Q').sum().plot(linewidth = 6)
plt.title('Bike Usage Per Quarter')
plt.xlabel('Quarter')
plt.ylabel('Bike Rental')
plt.grid()

**PRACTICE OPPORTUNITY #3 SOLUTION:**
- **Retrain the model with less 'max_depth'**
- **Comment on the results**

In [None]:
# Train an XGBoost regressor model with a smaller max_depth

import xgboost as xgb
from sklearn.model_selection import train_test_split

# In a top-to-bottom run, X_train / X_test / y_train / y_test were last assigned by
# the capstone (used-car) split. Rebuild the bike-sharing split from X_all, whose
# last column is the target 'cnt', so this model is trained on the intended data.
X_train, X_test, y_train, y_test = train_test_split(X_all.iloc[:, :-1].values,
                                                    X_all.iloc[:, -1:].values,
                                                    test_size = 0.2,
                                                    random_state = 42)

model = xgb.XGBRegressor(objective ='reg:squarederror', learning_rate = 1, max_depth = 3, n_estimators = 500)

model.fit(X_train, y_train)

**PRACTICE OPPORTUNITY #4 SOLUTION:**
- **Expand on the parameter grid to include an additional hyperparameter "gamma"**
- **Try any three reasonable values for gamma. How many fits are run this time? Comment on the results**


In [None]:
from sklearn.model_selection import GridSearchCV
# Same grid as in coding task #6, extended with three values for 'gamma'
# (3 * 3 * 3 * 2 * 3 = 162 combinations)
parameters_grid = dict(
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.05, 0.1],
    n_estimators=[100, 500, 1000],
    colsample_bytree=[0.3, 0.7],
    gamma=[1, 0.1, 0.01],
)
import xgboost as xgb

# Base regressor; the hyperparameters listed in parameters_grid are set by the search
model = xgb.XGBRegressor(objective='reg:squarederror')

# GridSearchCV ranks the candidates and keeps the one with the highest score.
# We want the smallest error, so the scorer is the *negated* mean squared error.
xgb_gridsearch = GridSearchCV(
    model,
    parameters_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=5,
)
# Not echoed by the notebook: only the last expression of a cell is displayed
xgb_gridsearch
# NOTE(review): in a top-to-bottom run X_train / y_train were last assigned by the
# capstone split - confirm this cell is fitting the intended (bike-sharing) data.
xgb_gridsearch.fit(X_train, y_train)

In [None]:
xgb_gridsearch.best_params_

In [None]:
xgb_gridsearch.best_estimator_

In [None]:
y_predict = xgb_gridsearch.predict(X_test)

In [None]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from math import sqrt
k = X_test.shape[1]
n = len(X_test)
RMSE = float(format(np.sqrt(mean_squared_error(y_test, y_predict)),'.3f'))
MSE = mean_squared_error(y_test, y_predict)
MAE = mean_absolute_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)
adj_r2 = 1-(1-r2)*(n-1)/(n-k-1)

print('RMSE =',RMSE, '\nMSE =',MSE, '\nMAE =',MAE, '\nR2 =', r2, '\nAdjusted R2 =', adj_r2)