# Black Friday Dataset Model Building

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score,KFold
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error
import xgboost as xgb

import joblib

## Loading Pre-processed Dataset from BigQuery 

In [2]:
%load_ext google.cloud.bigquery

The google.cloud.bigquery extension is already loaded. To reload it, use:
  %reload_ext google.cloud.bigquery


In [3]:
%%bigquery train_df
SELECT * FROM `aa-ai-specialisation.black_friday.preprocessed_train_df`

Query is running:   0%|          |

Downloading:   0%|          |

In [4]:
%%bigquery test_df
SELECT * FROM `aa-ai-specialisation.black_friday.preprocessed_test_df`

Query is running:   0%|          |

Downloading:   0%|          |

In [5]:
train_df_copy = train_df.copy()
test_df_copy = test_df.copy()

X= train_df_copy.drop(columns=['User_ID', 'Product_ID','Purchase'])
y = train_df_copy['Purchase'].astype(int)

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

## Linear Regression Model

In [7]:

# Initialize the model
model = LinearRegression()

# Fit the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Calculate RMSE and append to list
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 3511.1271018255425
Root Mean Squared Error: 4616.707666158212


### Save the Model Checkpoint to Loacal Folder

In [8]:
# Save the model locally
joblib.dump(model, './model_checkpoints/linear_regresssion_model.joblib')

['./model_checkpoints/linear_regresssion_model.joblib']

## Random Forest with Cross-validation
- Cross

In [12]:
# Define the parameter distribution
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt']
}

# Initialize the model
rf_model = RandomForestRegressor(n_jobs=-1, random_state=20)

# Set up the randomized search with 5-fold cross-validation
random_search = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, n_iter=10, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, random_state=20)

# Fit the random search to the data
random_search.fit(X, y)

# Get the best parameters
best_params = random_search.best_params_
print(f'Best parameters found: {best_params}')


Best parameters found: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 20}


In [16]:
best_params = {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 'auto', 'max_depth': 20}

In [17]:
# Initialize the model with the best parameters
best_rf_model = RandomForestRegressor(
    n_estimators=best_params['n_estimators'],
    min_samples_split=best_params['min_samples_split'],
    min_samples_leaf=best_params['min_samples_leaf'],
    max_features=best_params['max_features'],
    max_depth=best_params['max_depth'],
    n_jobs=-1,
    random_state=20
)
best_rf_model.fit(X_train, y_train)
y_pred = best_rf_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print(f'Root Mean Squared Error: {rmse}')

Mean Absolute Error: 2159.807366154734
Root Mean Squared Error: 2920.5985431351137


### Save the Model Checkpoint to Loacal Folder

In [18]:
# Save the model locally
joblib.dump(best_rf_model, './model_checkpoints/random_forest_model.joblib')

['./model_checkpoints/random_forest_model.joblib']

## XGBOOST

In [None]:
# Initialize the XGBRegressor
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, enable_categorical=True, tree_method='hist')

# Define the range of parameters to search
parameters = {
    'max_depth': range(3, 7, 2),  # Searching in steps of 2 from 3 to 7
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': range(100, 301, 50),  
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1]
}

# Grid search for the optimal parameters
grid_search = GridSearchCV(model, parameters, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

# Retrieve the best parameters
best_params = grid_search.best_params_
print(f'Best parameters found: {best_params}')


In [8]:
best_params = {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1}

In [9]:
best_model = xgb.XGBRegressor(**best_params, objective='reg:squarederror', random_state=42)
best_model.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.9,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=7, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=200, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=42,
             reg_alpha=0, reg_lambda=1, ...)

In [10]:
y_pred_train = best_model.predict(X_train)
rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))


print('Mean Absolute Error:', mean_absolute_error(y_train, y_pred_train))
print(f'Root Mean Squared Error: {rmse}')

Mean Absolute Error: 2113.8786834331513
Root Mean Squared Error: 2825.5656856973783


In [31]:
# # Validation set evaluation
# y_pred_val = best_model.predict(X_val)
# rmse_val = np.sqrt(mean_squared_error(y_val, y_pred_val))

# Test set evaluation
y_pred_test = best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))


print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_test))
print(f'Root Mean Squared Error: {rmse}')

Mean Absolute Error: 2142.1320054467706
Root Mean Squared Error: 2870.9729278111936


In [32]:
best_model.score(X_test,y_test)*100

67.28519140186616

In [33]:
dtrain = xgb.DMatrix(X_train, label=y_train)
cv_results = xgb.cv(
    best_params,
    dtrain,
    num_boost_round=100,
    nfold=5,
    metrics={'rmse'},
    early_stopping_rounds=10,
    seed=42
)

Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


Parameters: { "n_estimators" } might not be

In [34]:
cv_results

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,9572.896075,5.356484,9572.982124,22.364517
1,8732.464775,39.911316,8732.552925,38.620628
2,7986.487008,76.953759,7986.532867,72.242463
3,7304.064595,67.903587,7304.344902,63.510226
4,6713.481973,58.736967,6713.913744,58.687923
...,...,...,...,...
95,2863.246351,2.810668,2887.877247,10.428463
96,2862.965179,2.806766,2887.757955,10.365021
97,2862.628603,2.846193,2887.627973,10.327412
98,2862.199993,2.665235,2887.367921,10.427918


In [13]:
# Save the model locally
joblib.dump(best_model, './model_checkpoints/xgboost_model.joblib')

['./model_checkpoints/xgboost_model.joblib']

In [23]:
# Save the model in xgboost format
best_model.save_model('./model_checkpoints/model.bst')