In [8]:
# import libraries
import os
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns 

# Seed NumPy's global random number generator so that code drawing from it
# produces the same values on every run.
np.random.seed(42)

# Display settings: let pandas show up to 999 columns before truncating, and
# render matplotlib figures inline in the notebook output.
pd.options.display.max_columns = 999
%matplotlib inline

# project paths
# project_root_dir = os.path.normpath(os.getcwd() + os.sep + os.pardir)

# data_path = os.path.join(project_root_dir, "data")
# os.makedirs(data_path, exist_ok=True)

# Directory that holds the input CSV files and receives the saved outputs.
# NOTE(review): absolute path that looks like a Colab Google Drive mount —
# adjust when running elsewhere.
data_path = "/content/drive/MyDrive/workspace/walmart/data"


# function for loading data
def read_data(filename, date_col=None, data_path=data_path):
    """Read a CSV file from the data directory into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, relative to ``data_path``.
    date_col : list of str, optional
        Forwarded to ``pandas.read_csv`` as ``parse_dates``; ``None`` leaves
        every column unparsed.
    data_path : str, optional
        Directory containing the file. Defaults to the module-level
        ``data_path``.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    csv_path = os.path.join(data_path, filename)
    return pd.read_csv(csv_path, parse_dates=date_col)


# function for saving data as csv file
def save_dataframe(df, filename, file_path=data_path):
    """Write ``df`` to ``file_path/filename`` as CSV without the index.

    The target directory is created first when it does not exist, so saving
    into a fresh output folder no longer fails.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to persist.
    filename : str
        Name of the CSV file to create (an existing file is overwritten).
    file_path : str, optional
        Destination directory. Defaults to the module-level ``data_path``.
    """
    # An empty file_path means "current directory"; makedirs("") would raise.
    if file_path:
        os.makedirs(file_path, exist_ok=True)
    path = os.path.join(file_path, filename)
    df.to_csv(path, index=False)


In [9]:
# Tables with a "Date" column are loaded with that column parsed as datetime
train, test, features = (
    read_data(csv_name, date_col=["Date"])
    for csv_name in ("train.csv", "test.csv", "features.csv")
)

# Tables loaded without any date parsing
stores, sample_submission = (
    read_data(csv_name) for csv_name in ("stores.csv", "sampleSubmission.csv")
)

In [10]:
from pandas.api.types import CategoricalDtype

# ordered the categorical store type col (C < B < A)
cat_type = CategoricalDtype(categories=["C", "B", "A"], ordered=True)

# columns converted to plain (unordered) categorical dtype
_CATEGORY_COLS = ["Store", "Dept", "Year", "Month", "DayOfWeek", "Weekend"]


def _prepare_frame(df, stores, features):
    """Merge store/feature tables into ``df`` and derive the modelling columns.

    The exact same steps are applied to the train and the test frame, so both
    end up with the same engineered columns and dtypes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Store``, ``Dept``, ``Date`` (datetime) and a boolean
        ``IsHoliday`` column.
    stores : pandas.DataFrame
        Store-level table, joined on ``Store``; supplies the ``Type`` column.
    features : pandas.DataFrame
        Table joined on ``Store`` and ``Date``; its own ``IsHoliday`` column
        is discarded after the merge.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    merged = (
        df.merge(stores, how="left", on="Store")
        .merge(features, how="left", on=["Store", "Date"])
        # both sides carry IsHoliday: keep the left-hand copy under its original name
        .drop(columns=["IsHoliday_y"])
        .rename(columns={"IsHoliday_x": "IsHoliday"})
    )

    ## Datetime features
    dates = merged["Date"].dt
    prepared = merged.assign(
        Year=dates.year,
        Month=dates.month,
        Day=dates.day,
        # Series.dt.weekofyear is deprecated (removed in pandas 2.0).
        # isocalendar().week yields the same ISO week number but as nullable
        # UInt32, so cast back to a plain integer column.
        # NOTE(review): the cast assumes "Date" has no missing values.
        WeekOfYear=dates.isocalendar().week.astype(int),
        DayOfWeek=dates.dayofweek,
        Weekend=(dates.weekday >= 5).astype(int),
    )

    prepared = prepared.assign(
        # convert boolean column to categorical column
        IsHoliday=prepared["IsHoliday"].map({True: "Yes", False: "No"}).astype("category"),
        Type=prepared["Type"].astype(cat_type),
    )

    # convert to categorical columns
    return prepared.astype({col: "category" for col in _CATEGORY_COLS})


# Merge the stores and features data with train and test, then engineer columns
train = _prepare_frame(train, stores, features)
test = _prepare_frame(test, stores, features)


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.


Series.dt.weekofyear and Series.dt.week have been deprecated.  Please use Series.dt.isocalendar().week instead.



In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder

# features and labels of train and test set
# labels of test are not provided as we need to predict them

X_train = train.drop(["Weekly_Sales"], axis=1).copy()
y_train = train["Weekly_Sales"].copy()

X_test = test.copy()

# drop and save the date column in a variable
train_date = X_train.pop("Date")
test_date = X_test.pop("Date")


#### Data preparation pipeline

# select numerical and categorical columns
num_cols = X_train.select_dtypes(exclude=["object", "category"]).columns.tolist()
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

# numerical data preprocessing pipeline: median imputation, then standardisation
num_pipe = make_pipeline(SimpleImputer(strategy="median"), StandardScaler())

# Dense one-hot encoder. scikit-learn 1.2 renamed ``sparse`` to
# ``sparse_output`` and 1.4 removed the old name; try the new keyword first and
# fall back so the notebook runs on both old and new releases.
try:
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    one_hot = OneHotEncoder(handle_unknown="ignore", sparse=False)

# categorical data preprocessing pipeline: fill missing values with "NA", then one-hot encode
cat_pipe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="NA"),
    one_hot,
)

# full pipeline: route each column group to its own preprocessing pipeline
full_pipe = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols)]
)

full_pipe

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('num',
                                 Pipeline(memory=None,
                                          steps=[('simpleimputer',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0)),
                                                 ('standardscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
             

In [12]:
from sklearn.model_selection import train_test_split

# Keep copies of the complete training features/labels before subsampling
X_train_full = X_train.copy()
y_train_full = y_train.copy()

# Draw a random 30% of the rows, keeping features and label together
train = pd.concat([X_train, y_train], axis=1).sample(frac=0.3, random_state=42)

X_train = train.drop(columns=["Weekly_Sales"]).copy()
y_train = train["Weekly_Sales"].copy()

# Split the subsample 80/20 into training and validation parts
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

## XGBoost 

In [13]:
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Preprocessing followed by an XGBoost regressor
xgb_reg = make_pipeline(
    full_pipe,
    XGBRegressor(
        objective="reg:squarederror",
        random_state=42,
        tree_method="gpu_hist",
        n_jobs=-1,
    ),
)

# Grid: learning rate 0.1 .. 0.9 in steps of 0.1, tree depth 6 .. 24 in steps of 3
params = {
    "xgbregressor__learning_rate": [tenths / 10 for tenths in range(1, 10)],
    "xgbregressor__max_depth": [6, 9, 12, 15, 18, 21, 24],
}

# Exhaustive 3-fold search scored by negated mean absolute error
xgb_grid = GridSearchCV(xgb_reg, params, scoring="neg_mean_absolute_error", cv=3)
xgb_grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                         

In [None]:
# Best mean cross-validated score of the search; scoring is negated MAE, so values closer to 0 are better
xgb_grid.best_score_

-1779.5820899296223

In [None]:
# Parameter combination that achieved the best cross-validated score
xgb_grid.best_params_

{'xgbregressor__learning_rate': 0.2, 'xgbregressor__max_depth': 24}

In [None]:
# make submission
# Predict the test set with the best pipeline found by the grid search, write
# the predictions into the submission template and save it as CSV.
y_pred = xgb_grid.best_estimator_.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "xgb_tunned_lr_max_depth.csv")

In [None]:
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Pipeline with learning_rate / max_depth fixed to the values reported by the
# previous grid search.
xgb_reg = make_pipeline(full_pipe, XGBRegressor(objective= "reg:squarederror",
                                                learning_rate=0.2, 
                                                max_depth= 24,
                                                min_child_weight=4,
                                                random_state=42, 
                                                tree_method="gpu_hist", 
                                                n_jobs=-1,))

# Column-subsampling fractions to try. XGBoost documents the valid range of
# every colsample_by* parameter as (0, 1], so 0 is left out: sampling "no
# columns" is not a meaningful setting and only spent grid-search fits.
colsample_values = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]

params = {
    "xgbregressor__colsample_bytree": list(colsample_values),
    "xgbregressor__colsample_bylevel": list(colsample_values),
}

# 3-fold grid search scored by negated mean absolute error
xgb_grid = GridSearchCV(
    estimator= xgb_reg,
    param_grid = params,
    scoring="neg_mean_absolute_error",
    cv=3
)
xgb_grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                         

In [None]:
# Best mean cross-validated score of the search; scoring is negated MAE, so values closer to 0 are better
xgb_grid.best_score_

-1744.6734322330256

In [None]:
# Parameter combination that achieved the best cross-validated score
xgb_grid.best_params_

{'xgbregressor__colsample_bylevel': 0.9, 'xgbregressor__colsample_bytree': 0.9}

In [None]:
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor

# Regressor using the learning rate, depth and column-sampling values reported
# by the grid searches above
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    learning_rate=0.2,
    max_depth=24,
    min_child_weight=4,
    colsample_bylevel=0.9,
    colsample_bytree=0.9,
    random_state=42,
    tree_method="gpu_hist",
    n_jobs=-1,
)
xgb_reg = make_pipeline(full_pipe, xgb_model)

# Fit on the complete training data instead of the 30% subsample
xgb_reg.fit(X_train_full, y_train_full)

# make submission
y_pred = xgb_reg.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "xgb_tunned_lr_md_mc_col_sample.csv")

This model's performance is very similar to that of the random forest model, and it is by far the best XGBoost model we have created so far. The hyper-parameter tuning is going in the right direction.

In [None]:
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Base regressor; its tree_method is overridden by each grid candidate below
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    learning_rate=0.2,
    max_depth=24,
    min_child_weight=4,
    colsample_bylevel=0.9,
    colsample_bytree=0.9,
    reg_alpha=27,
    random_state=42,
    tree_method="gpu_hist",
    n_jobs=-1,
)
xgb_reg = make_pipeline(full_pipe, xgb_model)

# Compare the tree construction algorithms
params = {
    "xgbregressor__tree_method": ["auto", "exact", "approx", "hist", "gpu_hist"],
}

# 3-fold grid search scored by negated mean absolute error
xgb_grid = GridSearchCV(xgb_reg, params, scoring="neg_mean_absolute_error", cv=3)
xgb_grid.fit(X_train, y_train)

GridSearchCV(cv=3, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('columntransformer',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('simpleimputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                                                        copy=True,
                         

In [None]:
# Best mean cross-validated score of the search; scoring is negated MAE, so values closer to 0 are better
xgb_grid.best_score_

-1737.078187289079

In [None]:
# Parameter combination that achieved the best cross-validated score
xgb_grid.best_params_

{'xgbregressor__tree_method': 'approx'}

In [None]:
# Regressor with the tuned settings and the "approx" tree method reported by
# the grid search above
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    learning_rate=0.2,
    max_depth=24,
    min_child_weight=4,
    colsample_bylevel=0.9,
    colsample_bytree=0.9,
    reg_alpha=27,
    tree_method="approx",
    random_state=42,
    n_jobs=-1,
)
xgb_reg = make_pipeline(full_pipe, xgb_model)

# Fit on the complete training data instead of the 30% subsample
xgb_reg.fit(X_train_full, y_train_full)

# make submission
y_pred = xgb_reg.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "xgb_tree_method_approx_tunned_params.csv")

This model has beaten every other model we have used so far, even the random forest model. Now, let's try to find the optimal number of trees to use.

In [None]:
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor


# Same tuned settings, but with 3000 boosting rounds and the GPU histogram
# tree method
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=3000,
    learning_rate=0.2,
    max_depth=24,
    min_child_weight=4,
    colsample_bylevel=0.9,
    colsample_bytree=0.9,
    reg_alpha=27,
    tree_method="gpu_hist",
    random_state=42,
    n_jobs=-1,
)
xgb_reg = make_pipeline(full_pipe, xgb_model)

# Fit on the complete training data instead of the 30% subsample
xgb_reg.fit(X_train_full, y_train_full)

# make submission
y_pred = xgb_reg.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "xgb_final_without_early_stopping.csv")

In [None]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.pipeline import make_pipeline

# Final configuration: tuned settings, 700 boosting rounds, "approx" tree method
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=700,
    learning_rate=0.2,
    max_depth=24,
    min_child_weight=4,
    colsample_bylevel=0.9,
    colsample_bytree=0.9,
    reg_alpha=27,
    tree_method="approx",
    random_state=42,
    n_jobs=-1,
)
xgb_reg = make_pipeline(full_pipe, xgb_model)

# Fit on the complete training data instead of the 30% subsample
xgb_reg.fit(X_train_full, y_train_full)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  Pipeline(memory=None,
                                                           steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                                 copy=True,
                                                                                 fill_value=None,
                                                                                 missing_values=nan,
                                                                                 strategy='median',
                                                  

In [None]:
# make submission
# Predict the test set with the fitted pipeline, write the predictions into
# the submission template and save it as CSV.
y_pred = xgb_reg.predict(X_test)
sample_submission["Weekly_Sales"] = y_pred
save_dataframe(sample_submission, "xgb_final.csv")