<a href="https://colab.research.google.com/github/azhgh22/Walmart-Recruiting-Store-Sales-Forecasting/blob/main/notebooks/general.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): no later cell references /content/drive directly; presumably the
# Kaggle credentials path read from Colab secrets points into it — confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from google.colab import userdata
token = userdata.get('GITHUB_TOKEN')
user_name = userdata.get('GITHUB_USERNAME')
mail = userdata.get('GITHUB_MAIL')

!git config --global user.name "{user_name}"
!git config --global user.email "{mail}"
!git clone https://{token}@github.com/azhgh22/Walmart-Recruiting-Store-Sales-Forecasting.git

%cd Walmart-Recruiting-Store-Sales-Forecasting

Cloning into 'Walmart-Recruiting-Store-Sales-Forecasting'...
remote: Enumerating objects: 188, done.[K
remote: Counting objects: 100% (188/188), done.[K
remote: Compressing objects: 100% (118/118), done.[K
remote: Total 188 (delta 84), reused 159 (delta 64), pack-reused 0 (from 0)[K
Receiving objects: 100% (188/188), 6.38 MiB | 26.23 MiB/s, done.
Resolving deltas: 100% (84/84), done.
/content/Walmart-Recruiting-Store-Sales-Forecasting


In [3]:
from google.colab import userdata
!pip install -r requirements.txt
kaggle_json_path = userdata.get('KAGGLE_JSON_PATH')
! ./src/data_loader.sh -f {kaggle_json_path}

Collecting onnx (from -r requirements.txt (line 3))
  Downloading onnx-1.18.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting dagshub (from -r requirements.txt (line 8))
  Downloading dagshub-0.5.10-py3-none-any.whl.metadata (12 kB)
Collecting mlflow (from -r requirements.txt (line 9))
  Downloading mlflow-3.1.1-py3-none-any.whl.metadata (29 kB)
Collecting appdirs>=1.4.4 (from dagshub->-r requirements.txt (line 8))
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub->-r requirements.txt (line 8))
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub->-r requirements.txt (line 8))
  Downloading gql-3.5.3-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting dataclasses-json (from dagshub->-r requirements.txt (line 8))
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting treelib>=1.6.4 (from dagshub->-r requirements.txt (line

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Import only the path constants this notebook uses instead of `import *`,
# so it is clear where each name comes from and nothing else leaks into the namespace.
from src.config import FEATURES_PATH, STORES_PATH, TEST_PATH, TRAIN_PATH

# Raw competition tables; Date columns are still plain strings at this point.
stores = pd.read_csv(STORES_PATH)
features = pd.read_csv(FEATURES_PATH)
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)

## **Merge tables and split data**

In [8]:
# Left-join store metadata onto every training row, then left-join the
# feature rows that match on Store, Date and IsHoliday.
merged_train = (
    train
    .merge(stores, how='left', on='Store')
    .merge(features, how='left', on=['Store', 'Date', 'IsHoliday'])
)

In [9]:
from src.config import SPLIT_DATE
from src.time_series_split import TimeSeriesSplit

# Dates were read as strings; parse them before handing the frame to the splitter.
merged_train['Date'] = pd.to_datetime(merged_train['Date'])

# NOTE(review): presumably rows are divided around SPLIT_DATE — confirm in src/time_series_split.py.
splitter = TimeSeriesSplit(SPLIT_DATE)
x_train, x_val = splitter.split(merged_train)

# pop() removes the target column from each feature frame and returns it.
y_train, y_val = x_train.pop('Weekly_Sales'), x_val.pop('Weekly_Sales')

# **Time series features**

In [10]:
from feature_engineering.time_features import FeatureAdder

# Deal with **NaN** values

In [11]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class NaImputer(BaseEstimator, TransformerMixin):
  """Impute the columns that contained missing values when the transformer was fitted.

  Parameters
  ----------
  strategy : str, default='mean'
    Forwarded to ``SimpleImputer(strategy=...)``.

  Attributes
  ----------
  na_cols : list
    Names of the columns that had at least one NaN in the frame passed to ``fit``.
  imputer : SimpleImputer
    Imputer fitted on ``na_cols`` only.

  Notes
  -----
  Columns that have NaN only at transform time (and not in the fitted frame)
  are left untouched by ``transform``.
  """

  def __init__(self, strategy='mean'):
    self.strategy = strategy
    # Kept so the attributes exist before fit(); both are rebuilt by fit().
    self.imputer = SimpleImputer(strategy=strategy)
    self.na_cols = []

  def fit(self, X, y=None):
    """Record which columns of ``X`` contain NaN and fit the imputer on them.

    Returns ``self``.
    """
    self.na_cols = [col for col in X.columns if X[col].isna().any()]
    # Rebuild from self.strategy so that set_params(strategy=...) on an existing
    # instance (e.g. during a parameter search) actually takes effect.
    self.imputer = SimpleImputer(strategy=self.strategy)
    # SimpleImputer refuses a frame with zero columns, so skip it when nothing is missing.
    if self.na_cols:
      self.imputer.fit(X[self.na_cols])
    return self

  def transform(self, X, y=None):
    """Return a copy of ``X`` with the recorded columns imputed; ``X`` is not modified."""
    x_copy = X.copy()
    if self.na_cols:
      x_copy[self.na_cols] = self.imputer.transform(x_copy[self.na_cols])
    return x_copy

# **Cat2Num**

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin

class Cat2Num(BaseEstimator, TransformerMixin):
  """Drop ``Date`` and turn ``IsHoliday`` / ``Type`` into integer columns.

  The set of ``Type`` categories is learned in ``fit`` so that the same value is
  mapped to the same integer code in every later ``transform`` call, even when a
  frame contains only a subset of the types. Values not seen during ``fit`` are
  encoded as ``-1``.
  """

  def fit(self, X, y=None):
    """Remember the categorical dtype (category set and order) of ``X['Type']``.

    Returns ``self``.
    """
    self.type_dtype_ = X['Type'].astype('category').dtype
    return self

  def transform(self, X, y=None):
    """Return a converted copy of ``X``; ``X`` itself is not modified."""
    x_copy = X.copy()
    x_copy.pop('Date')
    x_copy['IsHoliday'] = x_copy['IsHoliday'].astype(int)
    # Instances that were never fitted (or were pickled before categories were
    # stored) fall back to inferring the categories from the frame at hand.
    type_dtype = getattr(self, 'type_dtype_', 'category')
    x_copy['Type'] = x_copy['Type'].astype(type_dtype).cat.codes
    return x_copy

# **XGBoost**

In [13]:
from xgboost import DMatrix
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
from src.utils import wmae

In [None]:
from sklearn.pipeline import Pipeline
from src.cross_validation import manual_model_search

# Hyper-parameters that are set on the estimator and also listed, as
# single-value candidates, in the search grid below.
searched_params = {
    'n_estimators': 1000,
    'learning_rate': 0.1,
    'max_depth': 7,
    'reg_lambda': 3,
    'min_split_loss': 100,
}

# Feature engineering -> NaN imputation -> categorical encoding -> regressor.
pipeline = Pipeline(steps=[
    ('adder', FeatureAdder()),
    ('imputer', NaImputer()),
    ('cat2num', Cat2Num()),
    ('model', XGBRegressor(
        **searched_params,
        objective='reg:squarederror',
        random_state=42,
    )),
])

# Pipeline parameter names are "<step>__<param>"; every entry has one candidate value.
param_grid = {f'model__{name}': [value] for name, value in searched_params.items()}

# Extra keyword arguments forwarded to the metric.
metric_kwargs = dict(is_holiday=x_val['IsHoliday'])

# NOTE(review): manual_model_search lives in src/cross_validation.py; presumably it
# fits each candidate on the training data and scores it on the validation data — confirm.
best_model, best_params, best_score = manual_model_search(
    model=pipeline,
    param_grid=param_grid,
    X_train=x_train,
    y_train=y_train,
    X_valid=x_val,
    y_valid=y_val,
    metric_func=wmae,
    metric_kwargs=metric_kwargs,
)

print("\nBest Params:", best_params)
print("Best Validation Score:", best_score)

In [15]:
from sklearn.pipeline import Pipeline

# Regressor configuration used for the run that gets logged further down.
xgb_regressor = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.1,
    max_depth=7,
    reg_lambda=3,
    min_split_loss=100,
    objective='reg:squarederror',
    random_state=42,
)

# Same stages as the search pipeline, but the feature adder is also asked for a
# dummy date feature counted from the given start date.
pipeline = Pipeline(steps=[
    ('adder', FeatureAdder(add_dummy_date=True, start_date=pd.Timestamp('2010-02-05'))),
    ('imputer', NaImputer()),
    ('cat2num', Cat2Num()),
    ('model', xgb_regressor),
])

model = pipeline.fit(x_train, y_train)

y_train_predict = model.predict(x_train)
y_val_predict = model.predict(x_val)

# wmae takes the holiday flags as its third argument.
train_holiday_flags = x_train['IsHoliday'].to_list()
train_score = wmae(y_train, y_train_predict, train_holiday_flags)
val_holiday_flags = x_val['IsHoliday'].to_list()
val_score = wmae(y_val, y_val_predict, val_holiday_flags)

print(f"Train wmae: {train_score}, Val wmae: {val_score}")

Train wmae: 1578.84506300633, Val wmae: 2848.6895716804747


# **Upload to Weights & Biases (wandb)**

In [16]:
# Interactive login to Weights & Biases: the command prompts for an API key.
# The captured output shows the key being appended to /root/.netrc.
! wandb login

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 
[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mazhgh22[0m ([33mMLBeasts[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [17]:
import wandb
import joblib

# Serialise the fitted pipeline. The pickle refers to the NaImputer and Cat2Num
# classes defined in this notebook, so whoever loads the file must define those
# classes first.
joblib.dump(model, "xgb_pipeline.pkl")

# Using the run as a context manager guarantees it is finished even if one of
# the logging calls below raises, instead of leaving an open run behind.
with wandb.init(project="Walmart Recruiting - Store Sales Forecasting", name="xgboost:run1") as run:
    # Description of the data preparation, metric and model behind this run.
    run.config.update({
        'merge1' : 'train, store, how=left, on=Store',
        'merge2' : 'train, features, how=left, on=Store, Date, IsHoliday',
        'merged_tables' : ['train','stores','features'],
        'time_features' : [
            'DateDummy', 'Month', 'Year',
            'WeekOfYear', 'Is_Christmas', 'Is_LaborDay', 'Is_Thanksgiving',
            'Is_SuperBowl', 'Days_until_next_holiday', 'Days_since_last_holiday',
            'week_sin', 'week_cos', 'month_sin', 'month_cos',
            'Days_until_next_Christmas', 'Days_since_last_Christmas',
            'Days_until_next_LaborDay', 'Days_since_last_LaborDay',
            'Days_until_next_Thanksgiving', 'Days_since_last_Thanksgiving',
            'Days_until_next_SuperBowl', 'Days_since_last_SuperBowl'
        ],
        'score_metric' : 'WMAE',
        'score_policy' : {
            'weight on holidays' : 5,
            'weight on non_holidays' : 1
        },
        'model' : 'Xgboost',
        'n_estimators' : 1000,
        'learning_rate' : 0.1,
        'max_depth' : 7,
        'reg_lambda' : 3,
        'min_split_loss' : 100,
        'objective' : 'reg:squarederror',
    })

    # Scores computed by the fit/evaluate cell.
    run.log({
        'train_wmae': train_score,
        'val_wmae': val_score
    })

    # Attach the serialised pipeline to the run as a versioned model artifact.
    artifact = wandb.Artifact(
        name="xgb_pipeline",
        type="model",
        description="XGBoost pipeline with Date engineering and imputing"
    )
    artifact.add_file("xgb_pipeline.pkl")
    run.log_artifact(artifact)

[34m[1mwandb[0m: Currently logged in as: [33mazhgh22[0m ([33mMLBeasts[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
train_wmae,▁
val_wmae,▁

0,1
train_wmae,1578.84506
val_wmae,2848.68957


# **Load model from wandb**

In [18]:
import wandb
import joblib

# Start a run to record that the artifact was consumed. No run id / resume
# argument is passed, so this does not resume the training run — it only
# reuses the same display name. The context manager makes sure the run is
# finished when the block exits, also on errors.
with wandb.init(project="Walmart Recruiting - Store Sales Forecasting", name="xgboost:run1") as run:
    # Download the latest version of the model artifact
    artifact = run.use_artifact('MLBeasts/Walmart Recruiting - Store Sales Forecasting/xgb_pipeline:latest', type='model')
    artifact_dir = artifact.download()

    # Load the model.
    # SECURITY: joblib.load unpickles the file and can execute arbitrary code;
    # only load artifacts from a project you trust. The NaImputer and Cat2Num
    # classes must already be defined in this session for unpickling to work.
    model = joblib.load(f"{artifact_dir}/xgb_pipeline.pkl")

[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [19]:
test = pd.read_csv(TEST_PATH)

# Same two left joins that were used to build the training frame.
merged_test = (
    test
    .merge(stores, on='Store', how='left')
    .merge(features, how='left', on=['Store', 'Date', 'IsHoliday'])
)

# The pipeline was fitted on frames whose Date column had been parsed with
# pd.to_datetime; feed it the same dtype at inference time. The conversion is
# done after the merge because `features` still holds Date as strings.
merged_test['Date'] = pd.to_datetime(merged_test['Date'])

test_predictions = model.predict(merged_test)
test_predictions

array([33550.23    , 18070.236   , 17630.887   , ...,   887.0808  ,
         770.0637  ,   106.878555], dtype=float32)