# Import Required Libraries

In [None]:
import os
import gc
import time
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error

import xgboost as xgb
import joblib
import matplotlib.pyplot as plt
import seaborn as sns


# Helper Functions

In [None]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def wmape(y_true, y_pred):
    # avoid division by zero
    denom = np.sum(np.abs(y_true))
    return np.sum(np.abs(y_true - y_pred)) / denom if denom != 0 else np.nan

# Load Dataset 

In [8]:
DATA_DIR = "data/"   # update if needed
train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), parse_dates=["Date"])
test  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"),  parse_dates=["Date"])
store = pd.read_csv(os.path.join(DATA_DIR, "store.csv"))

  train = pd.read_csv(os.path.join(DATA_DIR, "train.csv"), parse_dates=["Date"])


In [9]:
print(train.shape, test.shape, store.shape)

(1017209, 9) (41088, 8) (1115, 10)


In [16]:
print("Train missing values:")
print(train.isna().sum())
print("\nTest missing values:")
print(test.isna().sum())
print("\nStore missing values:")
print(store.isna().sum())

Train missing values:
Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

Test missing values:
Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64

Store missing values:
Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64


# Basic preprocessing & merge store info

In [None]:
# Some basic cleaning consistent with many Rossmann kernels
train['Open'] = train['Open'].fillna(1).astype(int)
test['Open']  = test['Open'].fillna(1).astype(int)

# Merge store info into train/test
train = train.merge(store, how='left', on='Store')
test  = test.merge(store, how='left', on='Store')

if 'Sales' in test.columns:
    test = test.drop(columns=['Sales'])
