In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
import joblib

train = pd.read_csv("../data/processed/walmart_global_train.csv")
valid = pd.read_csv("../data/processed/walmart_global_valid.csv")

print(train.shape, valid.shape)

train.head()

(1800, 20) (450, 20)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,week,day_of_week,day_of_year,is_weekend,lag_1,lag_2,lag_4,roll_mean_4,roll_std_4,roll_mean_8
0,1,2010-05-03,1554806.68,0,46.5,2.625,211.350143,8.106,2010,5,18,0,123,0,1643690.9,1615524.71,1540163.53,1590514.0,46665.119383,1555367.0
1,1,2010-05-11,1551659.28,0,58.74,2.689,211.956394,7.838,2010,5,19,1,131,0,1554806.68,1643690.9,1548033.78,1591420.0,45587.769145,1567658.0
2,1,2010-06-08,1605491.78,0,87.16,2.627,211.504662,7.787,2010,6,23,1,159,0,1551659.28,1554806.68,1615524.71,1588912.0,44070.228007,1568974.0
3,1,2010-07-05,1603955.12,0,72.55,2.835,210.339968,7.808,2010,7,27,0,186,0,1605491.78,1551659.28,1643690.9,1578978.0,29762.404843,1582916.0
4,1,2010-08-10,1508239.93,0,63.93,2.633,211.746754,7.838,2010,8,32,1,222,0,1603955.12,1605491.78,1554806.68,1567337.0,46672.363469,1578925.0


In [2]:
target_col = "Weekly_Sales"

feature_cols = [
    'Store',
    'year','month','week','day_of_week','day_of_year','is_weekend',
    'Holiday_Flag','Temperature','Fuel_Price','CPI','Unemployment',
    'lag_1','lag_2','lag_4',
    'roll_mean_4','roll_std_4','roll_mean_8'
]

X_train = train[feature_cols]
y_train = train[target_col]

X_valid = valid[feature_cols]
y_valid = valid[target_col]

X_train.shape, X_valid.shape

((1800, 18), (450, 18))

In [3]:
y_pred_naive = valid['lag_1'].values

mae_naive = mean_absolute_error(y_valid, y_pred_naive)
rmse_naive = np.sqrt(mean_squared_error(y_valid, y_pred_naive))
mape_naive = np.mean(np.abs((y_valid - y_pred_naive) / y_valid)) * 100

print("Naive Baseline:")
print("MAE :", mae_naive)
print("RMSE:", rmse_naive)
print("MAPE:", mape_naive)

Naive Baseline:
MAE : 88276.00955555555
RMSE: 130567.7445487781
MAPE: 7.910722021441273


In [4]:
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    random_state=42,
    n_jobs=-1
)

rf.fit(X_train, y_train)

rf_pred = rf.predict(X_valid)

rf_mae = mean_absolute_error(y_valid, rf_pred)
rf_rmse = np.sqrt(mean_squared_error(y_valid, rf_pred))
rf_mape = np.mean(np.abs((y_valid - rf_pred) / y_valid)) * 100

print("RandomForest Performance:")
print("MAE :", rf_mae)
print("RMSE:", rf_rmse)
print("MAPE:", rf_mape)

RandomForest Performance:
MAE : 57919.71935387705
RMSE: 83978.3017200551
MAPE: 5.09909232055235


In [5]:
xgb_model = xgb.XGBRegressor(
    n_estimators=500,
    max_depth=10,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

xgb_model.fit(X_train, y_train)

xgb_pred = xgb_model.predict(X_valid)

xgb_mae = mean_absolute_error(y_valid, xgb_pred)
xgb_rmse = np.sqrt(mean_squared_error(y_valid, xgb_pred))
xgb_mape = np.mean(np.abs((y_valid - xgb_pred) / y_valid)) * 100

print("XGBoost Performance:")
print("MAE :", xgb_mae)
print("RMSE:", xgb_rmse)
print("MAPE:", xgb_mape)

XGBoost Performance:
MAE : 64364.94801666667
RMSE: 92754.92378640975
MAPE: 5.894340622942633


In [6]:
results = pd.DataFrame({
    "Model": ["Naive Lag-1", "RandomForest", "XGBoost"],
    "MAE": [mae_naive, rf_mae, xgb_mae],
    "RMSE": [rmse_naive, rf_rmse, xgb_rmse],
    "MAPE (%)": [mape_naive, rf_mape, xgb_mape]
})

results

Unnamed: 0,Model,MAE,RMSE,MAPE (%)
0,Naive Lag-1,88276.009556,130567.744549,7.910722
1,RandomForest,57919.719354,83978.30172,5.099092
2,XGBoost,64364.948017,92754.923786,5.894341


In [8]:
best_model = xgb_model  # choose based on results

joblib.dump(best_model, "../models/walmart_global_xgb.pkl")

print("Saved model at ../models/walmart_global_xgb.pkl")

Saved model at ../models/walmart_global_xgb.pkl
