In [11]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/processed/walmart_clean_base.csv")
print("Loaded shape:", df.shape)

df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.dropna(subset=['Date'])
df = df.sort_values('Date')

df.head()

Loaded shape: (2565, 8)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,2,2010-01-10,1827440.43,0,69.24,2.603,211.329874,8.163
25,11,2010-01-10,1182490.46,0,75.11,2.603,214.984655,7.564
26,35,2010-01-10,771065.21,0,70.19,2.707,136.629757,8.763
27,21,2010-01-10,677158.39,0,70.28,2.603,211.329874,8.163
28,32,2010-01-10,1061089.56,0,66.14,2.759,190.673824,9.137


In [12]:
print("Unique stores:", df['Store'].nunique())
df['Store'].unique()

Unique stores: 45


array([ 2, 11, 35, 21, 32,  4, 25,  5, 27, 23,  6,  1, 44, 39, 31, 28, 42,
       12, 19, 10, 16, 20, 41, 38, 40, 22, 14, 30, 45, 33, 15, 34, 18, 37,
        9, 43, 36,  3, 13,  8,  7, 24, 17, 29, 26])

In [14]:
data = df.copy()
data = data.sort_values(["Store", "Date"])

data['year'] = data['Date'].dt.year
data['month'] = data['Date'].dt.month
data['week'] = data['Date'].dt.isocalendar().week.astype(int)
data['day_of_week'] = data['Date'].dt.weekday
data['day_of_year'] = data['Date'].dt.dayofyear
data['is_weekend'] = (data['day_of_week'] >= 5).astype(int)

data.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,week,day_of_week,day_of_year,is_weekend
36,1,2010-01-10,1453329.5,0,71.89,2.603,211.671989,7.838,2010,1,1,6,10,1
88,1,2010-02-04,1594968.28,0,62.27,2.719,210.82045,7.808,2010,2,5,3,35,0
134,1,2010-02-07,1492418.14,0,80.91,2.669,211.223533,7.787,2010,2,5,6,38,1
137,1,2010-03-09,1540163.53,0,81.21,2.577,211.531248,7.787,2010,3,10,1,68,0
213,1,2010-03-12,1548033.78,0,49.27,2.708,211.607193,7.838,2010,3,10,4,71,0


In [15]:
data = data.sort_values(["Store", "Date"])

data['lag_1'] = data.groupby("Store")['Weekly_Sales'].shift(1)
data['lag_2'] = data.groupby("Store")['Weekly_Sales'].shift(2)
data['lag_4'] = data.groupby("Store")['Weekly_Sales'].shift(4)

data.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,week,day_of_week,day_of_year,is_weekend,lag_1,lag_2,lag_4
36,1,2010-01-10,1453329.5,0,71.89,2.603,211.671989,7.838,2010,1,1,6,10,1,,,
88,1,2010-02-04,1594968.28,0,62.27,2.719,210.82045,7.808,2010,2,5,3,35,0,1453329.5,,
134,1,2010-02-07,1492418.14,0,80.91,2.669,211.223533,7.787,2010,2,5,6,38,1,1594968.28,1453329.5,
137,1,2010-03-09,1540163.53,0,81.21,2.577,211.531248,7.787,2010,3,10,1,68,0,1492418.14,1594968.28,
213,1,2010-03-12,1548033.78,0,49.27,2.708,211.607193,7.838,2010,3,10,4,71,0,1540163.53,1492418.14,1453329.5


In [16]:
data['roll_mean_4'] = (
    data.groupby("Store")['Weekly_Sales']
        .rolling(4).mean()
        .reset_index(level=0, drop=True)
)

data['roll_std_4'] = (
    data.groupby("Store")['Weekly_Sales']
        .rolling(4).std()
        .reset_index(level=0, drop=True)
)

data['roll_mean_8'] = (
    data.groupby("Store")['Weekly_Sales']
        .rolling(8).mean()
        .reset_index(level=0, drop=True)
)

data.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,week,day_of_week,day_of_year,is_weekend,lag_1,lag_2,lag_4,roll_mean_4,roll_std_4,roll_mean_8
36,1,2010-01-10,1453329.5,0,71.89,2.603,211.671989,7.838,2010,1,1,6,10,1,,,,,,
88,1,2010-02-04,1594968.28,0,62.27,2.719,210.82045,7.808,2010,2,5,3,35,0,1453329.5,,,,,
134,1,2010-02-07,1492418.14,0,80.91,2.669,211.223533,7.787,2010,2,5,6,38,1,1594968.28,1453329.5,,,,
137,1,2010-03-09,1540163.53,0,81.21,2.577,211.531248,7.787,2010,3,10,1,68,0,1492418.14,1594968.28,,1520220.0,61189.138909,
213,1,2010-03-12,1548033.78,0,49.27,2.708,211.607193,7.838,2010,3,10,4,71,0,1540163.53,1492418.14,1453329.5,1543896.0,41989.6829,


In [17]:
feature_cols = [
    'Store',
    'year','month','week','day_of_week','day_of_year','is_weekend',
    'Holiday_Flag','Temperature','Fuel_Price','CPI','Unemployment',
    'lag_1','lag_2','lag_4',
    'roll_mean_4','roll_std_4','roll_mean_8'
]

target_col = "Weekly_Sales"

data_ml = data.dropna(subset=feature_cols + [target_col]).copy()
data_ml = data_ml.sort_values(["Store", "Date"])

print("GLOBAL ML dataset shape:", data_ml.shape)
data_ml.head()

GLOBAL ML dataset shape: (2250, 20)


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,year,month,week,day_of_week,day_of_year,is_weekend,lag_1,lag_2,lag_4,roll_mean_4,roll_std_4,roll_mean_8
358,1,2010-05-03,1554806.68,0,46.5,2.625,211.350143,8.106,2010,5,18,0,123,0,1643690.9,1615524.71,1540163.53,1590514.0,46665.119383,1555367.0
397,1,2010-05-11,1551659.28,0,58.74,2.689,211.956394,7.838,2010,5,19,1,131,0,1554806.68,1643690.9,1548033.78,1591420.0,45587.769145,1567658.0
440,1,2010-06-08,1605491.78,0,87.16,2.627,211.504662,7.787,2010,6,23,1,159,0,1551659.28,1554806.68,1615524.71,1588912.0,44070.228007,1568974.0
479,1,2010-07-05,1603955.12,0,72.55,2.835,210.339968,7.808,2010,7,27,0,186,0,1605491.78,1551659.28,1643690.9,1578978.0,29762.404843,1582916.0
511,1,2010-08-10,1508239.93,0,63.93,2.633,211.746754,7.838,2010,8,32,1,222,0,1603955.12,1605491.78,1554806.68,1567337.0,46672.363469,1578925.0


In [18]:
# 80% of dates for train, 20% for valid
split_date = data_ml['Date'].quantile(0.8)

train = data_ml[data_ml['Date'] <= split_date]
valid = data_ml[data_ml['Date'] > split_date]

print("Train shape:", train.shape)
print("Valid shape:", valid.shape)

print("Train date range:", train['Date'].min(), "→", train['Date'].max())
print("Valid date range:", valid['Date'].min(), "→", valid['Date'].max())

Train shape: (1800, 20)
Valid shape: (450, 20)
Train date range: 2010-05-03 00:00:00 → 2012-05-10 00:00:00
Valid date range: 2012-06-01 00:00:00 → 2012-12-10 00:00:00


In [19]:
train.to_csv("../data/processed/walmart_global_train.csv", index=False)
valid.to_csv("../data/processed/walmart_global_valid.csv", index=False)
data_ml.to_csv("../data/processed/walmart_global_features.csv", index=False)

print("Saved all ML datasets for GLOBAL model.")

Saved all ML datasets for GLOBAL model.
