# ***Causal Learning Tutorial - Store Sales***


'''
Author:
        
        KIM, JeongYoong, jeongyoong@ccnets.org
        
    COPYRIGHT (c) 2024. CCNets. All Rights reserved.
'''

Data Source: https://www.kaggle.com/competitions/store-sales-time-series-forecasting/overview

<p align="center">
  <img src="https://github.com/user-attachments/assets/81e5c8f0-d2ca-462c-b493-f21bce422ff6" alt="IMG" width='800'>
</p>

<a id="1"></a>
> <h1 style = 'font-family: Times New Roman'><b> <b style = 'color: #42c2f5'>1.</b> Import Necessary Libraries </b></h1>

In [1]:
import os
import sys
import warnings

# Suppress all warnings from this point on.
warnings.simplefilter("ignore")

# Put the directory two levels above the working directory on the import path
# so the project packages used in later cells can be imported.
path_append = "../../"
sys.path.append(path_append)

In [2]:
import pandas as pd

# Dataset folder name; it is interpolated into every CSV path below.
dataset_name = "Store Sales"

# Read the training table and preview its first rows.
df_train = pd.read_csv(f"{path_append}../data/{dataset_name}/train.csv")
df_train.head()

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0
1,1,2013-01-01,1,BABY CARE,0.0,0
2,2,2013-01-01,1,BEAUTY,0.0,0
3,3,2013-01-01,1,BEVERAGES,0.0,0
4,4,2013-01-01,1,BOOKS,0.0,0


In [3]:
# Load the auxiliary tables, keeping only the first five rows of each
# (five is also the default of DataFrame.head()).
# NOTE(review): df_stores is merged into df_train in the Preprocess section
# with pandas' default inner join, so truncating it here also restricts the
# training rows to the stores present in these five rows -- confirm that this
# subsampling is intentional.
df_holidays_events = pd.read_csv(f"{path_append}../data/{dataset_name}/holidays_events.csv").head(5)
df_oil = pd.read_csv(f"{path_append}../data/{dataset_name}/oil.csv").head(5)
df_transactions = pd.read_csv(f"{path_append}../data/{dataset_name}/transactions.csv").head(5)
df_stores = pd.read_csv(f"{path_append}../data/{dataset_name}/stores.csv").head(5)

In [4]:
# Read the complete test table (no row truncation here).
df_test = pd.read_csv(f"{path_append}../data/{dataset_name}/test.csv")

<a id="2"></a>
> <h1 style = 'font-family: Times New Roman'><b> <b style = 'color: #4290f5'>2.</b> Preprocess </b></h1>

In [5]:
# Join the store columns onto the training rows, order the rows by store,
# product family and date, then cast the listed columns to strings.
df_train = (
    df_train.merge(df_stores, on="store_nbr")
    .sort_values(["store_nbr", "family", "date"])
    .astype(
        dict.fromkeys(
            ["store_nbr", "family", "city", "state", "type", "cluster"], "str"
        )
    )
)

In [6]:
# Preview the first 15 rows of the merged and sorted training frame.
df_train.head(15)

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,city,state,type,cluster
0,0,2013-01-01,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13
165,1782,2013-01-02,1,AUTOMOTIVE,2.0,0,Quito,Pichincha,D,13
330,3564,2013-01-03,1,AUTOMOTIVE,3.0,0,Quito,Pichincha,D,13
495,5346,2013-01-04,1,AUTOMOTIVE,3.0,0,Quito,Pichincha,D,13
660,7128,2013-01-05,1,AUTOMOTIVE,5.0,0,Quito,Pichincha,D,13
825,8910,2013-01-06,1,AUTOMOTIVE,2.0,0,Quito,Pichincha,D,13
990,10692,2013-01-07,1,AUTOMOTIVE,0.0,0,Quito,Pichincha,D,13
1155,12474,2013-01-08,1,AUTOMOTIVE,2.0,0,Quito,Pichincha,D,13
1320,14256,2013-01-09,1,AUTOMOTIVE,2.0,0,Quito,Pichincha,D,13
1485,16038,2013-01-10,1,AUTOMOTIVE,2.0,0,Quito,Pichincha,D,13


<a id="2-1"></a>
> <h2 style = 'font-family: Times New Roman'><b> <b style = 'color: #4290f5'>2-1.</b> Auto Preprocess </b></h2>

In [7]:
from preprocessing.data_frame import auto_preprocess_dataframe
# Target column(s); passed to the preprocessing helper here and to the scaling helper later.
target_columns = ['sales']
# Project helper that returns the processed frame together with `description`,
# which is later indexed with 'num_features' and 'num_classes'.
# NOTE(review): the scaling/encoding it applies is defined in the project code,
# not in this notebook -- see preprocessing.data_frame for details.
df_train, description = auto_preprocess_dataframe(df_train, target_columns)

Column 'city' has 2 unique values.
Column 'cluster' has 4 unique values.
Column 'family' has 33 unique values.
Column 'state' has 2 unique values.
Column 'store_nbr' has 5 unique values.
Column 'type' has 1 unique values.


Unnamed: 0,Min,Max,Mean,Std,Null Count,Scaled,Encoded
id,-1.999801,1.999801,4.909818e-18,1.154703,0,Minmax,
onpromotion,0.0,230.0,2.78754,12.949375,0,Robust,
day_of_year_sin,-0.999991,0.999991,0.05939406,0.699191,0,,EncodedDateTime
day_of_year_cos,-0.999963,1.0,-0.02619571,0.711985,0,,EncodedDateTime
city,0.0,1.0,0.2,0.400001,0,,One-hot
state,0.0,1.0,0.2,0.400001,0,,One-hot
cluster_13,0.0,1.0,0.4,0.489899,0,,One_hot
cluster_4,0.0,1.0,0.2,0.400001,0,,One_hot
cluster_8,0.0,1.0,0.2,0.400001,0,,One_hot
cluster_9,0.0,1.0,0.2,0.400001,0,,One_hot


In [8]:
from preprocessing.scaler import scale_dataframe
# Run the project scaling helper on the target column(s). It returns the updated
# frame and `target_scale`, which is later handed to DataConfig as `label_scale`.
df_train, target_scale = scale_dataframe(df_train, transform_columns=target_columns)
# Display the resulting frame.
df_train

Unnamed: 0,id,onpromotion,day_of_year_sin,day_of_year_cos,city,state,cluster_13,cluster_4,cluster_8,cluster_9,...,family_PREPARED FOODS,family_PRODUCE,family_SCHOOL AND OFFICE SUPPLIES,family_SEAFOOD,store_nbr_1,store_nbr_2,store_nbr_3,store_nbr_4,store_nbr_5,sales
0,-1.999801,0.0,0.017213,0.999852,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000
165,-1.997426,0.0,0.034422,0.999407,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.046047
330,-1.995051,0.0,0.051620,0.998667,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.069070
495,-1.992676,0.0,0.068802,0.997630,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.069070
660,-1.990300,0.0,0.085965,0.996298,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.115117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
277199,1.990300,4.0,-0.642055,-0.766659,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.193442
277364,1.992676,0.0,-0.655156,-0.755493,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.168531
277529,1.995051,0.0,-0.668064,-0.744104,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.268037
277694,1.997426,0.0,-0.680773,-0.732494,1.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.269235


In [9]:
import torch
from sklearn.model_selection import train_test_split
from preprocessing.dataset import TemplateDataset

# Sequence-length bounds handed to TemplateDataset.
min_seq_len = 8
max_seq_len = 16

# Hold out the trailing 20% of rows without shuffling. `random_state` is not
# passed because it only seeds shuffling and has no effect when shuffle=False.
# NOTE(review): df_train was sorted by store_nbr/family/date earlier, so this
# split appears to hold out the tail of that ordering rather than the most
# recent dates -- confirm that this is the intended evaluation set.
train_df, eval_df = train_test_split(df_train, test_size=0.2, shuffle=False)

# Separate features from targets by column name rather than by position, so
# the split stays correct even if the target is not the last column.
train_df_x = train_df.drop(columns=target_columns)  # every non-target column
train_df_y = train_df[target_columns]  # target column(s) only

eval_df_x = eval_df.drop(columns=target_columns)  # every non-target column
eval_df_y = eval_df[target_columns]  # target column(s) only

print('train df shape: ', train_df.shape)
print('eval df shape: ', eval_df.shape)

# Wrap each feature/target pair in the project's dataset class.
trainset = TemplateDataset(train_df_x, train_df_y, min_seq_len=min_seq_len, max_seq_len=max_seq_len)
evalset = TemplateDataset(eval_df_x, eval_df_y, min_seq_len=min_seq_len, max_seq_len=max_seq_len)

train df shape:  (222288, 49)
eval df shape:  (55572, 49)


<a id="3"></a>
> <h1 style = 'font-family: Times New Roman'><b> <b style = 'color: #4290f5'>3.</b> Modeling </b></h1>

In [10]:
from tools.config.data_config import DataConfig
from tools.config.ml_config import MLConfig
from causal_learning import CausalLearning

# Sizes reported by the preprocessing step.
num_features = description['num_features']
num_classes = description['num_classes']

# Data configuration for a regression task on this dataset.
data_config = DataConfig(
    dataset_name=dataset_name,
    task_type='regression',
    obs_shape=[num_features],
    label_size=num_classes,
    label_scale=target_scale,
    explain_size=7,
)

# Build the training configuration for the 'gpt' model, then override
# selected fields.
ml_config = MLConfig(model_name='gpt')
ml_config.model.num_layers = 3
ml_config.optimization.learning_rate = 2e-4
ml_config.training.error_function = 'rmsle'

# Use the GPU when torch reports CUDA as available; otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Create the CausalLearning instance from both configurations and the device,
# with the use_print and use_wandb flags enabled.
causal_learning = CausalLearning(ml_config, data_config, device, use_print=True, use_wandb=True)

<a id="3-1"></a>
> <h2 style = 'font-family: Times New Roman'><b> <b style = 'color: #4290f5'>3-1.</b> Training </b></h2>

In [None]:
# Start training, passing the training dataset and the evaluation dataset.
causal_learning.train(trainset, evalset)