In [None]:
import pandas as pd

# Read the data files
train = pd.read_csv('data/store_sales/train.csv', parse_dates=['date'])
test = pd.read_csv('data/store_sales/test.csv', parse_dates=['date'])
stores = pd.read_csv('data/store_sales/stores.csv')
oil = pd.read_csv('data/store_sales/oil.csv', parse_dates=['date'])
holidays_events = pd.read_csv('data/store_sales/holidays_events.csv', parse_dates=['date'])

# Remember the raw row counts so the left joins below can be checked for fan-out.
n_train_rows, n_test_rows = len(train), len(test)

# Merge store information with train and test data
train = train.merge(stores, on='store_nbr', how='left')
test = test.merge(stores, on='store_nbr', how='left')

# Merge oil prices with train and test data
train = train.merge(oil, on='date', how='left')
test = test.merge(oil, on='date', how='left')

# Build a one-row-per-date holiday flag from the non-transferred events.
# The frame is rebuilt in a single chain (no assignment into a filtered slice),
# and dates are de-duplicated because the join key must be unique: a date that
# appears more than once would otherwise duplicate every matching train/test row
# (and therefore its `id`).
holidays_events = (
    holidays_events.loc[~holidays_events['transferred'].astype(bool), ['date']]
    .drop_duplicates()
    .assign(holiday=1)
)
train = train.merge(holidays_events, on='date', how='left')
test = test.merge(holidays_events, on='date', how='left')

# Dates without a matching event are not holidays.
train['holiday'] = train['holiday'].fillna(0).astype(int)
test['holiday'] = test['holiday'].fillna(0).astype(int)

# Left joins against unique keys must keep the row count unchanged.
assert len(train) == n_train_rows, "merging duplicated rows in train"
assert len(test) == n_test_rows, "merging duplicated rows in test"

# Categorical encoding and feature scaling are not done here; any such steps
# must run after this block because they need the merged columns.
# TODO: optional feature engineering, e.g. a 7-day lagged sales feature per
# (store_nbr, family) on train:
#   train['sales_lag_7'] = train.groupby(['store_nbr', 'family'])['sales'].shift(7)
# NOTE(review): the same expression cannot be applied to `test` unless it has a
# `sales` column - confirm before enabling.

# Convert the date column to the number of days since the first training date.
# The same origin is used for test so both frames share one time axis.
min_date = train['date'].min()
train['date'] = (train['date'] - min_date).dt.days
test['date'] = (test['date'] - min_date).dt.days

train = train.set_index('id')
test = test.set_index('id')


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, MinMaxScaler

# One-hot encode the categorical features with LabelBinarizer.
# Each indicator column is named "<source column>_<class>" so that equal labels
# coming from different source columns cannot collide, and so that all column
# names are strings.
cat_columns = ['city', 'state', 'type', 'family', 'cluster']

train_indicator_frames = []
test_indicator_frames = []
for column in cat_columns:
    encoder = LabelBinarizer()
    train_encoded = encoder.fit_transform(train[column])
    # Reuse the encoder fitted on train so both frames get identical columns.
    test_encoded = encoder.transform(test[column])

    classes = encoder.classes_
    if train_encoded.shape[1] != len(classes):
        # With exactly two classes LabelBinarizer emits a single 0/1 column
        # that flags the second class.
        classes = classes[-1:]
    indicator_names = [f"{column}_{cls}" for cls in classes]

    train_indicator_frames.append(
        pd.DataFrame(train_encoded, columns=indicator_names, index=train.index)
    )
    test_indicator_frames.append(
        pd.DataFrame(test_encoded, columns=indicator_names, index=test.index)
    )

# Append all indicator columns at once and drop the original categorical columns.
# NOTE: this cell rebinds `train`/`test`, so it can only be run once per kernel
# after the loading cell.
train = pd.concat([train] + train_indicator_frames, axis=1).drop(cat_columns, axis=1)
test = pd.concat([test] + test_indicator_frames, axis=1).drop(cat_columns, axis=1)

# Normalize the continuous features using MinMaxScaler.
# The scaler is fitted on train only and then applied to test, so both frames
# live on the same scale.
continuous_columns = ['date', 'store_nbr', 'dcoilwtico']
scaler = MinMaxScaler()

train[continuous_columns] = scaler.fit_transform(train[continuous_columns])
test[continuous_columns] = scaler.transform(test[continuous_columns])

# Fill missing oil prices with the (scaled) training mean in both frames.
oil_fill_value = train['dcoilwtico'].mean()
train['dcoilwtico'] = train['dcoilwtico'].fillna(oil_fill_value)
test['dcoilwtico'] = test['dcoilwtico'].fillna(oil_fill_value)


In [None]:
from sklearn.model_selection import train_test_split

# Separate the model inputs (X) from the target (y).
X = train.drop(columns=['sales'])
y = train['sales']

# Hold out the last 20% of rows for validation.
# shuffle=False keeps the rows in their existing order: a shuffled split would
# scatter rows from the same period across both sets (leaking future
# information into training) and would scramble the row order that the
# sliding-window sequence builder relies on.
# NOTE(review): this assumes `train` is ordered by date - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

In [None]:
# Preview the first rows of the training features to sanity-check the columns.
X_train.head()

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from tda.nn import TDA, TDAClip

class SalesDataset(Dataset):
    """Map-style dataset over an indexable feature container and optional targets.

    Args:
        X: Indexable container of samples; must support ``len()`` and ``[]``.
        y: Optional indexable container of targets, indexed the same way as
            ``X``. When omitted, items are returned without a target.
    """

    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``X[idx]`` alone, or the pair ``(X[idx], y[idx])`` if targets exist."""
        sample = self.X[idx]
        if self.y is not None:
            return sample, self.y[idx]
        return sample



In [None]:
import numpy as np

def create_sequences(X, y, sequence_length):
    """Build overlapping sliding windows over ``X`` with one target per window.

    Window ``i`` covers rows ``X[i : i + sequence_length]`` and is paired with
    ``y[i + sequence_length - 1]``, i.e. the target aligned with the window's
    last row. Consecutive windows are shifted by one row.

    Args:
        X: Indexable, sliceable sequence of samples (e.g. a 2-D NumPy array).
        y: Indexable sequence of targets with the same length as ``X``.
        sequence_length: Number of consecutive rows per window; must be >= 1.

    Returns:
        Tuple ``(X_sequences, y_sequences)`` of NumPy arrays holding
        ``len(X) - sequence_length + 1`` windows and targets. Both arrays are
        empty when ``X`` is shorter than ``sequence_length``.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1, or if ``X`` and
            ``y`` differ in length.
    """
    if sequence_length < 1:
        # A non-positive length would index y[-1] and silently pair empty
        # windows with unrelated targets.
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same length, got {len(X)} and {len(y)}"
        )

    # range() of a non-positive count is empty, which covers inputs that are
    # shorter than one window.
    n_windows = len(X) - sequence_length + 1
    X_sequences = [X[start:start + sequence_length] for start in range(n_windows)]
    y_sequences = [y[start + sequence_length - 1] for start in range(n_windows)]
    return np.array(X_sequences), np.array(y_sequences)

# def create_sequences(X, y, sequence_length):
#     X_sequences = []
#     y_sequences = []
#     for i in range(0, len(X) - sequence_length + 1, sequence_length):
#         X_sequences.append(X[i:i+sequence_length])
#         y_sequences.append(y[i+sequence_length-1])
#     return np.array(X_sequences), np.array(y_sequences)

# Number of consecutive rows per input window; tune to the problem at hand.
sequence_length = 14

# Turn the flat train / hold-out frames into sliding windows.
X_train_seq, y_train_seq = create_sequences(
    X_train.values,
    y_train.values,
    sequence_length,
)
X_test_seq, y_test_seq = create_sequences(
    X_test.values,
    y_test.values,
    sequence_length,
)

# Wrap the training windows as float32 tensors and serve them in shuffled
# mini-batches of 64.
train_dataset = SalesDataset(
    torch.tensor(X_train_seq, dtype=torch.float32),
    torch.tensor(y_train_seq, dtype=torch.float32),
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size=64,
    shuffle=True,
)

In [None]:
import torch.nn.functional as F

class SalesModel(nn.Module):
    """Feed-forward regressor with a TDA layer between its first two linear layers.

    The input window is flattened, projected to ``2 * hidden_size`` units,
    passed through ``TDA`` and ``TDAClip`` (both from ``tda.nn``), and then
    reduced by two further linear layers to ``output_size`` values.

    Args:
        input_size: Number of features per row of the input window.
        hidden_size: Width of the second hidden layer; the first is twice as wide.
        output_size: Number of values produced per sample.
        tda_time_window: Rows per input window; also forwarded to ``TDA``.
        tda_scaling_factor: Second positional argument forwarded to ``TDA``.
        max_value: Forwarded to ``TDAClip`` as ``max_value``.
    """

    def __init__(self, input_size, hidden_size, output_size, tda_time_window, tda_scaling_factor, max_value=1.0):
        super().__init__()
        wide_units = hidden_size * 2
        # Sub-modules are created in a fixed order so that seeded weight
        # initialisation stays reproducible.
        self.fc1 = nn.Linear(input_size * tda_time_window, wide_units)
        self.tda = TDA(tda_time_window, tda_scaling_factor)
        self.clip = TDAClip(max_value=max_value, op=torch.sum)
        self.fc2 = nn.Linear(wide_units, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Map a batch of windows to non-negative predictions (final ReLU)."""
        # Collapse everything after the batch axis into one feature vector.
        flattened = x.view(x.shape[0], -1)
        # Project, then append a trailing axis for the TDA layer.
        hidden = F.relu(self.fc1(flattened)).unsqueeze(-1)

        # TDA followed by clipping.
        # NOTE(review): presumably TDAClip(op=torch.sum) reduces the trailing
        # axis again so fc2 receives (batch, 2 * hidden_size) - confirm
        # against tda.nn.
        hidden = self.clip(F.relu(self.tda(hidden)))

        hidden = F.leaky_relu(self.fc2(hidden))
        return F.relu(self.fc3(hidden))

In [None]:
class RMSLELoss(nn.Module):
    """Root mean squared logarithmic error.

    Computes ``sqrt(mean((log(1 + pred) - log(1 + actual)) ** 2))``.
    """

    def __init__(self):
        super().__init__()
        self.mse = nn.MSELoss()

    def forward(self, pred, actual):
        """Return the RMSLE between ``pred`` and ``actual`` as a scalar tensor.

        Args:
            pred: Tensor of predictions.
            actual: Tensor of targets, broadcast-compatible with ``pred``.

        Negative entries are clamped to zero before the logarithm so a stray
        negative value cannot turn the whole loss into NaN.
        """
        # log1p(x) == log(1 + x): the offset that defines RMSLE, and accurate
        # for small x.
        log_pred = torch.log1p(torch.clamp(pred, min=0.0))
        log_actual = torch.log1p(torch.clamp(actual, min=0.0))
        return torch.sqrt(self.mse(log_pred, log_actual))

In [None]:
import random

# Seed every random number generator BEFORE the model is built, so that the
# initial weights (and the shuffled batch order) are reproducible.
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)
    torch.cuda.empty_cache()
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Instantiate the model
input_size = X_train.shape[1]
hidden_size = 256
# Clip ceiling handed to the model. 1.0 is the value the model is actually
# built with; NOTE(review): confirm before changing it.
max_value = 1.0
output_size = 1
# The model flattens whole windows, so its time window must match the
# sequence length used to build them.
tda_time_window = sequence_length
# Choose an appropriate value based on your understanding of the problem
tda_scaling_factor = 0.4
model = SalesModel(input_size, hidden_size, output_size,
                   tda_time_window, tda_scaling_factor, max_value=max_value)

# Choose a loss function and an optimizer
criterion = RMSLELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Pick the best available device. torch.backends.mps replaces the deprecated
# torch.has_mps flag; getattr keeps this working on builds without MPS support.
mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif mps_backend is not None and mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

model.to(device)
print('Device:', device)

# Train the model
num_epochs = 50
model.train()
for epoch in range(num_epochs):
    epoch_loss_sum = 0.0
    epoch_samples = 0
    for inputs, targets in train_dataloader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass. reshape(-1) flattens the predictions to one value per
        # sample and, unlike squeeze(), keeps a 1-D shape for a batch of one.
        outputs = model(inputs)
        loss = criterion(outputs.reshape(-1), targets)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0, norm_type=2)
        optimizer.step()

        n_in_batch = targets.size(0)
        epoch_loss_sum += loss.item() * n_in_batch
        epoch_samples += n_in_batch

    # Report the sample-weighted mean of the batch losses rather than only the
    # last batch's loss, which is a noisy indicator of progress.
    epoch_loss = epoch_loss_sum / max(epoch_samples, 1)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}")

In [None]:
# Generate predictions for the held-out sequences
test_dataset = SalesDataset(torch.tensor(X_test_seq, dtype=torch.float32), torch.tensor(y_test_seq, dtype=torch.float32))
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)
predictions = []
ground_truth = []

# Run inference in eval mode and restore the previous mode afterwards, so
# re-running the training cell is unaffected by this cell.
was_training = model.training
model.eval()
with torch.no_grad():
    for inputs, targets in test_dataloader:
        outputs = model(inputs.to(device))
        # reshape(-1) always yields a 1-D tensor, so tolist() returns a list
        # even when the final batch holds a single sample (squeeze() would
        # produce a bare float there and break extend()).
        predictions.extend(outputs.reshape(-1).tolist())
        ground_truth.extend(targets.reshape(-1).tolist())
model.train(was_training)

In [None]:
# Spot-check a handful of predictions against their targets. Printing every
# pair would flood the notebook output, so only the first 20 are shown.
for predicted, actual in zip(predictions[:20], ground_truth[:20]):
    print(f"Predicted: {predicted:.2f}, Actual: {actual:.2f}")

In [None]:
# Calculate the root mean squared logarithmic error (RMSLE) of the hold-out predictions
from sklearn.metrics import mean_absolute_error

# Score the hold-out predictions with the same RMSLE criterion class that is
# used for training. Names and the printed label say RMSLE because that is
# what RMSLELoss computes (it is neither MAE nor plain RMSE).
rmsle_criterion = RMSLELoss()
with torch.no_grad():
    rmsle = rmsle_criterion(
        torch.tensor(predictions, dtype=torch.float32),
        torch.tensor(ground_truth, dtype=torch.float32),
    )
print('RMSLE: %.3f' % rmsle.item())

In [None]:
# Show only the first few targets; displaying the whole list bloats the notebook.
ground_truth[:10]

In [None]:
# Display the first rows of the training features again for reference.
X_train.head()