In [None]:
#load the train.csv file from the dataset folder
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns



#load the dataset
df = pd.read_csv('dataset/train.csv')
print(df.head())

#check for nan or non existent values in the dataset
df.isnull().sum()

#print unique countries in country column
print(df['country'].unique())

#Plot num_sold over time for each unique country in country column, the date column is in the format yyyy-mm-dd
#convert the date column to datetime format
df['date'] = pd.to_datetime(df['date'])
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

#define unique countries
countries = df['country'].unique()

#plot num_sold over time for each unique country in its own subplot
plt.figure(figsize=(20, 20))
for i, country in enumerate(countries):
    plt.subplot(3, 3, i+1)
    df[df['country'] == country].groupby('date')['num_sold'].sum().plot()
    plt.title(country)
    plt.xlabel('Date')
    plt.ylabel('num_sold')
plt.show()









In [None]:
#print the unique values in country, store and product columns, and min and max for date in num_sold column
print(df['country'].unique())
print(df['store'].unique())
print(df['product'].unique())
print(df['num_sold'].min(), df['num_sold'].max())


In [None]:
#for each country, plot the num_sold over time for each store
plt.figure(figsize=(20, 20))
for i, country in enumerate(countries):
    plt.subplot(3, 3, i+1)
    for store in df['store'].unique():
        df[(df['country'] == country) & (df['store'] == store)].groupby('date')['num_sold'].sum().plot()
    plt.title(country)
    plt.xlabel('Date')
    plt.ylabel('num_sold')

In [None]:
#for each country, plot the num_sold over time for each product
plt.figure(figsize=(20, 20))
for i, country in enumerate(countries):
    plt.subplot(3, 3, i+1)
    for product in df['product'].unique():
        df[(df['country'] == country) & (df['product'] == product)].groupby('date')['num_sold'].sum().plot()
    plt.title(country)
    plt.xlabel('Date')
    plt.ylabel('num_sold')

# **Mean/Median prediction** 

In [17]:
import pandas as pd

# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Compute mean sales for each group
mean_sales = train.groupby(['country', 'store', 'product'])['num_sold'].mean().reset_index()
mean_sales.rename(columns={'num_sold': 'mean_num_sold'}, inplace=True)

# Merge mean sales with test data
test = test.merge(mean_sales, on=['country', 'store', 'product'], how='left')

# Fill missing values (if any) with the global mean
test['num_sold'] = test['mean_num_sold'].fillna(train['num_sold'].mean())

# Prepare submission
submission = test[['id', 'num_sold']]
submission.to_csv('mean_baseline_submission.csv', index=False)


In [None]:
#visualize the mean_sales 
print(mean_sales)

# **LightGBM**

In [None]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

#replace all NaN values in the dataset with the value before it
train.fillna(method='ffill', inplace=True)

# Feature engineering
train['date'] = pd.to_datetime(train['date'])
train['day_of_week'] = train['date'].dt.dayofweek
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

test['date'] = pd.to_datetime(test['date'])
test['day_of_week'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year

# Encode categorical variables
train['country'] = train['country'].astype('category')
train['store'] = train['store'].astype('category')
train['product'] = train['product'].astype('category')

test['country'] = test['country'].astype('category')
test['store'] = test['store'].astype('category')
test['product'] = test['product'].astype('category')

# Define features and target
features = ['country', 'store', 'product', 'day_of_week', 'month', 'year']
X = train[features]
y = train['num_sold']
X_test = test[features]

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)

# Train LightGBM model
train_data = lgb.Dataset(X_train, label=y_train)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

params = {
    'objective': 'regression',
    'metric': 'mape',
    'boosting_type': 'gbdt',
    'num_leaves': 1000,
    'learning_rate': 0.1,
    'feature_fraction': 1,
}

model = lgb.train(params, train_data, valid_sets=[val_data])

# Predict and evaluate
y_pred = model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, y_pred)
print(f'MAPE: {mape:.4f}')

# Predict on test set
test['num_sold'] = model.predict(X_test)

# Prepare submission
submission = test[['id', 'num_sold']]
submission.to_csv('lgb_submission.csv', index=False)


# **XGBoost**

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Replace all NaN values in the dataset with the value before it
train.fillna(method='ffill', inplace=True)
train.fillna(method='bfill', inplace=True)

# Feature engineering
train['date'] = pd.to_datetime(train['date'])
train['day_of_week'] = train['date'].dt.dayofweek
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

test['date'] = pd.to_datetime(test['date'])
test['day_of_week'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year

# Encode categorical variables
train['country'] = train['country'].astype('category')
train['store'] = train['store'].astype('category')
train['product'] = train['product'].astype('category')

test['country'] = test['country'].astype('category')
test['store'] = test['store'].astype('category')
test['product'] = test['product'].astype('category')

# Define features and target
features = ['country', 'store', 'product', 'day_of_week', 'month', 'year']
X = train[features]
y = train['num_sold']
X_test = test[features]

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)

# Convert data to DMatrix (XGBoost's optimized data structure)
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)
dtest = xgb.DMatrix(X_test, enable_categorical=True)

# Set XGBoost parameters
params = {
    'objective': 'reg:squarederror',  # Regression objective
    'eval_metric': 'mape',           # Evaluation metric (XGBoost doesn't have MAPE natively; calculate later)
    'learning_rate': 0.005,
    'max_depth': 40,
    'colsample_bytree': 1,
    'subsample': 1,
    'n_estimators': 100
}

# Train the XGBoost model
watchlist = [(dtrain, 'train'), (dval, 'valid')]
model = xgb.train(params, dtrain, num_boost_round=2000, evals=watchlist, early_stopping_rounds=50)

# Predict and evaluate
y_pred = model.predict(dval)
mape = mean_absolute_percentage_error(y_val, y_pred)
print(f'MAPE: {mape:.4f}')

# Predict on test set
test['num_sold'] = model.predict(dtest)

# Prepare submission
submission = test[['id', 'num_sold']]
submission.to_csv('xgb2_submission.csv', index=False)


In [None]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Replace all NaN values in the dataset with the value before it
train.fillna(method='ffill', inplace=True)
train.fillna(method='bfill', inplace=True)

# Feature engineering
train['date'] = pd.to_datetime(train['date'])
train['day_of_week'] = train['date'].dt.dayofweek
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

test['date'] = pd.to_datetime(test['date'])
test['day_of_week'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year

# Encode categorical variables (CatBoost handles categorical variables natively)
categorical_features = ['country', 'store', 'product']
train[categorical_features] = train[categorical_features].astype('category')
test[categorical_features] = test[categorical_features].astype('category')

# Define features and target
features = ['country', 'store', 'product', 'day_of_week', 'month', 'year']
X = train[features]
y = train['num_sold']
X_test = test[features]

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)

# Initialize CatBoost model
model = CatBoostRegressor(
    iterations=2000,
    learning_rate=0.01,
    depth=15,
    loss_function='MAPE',
    eval_metric='MAPE',
    cat_features=categorical_features,
    early_stopping_rounds=50,
    verbose=100
)

# Train the model
model.fit(X_train, y_train, eval_set=(X_val, y_val))

# Predict and evaluate
y_pred = model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, y_pred)
print(f'MAPE: {mape:.4f}')

# Predict on test set
test['num_sold'] = model.predict(X_test)

# Prepare submission
submission = test[['id', 'num_sold']]
submission.to_csv('catboost_submission.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_percentage_error
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization


# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Replace NaN values
train.fillna(method='ffill', inplace=True)
train.fillna(method='bfill', inplace=True)

# Feature engineering
train['date'] = pd.to_datetime(train['date'])
train['day_of_week'] = train['date'].dt.dayofweek
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

test['date'] = pd.to_datetime(test['date'])
test['day_of_week'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year

# Define features and target
features = ['day_of_week', 'month', 'year']
categorical_features = ['country', 'store', 'product']

# One-hot encode categorical features
train = pd.get_dummies(train, columns=categorical_features)
test = pd.get_dummies(test, columns=categorical_features)

X = train[features + list(test.columns.difference(features))]
y = train['num_sold']
X_test = test[features + list(test.columns.difference(features))]

X = X.drop(columns=['date'], errors='ignore')
X_test = X_test.drop(columns=['date'], errors='ignore')

# Ensure same columns in train and test
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Build the neural network
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.2),
    BatchNormalization(),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')  # Linear activation for regression
])

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='mean_squared_error',
              metrics=['mae'])

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    verbose=2
)

# Evaluate the model
y_pred = model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, y_pred)
print(f'MAPE: {mape:.4f}')

# Predict on test set
test['num_sold'] = model.predict(X_test_scaled)

# Prepare submission
submission = test[['id', 'num_sold']]
submission.to_csv('nn_submission.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Replace NaN values
train.fillna(method='ffill', inplace=True)
train.fillna(method='bfill', inplace=True)

# Feature engineering
train['date'] = pd.to_datetime(train['date'])
train['day_of_week'] = train['date'].dt.dayofweek
train['month'] = train['date'].dt.month
train['year'] = train['date'].dt.year

test['date'] = pd.to_datetime(test['date'])
test['day_of_week'] = test['date'].dt.dayofweek
test['month'] = test['date'].dt.month
test['year'] = test['date'].dt.year

# Define features and target
features = ['day_of_week', 'month', 'year']
categorical_features = ['country', 'store', 'product']

# One-hot encode categorical features
train = pd.get_dummies(train, columns=categorical_features)
test = pd.get_dummies(test, columns=categorical_features)

X = train[features + list(test.columns.difference(features))]
y = train['num_sold']
X_test = test[features + list(test.columns.difference(features))]

X = X.drop(columns=['date'], errors='ignore')
X_test = X_test.drop(columns=['date'], errors='ignore')

# Ensure same columns in train and test
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# Scale features and target
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

y_scaler = MinMaxScaler()
y_scaled = y_scaler.fit_transform(y.values.reshape(-1, 1))

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

# Create data loaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Define the model
class NeuralNetwork(nn.Module):
    def __init__(self, input_dim):
        super(NeuralNetwork, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.BatchNorm1d(128),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        return self.model(x)

# Initialize model
input_dim = X_train_tensor.shape[1]
model = NeuralNetwork(input_dim).to(device)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Training loop
epochs = 20
for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch).squeeze()
        loss = criterion(predictions, y_batch.squeeze())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)

    # Validation loop
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            predictions = model(X_batch).squeeze()
            loss = criterion(predictions, y_batch.squeeze())
            val_loss += loss.item()
    val_loss /= len(val_loader)

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

# Evaluate the model
model.eval()
with torch.no_grad():
    y_val_pred = model(X_val_tensor).cpu().numpy()
    y_val_original = y_scaler.inverse_transform(y_val_tensor.cpu().numpy())
    y_val_pred_original = y_scaler.inverse_transform(y_val_pred)
    mape = mean_absolute_percentage_error(y_val_original, y_val_pred_original)
    print(f"Validation MAPE: {mape:.4f}")

# Predict on test set
with torch.no_grad():
    test_predictions = model(X_test_tensor).cpu().numpy()
    test['num_sold'] = y_scaler.inverse_transform(test_predictions)

# Prepare submission
submission = test[['id', 'num_sold']]
submission.to_csv('pytorch_submission.csv', index=False)


# **RNN**

## Data preprocessing

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Set the id to index and drop NaNs in target
train.set_index("id", inplace=True)
test.set_index("id", inplace=True)
train.dropna(subset=["num_sold"], inplace=True)

# Feature engineering
def process_date_features(df):
    df["date"] = pd.to_datetime(df["date"])
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['quarter'] = df['date'].dt.quarter
    df['day_of_week'] = df['date'].dt.day_name()

    # Cyclical features
    df['day_sin']    = np.sin(2 * np.pi * df['day'] / 31)
    df['day_cos']    = np.cos(2 * np.pi * df['day'] / 31)
    df['month_sin']  = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos']  = np.cos(2 * np.pi * df['month'] / 12)
    df['quarter_sin'] = np.sin(2 * np.pi * df['quarter'] / 4)
    df['quarter_cos'] = np.cos(2 * np.pi * df['quarter'] / 4)

    df.drop("date", axis=1, inplace=True)
    return df

train = process_date_features(train)
test = process_date_features(test)

# Separate features and target
X = train.drop(columns=["num_sold"])
y = np.log1p(train["num_sold"])  # Log transform target
X_test = test[X.columns]

# Encode categorical features
cat_cols = ["country", "store", "product", "day_of_week"]
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col], X_test[col]], axis=0)
    le.fit(combined)
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Scale target
y_scaler = MinMaxScaler()
y_scaled = y_scaler.fit_transform(y.values.reshape(-1, 1))

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)


## Model

In [2]:
class RNNModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers=1):
        super(RNNModel, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True, dropout=0.2)
        self.fc1 = nn.Linear(hidden_dim, 64)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, output_dim)

    def forward(self, x):
        out, _ = self.lstm(x)
        out = out[:, -1, :]  # Take the output of the last time step
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu(out)
        out = self.fc3(out)
        return out


In [4]:
class DeepRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout_rate=0.2):
        super(DeepRNN, self).__init__()
        self.rnn = nn.LSTM(
            input_dim, 
            hidden_dim, 
            num_layers=num_layers, 
            batch_first=True, 
            dropout=dropout_rate
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, output_dim)
        )
    
    def forward(self, x):
        out, _ = self.rnn(x)  # out: (batch_size, seq_len, hidden_dim)
        out = out[:, -1, :]  # Take the last time step's output
        out = self.fc(out)
        return out


In [11]:
class ComplexRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout_rate=0.3):
        super(ComplexRNN, self).__init__()
        self.lstm = nn.LSTM(
            input_dim, 
            hidden_dim, 
            num_layers=num_layers, 
            batch_first=True, 
            dropout=dropout_rate, 
            bidirectional=True
        )
        self.attention = nn.Linear(hidden_dim * 2, 1)  # Attention layer
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(dropout_rate),
            nn.Linear(64, output_dim)
        )
    
    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # Output shape: (batch_size, seq_len, hidden_dim*2)
        
        # Attention mechanism
        attention_weights = torch.softmax(self.attention(lstm_out), dim=1)  # Shape: (batch_size, seq_len, 1)
        context_vector = torch.sum(attention_weights * lstm_out, dim=1)  # Shape: (batch_size, hidden_dim*2)
        
        # Fully connected layers
        out = self.fc(context_vector)  # Shape: (batch_size, output_dim)
        return out


In [14]:
class DeepComplexRNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, num_layers, dropout_rate=0.3):
        super(DeepComplexRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

        # Stacked Bidirectional LSTMs
        self.rnns = nn.ModuleList([
            nn.LSTM(
                input_dim if i == 0 else hidden_dim * 2,  # Input size for the first RNN is input_dim, for others it's hidden_dim*2
                hidden_dim,
                num_layers=1,  # Single layer for each RNN in the stack
                batch_first=True,
                dropout=dropout_rate,
                bidirectional=True
            )
            for i in range(num_layers)
        ])

        # Attention Mechanism
        self.attention = nn.Linear(hidden_dim * 2, 1)

        # Fully Connected Layers
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(dropout_rate),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(dropout_rate),
            nn.Linear(128, output_dim)
        )
    
    def forward(self, x):
        for rnn in self.rnns:
            x, _ = rnn(x)  # Pass through each LSTM layer in the stack

        # Attention Mechanism
        attention_weights = torch.softmax(self.attention(x), dim=1)  # Shape: (batch_size, seq_len, 1)
        context_vector = torch.sum(attention_weights * x, dim=1)  # Shape: (batch_size, hidden_dim*2)

        # Fully connected layers
        out = self.fc(context_vector)  # Shape: (batch_size, output_dim)
        return out


## Training

In [None]:
# Reshape data for RNN
X_train_rnn = X_train_tensor.unsqueeze(1)  # Add sequence dimension (seq_len=1)
X_val_rnn = X_val_tensor.unsqueeze(1)
X_test_rnn = X_test_tensor.unsqueeze(1)

# Create DataLoaders
train_dataset = TensorDataset(X_train_rnn, y_train_tensor)
val_dataset = TensorDataset(X_val_rnn, y_val_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

# Initialize model
input_dim = X_train_rnn.shape[2]  # Number of features
hidden_dim = 128
output_dim = 1
num_layers = 2
model = DeepComplexRNN(input_dim, hidden_dim, output_dim, num_layers).to(device)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

best_val_loss = float('inf')
patience = 5
trigger_times = 0

for epoch in range(epochs):
    model.train()
    train_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        predictions = model(X_batch).squeeze()
        loss = criterion(predictions, y_batch.squeeze())
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            predictions = model(X_batch).squeeze()
            loss = criterion(predictions, y_batch.squeeze())
            val_loss += loss.item()
    val_loss /= len(val_loader)

    #scheduler.step(val_loss)

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}")

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        trigger_times = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        trigger_times += 1
        if trigger_times >= patience:
            print("Early stopping!")
            break



## Submission 

In [None]:
# Evaluate the model
model.eval()
with torch.no_grad():
    y_val_pred = model(X_val_rnn).cpu().numpy()
    y_val_original = y_scaler.inverse_transform(y_val_tensor.cpu().numpy())
    y_val_pred_original = y_scaler.inverse_transform(y_val_pred)
    mape = mean_absolute_percentage_error(y_val_original, y_val_pred_original)
    print(f"Validation MAPE: {mape:.4f}")

# Predict on the test set
with torch.no_grad():
    test_predictions = model(X_test_rnn).cpu().numpy()
    test['num_sold'] = y_scaler.inverse_transform(test_predictions)

# Reset index to make 'id' a column again
submission = test.reset_index()[['id', 'num_sold']]
submission.to_csv('dcrnn_submission.csv', index=False)



# **Transformer + LSTM**

## Data Preprocessing

In [2]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_absolute_percentage_error

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Set the id to index and drop NaNs in target
train.set_index("id", inplace=True)
test.set_index("id", inplace=True)
train.dropna(subset=["num_sold"], inplace=True)

# Feature engineering
def process_date_features(df):
    df["date"] = pd.to_datetime(df["date"])
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['quarter'] = df['date'].dt.quarter
    df['day_of_week'] = df['date'].dt.day_name()

    # Cyclical features
    df['day_sin']    = np.sin(2 * np.pi * df['day'] / 31)
    df['day_cos']    = np.cos(2 * np.pi * df['day'] / 31)
    df['month_sin']  = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos']  = np.cos(2 * np.pi * df['month'] / 12)
    df['quarter_sin'] = np.sin(2 * np.pi * df['quarter'] / 4)
    df['quarter_cos'] = np.cos(2 * np.pi * df['quarter'] / 4)

    return df

# Retain the 'date' column for splitting
train = process_date_features(train)
test = process_date_features(test)

# Separate features and target
X = train.drop(columns=["num_sold"])  # Retain 'date' column for now
y = np.log1p(train["num_sold"])  # Log transform target
X_test = test[X.columns]

# Encode categorical features
cat_cols = ["country", "store", "product", "day_of_week"]
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col], X_test[col]], axis=0)
    le.fit(combined)
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Split based on date
train_data = X[X['date'] < '2016-01-01']
val_data = X[X['date'] >= '2016-01-01']
train_target = y[X['date'] < '2016-01-01']
val_target = y[X['date'] >= '2016-01-01']

# Drop the 'date' column after splitting
train_data = train_data.drop(columns=["date"])
val_data = val_data.drop(columns=["date"])

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data)
X_val_scaled = scaler.transform(val_data)
X_test_scaled = scaler.transform(X_test.drop(columns=["date"]))

# Scale target
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(train_target.values.reshape(-1, 1))
y_val_scaled = y_scaler.transform(val_target.values.reshape(-1, 1))

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train_scaled, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val_scaled, dtype=torch.float32).to(device)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(device)

print(f"Train shape: {X_train_tensor.shape}, Validation shape: {X_val_tensor.shape}")


Using device: cuda
Train shape: torch.Size([189492, 13]), Validation shape: torch.Size([31767, 13])


In [23]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_percentage_error

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Set the id to index and drop NaNs in target
train.set_index("id", inplace=True)
test.set_index("id", inplace=True)
train.dropna(subset=["num_sold"], inplace=True)

# Feature engineering
def process_date_features(df):
    df["date"] = pd.to_datetime(df["date"])
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['quarter'] = df['date'].dt.quarter
    df['day_of_week'] = df['date'].dt.day_name()

    # Cyclical features
    df['day_sin']    = np.sin(2 * np.pi * df['day'] / 31)
    df['day_cos']    = np.cos(2 * np.pi * df['day'] / 31)
    df['month_sin']  = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos']  = np.cos(2 * np.pi * df['month'] / 12)
    df['quarter_sin'] = np.sin(2 * np.pi * df['quarter'] / 4)
    df['quarter_cos'] = np.cos(2 * np.pi * df['quarter'] / 4)

    return df

# Retain the 'date' column for splitting
train = process_date_features(train)
test = process_date_features(test)
print("shape of test",test.shape)

# Separate features and target
X = train.drop(columns=["num_sold"])  # Retain 'date' column for now
y = np.log1p(train["num_sold"])  # Log transform target
X_test = test[X.columns]
print("shape of X_test",X_test.shape)

# Encode categorical features using LabelEncoder and OneHotEncoder for high cardinality features
cat_cols = ["country", "store", "product", "day_of_week"]

# Apply LabelEncoder for smaller categorical variables
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col], X_test[col]], axis=0)
    le.fit(combined)
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Add lag features for the previous week's sales (example)
train['num_sold_lag_1'] = train.groupby(['country', 'store', 'product'])['num_sold'].shift(1)
train['num_sold_lag_2'] = train.groupby(['country', 'store', 'product'])['num_sold'].shift(2)
train['num_sold_lag_3'] = train.groupby(['country', 'store', 'product'])['num_sold'].shift(3)

# Add holiday features (for simplicity, mark major holidays as binary)
train['holiday'] = train['date'].apply(lambda x: 1 if x.month == 12 and x.day == 25 else 0)  # Example for Christmas

# Fill NaN values due to lag creation
train.fillna(0, inplace=True)

# Split based on date
train_data = X[X['date'] < '2016-01-01']
val_data = X[X['date'] >= '2016-01-01']
train_target = y[X['date'] < '2016-01-01']
val_target = y[X['date'] >= '2016-01-01']

# Drop the 'date' column after splitting
train_data = train_data.drop(columns=["date"])
val_data = val_data.drop(columns=["date"])

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data)
X_val_scaled = scaler.transform(val_data)
X_test_scaled = scaler.transform(X_test.drop(columns=["date"]))
print("shape of X_test_scaled",X_test_scaled.shape)

# Scale target
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(train_target.values.reshape(-1, 1))
y_val_scaled = y_scaler.transform(val_target.values.reshape(-1, 1))

# Add seq_len for sequence-based models
seq_len = 31  # Example: 7 days of historical data

def create_sequences_test(data, seq_len):
    """
    Create sequences for test data, ensuring alignment with original test rows.
    Pads the beginning of the data to ensure all rows are covered.
    Args:
        data (np.ndarray): Feature data.
        seq_len (int): Length of each sequence.
    Returns:
        np.ndarray: Sequences (X).
    """
    padded_data = np.pad(data, ((seq_len - 1, 0), (0, 0)), mode='constant', constant_values=0)
    sequences = []
    for i in range(len(data)):  # Ensure all original rows are included
        seq_x = padded_data[i:i+seq_len]
        sequences.append(seq_x)
    return np.array(sequences)


def create_sequences(data, target, seq_len):
    """
    Create sequences of length seq_len for LSTM/Transformer training.
    Args:
        data (np.ndarray): Feature data.
        target (np.ndarray): Target data.
        seq_len (int): Length of each sequence.
    Returns:
        tuple: Sequences (X) and corresponding targets (y).
    """
    sequences = []
    targets = []
    for i in range(len(data) - seq_len + 1):
        seq_x = data[i:i+seq_len]
        seq_y = target[i+seq_len-1]  # Target corresponds to the last step in the sequence
        sequences.append(seq_x)
        targets.append(seq_y)
    return np.array(sequences), np.array(targets)

# Apply sliding window on training and validation data
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, seq_len)
X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val_scaled, seq_len)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train_seq, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val_seq, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val_seq, dtype=torch.float32).to(device)

# # Test set: Prepare sequences, but no target (predict future)
# X_test_seq, _ = create_sequences(X_test_scaled, np.zeros(len(X_test_scaled)), seq_len)
# print("shape of X_test_seq",X_test_seq.shape)
# X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32).to(device)

X_test_seq = create_sequences_test(X_test_scaled, seq_len)
print("shape of X_test_seq after padding:", X_test_seq.shape)

# Convert to PyTorch tensor
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32).to(device)



print(f"Train shape: {X_train_tensor.shape}, Validation shape: {X_val_tensor.shape}, Test shape: {X_test_tensor.shape}")



Using device: cuda
shape of test (98550, 14)
shape of X_test (98550, 14)
shape of X_test_scaled (98550, 13)
shape of X_test_seq after padding: (98550, 31, 13)
Train shape: torch.Size([189462, 31, 13]), Validation shape: torch.Size([31737, 31, 13]), Test shape: torch.Size([98550, 31, 13])


## Model

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error
import time

# Define Transformer + LSTM Model
class TransformerLSTMModel(nn.Module):
    def __init__(self, input_dim, lstm_hidden_dim, transformer_hidden_dim, num_heads, output_dim):
        super(TransformerLSTMModel, self).__init__()

        # LSTM Layer
        self.lstm = nn.LSTM(input_dim, lstm_hidden_dim, batch_first=True)

        # Transformer Layer
        self.transformer = nn.Transformer(
            d_model=lstm_hidden_dim,
            nhead=num_heads,
            num_encoder_layers=1,
            dim_feedforward=transformer_hidden_dim
        )
        
        # Fully connected layers
        self.fc1 = nn.Linear(lstm_hidden_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, x):
        # LSTM layer
        lstm_out, _ = self.lstm(x)

        # Transformer expects [seq_len, batch_size, features], so transpose the output
        lstm_out = lstm_out.permute(1, 0, 2)  # Change shape to [batch_size, seq_len, features]

        # Transformer layer
        transformer_out = self.transformer(lstm_out, lstm_out)

        # Take the output of the last time step
        transformer_out = transformer_out[-1, :, :]

        # Fully connected layers
        x = torch.relu(self.fc1(transformer_out))
        x = self.fc2(x)
        return x


## Training

In [25]:
# Training Function
def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=1e-4):
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets.squeeze())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), targets.squeeze())
                val_loss += loss.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Training Loss: {running_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader):.4f}")
    
    return model

# Prepare DataLoader
def create_dataloader(X_train, y_train, X_val, y_val, batch_size=64):
    train_data = TensorDataset(X_train, y_train)
    val_data = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader

# Prepare the final prediction function
def make_predictions(model, X_test):
    model.eval()
    with torch.no_grad():
        predictions = model(X_test)
    return predictions.squeeze().cpu().numpy()

# Instantiate model
input_dim = X_train_tensor.shape[-1]  # Number of features
lstm_hidden_dim = 128
transformer_hidden_dim = 256
num_heads = 4
output_dim = 1  # Predicting num_sold

model = TransformerLSTMModel(input_dim, lstm_hidden_dim, transformer_hidden_dim, num_heads, output_dim).to(device)

# Create DataLoader for training and validation
train_loader, val_loader = create_dataloader(X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor)

# Train the model
model = train_model(model, train_loader, val_loader, num_epochs=10)



Epoch [1/10], Training Loss: 0.0089, Validation Loss: 0.0050
Epoch [2/10], Training Loss: 0.0019, Validation Loss: 0.0016
Epoch [3/10], Training Loss: 0.0018, Validation Loss: 0.0017
Epoch [4/10], Training Loss: 0.0010, Validation Loss: 0.0041
Epoch [5/10], Training Loss: 0.0010, Validation Loss: 0.0010
Epoch [6/10], Training Loss: 0.0008, Validation Loss: 0.0011
Epoch [7/10], Training Loss: 0.0005, Validation Loss: 0.0007
Epoch [8/10], Training Loss: 0.0006, Validation Loss: 0.0009
Epoch [9/10], Training Loss: 0.0005, Validation Loss: 0.0010
Epoch [10/10], Training Loss: 0.0004, Validation Loss: 0.0009


## Submission

In [26]:
from torch.utils.data import DataLoader, TensorDataset

# Update make_predictions to handle batches
def make_predictions(model, X_test, batch_size=1024):
    model.eval()
    dataloader = DataLoader(TensorDataset(X_test), batch_size=batch_size, shuffle=False)
    all_predictions = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch[0].to("cuda")
            outputs = model(inputs)
            all_predictions.append(outputs.cpu().numpy())
    return np.concatenate(all_predictions, axis=0)

# Generate Predictions for Test Set in batches
predictions = make_predictions(model, X_test_tensor, batch_size=1024)

# Reverse scaling and log transformation
predictions = y_scaler.inverse_transform(predictions.reshape(-1, 1))

# Check size mismatch and handle it
if len(predictions) < len(test):
    print(f"Mismatch detected: Test index ({len(test)}) and predictions ({len(predictions)})")
    # Truncate test index if it exceeds predictions
    test = test.iloc[:len(predictions)]
elif len(predictions) > len(test):
    raise ValueError("More predictions than test entries—investigate preprocessing steps.")

# Prepare submission file
submission = pd.DataFrame({
    'id': test.index,
    'num_sold': np.expm1(predictions.flatten())  # Reverse log transformation
})

submission.to_csv('submission2.csv', index=False)
print("Submission file created successfully!")



Submission file created successfully!


# **CNN + LSTM + Attention**

## Data Preprocessing

In [20]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import mean_absolute_percentage_error

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load data
train = pd.read_csv('dataset/train.csv')
test = pd.read_csv('dataset/test.csv')

# Set the id to index and drop NaNs in target
train.set_index("id", inplace=True)
test.set_index("id", inplace=True)
train.dropna(subset=["num_sold"], inplace=True)

# Feature engineering
def process_date_features(df):
    df["date"] = pd.to_datetime(df["date"])
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day
    df['quarter'] = df['date'].dt.quarter
    df['day_of_week'] = df['date'].dt.day_name()

    # Cyclical features
    df['day_sin']    = np.sin(2 * np.pi * df['day'] / 31)
    df['day_cos']    = np.cos(2 * np.pi * df['day'] / 31)
    df['month_sin']  = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos']  = np.cos(2 * np.pi * df['month'] / 12)
    df['quarter_sin'] = np.sin(2 * np.pi * df['quarter'] / 4)
    df['quarter_cos'] = np.cos(2 * np.pi * df['quarter'] / 4)

    return df

# Retain the 'date' column for splitting
train = process_date_features(train)
test = process_date_features(test)
print("shape of test",test.shape)

# Separate features and target
X = train.drop(columns=["num_sold"])  # Retain 'date' column for now
y = np.log1p(train["num_sold"])  # Log transform target
X_test = test[X.columns]
print("shape of X_test",X_test.shape)

# Encode categorical features using LabelEncoder and OneHotEncoder for high cardinality features
cat_cols = ["country", "store", "product", "day_of_week"]

# Apply LabelEncoder for smaller categorical variables
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([X[col], X_test[col]], axis=0)
    le.fit(combined)
    X[col] = le.transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Add lag features for the previous week's sales (example)
train['num_sold_lag_1'] = train.groupby(['country', 'store', 'product'])['num_sold'].shift(1)
train['num_sold_lag_2'] = train.groupby(['country', 'store', 'product'])['num_sold'].shift(2)
train['num_sold_lag_3'] = train.groupby(['country', 'store', 'product'])['num_sold'].shift(3)

# Add holiday features (for simplicity, mark major holidays as binary)
train['holiday'] = train['date'].apply(lambda x: 1 if x.month == 12 and x.day == 25 else 0)  # Example for Christmas

# Fill NaN values due to lag creation
train.fillna(0, inplace=True)

# Split based on date
train_data = X[X['date'] < '2016-01-01']
val_data = X[X['date'] >= '2016-01-01']
train_target = y[X['date'] < '2016-01-01']
val_target = y[X['date'] >= '2016-01-01']

# Drop the 'date' column after splitting
train_data = train_data.drop(columns=["date"])
val_data = val_data.drop(columns=["date"])

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_data)
X_val_scaled = scaler.transform(val_data)
X_test_scaled = scaler.transform(X_test.drop(columns=["date"]))
print("shape of X_test_scaled",X_test_scaled.shape)

# Scale target
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(train_target.values.reshape(-1, 1))
y_val_scaled = y_scaler.transform(val_target.values.reshape(-1, 1))

# Add seq_len for sequence-based models
seq_len = 31  # Example: 7 days of historical data

def create_sequences_test(data, seq_len):
    """
    Create sequences for test data, ensuring alignment with original test rows.
    Pads the beginning of the data to ensure all rows are covered.
    Args:
        data (np.ndarray): Feature data.
        seq_len (int): Length of each sequence.
    Returns:
        np.ndarray: Sequences (X).
    """
    padded_data = np.pad(data, ((seq_len - 1, 0), (0, 0)), mode='constant', constant_values=0)
    sequences = []
    for i in range(len(data)):  # Ensure all original rows are included
        seq_x = padded_data[i:i+seq_len]
        sequences.append(seq_x)
    return np.array(sequences)


def create_sequences(data, target, seq_len):
    """
    Create sequences of length seq_len for LSTM/Transformer training.
    Args:
        data (np.ndarray): Feature data.
        target (np.ndarray): Target data.
        seq_len (int): Length of each sequence.
    Returns:
        tuple: Sequences (X) and corresponding targets (y).
    """
    sequences = []
    targets = []
    for i in range(len(data) - seq_len + 1):
        seq_x = data[i:i+seq_len]
        seq_y = target[i+seq_len-1]  # Target corresponds to the last step in the sequence
        sequences.append(seq_x)
        targets.append(seq_y)
    return np.array(sequences), np.array(targets)

# Apply sliding window on training and validation data
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_scaled, seq_len)
X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val_scaled, seq_len)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train_seq, dtype=torch.float32).to(device)
X_val_tensor = torch.tensor(X_val_seq, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val_seq, dtype=torch.float32).to(device)

# # Test set: Prepare sequences, but no target (predict future)
# X_test_seq, _ = create_sequences(X_test_scaled, np.zeros(len(X_test_scaled)), seq_len)
# print("shape of X_test_seq",X_test_seq.shape)
# X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32).to(device)

X_test_seq = create_sequences_test(X_test_scaled, seq_len)
print("shape of X_test_seq after padding:", X_test_seq.shape)

# Convert to PyTorch tensor
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32).to(device)



print(f"Train shape: {X_train_tensor.shape}, Validation shape: {X_val_tensor.shape}, Test shape: {X_test_tensor.shape}")



Using device: cuda
shape of test (98550, 14)
shape of X_test (98550, 14)
shape of X_test_scaled (98550, 13)
shape of X_test_seq after padding: (98550, 31, 13)
Train shape: torch.Size([189462, 31, 13]), Validation shape: torch.Size([31737, 31, 13]), Test shape: torch.Size([98550, 31, 13])


## Model

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error

# Define CNN + LSTM + Attention Model
class CNN_LSTM_Attention(nn.Module):
    def __init__(self, input_dim, lstm_hidden_dim, cnn_channels, attention_heads, output_dim):
        super(CNN_LSTM_Attention, self).__init__()

        # CNN Layer: Apply 1D convolutions to capture local patterns
        self.cnn1 = nn.Conv1d(input_dim, cnn_channels, kernel_size=3, padding=1)
        self.cnn2 = nn.Conv1d(cnn_channels, cnn_channels, kernel_size=3, padding=1)

        # LSTM Layer: Capture long-term dependencies
        self.lstm = nn.LSTM(cnn_channels, lstm_hidden_dim, batch_first=True)

        # Attention Mechanism: Focus on important time steps
        self.attention = nn.MultiheadAttention(embed_dim=lstm_hidden_dim, num_heads=attention_heads)

        # Fully connected layers
        self.fc1 = nn.Linear(lstm_hidden_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)

    def forward(self, x):
        # Apply CNN layers to extract local features
        x = x.permute(0, 2, 1)  # [batch_size, seq_len, features] -> [batch_size, features, seq_len]
        x = torch.relu(self.cnn1(x))
        x = torch.relu(self.cnn2(x))
        
        # Return to [batch_size, seq_len, features] for LSTM
        x = x.permute(0, 2, 1)

        # Apply LSTM layer
        lstm_out, _ = self.lstm(x)

        # Apply attention mechanism (LSTM output to itself)
        attn_output, _ = self.attention(lstm_out, lstm_out, lstm_out)

        # Use output of the last time step
        output = attn_output[:, -1, :]

        # Fully connected layers for prediction
        x = torch.relu(self.fc1(output))
        x = self.fc2(x)
        return x

## Training

In [17]:
# Define MAPE Loss function
def mape_loss(y_true, y_pred, eps=1e-8):
    return torch.mean(torch.abs((y_true - y_pred) / (y_true + eps))) * 100




# Training Function
def train_model(model, train_loader, val_loader, num_epochs=10, learning_rate=1e-4):
    criterion = nn.MSELoss()  # MAPE loss
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(targets.squeeze(), outputs.squeeze())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

        # Validation Phase
        model.eval()
        val_loss = 0
        val_mape = 0
        with torch.no_grad():
            for inputs, targets in val_loader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(targets.squeeze(), outputs.squeeze())
                mape = mape_loss(targets.squeeze(), outputs.squeeze())
                val_loss += loss.item()
                val_mape += mape.item()

        print(f"Epoch [{epoch+1}/{num_epochs}], Training loss: {running_loss/len(train_loader):.4f}, Validation loss: {val_loss/len(val_loader):.4f}, Validation MAPE: {val_mape/len(val_loader):.4f}")
    
    return model

# Prepare DataLoader
def create_dataloader(X_train, y_train, X_val, y_val, batch_size=64):
    train_data = TensorDataset(X_train, y_train)
    val_data = TensorDataset(X_val, y_val)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader

# Prediction Function
def make_predictions(model, X_test):
    model.eval()
    X_test = X_test.to(device)
    with torch.no_grad():
        predictions = model(X_test)
    return predictions.squeeze().cpu().numpy()

# Train and Evaluate Model
# Assuming X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor are already defined

# Instantiate Model
input_dim = X_train_tensor.shape[-1]
lstm_hidden_dim = 128
cnn_channels = 64
attention_heads = 4
output_dim = 1

model = CNN_LSTM_Attention(input_dim, lstm_hidden_dim, cnn_channels, attention_heads, output_dim).to(device)

# Create DataLoader for training and validation
train_loader, val_loader = create_dataloader(X_train_tensor, y_train_tensor, X_val_tensor, y_val_tensor)

# Train the Model
model = train_model(model, train_loader, val_loader, num_epochs=20)

Epoch [1/20], Training loss: 0.0493, Validation loss: 0.1325, Validation MAPE: 21552273.2301
Epoch [2/20], Training loss: 0.0046, Validation loss: 0.0143, Validation MAPE: 14469765.6324
Epoch [3/20], Training loss: 0.0014, Validation loss: 0.0034, Validation MAPE: 7049178.8403
Epoch [4/20], Training loss: 0.0010, Validation loss: 0.0014, Validation MAPE: 3961237.0155
Epoch [5/20], Training loss: 0.0008, Validation loss: 0.0015, Validation MAPE: 2775019.9610
Epoch [6/20], Training loss: 0.0006, Validation loss: 0.0009, Validation MAPE: 3140964.4561
Epoch [7/20], Training loss: 0.0005, Validation loss: 0.0008, Validation MAPE: 1656778.8683
Epoch [8/20], Training loss: 0.0004, Validation loss: 0.0008, Validation MAPE: 1444897.0309
Epoch [9/20], Training loss: 0.0004, Validation loss: 0.0006, Validation MAPE: 1421975.9203
Epoch [10/20], Training loss: 0.0003, Validation loss: 0.0007, Validation MAPE: 1533128.5149
Epoch [11/20], Training loss: 0.0003, Validation loss: 0.0006, Validation MAP

## Submission

In [18]:
from torch.utils.data import DataLoader, TensorDataset

# Update make_predictions to handle batches
def make_predictions(model, X_test, batch_size=1024):
    model.eval()
    dataloader = DataLoader(TensorDataset(X_test), batch_size=batch_size, shuffle=False)
    all_predictions = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch[0].to("cuda")
            outputs = model(inputs)
            all_predictions.append(outputs.cpu().numpy())
    return np.concatenate(all_predictions, axis=0)

# Generate Predictions for Test Set in batches
predictions = make_predictions(model, X_test_tensor, batch_size=1024)

# Reverse scaling and log transformation
predictions = y_scaler.inverse_transform(predictions.reshape(-1, 1))

# Check size mismatch and handle it
if len(predictions) < len(test):
    print(f"Mismatch detected: Test index ({len(test)}) and predictions ({len(predictions)})")
    # Truncate test index if it exceeds predictions
    test = test.iloc[:len(predictions)]
elif len(predictions) > len(test):
    raise ValueError("More predictions than test entries—investigate preprocessing steps.")

# Prepare submission file
submission = pd.DataFrame({
    'id': test.index,
    'num_sold': np.expm1(predictions.flatten())  # Reverse log transformation
})

submission.to_csv('submission5.csv', index=False)
print("Submission file created successfully!")



Submission file created successfully!
