**Downloading Kaggle data sets directly into Colab**

Install the kaggle python library

In [3]:
#! pip install kaggle



Mount Google Drive so that your Kaggle API credentials can be stored there and reused in future sessions.

In [5]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


Make a directory for the Kaggle credentials in the home directory of the temporary Colab instance.

Download your kaggle API key (.json file). You can do this by going to your kaggle account page and clicking 'Create new API token' under the API section.

In [6]:
! mkdir ~/.kaggle

Upload the json file to Google Drive and then copy to the temporary location.

In [7]:
# Copy the API token from Google Drive into ~/.kaggle/.
# The source path lives under /content/drive, so Drive must be mounted before this cell runs.
!cp /content/drive/MyDrive/ColabNotebooks/kaggle_API_credentials/kaggle.json ~/.kaggle/kaggle.json

Change the file permissions to read/write for the owner only.

In [8]:
# Mode 600 = read/write for the file owner, no access for group or others
! chmod 600 ~/.kaggle/kaggle.json

**Competitions and Datasets are the two types of Kaggle data**

**1. Download competition data**

If you get a 403 Forbidden error, you need to click 'Late Submission' on the Kaggle page for that competition before downloading.

In [9]:
# Fetch the competition archive; per the recorded output it is saved as
# walmart-recruiting-store-sales-forecasting.zip in the working directory (/content).
! kaggle competitions download -c walmart-recruiting-store-sales-forecasting

Downloading walmart-recruiting-store-sales-forecasting.zip to /content
  0% 0.00/2.70M [00:00<?, ?B/s]
100% 2.70M/2.70M [00:00<00:00, 932MB/s]


Unzip the download if it arrived as a zip archive. Refresh the file list on the left-hand side to update the view.

In [10]:
! unzip walmart-recruiting-store-sales-forecasting

Archive:  walmart-recruiting-store-sales-forecasting.zip
  inflating: features.csv.zip        
  inflating: sampleSubmission.csv.zip  
  inflating: stores.csv              
  inflating: test.csv.zip            
  inflating: train.csv.zip           


In [1]:
!pip install torch torchvision torchaudio
!pip install pytorch-lightning
!pip install optuna
!pip install mlflow==2.7.1
!pip install -q dagshub

Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5.147-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cusolver-cu12==11.6.1.9 (from torch)
  Downloading nvidia_cusolver_cu12-11.6.1.9-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cusparse-cu12==12.3.1.170 (from torch)
  Downloading nvidia_cusparse_cu12-12.3.1.170-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-nvjitlink-cu12==12.4.127 (from torch)
  Downloading nvidia_nvjitlink_cu12-12.4.127-py3-n

In [7]:
!pip uninstall scipy numpy -y
!pip install numpy scipy

Found existing installation: scipy 1.15.2
Uninstalling scipy-1.15.2:
  Successfully uninstalled scipy-1.15.2
Found existing installation: numpy 1.26.4
Uninstalling numpy-1.26.4:
  Successfully uninstalled numpy-1.26.4
Collecting numpy
  Using cached numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (62 kB)
Collecting scipy
  Downloading scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.9/61.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hUsing cached numpy-2.3.1-cp311-cp311-manylinux_2_28_x86_64.whl (16.9 MB)
Downloading scipy-1.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (35.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.3/35.3 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: numpy, scipy
[31mERROR: pip's dependency resolver does not currently take into account all the

In [2]:
import gc
import warnings
from pathlib import Path
from typing import Optional, List, Tuple, Dict, Any

import dagshub
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import optuna
import pandas as pd
import pytorch_lightning as pl
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
# Silence all library warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Print DataFrames with every column on one unwrapped line.
for _option, _value in (
    ('display.max_columns', None),
    ('display.width', None),
    ('display.expand_frame_repr', False),
):
    pd.set_option(_option, _value)

* 'schema_extra' has been renamed to 'json_schema_extra'


In [30]:
# Locate the competition files: use the Kaggle input mount when it exists,
# otherwise fall back to the current working directory (where the download/unzip
# cells of this notebook place the files on Colab).
_kaggle_input_dir = Path("/kaggle/input/walmart-recruiting-store-sales-forecasting")
data_path = f"{_kaggle_input_dir.as_posix()}/" if _kaggle_input_dir.is_dir() else "./"

# Read the datasets; pandas infers zip compression from the file extension
stores = pd.read_csv(data_path + 'stores.csv')
train = pd.read_csv(data_path + 'train.csv.zip')
features = pd.read_csv(data_path + 'features.csv.zip')
sample = pd.read_csv(data_path + 'sampleSubmission.csv.zip')
test = pd.read_csv(data_path + 'test.csv.zip')

In [31]:
# Convert 'Date' columns to datetime objects for easier manipulation
# Parse the 'Date' columns so merges and temporal splits compare real timestamps
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# Merge features with train and test data.
# Note: 'IsHoliday' is present in both train/test and features.csv.
# We'll merge on it to ensure consistency, but if there were discrepancies,
# we'd need a more careful merge strategy.
# validate='many_to_one' makes pandas raise if the right-hand table has duplicate
# keys, which would otherwise silently multiply rows.
train_df = pd.merge(train, features, on=['Store', 'Date', 'IsHoliday'], how='left', validate='many_to_one')
test_df = pd.merge(test, features, on=['Store', 'Date', 'IsHoliday'], how='left', validate='many_to_one')

# Merge store information (one row per store expected on the right-hand side)
train_df = pd.merge(train_df, stores, on='Store', how='left', validate='many_to_one')
test_df = pd.merge(test_df, stores, on='Store', how='left', validate='many_to_one')

print("\n--- Merged Train Data Head ---")
print(train_df.head())
print("\n--- Merged Test Data Head ---")
print(test_df.head())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
print("\n--- Merged Train Data Info ---")
train_df.info()
print("\n--- Merged Test Data Info ---")
test_df.info()

# Free up memory. The raw frames are deleted, so the loading cell must be run
# again before this cell can be re-executed.
del train, test, features, stores
gc.collect()


--- Merged Train Data Head ---
   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment Type    Size
0      1     1 2010-02-05      24924.50      False        42.31       2.572        NaN        NaN        NaN        NaN        NaN  211.096358         8.106    A  151315
1      1     1 2010-02-12      46039.49       True        38.51       2.548        NaN        NaN        NaN        NaN        NaN  211.242170         8.106    A  151315
2      1     1 2010-02-19      41595.55      False        39.93       2.514        NaN        NaN        NaN        NaN        NaN  211.289143         8.106    A  151315
3      1     1 2010-02-26      19403.54      False        46.63       2.561        NaN        NaN        NaN        NaN        NaN  211.319643         8.106    A  151315
4      1     1 2010-03-05      21827.90      False        46.50       2.625        NaN        NaN        NaN        Na

1319

## **DATA CLEANING**


In [32]:
class ImprovedMissingValueImputer(BaseEstimator, TransformerMixin):
    """Impute missing values in MarkDown, numerical and target columns.

    Parameters
    ----------
    markdown_cols : list of str, optional
        Columns filled with 0 and accompanied by a ``<col>_was_missing`` flag.
        Defaults to ``MarkDown1`` .. ``MarkDown5``.
    numerical_cols : list of str, optional
        Columns filled by forward/backward fill (per ``group_id`` when that
        column exists) and finally by the global mean learned in ``fit``.
        Defaults to the weather/economy/size columns plus ``target_col``.
    target_col : str
        Target column; remaining gaps are filled with the per-group mean
        learned in ``fit``.

    Attributes set by ``fit``
    -------------------------
    group_means : dict
        ``{'mean': {group_id: value}, 'median': {group_id: value}}`` for the
        target column (empty when ``group_id`` or the target is absent).
    global_means : dict
        ``{column: mean}`` for every numerical column present in ``X``.
    """

    def __init__(self, markdown_cols=None, numerical_cols=None, target_col='Weekly_Sales'):
        self.markdown_cols = markdown_cols if markdown_cols is not None else [f'MarkDown{i}' for i in range(1, 6)]
        self.numerical_cols = numerical_cols if numerical_cols is not None else [
            'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Size', target_col
        ]
        self.target_col = target_col
        self.group_means = {}
        self.global_means = {}

    def fit(self, X, y=None):
        """Learn per-group target statistics and global column means from ``X``."""
        # Start from a clean slate so a refit never keeps statistics of columns
        # or groups that are absent from the new data.
        self.group_means = {}
        self.global_means = {}

        # Per-group mean/median of the target variable
        if 'group_id' in X.columns and self.target_col in X.columns:
            group_stats = X.groupby('group_id')[self.target_col].agg(['mean', 'median'])
            self.group_means = group_stats.to_dict()

        # Global means for all numerical columns that are present
        for col in self.numerical_cols:
            if col in X.columns:
                self.global_means[col] = X[col].mean()

        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values filled.

        Order of operations: MarkDown columns (0 + indicator), forward/backward
        fill of numerical columns, group-mean fill of the target, and finally a
        global-mean fill for anything still missing.
        """
        X_copy = X.copy()
        has_groups = 'group_id' in X_copy.columns

        # MarkDown columns: remember which values were missing, then fill with 0
        for col in self.markdown_cols:
            if col in X_copy.columns:
                X_copy[f"{col}_was_missing"] = X_copy[col].isna().astype(int)
                X_copy[col] = X_copy[col].fillna(0)

        # Numerical columns: forward fill, then backward fill. The dedicated
        # ffill()/bfill() methods replace the deprecated fillna(method=...) form.
        for col in self.numerical_cols:
            if col not in X_copy.columns:
                continue
            if has_groups:
                # Fill only within a group so values never leak across series
                X_copy[col] = X_copy.groupby('group_id')[col].ffill()
                X_copy[col] = X_copy.groupby('group_id')[col].bfill()
            else:
                X_copy[col] = X_copy[col].ffill().bfill()

        # Target: groups seen during fit get their fitted mean (which may itself
        # be NaN and is then handled by the final cleanup); unseen groups get the
        # global target mean, or 0 when none was learned.
        if self.target_col in X_copy.columns and has_groups:
            known_means = self.group_means.get('mean', {})
            fallback = self.global_means.get(self.target_col, 0)
            target = X_copy[self.target_col]
            if known_means:
                group_ids = X_copy['group_id']
                seen = group_ids.isin(list(known_means))
                fill_values = group_ids.map(known_means).where(seen, fallback)
                X_copy[self.target_col] = target.fillna(fill_values)
            else:
                X_copy[self.target_col] = target.fillna(fallback)

        # Final cleanup: fill any remaining missing values with global means
        for col in self.numerical_cols:
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].fillna(self.global_means.get(col, 0))

        return X_copy


In [33]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that expands a date column into calendar features.

    Adds ``Year``, ``Month_sin``, ``Month_cos``, ``Week`` (ISO week), ``Day`` and
    ``DayOfWeek``; casts a boolean ``IsHoliday`` column to int; and drops the
    date column itself unless ``keep_date`` is true.
    """

    def __init__(self, date_column='Date', keep_date=True):
        self.date_column = date_column
        self.keep_date = keep_date

    def fit(self, X, y=None):
        """Nothing to learn; present for scikit-learn pipeline compatibility."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the calendar features added.

        Raises
        ------
        ValueError
            If ``date_column`` is not a column of ``X``.
        """
        frame = X.copy()
        date_col = self.date_column
        if date_col not in frame.columns:
            raise ValueError(f"Date column '{self.date_column}' not found in DataFrame.")

        frame[date_col] = pd.to_datetime(frame[date_col])
        parts = frame[date_col].dt

        # Calendar components; the month is only an intermediate for the
        # cyclical sine/cosine encoding and is removed again below.
        frame['Year'] = parts.year
        frame['Month'] = parts.month
        month_angle = 2 * np.pi * frame['Month'] / 12
        frame['Month_sin'] = np.sin(month_angle)
        frame['Month_cos'] = np.cos(month_angle)
        frame['Week'] = parts.isocalendar().week.astype(int)
        frame['Day'] = parts.day
        frame['DayOfWeek'] = parts.dayofweek

        # Boolean holiday flag -> 0/1 integers
        holiday_is_bool = 'IsHoliday' in frame.columns and frame['IsHoliday'].dtype == bool
        if holiday_is_bool:
            frame['IsHoliday'] = frame['IsHoliday'].astype(int)

        # Always drop the raw month; drop the date only when asked to
        obsolete = ["Month"] if self.keep_date else ["Month", date_col]
        return frame.drop(columns=obsolete)

In [34]:
class PatchTSTPreprocessor(BaseEstimator, TransformerMixin):
    """Scale numerical features, encode ``Type`` and build a ``group_id`` column.

    ``fit`` learns one ``StandardScaler`` per numerical column and an integer
    code per ``Type`` value. ``transform`` applies them and adds
    ``group_id = "<Store>_<Dept>"``.

    ``Store`` and ``Dept`` are identifiers, not measurements: they are excluded
    from scaling so ``group_id`` is built from the original identifier values
    rather than from standardised floats. The target column ``Weekly_Sales`` is
    also left unscaled.
    """

    # Identifier columns used to build `group_id`; never standardised
    _ID_COLUMNS = ('Store', 'Dept')
    # Target column; left untouched by this transformer
    _TARGET_COLUMN = 'Weekly_Sales'

    def __init__(self):
        self.scalers = {}
        self.categorical_encoders = {}

    def fit(self, X, y=None):
        """Fit per-column scalers and the categorical code tables on ``X``."""
        # Reset learned state so a refit does not keep scalers of old columns
        self.scalers = {}
        self.categorical_encoders = {}

        # One scaler per numerical column, skipping the target and identifiers
        skipped = {self._TARGET_COLUMN, *self._ID_COLUMNS}
        numerical_cols = [
            col for col in X.select_dtypes(include=[np.number]).columns
            if col not in skipped
        ]
        for col in numerical_cols:
            self.scalers[col] = StandardScaler().fit(X[[col]])

        # Integer codes for categorical columns, in order of first appearance
        categorical_cols = ['Type']  # Store and Dept are handled as group identifiers
        for col in categorical_cols:
            if col in X.columns:
                unique_values = X[col].unique()
                self.categorical_encoders[col] = {val: idx for idx, val in enumerate(unique_values)}

        return self

    def transform(self, X):
        """Return a scaled/encoded copy of ``X`` with a ``group_id`` column.

        Requires ``Store`` and ``Dept`` columns. Categories not seen during
        ``fit`` are encoded as -1.
        """
        X_copy = X.copy()

        # Scale numerical features
        for col, scaler in self.scalers.items():
            if col in X_copy.columns:
                X_copy[col] = scaler.transform(X_copy[[col]]).flatten()

        # Encode categorical features (unknown category -> -1)
        for col, encoder in self.categorical_encoders.items():
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].map(encoder).fillna(-1)

        # Create group identifier from the unscaled Store/Dept values
        X_copy['group_id'] = X_copy['Store'].astype(str) + '_' + X_copy['Dept'].astype(str)

        return X_copy

In [35]:
class PatchTSTModel(nn.Module):
    """Patch-based transformer encoder for multi-step forecasting.

    The input window is cut into (possibly overlapping) patches along the time
    axis, each patch is flattened over time and features and linearly embedded,
    a learned positional encoding is added, the sequence of patch tokens is run
    through a standard ``nn.TransformerEncoder``, mean-pooled over patches and
    projected to ``pred_len`` outputs.

    Constructor arguments are sanitised: ``patch_len`` is clamped to
    ``seq_len``, ``stride`` to the clamped ``patch_len``, and ``d_model`` is
    rounded down to a multiple of ``n_heads``.
    """

    def __init__(self, seq_len, pred_len, patch_len=16, stride=8, d_model=128,
                 n_heads=8, n_layers=3, d_ff=256, dropout=0.1, num_features=1):
        super().__init__()

        # Sanitise the geometry before anything is built from it
        clamped_patch = min(patch_len, seq_len)
        clamped_stride = min(stride, clamped_patch)
        if d_model % n_heads != 0:
            # Multi-head attention needs d_model divisible by the head count
            d_model = (d_model // n_heads) * n_heads

        self.seq_len = seq_len
        self.pred_len = pred_len
        self.patch_len = clamped_patch
        self.stride = clamped_stride
        self.d_model = d_model
        self.num_features = num_features
        # Number of full patches in a window of seq_len steps (at least one)
        self.n_patches = max(1, (seq_len - clamped_patch) // clamped_stride + 1)

        # Flattened patch -> token embedding
        self.patch_embedding = nn.Linear(clamped_patch * num_features, d_model)

        # Learned positional encoding, one vector per patch position
        self.pos_encoding = nn.Parameter(torch.randn(1, self.n_patches, d_model) * 0.02)

        # Standard batch-first transformer encoder stack
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=d_ff,
                dropout=dropout,
                batch_first=True,
                activation='relu'
            ),
            num_layers=n_layers,
        )

        # Two-layer projection from the pooled representation to the horizon
        self.head = nn.Sequential(
            nn.Linear(d_model, d_ff),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(d_ff, pred_len)
        )

        # Xavier initialisation for every linear / encoder-layer weight
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Xavier-initialise linear layers (zero bias) and encoder-layer matrices."""
        if isinstance(module, nn.Linear):
            nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
            return
        if isinstance(module, nn.TransformerEncoderLayer):
            for tensor in module.parameters():
                # Only matrices; vectors (biases, norm weights) are left alone
                if tensor.dim() > 1:
                    nn.init.xavier_uniform_(tensor)

    def create_patches(self, x):
        """Slice ``x`` of shape [batch, seq_len, num_features] into flat patches.

        Returns a tensor of shape [batch, n_patches, patch_len * num_features].
        Inputs shorter than ``patch_len`` are zero-padded at the end of the
        time axis so that one full patch exists.
        """
        batch_size, seq_len, _ = x.shape

        shortfall = self.patch_len - seq_len
        if shortfall > 0:
            # Pad only the end of the time dimension (second-to-last axis)
            x = F.pad(x, (0, 0, 0, shortfall), mode='constant', value=0)
            seq_len = self.patch_len

        starts = range(0, seq_len - self.patch_len + 1, self.stride)
        # Fall back to the leading patch should the range ever be empty
        windows = [x[:, s:s + self.patch_len, :] for s in starts] or [x[:, :self.patch_len, :]]

        # [batch, n_patches, patch_len, num_features] -> flatten the last two axes
        stacked = torch.stack(windows, dim=1)
        return stacked.reshape(batch_size, stacked.size(1), -1)

    def forward(self, x):
        """Map [batch, seq_len, num_features] to forecasts of shape [batch, pred_len]."""
        # Replace NaN/inf in the input so they cannot poison the forward pass
        if torch.isnan(x).any() or torch.isinf(x).any():
            x = torch.nan_to_num(x, nan=0.0, posinf=1.0, neginf=-1.0)

        tokens = self.patch_embedding(self.create_patches(x))

        # Slice the positional table in case fewer patches than n_patches exist
        tokens = tokens + self.pos_encoding[:, :tokens.size(1), :]

        # Encode, then mean-pool over the patch axis -> [batch, d_model]
        summary = self.transformer(tokens).mean(dim=1)

        return self.head(summary)

In [42]:
class PatchTSTLightningModule(pl.LightningModule):
    """Lightning wrapper that trains ``PatchTSTModel`` on a magnitude-weighted MAE.

    The same weighted MAE is used as training loss and as validation metric and
    is logged under both ``*_loss`` and ``*_wmae``. Batches whose loss is
    NaN/inf, or whose step raises, are skipped by returning ``None``.
    """

    def __init__(self, seq_len, pred_len, patch_len=16, stride=8, d_model=128,
                 n_heads=8, n_layers=3, d_ff=256, dropout=0.1, num_features=1,
                 learning_rate=1e-3, weight_decay=1e-4):
        super().__init__()
        # Must run before any argument is modified so the raw values are recorded
        self.save_hyperparameters()

        # Sanitise the geometry the same way the model does
        patch_len = min(patch_len, seq_len)
        stride = min(stride, patch_len)
        if d_model % n_heads != 0:
            d_model = (d_model // n_heads) * n_heads

        self.model = PatchTSTModel(
            seq_len=seq_len,
            pred_len=pred_len,
            patch_len=patch_len,
            stride=stride,
            d_model=d_model,
            n_heads=n_heads,
            n_layers=n_layers,
            d_ff=d_ff,
            dropout=dropout,
            num_features=num_features
        )

        self.learning_rate = learning_rate
        self.weight_decay = weight_decay

        # Per-epoch buffer of validation losses (cleared at epoch end)
        self.validation_step_outputs = []

    def forward(self, x):
        """Delegate to the wrapped model."""
        return self.model(x)

    def compute_wmae(self, y_true, y_pred):
        """Return the MAE weighted by ``|y_true| + 1``.

        When the two tensors differ in shape, both are truncated along the last
        axis to the shorter length before the error is computed.
        """
        if y_pred.shape != y_true.shape:
            shared_len = min(y_pred.shape[-1], y_true.shape[-1])
            y_pred, y_true = y_pred[:, :shared_len], y_true[:, :shared_len]

        # Larger targets weigh more; the +1 keeps zero targets from being ignored
        weights = torch.abs(y_true) + 1
        abs_error = torch.abs(y_pred - y_true)
        return torch.sum(weights * abs_error) / torch.sum(weights)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss; return ``None`` to skip a bad batch."""
        inputs, targets = batch

        try:
            loss = self.compute_wmae(targets, self(inputs))

            # A non-finite loss would corrupt the weights: skip the batch
            if torch.isnan(loss) or torch.isinf(loss):
                return None

            # Loss and WMAE are the same quantity, logged under both names
            for metric_name in ('train_loss', 'train_wmae'):
                self.log(metric_name, loss, on_step=True, on_epoch=True, prog_bar=True)

            return loss

        except Exception as e:
            print(f"Training step failed: {e}")
            return None

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss; return ``None`` for a bad batch."""
        inputs, targets = batch

        try:
            loss = self.compute_wmae(targets, self(inputs))

            if torch.isnan(loss) or torch.isinf(loss):
                return None

            # Loss and WMAE are the same quantity, logged under both names
            for metric_name in ('val_loss', 'val_wmae'):
                self.log(metric_name, loss, on_step=False, on_epoch=True, prog_bar=True)

            self.validation_step_outputs.append(loss)
            return loss

        except Exception as e:
            print(f"Validation step failed: {e}")
            return None

    def on_validation_epoch_end(self):
        """Drop the buffered validation losses so they do not accumulate."""
        self.validation_step_outputs.clear()

    def configure_optimizers(self):
        """AdamW plus a plateau scheduler that halves the LR when ``val_wmae`` stalls."""
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=self.learning_rate,
            weight_decay=self.weight_decay
        )

        plateau_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.5,
            patience=5,
            min_lr=1e-6
        )

        scheduler_config = {
            'scheduler': plateau_scheduler,
            'monitor': 'val_wmae',  # the plateau is judged on validation WMAE
            'frequency': 1
        }
        return {'optimizer': optimizer, 'lr_scheduler': scheduler_config}

In [37]:
class WalmartTSDataset(Dataset):
    """Sliding-window dataset over per-group weekly series.

    Every sample is a pair ``(X, y)`` where ``X`` holds ``seq_len`` consecutive
    rows of ``feature_cols`` and ``y`` the following ``pred_len`` values of
    ``target_col``, both taken from a single ``group_id`` and both scaled.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``group_id``, ``Date``, ``feature_cols`` and ``target_col``.
    seq_len, pred_len : int
        Length of the input window and of the forecast horizon.
    feature_cols : list of str
        Input feature columns.
    target_col : str
        Column to forecast.
    feature_scaler, target_scaler : fitted scaler, optional
        Objects exposing ``transform``. When given they are used as-is, which
        lets a validation/test dataset reuse the statistics of the training
        dataset (e.g. ``train_ds.feature_scaler``). When omitted, a
        ``StandardScaler`` is fitted on this dataset's own windows.

    Raises
    ------
    ValueError
        If no group is long enough to yield a single window.
    """

    def __init__(self, df, seq_len, pred_len, feature_cols, target_col='Weekly_Sales',
                 feature_scaler=None, target_scaler=None):
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.feature_cols = feature_cols
        self.target_col = target_col
        self.feature_scaler = feature_scaler
        self.target_scaler = target_scaler

        # sort_values returns a new frame, so the caller's DataFrame is untouched
        self.df = df.sort_values(['group_id', 'Date']).reset_index(drop=True)

        # Create samples
        self.samples = self._create_samples()

        if len(self.samples) == 0:
            raise ValueError("No valid samples created. Check your data and parameters.")

        # Fit whichever scaler was not supplied
        self._prepare_scalers()

        print(f"Dataset created with {len(self.samples)} samples")

    def _create_samples(self):
        """Enumerate every valid (group, window start) pair."""
        samples = []
        window_len = self.seq_len + self.pred_len

        # A single groupby pass replaces one full boolean scan of the frame per
        # group; sort=False keeps the order of first appearance in the sorted frame.
        for group_id, group_data in self.df.groupby('group_id', sort=False):
            group_data = group_data.reset_index(drop=True)

            # Too short to hold one input window plus its horizon
            if len(group_data) < window_len:
                continue

            # Skip groups where more than half of the target is missing
            if group_data[self.target_col].isna().sum() > len(group_data) * 0.5:
                continue

            # All windows of one group share the same group_data object
            for i in range(len(group_data) - window_len + 1):
                samples.append({
                    'group_id': group_id,
                    'start_idx': i,
                    'seq_end_idx': i + self.seq_len,
                    'pred_end_idx': i + window_len,
                    'group_data': group_data
                })

        return samples

    def _prepare_scalers(self):
        """Fit a ``StandardScaler`` for features and/or target when none was supplied.

        The scalers are fitted on the concatenation of all windows, with NaN
        replaced by 0 exactly as in ``__getitem__``. No noise is added for
        constant columns: ``StandardScaler`` already leaves zero-variance
        columns unscaled, and random noise would make the statistics
        non-reproducible.
        """
        fit_features = self.feature_scaler is None
        fit_target = self.target_scaler is None
        if not (fit_features or fit_target):
            return

        feature_windows = []
        target_windows = []

        for sample in self.samples:
            group_data = sample['group_data']

            if fit_features:
                seq_data = group_data.iloc[sample['start_idx']:sample['seq_end_idx']]
                feature_windows.append(np.nan_to_num(seq_data[self.feature_cols].values, nan=0.0))

            if fit_target:
                pred_data = group_data.iloc[sample['seq_end_idx']:sample['pred_end_idx']]
                target_windows.append(np.nan_to_num(pred_data[self.target_col].values, nan=0.0))

        if fit_features:
            all_features = np.concatenate(feature_windows, axis=0)
            self.feature_scaler = StandardScaler().fit(all_features)

        if fit_target:
            all_targets = np.concatenate(target_windows, axis=0).reshape(-1, 1)
            self.target_scaler = StandardScaler().fit(all_targets)

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return ``(X, y)`` float tensors of shape [seq_len, n_features] and [pred_len]."""
        sample = self.samples[idx]
        group_data = sample['group_data']

        # Input window and the horizon that directly follows it
        seq_data = group_data.iloc[sample['start_idx']:sample['seq_end_idx']]
        pred_data = group_data.iloc[sample['seq_end_idx']:sample['pred_end_idx']]

        # Extract features and target
        features = seq_data[self.feature_cols].values
        target = pred_data[self.target_col].values

        # Replace any remaining NaN by 0 before scaling
        features = np.nan_to_num(features, nan=0.0)
        target = np.nan_to_num(target, nan=0.0)

        # Scale data (scalers expect 2-D input, hence the reshape for the target)
        features_scaled = self.feature_scaler.transform(features)
        target_scaled = self.target_scaler.transform(target.reshape(-1, 1)).flatten()

        # Convert to tensors
        X = torch.FloatTensor(features_scaled)
        y = torch.FloatTensor(target_scaled)

        return X, y

In [43]:
# Order matters: dates are expanded first, the preprocessor then creates
# `group_id`, and the imputer uses `group_id` for its group-wise fills.
# NOTE(review): the MarkDown columns are standardised by the preprocessor before
# the imputer fills their NaN with 0, so 0 then stands for "training mean" rather
# than "no markdown" - confirm this is intended.
improved_preprocessing_pipeline = Pipeline([
    ('date_extractor', DateFeatureExtractor(keep_date=True)),  # Do this first
    ('patchtst_preprocessor', PatchTSTPreprocessor()),  # This creates group_id
    ('imputer', ImprovedMissingValueImputer(target_col='Weekly_Sales')),  # This uses group_id
])

# Apply preprocessing (statistics are learned on the training frame only)
print("Applying improved preprocessing...")
train_processed = improved_preprocessing_pipeline.fit_transform(train_df)
test_processed = improved_preprocessing_pipeline.transform(test_df)

print(f"Train processed shape: {train_processed.shape}")
print(f"Test processed shape: {test_processed.shape}")

# Check for missing values after preprocessing
print("\nMissing values after preprocessing:")
print("Train missing values:", train_processed.isnull().sum().sum())
print("Test missing values:", test_processed.isnull().sum().sum())

# Define parameters
seq_len = 8
pred_len = 4
validation_cutoff_date = pd.to_datetime('2012-02-01')

# Split data temporally: everything from the cutoff date onwards is validation
train_temporal = train_processed[train_processed['Date'] < validation_cutoff_date].copy()
val_temporal = train_processed[train_processed['Date'] >= validation_cutoff_date].copy()

print("\nTemporal split:")
print(f"Training: {train_temporal['Date'].min()} to {train_temporal['Date'].max()}")
print(f"Validation: {val_temporal['Date'].min()} to {val_temporal['Date'].max()}")

# Filter groups with sufficient data: a group needs at least one full
# input window plus forecast horizon in BOTH splits to be kept.
min_samples_per_group = seq_len + pred_len
train_group_sizes = train_temporal.groupby('group_id').size()
val_group_sizes = val_temporal.groupby('group_id').size()

print("\nGroup filtering:")
print(f"Total groups in train: {len(train_group_sizes)}")
print(f"Total groups in val: {len(val_group_sizes)}")

# Find groups with sufficient data
valid_train_groups = train_group_sizes[train_group_sizes >= min_samples_per_group].index
valid_val_groups = val_group_sizes[val_group_sizes >= min_samples_per_group].index
valid_groups = valid_train_groups.intersection(valid_val_groups)

print(f"Groups with sufficient data: {len(valid_groups)}")

# Apply filtering
train_filtered = train_temporal[train_temporal['group_id'].isin(valid_groups)].copy()
val_filtered = val_temporal[val_temporal['group_id'].isin(valid_groups)].copy()

print(f"Training samples after filtering: {len(train_filtered)}")
print(f"Validation samples after filtering: {len(val_filtered)}")

# Check for missing values in target after filtering
print("\nTarget missing values after filtering:")
print(f"Train Weekly_Sales missing: {train_filtered['Weekly_Sales'].isnull().sum()}")
print(f"Val Weekly_Sales missing: {val_filtered['Weekly_Sales'].isnull().sum()}")

# Define feature columns
feature_cols = [
    'Temperature', 'Fuel_Price', 'CPI', 'Unemployment',
    'Month_sin', 'Month_cos', 'Week', 'Day', 'Year', 'IsHoliday', 'DayOfWeek',
    'Size', 'Type'
] + [f'MarkDown{i}' for i in range(1, 6)] + [f'MarkDown{i}_was_missing' for i in range(1, 6)]

# Keep only the columns that actually exist after preprocessing
feature_cols = [col for col in feature_cols if col in train_filtered.columns]
print(f"\nFeature columns: {len(feature_cols)}")
print(f"Features: {feature_cols}")

# Create datasets
print("\nCreating datasets...")
try:
    train_dataset = WalmartTSDataset(train_filtered, seq_len, pred_len, feature_cols)
    val_dataset = WalmartTSDataset(val_filtered, seq_len, pred_len, feature_cols)

    print(f"Training dataset size: {len(train_dataset)}")
    print(f"Validation dataset size: {len(val_dataset)}")

    # Smoke-test one sample to confirm the tensors have the expected shapes
    if len(train_dataset) > 0 and len(val_dataset) > 0:
        test_batch = train_dataset[0]
        print(f"Sample batch shapes: X={test_batch[0].shape}, y={test_batch[1].shape}")
        print("Dataset creation successful!")
    else:
        print("ERROR: Empty datasets created!")

except Exception as e:
    print(f"ERROR creating datasets: {e}")
    print("This suggests there are still missing value issues or insufficient data.")
    # Re-raise so the cell fails visibly: continuing would leave train_dataset /
    # val_dataset undefined (or stale from an earlier run) for the cells below.
    raise

Applying improved preprocessing...
Train processed shape: (421570, 28)
Test processed shape: (115064, 27)

Missing values after preprocessing:
Train missing values: 0
Test missing values: 0

Temporal split:
Training: 2010-02-05 00:00:00 to 2012-01-27 00:00:00
Validation: 2012-02-03 00:00:00 to 2012-10-26 00:00:00

Group filtering:
Total groups in train: 3306
Total groups in val: 3204
Groups with sufficient data: 2993
Training samples after filtering: 301424
Validation samples after filtering: 114400

Target missing values after filtering:
Train Weekly_Sales missing: 0
Val Weekly_Sales missing: 0

Feature columns: 23
Features: ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Month_sin', 'Month_cos', 'Week', 'Day', 'Year', 'IsHoliday', 'DayOfWeek', 'Size', 'Type', 'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'MarkDown1_was_missing', 'MarkDown2_was_missing', 'MarkDown3_was_missing', 'MarkDown4_was_missing', 'MarkDown5_was_missing']

Creating datasets...
Dataset cr

In [16]:
!pip install optuna-integration[pytorch_lightning]



In [None]:
# %pip install -q dagshub


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/261.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.0/261.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.9/139.9 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m92.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m203.4/203.4 kB[0m [31m15.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.2/85.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [16]:
#!pip install mlflow==2.7.1

Collecting pandas<3 (from mlflow==2.7.1)
  Using cached pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
Using cached pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[0mTraceback (most recent call last):
  File "/usr/lib/python3.11/pathlib.py", line 540, in __str__
    return self._str
           ^^^^^^^^^
AttributeError: 'PosixPath' object has no attribute '_str'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 447, in run


In [18]:

import dagshub
# Connect this notebook session to the DagsHub repository below.
# mlflow=True presumably points MLflow tracking at that repository
# (TODO confirm against the dagshub client docs). As the cell output shows,
# the call may prompt for browser-based OAuth authorization.
dagshub.init(
    repo_owner='abarb22',
    repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
    mlflow=True
)



Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=1d1fabd1-9527-42ef-9daf-a4d3e6918312&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=5f21703fe4910423cd04743f6d7c567e82f6a283aed965ded8c7b2539b8974ec




In [39]:
class ManualHyperparameterTuner:
    """Random-search hyperparameter tuner for ``PatchTSTLightningModule``.

    Each trial samples one configuration from a fixed search space, trains a
    model on it with early stopping on ``val_wmae``, and logs the parameters
    and resulting metrics to a nested MLflow run. Every trial outcome
    (successful or failed) is appended to ``self.results``.

    Args:
        train_dataset: Dataset used to build the training ``DataLoader``.
        val_dataset: Dataset used to build the validation ``DataLoader``.
        feature_cols: Feature column names; only their count is used here
            (passed to the model as ``num_features``).
        seq_len: Input window length passed to the model.
        pred_len: Forecast horizon passed to the model.
    """

    # Columns that the results DataFrame always exposes, even with no trials.
    _RESULT_COLUMNS = (
        'trial', 'train_wmae', 'val_wmae', 'train_loss', 'val_loss',
        'epochs_trained', 'model_size', 'status',
    )

    # MLflow rejects overly long param values (the exact cap depends on the
    # MLflow version/backend), so failure messages are truncated when logged.
    _MAX_STATUS_PARAM_LEN = 250

    def __init__(self, train_dataset, val_dataset, feature_cols, seq_len, pred_len):
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.feature_cols = feature_cols
        self.seq_len = seq_len
        self.pred_len = pred_len
        self.results = []

    def define_search_space(self):
        """Return the discrete candidate values for every tunable parameter."""
        return {
            'patch_len': [4, 8, 16],
            'stride': [2, 4, 8],
            'd_model': [64, 128, 256],
            'n_heads': [4, 8],
            'n_layers': [2, 3, 4],
            'd_ff': [128, 256, 512],
            'dropout': [0.1, 0.2, 0.3],
            'learning_rate': [1e-4, 5e-4, 1e-3],
            'weight_decay': [1e-6, 1e-5, 1e-4],
            'batch_size': [16, 32, 64]
        }

    def create_param_combinations(self, max_trials=50, seed=None):
        """Sample ``max_trials`` random configurations from the search space.

        Sampling is independent per trial, so duplicate configurations are
        possible. After sampling, each configuration is adjusted so that
        ``patch_len <= seq_len``, ``1 <= stride <= patch_len`` and ``d_model``
        is a multiple of ``n_heads``.

        Args:
            max_trials: Number of configurations to generate.
            seed: Optional seed for a private random generator, making the
                sampled configurations reproducible. When ``None`` the global
                ``random`` module state is used.

        Returns:
            list[dict]: One parameter dictionary per trial.
        """
        import random

        search_space = self.define_search_space()
        rng = random.Random(seed) if seed is not None else random

        param_combinations = []
        for _ in range(max_trials):
            param_dict = {key: rng.choice(values) for key, values in search_space.items()}

            # A patch cannot be longer than the input window.
            param_dict['patch_len'] = min(param_dict['patch_len'], self.seq_len)

            # Keep the stride within the patch; never let the fallback reach 0,
            # which would be an invalid step for very short windows.
            if param_dict['stride'] > param_dict['patch_len']:
                param_dict['stride'] = max(1, param_dict['patch_len'] // 2)

            # Round d_model down to the nearest multiple of n_heads.
            remainder = param_dict['d_model'] % param_dict['n_heads']
            if remainder != 0:
                param_dict['d_model'] -= remainder

            param_combinations.append(param_dict)

        return param_combinations

    def train_single_configuration(self, params, trial_num):
        """Train one configuration and summarise the outcome.

        Any exception raised while building or training the model is caught
        and reported as a failed trial, so one bad configuration does not
        abort the whole search.

        Args:
            params: Parameter dictionary as produced by
                ``create_param_combinations``.
            trial_num: 1-based trial index, used for printing and bookkeeping.

        Returns:
            dict: Keys ``trial``, ``train_wmae``, ``val_wmae``, ``train_loss``,
            ``val_loss`` (the loss fields mirror the WMAE values),
            ``epochs_trained``, ``parameters``, ``model_size`` and ``status``
            (``'success'`` or ``'failed: <message>'``). Failed trials carry
            ``inf`` metrics.
        """
        print(f"\n=== Trial {trial_num} ===")
        print(f"Parameters: {params}")

        # Pre-bind so the cleanup in ``finally`` is valid even when
        # construction fails part-way through.
        model = trainer = train_loader = val_loader = None

        try:
            # Create data loaders
            train_loader = DataLoader(
                self.train_dataset,
                batch_size=params['batch_size'],
                shuffle=True,
                num_workers=0,
                drop_last=True
            )

            val_loader = DataLoader(
                self.val_dataset,
                batch_size=params['batch_size'],
                shuffle=False,
                num_workers=0,
                drop_last=False
            )

            # Create model
            model = PatchTSTLightningModule(
                seq_len=self.seq_len,
                pred_len=self.pred_len,
                patch_len=params['patch_len'],
                stride=params['stride'],
                d_model=params['d_model'],
                n_heads=params['n_heads'],
                n_layers=params['n_layers'],
                d_ff=params['d_ff'],
                dropout=params['dropout'],
                num_features=len(self.feature_cols),
                learning_rate=params['learning_rate'],
                weight_decay=params['weight_decay']
            )

            # Smoke-test a forward pass before spending time on training.
            test_batch = next(iter(train_loader))
            test_x, test_y = test_batch
            with torch.no_grad():
                test_output = model(test_x)

            if torch.isnan(test_output).any() or torch.isinf(test_output).any():
                raise ValueError("Model produces NaN/Inf outputs")

            # Create trainer
            trainer = pl.Trainer(
                max_epochs=25,  # Reduced for faster tuning
                callbacks=[
                    EarlyStopping(monitor='val_wmae', patience=5, mode='min'),
                ],
                logger=False,
                enable_checkpointing=False,
                enable_progress_bar=False,
                accelerator='gpu' if torch.cuda.is_available() else 'cpu',
                devices=1,
                gradient_clip_val=1.0,
                gradient_clip_algorithm='norm'
            )

            # Train
            trainer.fit(model, train_loader, val_loader)

            # A metric that was never logged falls back to inf and is then
            # rejected by the validity check below.
            train_wmae = trainer.callback_metrics.get('train_wmae', float('inf'))
            val_wmae = trainer.callback_metrics.get('val_wmae', float('inf'))

            # Convert to float if tensor
            if hasattr(train_wmae, 'item'):
                train_wmae = train_wmae.item()
            if hasattr(val_wmae, 'item'):
                val_wmae = val_wmae.item()

            # Check for invalid results
            if np.isnan(train_wmae) or np.isnan(val_wmae) or np.isinf(train_wmae) or np.isinf(val_wmae):
                raise ValueError("Invalid WMAE values")

            result = {
                'trial': trial_num,
                'train_wmae': train_wmae,
                'val_wmae': val_wmae,
                'train_loss': train_wmae,  # Keep for compatibility
                'val_loss': val_wmae,      # Keep for compatibility
                'epochs_trained': trainer.current_epoch,
                'parameters': params.copy(),
                'model_size': sum(p.numel() for p in model.parameters()),
                'status': 'success'
            }

            print(f"Train WMAE: {train_wmae:.6f}, Val WMAE: {val_wmae:.6f}")

            return result

        except Exception as e:
            print(f"Trial {trial_num} failed: {str(e)}")
            return {
                'trial': trial_num,
                'train_wmae': float('inf'),
                'val_wmae': float('inf'),
                'train_loss': float('inf'),
                'val_loss': float('inf'),
                'epochs_trained': 0,
                'parameters': params.copy(),
                'model_size': 0,
                'status': f'failed: {str(e)}'
            }

        finally:
            # Release per-trial objects on success AND failure so memory does
            # not pile up across trials.
            del model, trainer, train_loader, val_loader
            gc.collect()
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    def run_hyperparameter_search(self, max_trials=20, seed=None):
        """Run the full search and return the best successful trial.

        Opens one parent MLflow run for the search and one nested run per
        trial. Results accumulate in ``self.results``; the best trial is the
        successful result with the lowest ``val_wmae``.

        Args:
            max_trials: Number of configurations to try.
            seed: Optional seed forwarded to ``create_param_combinations``.

        Returns:
            dict | None: The best successful trial result, or ``None`` when no
            trial succeeded.
        """
        print(f"Starting manual hyperparameter tuning with {max_trials} trials...")

        # Get parameter combinations
        param_combinations = self.create_param_combinations(max_trials, seed=seed)

        # Start MLflow run for the entire tuning process
        with mlflow.start_run(run_name="PatchTST_Manual_Hyperparameter_Tuning_WMAE"):

            # Log general information
            mlflow.log_param("total_trials", len(param_combinations))
            mlflow.log_param("seq_len", self.seq_len)
            mlflow.log_param("pred_len", self.pred_len)
            mlflow.log_param("train_dataset_size", len(self.train_dataset))
            mlflow.log_param("val_dataset_size", len(self.val_dataset))
            mlflow.log_param("num_features", len(self.feature_cols))
            mlflow.log_param("metric", "WMAE")

            # Run trials
            for i, params in enumerate(param_combinations):

                # Start child run for each trial
                with mlflow.start_run(run_name=f"Trial_{i+1}", nested=True):

                    # Log all parameters
                    for key, value in params.items():
                        mlflow.log_param(key, value)

                    # Train and get results
                    result = self.train_single_configuration(params, i+1)

                    # Store first, so the outcome is kept even if a logging
                    # call below raises.
                    self.results.append(result)

                    mlflow.log_metric("train_wmae", result['train_wmae'])
                    mlflow.log_metric("val_wmae", result['val_wmae'])
                    mlflow.log_metric("train_loss", result['train_loss'])  # Keep for compatibility
                    mlflow.log_metric("val_loss", result['val_loss'])      # Keep for compatibility
                    mlflow.log_metric("epochs_trained", result['epochs_trained'])
                    mlflow.log_param("model_size", result['model_size'])
                    # Failure statuses embed the exception text, which can be
                    # arbitrarily long; truncate for MLflow only.
                    mlflow.log_param("status", result['status'][:self._MAX_STATUS_PARAM_LEN])

            # Find best trial by validation WMAE
            valid_results = [r for r in self.results if r['status'] == 'success']
            if valid_results:
                best_result = min(valid_results, key=lambda x: x['val_wmae'])

                # Log best results
                mlflow.log_metric("best_train_wmae", best_result['train_wmae'])
                mlflow.log_metric("best_val_wmae", best_result['val_wmae'])
                mlflow.log_metric("best_train_loss", best_result['train_loss'])
                mlflow.log_metric("best_val_loss", best_result['val_loss'])
                mlflow.log_param("best_trial", best_result['trial'])

                for key, value in best_result['parameters'].items():
                    mlflow.log_param(f"best_{key}", value)

                print(f"\n=== BEST TRIAL FOUND ===")
                print(f"Trial: {best_result['trial']}")
                print(f"Validation WMAE: {best_result['val_wmae']:.6f}")
                print(f"Training WMAE: {best_result['train_wmae']:.6f}")
                print(f"Parameters: {best_result['parameters']}")

                return best_result
            else:
                print("No successful trials found!")
                return None

    def get_results_dataframe(self):
        """Return every recorded trial as one flat ``pandas.DataFrame``.

        Each row holds the scalar fields of a trial result plus one column per
        hyperparameter (the nested ``parameters`` dict is flattened). With no
        recorded trials an empty frame with the standard result columns is
        returned, so callers can still reference those columns.

        Returns:
            pandas.DataFrame: One row per entry in ``self.results``.
        """
        import pandas as pd

        rows = []
        for result in self.results:
            row = {key: value for key, value in result.items() if key != 'parameters'}
            row.update(result.get('parameters', {}))
            rows.append(row)

        if not rows:
            return pd.DataFrame(columns=list(self._RESULT_COLUMNS))
        return pd.DataFrame(rows)

In [40]:
def train_final_model(best_params, train_dataset, val_dataset, feature_cols, seq_len, pred_len):
    """Train the final PatchTST model with the selected hyperparameters.

    Runs inside its own MLflow run ("PatchTST_Final_Model_WMAE"), logging the
    dataset sizes, the hyperparameters, the model size and the metrics reported
    by the trainer after fitting. Training uses early stopping and
    checkpointing on ``val_wmae`` and the same gradient clipping as the tuning
    trials, so the hyperparameters are applied under the regime they were
    selected in.

    Args:
        best_params: Hyperparameter dict with the keys ``batch_size``,
            ``patch_len``, ``stride``, ``d_model``, ``n_heads``, ``n_layers``,
            ``d_ff``, ``dropout``, ``learning_rate`` and ``weight_decay``.
        train_dataset: Dataset for the training ``DataLoader``.
        val_dataset: Dataset for the validation ``DataLoader``.
        feature_cols: Feature column names; only their count is used.
        seq_len: Input window length passed to the model.
        pred_len: Forecast horizon passed to the model.

    Returns:
        tuple: ``(final_model, trainer)``. The returned module is the object
        that was fitted; the best checkpoint tracked by ``ModelCheckpoint`` is
        saved and reported but not loaded back into it here.
    """

    def _as_float(metric):
        """Unwrap a tensor-like metric to a Python number; pass others through."""
        return metric.item() if hasattr(metric, 'item') else metric

    print(f"\n=== TRAINING FINAL MODEL ===")
    print(f"Using parameters: {best_params}")

    with mlflow.start_run(run_name="PatchTST_Final_Model_WMAE"):

        # Log all parameters
        mlflow.log_param("seq_len", seq_len)
        mlflow.log_param("pred_len", pred_len)
        mlflow.log_param("train_dataset_size", len(train_dataset))
        mlflow.log_param("val_dataset_size", len(val_dataset))
        mlflow.log_param("num_features", len(feature_cols))
        mlflow.log_param("metric", "WMAE")

        for key, value in best_params.items():
            mlflow.log_param(key, value)

        # Create final dataloaders
        train_loader = DataLoader(
            train_dataset,
            batch_size=best_params['batch_size'],
            shuffle=True,
            num_workers=0,
            drop_last=True
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=best_params['batch_size'],
            shuffle=False,
            num_workers=0,
            drop_last=False
        )

        # Create final model
        final_model = PatchTSTLightningModule(
            seq_len=seq_len,
            pred_len=pred_len,
            patch_len=best_params['patch_len'],
            stride=best_params['stride'],
            d_model=best_params['d_model'],
            n_heads=best_params['n_heads'],
            n_layers=best_params['n_layers'],
            d_ff=best_params['d_ff'],
            dropout=best_params['dropout'],
            num_features=len(feature_cols),
            learning_rate=best_params['learning_rate'],
            weight_decay=best_params['weight_decay']
        )

        # Log model info
        total_params = sum(p.numel() for p in final_model.parameters())
        mlflow.log_param("total_parameters", total_params)
        # Size estimate assumes 4 bytes per parameter (float32).
        mlflow.log_param("model_size_mb", total_params * 4 / 1024 / 1024)

        print(f"Model parameters: {total_params:,}")

        # Keep a handle on the checkpoint callback so its best score and path
        # can be reported after training.
        checkpoint_callback = ModelCheckpoint(monitor="val_wmae", mode="min", save_top_k=1)

        trainer = pl.Trainer(
            max_epochs=100,
            accelerator='gpu' if torch.cuda.is_available() else 'cpu',
            devices=1,
            callbacks=[
                EarlyStopping(monitor="val_wmae", patience=15, mode="min"),
                LearningRateMonitor(logging_interval='epoch'),
                checkpoint_callback,
            ],
            enable_progress_bar=True,
            # Same clipping as the tuning trials: the hyperparameters were
            # selected with clipping enabled, so train the final model the
            # same way.
            gradient_clip_val=1.0,
            gradient_clip_algorithm='norm',
        )

        # Train model
        print("Training final model...")
        trainer.fit(final_model, train_loader, val_loader)

        # Metrics as last reported by the trainer; a missing metric falls back to 0.
        final_train_wmae = _as_float(trainer.callback_metrics.get("train_wmae", 0))
        final_val_wmae = _as_float(trainer.callback_metrics.get("val_wmae", 0))
        final_train_loss = _as_float(trainer.callback_metrics.get("train_loss", 0))
        final_val_loss = _as_float(trainer.callback_metrics.get("val_loss", 0))

        mlflow.log_metric("final_train_wmae", final_train_wmae)
        mlflow.log_metric("final_val_wmae", final_val_wmae)
        mlflow.log_metric("final_train_loss", final_train_loss)
        mlflow.log_metric("final_val_loss", final_val_loss)
        mlflow.log_metric("final_epochs", trainer.current_epoch)

        # With early stopping the last epoch is generally not the best one, so
        # also record what the checkpoint callback tracked as best.
        if checkpoint_callback.best_model_score is not None:
            mlflow.log_metric("best_val_wmae", _as_float(checkpoint_callback.best_model_score))
        if checkpoint_callback.best_model_path:
            mlflow.log_param("best_checkpoint_path", checkpoint_callback.best_model_path)

        print(f"Final validation WMAE: {final_val_wmae:.6f}")
        print(f"Final training WMAE: {final_train_wmae:.6f}")
        print(f"Epochs trained: {trainer.current_epoch}")

        return final_model, trainer

In [None]:
# Driver cell: run the hyperparameter search and summarise the trials.
# Relies on train_dataset, val_dataset, feature_cols, seq_len and pred_len
# having been defined by earlier cells.
tuner = ManualHyperparameterTuner(
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    feature_cols=feature_cols,
    seq_len=seq_len,
    pred_len=pred_len
)

# Run the search; best_result is the best successful trial dict, or None
# when no trial succeeded.
best_result = tuner.run_hyperparameter_search(max_trials=25)

# Collect all trial results into a DataFrame (nothing is written to disk here).
# Requires ManualHyperparameterTuner to provide get_results_dataframe().
results_df = tuner.get_results_dataframe()
print(f"\nHyperparameter tuning completed. Results shape: {results_df.shape}")

# Display the five trials with the lowest val_loss; the tuner stores the WMAE
# values under the train_loss/val_loss keys as well.
if not results_df.empty:
    print("\nTop 5 results:")
    top_results = results_df.nsmallest(5, 'val_loss')
    display_cols = ['trial', 'val_loss', 'train_loss', 'epochs_trained', 'status']
    print(top_results[display_cols].to_string(index=False))
    
    # Display parameter summary for best result
    if best_result:
        print(f"\nBest parameters:")
        for key, value in best_result['parameters'].items():
            print(f"  {key}: {value}")


Starting manual hyperparameter tuning with 25 trials...


INFO: GPU available: False, used: False
INFO: TPU available: False, using: 0 TPU cores
INFO: HPU available: False, using: 0 HPUs



=== Trial 1 ===
Parameters: {'patch_len': 8, 'stride': 4, 'd_model': 256, 'n_heads': 4, 'n_layers': 4, 'd_ff': 512, 'dropout': 0.2, 'learning_rate': 0.0005, 'weight_decay': 1e-05, 'batch_size': 32}


In [None]:

# Final stage: retrain with the best hyperparameters found by the tuner, then
# evaluate the trained model on the validation set and log the result.
# Relies on best_result from the tuning cell.
if best_result and best_result['status'] == 'success':
    print("\n=== TRAINING FINAL MODEL WITH BEST PARAMETERS ===")
    
    # Print the best hyperparameters
    print("Best Hyperparameters:")
    for param, value in best_result['parameters'].items():
        print(f"  {param}: {value}")
    
    final_model, final_trainer = train_final_model(
        best_params=best_result['parameters'],
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        feature_cols=feature_cols,
        seq_len=seq_len,
        pred_len=pred_len
    )
    print("Training completed successfully!")
    
    # Additional evaluation
    print("\n=== FINAL MODEL EVALUATION ===")
    
    # Get predictions on validation set.
    # NOTE(review): batches are passed to the model without an explicit device
    # move; this assumes final_model is on the CPU at this point - confirm.
    val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
    final_model.eval()
    all_predictions = []
    all_targets = []
    
    with torch.no_grad():
        for batch in val_loader:
            x, y = batch
            y_pred = final_model(x)
            all_predictions.append(y_pred.cpu().numpy())
            all_targets.append(y.cpu().numpy())
    
    # Stack the per-batch arrays along the first axis
    predictions = np.concatenate(all_predictions, axis=0)
    targets = np.concatenate(all_targets, axis=0)
    
    # Weighted MAE where each element is weighted by |target| + 1.
    # Intended to mirror the WMAE computed inside the model - TODO confirm
    # against PatchTSTLightningModule.
    weights = np.abs(targets) + 1  # Add 1 to avoid zero weights
    absolute_errors = np.abs(predictions - targets)
    wmae = np.sum(weights * absolute_errors) / np.sum(weights)
    
    print(f"Final Validation Metrics:")
    print(f"  WMAE: {wmae:.6f}")
    
    # Record the evaluation in its own MLflow run, separate from the run
    # opened inside train_final_model.
    with mlflow.start_run(run_name="Final_Model_Metrics_WMAE"):
        # Log best hyperparameters
        for param, value in best_result['parameters'].items():
            mlflow.log_param(f"best_{param}", value)
        
        # Log the WMAE computed above on the validation set
        mlflow.log_metric("final_wmae", wmae)
        
        # Log the validation WMAE the best trial reached during tuning
        mlflow.log_metric("best_validation_wmae", best_result['val_wmae'])

else:
    print("No successful hyperparameter tuning results found!")
    print("Check the preprocessing and dataset creation steps.")