In [6]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder # For Type encoding if not using category dtype directly
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
import gc # For garbage collection
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.expand_frame_repr', False)

In [7]:
# stores = pd.read_csv('stores.csv')
# train = pd.read_csv("train.csv.zip")
# features = pd.read_csv('features.csv.zip')
# sample = pd.read_csv('sampleSubmission.csv.zip')
# test = pd.read_csv('test.csv.zip')

data_path = "/kaggle/input/walmart-recruiting-store-sales-forecasting/"

# Read the datasets using the correct paths
stores = pd.read_csv(data_path + 'stores.csv')
train = pd.read_csv(data_path + 'train.csv.zip')
features = pd.read_csv(data_path + 'features.csv.zip')
sample = pd.read_csv(data_path + 'sampleSubmission.csv.zip')
test = pd.read_csv(data_path + 'test.csv.zip')

In [8]:
%pip install -q dagshub


Note: you may need to restart the kernel to use updated packages.


In [9]:
!pip install mlflow==2.7.1



In [10]:

import dagshub
# Try to get credentials from environment first
dagshub.init(
    repo_owner='abarb22',
    repo_name='Walmart-Recruiting---Store-Sales-Forecasting',
    mlflow=True
)





Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=e31bf3a6-7054-490d-8714-d285ff42ae90&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=3218b371305d08997ad1f32cfd563952fbbaa4043ac01c11222030266cd09a1d




Output()

In [11]:
class MissingValueImputer(BaseEstimator, TransformerMixin):
    """
    Custom Transformer to handle missing values for specific columns.
    - MarkDown columns: fill with 0.
    - Other specified numerical columns: fill with ffill then bfill, fallback to mean.
    """
    def __init__(self, markdown_cols=None, numerical_cols_to_impute=None):
        self.markdown_cols = markdown_cols if markdown_cols is not None else [f'MarkDown{i}' for i in range(1, 6)]
        self.numerical_cols_to_impute = numerical_cols_to_impute if numerical_cols_to_impute is not None else ['Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
        self.means = {} # To store means for fallback imputation during transform

    def fit(self, X, y=None):
        # Calculate means for fallback imputation from the training data
        for col in self.numerical_cols_to_impute:
            if col in X.columns:
                self.means[col] = X[col].mean()
        return self

    def transform(self, X):
        X_copy = X.copy()


        for col in self.markdown_cols:
          if col in X_copy.columns:
            X_copy[f"{col}_was_missing"] = X_copy[col].isna().astype(int)
            X_copy[col] = X_copy[col].fillna(0)


        # Impute other numerical columns with ffill then bfill, fallback to mean
        for col in self.numerical_cols_to_impute:
            if col in X_copy.columns:
                X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')
                # Fallback to mean if NaNs still exist (e.g., if all values were NaN in a column)
                if X_copy[col].isnull().any() and col in self.means:
                    X_copy[col] = X_copy[col].fillna(self.means[col])
        return X_copy

In [12]:
class DateFeatureExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, date_column='Date', keep_date=True):
        self.date_column = date_column
        self.keep_date = keep_date

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_copy = X.copy()
        if self.date_column not in X_copy.columns:
            raise ValueError(f"Date column '{self.date_column}' not found in DataFrame.")

        X_copy[self.date_column] = pd.to_datetime(X_copy[self.date_column])

        # Create time features
        X_copy['Year'] = X_copy[self.date_column].dt.year
        X_copy['Month'] = X_copy[self.date_column].dt.month
        X_copy['Month_sin'] = np.sin(2 * np.pi * X_copy['Month'] / 12)
        X_copy['Month_cos'] = np.cos(2 * np.pi * X_copy['Month'] / 12)
        X_copy['Week'] = X_copy[self.date_column].dt.isocalendar().week.astype(int)
        X_copy['Day'] = X_copy[self.date_column].dt.day
        X_copy['DayOfWeek'] = X_copy[self.date_column].dt.dayofweek

        # Convert boolean to int
        if 'IsHoliday' in X_copy.columns and X_copy['IsHoliday'].dtype == bool:
            X_copy['IsHoliday'] = X_copy['IsHoliday'].astype(int)

        # Drop Month, optionally keep Date
        columns_to_drop = ["Month"]
        if not self.keep_date:
            columns_to_drop.append(self.date_column)

        return X_copy.drop(columns=columns_to_drop)

In [13]:
# Convert 'Date' columns to datetime objects for easier manipulation
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
features['Date'] = pd.to_datetime(features['Date'])

# Merge features with train and test data.
# Note: 'IsHoliday' is present in both train/test and features.csv.
# We'll merge on it to ensure consistency, but if there were discrepancies,
# we'd need a more careful merge strategy.
train_df = pd.merge(train, features, on=['Store', 'Date', 'IsHoliday'], how='left')
test_df = pd.merge(test, features, on=['Store', 'Date', 'IsHoliday'], how='left')

# Merge store information
train_df = pd.merge(train_df, stores, on='Store', how='left')
test_df = pd.merge(test_df, stores, on='Store', how='left')

print("\n--- Merged Train Data Head ---")
print(train_df.head())
print("\n--- Merged Test Data Head ---")
print(test_df.head())

print("\n--- Merged Train Data Info ---")
print(train_df.info())
print("\n--- Merged Test Data Info ---")
print(test_df.info())

# Apply your preprocessing pipeline (imputer + date features)
pipe = Pipeline([
    ("imputer", MissingValueImputer()),
    ("date_features", DateFeatureExtractor()),
])
train_df = pipe.fit_transform(train_df)
test_df = pipe.transform(test_df)

# ➕ ADD THIS NOW: time_idx column
# min_date = train_df['Date'].min()
# train_df["time_idx"] = (train_df["Date"] - min_date).dt.days

# Free up memory
del train, test, features, stores
gc.collect()


--- Merged Train Data Head ---
   Store  Dept       Date  Weekly_Sales  IsHoliday  Temperature  Fuel_Price  MarkDown1  MarkDown2  MarkDown3  MarkDown4  MarkDown5         CPI  Unemployment Type    Size
0      1     1 2010-02-05      24924.50      False        42.31       2.572        NaN        NaN        NaN        NaN        NaN  211.096358         8.106    A  151315
1      1     1 2010-02-12      46039.49       True        38.51       2.548        NaN        NaN        NaN        NaN        NaN  211.242170         8.106    A  151315
2      1     1 2010-02-19      41595.55      False        39.93       2.514        NaN        NaN        NaN        NaN        NaN  211.289143         8.106    A  151315
3      1     1 2010-02-26      19403.54      False        46.63       2.561        NaN        NaN        NaN        NaN        NaN  211.319643         8.106    A  151315
4      1     1 2010-03-05      21827.90      False        46.50       2.625        NaN        NaN        NaN        Na

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')
  X_copy[col] = X_copy[col].fillna(method='ffill').fillna(method='bfill')


27

In [14]:
# Custom TFT Implementation for Walmart Sales Forecasting
# This avoids version conflicts by implementing TFT from scratch

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import mlflow
import matplotlib.pyplot as plt
from typing import Dict, List, Tuple, Optional
import math

# Custom Dataset class for time series
class WalmartTimeSeriesDataset(Dataset):
    def __init__(self, data, sequence_length=24, forecast_length=6, target_col='Weekly_Sales'):
        self.data = data.sort_values(['Store', 'Dept', 'Date']).reset_index(drop=True)
        self.sequence_length = sequence_length
        self.forecast_length = forecast_length
        self.target_col = target_col
        
        # Group by store-dept combination
        self.groups = self.data.groupby(['Store', 'Dept'])
        self.samples = self._create_samples()
        
    def _create_samples(self):
        samples = []
        
        for (store, dept), group in self.groups:
            group = group.sort_values('Date').reset_index(drop=True)
            
            # Skip if not enough data
            if len(group) < self.sequence_length + self.forecast_length:
                continue
                
            # Create sliding windows
            for i in range(len(group) - self.sequence_length - self.forecast_length + 1):
                sample = {
                    'store': store,
                    'dept': dept,
                    'start_idx': i,
                    'group_data': group
                }
                samples.append(sample)
        
        return samples
    
    def __len__(self):
        return len(self.samples)
    
    def __getitem__(self, idx):
        sample = self.samples[idx]
        group_data = sample['group_data']
        start_idx = sample['start_idx']
        
        # Historical sequence
        hist_end = start_idx + self.sequence_length
        hist_data = group_data.iloc[start_idx:hist_end]
        
        # Future sequence
        future_data = group_data.iloc[hist_end:hist_end + self.forecast_length]
        
        # Prepare features
        hist_features = self._prepare_features(hist_data)
        future_features = self._prepare_features(future_data, is_future=True)
        
        # Target values
        hist_target = torch.tensor(hist_data[self.target_col].values, dtype=torch.float32)
        future_target = torch.tensor(future_data[self.target_col].values, dtype=torch.float32)
        
        # Weights (5 for holidays, 1 for non-holidays)
        future_weights = torch.tensor(future_data['IsHoliday'].apply(lambda x: 5.0 if x else 1.0).values, dtype=torch.float32)
        
        return {
            'hist_features': hist_features,
            'future_features': future_features,
            'hist_target': hist_target,
            'future_target': future_target,
            'future_weights': future_weights
        }
    
    def _prepare_features(self, data, is_future=False):
        features = []
        
        # Time features
        features.extend([
            data['Year'].values,
            data['Month_sin'].values,
            data['Month_cos'].values,
            data['Week'].values,
            data['Day'].values,
            data['DayOfWeek'].values,
        ])
        
        # Known reals (available for future)
        features.extend([
            data['IsHoliday'].astype(float).values,
            data['Temperature'].values,
            data['Fuel_Price'].values,
            data['CPI'].values,
            data['Unemployment'].values,
        ])
        
        # MarkDown features
        for i in range(1, 6):
            if f'MarkDown{i}' in data.columns:
                features.append(data[f'MarkDown{i}'].values)
        
        # Store type (categorical -> numerical)
        type_mapping = {'A': 0, 'B': 1, 'C': 2}
        features.append([type_mapping.get(t, 0) for t in data['Type'].values])
        
        # Store size (normalize)
        features.append(data['Size'].values / 200000)  # Simple normalization
        
        return torch.tensor(np.column_stack(features), dtype=torch.float32)

# Attention mechanism for TFT
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, n_heads, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        
        self.w_q = nn.Linear(d_model, d_model)
        self.w_k = nn.Linear(d_model, d_model)
        self.w_v = nn.Linear(d_model, d_model)
        self.w_o = nn.Linear(d_model, d_model)
        
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(d_model)
        
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        
        # Linear transformations
        Q = self.w_q(query).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        K = self.w_k(key).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        V = self.w_v(value).view(batch_size, -1, self.n_heads, self.d_k).transpose(1, 2)
        
        # Attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.d_k)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)
        
        attn_weights = F.softmax(scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        
        context = torch.matmul(attn_weights, V)
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        
        # Output projection
        output = self.w_o(context)
        
        # Residual connection and layer norm
        return self.layer_norm(output + query)

class VariableSelectionNetwork(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        
        self.flattened_grn = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, input_dim),
            nn.Softmax(dim=-1)
        )
        
        # Process each variable to a single output dimension (not hidden_dim)
        self.single_variable_grns = nn.ModuleList([
            nn.Sequential(
                nn.Linear(1, hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout),
                nn.Linear(hidden_dim, 1)  # Output single dimension per variable
            ) for _ in range(input_dim)
        ])
        
        # Final projection to hidden_dim
        self.final_projection = nn.Linear(input_dim, hidden_dim)
        
    def forward(self, x):
        # x shape: (batch_size, seq_len, input_dim)
        batch_size, seq_len, _ = x.shape
        
        # Get variable importance weights
        weights = self.flattened_grn(x)  # Shape: (batch_size, seq_len, input_dim)
        
        # Apply variable-specific processing
        processed_vars = []
        for i, grn in enumerate(self.single_variable_grns):
            var_input = x[:, :, i:i+1]  # Shape: (batch_size, seq_len, 1)
            var_processed = grn(var_input)  # Shape: (batch_size, seq_len, 1)
            processed_vars.append(var_processed)
        
        processed = torch.cat(processed_vars, dim=-1)  # Shape: (batch_size, seq_len, input_dim)
        
        # Apply importance weights
        selected = processed * weights  # Shape: (batch_size, seq_len, input_dim)
        
        # Project to hidden dimension
        output = self.final_projection(selected)  # Shape: (batch_size, seq_len, hidden_dim)
        
        return output

# Updated TFT Model using the fixed VSN
class TemporalFusionTransformer(nn.Module):
    def __init__(self, input_dim, hidden_dim=64, n_heads=4, n_layers=2, dropout=0.1):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        
        # Use the simpler, fixed variable selection
        self.variable_selection = VariableSelectionNetwork(input_dim, hidden_dim, dropout)
        
        # LSTM for temporal processing
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, num_layers=n_layers, 
                           dropout=dropout, batch_first=True)
        
        # Multi-head attention
        self.attention = MultiHeadAttention(hidden_dim, n_heads, dropout)
        
        # Output layers
        self.output_projection = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1)
        )
        
    def forward(self, hist_features, future_features):
        # Process historical features
        hist_selected = self.variable_selection(hist_features)  # Now outputs (batch, seq, hidden_dim)
        
        # LSTM processing
        lstm_out, (h_n, c_n) = self.lstm(hist_selected)
        
        # Attention mechanism
        attended = self.attention(lstm_out, lstm_out, lstm_out)
        
        # Use last hidden state for future prediction
        last_hidden = attended[:, -1:, :]  # Shape: (batch, 1, hidden_dim)
        
        # Predict future values
        future_predictions = []
        hidden_state = last_hidden
        
        for i in range(future_features.size(1)):  # For each future time step
            # Project future features
            future_step = future_features[:, i:i+1, :]
            future_selected = self.variable_selection(future_step)
            
            # Combine with hidden state
            combined = hidden_state + future_selected
            
            # Apply attention
            attended_future = self.attention(combined, combined, combined)
            
            # Predict
            prediction = self.output_projection(attended_future)
            future_predictions.append(prediction)
            
            # Update hidden state
            hidden_state = attended_future
        
        predictions = torch.cat(future_predictions, dim=1)
        return predictions.squeeze(-1)  
        





# Custom WMAE Loss
class WMAELoss(nn.Module):
    def __init__(self):
        super().__init__()
        
    def forward(self, predictions, targets, weights):
        mae = torch.abs(predictions - targets)
        weighted_mae = mae * weights
        return weighted_mae.sum() / weights.sum()

# Training function
def train_model(model, train_loader, val_loader, num_epochs=50, lr=0.001):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, factor=0.5)
    criterion = WMAELoss()
    
    best_val_loss = float('inf')
    train_losses = []
    val_losses = []
    
    for epoch in range(num_epochs):
        # Training
        model.train()
        train_loss = 0.0
        
        for batch in train_loader:
            hist_features = batch['hist_features'].to(device)
            future_features = batch['future_features'].to(device)
            future_target = batch['future_target'].to(device)
            future_weights = batch['future_weights'].to(device)
            
            optimizer.zero_grad()
            
            predictions = model(hist_features, future_features)
            loss = criterion(predictions, future_target, future_weights)
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0.0
        
        with torch.no_grad():
            for batch in val_loader:
                hist_features = batch['hist_features'].to(device)
                future_features = batch['future_features'].to(device)
                future_target = batch['future_target'].to(device)
                future_weights = batch['future_weights'].to(device)
                
                predictions = model(hist_features, future_features)
                loss = criterion(predictions, future_target, future_weights)
                
                val_loss += loss.item()
        
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        
        scheduler.step(val_loss)
        
        print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')
        ?
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_tft_model.pth')
    
    return train_losses, val_losses

* 'schema_extra' has been renamed to 'json_schema_extra'


In [None]:
import numpy as np

# Method 1: Calculate input_dim automatically from your data
def calculate_input_dim(dataset):
    """Calculate the input dimension from the dataset"""
    sample = dataset[0]  # Get first sample
    hist_features = sample['hist_features']
    return hist_features.shape[-1]  # Last dimension is feature count

# Method 2: Count features manually based on your DateFeatureExtractor
def count_expected_features(train_data):
    """Count features that will be used in _prepare_features"""
    # Based on your _prepare_features method, count the features:
    feature_count = 0
    
    # Time features (from your DateFeatureExtractor)
    time_features = ['Year', 'Month_sin', 'Month_cos', 'Week_sin', 'Week_cos', 
                    'Day_sin', 'Day_cos', 'Week', 'Day', 'DayOfWeek']
    feature_count += len(time_features)  # 10 features
    
    # Store-specific features (if they exist)
    store_features = ['Store', 'Dept']
    for feat in store_features:
        if feat in train_data.columns:
            feature_count += 1
    
    # Other potential features
    other_features = ['IsHoliday', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
    for feat in other_features:
        if feat in train_data.columns:
            feature_count += 1
    
    # Weekly_Sales (target variable, also used as feature)
    if 'Weekly_Sales' in train_data.columns:
        feature_count += 1
    
    return feature_count

# Apply the fix to your code:
cutoff_date = pd.to_datetime("2012-02-01")
train_data = train_df[train_df['Date'] <= cutoff_date].copy()
val_data = train_df[train_df['Date'] > cutoff_date].copy()

# Apply DateFeatureExtractor
date_extractor = DateFeatureExtractor()
train_data = date_extractor.transform(train_data)
val_data = date_extractor.transform(val_data)

# Create datasets
train_dataset = WalmartTimeSeriesDataset(train_data, sequence_length=24, forecast_length=6)
val_dataset = WalmartTimeSeriesDataset(val_data, sequence_length=24, forecast_length=6)

# IMPORTANT: Calculate the correct input dimension
input_dim = calculate_input_dim(train_dataset)
print(f"Calculated input_dim: {input_dim}")

# Alternative: Manual calculation
# input_dim = count_expected_features(train_data)
# print(f"Expected input_dim: {input_dim}")

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, num_workers=0)

# Initialize model with correct input dimension
model = TemporalFusionTransformer(
    input_dim=input_dim,  # Use calculated dimension
    hidden_dim=64,
    n_heads=4,
    n_layers=2,
    dropout=0.1
)


    # Start MLflow run
with mlflow.start_run():
        # Log parameters
        mlflow.log_params({
            "hidden_dim": 64,
            "n_heads": 4,
            "n_layers": 2,
            "dropout": 0.1,
            "sequence_length": 24,
            "forecast_length": 6,
            "batch_size": 64,
            "learning_rate": 0.001
        })
        
        # Train model
        train_losses, val_losses = train_model(model, train_loader, val_loader, num_epochs=50)
        
        # Log metrics
        for i, (train_loss, val_loss) in enumerate(zip(train_losses, val_losses)):
            mlflow.log_metric("train_loss", train_loss, step=i)
            mlflow.log_metric("val_loss", val_loss, step=i)
        
        # Plot training curves
        plt.figure(figsize=(10, 6))
        plt.plot(train_losses, label='Training Loss')
        plt.plot(val_losses, label='Validation Loss')
        plt.xlabel('Epoch')
        plt.ylabel('WMAE Loss')
        plt.title('Training and Validation Loss')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.savefig('training_curves.png')
        mlflow.log_artifact('training_curves.png')
        
        print("Training completed!")
        print(f"Best validation loss: {min(val_losses):.4f}")

Calculated input_dim: 18
Epoch 1/50, Train Loss: 14448.5290, Val Loss: 13621.4440
Epoch 2/50, Train Loss: 13955.5894, Val Loss: 13630.7403
Epoch 3/50, Train Loss: 13957.0871, Val Loss: 13641.3287
Epoch 4/50, Train Loss: 13954.3191, Val Loss: 13620.3801
Epoch 5/50, Train Loss: 13957.8897, Val Loss: 13618.1965
Epoch 6/50, Train Loss: 13952.9046, Val Loss: 13659.2352
Epoch 7/50, Train Loss: 13957.1140, Val Loss: 13629.1108
Epoch 8/50, Train Loss: 13952.8447, Val Loss: 13617.6506
Epoch 9/50, Train Loss: 13953.0407, Val Loss: 13606.9762
Epoch 10/50, Train Loss: 13952.6305, Val Loss: 13634.1617
Epoch 11/50, Train Loss: 13954.6078, Val Loss: 13638.8720
Epoch 12/50, Train Loss: 13953.0587, Val Loss: 13640.5851
Epoch 13/50, Train Loss: 13952.3029, Val Loss: 13653.1155
Epoch 14/50, Train Loss: 13953.6253, Val Loss: 13638.9365
Epoch 15/50, Train Loss: 13952.2541, Val Loss: 13617.4090
Epoch 16/50, Train Loss: 13954.7167, Val Loss: 13650.3842
Epoch 17/50, Train Loss: 13952.9838, Val Loss: 13636.842