# In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset

##################################
# 1) LOAD AND PREPROCESS
##################################

def load_data(data_dir="playground-series-s5e1"):
    """Load the competition CSV files from ``data_dir``.

    Args:
        data_dir: Directory containing ``train.csv``, ``test.csv`` and
            ``sample_submission.csv``. Defaults to the path that used to be
            hard-coded, so calling ``load_data()`` behaves as before.

    Returns:
        Tuple ``(train, test, sub)`` of ``pandas.DataFrame`` objects, in
        that order.
    """
    # Local import: the file's import header does not pull in ``os``.
    import os

    def _read(file_name):
        return pd.read_csv(os.path.join(data_dir, file_name))

    train = _read("train.csv")
    test = _read("test.csv")
    sub = _read("sample_submission.csv")
    return train, test, sub

def fill_missing_mean(df):
    """Impute NaNs in ``df['num_sold']`` with that column's mean.

    The frame is modified in place and also returned for chaining.
    """
    sold = df['num_sold']
    column_mean = sold.mean()  # pandas skips NaNs when averaging
    df['num_sold'] = sold.fillna(column_mean)
    return df

def kaggle_transform_num_sold(train):
    """Transform the ``num_sold`` target column of ``train`` in place.

    Steps, applied in order:
      1) min–max scale using (x - min) / (max - min)
      2) log1p
      3) sqrt
      4) outlier-clip to [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of the transformed values

    Args:
        train: DataFrame with a numeric ``num_sold`` column. The column is
            overwritten on the passed frame.

    Returns:
        Tuple ``(train, transform_stats)`` where ``transform_stats`` is a dict
        with keys ``num_sold_min``, ``num_sold_max`` (statistics of the raw
        column) and ``lower_bound``, ``upper_bound`` (the clip limits), so the
        transform can be inverted later.
    """
    # Statistics of the raw column, needed again when inverting.
    num_sold_min = train['num_sold'].min()
    num_sold_max = train['num_sold'].max()

    # 1) Min–max scale. A constant column has max == min; dividing by that
    #    zero range would turn every value into NaN, so fall back to a unit
    #    divisor, which maps the whole column to 0.
    value_range = num_sold_max - num_sold_min
    divisor = value_range if value_range != 0 else 1
    transformed = (train['num_sold'] - num_sold_min) / divisor

    # 2) log1p, then 3) sqrt
    transformed = np.sqrt(np.log1p(transformed))

    # 4) Outlier clip with IQR, computed on the already-transformed values.
    q1 = transformed.quantile(0.25)
    q3 = transformed.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    train['num_sold'] = transformed.clip(lower, upper)

    # Return transform stats so we can invert later
    transform_stats = {
        'num_sold_min': num_sold_min,
        'num_sold_max': num_sold_max,
        'lower_bound': lower,
        'upper_bound': upper,
    }
    return train, transform_stats

def invert_kaggle_transform(y_pred_transformed, stats):
    """Map predictions from transformed space back to raw ``num_sold`` units.

    Reverses the steps of ``kaggle_transform_num_sold`` in opposite order:
      1) outlier clip -> cannot be undone, so it is skipped
      2) sqrt  -> square
      3) log1p -> expm1
      4) min–max scale -> x * (max - min) + min

    Args:
        y_pred_transformed: Scalar or array-like of predictions in
            transformed space.
        stats: Dict holding ``num_sold_min`` and ``num_sold_max`` as returned
            by ``kaggle_transform_num_sold``.

    Returns:
        Predictions in the original ``num_sold`` scale (NumPy scalar/array).
    """
    # The forward transform ends in a sqrt, so valid inputs are >= 0. A model
    # can still emit small negative values; squaring those would mirror them
    # onto positive sales, so clamp to the bottom of the valid range first.
    y_pred = np.maximum(y_pred_transformed, 0) ** 2

    # Undo log1p
    y_pred = np.expm1(y_pred)

    # Undo min–max scaling
    num_sold_min = stats['num_sold_min']
    num_sold_max = stats['num_sold_max']
    return y_pred * (num_sold_max - num_sold_min) + num_sold_min

##################################
# 2) CREATE DATASET
##################################
class KaggleStyleDataset(Dataset):
    """
    Row-wise dataset: every DataFrame row is one independent sample.

    The frame passed in must already be fully numeric (categoricals encoded
    by the caller) and must already carry the transformed 'num_sold'.
    Features are every column except 'num_sold'; the target is 'num_sold'.
    Both are stored as float32 NumPy arrays in ``self.X`` and ``self.y``.
    """
    def __init__(self, df):
        feature_frame = df.drop(columns=['num_sold'])
        target_series = df['num_sold']
        self.X = feature_frame.to_numpy().astype(np.float32)
        self.y = target_series.to_numpy().astype(np.float32)

    def __len__(self):
        # One sample per feature row.
        return len(self.X)

    def __getitem__(self, idx):
        # Fresh float32 tensors are built on every access.
        features = torch.tensor(self.X[idx], dtype=torch.float32)
        target = torch.tensor(self.y[idx], dtype=torch.float32)
        return (features, target)

##################################
# 3) FAN MODEL (Minimal Variation)
##################################
# We'll reuse your FANTimeSeriesModel, but adapt so it takes X as shape [Batch, Features]
# rather than [Batch, seq_len, enc_in]. We'll do a simple fix: treat "Features" as "seq_len",
# or wrap it in a single time step. We'll keep the rest the same just for demonstration.
##################################

from layers.SelfAttention_Family import FullAttention, AttentionLayer
from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer
from layers.Embed import DataEmbedding
from layers.FANLayer import FANLayer

class FANForKaggle(nn.Module):
    """
    Minimal tabular wrapper around the FAN encoder/decoder stack.

    Each row of features is presented to the sequence model as a sequence of
    length 1 (seq_len=1, enc_in=feature_dim) so the existing layers can be
    reused unchanged. For a genuinely tabular model one would likely drop the
    sequence-oriented parts of the embedding.

    Args:
        feature_dim: Number of input features per row (used as enc_in/dec_in).
        d_model: Hidden width of the embedding, FAN layer, encoder and decoder.
    """
    def __init__(self, feature_dim=20, d_model=128):
        super().__init__()
        # Previously the width was hard-coded to 128 inside a config class,
        # so this argument was silently ignored; it now drives every layer.
        self.d_model = d_model

        # Fixed hyperparameters of the reused architecture.
        embed = 'timeF'
        freq = 'h'  # presumably implies 4 time features in DataEmbedding — TODO confirm
        dropout = 0.1
        e_layers = 3
        d_layers = 1
        d_ff = 512
        n_heads = 8
        factor = 5
        activation = 'gelu'
        output_attention = False
        exp_setting = 0

        def _attention(mask_flag, return_attention):
            # One multi-head attention block; the first FullAttention argument
            # is passed True only for the decoder's self-attention.
            return AttentionLayer(
                FullAttention(
                    mask_flag,
                    factor,
                    attention_dropout=dropout,
                    output_attention=return_attention,
                ),
                d_model,
                n_heads,
            )

        # Separate embeddings for the encoder and decoder inputs.
        self.enc_embedding = DataEmbedding(feature_dim, d_model, embed, freq, dropout)
        self.dec_embedding = DataEmbedding(feature_dim, d_model, embed, freq, dropout)

        self.fan_layer = FANLayer(d_model, d_model)

        # Encoder
        self.encoder = Encoder(
            [
                EncoderLayer(
                    _attention(False, output_attention),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation,
                    exp_setting=exp_setting,
                )
                for _ in range(e_layers)
            ],
            norm_layer=nn.LayerNorm(d_model),
        )

        # Decoder: self-attention, cross-attention, then a projection to a
        # single output value per time step.
        self.decoder = Decoder(
            [
                DecoderLayer(
                    _attention(True, False),
                    _attention(False, False),
                    d_model,
                    d_ff,
                    dropout=dropout,
                    activation=activation,
                    exp_setting=exp_setting,
                )
                for _ in range(d_layers)
            ],
            norm_layer=nn.LayerNorm(d_model),
            projection=nn.Linear(d_model, 1, bias=True),
        )

    def forward(self, x):
        """
        Predict one value per input row.

        Args:
            x: Tensor of shape [B, feature_dim]. Any other rank raises
               ValueError from the shape unpacking below.

        Returns:
            Tensor of shape [B].
        """
        batch_size, _ = x.shape  # unpacking enforces a 2-D input

        # Expand to [B, 1, feature_dim]; the same tensor feeds both the
        # encoder and the decoder.
        x_enc = x.unsqueeze(1)
        x_dec = x.unsqueeze(1)

        # Dummy all-zero time features [B, 1, 4]: rows carry no timestamp.
        x_mark_enc = torch.zeros(batch_size, 1, 4, device=x.device)
        x_mark_dec = torch.zeros(batch_size, 1, 4, device=x.device)

        enc_out = self.enc_embedding(x_enc, x_mark_enc)
        enc_out = self.fan_layer(enc_out)
        enc_out, _ = self.encoder(enc_out)

        dec_out = self.dec_embedding(x_dec, x_mark_dec)
        dec_out = self.decoder(dec_out, enc_out)  # shape [B, 1, 1]
        return dec_out.squeeze(1).squeeze(-1)  # => [B]

##################################
# 4) MAIN PIPELINE
##################################

_CATEGORICAL_COLS = ('country', 'store', 'product')
_NON_FEATURE_COLS = ('id', 'date')


def _encode_features(df):
    """Drop 'id'/'date' if present and one-hot encode the categorical columns."""
    to_drop = [c for c in _NON_FEATURE_COLS if c in df.columns]
    df = df.drop(columns=to_drop)
    to_encode = [c for c in _CATEGORICAL_COLS if c in df.columns]
    if to_encode:
        df = pd.get_dummies(df, columns=to_encode)
    return df


def _validate(model, loader, criterion, device):
    """Return (mean loss, predictions, targets) of `model` over `loader`."""
    model.eval()
    total_loss = 0.0
    preds, trues = [], []
    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            # Flatten in case the model returns [B, 1] instead of [B].
            vpred = model(batch_x).view(-1)
            total_loss += criterion(vpred, batch_y).item() * len(batch_x)
            preds.append(vpred.cpu().numpy())
            trues.append(batch_y.cpu().numpy())
    mean_loss = total_loss / len(loader.dataset)
    return mean_loss, np.concatenate(preds, axis=0), np.concatenate(trues, axis=0)


def _predict(model, features, device, batch_size=32):
    """Run `model` over a float32 feature matrix and return a 1-D array of outputs."""
    loader = DataLoader(
        TensorDataset(torch.as_tensor(features)),
        batch_size=batch_size,
        shuffle=False,
    )
    model.eval()
    outputs = []
    with torch.no_grad():
        for (batch_x,) in loader:
            outputs.append(model(batch_x.to(device)).cpu().numpy())
    return np.concatenate(outputs, axis=0)


def main():
    """Train the FAN model on the competition data and write ``submission.csv``.

    Pipeline: load CSVs, impute and transform the ``num_sold`` target,
    one-hot encode the categorical features, train with a random 80/20
    train/validation split, predict on the test set, invert the target
    transform and save the submission file.

    Raises:
        KeyError: If the test frame has no ``id`` column. This is checked
            right after loading instead of failing after training.
    """
    # sklearn is only needed by this entry point, so it stays a local import;
    # it is done once here rather than inside the epoch loop.
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
    from sklearn.model_selection import train_test_split

    #### A) LOAD
    train, test, _ = load_data()
    print("Initial train shape:", train.shape)
    print("Initial test shape:", test.shape)

    # The submission needs the test ids; fail before training if they are absent.
    if 'id' not in test.columns:
        raise KeyError("test data must contain an 'id' column to build the submission")
    test_id = test['id'].values

    #### B) Transform train target
    train = fill_missing_mean(train)
    min_val = train['num_sold'].min()
    print("Min of num_sold BEFORE transform:", min_val)

    train, transform_stats = kaggle_transform_num_sold(train)

    #### C) Drop id/date and one-hot encode country, store, product
    train = _encode_features(train)

    # X => everything except 'num_sold'; y => transformed 'num_sold'
    feature_frame = train.drop(columns=['num_sold'])
    feature_cols = list(feature_frame.columns)
    y = train['num_sold'].values
    X = feature_frame.values.astype(np.float32)

    print("Train after transforms:", train.shape)

    #### D) Build datasets (random split, not time-based)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, shuffle=True, random_state=42
    )
    print("X_train:", X_train.shape, "y_train:", y_train.shape)
    print("X_val:", X_val.shape, "y_val:", y_val.shape)

    # Convert to tensors once up front instead of once per sample.
    train_ds = TensorDataset(
        torch.as_tensor(X_train),
        torch.as_tensor(y_train, dtype=torch.float32),
    )
    val_ds = TensorDataset(
        torch.as_tensor(X_val),
        torch.as_tensor(y_val, dtype=torch.float32),
    )

    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=32, shuffle=False)

    #### E) Initialize the FAN-based model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    feature_dim = X_train.shape[1]  # number of columns
    model = FANForKaggle(feature_dim=feature_dim, d_model=64).to(device)

    criterion = nn.MSELoss()
    # lr=1e-4 is the rate that was actually in effect: an earlier Adam
    # instance with lr=1e-3 was overwritten before the first step.
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)

    #### F) TRAIN
    epochs = 50
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            optimizer.zero_grad()
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            loss.backward()
            optimizer.step()
            train_loss += loss.item() * len(batch_x)
        train_loss /= len(train_ds)

        # Validation (all metrics are in transformed target space)
        val_loss, val_preds, val_trues = _validate(model, val_loader, criterion, device)
        scheduler.step(val_loss)

        mse = mean_squared_error(val_trues, val_preds)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(val_trues, val_preds)
        r2 = r2_score(val_trues, val_preds)
        # NOTE(review): the epsilon only prevents division by zero; targets at
        # or near 0 can still dominate this average.
        mape = np.mean(np.abs((val_trues - val_preds) / (val_trues + 1e-8))) * 100

        print(f"[Epoch {epoch+1}/{epochs}] "
            f"train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, "
            f"MSE={mse:.4f}, RMSE={rmse:.4f}, MAE={mae:.4f}, R2={r2:.4f}, MAPE={mape:.2f}%")

    #### G) PREDICTION for Test
    # Test has no 'num_sold', so only the feature encoding is repeated here;
    # the target transform is inverted on the predictions below.
    test = _encode_features(test)

    # Align test with the training feature layout: columns missing in test are
    # added as 0, columns unknown to train are dropped, order matches train.
    test = test.reindex(columns=feature_cols, fill_value=0)
    X_test = test.values.astype(np.float32)

    preds_transformed = _predict(model, X_test, device)  # shape (len(test),)

    # Invert the Kaggle transform to get predictions in raw num_sold units.
    final_preds = invert_kaggle_transform(preds_transformed, transform_stats)

    # Build submission
    submission = pd.DataFrame({
        'id': test_id,
        'num_sold': final_preds
    })
    submission.to_csv("submission.csv", index=False)
    print("submission.csv created with final predictions!")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__=="__main__":
    main()


# Output of the run above:
# Initial train shape: (230130, 6)
# Initial test shape: (98550, 5)
# Min of num_sold BEFORE transform: 5.0
# Train after transforms: (230130, 15)
# X_train: (184104, 14) y_train: (184104,)
# X_val: (46026, 14) y_val: (46026,)
