In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import os


In [2]:

##########################################
# 1) LOADING + SCALING + GROUPING
##########################################

def load_data():
    train = pd.read_csv("playground-series-s5e1/train.csv")
    test  = pd.read_csv("playground-series-s5e1/test.csv")
    sub   = pd.read_csv("playground-series-s5e1/sample_submission.csv")
    return train, test, sub

def fill_missing_mean(df):
    """Fill missing in 'num_sold' with mean (only for train)."""
    df['num_sold'] = df['num_sold'].fillna(df['num_sold'].mean())
    return df

def kaggle_transform_num_sold(df):
    """
    Applies min–max -> log1p -> sqrt -> IQR clip to `num_sold`.
    Returns (df, transform_stats) so we can invert it later.
    """
    num_sold_min = df['num_sold'].min()
    num_sold_max = df['num_sold'].max()

    # 1) min–max
    df['num_sold'] = (df['num_sold'] - num_sold_min) / (num_sold_max - num_sold_min)

    # 2) log1p
    df['num_sold'] = np.log1p(df['num_sold'])

    # 3) sqrt
    df['num_sold'] = np.sqrt(df['num_sold'])

    # 4) IQR clip
    q1 = df['num_sold'].quantile(0.25)
    q3 = df['num_sold'].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    df['num_sold'] = df['num_sold'].clip(lower, upper)

    transform_stats = {
        'num_sold_min': num_sold_min,
        'num_sold_max': num_sold_max,
        'lower_bound': lower,
        'upper_bound': upper
    }
    return df, transform_stats

def invert_kaggle_transform(y_pred_transformed, stats):
    """
    Invert the transform: square -> expm1 -> invert min–max.
    """
    # 1) square
    y_pred = y_pred_transformed**2
    # 2) expm1
    y_pred = np.expm1(y_pred)
    # 3) min–max invert
    num_sold_min = stats['num_sold_min']
    num_sold_max = stats['num_sold_max']
    y_pred = y_pred * (num_sold_max - num_sold_min) + num_sold_min
    return y_pred



In [3]:

##########################################
# 2) BUILD SLIDING WINDOWS BY GROUP
##########################################
def build_group_windows(group_df, seq_len=96, pred_len=1):
    """
    group_df: columns [date, country, store, product, num_sold, ...]
    We'll assume only 'num_sold' is used for the time-series input.
    Return lists of x_seq, y_seq for each sliding window in this group.
    Single-step approach: x_seq => last 96 days, y_seq => next day.
    """
    group_df = group_df.sort_values('date')  # sort by date
    arr = group_df['num_sold'].values  # shape (N,)

    x_list = []
    y_list = []
    # We'll also store the corresponding date of the label so we can track it if needed
    # but for pure training, it's not mandatory. We'll omit for brevity.

    N = len(arr)
    for i in range(N - seq_len - pred_len + 1):
        x_seq = arr[i : i+seq_len]                 # last 96
        y_seq = arr[i+seq_len : i+seq_len+pred_len]# next day
        x_list.append(x_seq)
        y_list.append(y_seq[0])  # single-step => just 1 value

    return x_list, y_list

class MultiSeriesDataset(Dataset):
    """
    Collects sliding windows from all (country, store, product) groups.
    """
    def __init__(self, train_df, seq_len=96):
        self.seq_len = seq_len
        # group by (country, store, product)
        self.samples = []
        gdf = train_df.groupby(['country','store','product'], as_index=False)

        for (coun,st,prod), subdf in gdf:
            x_list, y_list = build_group_windows(subdf, seq_len=seq_len, pred_len=1)
            for x_seq, y_val in zip(x_list, y_list):
                self.samples.append((x_seq, y_val))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        x_seq, y_val = self.samples[idx]
        x_seq = torch.tensor(x_seq, dtype=torch.float32)  # shape [seq_len]
        y_val = torch.tensor(y_val, dtype=torch.float32)
        return x_seq, y_val



In [4]:

##########################################
# 3) FAN MODEL
##########################################
# We'll do a minimal approach: input is [B, seq_len], we pretend that's [B, seq_len, 1].
# Then we do the same FAN-based Transformer approach over time steps.
from layers.SelfAttention_Family import FullAttention, AttentionLayer
from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer
from layers.Embed import DataEmbedding
from layers.FANLayer import FANLayer

class FANTimeSeries(nn.Module):
    """
    A multi-step Transformer for a single time-series input dimension (or we do enc_in=1).
    We interpret x: [B, seq_len] -> [B, seq_len, 1].
    """
    def __init__(self, seq_len=96, d_model=128, e_layers=3, d_layers=1):
        super().__init__()
        class Configs:
            enc_in = 1
            dec_in = 1
            c_out = 1
            d_model = 128
            embed = 'timeF'
            freq = 'h'
            dropout = 0.1
            e_layers = 3
            d_layers = 1
            d_ff = 512
            n_heads = 8
            factor = 5
            activation = 'gelu'
            output_attention = False
            pred_len = 1
            exp_setting = 0
        configs = Configs()

        self.pred_len = configs.pred_len
        self.enc_embedding = DataEmbedding(1, configs.d_model, configs.embed, configs.freq, configs.dropout)
        self.dec_embedding = DataEmbedding(1, configs.d_model, configs.embed, configs.freq, configs.dropout)
        self.fan_layer = FANLayer(configs.d_model, configs.d_model)

        # Encoder
        self.encoder = Encoder(
            [
                EncoderLayer(
                    AttentionLayer(
                        FullAttention(False, configs.factor,
                                      attention_dropout=configs.dropout,
                                      output_attention=configs.output_attention),
                        configs.d_model,
                        configs.n_heads),
                    configs.d_model,
                    configs.d_ff,
                    dropout=configs.dropout,
                    activation=configs.activation,
                    exp_setting=configs.exp_setting
                )
                for _ in range(configs.e_layers)
            ],
            norm_layer=nn.LayerNorm(configs.d_model),
        )

        # Decoder
        self.decoder = Decoder(
            [
                DecoderLayer(
                    AttentionLayer(
                        FullAttention(True, configs.factor, attention_dropout=configs.dropout, output_attention=False),
                        configs.d_model,
                        configs.n_heads,
                    ),
                    AttentionLayer(
                        FullAttention(False, configs.factor, attention_dropout=configs.dropout, output_attention=False),
                        configs.d_model,
                        configs.n_heads,
                    ),
                    configs.d_model,
                    configs.d_ff,
                    dropout=configs.dropout,
                    activation=configs.activation,
                    exp_setting=configs.exp_setting,
                )
                for _ in range(configs.d_layers)
            ],
            norm_layer=nn.LayerNorm(configs.d_model),
            projection=nn.Linear(configs.d_model, 1, bias=True),
        )

    def forward(self, x):
        """
        x: shape [B, seq_len], single feature -> interpret as [B, seq_len, 1].
        We'll do a single-step decode => x_dec shape also [B, 1, 1].
        """
        device = x.device
        B, L = x.shape
        x_enc = x.unsqueeze(-1)  # [B, L, 1]
        # We'll create a single "start token" or zero for the decoder
        x_dec = torch.zeros([B, 1, 1], device=device)

        # dummy time features
        x_mark_enc = torch.zeros([B, L, 4], device=device)
        x_mark_dec = torch.zeros([B, 1, 4], device=device)

        enc_out = self.enc_embedding(x_enc, x_mark_enc)  # [B,L,d_model]
        enc_out = self.fan_layer(enc_out)
        enc_out, _ = self.encoder(enc_out)

        dec_out = self.dec_embedding(x_dec, x_mark_dec)
        dec_out = self.decoder(dec_out, enc_out)  # shape [B,1,1]
        # => single-step forecast
        return dec_out.squeeze(1).squeeze(-1)  # [B]


In [5]:

##########################################
# 4) MAIN PIPELINE
##########################################
def main():
    # A) Load
    train, test, sub = load_data()
    train['date'] = pd.to_datetime(train['date'])
    test['date']  = pd.to_datetime(test['date'])

    # B) Preprocess
    # fill missing in train
    train = fill_missing_mean(train)

    # C) Apply "Kaggle transform" to train's num_sold
    train, transform_stats = kaggle_transform_num_sold(train)

    # D) We'll keep date, country, store, product, but numeric for grouping & sliding windows
    # Sort train by (country, store, product, date)
    train = train.sort_values(['country','store','product','date'])

    # E) Build multi-series dataset with sliding windows
    seq_len = 96
    train_ds = MultiSeriesDataset(train, seq_len=seq_len)
    train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

    # F) Initialize model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = FANTimeSeries(seq_len=seq_len, d_model=64, e_layers=2, d_layers=1).to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    criterion = nn.MSELoss()

    # G) Train loop
    epochs = 50
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        print(f"Allocated Memory: {torch.cuda.memory_allocated(device) / 1024 ** 2:.2f} MB")
        print(f"Cached Memory: {torch.cuda.memory_reserved(device) / 1024 ** 2:.2f} MB")
        for x_seq, y_val in train_loader:
            x_seq, y_val = x_seq.to(device), y_val.to(device)
            optimizer.zero_grad()
            pred = model(x_seq)  # shape [B]
            loss = criterion(pred, y_val)
            loss.backward()
            optimizer.step()
            total_loss += loss.item() * len(x_seq)
        total_loss /= len(train_ds)
        print(f"Epoch {epoch+1}/{epochs}, train_loss={total_loss:.4f}")
        scheduler.step(total_loss)
    # H) Save model
    torch.save(model.state_dict(), "fan_model_timeseries.pth")
    print("Model saved.")

    # I) Build a dictionary of historical data for each group, including the train portion
    # for easy inference
    # We'll store *transformed* num_sold in a structure so we can keep rolling forward
    history = {}
    for (c,s,p), gdf in train.groupby(["country","store","product"]):
        gdf = gdf.sort_values("date")
        history[(c,s,p)] = list(gdf['num_sold'].values)  # store as a list of transformed values

    # J) Inference on test
    # We'll sort test by date and do a rolling approach: for each row => build the last 96 from that group
    test = test.sort_values(["country","store","product","date"])

    predictions = []
    model.eval()
    with torch.no_grad():
        for i, row in test.iterrows():
            c = row['country']
            s = row['store']
            p = row['product']
            # retrieve that group's history
            if (c,s,p) not in history:
                history[(c,s,p)] = []  # if group not in train, start empty

            group_hist = history[(c,s,p)]

            # Build input x_seq
            if len(group_hist) < seq_len:
                # pad with zeros if not enough history
                padded = [0.0]*(seq_len - len(group_hist)) + group_hist
                x_seq = np.array(padded[-seq_len:], dtype=np.float32)
            else:
                x_seq = np.array(group_hist[-seq_len:], dtype=np.float32)

            x_seq_ten = torch.tensor([x_seq], device=device)  # shape [1, seq_len]
            pred_t = model(x_seq_ten)  # shape [1]
            pred_val = pred_t.item()   # transformed
            # store it into group_hist for future windows
            group_hist.append(pred_val)

            # invert
            num_sold_pred = invert_kaggle_transform(np.array([pred_val], dtype=np.float32), transform_stats)[0]
            predictions.append(num_sold_pred)

    # K) Build submission
    submission = pd.DataFrame({
        'id': test['id'],
        'num_sold': predictions
    })
    submission.to_csv("submission.csv", index=False)
    print("submission.csv created!")


if __name__=="__main__":
    main()


Allocated Memory: 8.22 MB
Cached Memory: 24.00 MB


KeyboardInterrupt: 