<a href="https://colab.research.google.com/github/ZerXXX0/sales-prediction/blob/main/FTTransformer_MLQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [83]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

In [84]:
train_df = pd.read_csv('https://raw.githubusercontent.com/ZerXXX0/sales-prediction/refs/heads/main/dataset/train_final.csv')
test_df = pd.read_csv('https://raw.githubusercontent.com/ZerXXX0/sales-prediction/refs/heads/main/dataset/test_final.csv')

In [85]:
# === Preprocessing ===
drop_cols = ['Unnamed: 0', 'TransactionID', 'MemberID']
train_df_clean = train_df.drop(columns=drop_cols)

X = train_df_clean.drop(columns=['next_buy'])
y = train_df_clean['next_buy']

# ✅ New: exclude target column
train_cols = [col for col in train_df_clean.columns if col not in ['next_buy']]

imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)

In [86]:
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y)

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.long)

In [87]:
train_loader = DataLoader(TensorDataset(X_train_tensor, y_train_tensor), batch_size=1024, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_tensor, y_val_tensor), batch_size=1024)

In [88]:
# === FT-Transformer Model ===
class FTTransformer(nn.Module):
    def __init__(self, input_dim, dim=64, depth=3, heads=4, dropout=0.1):
        super().__init__()
        self.embedding = nn.Linear(input_dim, dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=dim, nhead=heads, dropout=dropout, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=depth)
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, 2)
        )

    def forward(self, x):
        B = x.shape[0]
        x = self.embedding(x).unsqueeze(1)
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        x = self.transformer(x)
        return self.head(x[:, 0])


In [89]:
# class balancing
classes = np.unique(y_train)  # ensures correct classes
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)

In [90]:
# === Training Loop ===
model = FTTransformer(input_dim=X_train.shape[1],
                      dim=128,         # increase hidden size
                      depth=4,         # more transformer layers
                      heads=8,         # more attention heads
                      dropout=0.2      # add more regularization
                      )
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor.to(device))
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3, factor=0.5)
model.to(device)

FTTransformer(
  (embedding): Linear(in_features=8, out_features=128, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.2, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.2, inplace=False)
        (dropout2): Dropout(p=0.2, inplace=False)
      )
    )
  )
  (head): Sequential(
    (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (1): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [91]:
def train_model(model, train_loader, val_loader, epochs=50, save_path="best_ft_transformer.pth"):
    best_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        total_loss = 0

        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            preds = model(xb)
            loss = criterion(preds, yb)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # === Validation ===
        model.eval()
        val_loss = 0
        correct = total = 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                preds = model(xb)
                loss = criterion(preds, yb)
                val_loss += loss.item()

                predicted = torch.argmax(preds, dim=1)
                correct += (predicted == yb).sum().item()
                total += yb.size(0)

        val_loss /= len(val_loader)
        val_acc = correct / total * 100

        # Scheduler step on validation accuracy
        scheduler.step(val_acc)

        # Save the model with the lowest val loss
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), save_path)
            print(f"✅ Saved new best model (epoch {epoch+1}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.2f}%)")

        print(f"Epoch {epoch+1}: Train Loss={total_loss:.4f}, Val Loss={val_loss:.4f}, Val Acc={val_acc:.2f}%")


In [92]:
train_model(model, train_loader, val_loader, epochs=200)

✅ Saved new best model (epoch 1, val_loss: 0.4797, val_acc: 65.63%)
Epoch 1: Train Loss=58.7218, Val Loss=0.4797, Val Acc=65.63%
✅ Saved new best model (epoch 2, val_loss: 0.4782, val_acc: 68.44%)
Epoch 2: Train Loss=49.2192, Val Loss=0.4782, Val Acc=68.44%
✅ Saved new best model (epoch 3, val_loss: 0.4664, val_acc: 69.81%)
Epoch 3: Train Loss=48.9498, Val Loss=0.4664, Val Acc=69.81%
Epoch 4: Train Loss=48.2853, Val Loss=0.4668, Val Acc=67.73%
✅ Saved new best model (epoch 5, val_loss: 0.4606, val_acc: 70.31%)
Epoch 5: Train Loss=48.0459, Val Loss=0.4606, Val Acc=70.31%
Epoch 6: Train Loss=47.9798, Val Loss=0.4623, Val Acc=74.11%
Epoch 7: Train Loss=47.8494, Val Loss=0.4626, Val Acc=71.74%
✅ Saved new best model (epoch 8, val_loss: 0.4587, val_acc: 71.27%)
Epoch 8: Train Loss=47.5241, Val Loss=0.4587, Val Acc=71.27%
Epoch 9: Train Loss=47.8203, Val Loss=0.4661, Val Acc=73.37%
Epoch 10: Train Loss=47.4492, Val Loss=0.4617, Val Acc=69.95%
✅ Saved new best model (epoch 11, val_loss: 0.451

In [93]:
# Re-create the model
model = FTTransformer(
    input_dim=8,  # from embedding.weight shape
    dim=128,      # from all transformer and linear layer dimensions
    depth=4,      # because there are 2 transformer layers in the checkpoint
    heads=8,      # from self-attention weight shape (128 * 3)
    dropout=0.1   # or whatever you used before
)

model.load_state_dict(torch.load("best_ft_transformer.pth"))
model.eval()

FTTransformer(
  (embedding): Linear(in_features=8, out_features=128, bias=True)
  (transformer): TransformerEncoder(
    (layers): ModuleList(
      (0-3): 4 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (head): Sequential(
    (0): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (1): Linear(in_features=128, out_features=2, bias=True)
  )
)

In [94]:
# === Preprocess test set ===
member_ids = test_df["MemberID"]  # Save MemberID before dropping

drop_cols = ['Unnamed: 0', 'TransactionID', 'MemberID']
test_df_clean = test_df.drop(columns=drop_cols)

# ✅ Reorder test columns to match train
test_df_clean = test_df_clean[train_cols]

# Use the same preprocessing steps
X_test_imputed = imputer.transform(test_df_clean)

X_test_scaled = scaler.transform(X_test_imputed)

X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

In [95]:
# === Predict ===
with torch.no_grad():
    preds = model(X_test_tensor)
    predicted_classes = torch.argmax(preds, dim=1).numpy()

# === Export predictions with MemberID ===
output = pd.DataFrame({
    "MemberID": member_ids,
    "next_buy_predicted": predicted_classes
})

output.to_csv("ft_transformer_predictions.csv", index=False)
print("✅ Predictions saved to ft_transformer_predictions.csv")

✅ Predictions saved to ft_transformer_predictions.csv


In [96]:
submission_df = pd.read_csv("ft_transformer_predictions.csv")

In [97]:
# prompt: rename a column

submission_df = submission_df.rename(columns={"next_buy_predicted": "next_buy"})
submission_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21098 entries, 0 to 21097
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   MemberID  21098 non-null  object
 1   next_buy  21098 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 329.8+ KB


In [98]:
submit_df = pd.read_csv("https://raw.githubusercontent.com/ZerXXX0/sales-prediction/refs/heads/main/dataset/sample_submission.csv")
# First, create a new dataframe from submission_df with duplicate MemberIDs removed.
# We keep the 'last' entry for each member.
submission_df_unique = submission_df.drop_duplicates(subset=['MemberID'], keep='last')

# Now, create the lookup map from this de-duplicated dataframe.
# This will succeed because the 'MemberID' index is now unique.
next_buy_lookup = submission_df_unique.set_index('MemberID')['next_buy']

# Proceed with the map operation as before. This will now work correctly.
submit_df['next_buy'] = submit_df['MemberID'].map(next_buy_lookup)

In [99]:
submit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6381 entries, 0 to 6380
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   MemberID  6381 non-null   object
 1   next_buy  6381 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 99.8+ KB


In [100]:
submit_df.to_csv('submission_FTTransformer.csv', index=False)