In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
import torch
from sklearn.metrics import classification_report, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime
from tqdm import tqdm
from torch.optim import LBFGS
import torch.nn as nn
from scipy.io import arff
from native_sparse_attention_pytorch import SparseAttention
from torch.utils.data import DataLoader, TensorDataset 
import os
import tabmixer
from tabmixer import TabMixer

import sys
!{sys.executable} -m pip install transformers

from huggingface_hub import login

# Authenticate with the Hugging Face Hub without embedding a secret in the
# notebook: the token is read from the HF_TOKEN environment variable. When the
# variable is unset, `token=None` lets `login` fall back to its own prompt.
# NOTE(review): a token used to be hard-coded on this line; it has been
# exposed through the notebook and should be revoked on the Hub.
login(token=os.environ.get("HF_TOKEN"))

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Width of the feature vector the model's adapter produces from the text
# embedding (passed as `num_features` when BertTabNSA is built).
NUM_FEATURES = 64  # you can change this
# Number of Optuna trials (used as `n_trials` in `study.optimize`).
TRIAL = 2
# Number of training epochs handed to `fit`. The spelling is kept as-is
# because other cells refer to this exact name.
EPOTCHES = 3

# Dataset folder name interpolated into the CSV paths below; switch datasets
# by swapping which of the two assignments is commented out.
# DATA = "Credit-g(CG)"
DATA = "Credit-approval(CA)"


  from .autonotebook import tqdm as notebook_tqdm




In [2]:
# model = BertTabNSA(
#     num_features=NUM_FEATURES,
#     num_classes=NUM_CLASSES,
#     **tabnsa_params
# )


In [None]:
def load_dataset_auto(filepath):
    """Read a CSV file and split its column names into features and label.

    Parameters
    ----------
    filepath
        Path of the CSV file, handed unchanged to ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(df, feature_cols, label_col)``: the loaded DataFrame, a list with
        every column name except the last one, and the name of the last
        column (treated as the label).
    """
    frame = pd.read_csv(filepath)
    column_names = frame.columns.tolist()
    # By convention of this notebook the label is stored in the final column.
    return frame, column_names[:-1], column_names[-1]

def row_to_prompt(row, feature_cols):
    """Render one record as a comma-separated "<column> is <value>" sentence.

    Parameters
    ----------
    row
        Mapping-like object indexed by column name (e.g. a DataFrame row).
    feature_cols
        Column names to include, in the order they should appear.

    Returns
    -------
    str
        The fragments joined with ``", "``; an empty string when
        ``feature_cols`` is empty.
    """
    fragments = []
    for name in feature_cols:
        fragments.append(f"{name} is {row[name]}")
    return ", ".join(fragments)

def prepare_prompt_data(filepath):
    """Load a CSV file and turn every row into a text prompt plus its label.

    Parameters
    ----------
    filepath
        Path of the CSV file; forwarded to ``load_dataset_auto``.

    Returns
    -------
    pandas.DataFrame
        A new two-column frame: ``prompt`` (the row rendered by
        ``row_to_prompt`` over the feature columns) and ``label`` (the value
        of the file's last column).
    """
    df, feature_cols, label_col = load_dataset_auto(filepath)
    df["prompt"] = df.apply(lambda row: row_to_prompt(row, feature_cols), axis=1)
    df["label"] = df[label_col]
    # Return an explicit copy: callers add columns to the result (e.g.
    # "label_id"), and assigning into a bare column selection can trigger
    # pandas' SettingWithCopyWarning.
    return df[["prompt", "label"]].copy()



# Load the four CSV splits of the selected dataset as prompt/label frames.
# Later cells use F1P1 for training, F1P2 for validation, F2P1 for the
# few-shot training pass and F2P2 for the final test.
# NOTE(review): the paths are absolute and machine-specific; consider a
# configurable data directory.
F1P1 = prepare_prompt_data(f"/home/data3/Ali/Code/DATA/{DATA}/ZSL-LLM/features1_part1.csv")
F1P2 = prepare_prompt_data(f"/home/data3/Ali/Code/DATA/{DATA}/ZSL-LLM/features1_part2.csv")
F2P1 = prepare_prompt_data(f"/home/data3/Ali/Code/DATA/{DATA}/ZSL-LLM/features2_part1.csv")
F2P2 = prepare_prompt_data(f"/home/data3/Ali/Code/DATA/{DATA}/ZSL-LLM/features2_part2.csv")


In [4]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training split (F1P1) only.
le = LabelEncoder().fit(F1P1["label"])

# Attach the integer ids to every split in place, always with that one mapping.
for df in (F1P1, F1P2, F2P1, F2P2):
    df["label_id"] = le.transform(df["label"])

In [5]:
from transformers import BertTokenizer
from transformers import RobertaModel, RobertaTokenizer


# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Tokenizer for the same "roberta-base" checkpoint that BertTabNSA loads as
# its text encoder.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")


def tokenize_dataset(df):
    """Tokenize the ``prompt`` column of ``df`` with the module-level tokenizer.

    Every prompt is padded or truncated to exactly 128 tokens and the result
    is requested as PyTorch tensors.

    Parameters
    ----------
    df
        Frame-like object whose ``"prompt"`` column holds the texts.

    Returns
    -------
    Whatever the tokenizer call returns for the whole batch of prompts.
    """
    prompts = df["prompt"].tolist()
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
        "return_tensors": "pt",
    }
    return tokenizer(prompts, **tokenizer_options)

In [6]:
import torch
from torch.utils.data import Dataset

class PromptDataset(Dataset):
    """Map-style dataset pairing tokenized prompts with integer labels.

    Parameters
    ----------
    encodings
        Mapping with ``"input_ids"`` and ``"attention_mask"`` entries that can
        be indexed by sample position.
    labels
        Sequence of labels; converted once to a tensor with ``torch.tensor``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = torch.tensor(labels)

    def __len__(self):
        # One sample per label.
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice both encoding fields at the same position, then add the label.
        sample = {
            field: self.encodings[field][idx]
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = self.labels[idx]
        return sample

In [7]:
# # Load
# F1P1 = pd.read_csv("/home/data3/Ali/Code/DATA/Credit-g(CG)/ZSL-LLM/features1_part1.csv")
# F1P2 = pd.read_csv("/home/data3/Ali/Code/DATA/Credit-g(CG)/ZSL-LLM/features1_part2.csv")
# F2P1 = pd.read_csv("/home/data3/Ali/Code/DATA/Credit-g(CG)/ZSL-LLM/features2_part1.csv")
# F2P2 = pd.read_csv("/home/data3/Ali/Code/DATA/Credit-g(CG)/ZSL-LLM/features2_part2.csv")

# Encode labels as integers
from sklearn.preprocessing import LabelEncoder
# Fit the encoder on the training split, then reuse the same mapping for the
# other three splits so the integer ids agree everywhere.
le = LabelEncoder()
F1P1["label_id"] = le.fit_transform(F1P1["label"])
F1P2["label_id"] = le.transform(F1P2["label"])
F2P1["label_id"] = le.transform(F2P1["label"])
F2P2["label_id"] = le.transform(F2P2["label"])

# Tokenize the prompts of each split (train, validation, few-shot, test).
train_enc, val_enc, few_enc, test_enc = (
    tokenize_dataset(split) for split in (F1P1, F1P2, F2P1, F2P2)
)

# Wrap every encoding with the matching integer labels.
train_dataset, val_dataset, few_dataset, test_dataset = (
    PromptDataset(encoding, split["label_id"].tolist())
    for encoding, split in zip(
        (train_enc, val_enc, few_enc, test_enc),
        (F1P1, F1P2, F2P1, F2P2),
    )
)


## Model

In [8]:
class TabNSA(nn.Module):
    """Classifier that treats every scalar input feature as one token.

    Each feature value is embedded into a 64-dimensional vector; the token
    sequence is then processed by two parallel branches (``SparseAttention``
    and ``TabMixer``), whose outputs are summed, averaged over the tokens and
    fed to a small MLP head.

    Parameters
    ----------
    input_shape
        Number of input features; forwarded to ``TabMixer`` as ``dim_tokens``.
    output_shape
        Size of the output layer (number of logits).
    dim_head, heads, sliding_window_size, compress_block_size,
    selection_block_size, num_selected_blocks
        Forwarded unchanged to ``SparseAttention``.
    """

    def __init__(self, input_shape, output_shape, dim_head, heads, sliding_window_size, compress_block_size, selection_block_size, num_selected_blocks):
        super().__init__()

        # Embedding width shared by both branches and the head.
        self.dim = 64
        self.feature_embedding = nn.Linear(1, self.dim)

        attention_config = dict(
            dim_head=dim_head,
            heads=heads,
            sliding_window_size=sliding_window_size,
            compress_block_size=compress_block_size,
            selection_block_size=selection_block_size,
            num_selected_blocks=num_selected_blocks,
        )
        self.attention = SparseAttention(dim=self.dim, **attention_config)

        self.tabmixer = TabMixer(
            dim_tokens=input_shape,
            dim_features=self.dim,
            dim_feedforward=256,
        )

        self.head = nn.Sequential(
            nn.Linear(self.dim, 32),
            nn.GELU(),
            nn.Linear(32, output_shape),
        )

    def forward(self, x):
        """Return logits for ``x`` (expected as [B, F]: one scalar per feature)."""
        # [B, F] -> [B, F, 1] -> [B, F, D]: embed every scalar on its own.
        tokens = self.feature_embedding(x.unsqueeze(-1))
        # Both branches read the same tokens; their outputs are added.
        mixed = self.attention(tokens) + self.tabmixer(tokens)
        # Average over the feature/token axis -> [B, D].
        pooled = mixed.mean(dim=1)
        return self.head(pooled)


In [9]:
from transformers import BertModel
import torch.nn as nn

class BertTabNSA(nn.Module):
    """Text encoder ("roberta-base") followed by a linear adapter and TabNSA.

    The hidden state at sequence position 0 is projected to ``num_features``
    values, which are then classified by ``TabNSA`` as if they were tabular
    features.

    Parameters
    ----------
    num_features
        Output width of the adapter and number of input features of TabNSA.
    num_classes
        Number of output logits.
    **tabnsa_params
        Remaining keyword arguments, forwarded unchanged to ``TabNSA``.
    """

    def __init__(self, num_features, num_classes, **tabnsa_params):
        super().__init__()
        # self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.bert = RobertaModel.from_pretrained("roberta-base")
        # Take the adapter's input width from the encoder configuration rather
        # than hard-coding 768, so the layer always matches the loaded encoder.
        hidden_size = self.bert.config.hidden_size
        self.adapter = nn.Linear(hidden_size, num_features)
        self.tabnsa = TabNSA(
            input_shape=num_features,
            output_shape=num_classes,
            **tabnsa_params
        )

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch of tokenized prompts."""
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state of the first token of every sequence: [B, hidden_size].
        cls_embed = bert_output.last_hidden_state[:, 0, :]
        tab_input = self.adapter(cls_embed)  # [B, num_features]
        return self.tabnsa(tab_input)


In [10]:

# Number of distinct labels known to the fitted LabelEncoder `le`; passed to
# the model as `num_classes` and to `evaluate_model` to pick the AUC variant.
NUM_CLASSES = len(le.classes_)  # number of label classes
print(NUM_CLASSES)

# tabnsa_params = {
#     "dim_head": 16,
#     "heads": 4,
#     "sliding_window_size": 4,
#     "compress_block_size": 4,
#     "selection_block_size": 2,
#     "num_selected_blocks": 2
# }


2


In [11]:
# model = BertTabNSA(
#     num_features=NUM_FEATURES,
#     num_classes=NUM_CLASSES,
#     **tabnsa_params
# )


In [12]:
# def objective(trial):
#     # Hyperparameters
#     dim_head = trial.suggest_int("dim_head", 8, 64, step=8)
#     heads = trial.suggest_int("heads", 1, 8)
#     sliding_window_size = trial.suggest_int("sliding_window_size", 1, 8)
#     compress_block_size = trial.suggest_int("compress_block_size", 4, 16, step=4)
#     selection_block_size = trial.suggest_int("selection_block_size", 2, compress_block_size, step=2)
#     num_selected_blocks = trial.suggest_int("num_selected_blocks", 1, 4)
#     learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True)
#     batch_size = trial.suggest_int("batch_size", 16, 64, step=16)

#     # Model
#     model = BertTabNSA(
#         num_features=NUM_FEATURES,
#         num_classes=NUM_CLASSES,
#         dim_head=dim_head,
#         heads=heads,
#         sliding_window_size=sliding_window_size,
#         compress_block_size=compress_block_size,
#         selection_block_size=selection_block_size,
#         num_selected_blocks=num_selected_blocks
#     ).to(device)

#     optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
#     criterion = nn.CrossEntropyLoss()

#     # DataLoader
#     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
#     val_loader = DataLoader(val_dataset, batch_size=64)

#     # Train
#     for epoch in range(3):  # Keep short for tuning
#         model.train()
#         for batch in train_loader:
#             input_ids = batch["input_ids"].to(device)
#             attention_mask = batch["attention_mask"].to(device)
#             labels = batch["labels"].to(device)

#             outputs = model(input_ids, attention_mask)
#             loss = criterion(outputs, labels)

#             optimizer.zero_grad()
#             loss.backward()
#             optimizer.step()

#     # Evaluate on validation set
#     model.eval()
#     all_preds, all_labels = [], []
#     with torch.no_grad():
#         for batch in val_loader:
#             input_ids = batch["input_ids"].to(device)
#             attention_mask = batch["attention_mask"].to(device)
#             labels = batch["labels"].to(device)

#             outputs = model(input_ids, attention_mask)
#             preds = torch.argmax(outputs, dim=1)

#             all_preds.extend(preds.cpu().numpy())
#             all_labels.extend(labels.cpu().numpy())

#     from sklearn.metrics import accuracy_score
#     val_acc = accuracy_score(all_labels, all_preds)
#     return val_acc

## Training and evaluation helpers

In [13]:
def fit(model, criterion, optimizer, train_loader, device, epochs=3):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Parameters
    ----------
    model
        Module called as ``model(input_ids, attention_mask)``; moved to
        ``device`` and switched to training mode.
    criterion
        Loss called as ``criterion(outputs, labels)``.
    optimizer
        Optimizer already bound to the model's parameters.
    train_loader
        Iterable of batches; each batch is a mapping with ``"input_ids"``,
        ``"attention_mask"`` and ``"labels"`` tensors.
    device
        Device the model and every batch are moved to.
    epochs
        Number of passes over ``train_loader``.

    Returns
    -------
    dict
        ``{"train_loss": [...]}`` with one sample-weighted average loss per
        epoch (``nan`` for an epoch in which no sample was processed).
    """
    history = {"train_loss": []}

    model.to(device)
    model.train()

    for epoch in range(epochs):
        epoch_loss = 0.0
        samples_seen = 0
        for batch in train_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight the batch loss by its size so that a smaller final batch
            # does not skew the epoch average.
            batch_len = input_ids.size(0)
            epoch_loss += loss.item() * batch_len
            samples_seen += batch_len

        # Normalize by the samples actually processed, not by the dataset
        # size: the two differ when the loader drops or sub-samples data
        # (e.g. drop_last=True), and an empty loader must not divide by zero.
        avg_loss = epoch_loss / samples_seen if samples_seen else float("nan")
        history["train_loss"].append(avg_loss)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

    return history


In [14]:
from sklearn.metrics import accuracy_score, roc_auc_score

def evaluate_model(model, val_loader, num_classes, device, return_auc=False):
    """Score ``model`` on ``val_loader`` with accuracy and, optionally, ROC AUC.

    The model is switched to evaluation mode and left in that mode.

    Parameters
    ----------
    model
        Module called as ``model(input_ids, attention_mask)`` returning logits.
    val_loader
        Iterable of batches with ``"input_ids"``, ``"attention_mask"`` and
        ``"labels"`` tensors.
    num_classes
        Number of classes; selects the binary or the one-vs-one AUC.
    device
        Device every batch is moved to.
    return_auc
        When true (and ``num_classes`` is at least 2) the AUC is returned too.

    Returns
    -------
    float or tuple
        ``acc`` alone, or ``(acc, auc)`` when an AUC was requested and
        ``num_classes >= 2``.
    """
    model.eval()
    all_preds, all_probs, all_labels = [], [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            # Class probabilities, then the most likely class per sample.
            probs = torch.softmax(model(input_ids, attention_mask), dim=1)
            preds = probs.argmax(dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    acc = accuracy_score(all_labels, all_preds)

    if not return_auc:
        return acc
    if num_classes == 2:
        # Binary case: score with the probability of class 1.
        positive_scores = [row[1] for row in all_probs]
        return acc, roc_auc_score(all_labels, positive_scores)
    if num_classes > 2:
        return acc, roc_auc_score(all_labels, all_probs, multi_class='ovo')
    # Fewer than two classes: no AUC is computed, only the accuracy is returned.
    return acc


In [15]:
def objective(trial):
    """Optuna objective: train one BertTabNSA configuration, return its AUC.

    Samples the attention hyperparameters, the learning rate and the batch
    size from ``trial``, trains a fresh model on ``train_dataset`` for
    ``EPOTCHES`` epochs and evaluates it on ``val_dataset``.

    Parameters
    ----------
    trial
        Optuna trial used to sample the hyperparameters.

    Returns
    -------
    The validation AUC reported by ``evaluate_model``.
    """
    # Sample the model hyperparameters; the upper bound of
    # selection_block_size is the compress_block_size sampled just before it.
    nsa_kwargs = {
        "dim_head": trial.suggest_int("dim_head", 8, 64, step=8),
        "heads": trial.suggest_int("heads", 1, 8),
        "sliding_window_size": trial.suggest_int("sliding_window_size", 1, 8),
        "compress_block_size": trial.suggest_int("compress_block_size", 4, 16, step=4),
    }
    nsa_kwargs["selection_block_size"] = trial.suggest_int(
        "selection_block_size", 2, nsa_kwargs["compress_block_size"], step=2
    )
    nsa_kwargs["num_selected_blocks"] = trial.suggest_int("num_selected_blocks", 1, 4)

    # Training hyperparameters.
    lr = trial.suggest_float("learning_rate", 1e-5, 1e-4, log=True)
    train_batch_size = trial.suggest_int("batch_size", 16, 64, step=16)

    # A fresh model, optimizer and loss for every trial.
    model = BertTabNSA(
        num_features=NUM_FEATURES,
        num_classes=NUM_CLASSES,
        **nsa_kwargs
    ).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Only the training loader is shuffled and uses the sampled batch size.
    train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)

    fit(model, criterion, optimizer, train_loader, device, epochs=EPOTCHES)

    acc, auc = evaluate_model(model, val_loader, num_classes=NUM_CLASSES, device=device, return_auc=True)
    print(f"Validation Accuracy: {acc:.4f}, AUC: {auc:.4f}")

    return auc

## Optimize

In [16]:
import optuna

# Create a study that maximizes the value returned by `objective`
# (the validation AUC) and run TRIAL trials.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=TRIAL)  # Increase for better tuning
# Hyperparameters of the best-scoring trial; reused below to build and train
# the final model.
best_params = study.best_params
print("Best parameters:", best_params)


[I 2025-05-15 23:20:22,960] A new study created in memory with name: no-name-788908ce-f59a-4a0e-b65c-b12029bd2c97
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Loss: 0.6984
Epoch 2/3 - Loss: 0.6875


[I 2025-05-15 23:20:42,657] Trial 0 finished with value: 0.46969696969696967 and parameters: {'dim_head': 16, 'heads': 4, 'sliding_window_size': 8, 'compress_block_size': 16, 'selection_block_size': 10, 'num_selected_blocks': 3, 'learning_rate': 7.021517014772568e-05, 'batch_size': 16}. Best is trial 0 with value: 0.46969696969696967.


Epoch 3/3 - Loss: 0.6866
Validation Accuracy: 0.5217, AUC: 0.4697


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3 - Loss: 0.6873
Epoch 2/3 - Loss: 0.6680
Epoch 3/3 - Loss: 0.6173
Validation Accuracy: 0.7246, AUC: 0.7668


[I 2025-05-15 23:20:59,629] Trial 1 finished with value: 0.7668350168350168 and parameters: {'dim_head': 40, 'heads': 8, 'sliding_window_size': 3, 'compress_block_size': 4, 'selection_block_size': 2, 'num_selected_blocks': 2, 'learning_rate': 2.622396095322153e-05, 'batch_size': 64}. Best is trial 1 with value: 0.7668350168350168.


Best parameters: {'dim_head': 40, 'heads': 8, 'sliding_window_size': 3, 'compress_block_size': 4, 'selection_block_size': 2, 'num_selected_blocks': 2, 'learning_rate': 2.622396095322153e-05, 'batch_size': 64}


## Test 

In [17]:
# Split the best trial's hyperparameters: the two training settings are pulled
# out, and whatever remains is forwarded to the model constructor.
tabnsa_params = dict(best_params)
learning_rate = tabnsa_params.pop("learning_rate")
batch_size = tabnsa_params.pop("batch_size")

# Build the final model with the tuned architecture and move it to the device.
model = BertTabNSA(num_features=NUM_FEATURES, num_classes=NUM_CLASSES, **tabnsa_params).to(device)


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# from torch.utils.data import DataLoader

# learning_rate = best_params["learning_rate"]
# batch_size = best_params["batch_size"]


# model = BertTabNSA(num_features=NUM_FEATURES, num_classes=NUM_CLASSES, **best_params).to(device)

# Optimizer and loss for the final model, using the tuned learning rate.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

# DataLoader
# The few-shot split is shuffled and uses the tuned batch size; the test
# split is read in order with a fixed batch size of 64.
few_loader = DataLoader(few_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64)

# NOTE(review): `model` was newly constructed in the previous cell, so apart
# from the pretrained "roberta-base" encoder weights the only training it
# receives is this pass over `few_dataset`; it is never trained on
# `train_dataset`. Confirm this is the intended few-shot protocol.
fit(model, criterion, optimizer, few_loader, device, epochs=EPOTCHES)

# Final accuracy and AUC on the held-out test split.
acc, auc = evaluate_model(model, test_loader, num_classes=NUM_CLASSES, device=device, return_auc=True)

print(f"Accuracy: {acc:.4f}")
print(f"AUC:      {auc:.4f}")

Epoch 1/3 - Loss: 0.6993
Epoch 2/3 - Loss: 0.6522
Epoch 3/3 - Loss: 0.5829
Accuracy: 0.8261
AUC:      0.7960
