In [None]:
import pandas as pd

# Read the pre-filtered train and test splits from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("filtered_train (1).csv", "filtered_test (1).csv")
)

# Cell output: (rows, columns) of each split.
(train_df.shape, test_df.shape)

((135951, 3), (46435, 3))

In [None]:
from transformers import AutoModel, AutoTokenizer

# Checkpoint id; the tokenizer and the encoder are both loaded from this same id.
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Backbone only: the regression/classification heads are added later by CrossEncoder.
encoder = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [None]:
from collections import defaultdict

# Per-query accumulators for each split, keyed by the query string:
#   prod_groups_*  -> list of `product_input` texts seen for that query
#   label_groups_* -> list of `esci_label` values (as floats), in the same order
# They start empty and are filled in place by get_dicts().
prod_groups_train = defaultdict(list)
prod_groups_test  = defaultdict(list)
label_groups_train = defaultdict(list)
label_groups_test  = defaultdict(list)

def get_dicts(df, prod_groups, label_groups):
  """
  Group product texts and relevance labels by query, filling the given dicts in place.

  df: DataFrame with the columns "query", "product_input" and "esci_label".
  prod_groups: mapping query -> list of product texts; appended to in row order.
  label_groups: mapping query -> list of float labels; appended to in row order,
                so it stays index-aligned with prod_groups.

  Both mappings may be plain dicts or defaultdicts. Returns None.
  """
  # Walk the three columns in lockstep instead of DataFrame.iterrows(): iterrows
  # builds a Series object for every row, which is needlessly slow on large frames.
  rows = zip(df["query"], df["product_input"], df["esci_label"])
  for query, product, raw_label in rows:
    # Convert first so a bad label leaves both mappings untouched for this row.
    relevance = float(raw_label)

    # setdefault works for defaultdicts and for ordinary dicts alike.
    prod_groups.setdefault(query, []).append(product)
    label_groups.setdefault(query, []).append(relevance)

# Fill the per-query groups for each split (get_dicts mutates the dicts in place).
get_dicts(train_df, prod_groups_train, label_groups_train)
get_dicts(test_df, prod_groups_test, label_groups_test)

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [None]:
import torch
from torch.utils.data import Dataset

class ESCIDataset(Dataset):
    """
    Flat dataset of (query, product) pairs built from per-query groups.

    Each item is the tokenizer output for one pair (leading batch axis removed)
    plus two targets: "reg_label" (the graded relevance as a float tensor) and
    "cls_label" (the matching class id as a long tensor).
    """

    def __init__(self, tokenizer, prod_groups, label_groups, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pairs, self.reg_labels, self.cls_labels = [], [], []

        # Graded relevance -> class id: 0.0 (I) -> 0, 0.01 (C) -> 1,
        # 0.1 (S) -> 2, 1.0 (E) -> 3. The gap between the values is meant to
        # favour promoting 1.0 and 0.1 over 0.01 and 0.0.
        score_to_index = {0.0: 0, 0.01: 1, 0.1: 2, 1.0: 3}

        for query in prod_groups:
            products = prod_groups[query]
            # Labels are matched to products by position within the query group.
            for position, relevance in enumerate(label_groups[query]):
                self.pairs.append((query, products[position]))
                self.reg_labels.append(relevance)
                self.cls_labels.append(score_to_index[relevance])

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        query, product = self.pairs[idx]
        regression_target = self.reg_labels[idx]
        class_target = self.cls_labels[idx]

        features = self.tokenizer(
            query,
            product,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # The tokenizer returns a batch of one; drop that leading axis.
        item = {name: tensor.squeeze(0) for name, tensor in features.items()}
        item["reg_label"] = torch.tensor(regression_target, dtype=torch.float)
        item["cls_label"] = torch.tensor(class_target, dtype=torch.long)
        return item

In [None]:
import torch.nn as nn
class CrossEncoder(nn.Module):
    """
    Shared encoder with two linear heads on the first-token representation.

    forward() returns (reg_logits, cls_logits): one regression score per input
    (trailing axis squeezed away) and 4 classification logits per input.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder
        width = encoder.config.hidden_size
        self.reg_head = nn.Linear(width, 1)  # scalar relevance score
        self.cls_head = nn.Linear(width, 4)  # 4-way class logits

    def forward(self, input_ids, attention_mask):
        hidden = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Position 0 of the sequence is used as the pooled representation.
        first_token = hidden[:, 0, :]
        reg_logits = self.reg_head(first_token).squeeze(-1)
        cls_logits = self.cls_head(first_token)
        return reg_logits, cls_logits

In [None]:
import torch.nn.functional as F

def list_ce_loss(logits, labels):
    """
    Listwise cross-entropy between two softmax distributions taken over dim 0.

    The labels are turned into a target distribution with softmax, the logits
    into log-probabilities with log_softmax, and the summed cross-entropy is
    returned as a scalar tensor.
    """
    target_probs = labels.softmax(dim=0)
    predicted_log_probs = logits.log_softmax(dim=0)
    return -(target_probs * predicted_log_probs).sum()

def rcr_loss_function(logits, reg_labels, alpha=0.3):
    """
    Compute the two regression-side loss terms on softplus-transformed logits.

    Returns a dict with:
      "mse"      - mean squared error between softplus(logits) and reg_labels
      "listwise" - list_ce_loss between softplus(logits) and reg_labels

    alpha is accepted for compatibility but not used: the terms are returned
    separately rather than blended as (1 - alpha) * mse + alpha * listwise.
    """
    scores = F.softplus(logits)

    components = {}
    components["mse"] = F.mse_loss(scores, reg_labels)
    components["listwise"] = list_ce_loss(scores, reg_labels)
    return components

def multitask_loss(reg_logits, cls_logits, reg_labels, cls_labels, x=1/3, alpha=0.5):
    """
    Compute the three per-task losses without combining them.

    x: weight for classification vs regression (currently unused here; the
       caller combines the terms itself)
    alpha: weight inside RCR loss (forwarded to rcr_loss_function)

    Returns a dict with the keys "mse", "listwise" and "ce".
    """
    regression_terms = rcr_loss_function(reg_logits, reg_labels, alpha)
    classification_term = F.cross_entropy(cls_logits, cls_labels)
    return dict(
        mse=regression_terms["mse"],
        listwise=regression_terms["listwise"],
        ce=classification_term,
    )

In [None]:
from torch.utils.data import DataLoader
import torch.nn.functional as F

# Flatten the per-query groups into pair-level datasets (inputs padded/truncated to 128 tokens).
train_dataset = ESCIDataset(tokenizer, prod_groups_train, label_groups_train, max_len=128)
test_dataset = ESCIDataset(tokenizer, prod_groups_test, label_groups_test, max_len=128)

# NOTE(review): with shuffle=True a batch of 8 pairs will generally mix several queries,
# while list_ce_loss normalises over the whole batch (dim 0) - confirm this is intended.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# NOTE(review): the evaluation cell visible in this notebook batches test_dataset.pairs
# directly and does not appear to use test_loader - confirm whether it is still needed.
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [None]:
from collections import OrderedDict
from copy import deepcopy

def get_grads(loss, model):
    """
    Flatten d(loss)/d(params) over every trainable parameter into one 1D tensor.

    Parameters that do not contribute to `loss` get a block of zeros, so the
    result always has one entry per trainable scalar, in model.parameters()
    order. The autograd graph is retained so the same forward pass can be
    differentiated again.
    """
    trainable = [param for param in model.parameters() if param.requires_grad]

    raw_grads = torch.autograd.grad(
        outputs=loss,
        inputs=trainable,
        retain_graph=True,
        create_graph=False,
        allow_unused=True,
    )

    # allow_unused=True yields None for parameters outside the graph of `loss`.
    pieces = [
        torch.zeros_like(param).view(-1) if grad is None else grad.contiguous().view(-1)
        for param, grad in zip(trainable, raw_grads)
    ]
    return torch.cat(pieces)

In [None]:
import torch.optim as optim

def solve_nash_weights(task_grads, lr=8e-6, weight_decay=0.01, steps=20):
  """
  Approximate Nash-bargaining task weights from per-task gradient vectors.

  task_grads: list of grad vectors for each task (all 1D, same length and device)
  lr, weight_decay: AdamW settings for the inner optimisation over the weights
  steps: number of inner AdamW steps

  Minimises  f(w) = 0.5 * ||sum_i w_i * g_i||^2 - sum_i log(w_i),
  whose stationary point satisfies (G G^T) w = 1 / w. After every step the
  weights are clamped to >= 1e-4 and rescaled to sum to 1.

  Returns a detached tensor of T weights. With steps=0 the initial all-ones
  vector is returned unchanged.

  NOTE(review): with the default lr=8e-6 and 20 steps the weights can move only
  slightly from the uniform start, so the result is close to 1/T per task -
  confirm whether a larger lr is wanted by callers.
  """

  T = len(task_grads)
  G = torch.stack(task_grads)

  # ||w @ G||^2 == w @ (G G^T) @ w, so the small T x T Gram matrix is built once
  # instead of multiplying by the full T x P gradient matrix on every step.
  gram = torch.matmul(G, G.t()).detach()

  w = torch.ones(T, device=G.device, dtype=G.dtype, requires_grad=True)

  optimizer = optim.AdamW([w], lr=lr, weight_decay=weight_decay)

  for _ in range(steps):
    optimizer.zero_grad()

    grad_norm_sq = torch.matmul(w, torch.matmul(gram, w))

    # The norm term is *added* (penalised) and counted once; only the log
    # barrier carries the minus sign. The previous form negated the norm too,
    # which drove the weights towards the largest aggregated gradient.
    loss = 0.5 * grad_norm_sq - torch.sum(torch.log(w + 1e-8))

    loss.backward()
    optimizer.step()

    with torch.no_grad():
      # Keep weights strictly positive (log barrier) and on the simplex.
      w.clamp_(min=1e-4)
      w /= w.sum()

  return w.detach()

In [None]:
import torch
from tqdm import tqdm
from transformers import get_scheduler

# Train the multitask cross-encoder; the three task losses are combined with
# weights that are re-solved every 5th step by solve_nash_weights.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CrossEncoder(encoder).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=8e-6, weight_decay=0.01)
num_epochs = 1
# NOTE(review): alpha is assigned here but never passed to multitask_loss below,
# so that call runs with its own default - confirm whether this is intended.
alpha = 0.5
nash_steps = 20  # inner optimisation steps used each time the task weights are re-solved

# One scheduler step per batch: linear schedule with 100 warmup steps.
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=num_training_steps
)

global_step = 0
model.train()
progress_bar = tqdm(total=num_training_steps, desc="training", ncols=100)

for epoch in range(num_epochs):
  for batch in train_loader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    reg_labels = batch["reg_label"].to(device)
    cls_labels = batch["cls_label"].to(device)

    optimizer.zero_grad()

    reg_logits, cls_logits = model(input_ids, attention_mask)
    # Dict with the separate "mse", "listwise" and "ce" loss tensors.
    losses = multitask_loss(reg_logits, cls_logits, reg_labels, cls_labels)

    # Task weights are refreshed on every 5th step and reused in between;
    # global_step starts at 0, so `weights` exists before its first use.
    if global_step % 5 == 0:
      grads = []
      for task_loss in [losses["mse"], losses["listwise"], losses["ce"]]:
        # get_grads retains the graph, so total_loss.backward() below still works.
        grads.append(get_grads(task_loss, model))

      weights = solve_nash_weights(grads, steps=nash_steps)

    # Weight order matches the task order used above: mse, listwise, ce.
    total_loss = weights[0] * losses["mse"] + weights[1] * losses["listwise"] + weights[2] * losses["ce"]

    total_loss.backward()
    # Clip the global gradient norm to 1.0 before the optimizer update.
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    lr_scheduler.step()

    progress_bar.set_postfix(loss=f"{total_loss.item():.4f}", rcr_mse=losses["mse"].item(), rcr_listwise=losses["listwise"].item(), ce=losses["ce"].item(), step=global_step)
    progress_bar.update(1)
    global_step += 1

progress_bar.close()

training:   0%| | 6/16994 [00:09<7:14:04,  1.53s/it, ce=1.21, loss=1.1425, rcr_listwise=2.08, rcr_ms
training:  50%|▌| 8545/16994 [1:25:15<1:01:51,  2.28it/s, ce=1.19, loss=1.1165, rcr_listwise=2.05, r

In [None]:
from sklearn.metrics import ndcg_score
from collections import defaultdict
import torch.nn.functional as F
import torch
from tqdm import tqdm

# Evaluate on the test pairs: per-query NDCG@10 from the regression head and
# overall accuracy from the classification head.
model.eval()

query_to_scores = defaultdict(list)
query_to_labels = defaultdict(list)

all_cls_preds = []
all_cls_trues = []

test_pairs = test_dataset.pairs
reg_labels = test_dataset.reg_labels
cls_labels = test_dataset.cls_labels

batch_size = 16
with torch.no_grad():
    for i in tqdm(range(0, len(test_pairs), batch_size), desc="Evaluating"):
        batch_pairs = test_pairs[i:i+batch_size]
        batch_reg_labels = reg_labels[i:i+batch_size]
        batch_cls_labels = cls_labels[i:i+batch_size]

        queries = [q for q, _ in batch_pairs]
        products = [p for _, p in batch_pairs]

        # Same tokenisation settings as ESCIDataset uses for training.
        encoded = tokenizer(
            queries,
            products,
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )

        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        reg_logits, cls_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Softplus mirrors the transform applied to the regression output in the loss.
        reg_scores = F.softplus(reg_logits).cpu().tolist()
        # Softmax is monotonic, so the argmax can be taken on the logits directly.
        cls_preds = torch.argmax(cls_logits, dim=-1).cpu().tolist()
        cls_trues = batch_cls_labels

        for q, s, l in zip(queries, reg_scores, batch_reg_labels):
            query_to_scores[q].append(s)
            query_to_labels[q].append(l)

        all_cls_preds.extend(cls_preds)
        all_cls_trues.extend(cls_trues)

ndcg_total = 0
qualifiable_count = 0
# (query, error message) for queries whose NDCG raised; kept so they are not dropped silently.
skipped_queries = []

for q in query_to_labels:
    labels = query_to_labels[q]
    scores = query_to_scores[q]
    # NDCG needs more than one candidate and at least one non-zero relevance label.
    if len(labels) > 1 and sum(labels) > 0:
        try:
            ndcg = ndcg_score([labels], [scores], k=10)
        except ValueError as err:
            skipped_queries.append((q, str(err)))
            continue
        ndcg_total += ndcg
        qualifiable_count += 1

avg_ndcg_10 = ndcg_total / qualifiable_count if qualifiable_count > 0 else 0

from sklearn.metrics import accuracy_score
accuracy = accuracy_score(all_cls_trues, all_cls_preds)

print(f"Average NDCG@10 (for {qualifiable_count} qualifiable queries): {avg_ndcg_10:.4f}")
print(f"Classification Accuracy: {accuracy:.4f}")
if skipped_queries:
    # Surface queries excluded from the average instead of hiding them.
    print(f"Skipped {len(skipped_queries)} queries whose NDCG could not be computed; first error: {skipped_queries[0][1]}")