In [6]:
from re import escape
import pandas as pd
import csv

# Load the train and test CSVs; both paths are relative to the notebook's
# working directory.
processed_train_df, processed_test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('filtered_train.csv', 'filtered_test.csv')
)

In [7]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that printed a stray "None" after the
# training summary).
processed_train_df.info()
print('\n')
print('=' * 65)
print('\n')
processed_test_df.info()
print('\n')
print('=' * 65)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75000 entries, 0 to 74999
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   query          75000 non-null  object 
 1   product_input  75000 non-null  object 
 2   esci_label     75000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 1.7+ MB
None




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   query          25000 non-null  object 
 1   product_input  25000 non-null  object 
 2   esci_label     25000 non-null  float64
dtypes: float64(1), object(2)
memory usage: 586.1+ KB




In [8]:
# Number of training rows per distinct esci_label value.
processed_train_df.loc[:, "esci_label"].value_counts()

Unnamed: 0_level_0,count
esci_label,Unnamed: 1_level_1
1.0,41037
0.1,18511
0.0,13866
0.01,1586


In [9]:
# How many distinct training queries appear on more than one row.
query_counts = processed_train_df["query"].value_counts()
qualifiable_queries = query_counts.gt(1).sum()
qualifiable_queries

np.int64(6606)

In [10]:
from transformers import AutoModel, AutoTokenizer

# Checkpoint identifier shared by the tokenizer and the encoder below.
model_name = "microsoft/deberta-v3-base"
# Both objects are loaded from the same checkpoint so the vocabulary matches
# the encoder; the recorded cell output shows the files being downloaded.
tokenizer = AutoTokenizer.from_pretrained(model_name)
encoder = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/371M [00:00<?, ?B/s]

In [11]:
from collections import defaultdict

# Per-query containers, one pair for each split:
#   prod_groups_*  : query -> list of product texts
#   label_groups_* : query -> list of esci_label values
prod_groups_train, prod_groups_test = defaultdict(list), defaultdict(list)
label_groups_train, label_groups_test = defaultdict(list), defaultdict(list)

def get_dicts(df, prod_groups, label_groups):
  """Group product texts and relevance labels by query, preserving row order.

  Args:
    df: DataFrame with the columns "query", "product_input" and "esci_label".
    prod_groups: mapping updated in place; query -> list of product texts.
    label_groups: mapping updated in place; query -> list of labels as
      Python floats, aligned index-for-index with ``prod_groups[query]``.

  Returns:
    None. Both mappings are mutated in place, so calling this twice with the
    same frame and mappings appends every row a second time.
  """
  # Iterating the three columns directly avoids DataFrame.iterrows(), which
  # builds a Series object for every row and is very slow on large frames.
  rows = zip(df["query"], df["product_input"], df["esci_label"])
  for query, product, relevance in rows:
    # setdefault works for both defaultdict and plain dict arguments.
    prod_groups.setdefault(query, []).append(product)
    label_groups.setdefault(query, []).append(float(relevance))

# Fill the per-query containers for the train split first, then the test split.
for split_df, split_products, split_labels in (
    (processed_train_df, prod_groups_train, label_groups_train),
    (processed_test_df, prod_groups_test, label_groups_test),
):
    get_dicts(split_df, split_products, split_labels)

In [12]:
import torch
from torch.utils.data import Dataset

class ESCIDataset(Dataset):
    """Flat dataset of (query, product) pairs with regression and class targets.

    Args:
        tokenizer: callable invoked as ``tokenizer(query, product, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` and returning
            a mapping of tensors with a leading batch dimension of size 1.
        prod_groups: mapping query -> list of product texts.
        label_groups: mapping query -> list of relevance scores, aligned
            index-for-index with ``prod_groups[query]``.
        max_len: sequence length every pair is padded/truncated to.

    Attributes:
        pairs: list of ``(query, product)`` tuples in iteration order.
        reg_labels: relevance score of each pair (regression target).
        cls_labels: class index of each pair (classification target).

    Raises:
        KeyError: if a label does not correspond to any known relevance score.
    """

    ## Labels are 0.0(I), 0.01(C), 0.1(S) and 1.0(E),
    ## Models would prefer to promote with labels 1.0 and 0.1
    ## over 0.01 and 0.0
    SCORE_TO_INDEX = {0.0: 0, 0.01: 1, 0.1: 2, 1.0: 3}

    # Labels come from parsed text/float arithmetic, so a value may differ from
    # the literal keys above by a few ULPs; anything within this absolute
    # tolerance is treated as the same score.
    LABEL_TOLERANCE = 1e-6

    def __init__(self, tokenizer, prod_groups, label_groups, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.pairs = []
        self.reg_labels = []
        self.cls_labels = []

        for query in prod_groups:
            product_info = prod_groups[query]
            labels = label_groups[query]

            for idx, label in enumerate(labels):
                self.pairs.append((query, product_info[idx]))
                self.reg_labels.append(label)
                self.cls_labels.append(self._class_index(label))

    @classmethod
    def _class_index(cls, label):
        """Map a relevance score to its class index, tolerating float noise."""
        try:
            # Fast path: exact match against the literal scores.
            return cls.SCORE_TO_INDEX[label]
        except (KeyError, TypeError):
            pass

        try:
            value = float(label)
        except (TypeError, ValueError):
            value = None

        if value is not None:
            for score, index in cls.SCORE_TO_INDEX.items():
                if abs(value - score) <= cls.LABEL_TOLERANCE:
                    return index

        raise KeyError(
            f"Unsupported relevance label {label!r}; "
            f"expected one of {sorted(cls.SCORE_TO_INDEX)}"
        )

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        query, product = self.pairs[idx]
        reg_label = self.reg_labels[idx]
        cls_label = self.cls_labels[idx]

        encoded = self.tokenizer(
            query,
            product,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        # The tokenizer returns tensors with a batch dimension of 1; drop it so
        # the DataLoader's default collation can stack the items.
        encoded = {k: v.squeeze(0) for k, v in encoded.items()}
        encoded["reg_label"] = torch.tensor(reg_label, dtype=torch.float)
        encoded["cls_label"] = torch.tensor(cls_label, dtype=torch.long)

        return encoded

In [13]:
import torch.nn as nn
class CrossEncoder(nn.Module):
    """Shared encoder with two linear heads on the first token's hidden state.

    Args:
        encoder: module exposing ``config.hidden_size`` and returning an object
            with a ``last_hidden_state`` attribute when called with
            ``input_ids`` and ``attention_mask``.
    """

    def __init__(self, encoder):
        super().__init__()
        self.encoder = encoder
        width = encoder.config.hidden_size
        self.reg_head = nn.Linear(width, 1)   # scalar relevance score
        self.cls_head = nn.Linear(width, 4)   # logits over the 4 label classes

    def forward(self, input_ids, attention_mask):
        """Return ``(reg_logits, cls_logits)`` for a batch of encoded pairs.

        ``reg_logits`` has its trailing size-1 dimension squeezed away;
        ``cls_logits`` keeps one column per class.
        """
        hidden_states = self.encoder(
            input_ids=input_ids, attention_mask=attention_mask
        ).last_hidden_state
        # Use the representation at sequence position 0 as the pair embedding.
        first_token = hidden_states[:, 0]
        regression_out = self.reg_head(first_token).squeeze(-1)
        classification_out = self.cls_head(first_token)
        return regression_out, classification_out

In [14]:
import torch.nn.functional as F

def list_ce_loss(logits, labels):
    """Cross-entropy between softmax(labels) and softmax(logits) along dim 0.

    Both tensors are normalised over their first dimension, so every entry
    along dim 0 is treated as one member of the same list. The result is the
    summed (not averaged) cross-entropy as a scalar tensor.
    """
    target_dist = labels.softmax(dim=0)
    predicted_log_dist = logits.log_softmax(dim=0)
    return -(target_dist * predicted_log_dist).sum()

def rcr_loss_function(logits, reg_labels, alpha=0.3):
    """Blend a pointwise MSE term with the listwise cross-entropy term.

    The raw logits are passed through softplus first, so both terms see
    strictly positive scores.

    Args:
        logits: raw regression outputs.
        reg_labels: target relevance scores, same shape as ``logits``.
        alpha: weight of the listwise term; the MSE term gets ``1 - alpha``.

    Returns:
        Scalar tensor ``(1 - alpha) * MSE + alpha * ListCE``.
    """
    scores = F.softplus(logits)
    pointwise_term = F.mse_loss(scores, reg_labels)
    listwise_term = list_ce_loss(scores, reg_labels)
    return alpha * listwise_term + (1 - alpha) * pointwise_term

def multitask_loss(reg_logits, cls_logits, reg_labels, cls_labels, x=1/3, alpha=0.5):
    """Combine the regression/ranking loss with the classification loss.

    Args:
        reg_logits: raw outputs of the regression head.
        cls_logits: class logits of the classification head.
        reg_labels: target relevance scores for the regression head.
        cls_labels: target class indices for the classification head.
        x: weight of the classification term; the RCR term gets ``1 - x``.
        alpha: forwarded to ``rcr_loss_function`` as its listwise weight.
            The default here (0.5) is not the same as that function's own
            default.

    Returns:
        Scalar tensor ``(1 - x) * RCR + x * CrossEntropy``.
    """
    ranking_term = rcr_loss_function(reg_logits, reg_labels, alpha)
    classification_term = F.cross_entropy(cls_logits, cls_labels)
    return x * classification_term + (1 - x) * ranking_term

In [16]:
from torch.utils.data import DataLoader
import torch.nn.functional as F

# Wrap the grouped query/product data as datasets; both splits use the same
# sequence length.
train_dataset = ESCIDataset(
    tokenizer=tokenizer,
    prod_groups=prod_groups_train,
    label_groups=label_groups_train,
    max_len=128,
)
test_dataset = ESCIDataset(
    tokenizer=tokenizer,
    prod_groups=prod_groups_test,
    label_groups=label_groups_test,
    max_len=128,
)

# Only the training loader shuffles; the test loader keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [17]:
import torch
from tqdm import tqdm
from transformers import get_scheduler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE: CrossEncoder keeps a reference to `encoder` rather than a copy, so
# re-running this cell without reloading `encoder` continues training from the
# already-updated weights.
model = CrossEncoder(encoder).to(device)

num_epochs = 1
alpha = 0.5    # Inner RCR: (1-alpha)*MSE + alpha*ListCE
x = 0.33       # Outer MTL: (1-x)*RCR + x*CrossEntropy

optimizer = torch.optim.AdamW(model.parameters(), lr=8e-6, weight_decay=0.01)

# One scheduler step per optimizer step, over every batch of every epoch.
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=100,
    num_training_steps=num_training_steps,
)

losses = []
global_step = 0

model.train()
progress_bar = tqdm(total=num_training_steps, desc="Training", ncols=100)

for epoch in range(num_epochs):
    for batch in train_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        reg_labels = batch["reg_label"].to(device)
        cls_labels = batch["cls_label"].to(device)

        optimizer.zero_grad()

        reg_logits, cls_logits = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = multitask_loss(reg_logits, cls_logits, reg_labels, cls_labels, x=x, alpha=alpha)

        loss.backward()
        # Clip the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        lr_scheduler.step()

        # Read the scalar loss once and reuse it for logging.
        step_loss = loss.item()
        losses.append(step_loss)
        global_step += 1

        progress_bar.set_postfix(loss=f"{step_loss:.4f}", step=global_step)
        progress_bar.update(1)

progress_bar.close()

Training: 100%|█████████████████████████| 4688/4688 [39:16<00:00,  1.99it/s, loss=0.9178, step=4688]


In [18]:
from sklearn.metrics import ndcg_score
from collections import defaultdict
import torch.nn.functional as F
import torch
from tqdm import tqdm

from sklearn.metrics import accuracy_score

# Evaluation: score every test (query, product) pair, then report the mean
# per-query NDCG@10 of the regression head and the accuracy of the
# classification head.
model.eval()

query_to_scores = defaultdict(list)
query_to_labels = defaultdict(list)

all_cls_preds = []
all_cls_trues = []

test_pairs = test_dataset.pairs
reg_labels = test_dataset.reg_labels
cls_labels = test_dataset.cls_labels

batch_size = 16
with torch.no_grad():
    for i in tqdm(range(0, len(test_pairs), batch_size), desc="Evaluating"):
        batch_pairs = test_pairs[i:i+batch_size]
        batch_reg_labels = reg_labels[i:i+batch_size]
        batch_cls_labels = cls_labels[i:i+batch_size]

        queries = [q for q, _ in batch_pairs]
        products = [p for _, p in batch_pairs]

        # Tokenize with the dataset's own sequence length so evaluation
        # cannot drift from the configuration the dataset was built with.
        encoded = tokenizer(
            queries,
            products,
            padding="max_length",
            truncation=True,
            max_length=test_dataset.max_len,
            return_tensors="pt"
        )

        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)

        reg_logits, cls_logits = model(input_ids=input_ids, attention_mask=attention_mask)

        # Same softplus transform the training loss applies to the scores.
        reg_scores = F.softplus(reg_logits).cpu().tolist()
        # argmax over the raw logits; softmax is monotonic, so applying it
        # first would not change the predicted class.
        cls_preds = torch.argmax(cls_logits, dim=-1).cpu().tolist()
        cls_trues = batch_cls_labels

        for q, score, label in zip(queries, reg_scores, batch_reg_labels):
            query_to_scores[q].append(score)
            query_to_labels[q].append(label)

        all_cls_preds.extend(cls_preds)
        all_cls_trues.extend(cls_trues)

ndcg_total = 0
qualifiable_count = 0
skipped_queries = []  # (query, reason) for queries ndcg_score rejected

for q in query_to_labels:
    labels = query_to_labels[q]
    scores = query_to_scores[q]
    # NDCG needs at least two candidates and some non-zero relevance.
    if len(labels) > 1 and sum(labels) > 0:
        try:
            ndcg = ndcg_score([labels], [scores], k=10)
        except ValueError as err:
            # Keep evaluating, but record the failure instead of hiding it.
            skipped_queries.append((q, str(err)))
            continue
        ndcg_total += ndcg
        qualifiable_count += 1

avg_ndcg_10 = ndcg_total / qualifiable_count if qualifiable_count > 0 else 0

accuracy = accuracy_score(all_cls_trues, all_cls_preds)

if skipped_queries:
    print(f"Warning: {len(skipped_queries)} queries were skipped because ndcg_score raised ValueError "
          f"(first: {skipped_queries[0]!r})")

print(f"Average NDCG@10 (for {qualifiable_count} qualifiable queries): {avg_ndcg_10:.4f}")
print(f"Classification Accuracy: {accuracy:.4f}")

Evaluating: 100%|██████████| 1563/1563 [04:16<00:00,  6.10it/s]


Average NDCG@10 (for 2438 qualifiable queries): 0.9006
Classification Accuracy: 0.6870


In [19]:
import pandas as pd

# One row per test (query, product) pair, in dataset order.
test_df = pd.DataFrame({
    "query": [pair[0] for pair in test_dataset.pairs],
    "reg_label": test_dataset.reg_labels,
    "cls_label": test_dataset.cls_labels,
})

# A query qualifies when it has more than one candidate and its relevance
# scores sum to something positive.
grouped = test_df.groupby("query")["reg_label"]
multi_item_queries = grouped.count().gt(1)
has_relevant = grouped.sum().gt(0)
qualifiable_queries = multi_item_queries & has_relevant
num_qualifiable = qualifiable_queries.sum()

print(f"Number of qualifiable queries for NDCG@10: {num_qualifiable}")

Number of qualifiable queries for NDCG@10: 2438


In [20]:
# Colab-only cell: the import fails outside Google Colab.
# NOTE(review): presumably this releases the Colab runtime so it stops
# consuming resources once the notebook has finished; confirm against the
# Colab documentation before relying on it.
from google.colab import runtime
runtime.unassign()