In [1]:
# Load the dataset
import pandas as pd

## Data source: the PAWS dataset.
## This particular subset contains English sentence pairs, each labelled either
## 1 (same meaning) or 0 (different meaning).
## Ideally other datasets should have been used as well, such as PAWS-de and PAWS-fr
## for German and French support, along with cross-lingual datasets. I did look into
## them, but given the limited time and the model training time I stuck to a simple
## implementation.
## The remaining datasets can easily be incorporated accordingly.

# Relative parquet paths of each split inside the dataset repository.
splits = dict(
    train='en/train-00000-of-00001.parquet',
    test='en/test-00000-of-00001.parquet',
    validation='en/validation-00000-of-00001.parquet',
)

# Only the training split is read here.
train_path = "hf://datasets/google-research-datasets/paws-x/" + splits["train"]
df = pd.read_parquet(train_path)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pathlib import Path

from sentence_transformers import SentenceTransformer
import numpy as np

# Encoding is by far the slowest step of this notebook: the recorded run was
# interrupted after ~45 minutes with the first `encode` call only 80% done.
# The raw embeddings are therefore cached on disk so that an interrupted or
# restarted kernel does not have to pay that cost again.
EMBEDDING_CACHE = Path("paws_en_train_labse_embeddings.npz")

# NOTE: `model` is the sentence encoder here; the training cell further down
# rebinds the same name to the classifier.
model = SentenceTransformer('sentence-transformers/LaBSE')

sentences1 = df['sentence1'].tolist()
sentences2 = df['sentence2'].tolist()

emb1 = emb2 = None
if EMBEDDING_CACHE.exists():
    with np.load(EMBEDDING_CACHE) as cached:
        cached_emb1, cached_emb2 = cached["emb1"], cached["emb2"]
    # Only trust the cache when it has one row per sentence of the current
    # dataframe; otherwise fall through and re-encode.
    if len(cached_emb1) == len(sentences1) and len(cached_emb2) == len(sentences2):
        emb1, emb2 = cached_emb1, cached_emb2

if emb1 is None:
    emb1 = model.encode(sentences1, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
    emb2 = model.encode(sentences2, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
    # Written only after both encodes finished, so a KeyboardInterrupt never
    # leaves a partial cache behind.
    np.savez(EMBEDDING_CACHE, emb1=emb1, emb2=emb2)

# Pair features: both embeddings, their absolute difference and their
# element-wise product, concatenated along the feature axis.
X = np.concatenate([emb1, emb2, np.abs(emb1 - emb2), emb1 * emb2], axis=1)
y = df['label'].values

assert X.shape[0] == y.shape[0], "feature rows and labels are out of sync"

Batches:  80%|███████▉  | 615/772 [45:28<11:36,  4.44s/it] 


KeyboardInterrupt: 

In [None]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader

# Hold out 20% of the pairs for validation; the split is stratified on the
# label and uses a fixed random_state.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


def _to_float_dataset(features, labels):
    """Wrap a feature/label array pair as a TensorDataset of float32 tensors."""
    return TensorDataset(
        torch.tensor(features, dtype=torch.float32),
        torch.tensor(labels, dtype=torch.float32),
    )


train_dataset = _to_float_dataset(X_train, y_train)
val_dataset = _to_float_dataset(X_val, y_val)

# Only the training loader shuffles; the validation loader keeps dataset order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, batch_size=64)

In [None]:
import torch.nn as nn

class SentencePairClassifier(nn.Module):
    """Feed-forward binary classifier over a sentence-pair feature vector.

    The network is a stack of ``Linear -> ReLU -> Dropout`` blocks followed by
    a single-unit ``Linear`` and a ``Sigmoid``, so ``forward`` returns values
    in [0, 1] with a trailing dimension of size 1.

    Args:
        input_dim: Size of the last dimension of the input features.
        hidden_dims: Widths of the hidden layers, in order. The default
            ``(128, 64)`` reproduces the original fixed architecture, including
            its ``state_dict`` keys (``net.0``, ``net.3``, ``net.6``).
        dropout: Dropout probability applied after every hidden ReLU.
    """

    def __init__(self, input_dim, hidden_dims=(128, 64), dropout=0.3):
        super().__init__()
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(dropout)]
            in_features = width
        # Output head: one unit squashed to [0, 1] by the sigmoid.
        layers += [nn.Linear(in_features, 1), nn.Sigmoid()]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the sigmoid output of the network for ``x``; last dim is 1."""
        return self.net(x)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import torch.optim as optim

# Seed torch's global RNG so weight initialisation (and the shuffle order of
# loaders that draw from the global RNG) is repeatable across runs.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Take the input width from the training features instead of hard-coding 3072:
# that number only holds while the four concatenated feature parts are each
# 768 wide, and would silently go stale if the encoder or features change.
model = SentencePairClassifier(input_dim=X_train.shape[1]).to(device)
# BCELoss expects probabilities, which matches the classifier's sigmoid output.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-4)

def evaluate(model, dataloader):
    """Score ``model`` on every batch of ``dataloader``.

    Puts the model in eval mode (it is not switched back here), runs it
    without gradient tracking on the module-level ``device``, and thresholds
    the predicted probabilities at 0.5.

    Args:
        model: Module whose output for a batch has a trailing dimension of
            size 1 holding probabilities.
        dataloader: Iterable of ``(features, labels)`` tensor batches.

    Returns:
        dict with keys ``"accuracy"``, ``"f1"`` and ``"roc_auc"``.
    """
    model.eval()
    y_true, y_pred, y_prob = [], [], []
    with torch.no_grad():
        for xb, yb in dataloader:
            xb, yb = xb.to(device), yb.to(device)
            # squeeze(-1), not squeeze(): a bare squeeze() turns a batch of
            # size 1 into a 0-d tensor, which cannot be iterated by extend().
            probs = model(xb).squeeze(-1)
            preds = (probs >= 0.5).int()
            y_true.extend(yb.cpu().tolist())
            y_pred.extend(preds.cpu().tolist())
            y_prob.extend(probs.cpu().tolist())

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_prob)
    }

# Training: 20 passes over the training loader, reporting validation metrics
# after each epoch.
for epoch in range(20):
    # evaluate() leaves the model in eval mode, so re-enable training mode
    # (dropout active) at the start of every epoch.
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        # squeeze(-1), not squeeze(): with a final batch of size 1 a bare
        # squeeze() yields a 0-d output that no longer matches the (1,)-shaped
        # target, and BCELoss rejects the size mismatch.
        output = model(xb).squeeze(-1)
        loss = criterion(output, yb)
        loss.backward()
        optimizer.step()

    val_metrics = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}: Val Acc={val_metrics['accuracy']:.4f}, "
          f"F1={val_metrics['f1']:.4f}, AUC={val_metrics['roc_auc']:.4f}")

# Persist the weights as they stand after the last epoch.
torch.save(model.state_dict(), "paws_classifier.pt")
