In [1]:
print('Hello World')

Hello World


In [2]:
# load the dataset
import pandas as pd

splits = {'train': 'en/train-00000-of-00001.parquet', 'test': 'en/test-00000-of-00001.parquet', 'validation': 'en/validation-00000-of-00001.parquet'}
df = pd.read_parquet("hf://datasets/google-research-datasets/paws-x/" + splits["train"])

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49401 entries, 0 to 49400
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         49401 non-null  int32 
 1   sentence1  49401 non-null  object
 2   sentence2  49401 non-null  object
 3   label      49401 non-null  int64 
dtypes: int32(1), int64(1), object(2)
memory usage: 1.3+ MB


In [6]:
from sentence_transformers import SentenceTransformer
import numpy as np

model = SentenceTransformer('sentence-transformers/LaBSE')

In [7]:
sentences1 = df['sentence1'].tolist()
sentences2 = df['sentence2'].tolist()

emb1 = model.encode(sentences1, batch_size=64, show_progress_bar=True, convert_to_numpy=True)
emb2 = model.encode(sentences2, batch_size=64, show_progress_bar=True, convert_to_numpy=True)

Batches: 100%|██████████| 772/772 [36:19<00:00,  2.82s/it] 
Batches: 100%|██████████| 772/772 [35:17<00:00,  2.74s/it]


In [8]:
X = np.concatenate([emb1, emb2, np.abs(emb1 - emb2), emb1 * emb2], axis=1)
y = df['label'].values

In [9]:
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader

In [10]:
# Train/Val Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert to PyTorch tensors
train_dataset = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                              torch.tensor(y_train, dtype=torch.float32))
val_dataset = TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                            torch.tensor(y_val, dtype=torch.float32))

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

In [11]:
import torch.nn as nn

class SentencePairClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.net(x)


In [13]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = SentencePairClassifier(input_dim=3072).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=2e-4)

def evaluate(model, dataloader):
    model.eval()
    y_true, y_pred, y_prob = [], [], []
    with torch.no_grad():
        for xb, yb in dataloader:
            xb, yb = xb.to(device), yb.to(device)
            probs = model(xb).squeeze()
            preds = (probs >= 0.5).int()
            y_true.extend(yb.cpu().numpy())
            y_pred.extend(preds.cpu().numpy())
            y_prob.extend(probs.cpu().numpy())

    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred),
        "roc_auc": roc_auc_score(y_true, y_prob)
    }

# Training
for epoch in range(50):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        output = model(xb).squeeze()
        loss = criterion(output, yb)
        loss.backward()
        optimizer.step()

    val_metrics = evaluate(model, val_loader)
    print(f"Epoch {epoch+1}: Val Acc={val_metrics['accuracy']:.4f}, "
          f"F1={val_metrics['f1']:.4f}, AUC={val_metrics['roc_auc']:.4f}")


Epoch 1: Val Acc=0.6071, F1=0.3500, AUC=0.6078
Epoch 2: Val Acc=0.6119, F1=0.4457, AUC=0.6135
Epoch 3: Val Acc=0.6063, F1=0.4549, AUC=0.6164
Epoch 4: Val Acc=0.6156, F1=0.4302, AUC=0.6227
Epoch 5: Val Acc=0.6198, F1=0.4320, AUC=0.6255
Epoch 6: Val Acc=0.6210, F1=0.4446, AUC=0.6271
Epoch 7: Val Acc=0.6274, F1=0.4288, AUC=0.6292
Epoch 8: Val Acc=0.6232, F1=0.4582, AUC=0.6280
Epoch 9: Val Acc=0.6191, F1=0.4659, AUC=0.6250
Epoch 10: Val Acc=0.6252, F1=0.4457, AUC=0.6249
Epoch 11: Val Acc=0.6273, F1=0.4354, AUC=0.6187
Epoch 12: Val Acc=0.6096, F1=0.4742, AUC=0.6140
Epoch 13: Val Acc=0.6230, F1=0.4561, AUC=0.6136
Epoch 14: Val Acc=0.6218, F1=0.4391, AUC=0.6085
Epoch 15: Val Acc=0.6228, F1=0.4538, AUC=0.6111
Epoch 16: Val Acc=0.6126, F1=0.4748, AUC=0.6094
Epoch 17: Val Acc=0.6100, F1=0.4728, AUC=0.6053
Epoch 18: Val Acc=0.6255, F1=0.4283, AUC=0.6028
Epoch 19: Val Acc=0.6171, F1=0.4564, AUC=0.6026
Epoch 20: Val Acc=0.6258, F1=0.4370, AUC=0.6004
Epoch 21: Val Acc=0.6147, F1=0.4697, AUC=0.6010
E

KeyboardInterrupt: 