# Imports

In [15]:
import json
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

from tqdm import tqdm

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [16]:
# Choose the compute device first (GPU when available), then load the
# pretrained checkpoint and its tokenizer and move the model onto that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "t5-base"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Kept as the last expression so the cell still displays the module tree.
model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [17]:
# Paths of the dev split: the examples (one JSON object per line) and the
# matching labels file (one label per line).
dev_jsonl, dev_labels = (
    "alphanli-train-dev/dev.jsonl",
    "alphanli-train-dev/dev-labels.lst",
)

In [18]:
def load_data(jsonl_file, labels_file):
    """Load the examples and pair each one with its ground-truth label.

    Blank lines (for example a trailing newline at the end of either file)
    are ignored, so they neither crash parsing nor shift the example/label
    alignment.

    Args:
        jsonl_file: Path to a JSON-lines file. Every non-blank line must be a
            JSON object containing the keys "obs1", "obs2", "hyp1" and "hyp2".
        labels_file: Path to a text file with one integer label per non-blank
            line, in the same order as the examples (expected to be 1 or 2).

    Returns:
        A list of dicts with the keys "obs1", "obs2", "hyp1", "hyp2" and
        "label", one per example, in file order.

    Raises:
        ValueError: If the number of examples and labels differ, or a label
            line is not an integer.
        KeyError: If an example is missing one of the required keys.
    """
    # Read both files as UTF-8 explicitly instead of the platform default.
    with open(labels_file, "r", encoding="utf-8") as f_labels:
        labels = [int(line) for line in f_labels if line.strip()]
    with open(jsonl_file, "r", encoding="utf-8") as f_json:
        entries = [json.loads(line) for line in f_json if line.strip()]

    # Fail loudly on misaligned files rather than silently dropping labels
    # or raising an IndexError halfway through the examples.
    if len(entries) != len(labels):
        raise ValueError(
            f"Example/label count mismatch: {len(entries)} examples in "
            f"{jsonl_file!r} but {len(labels)} labels in {labels_file!r}"
        )

    data = []
    for entry, label in zip(entries, labels):
        data.append({
            "obs1": entry["obs1"],
            "obs2": entry["obs2"],
            "hyp1": entry["hyp1"],
            "hyp2": entry["hyp2"],
            "label": label,  # Ground-truth label (1 or 2)
        })
    return data

# Materialise the whole dev split in memory as a list of example dicts.
dev_data = load_data(jsonl_file=dev_jsonl, labels_file=dev_labels)

In [19]:
def format_input(entry):
    """Render one example as the text prompt given to the model.

    Args:
        entry: Mapping with the keys "obs1", "obs2", "hyp1" and "hyp2".

    Returns:
        A single-line prompt listing each field as "name: value", followed by
        the question asking which hypothesis is more plausible.
    """
    field_names = ("obs1", "obs2", "hyp1", "hyp2")
    labelled_fields = [f"{name}: {entry[name]}" for name in field_names]
    question = "Which hypothesis is more plausible?"
    return " ".join(labelled_fields + [question])

# Run zero-shot inference

In [20]:
def run_inference(model, tokenizer, dataset):
    """Predict, for every example, which hypothesis the model prefers.

    Generation is limited to a single new token, and every token id other
    than the first token of "1" and of "2" is banned, so the model has to
    answer with one of those two tokens.

    Args:
        model: Seq2seq generation model exposing `eval()`, `generate()`,
            `device` and `config.vocab_size`.
        tokenizer: Tokenizer matching `model`.
        dataset: Iterable of example dicts accepted by `format_input`, each
            also carrying a "label" entry.

    Returns:
        A `(predictions, ground_truths)` tuple of equally long lists.
        Predictions are 1 or 2; ground truths are the "label" values of the
        examples, in dataset order.

    Raises:
        ValueError: If "1" and "2" start with the same token id, which would
            make the two answers indistinguishable.
    """
    model.eval()
    predictions = []
    ground_truths = []

    # Send inputs to wherever the given model lives instead of relying on a
    # notebook-level `device` variable.
    model_device = model.device

    token_1 = tokenizer.encode("1", add_special_tokens=False)[0]
    token_2 = tokenizer.encode("2", add_special_tokens=False)[0]
    if token_1 == token_2:
        raise ValueError(
            'Tokenizer maps "1" and "2" to the same first token id; '
            "cannot constrain generation to the two answers."
        )

    allowed_tokens = {token_1, token_2}

    # Loop-invariant, so build the ban list once rather than once per example.
    # It is sized by the model's output vocabulary: the tokenizer's vocab_size
    # can be smaller than the number of logits the model produces, which would
    # leave the extra ids unbanned.
    bad_words_ids = [
        [token_id]
        for token_id in range(model.config.vocab_size)
        if token_id not in allowed_tokens
    ]  # Block all except "1" and "2"

    for entry in tqdm(dataset):
        input_text = format_input(entry)
        inputs = tokenizer(input_text, return_tensors="pt").to(model_device)

        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_length=2,  # decoder start token + one generated token
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                bad_words_ids=bad_words_ids,
            )

        pred = tokenizer.decode(output[0], skip_special_tokens=True).strip()

        # Only "1" or "2" can be generated, so anything that is not "1" is "2".
        predicted_label = 1 if pred == "1" else 2

        predictions.append(predicted_label)
        ground_truths.append(entry["label"])

    return predictions, ground_truths

In [21]:
# Run the model over the whole dev split; yields parallel lists of
# predicted and ground-truth labels.
preds, labels = run_inference(model=model, tokenizer=tokenizer, dataset=dev_data)

100%|███████████████████████████████████████| 1532/1532 [02:54<00:00,  8.79it/s]


In [22]:
# Score the predictions against the ground-truth labels. With
# average="binary" and no explicit pos_label, scikit-learn reports
# precision/recall/F1 for its default positive class (label 1).
prf_scores = precision_recall_fscore_support(labels, preds, average="binary")
precision, recall, f1, _ = prf_scores
accuracy = accuracy_score(labels, preds)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")

Accuracy: 0.4935
Precision: 0.5031
Recall: 0.5262
F1-score: 0.5144
