In [23]:
# Install required libraries
!pip install transformers datasets torch sentencepiece


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [25]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import load_dataset
from torch.utils.data import Dataset

In [26]:
# Load PAWS dataset
dataset = load_dataset("paws", "labeled_final")


In [27]:
# Define a custom PyTorch dataset class
class PAWSDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sent1 = self.data[idx]["sentence1"]
        sent2 = self.data[idx]["sentence2"]
        label = self.data[idx]["label"]  # 1 if paraphrase, 0 otherwise

        if label == 1:  # Only train on paraphrase pairs
            input_text = f"paraphrase: {sent1}"
            target_text = sent2

            inputs = self.tokenizer(input_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")
            targets = self.tokenizer(target_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt")

            return {
                "input_ids": inputs["input_ids"].squeeze(),
                "attention_mask": inputs["attention_mask"].squeeze(),
                "labels": targets["input_ids"].squeeze()
            }
        else:
            return self.__getitem__((idx + 1) % len(self.data))  # Skip non-paraphrase pairs

In [28]:
# Initialize tokenizer and model
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Prepare datasets
train_data = PAWSDataset(dataset["train"], tokenizer)
val_data = PAWSDataset(dataset["validation"], tokenizer)

def collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.stack([item["labels"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [41]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=3e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=100,
    report_to="none",  # 👈 This disables wandb
)




In [42]:
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable Weights & Biases

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=collate_fn,  # Removed tokenizer=tokenizer
)



In [43]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.1297,0.121968
2,0.1196,0.119269
3,0.1137,0.118334


TrainOutput(global_step=18528, training_loss=0.12659422156363984, metrics={'train_runtime': 3169.3251, 'train_samples_per_second': 46.762, 'train_steps_per_second': 5.846, 'total_flos': 5014515250888704.0, 'train_loss': 0.12659422156363984, 'epoch': 3.0})

In [44]:
model.save_pretrained("./t5_paws_paraphrase")
tokenizer.save_pretrained("./t5_paws_paraphrase")

('./t5_paws_paraphrase/tokenizer_config.json',
 './t5_paws_paraphrase/special_tokens_map.json',
 './t5_paws_paraphrase/spiece.model',
 './t5_paws_paraphrase/added_tokens.json')

In [49]:
print(val_data[0])


{'input_ids': tensor([ 3856, 27111,    10,   328,   130,   132,    12,   777,   178,    11,
           79,   130,   132,    12,  8844,    21,   178,     3,     5,     1,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0, 

In [50]:
from transformers import AutoTokenizer

# Load the tokenizer (use the same model as your dataset)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # Change to your model

# Decode tokenized input into text
val_texts = [tokenizer.decode(sample["input_ids"], skip_special_tokens=True) for sample in val_data]
val_labels = [tokenizer.decode(sample["labels"], skip_special_tokens=True) for sample in val_data]

# Print a sample
print("Original Sentence:", val_texts[0])
print("Reference Sentence:", val_labels[0])


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Original Sentence: picked eucalyptus [unused9] [unused323] [unused125] [unused127] [unused11] [unused772] [unused173] [unused10] [unused78] [unused125] [unused127] [unused11]ator [unused20] [unused173] [unused2] [unused4] [unused0]
Reference Sentence: [unused323] [unused125] [unused127] [unused20] [unused173] [unused11] [unused772] [unused10] [unused78] [unused125] [unused127] [unused20] [unused173] [unused11]ator [unused2] [unused4] [unused0]


In [54]:
pip install sacrebleu


Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [57]:
import evaluate

# Load BLEU metric (for translation tasks)
bleu_metric = evaluate.load("sacrebleu")

# Load Accuracy metric (for classification tasks)
accuracy_metric = evaluate.load("accuracy")

# Example Predictions and References
predictions = ["This is a test sentence"]
references = [["This is a test sentence"]]

# Compute BLEU score
bleu_score = bleu_metric.compute(predictions=predictions, references=references)
print("BLEU Score:", bleu_score)

# Example for accuracy calculation
pred_labels = [0, 1, 1, 0, 1]  # Model's predicted labels
true_labels = [0, 1, 0, 0, 1]  # Ground truth labels

# Compute Accuracy
accuracy_score = accuracy_metric.compute(predictions=pred_labels, references=true_labels)
print("Accuracy Score:", accuracy_score)


BLEU Score: {'score': 100.00000000000004, 'counts': [5, 4, 3, 2], 'totals': [5, 4, 3, 2], 'precisions': [100.0, 100.0, 100.0, 100.0], 'bp': 1.0, 'sys_len': 5, 'ref_len': 5}
Accuracy Score: {'accuracy': 0.8}
