In [26]:
!pip install transformers datasets torch scikit-learn
!pip install sentencepiece



In [27]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

In [None]:

# Load the MRPC dataset
dataset = load_dataset("glue", "mrpc")

# Inspect the structure of the dataset
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})


In [None]:
# View Sample Data
# first training example
print(dataset["train"][0])

# Display all keys of the dataset
print(dataset["train"].features)

{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', 'label': 1, 'idx': 0}
{'sentence1': Value(dtype='string', id=None), 'sentence2': Value(dtype='string', id=None), 'label': ClassLabel(names=['not_equivalent', 'equivalent'], id=None), 'idx': Value(dtype='int32', id=None)}


In [30]:
pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Initialize and load the T5 Tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [None]:

def preprocess_function(examples):

    inputs = ["paraphrase: " + s1 + " || " + s2 for s1, s2 in zip(examples["sentence1"], examples["sentence2"])]
    labels = ["equivalent" if label == 1 else "not equivalent" for label in examples["label"]]

    # Tokenize inputs and labels
    model_inputs = tokenizer(
        inputs, max_length=128, truncation=True, padding="max_length", return_tensors="pt"
    )

    # Tokenize labels using a single string and map to tokens
    label_inputs = tokenizer(
        labels, max_length=10, truncation=True, padding="max_length", return_tensors="pt"
    )

    # Add tokenized labels as tensors
    model_inputs["labels"] = label_inputs["input_ids"]

    # DEBUG: Inspect one example for validation
    if "DEBUG" in globals() and DEBUG:
        print("Example preprocessed input:", inputs[0])
        print("Tokenized input IDs:", model_inputs["input_ids"][0])
        print("Tokenized label IDs:", model_inputs["labels"][0])

    return model_inputs


In [None]:
tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Remove unnecessary columns (sentence1, sentence2, label)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence1", "sentence2", "label"])

# Extract train and validation splits
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]

# Inspect a tokenized sample
print(tokenized_datasets["train"][0])

Map:  27%|██▋       | 1000/3668 [00:00<00:00, 4638.31 examples/s]

Example preprocessed input: paraphrase: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence . || Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
Tokenized input IDs: tensor([ 3856, 27111,    10,   736,  9860,    23, 11970,   112,  4284,     3,
            6,  4068,     3,    88,   718,    96,     8,  9051,    96,     3,
            6,    13, 24067,  1227,  7279,  1222,   112,  2084,     3,     5,
         1820,  9175, 12250,  1007,    12,   376,    38,   163,    96,     8,
         9051,    96,     3,     6,   736,  9860,    23, 11970,   112,  4284,
           13, 24067,  1227,  7279,  1222,   112,  2084,     3,     5,     1,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,    

Map:  82%|████████▏ | 3000/3668 [00:00<00:00, 5694.88 examples/s]

tensor([ 3856, 27111,    10,   216,   808,     3, 27759,  1032,    30,     8,
         1057,    21,     8,   511,    97,  2818,   437,   112,  2101,    18,
         1135,  2871,     3,     5,  1820,  9175,  1022,   449,     3,     6,
          113,  1028,  5133,   920,   112,   646,  8173,    16,     3,     9,
        16345,  1332,  2664,     3,     6,   808,     3, 27759,  1032,    30,
            8,  1057,    21,     8,   166,    97,  2089,     3,     5,     1,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

Map: 100%|██████████| 3668/3668 [00:00<00:00, 5556.09 examples/s]


Example preprocessed input: paraphrase: Elsewhere in the diary , Truman showed a more familiar side , colorful and outspoken in his disdain for life in the White House . || History aside , the diary reveals a colorful , witty , introspective and irreverent president outspoken in his disdain for life in the White House .
Tokenized input IDs: tensor([ 3856, 27111,    10,  1289,     7,    15,  8352,    16,     8, 25933,
            3,     6,  7953,   348,  3217,     3,     9,    72,  3324,   596,
            3,     6, 10540,    11,    91,  7990,  2217,    16,   112,  1028,
           26,     9,    77,    21,   280,    16,     8,  1945,  1384,     3,
            5,  1820,  9175,  5528,  5915,     3,     6,     8, 25933,     3,
        15503,     3,     9, 10540,     3,     6,     3,   210, 17132,     3,
            6,    16, 30113,   162,    11, 19598,   624,   295,  2753,    91,
         7990,  2217,    16,   112,  1028,    26,     9,    77,    21,   280,
           16,     8,  1945,  138

Map: 100%|██████████| 408/408 [00:00<00:00, 5861.74 examples/s]


Example preprocessed input: paraphrase: He said the foodservice pie business doesn 't fit the company 's long-term growth strategy . || " The foodservice pie business does not fit our long-term growth strategy .
Tokenized input IDs: tensor([ 3856, 27111,    10,   216,   243,     8,   542,  5114,  6253,   268,
          744,     3,    31,    17,  1400,     8,   349,     3,    31,     7,
          307,    18,  1987,  1170,  1998,     3,     5,  1820,  9175,    96,
           37,   542,  5114,  6253,   268,   405,    59,  1400,    69,   307,
           18,  1987,  1170,  1998,     3,     5,     1,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,  

Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 6167.01 examples/s]

Example preprocessed input: paraphrase: PCCW 's chief operating officer , Mike Butcher , and Alex Arena , the chief financial officer , will report directly to Mr So . || Current Chief Operating Officer Mike Butcher and Group Chief Financial Officer Alex Arena will report to So .
Tokenized input IDs: tensor([ 3856, 27111,    10,  2104, 18105,     3,    31,     7,  5752,  2699,
         5502,     3,     6,  4794,   299,  1703,     3,     6,    11,  5104,
        14904,     3,     6,     8,  5752,   981,  5502,     3,     6,    56,
          934,  1461,    12,  1363,   264,     3,     5,  1820,  9175, 12892,
         5116, 21606,  6027,  4794,   299,  1703,    11,  1531,  5116,  5421,
         6027,  5104, 14904,    56,   934,    12,   264,     3,     5,     1,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,    

Map: 100%|██████████| 1725/1725 [00:00<00:00, 6097.42 examples/s]

Example preprocessed input: paraphrase: Intel sells the current top Pentium for US $ 637 in quantities of 1,000 . || Intel 's current Pentium 4 chips have 512K bytes of cache .
Tokenized input IDs: tensor([ 3856, 27111,    10,  5869,  1789,     7,     8,   750,   420,  4511,
           17,  2552,    21,   837,  1514,   431,  4118,    16, 16274,    13,
        11668,     3,     5,  1820,  9175,  5869,     3,    31,     7,   750,
         4511,    17,  2552,   314,  8852,    43,     3, 24163,   439,    57,
         1422,    13, 11800,     3,     5,     1,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0, 




In [34]:
from torch.utils.data import DataLoader

In [None]:
from transformers import DataCollatorForSeq2Seq

# Initialize the Data Collator for Seq2Seq tasks
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=None,
    padding=True, 
    return_tensors="pt" 
)

# Define DataLoaders with the collator
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=data_collator)
val_dataloader = DataLoader(val_dataset, batch_size=8, collate_fn=data_collator)

# Inspect batch shape again
for batch in train_dataloader:
    print({key: val.shape for key, val in batch.items()})
    break


{'idx': torch.Size([8]), 'input_ids': torch.Size([8, 128]), 'attention_mask': torch.Size([8, 128]), 'labels': torch.Size([8, 10])}


In [36]:
# Load the T5 model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Move the model to the appropriate device (GPU/CPU)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

print(f"Model loaded on device: {device}")


Model loaded on device: cpu


In [37]:
from torch.optim import AdamW
from transformers import get_scheduler

# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Total training steps = num_batches_per_epoch * num_epochs
num_training_steps = len(train_dataloader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

print(f"Number of training steps: {num_training_steps}")


Number of training steps: 1377


In [38]:
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, AdamW, get_scheduler, AutoTokenizer
from sklearn.metrics import accuracy_score, f1_score
from itertools import product
from tqdm import tqdm

# Set device for computation
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize tokenizer
tokenizer = AutoTokenizer.from_pretrained("t5-small")

In [None]:
# Assuming train_dataset and val_dataset are already loaded
train_dataset = train_dataset.select(range(100))  
val_dataset = val_dataset.select(range(20))

# Define a basic data collator
def data_collator(batch):
    input_ids = torch.stack([torch.tensor(item["input_ids"]) for item in batch])
    attention_mask = torch.stack([torch.tensor(item["attention_mask"]) for item in batch])
    labels = torch.stack([torch.tensor(item["labels"]) for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}




In [40]:
# Hyperparameter grid
learning_rates = [3e-5]  # Single learning rate
batch_sizes = [8]        # Single batch size
epochs = [1]             # Single epoch to minimize runtime

# Initialize results storage
results = []
combos = list(product(learning_rates, batch_sizes, epochs))

In [None]:

def evaluate_hyperparameters(lr, batch_size, epochs):
    # DataLoaders
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
    val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=data_collator)

    # Model and Optimizer
    model = T5ForConditionalGeneration.from_pretrained("t5-small").to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    num_training_steps = len(train_dataloader) * epochs
    lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    # Training Loop
    for epoch in range(epochs):
        model.train()
        for batch in tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}", leave=False):
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

    # Evaluation Loop
    model.eval()
    predictions, references = [], []
    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc="Evaluating", leave=False):
            batch = {key: val.to(device) for key, val in batch.items()}
            outputs = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_new_tokens=10)
            decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
            decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

            # Clean predictions
            predictions.extend([1 if pred.strip() == "equivalent" else 0 for pred in decoded_preds])
            references.extend([1 if ref.strip() == "equivalent" else 0 for ref in decoded_labels])

            # DEBUG: Inspect predictions
            if "DEBUG" in globals() and DEBUG:
                print("Predictions:", decoded_preds[:5])
                print("References:", decoded_labels[:5])

    # Metrics Calculation
    acc = accuracy_score(references, predictions)
    f1 = f1_score(references, predictions, zero_division=1)
    return acc, f1


In [None]:

DEBUG = True  # Set to True to enable debugging outputs

# Initialize results storage (reset to avoid overwriting issues)
results = []

for lr, batch_size, epoch in tqdm(combos, desc="Hyperparameter Search Progress"):
    acc, f1 = evaluate_hyperparameters(lr, batch_size, epoch)
    results.append({"learning_rate": lr, "batch_size": batch_size, "epochs": epoch, "accuracy": acc, "f1_score": f1})

    if DEBUG:
        print(f"Results for LR={lr}, Batch Size={batch_size}, Epochs={epoch}: Accuracy={acc}, F1={f1}")

# Display Best Configuration
best_config = max(results, key=lambda x: x["f1_score"])
print(f"Best Hyperparameters: {best_config}")




Predictions: ['False', 'True', 'True', 'True', 'True']
References: ['equivalent', 'not equivalent', 'not equivalent', 'equivalent', 'not equivalent']


Hyperparameter Search Progress: 100%|██████████| 1/1 [00:11<00:00, 11.98s/it]

Predictions: ['|| For residents with incomes above $ ', 'True', 'delegates said raising and distributing funds has been', 'Paraphrase: " Sanitation is poor ', 'False']
References: ['equivalent', 'equivalent', 'equivalent', 'not equivalent', 'not equivalent']
Predictions: ['Paraphrase', 'True', 'paraphrase', 'True']
References: ['equivalent', 'not equivalent', 'equivalent', 'not equivalent']
Results for LR=3e-05, Batch Size=8, Epochs=1: Accuracy=0.4, F1=0.0
Best Hyperparameters: {'learning_rate': 3e-05, 'batch_size': 8, 'epochs': 1, 'accuracy': 0.4, 'f1_score': np.float64(0.0)}





In [43]:
from tqdm import tqdm

# Set the model to training mode
model.train()

# Number of epochs
epochs = 3

# Training loop
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    epoch_loss = 0
    progress_bar = tqdm(train_dataloader, desc="Training", leave=False)

    for batch in progress_bar:
        # Move batch to device
        batch = {key: val.to(device) for key, val in batch.items()}

        # Forward pass
        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"]
        )

        # Compute loss
        loss = outputs.loss
        epoch_loss += loss.item()

        # Backpropagation
        loss.backward()

        # Update weights
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Update progress bar with loss
        progress_bar.set_postfix(loss=loss.item())

    # Print epoch loss
    print(f"Epoch {epoch+1} Loss: {epoch_loss/len(train_dataloader):.4f}")


Epoch 1/3


                                                                        

Epoch 1 Loss: 0.8516
Epoch 2/3


                                                                        

Epoch 2 Loss: 0.0871
Epoch 3/3


                                                                        

Epoch 3 Loss: 0.0737




In [44]:
# Save the fine-tuned model
model.save_pretrained("t5_mrpc_paraphrasing")
tokenizer.save_pretrained("t5_mrpc_paraphrasing")

print("Model saved successfully!")

Model saved successfully!


In [45]:
from sklearn.metrics import accuracy_score, f1_score

# Set the model to evaluation mode
model.eval()

# Initialize lists to store predictions and references
predictions = []
references = []

# Turn off gradient calculations for evaluation
with torch.no_grad():
    for batch in tqdm(val_dataloader, desc="Evaluating"):
        # Move batch to device
        batch = {key: val.to(device) for key, val in batch.items()}

        # Generate predictions
        outputs = model.generate(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
        decoded_preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)

        # Store predictions and references
        predictions.extend(decoded_preds)
        references.extend(decoded_labels)

# Convert textual predictions/references into binary labels
predictions_binary = [1 if pred == "equivalent" else 0 for pred in predictions]
references_binary = [1 if ref == "equivalent" else 0 for ref in references]

# Calculate Accuracy and F1 Score
accuracy = accuracy_score(references_binary, predictions_binary)
f1 = f1_score(references_binary, predictions_binary)

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

Evaluating: 100%|██████████| 51/51 [00:07<00:00,  6.68it/s]

Accuracy: 0.7451
F1 Score: 0.8410





In [None]:
# Identify misclassified examples
errors = []
for i, (pred, ref, s1, s2) in enumerate(zip(predictions_binary, references_binary, dataset["validation"]["sentence1"], dataset["validation"]["sentence2"])):
    if pred != ref:
        errors.append({
            "sentence1": s1,
            "sentence2": s2,
            "predicted": "equivalent" if pred == 1 else "not equivalent",
            "actual": "equivalent" if ref == 1 else "not equivalent"
        })

# Display some misclassified examples
for i, error in enumerate(errors[:5]):
    print(f"Error {i+1}:")
    print(f"Sentence 1: {error['sentence1']}")
    print(f"Sentence 2: {error['sentence2']}")
    print(f"Predicted: {error['predicted']} | Actual: {error['actual']}")
    print("-" * 50)

Error 1:
Sentence 1: The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .
Sentence 2: The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
Predicted: equivalent | Actual: not equivalent
--------------------------------------------------
Error 2:
Sentence 1: No dates have been set for the civil or the criminal trial .
Sentence 2: No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
Predicted: equivalent | Actual: not equivalent
--------------------------------------------------
Error 3:
Sentence 1: While dioxin levels in the environment were up last year , they have dropped by 75 percent since the 1970s , said Caswell .
Sentence 2: The Institute said dioxin levels in the environment have fallen by as much as 76 percent since the 1970s .
Predicted: equivalent | Actual: not equivalent
-----------------

In [None]:
# Benchmark results
bert_accuracy = 0.84 
bert_f1_score = 0.88 

# Display T5 Model Results
print("T5 Model Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1:.4f}")

# Comparison with BERT Benchmark
print("\nComparison with BERT Benchmark:")
print(f"T5 Accuracy: {accuracy:.4f} vs BERT Accuracy: {bert_accuracy:.4f}")
print(f"T5 F1 Score: {f1:.4f} vs BERT F1 Score: {bert_f1_score:.4f}")

# Evaluate and display performance comparison
if accuracy > bert_accuracy and f1 > bert_f1_score:
    print("T5 outperforms BERT on the MRPC dataset.")
elif accuracy < bert_accuracy and f1 < bert_f1_score:
    print("T5 underperforms compared to BERT on the MRPC dataset.")
else:
    print("T5 has mixed performance compared to BERT.")


T5 Model Results:
Accuracy: 0.7451
F1 Score: 0.8410

Comparison with BERT Benchmark:
T5 Accuracy: 0.7451 vs BERT Accuracy: 0.8400
T5 F1 Score: 0.8410 vs BERT F1 Score: 0.8800
T5 underperforms compared to BERT on the MRPC dataset.
