<a href="https://colab.research.google.com/github/als138/DistilBERT/blob/main/DistilBART_MRPC%26%26QNLI_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Step 1: Install and import libraries
!pip install -q transformers datasets evaluate accelerate

import transformers
import torch
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline
)

print(f"Transformers version: {transformers.__version__}")

# Step 2: Load model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
print(">>> Model and tokenizer loaded.")

# Step 3: Load and preprocess the MRPC dataset
dataset = load_dataset("glue", "mrpc")

def tokenize_mrpc(examples):
    # Tokenize the pair of sentences (sentence1 and sentence2)
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = dataset.map(tokenize_mrpc, batched=True)
print(">>> MRPC dataset loaded and preprocessed.")

# Step 4: Define evaluation metrics (Accuracy and F1)
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

# Step 5: Define training arguments (updated)
training_args = TrainingArguments(
    output_dir="distilbert-finetuned-mrpc",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # Use F1 as the main metric
    report_to=[],
)

# Step 6: Create Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

print("\n>>> Starting training...")
trainer.train()

# Step 7: Final evaluation and testing
print("\n>>> Final evaluation results for MRPC:")
eval_results = trainer.evaluate()
print(eval_results)

# Test on new sentence pairs
device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("text-classification", model=trainer.model, tokenizer=tokenizer, device=device)

custom_pairs = [
    {"text": "The company reported its earnings for the last quarter.", "text_pair": "Last quarter's earnings were announced by the firm."},
    {"text": "The flight to New York was delayed by an hour.", "text_pair": "The plane arrived in London on time."}
]

results = classifier(custom_pairs)
print("\n>>> Test on new sentence pairs:")
for pair, result in zip(custom_pairs, results):
    label = "Paraphrase" if result['label'] == 'LABEL_1' else "Not paraphrase"
    print(f'Sentences: "{pair["text"]}" and "{pair["text_pair"]}"')
    print(f'  -> Prediction: {label} (score: {result["score"]:.4f})\n')


Transformers version: 4.54.0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


>>> Model and tokenizer loaded.


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

>>> MRPC dataset loaded and preprocessed.

>>> Starting training...


ValueError: expected sequence of length 59 at dim 1 (got 43)

In [5]:
# Step 1: Install and import libraries
!pip install -q transformers datasets evaluate accelerate

import transformers
import torch
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    pipeline,
    DataCollatorWithPadding  # ✅ اضافه شده برای Padding داینامیک
)

print(f"Transformers version: {transformers.__version__}")

# Step 2: Load model and tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
print(">>> Model and tokenizer loaded.")

# Step 3: Load and preprocess the MRPC dataset
dataset = load_dataset("glue", "mrpc")

def tokenize_mrpc(examples):
    # Tokenize sentence pairs with truncation only (padding handled later)
    return tokenizer(examples["sentence1"], examples["sentence2"], truncation=True)

tokenized_datasets = dataset.map(tokenize_mrpc, batched=True)
print(">>> MRPC dataset loaded and preprocessed.")

# ✅ Step 3.5: Create dynamic padding collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 4: Define evaluation metrics
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    return {"accuracy": accuracy["accuracy"], "f1": f1["f1"]}

# Step 5: Define training arguments
training_args = TrainingArguments(
    output_dir="distilbert-finetuned-mrpc",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # Use F1 as the main metric
    report_to=[],
)

# Step 6: Create Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator  # ✅ اضافه شده برای Padding درست
)

print("\n>>> Starting training...")
trainer.train()


Transformers version: 4.54.0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


>>> Model and tokenizer loaded.


Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

>>> MRPC dataset loaded and preprocessed.

>>> Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.379651,0.840686,0.886165
2,No log,0.368589,0.833333,0.885906
3,0.386800,0.587505,0.838235,0.888889


TrainOutput(global_step=690, training_loss=0.32125712132108386, metrics={'train_runtime': 125.8279, 'train_samples_per_second': 87.453, 'train_steps_per_second': 5.484, 'total_flos': 215773868887344.0, 'train_loss': 0.32125712132108386, 'epoch': 3.0})

In [6]:
# ------------------------------------------------------------------
# Step 7: Final Evaluation and Testing on Custom Sentence Pairs
# ------------------------------------------------------------------

# Final evaluation on the validation dataset
print("\n>>> Final evaluation results on the validation dataset:")
eval_results = trainer.evaluate()
print(eval_results)

# Compare with paper-reported performance
print(f"\nComparison with paper: Our model accuracy: {eval_results['eval_accuracy']:.4f} | Reported accuracy in paper: 0.843")

# Create a pipeline for easy inference
from transformers import pipeline

# Determine device (GPU if available, otherwise CPU)
device = 0 if torch.cuda.is_available() else -1

# Create a text-classification pipeline using the fine-tuned model
paraphrase_pipeline = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    device=device
)

# Custom sentence pairs to test the model
custom_pairs = [
    {"text": "The team won the game in the last minute.", "text_pair": "They secured the victory at the very end of the match."},
    {"text": "She enjoys painting landscapes.", "text_pair": "He plays football with his friends every weekend."},
    {"text": "The weather forecast predicts rain tomorrow.", "text_pair": "Tomorrow, it is expected to be rainy."}
]

print("\n>>> Testing the model on new sentence pairs:")
results = paraphrase_pipeline(custom_pairs)

for pair, result in zip(custom_pairs, results):
    # Interpret labels: LABEL_1 → Paraphrase, LABEL_0 → Not paraphrase
    label = "Paraphrase" if result['label'] == 'LABEL_1' else "Not paraphrase"
    print(f'Sentences: "{pair["text"]}" and "{pair["text_pair"]}"')
    print(f'  -> Prediction: {label} (score: {result["score"]:.4f})\n')

# Save the final model and tokenizer
final_model_path = "./distilbert-finetuned-mrpc-final"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)
print(f"Final model and tokenizer saved to '{final_model_path}'.")



>>> Final evaluation results on the validation dataset:


Device set to use cuda:0


{'eval_loss': 0.5875053405761719, 'eval_accuracy': 0.8382352941176471, 'eval_f1': 0.8888888888888888, 'eval_runtime': 1.0295, 'eval_samples_per_second': 396.316, 'eval_steps_per_second': 25.255, 'epoch': 3.0}

Comparison with paper: Our model accuracy: 0.8382 | Reported accuracy in paper: 0.843

>>> Testing the model on new sentence pairs:
Sentences: "The team won the game in the last minute." and "They secured the victory at the very end of the match."
  -> Prediction: Not paraphrase (score: 0.7125)

Sentences: "She enjoys painting landscapes." and "He plays football with his friends every weekend."
  -> Prediction: Not paraphrase (score: 0.9474)

Sentences: "The weather forecast predicts rain tomorrow." and "Tomorrow, it is expected to be rainy."
  -> Prediction: Not paraphrase (score: 0.9717)

Final model and tokenizer saved to './distilbert-finetuned-mrpc-final'.


In [10]:
# Step 1 & 2: Install libraries and load model (skip if already done in the same notebook)
# ...

# Step 3: Load and preprocess the QNLI dataset
from datasets import load_dataset
dataset = load_dataset("glue", "qnli")

def tokenize_qnli(examples):
    # Tokenize question and sentence pairs
    return tokenizer(examples["question"], examples["sentence"], truncation=True)

tokenized_datasets = dataset.map(tokenize_qnli, batched=True)
print(">>> QNLI dataset loaded and preprocessed.")

# Step 4: Define evaluation metric (Accuracy is sufficient for this task)
import evaluate
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy_metric.compute(predictions=predictions, references=labels)

# Step 5: Define training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="distilbert-finetuned-qnli",
    eval_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to=[],
)

from transformers import DataCollatorWithPadding

# Create a dynamic padding collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Step 6: Create Trainer and start training
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    data_collator=data_collator  # ✅ added to fix padding error
)

print("\n>>> Starting training...")
trainer.train()


# Step 7: Final evaluation and testing
print("\n>>> Final evaluation results on QNLI validation set:")
eval_results = trainer.evaluate()
print(eval_results)

# Test on new question-sentence pairs
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1
classifier = pipeline("text-classification", model=trainer.model, tokenizer=tokenizer, device=device)

custom_pairs = [
    {"text": "What is the capital of France?", "text_pair": "The capital of France is Paris, a major European city."},
    {"text": "Who wrote 'Hamlet'?", "text_pair": "Charles Dickens was a famous English novelist."}
]

results = classifier(custom_pairs)
print("\n>>> Testing the model on new question-sentence pairs:")
for pair, result in zip(custom_pairs, results):
    label = "Entailment" if result['label'] == 'LABEL_0' else "Not Entailment"
    print(f'Question: "{pair["text"]}"\nSentence: "{pair["text_pair"]}"')
    print(f'  -> Prediction: {label} (score: {result["score"]:.4f})\n')


Map:   0%|          | 0/104743 [00:00<?, ? examples/s]

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

>>> QNLI dataset loaded and preprocessed.

>>> Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3789,0.334598,0.862164
2,0.2564,0.342898,0.866374
3,0.1715,0.491745,0.870218



>>> Final evaluation results on QNLI validation set:


Device set to use cuda:0


{'eval_loss': 0.4917454421520233, 'eval_accuracy': 0.8702178290316676, 'eval_runtime': 14.8748, 'eval_samples_per_second': 367.265, 'eval_steps_per_second': 22.992, 'epoch': 3.0}

>>> Testing the model on new question-sentence pairs:
Question: "What is the capital of France?"
Sentence: "The capital of France is Paris, a major European city."
  -> Prediction: Entailment (score: 0.9992)

Question: "Who wrote 'Hamlet'?"
Sentence: "Charles Dickens was a famous English novelist."
  -> Prediction: Not Entailment (score: 0.9849)



In [11]:
# Step 7: Final Evaluation and Testing on Custom Sentence Pairs

# Final evaluation on the validation set
print("\n>>> Final evaluation results on the QNLI validation set:")
eval_results = trainer.evaluate()
print(eval_results)

# Create a classification pipeline for easy inference
from transformers import pipeline
import torch

device = 0 if torch.cuda.is_available() else -1

classifier = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    device=device
)

# Custom question-sentence pairs for testing
custom_pairs = [
    {
        "text": "What is the capital of France?",
        "text_pair": "The capital of France is Paris, a major European city."
    },
    {
        "text": "Who wrote 'Hamlet'?",
        "text_pair": "Charles Dickens was a famous English novelist."
    }
]

# Run inference
results = classifier(custom_pairs)

print("\n>>> Testing the model on new question-sentence pairs:")
for pair, result in zip(custom_pairs, results):
    label = "Entailment" if result["label"] == "LABEL_0" else "Not Entailment"
    print(f'Question: "{pair["text"]}"')
    print(f'Sentence: "{pair["text_pair"]}"')
    print(f'  -> Prediction: {label} (score: {result["score"]:.4f})\n')



>>> Final evaluation results on the QNLI validation set:


Device set to use cuda:0


{'eval_loss': 0.4917454421520233, 'eval_accuracy': 0.8702178290316676, 'eval_runtime': 16.2483, 'eval_samples_per_second': 336.219, 'eval_steps_per_second': 21.048, 'epoch': 3.0}

>>> Testing the model on new question-sentence pairs:
Question: "What is the capital of France?"
Sentence: "The capital of France is Paris, a major European city."
  -> Prediction: Entailment (score: 0.9992)

Question: "Who wrote 'Hamlet'?"
Sentence: "Charles Dickens was a famous English novelist."
  -> Prediction: Not Entailment (score: 0.9849)

