<a href="https://colab.research.google.com/github/angomoson/BostonHousePricing/blob/main/mbart.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [7]:
!pip install evaluate
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=502fb2d070815cf98f02a70d3bddecac337ede491903e86b2e2cf0f5484d53db
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import os
import pandas as pd
import torch
from datasets import Dataset
import evaluate  # ✅ Use `evaluate` instead of `datasets.load_metric`
from transformers import (
    AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq,
    TrainingArguments, Trainer
)

# Pick the compute device: the GPU when PyTorch can see one, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the dataset
dataset_path = "/content/drive/MyDrive/fixed_dataset.csv"  # Change this to your actual dataset path
if not os.path.exists(dataset_path):
    raise FileNotFoundError(f"Dataset file '{dataset_path}' not found!")

print("Loading dataset...")
df = pd.read_csv(dataset_path, encoding="utf-8")

# Validate the schema first so a wrong file fails with a clear message
# before any rows are filtered.
required_columns = {"article", "summary"}
if not required_columns.issubset(df.columns):
    raise ValueError(f"Dataset must have columns: {required_columns}. Found: {df.columns}")

# Drop only rows that lack an article or a summary. A blanket dropna() would
# also discard usable rows that merely have a NaN in some unrelated column.
df = df.dropna(subset=sorted(required_columns)).reset_index(drop=True)
if df.empty:
    raise ValueError(f"Dataset file '{dataset_path}' has no rows with both an article and a summary.")

# Convert dataset to Hugging Face format
dataset = Dataset.from_pandas(df)

# Load the tokenizer for mBART; model_name is reused when the model weights are loaded.
model_name = "facebook/mbart-large-50"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# ✅ Tokenization function
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of articles and summaries for mBART fine-tuning.

    Args:
        examples: Batch mapping with "article" and "summary" entries, each a
            list of strings (the shape `Dataset.map(..., batched=True)` passes).
        max_length: Length every article and summary is truncated/padded to.

    Returns:
        The tokenizer output for the articles, with an added "labels" entry
        holding the summary token ids. Padding positions in the labels are
        set to -100 so they are ignored by the cross-entropy loss.
    """
    inputs = tokenizer(examples["article"], max_length=max_length, padding="max_length", truncation=True)
    targets = tokenizer(examples["summary"], max_length=max_length, padding="max_length", truncation=True)

    # Summaries are padded to max_length with the real pad token id. If those
    # ids stay in the labels, the model is also trained to predict padding;
    # -100 is the ignore index of the loss, so mask them out.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in target_ids]
        for target_ids in targets["input_ids"]
    ]
    return inputs

# Apply tokenization
print("Tokenizing dataset...")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Split dataset into training & validation sets.
# Shuffle with a fixed seed before slicing: a plain head/tail slice would make
# the validation set "the last 10% of the file", which is biased whenever the
# CSV is ordered (by date, source, topic, ...). The seed keeps the split
# reproducible across runs.
split_ratio = 0.9
split_seed = 42
train_size = int(len(tokenized_dataset) * split_ratio)

shuffled_dataset = tokenized_dataset.shuffle(seed=split_seed)
train_dataset = shuffled_dataset.select(range(train_size))
eval_dataset = shuffled_dataset.select(range(train_size, len(shuffled_dataset)))

print(f"Dataset split: {len(train_dataset)} training samples, {len(eval_dataset)} validation samples.")

# Fetch the pretrained mBART weights, then move them onto the selected device.
print("Loading model...")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model = model.to(device)

# Seq2seq batch collator (pads each batch); it is given the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(model=model, tokenizer=tokenizer)

# Training configuration, grouped by concern.
training_args = TrainingArguments(
    # Where checkpoints and logs go; keep only the two most recent checkpoints.
    output_dir="./mbart_summarization",
    logging_dir="./logs",
    save_total_limit=2,
    report_to="none",
    # Evaluate and checkpoint once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Optimisation hyper-parameters.
    num_train_epochs=3,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
)

# ROUGE metric loaded through the `evaluate` package (not `datasets.load_metric`).
rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
    """Compute ROUGE scores (as percentages) for an evaluation pass.

    Args:
        eval_pred: Pair `(predictions, labels)`. `predictions` may be token
            ids of shape (batch, seq), raw logits of shape (batch, seq, vocab),
            or a tuple whose first element is one of those. `labels` holds
            reference token ids and may contain -100 at ignored positions.

    Returns:
        Dict mapping each ROUGE key to its score multiplied by 100 and rounded
        to two decimals.
    """
    import numpy as np  # local import: numpy is not imported at the top of the notebook

    predictions, labels = eval_pred

    # Seq2seq models return several outputs; the logits / ids come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # A plain Trainer hands over logits, not token ids: take the most likely token.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be replaced before decoding.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    scores = {}
    for key, value in result.items():
        # `evaluate` returns plain floats; the legacy `datasets` metric returned
        # aggregate objects exposing `.mid.fmeasure`. Accept both.
        if hasattr(value, "mid"):
            value = value.mid.fmeasure
        scores[key] = round(float(value) * 100, 2)
    return scores

def _logits_to_token_ids(logits, labels):
    """Reduce per-batch logits to predicted token ids before they are cached for metrics.

    Without this, the evaluation loop keeps the full (batch, seq, vocab) logits
    of every validation batch until metrics are computed, which for a large
    vocabulary can exhaust memory. Seq2seq models return a tuple whose first
    element is the logits tensor.
    """
    if isinstance(logits, tuple):
        logits = logits[0]
    return logits.argmax(dim=-1)


# Initialize Trainer
# NOTE(review): predictions are the teacher-forced argmax tokens, so the ROUGE
# numbers are a proxy rather than scores of freely generated summaries.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    preprocess_logits_for_metrics=_logits_to_token_ids,
)

# Run the fine-tuning loop.
print("\n🚀 Starting training...")
trainer.train()

# Write the model first and then the tokenizer into the same directory.
print("\n✅ Training complete! Saving model...")
for artifact in (model, tokenizer):
    artifact.save_pretrained("mbart_finetuned")

# Smoke-test the fine-tuned model on a single article.
print("\n🔹 Testing model with a sample article...")
sample_text = "Your sample article text here"  # Replace with actual text
inputs = tokenizer(
    sample_text,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)
inputs = inputs.to(device)

# Generate the summary with gradient tracking disabled and the model in eval mode.
model.eval()
with torch.no_grad():
    output = model.generate(max_length=50, **inputs)

# Decode the first (only) generated sequence back to text.
summary = tokenizer.decode(output[0], skip_special_tokens=True)
print("\n📝 Generated Summary:", summary)


Using device: cpu
Loading dataset...
Tokenizing dataset...


Map:   0%|          | 0/1293 [00:00<?, ? examples/s]

Dataset split: 1163 training samples, 130 validation samples.
Loading model...


  trainer = Trainer(



🚀 Starting training...


Epoch,Training Loss,Validation Loss
