# Fine-Tune a Summarization Model with Custom TSV Data + Training Curves
This notebook fine-tunes a text summarizer on custom TSV files (XSum-style format, with a `text` column and a `summary` column) and visualizes the training metrics.

## Install Required Libraries

In [None]:
#!pip install transformers datasets scikit-learn matplotlib pandas --quiet

## Load and Explore Custom TSV Data

In [None]:
# import pandas as pd

# tr = pd.read_csv("xsum_train.tsv", sep="\t", encoding='utf-8')
# ts = pd.read_csv("xsum_test.tsv", sep="\t", encoding='utf-8')
# vl = pd.read_csv("xsum_val.tsv", sep="\t", encoding='utf-8')

# print(tr.shape,ts.shape,vl.shape)

# #################################
# import pandas as pd


# # Sample the data
# tr_sample = tr.sample(n=10000, random_state=42)
# ts_sample = ts.sample(n=1000, random_state=42)
# vl_sample = vl.sample(n=1000, random_state=42)

# # Save the samples to TSV files
# tr_sample_path = "xsum_train.tsv"
# ts_sample_path = "xsum_test.tsv"
# vl_sample_path = "xsum_val.tsv"

# tr_sample.to_csv(tr_sample_path, sep="\t", index=False)
# ts_sample.to_csv(ts_sample_path, sep="\t", index=False)
# vl_sample.to_csv(vl_sample_path, sep="\t", index=False)

# (tr_sample_path, ts_sample_path, vl_sample_path)



In [None]:
from datasets import load_dataset

# Map every split to its own TSV file. The held-out test split has to come
# from xsum_test.tsv: pointing it at the validation file would turn the
# "test" evaluation into a repeat of the validation evaluation.
data_files = {
    "train": "xsum_train.tsv",
    "validation": "xsum_val.tsv",
    "test": "xsum_test.tsv",
}
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")
print(dataset["train"][0])

## Tokenize the Dataset for Summarization

In [None]:
from transformers import AutoTokenizer

model_checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Token budgets for the source document and the reference summary.
max_input_length = 512
max_target_length = 64


def preprocess_function(examples):
    """Tokenize a batch of (text, summary) pairs for seq2seq training.

    Args:
        examples: A batch mapping with "text" and "summary" columns, as passed
            by `dataset.map(..., batched=True)`.

    Returns:
        The tokenizer output for the texts, with an added "labels" entry that
        holds the tokenized summaries. Padding positions in the labels are set
        to -100.
    """
    # Cast to str so non-string cells (e.g. numbers or missing values parsed
    # from the TSV) do not crash the tokenizer.
    texts = [str(x) for x in examples["text"]]
    summaries = [str(x) for x in examples["summary"]]

    model_inputs = tokenizer(texts, max_length=max_input_length, truncation=True, padding="max_length")

    # `text_target=` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=summaries, max_length=max_target_length, truncation=True, padding="max_length")

    # Mask padding with -100 so padded positions are ignored by the loss;
    # otherwise the model is also trained to predict the padding tokens.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs


tokenized_dataset = dataset.map(preprocess_function, batched=True)

## Load the Pretrained Summarization Model

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load the seq2seq model from the same checkpoint name the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
# !pip install evaluate
# !pip install nltk rouge_score


## Define Evaluation Metrics (ROUGE)

In [None]:
import evaluate
import numpy as np

rouge = evaluate.load("rouge")


def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: A (predictions, labels) pair of token-id arrays as supplied
            by the trainer. `predictions` may also be a tuple whose first
            element holds the generated token ids.

    Returns:
        A dict mapping each ROUGE metric name to its score multiplied by 100
        and rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it
    # must be swapped for the pad token before decoding. Generated ids can
    # carry -100 too when batches of different lengths are concatenated.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Put each sentence on its own line (naive split on ". "); ROUGE-Lsum
    # treats newlines as sentence boundaries.
    decoded_preds = ["\n".join(p.strip().split(". ")) for p in decoded_preds]
    decoded_labels = ["\n".join(l.strip().split(". ")) for l in decoded_labels]

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    return {k: round(v * 100, 4) for k, v in result.items()}


## Set Training Arguments

In [None]:
import torch
from transformers import (
    GenerationConfig,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    TrainingArguments,
)

# Generation defaults shipped with the checkpoint (used during evaluation).
generation_config = GenerationConfig.from_pretrained(model_checkpoint)

# Seq2SeqTrainingArguments (not plain TrainingArguments) declares the
# generation-related fields, so they are passed to the constructor instead of
# being attached as ad-hoc attributes afterwards.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    # NOTE: newer transformers releases name this argument `eval_strategy`.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    num_train_epochs=2,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only
    # machines instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    save_total_limit=1,
    # Evaluate with generate() so compute_metrics receives token ids of
    # generated summaries rather than logits.
    predict_with_generate=True,
    generation_max_length=max_target_length,
    generation_num_beams=4,
    generation_config=generation_config,
)

## Initialize Trainer and Train the Model

In [None]:
from transformers import (
    DataCollatorForSeq2Seq,
    GenerationConfig,
    Seq2SeqTrainer,
    TrainingArguments,
)

# The generation config was already loaded and attached to `training_args`
# in the arguments cell, so it is not downloaded a second time here.
# Cap generated summaries at the same length the reference summaries were
# truncated to during tokenization.
training_args.generation_max_length = max_target_length

# Seq2seq-aware collator: pads labels with -100 (ignored by the loss) and
# builds decoder inputs from the labels via the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, label_pad_token_id=-100)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


## Evaluate on the Test Set

In [None]:
# Log test metrics under a "test_" prefix. With the default "eval" prefix the
# test scores are appended to trainer.state.log_history as eval_* entries and
# would show up as an extra point on the validation ROUGE curve.
trainer.evaluate(tokenized_dataset["test"], metric_key_prefix="test")

## Plot Training Loss and Evaluation Metrics

In [None]:
#!pip install matplotlib

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

log_df = pd.DataFrame(trainer.state.log_history)


def _rows_with(frame, column):
    """Return the rows of `frame` where `column` exists and is not NaN.

    Returns an empty frame when the column is missing, so callers can simply
    check `.empty` instead of catching a KeyError.
    """
    if column not in frame.columns:
        return frame.iloc[0:0]
    return frame.dropna(subset=[column])


# Training Loss
# log_history mixes training, evaluation and summary records; only the
# training records carry "loss", so the other rows are dropped first.
train_df = _rows_with(log_df, "loss")
if not train_df.empty:
    fig, ax = plt.subplots(figsize=(10, 4))
    ax.plot(train_df["step"], train_df["loss"], label="Training Loss")
    ax.set_xlabel("Steps")
    ax.set_ylabel("Loss")
    ax.set_title("Training Loss Curve")
    ax.legend()
    ax.grid(True)
    plt.show()

# ROUGE Evaluation (if available)
eval_df = _rows_with(log_df, "eval_rouge1")
if not eval_df.empty:
    fig, ax = plt.subplots(figsize=(10, 4))
    for column, label in (
        ("eval_rouge1", "ROUGE-1"),
        ("eval_rouge2", "ROUGE-2"),
        ("eval_rougeL", "ROUGE-L"),
    ):
        if column in eval_df.columns:
            ax.plot(eval_df["step"], eval_df[column], label=label)
    ax.set_xlabel("Steps")
    ax.set_ylabel("ROUGE Score")
    ax.set_title("Evaluation Metrics")
    ax.legend()
    ax.grid(True)
    plt.show()

## Save the Fine-Tuned Model

In [None]:
# Save the model and the tokenizer into the same directory so both can be
# reloaded from a single path.
model.save_pretrained("./custom-summarizer")
tokenizer.save_pretrained("./custom-summarizer")

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned model
model = AutoModelForSeq2SeqLM.from_pretrained("./custom-summarizer")
tokenizer = AutoTokenizer.from_pretrained("./custom-summarizer")

# Your input text
# text = """The Transformer architecture has revolutionized NLP tasks. 
# It enabled the development of large language models such as BERT and GPT, 
# which have set state-of-the-art benchmarks in various applications."""

# Take the first test example from the loaded dataset. The pandas frame `ts`
# is only created in a commented-out cell, so it does not exist after
# Restart & Run All.
text = str(dataset["test"][0]["text"])

# Tokenize input with the same length limit used during training
inputs = tokenizer(text, return_tensors="pt", max_length=max_input_length, truncation=True)

# Generate summary; unpacking `inputs` also forwards the attention mask
summary_ids = model.generate(
    **inputs,
    max_length=max_target_length,
    num_beams=4,
    length_penalty=2.0,
    early_stopping=True,
)

# Decode and print the summary
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
print("📄 Summary:", summary)


In [None]:
# Show the source text and reference summary of the first test example.
# Read from the loaded dataset: the pandas frame `ts` only exists in a
# commented-out cell and is undefined on a fresh kernel.
test_example = dataset["test"][0]
print(test_example['text'])
print('*****************************')
print(test_example['summary'])
print('*****************************')

## Summary
- Loaded custom TSV data for summarization
- Fine-tuned BART on the dataset
- Plotted training and evaluation curves
- Saved the trained model for reuse or deployment