# Fine-tuning Phi-2 for Abstractive Text Summarization with XSum Dataset

This notebook fine-tunes Microsoft's Phi-2 model for abstractive text summarization using the XSum dataset. The model learns to generate concise, abstractive summaries through instruction-style prompting and causal language modeling.


**Important:** If you encounter import errors, please:
1. Run the installation cell above
2. **Restart the kernel** (Kernel → Restart Kernel)
3. Run all cells again from the beginning


## 1. Import Libraries and Setup


## Troubleshooting Import Errors

If you're getting import errors, run the cell below to diagnose the issue.


In [2]:
# Troubleshooting: Check if all packages are properly installed
import sys

def check_package(package_name, import_name=None):
    if import_name is None:
        import_name = package_name
    try:
        mod = __import__(import_name)
        version = getattr(mod, '__version__', 'unknown')
        print(f"✓ {package_name}: {version}")
        return True
    except ImportError as e:
        print(f"✗ {package_name}: NOT FOUND - {e}")
        return False

print("Checking package installations...")
print("="*60)
packages_ok = True
packages_ok &= check_package("torch")
packages_ok &= check_package("transformers")
packages_ok &= check_package("datasets")
packages_ok &= check_package("accelerate")
packages_ok &= check_package("peft")
packages_ok &= check_package("bitsandbytes", "bitsandbytes")

print("="*60)
if packages_ok:
    print("✓ All packages are installed correctly!")
else:
    print("✗ Some packages are missing. Please run the installation cell above.")
    print("Then restart the kernel and try again.")


Checking package installations...
✓ torch: 2.9.1+cu128


  from .autonotebook import tqdm as notebook_tqdm


✓ transformers: 4.57.3
✓ datasets: 4.4.2
✓ accelerate: 1.12.0
✓ peft: 0.18.0
✓ bitsandbytes: 0.49.0
✓ All packages are installed correctly!


In [3]:
# Import PEFT FIRST before transformers Trainer to avoid import issues
try:
    import peft
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    PEFT_AVAILABLE = True
    print("✓ PEFT library loaded successfully")
except ImportError as e:
    print(f"✗ Error importing PEFT: {e}")
    print("Please install PEFT: pip install --upgrade peft")
    PEFT_AVAILABLE = False
    raise

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    GenerationConfig
)

import os
import numpy as np
from tqdm import tqdm
import nltk
nltk.download('punkt', quiet=True)

# Verify versions
print("\n" + "="*50)
print("Library Versions:")
print("="*50)
import transformers
print(f"torch: {torch.__version__}")
print(f"transformers: {transformers.__version__}")
print(f"peft: {peft.__version__}")
print("="*50)
print("✓ All libraries imported successfully")


✓ PEFT library loaded successfully

Library Versions:
torch: 2.9.1+cu128
transformers: 4.57.3
peft: 0.18.0
✓ All libraries imported successfully


## 2. Load XSum Dataset


In [4]:
import datasets

# Load the XSum dataset directly from the Hugging Face Hub
dataset = load_dataset(
    "EdinburghNLP/xsum",
    revision="refs/convert/parquet"
)

print("Dataset structure:")
print(dataset)
print("\nSample from training set:")
print(dataset['train'][0])

Dataset structure:
DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 204045
    })
    validation: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11332
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 11334
    })
})

Sample from training set:


## 3. Load Phi-2 Model and Tokenizer


In [5]:
# Model configuration
model_id = "microsoft/phi-2"

# Configure 4-bit quantization for memory efficiency
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Load model with quantization
print("Loading Phi-2 model...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

print(f"Model loaded. Vocabulary size: {len(tokenizer)}")
print(f"EOS token: {tokenizer.eos_token}, PAD token: {tokenizer.pad_token}")


Loading Phi-2 model...


`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|██████████| 2/2 [00:14<00:00,  7.24s/it]


Model loaded. Vocabulary size: 50295
EOS token: <|endoftext|>, PAD token: <|endoftext|>


## 4. Prepare Model for Training (LoRA)


In [6]:
# Prepare model for k-bit training
model = prepare_model_for_kbit_training(model)

# Configure LoRA for efficient fine-tuning
lora_config = LoraConfig(
    r=16,  # LoRA rank
    lora_alpha=32,  # LoRA alpha scaling
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],  # Target attention layers
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Apply LoRA to model
model = get_peft_model(model, lora_config)

# Enable gradient checkpointing to save memory (for 5.67GB GPU)
if hasattr(model, "gradient_checkpointing_enable"):
    model.gradient_checkpointing_enable()
    print("✓ Gradient checkpointing enabled")

model.print_trainable_parameters()


✓ Gradient checkpointing enabled
trainable params: 10,485,760 || all params: 2,790,169,600 || trainable%: 0.3758


## 5. Preprocess Dataset with Instruction-Style Prompting


In [8]:
def create_instruction_prompt(document, summary=None):
    """
    Create instruction-style prompt for abstractive summarization.
    This format helps the model learn to generate concise summaries.
    """
    instruction = "Summarize the following news article in a concise and abstractive way."

    prompt = f"### Instruction:\n{instruction}\n\n### Article:\n{document}\n\n### Summary:\n"

    if summary is not None:
        prompt += summary

    return prompt

# Test the prompt format
sample = dataset['train'][0]
test_prompt = create_instruction_prompt(sample['document'], sample['summary'])
print("Sample prompt format:")
print(test_prompt[:500] + "...")


Sample prompt format:
### Instruction:
Summarize the following news article in a concise and abstractive way.

### Article:
The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed.
Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water.
Trains on the west coast mainline face disruption due to damage at the Lamington Viaduct.
Many businesses and householders were affected by flooding in Newton Stewart after the River Cree over...


In [9]:
def tokenize_function(examples):
    """
    Tokenize the dataset with instruction-style prompts.
    For causal LM training, we mask the instruction part so the model only learns to generate summaries.
    All sequences are padded to max_length for consistent batching.
    """
    # Create prompts with summaries
    prompts_with_summary = [create_instruction_prompt(doc, summ) for doc, summ in zip(examples['document'], examples['summary'])]

    # Create prompts without summaries (to find where summary starts)
    prompts_without_summary = [create_instruction_prompt(doc) for doc in examples['document']]

    # Tokenize both - MUST pad to max_length for consistent batching
    max_length = 512  # Reduced from 1024 to 512 for lower memory usage (5.67GB GPU)
    pad_token_id = tokenizer.pad_token_id

    tokenized_full = tokenizer(
        prompts_with_summary,
        truncation=True,
        max_length=max_length,
        padding="max_length",  # Pad to max_length for consistent batching
        return_tensors=None
    )

    tokenized_prompt = tokenizer(
        prompts_without_summary,
        truncation=True,
        max_length=max_length,
        padding="max_length",  # Pad to max_length for consistent batching
        return_tensors=None
    )

    # Create labels: mask instruction part (set to -100), keep summary part
    labels = []
    for i in range(len(tokenized_full["input_ids"])):
        full_ids = tokenized_full["input_ids"][i]
        prompt_ids = tokenized_prompt["input_ids"][i]

        # Find where the actual prompt ends (before padding)
        # Find first pad token in prompt_ids
        prompt_len = max_length
        for j, token_id in enumerate(prompt_ids):
            if token_id == pad_token_id:
                prompt_len = j
                break

        # Create label: -100 for instruction part, actual token ids for summary part
        # Then pad to max_length with -100
        label = [-100] * prompt_len + full_ids[prompt_len:]

        # Ensure label is exactly max_length
        if len(label) > max_length:
            label = label[:max_length]
        elif len(label) < max_length:
            # Pad with -100 to match max_length
            label = label + [-100] * (max_length - len(label))

        labels.append(label)

    tokenized_full["labels"] = labels

    return tokenized_full

# Process dataset
# OPTIONAL: Limit dataset size for faster training (uncomment to use subset)
# For faster training, use a smaller subset:
USE_SUBSET = True  # Set to False to use full dataset
TRAIN_SUBSET_SIZE = 5000   # Reduced to 5k samples for lower memory usage
VAL_SUBSET_SIZE = 500      # Reduced to 500 samples for validation

if USE_SUBSET:
    print(f"Using subset: {TRAIN_SUBSET_SIZE} training samples, {VAL_SUBSET_SIZE} validation samples")
    train_data = dataset['train'].select(range(min(TRAIN_SUBSET_SIZE, len(dataset['train']))))
    val_data = dataset['validation'].select(range(min(VAL_SUBSET_SIZE, len(dataset['validation']))))
else:
    print("Using full dataset")
    train_data = dataset['train']
    val_data = dataset['validation']

print("Tokenizing training set...")
train_dataset = train_data.map(
    tokenize_function,
    batched=True,
    batch_size=100,
    remove_columns=dataset['train'].column_names
)

print("Tokenizing validation set...")
val_dataset = val_data.map(
    tokenize_function,
    batched=True,
    batch_size=100,
    remove_columns=dataset['validation'].column_names
)

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Sample tokenized length: {len(train_dataset[0]['input_ids'])}")


Using subset: 5000 training samples, 500 validation samples
Tokenizing training set...


Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map: 100%|██████████| 5000/5000 [00:04<00:00, 1011.33 examples/s]


Tokenizing validation set...


Map: 100%|██████████| 500/500 [00:00<00:00, 1138.93 examples/s]

Training samples: 5000
Validation samples: 500
Sample tokenized length: 512





## 6. Configure Training Arguments


In [10]:
# Training configuration optimized for low memory (5.67GB GPU)
# Reduced batch size, sequence length, and added gradient checkpointing
training_args = TrainingArguments(
    output_dir="./phi2-xsum-finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=1,  # Reduced to 1 for low memory GPU
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=8,  # Increased to maintain effective batch size = 8
    learning_rate=2e-4,
    warmup_steps=50,
    logging_steps=100,
    eval_steps=500,  # More frequent eval for smaller dataset
    save_steps=500,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True,  # Use mixed precision training
    optim="paged_adamw_8bit",  # Memory-efficient optimizer
    lr_scheduler_type="cosine",
    report_to="none",
    save_total_limit=2,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    gradient_checkpointing=True,  # Enable gradient checkpointing to save memory
    # Optional: Uncomment to limit total training steps (faster training)
    # max_steps=2000,  # Stop after 2000 steps regardless of epochs
)

print("Training arguments configured for faster training:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Gradient accumulation: {training_args.gradient_accumulation_steps}")
print(f"  Effective batch size: {training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps}")
print(f"  Eval/Save steps: {training_args.eval_steps}")


Training arguments configured for faster training:
  Epochs: 1
  Batch size: 1
  Gradient accumulation: 8
  Effective batch size: 8
  Eval/Save steps: 500


## 7. Initialize Trainer


In [11]:
# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # We're doing causal LM, not masked LM
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)

print("Trainer initialized. Ready to start training!")


Trainer initialized. Ready to start training!


## 8. Start Training


In [12]:
# Start fine-tuning
print("Starting training...")
trainer.train()

print("\nTraining completed!")


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Starting training...


Step,Training Loss,Validation Loss
500,2.2102,2.118299



Training completed!


## 9. Save the Fine-tuned Model


In [13]:
# Save the final model
final_model_path = "./phi2-xsum-final"
trainer.save_model(final_model_path)
tokenizer.save_pretrained(final_model_path)

print(f"Model saved to {final_model_path}")


Model saved to ./phi2-xsum-final


## 10. Test Generation with Generation Control


In [14]:
# Load the fine-tuned model for inference
from transformers import pipeline

# Merge LoRA weights back to base model for inference
model = model.merge_and_unload()

# Configure generation parameters for controlled summarization
generation_config = GenerationConfig(
    max_new_tokens=150,  # Limit summary length
    min_new_tokens=20,   # Minimum summary length
    temperature=0.7,     # Control randomness
    top_p=0.9,           # Nucleus sampling
    do_sample=True,
    repetition_penalty=1.2,  # Reduce repetition
    pad_token_id=tokenizer.eos_token_id
)

# Test on a sample from validation set
test_sample = dataset['validation'][0]
test_document = test_sample['document']
ground_truth_summary = test_sample['summary']

# Create prompt (without summary for inference)
prompt = create_instruction_prompt(test_document)

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate summary
print("Generating summary...")
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        generation_config=generation_config
    )

# Decode the generated summary
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
# Extract only the summary part (after "### Summary:\n")
summary = generated_text.split("### Summary:\n")[-1].strip()

print("\n" + "="*80)
print("ORIGINAL ARTICLE:")
print("="*80)
print(test_document[:500] + "...")
print("\n" + "="*80)
print("GROUND TRUTH SUMMARY:")
print("="*80)
print(ground_truth_summary)
print("\n" + "="*80)
print("GENERATED SUMMARY:")
print("="*80)
print(summary)




Generating summary...

ORIGINAL ARTICLE:
The ex-Reading defender denied fraudulent trading charges relating to the Sodje Sports Foundation - a charity to raise money for Nigerian sport.
Mr Sodje, 37, is jointly charged with elder brothers Efe, 44, Bright, 50 and Stephen, 42.
Appearing at the Old Bailey earlier, all four denied the offence.
The charge relates to offences which allegedly took place between 2008 and 2014.
Sam, from Kent, Efe and Bright, of Greater Manchester, and Stephen, from Bexley, are due to stand trial in July.
They ...

GROUND TRUTH SUMMARY:
Former Premier League footballer Sam Sodje has appeared in court alongside three brothers accused of charity fraud.

GENERATED SUMMARY:
A former Reading defender was cleared of fraud accusations related to a charity called the Sodje Sports Foundation that aims to support sports in Nigeria. The men have been accused of committing offenses over several years but they deny any wrongdoing. They will face trials later this year if fo

## 11. Evaluation Function (Optional - ROUGE Scores)


In [1]:
from evaluate import load
from rouge_score import rouge_scorer

# Load ROUGE metric
rouge = load("rouge")

def evaluate_summaries(model, tokenizer, dataset, num_samples=50):
    """
    Evaluate the model on a subset of the validation set using ROUGE scores.
    """
    model.eval()
    predictions = []
    references = []

    # Sample a subset for evaluation
    eval_samples = dataset['validation'].select(range(num_samples))

    print(f"Evaluating on {num_samples} samples...")
    for sample in tqdm(eval_samples):
        # Create prompt
        prompt = create_instruction_prompt(sample['document'])
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(model.device)

        # Generate summary
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                generation_config=generation_config,
                max_new_tokens=150
            )

        # Decode
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        summary = generated_text.split("### Summary:\n")[-1].strip()

        predictions.append(summary)
        references.append(sample['summary'])

    # Calculate ROUGE scores
    results = rouge.compute(predictions=predictions, references=references)

    return results, predictions, references

# Uncomment to run evaluation
# rouge_results, preds, refs = evaluate_summaries(model, tokenizer, dataset, num_samples=50)
# print("\nROUGE Scores:")
# print(rouge_results)


  from .autonotebook import tqdm as notebook_tqdm
