# 🧑‍🎓 SlideGenAI: Research-Grade Slide Generation with Flan-T5 (arXiv & PubMed)

This notebook trains, validates, and evaluates a Flan-T5 model to generate slide-style summaries from scientific abstracts.  
- **Datasets:** arXiv & PubMed (train/val/test sets)
- **Framework:** PyTorch + HuggingFace
- **Evaluation:** ROUGE

In [1]:
import os
import json
from tqdm import tqdm
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, get_linear_schedule_with_warmup
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
import numpy as np

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Using device:", device)
MODEL_NAME = "google/flan-t5-large"  # or 'google/flan-t5-base'

Using device: cuda


# 1. 📚 Data Loading (arXiv & PubMed)

Both datasets must have `train.txt`, `val.txt`, `test.txt`, one JSON per line.  
We load, filter, format, and cache them for fast repeated experiments.

In [None]:
# ==== Path config: Set to your downloaded dataset folders ====
ARXIV_DIR = r"C:\Users\amira\Downloads\datasets SlidegenAI\arxiv-dataset\arxiv-dataset"
PUBMED_DIR = r"C:\Users\amira\Downloads\datasets SlidegenAI\pubmed-dataset\pubmed-dataset"

def load_jsonl(path):
    with open(path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f]

# New universal filter for both datasets (since they have same structure)
def filter_any(sample):
    # Require non-empty article_text and at least 200 characters
    return "article_text" in sample and len(sample["article_text"]) > 0 and len(" ".join(sample["article_text"])) > 200

def split_into_slides_from_list(article_text, n_slides=4):
    # article_text is a list of sentences/paragraphs
    sentences = article_text
    chunk_size = len(sentences) // n_slides + 1
    slides = [' '.join(sentences[i:i+chunk_size]).strip() for i in range(0, len(sentences), chunk_size)]
    return [s for s in slides if s]

def preprocess(samples, name):
    processed = []
    for sample in samples:
        if filter_any(sample):
            slides = split_into_slides_from_list(sample["article_text"])
            processed.append({
                'title': sample.get('article_id', ''),  # use article_id as a unique "title"
                'slides': slides,
                'abstract': " ".join(sample["article_text"]),  # treat joined text as abstract
            })
    print(f"{name}: {len(processed)} after filtering & formatting")
    return processed


In [3]:
for source, src_dir in [("arxiv", ARXIV_DIR), ("pubmed", PUBMED_DIR)]:
    for split in ["train", "val", "test"]:
        src_path = os.path.join(src_dir, f"{split}.txt")
        print(f"{src_path} exists:", os.path.exists(src_path))
        if os.path.exists(src_path):
            with open(src_path, "r", encoding="utf-8") as f:
                lines = f.readlines()
                print(f"  {len(lines)} lines; first line: {lines[0][:200] if lines else 'EMPTY'}")


C:\Users\amira\Downloads\datasets SlidegenAI\arxiv-dataset\arxiv-dataset\train.txt exists: True
  203037 lines; first line: {"article_id": "1405.3379", "article_text": ["additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive mo
C:\Users\amira\Downloads\datasets SlidegenAI\arxiv-dataset\arxiv-dataset\val.txt exists: True
  6436 lines; first line: {"article_id": "0708.1996", "article_text": ["the interest in anchoring phenomena and phenomena in confined nematic liquid crystals has largely been driven by their potential use in liquid crystal dis
C:\Users\amira\Downloads\datasets SlidegenAI\arxiv-dataset\arxiv-dataset\test.txt exists: True
  6440 lines; first line: {"article_id": "1009.3123", "article_text": ["for about 20 years the problem of properties of short - term changes of solar activity has been considered extensively .", "many investigators studied the
C:\Users\amira\Downloads\dataset

## 🗃️ Load, Filter, and Format All Splits (arXiv & PubMed)
_Cached files will be used if present for fast reruns._

In [None]:
# Where to save filtered data
CACHE_DIR = "./slidegen_cache"
os.makedirs(CACHE_DIR, exist_ok=True)

def cached_load_or_preprocess(src_path, cache_path, name):
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            data = [json.loads(line) for line in f]
        print(f"{name}: Loaded {len(data)} from cache.")
        return data
    raw = load_jsonl(src_path)
    data = preprocess(raw, name)
    with open(cache_path, "w", encoding="utf-8") as f:
        for item in data:
            f.write(json.dumps(item) + "\n")
    return data

datasets = {}
for source, src_dir in [("arxiv", ARXIV_DIR), ("pubmed", PUBMED_DIR)]:
    for split in ["train", "val", "test"]:
        src_path = os.path.join(src_dir, f"{split}.txt")
        cache_path = os.path.join(CACHE_DIR, f"{source}_{split}_filtered.jsonl")
        datasets[f"{source}_{split}"] = cached_load_or_preprocess(
            src_path, cache_path, f"{source} {split}"
        )
# Check the number of samples in each dataset
print("Dataset sizes:")
print("arXiv train/val/test:", [len(datasets[f"arxiv_{s}"]) for s in ["train", "val", "test"]])
print("PubMed train/val/test:", [len(datasets[f"pubmed_{s}"]) for s in ["train", "val", "test"]])


arxiv train: Loaded 0 from cache.
arxiv val: Loaded 0 from cache.
arxiv test: Loaded 0 from cache.
pubmed train: Loaded 0 from cache.
pubmed val: Loaded 0 from cache.
pubmed test: Loaded 0 from cache.
Dataset sizes:
arXiv train/val/test: [0, 0, 0]
PubMed train/val/test: [0, 0, 0]


# 2. 🔄 Dataset Selection & Mixing

Combine datasets for larger training, or use arXiv/PubMed only.  
Edit below as needed.

In [None]:
# Choose your training/validation/test sets here:
train_data = datasets["arxiv_train"] + datasets["pubmed_train"]
val_data   = datasets["arxiv_val"] + datasets["pubmed_val"]
test_data  = datasets["arxiv_test"] + datasets["pubmed_test"]

# To use only arXiv or only PubMed, set e.g.:
# train_data = datasets["arxiv_train"]
# val_data = datasets["arxiv_val"]
# test_data = datasets["arxiv_test"]

print(f"Train: {len(train_data)}, Val: {len(val_data)}, Test: {len(test_data)}")

# 3. ✨ Tokenization & Dataset Preparation

We use the Flan-T5 tokenizer and wrap everything in a PyTorch Dataset.

In [None]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
max_input_length = 512
max_target_length = 128

class SlidesDataset(Dataset):
    def __init__(self, data, tokenizer, max_input=512, max_target=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_input = max_input
        self.max_target = max_target
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        sample = self.data[idx]
        source = sample["abstract"]
        target = ' ; '.join(sample["slides"])  # use ';' as slide separator
        
        model_inputs = self.tokenizer(
            source, max_length=self.max_input, truncation=True, padding="max_length", return_tensors="pt"
        )
        labels = self.tokenizer(
            target, max_length=self.max_target, truncation=True, padding="max_length", return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in model_inputs.items()}
        item['labels'] = labels['input_ids'].squeeze(0)
        return item

## 🔥 DataLoader Setup
Efficient batched loading for training and validation.

In [None]:
BATCH_SIZE = 4

train_dataset = SlidesDataset(train_data, tokenizer, max_input_length, max_target_length)
val_dataset   = SlidesDataset(val_data, tokenizer, max_input_length, max_target_length)
test_dataset  = SlidesDataset(test_data, tokenizer, max_input_length, max_target_length)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader   = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)
test_loader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

# 4. 🏋️‍♂️ Model, Training & Validation Loops

Train Flan-T5 with full GPU support and validation using ROUGE.

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)

# Optimizer, scheduler
EPOCHS = 3  # Increase for full training
LEARNING_RATE = 3e-5
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

total_steps = len(train_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=200, num_training_steps=total_steps)

# --- Helper for ROUGE ---
def compute_rouge_scores(preds, targets):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    all_scores = {"rouge1": [], "rouge2": [], "rougeL": []}
    for pred, tgt in zip(preds, targets):
        scores = scorer.score(tgt, pred)
        for key in all_scores:
            all_scores[key].append(scores[key].fmeasure)
    return {k: float(np.mean(v)) if v else 0.0 for k, v in all_scores.items()}

## 🚀 Training & Validation Loop
Prints loss and ROUGE for every epoch.  
Model is checkpointed on best validation ROUGE-L.

In [None]:
from copy import deepcopy

def generate_slides(batch_abstracts, model, tokenizer, max_len=128):
    inputs = tokenizer(batch_abstracts, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_length=max_len,
            num_beams=4,
            early_stopping=True,
        )
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)

def validate(model, val_loader, tokenizer):
    model.eval()
    all_preds, all_targets = [], []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation", leave=False):
            abstracts = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
            batch_preds = generate_slides(abstracts, model, tokenizer)
            all_preds.extend(batch_preds)
            # Targets: decode labels, strip padding and join if needed
            for labels in batch["labels"]:
                label_text = tokenizer.decode(labels, skip_special_tokens=True)
                all_targets.append(label_text)
    # ROUGE
    rouge_scores = compute_rouge_scores(all_preds, all_targets)
    return rouge_scores

best_val_rougeL = 0.0
model_ckpt_path = "./flan_t5_slidegen_best.pth"

for epoch in range(EPOCHS):
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1} Training"):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        train_loss += loss.item()
    train_loss /= len(train_loader)
    print(f"\nEpoch {epoch+1}: Train loss: {train_loss:.4f}")

    # --- Validation ---
    rouge_scores = validate(model, val_loader, tokenizer)
    print(f"Validation ROUGE: {rouge_scores}")
    if rouge_scores["rougeL"] > best_val_rougeL:
        print("New best model! Saving checkpoint.")
        best_val_rougeL = rouge_scores["rougeL"]
        torch.save(model.state_dict(), model_ckpt_path)

# 5. 🧪 Final Evaluation on Test Set

Restore the best model and run on test set for final ROUGE reporting.

In [None]:
# Restore best checkpoint
model.load_state_dict(torch.load(model_ckpt_path))
model.eval()

test_rouge = validate(model, test_loader, tokenizer)
print("Final Test ROUGE:", test_rouge)

# 🎉 Results, Download, and Next Steps

- Download your best model checkpoint: `flan_t5_slidegen_best.pth`
- Results: Final ROUGE scores on test set (see above)
- You can now deploy or export model for inference!