
# Automated Slide Generation from Research Papers
*Final Study Project Notebook*

**Goal:** Automatically generate PowerPoint slides from research papers using the arXiv dataset, local NLP models, and Python.


## 1. Setup and Imports

In [1]:

import os
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN
import re

# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


Using device: cuda


## 2. Data Loading & Exploration

In [11]:
import json

def load_jsonl_lines(filepath, n=None, source_name=None):
    """Load up to n lines from a .txt file (JSON objects per line). If n=None, load all."""
    data = []
    with open(filepath, "r", encoding="utf-8") as f:
        for i, line in enumerate(f):
            if n is not None and i >= n:
                break
            item = json.loads(line.strip())
            if source_name:
                item['source'] = source_name
            data.append(item)
    return data

# Paths for both datasets
ARXIV_PATH = r"C:/Users/amira/Downloads/datasets SlidegenAI/arxiv-dataset/arxiv-dataset/train.txt"
PUBMED_PATH = r"C:/Users/amira/Downloads/datasets SlidegenAI/pubmed-dataset/pubmed-dataset/train.txt"

# Load data (set n=None to load all, or choose a fixed n for each)
arxiv_samples = load_jsonl_lines(ARXIV_PATH, n=1000, source_name="arxiv")
pubmed_samples = load_jsonl_lines(PUBMED_PATH, n=1000, source_name="pubmed")

# Combine into one dataset
combined_samples = arxiv_samples + pubmed_samples

print(f"Loaded {len(arxiv_samples)} arXiv articles, {len(pubmed_samples)} PubMed articles.")
print(f"Combined dataset size: {len(combined_samples)}")

# Preview samples as text
def preview(sample, name):
    print(f"{name} sample keys: {sample.keys()}")
    print(f"{name} sample (article_text):", " ".join(sample['article_text'])[:500])
    print(f"{name} sample (abstract_text):", " ".join(sample['abstract_text'])[:500])

preview(arxiv_samples[0], "arXiv")
print()
preview(pubmed_samples[0], "PubMed")


Loaded 1000 arXiv articles, 1000 PubMed articles.
Combined dataset size: 2000
arXiv sample keys: dict_keys(['article_id', 'article_text', 'abstract_text', 'labels', 'section_names', 'sections', 'source'])
arXiv sample (article_text): additive models @xcite provide an important family of models for semiparametric regression or classification . some reasons for the success of additive models are their increased flexibility when compared to linear or generalized linear models and their increased interpretability when compared to fully nonparametric models . it is well - known that good estimators in additive models are in general less prone to the curse of high dimensionality than good estimators in fully nonparametric models .
arXiv sample (abstract_text): <S> additive models play an important role in semiparametric statistics . </S> <S> this paper gives learning rates for regularized kernel based methods for additive models . </S> <S> these learning rates compare favourably in particula

## 3. Article Section Splitting (Heuristic)

This cell will process all 2000 articles (or however many you have in combined_samples).

The tqdm library gives you a progress bar in the notebook—if you don’t have it, run pip install tqdm.

The result, all_sections, is a list of dictionaries—each with article metadata, abstract, and a list of (section_title, section_text) tuples.

Handles any errors, skipping problematic articles but keeping their abstracts.



In [16]:
import re
from tqdm import tqdm  # for a progress bar

def split_sections(text):
    """
    Split article into sections using common section headers.
    Returns a list of (section_title, section_text) tuples.
    """
    headers = [
        r'\n\s*abstract\s*\n',
        r'\n\s*introduction\s*\n',
        r'\n\s*related work[s]?\s*\n',
        r'\n\s*method[s]?(ology)?\s*\n',
        r'\n\s*experiment[s]?\s*\n',
        r'\n\s*results\s*\n',
        r'\n\s*discussion[s]?\s*\n',
        r'\n\s*conclusion[s]?\s*\n',
        r'\n\s*references\s*\n'
    ]
    pattern = "|".join(headers)
    splits = re.split(pattern, text, flags=re.IGNORECASE)
    headers_found = re.findall(pattern, text, flags=re.IGNORECASE)
    sections = []
    for idx, content in enumerate(splits[1:]):  # First split is before first header
        sec_title = re.sub(r'\W+', ' ', headers_found[idx]).strip().title()
        sec_content = content.strip()
        if len(sec_content) > 100:  # Skip empty/very short sections
            sections.append((sec_title, sec_content))
    return sections

all_sections = []
print(f"Processing {len(combined_samples)} articles...")

for sample in tqdm(combined_samples):
    full_article_text = " ".join(sample['article_text'])
    try:
        sections = split_sections(full_article_text)
        all_sections.append({
            "article_id": sample.get('article_id'),
            "source": sample.get('source'),
            "sections": sections,
            "abstract": " ".join(sample['abstract_text'])
        })
    except Exception as e:
        print(f"Failed to process article_id {sample.get('article_id')}: {e}")
        all_sections.append({
            "article_id": sample.get('article_id'),
            "source": sample.get('source'),
            "sections": [],
            "abstract": " ".join(sample['abstract_text'])
        })

print(f"Processed {len(all_sections)} articles.")
print("Example (first article):")
for sec_title, sec_text in all_sections[0]['sections']:
    print(f"\n--- {sec_title} ---\n{sec_text[:200]} ...")


Processing 2000 articles...


100%|██████████| 2000/2000 [00:00<00:00, 33540.08it/s]

Failed to process article_id PMC4227732: list index out of range
Processed 2000 articles.
Example (first article):





## 4. Summarization (Local T5 Model, GPU Accelerated)

In [None]:
# Load the summarization model

summarizer = pipeline(
    "summarization",
    model="t5-small",   # change to your fine-tuned model if available
    tokenizer="t5-small",
    device=0 if device=="cuda" else -1
)

def summarize_section(text, max_length=120, min_length=30):
    """
    Summarize a section using T5.
    """
    input_text = "summarize: " + text
    summary = summarizer(
        input_text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False
    )[0]['summary_text']
    return summary

# Summarize the first few sections for the demo
summaries = []
for section_title, section_text in sections[:4]:
    print(f"\nSummarizing section: {section_title}")
    summary = summarize_section(section_text[:1500])  # Truncate for demo
    summaries.append((section_title, summary))
    print(summary)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Device set to use cuda:0


## 5. Slide Generation (PowerPoint)

In [18]:

def create_presentation(summaries, out_path="generated_presentation.pptx"):
    prs = Presentation()
    title_slide_layout = prs.slide_layouts[0]
    content_slide_layout = prs.slide_layouts[1]
    
    # Title Slide
    slide = prs.slides.add_slide(title_slide_layout)
    slide.shapes.title.text = "Automated Slide Generation"
    slide.placeholders[1].text = "Generated from arXiv Paper Using NLP"
    
    # Content Slides
    for section_title, summary in summaries:
        slide = prs.slides.add_slide(content_slide_layout)
        slide.shapes.title.text = section_title
        tf = slide.placeholders[1].text_frame
        # Split summary into bullet points if possible
        bullets = summary.split('. ')
        for bullet in bullets:
            if bullet.strip():
                tf.add_paragraph().text = bullet.strip() + ('.' if not bullet.strip().endswith('.') else '')
        # Remove first empty paragraph
        if tf.paragraphs and not tf.paragraphs[0].text.strip():
            tf._element.remove(tf.paragraphs[0]._element)
    prs.save(out_path)
    print(f"\nPresentation saved as: {out_path}")

# Create the slides!
create_presentation(summaries)



Presentation saved as: generated_presentation.pptx


## 6. Full Pipeline Function (Reusable)

In [26]:
def process_article(article_text, n_sections=4):
    """
    Splits, summarizes, and returns sections for slide generation.
    """
    sections = split_sections(article_text)
    summaries = []
    for section_title, section_text in sections[:n_sections]:
        summary = summarize_section(section_text[:1500])
        summaries.append((section_title, summary))
    return summaries

# Example: Full pipeline for the first article
article_text = " ".join(combined_samples[0]['article_text'])
demo_summaries = process_article(article_text)
create_presentation(demo_summaries, out_path="demo_presentation.pptx")




Presentation saved as: demo_presentation.pptx


## 7. Conclusion

In [27]:

# - This notebook demonstrates a full offline pipeline for extracting, summarizing, and generating PowerPoint slides from research papers.
# - It is ready for further customization, batch processing, and improvements.
# - For best results, use a domain-specific or fine-tuned summarization model and enhance section splitting heuristics.
# - You can easily expand this to process multiple articles, generate fancier slides, or add images from PDF extraction.
# - Everything is designed to run offline after model download, and GPU acceleration is enabled.

print("Notebook pipeline complete! Ready for final project submission or demonstration.")


Notebook pipeline complete! Ready for final project submission or demonstration.
