
# Automated Slide Generation from Research Papers
*Final Study Project Notebook*

**Goal:** Automatically generate PowerPoint slides from research papers using the arXiv dataset, local NLP models, and Python.


## 1. Setup and Imports

In [None]:

import os
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import PP_ALIGN
import re

# Pick the compute device: CUDA when torch reports it is available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")


## 2. Data Loading & Exploration

In [None]:

# Adjust path as needed, or set the SLIDEGEN_DATA_PATH environment variable to
# point at the dataset without editing the notebook. The fallback below is the
# original machine-specific location.
DATA_PATH = os.environ.get(
    "SLIDEGEN_DATA_PATH",
    "C:/Users/amira/Downloads/datasets SlidegenAI/arxiv-dataset/arxiv-dataset/train.txt",
)

def load_jsonl_lines(filepath, n=1):
    """Load up to ``n`` JSON records from a text file with one JSON object per line.

    Blank (whitespace-only) lines are skipped and do not count toward ``n``,
    so a leading/trailing empty line or a separator line no longer raises a
    ``json.JSONDecodeError`` or silently reduces the number of records returned.

    Args:
        filepath: Path of the UTF-8 encoded file to read.
        n: Maximum number of records to return. Values <= 0 yield an empty list.

    Returns:
        A list with at most ``n`` decoded JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # Stop as soon as enough records were collected; the file is
            # streamed, so the remainder is never read into memory.
            if len(data) >= n:
                break
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    return data

# Load 1 sample for demo (change n for more)
# The previews below index the 'article' and 'abstract' keys of the first
# record directly, so a record without them raises KeyError.
samples = load_jsonl_lines(DATA_PATH, n=1)
# samples[0] raises IndexError if no record was loaded.
print("Sample keys:", samples[0].keys())
# Only the first 1000 characters of the article are printed to keep the output short.
print("\nArticle preview:\n", samples[0]['article'][:1000])
print("\nAbstract preview:\n", samples[0]['abstract'])


## 3. Article Section Splitting (Heuristic)

In [None]:

# One compiled, case-insensitive pattern for all recognised section headers.
# - The flag is passed once via re.IGNORECASE: repeating "(?i)" inside every
#   alternative puts global flags in the middle of the joined pattern, which
#   is rejected by Python 3.11+.
# - Only the header name is captured; optional suffixes use non-capturing
#   groups so they cannot shift or replace the matched title.
# - The trailing newline is asserted with a lookahead instead of consumed, so
#   it can also serve as the leading newline of an immediately following header.
_SECTION_HEADER_RE = re.compile(
    r'\n\s*('
    r'abstract|introduction|related works?|method(?:s|ology)?|experiments?|'
    r'results|discussions?|conclusions?|references'
    r')[^\S\n]*(?=\n)',
    re.IGNORECASE,
)


def split_sections(text):
    """
    Split article into sections using common section headers.

    A header is one of the typical arXiv section names (Abstract,
    Introduction, Related Work(s), Method(s)/Methodology, Experiment(s),
    Results, Discussion(s), Conclusion(s), References) standing alone on its
    own line, matched case-insensitively. Text before the first header is
    ignored.

    Args:
        text: Full article text.

    Returns:
        A list of (section_title, section_text) tuples in document order.
        Titles are title-cased; sections whose stripped text is 100
        characters or shorter are dropped. Returns an empty list when no
        header is found.
    """
    matches = list(_SECTION_HEADER_RE.finditer(text))
    sections = []
    for idx, match in enumerate(matches):
        # A section's body runs from the end of its header to the start of
        # the next header (or to the end of the text for the last one).
        body_end = matches[idx + 1].start() if idx + 1 < len(matches) else len(text)
        sec_title = re.sub(r'\W+', ' ', match.group(1)).strip().title()
        sec_content = text[match.end():body_end].strip()
        if len(sec_content) > 100:  # Skip empty/very short sections
            sections.append((sec_title, sec_content))
    return sections

# Example: split the sample article
# `sections` holds (title, text) tuples and is reused by the summarization demo further down.
sections = split_sections(samples[0]['article'])
for t, c in sections:
    # Show each detected title followed by the first 200 characters of its text.
    print(f"\n--- {t} ---\n{c[:200]} ...")


## 4. Summarization (Local T5 Model, GPU Accelerated)

In [None]:

# Download the model if not already present (first time only, then offline/cached)
# "t5-small" is used here; a larger checkpoint such as "t5-base" or your own
# fine-tuned model can be swapped in by changing MODEL_NAME.
# The name is defined once so the model and its tokenizer cannot drift apart.
MODEL_NAME = "t5-small"   # change to your fine-tuned model if available

summarizer = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=MODEL_NAME,
    # The pipeline takes a device index: 0 for the first GPU, -1 for CPU.
    device=0 if device == "cuda" else -1,
)

def summarize_section(text, max_length=120, min_length=30):
    """
    Summarize a section using T5.

    Args:
        text: Section text to summarize. ``None``, empty, or whitespace-only
            input returns an empty string without invoking the model.
        max_length: Upper bound on the generated summary length, forwarded
            to the pipeline.
        min_length: Lower bound on the generated summary length, forwarded
            to the pipeline.

    Returns:
        The generated summary string ('summary_text' of the first result),
        or "" when there is nothing to summarize.
    """
    # Nothing to summarize: avoid sending a bare task prefix to the model.
    if not text or not text.strip():
        return ""

    # NOTE(review): some T5 checkpoint configs make the summarization pipeline
    # add this prefix on its own - confirm it is not being applied twice.
    input_text = "summarize: " + text
    result = summarizer(
        input_text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Let the tokenizer cut over-long input down to the model's limit
        # instead of relying on every caller to pre-truncate the text.
        truncation=True,
    )
    return result[0]['summary_text']

# Summarize the first few sections for the demo
# Collects (section_title, summary) tuples, the shape create_presentation() iterates over.
summaries = []
for section_title, section_text in sections[:4]:  # at most the first four sections
    print(f"\nSummarizing section: {section_title}")
    # Only the first 1500 characters of each section are passed to the model.
    summary = summarize_section(section_text[:1500])  # Truncate for demo
    summaries.append((section_title, summary))
    print(summary)


## 5. Slide Generation (PowerPoint)

In [None]:

def _summary_to_bullets(summary):
    """Split a summary on '. ' into non-empty bullet strings ending in punctuation."""
    bullets = []
    for piece in summary.split('. '):
        piece = piece.strip()
        if not piece:
            continue
        # Restore the period consumed by the split, but do not double up
        # after an existing '.', '!' or '?'.
        if not piece.endswith(('.', '!', '?')):
            piece += '.'
        bullets.append(piece)
    return bullets


def create_presentation(summaries, out_path="generated_presentation.pptx"):
    """
    Build a PowerPoint file with a title slide and one content slide per summary.

    Args:
        summaries: Iterable of (section_title, summary) tuples. Each summary
            is split into sentence-like bullets on '. '.
        out_path: Destination path of the .pptx file.

    Side effects:
        Writes the presentation to ``out_path`` and prints the saved path.
    """
    prs = Presentation()
    title_slide_layout = prs.slide_layouts[0]
    content_slide_layout = prs.slide_layouts[1]

    # Title Slide
    slide = prs.slides.add_slide(title_slide_layout)
    slide.shapes.title.text = "Automated Slide Generation"
    slide.placeholders[1].text = "Generated from arXiv Paper Using NLP"

    # Content Slides
    for section_title, summary in summaries:
        slide = prs.slides.add_slide(content_slide_layout)
        slide.shapes.title.text = section_title
        tf = slide.placeholders[1].text_frame
        for idx, bullet in enumerate(_summary_to_bullets(summary)):
            # Fill the paragraph the text frame already has before adding new
            # ones. This avoids deleting it afterwards through the private
            # XML element, which left the body with no paragraph at all
            # whenever a summary produced no bullets.
            paragraph = tf.paragraphs[0] if idx == 0 else tf.add_paragraph()
            paragraph.text = bullet
    prs.save(out_path)
    print(f"\nPresentation saved as: {out_path}")

# Create the slides!
# No out_path is given, so the default "generated_presentation.pptx" is used.
create_presentation(summaries)


## 6. Full Pipeline Function (Reusable)

In [None]:

def process_article(article_text, n_sections=4, max_chars=1500):
    """
    Splits, summarizes, and returns sections for slide generation.

    Args:
        article_text: Full text of the article.
        n_sections: Maximum number of leading sections to summarize
            (``None`` summarizes every detected section).
        max_chars: Number of leading characters of each section passed to
            the summarizer (``None`` disables the truncation). Defaults to
            1500, the value that was previously hard-coded.

    Returns:
        A list of (section_title, summary) tuples in document order.
    """
    sections = split_sections(article_text)
    return [
        (section_title, summarize_section(section_text[:max_chars]))
        for section_title, section_text in sections[:n_sections]
    ]

# Example: Full pipeline for the first article
# Runs splitting and summarization on the sample article again, then saves the
# slides under a different file name than the default one.
demo_summaries = process_article(samples[0]['article'])
create_presentation(demo_summaries, out_path="demo_presentation.pptx")


## 7. Conclusion

In [None]:

# - This notebook demonstrates a full pipeline for splitting research papers into sections, summarizing them, and generating PowerPoint slides.
# - It is ready for further customization, batch processing, and improvements.
# - For best results, use a domain-specific or fine-tuned summarization model and enhance the section-splitting heuristics.
# - You can expand this to process multiple articles, generate fancier slides, or add images from PDF extraction.
# - Everything is designed to run offline after the model download; the GPU is used when CUDA is available, otherwise the CPU.

print("Notebook pipeline complete! Ready for final project submission or demonstration.")
