In [None]:
from transformers import pipeline
import re

# Build the summarization pipeline once; later cells call `summarizer` directly.
# The checkpoint used is BART (facebook/bart-large-cnn).
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

print("Summarizer is ready.")

Device set to use cpu


Summarizer is ready.


In [None]:
input_path = "../data/summaries/contextual_key_topics.txt"

# A usable line has the form: [123.45s - 678.90s] actual text
# Groups: (1) start seconds, (2) end seconds, (3) the paragraph text.
segment_pattern = re.compile(r"\[(\d+\.?\d*)s\s*-\s*(\d+\.?\d*)s\]\s*(.*)")

# One dict per parsed line, with keys "start", "end" (floats) and "paragraph" (str).
contextual_paragraphs = []

with open(input_path, "r", encoding="utf-8") as f:
    for raw_line in f:
        stripped = raw_line.strip()

        # Blank lines are silently ignored.
        if stripped == "":
            continue

        parsed = segment_pattern.match(stripped)
        if parsed is None:
            # Non-blank lines that lack the timestamp prefix are reported and dropped.
            print(f"Skipping line: {stripped}")
            continue

        start_text, end_text, body = parsed.groups()
        segment = {
            "start": float(start_text),
            "end": float(end_text),
            "paragraph": body.strip(),
        }
        contextual_paragraphs.append(segment)

print(f"Loaded {len(contextual_paragraphs)} contextual segments.")


Loaded 20 contextual segments.


In [None]:
# Tunable limits for the summarization pass.
MAX_INPUT_WORDS = 500      # word-level cap applied before the text reaches the model
SUMMARY_MAX_LENGTH = 60    # passed to the pipeline as max_length
SUMMARY_MIN_LENGTH = 15    # passed to the pipeline as min_length

# One dict per input segment, with keys "start", "end" and "summary".
summaries = []

for para in contextual_paragraphs:
    text = para["paragraph"]

    # Split once; only rebuild the text when the word cap is actually exceeded,
    # so shorter paragraphs are passed through with their original whitespace.
    words = text.split()
    if len(words) > MAX_INPUT_WORDS:
        text = " ".join(words[:MAX_INPUT_WORDS])

    try:
        # The word cap is only a proxy for the model's token limit: token-dense
        # text can still be too long after it. truncation=True asks the pipeline
        # to cut the tokenized input to the model's maximum length instead of
        # failing on such segments.
        result = summarizer(
            text,
            max_length=SUMMARY_MAX_LENGTH,
            min_length=SUMMARY_MIN_LENGTH,
            do_sample=False,
            truncation=True,
        )
        summary_text = result[0]["summary_text"]
    except Exception as e:
        # Best effort: keep one output entry per segment and report the failure,
        # rather than aborting the whole run on a single bad segment.
        summary_text = "[Error during summarization]"
        print(f"Error summarizing segment starting at {para['start']}s: {e}")

    summaries.append({
        "start": para["start"],
        "end": para["end"],
        "summary": summary_text
    })

print(f"Generated {len(summaries)} summaries.")


In [None]:
output_path = "../data/summaries/contextual_key_summaries.txt"

# Each entry is written as "[<start>s - <end>s] <summary>" followed by a blank line.
# A generator is used so entries are formatted and written one at a time.
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(
        f"[{item['start']}s - {item['end']}s] {item['summary']}\n\n"
        for item in summaries
    )

print(f"Saved {len(summaries)} summaries to: {output_path}")