In [5]:
import re
from transformers import pipeline

In [6]:
# Load key topic segments from file.
# Each data line is expected to look like "[<start>s - <end>s] <text>",
# where <start> and <end> are numbers of seconds.
input_path = "../data/summaries/key_topics.txt"

with open(input_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Parse into list of dicts: start, end, text.
# The fractional part of each timestamp is optional, so whole-second stamps
# such as "[12s - 20s] ..." are parsed instead of being silently dropped.
segments = []
skipped_line_numbers = []
pattern = r"\[(\d+(?:\.\d+)?)s - (\d+(?:\.\d+)?)s\] (.+)"

for line_no, line in enumerate(lines, start=1):
    stripped = line.strip()
    if not stripped:
        # Blank lines carry no data; nothing to report.
        continue

    match = re.match(pattern, stripped)
    if match is None:
        # Remember the line so malformed input is visible rather than lost.
        skipped_line_numbers.append(line_no)
        continue

    start, end, text = match.groups()
    segments.append({
        "start": float(start),
        "end": float(end),
        "text": text.strip()
    })

print(f"Loaded {len(segments)} segments")
if skipped_line_numbers:
    # Only printed when something was actually dropped.
    print(
        f"Skipped {len(skipped_line_numbers)} unparseable line(s); "
        f"first line numbers: {skipped_line_numbers[:10]}"
    )
segments[:2]  # preview


Loaded 20 segments


[{'start': 1861.53,
  'end': 1872.69,
  'text': 'wanted, it wrote some code and that was it. Now you can have this back and forth dialogue where you can say, no, no, I meant this, or no, no, fix this bug or no, no, do this. And then of course the next version is the system can debug'},
 {'start': 5817.89,
  'end': 5828.01,
  'text': 'in the world? I think the world is going to find out that if you can have 10 times as much code at the same price, you can just use even more to write even more code. The world just needs way more code. It is true that a lot'}]

In [7]:
# Load summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

# Coarse word-level cap applied before tokenization (same 512-word cut as
# before). Whitespace-separated words are not model tokens, so this cap alone
# does not guarantee the input fits the model.
MAX_INPUT_WORDS = 512

# Add summary for each segment (stored in place under the "summary" key)
for seg in segments:
    original_text = seg["text"]

    words = original_text.split()
    if len(words) > MAX_INPUT_WORDS:
        original_text = " ".join(words[:MAX_INPUT_WORDS])

    # truncation=True makes the tokenizer clip the input to the model's
    # maximum length, so an over-long input is shortened instead of failing.
    # max_length/min_length bound the generated summary; with max_length=30
    # a summary can stop mid-sentence.
    summary = summarizer(
        original_text,
        max_length=30,
        min_length=5,
        do_sample=False,
        truncation=True,
    )[0]['summary_text']
    seg["summary"] = summary.strip()

# Preview
for seg in segments[:3]:
    print(f"[{seg['start']}s - {seg['end']}s]")
    print("Original:", seg["text"])
    print("Summary :", seg["summary"])
    print("-" * 60)

Device set to use cpu


[1861.53s - 1872.69s]
Original: wanted, it wrote some code and that was it. Now you can have this back and forth dialogue where you can say, no, no, I meant this, or no, no, fix this bug or no, no, do this. And then of course the next version is the system can debug
Summary : The system can now be programmed to fix bugs. It can also be used to debug bugs. "It wrote some code and that was
------------------------------------------------------------
[5817.89s - 5828.01s]
Original: in the world? I think the world is going to find out that if you can have 10 times as much code at the same price, you can just use even more to write even more code. The world just needs way more code. It is true that a lot
Summary : "The world just needs way more code," he says. "If you can have 10 times as much code at the same price,
------------------------------------------------------------
[4744.14s - 4754.19s]
Original: if created has a lot of power. How do you think we're doing? Like, honest. How do y

In [9]:
# Helper: render a number of seconds as a zero-padded HH:MM:SS string
def format_timestamp(seconds):
    """Return `seconds` formatted as HH:MM:SS.

    The value is split into hours, minutes and seconds; any fractional
    second is dropped by the integer conversion. Each field is zero-padded
    to at least two digits.
    """
    whole_hours, remainder = divmod(seconds, 3600)
    whole_minutes, leftover = divmod(remainder, 60)
    fields = (int(whole_hours), int(whole_minutes), int(leftover))
    return ":".join(f"{field:02}" for field in fields)

# Destination of the human-readable summary file
output_path = "../data/summaries/key_topics_summarized.txt"

# Chronological order: ascending by segment start time
segments_sorted = list(segments)
segments_sorted.sort(key=lambda item: item["start"])

# Write one "[HH:MM:SS - HH:MM:SS] summary" line per segment
with open(output_path, "w", encoding="utf-8") as out_file:
    out_file.writelines(
        f"[{format_timestamp(entry['start'])} - {format_timestamp(entry['end'])}] {entry['summary']}\n"
        for entry in segments_sorted
    )

print(f"Saved readable timestamp summaries to: {output_path}")

Saved readable timestamp summaries to: ../data/summaries/key_topics_summarized.txt
