In [18]:
import re
from keybert import KeyBERT
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [2]:
# Load raw transcription from file
with open("../data/transcriptions/transcript.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# Parse into list of dictionaries
utterances = []
pattern = r"\[(\d+\.\d+)s - (\d+\.\d+)s\] (.+)"

for line in lines:
    match = re.match(pattern, line.strip())
    if match:
        start, end, text = match.groups()
        utterances.append({
            "start": float(start),
            "end": float(end),
            "text": text.strip()
        })

print(f"Loaded {len(utterances)} utterances")
utterances[:3]  # Preview

Loaded 819 utterances


[{'start': 0.16,
  'end': 10.92,
  'text': 'We have been a misunderstood and badly mocked org for a long time. Like when we started, we like announced the org at the end of 2015 and'},
 {'start': 10.92,
  'end': 21.04,
  'text': 'said we were going to work on AGI. Like people thought we were batshit insane. Yeah, you know, like I, I remember at the time a eminent AI scientist at a'},
 {'start': 22.32,
  'end': 32.4,
  'text': "large industrial AI lab was like dming individual reporters being like, you know, these people aren't very good and it's ridiculous to talk about AGI and I can't believe you're giving"}]

In [27]:
# Join all utterance text into a single document
full_text = " ".join([utt["text"] for utt in utterances])

# Initialize KeyBERT model (uses all-MiniLM-L6-v2 by default)
kw_model = KeyBERT()

# Define additional stopwords to exclude
scikit_stopwords = list(ENGLISH_STOP_WORDS)

context_stopwords = [
    "misunderstood", "badly", "mocked", "org", "long",
    "announced", "agi", "thought", "insane", "remember",
    "eminent", "scientist", "lab", "dming", "reporters",
    "ridiculous", "talk", "believe", "giving", "pettiness",
    "rancor", "field", "group", "brave", "face",
    "mockery", "ceo", "company", "behind", "podcast"
]

stopwords = list(scikit_stopwords + context_stopwords)

# Extract top 20 keyphrases (1 word), excluding stopwords
keywords = kw_model.extract_keywords(
    full_text,
    keyphrase_ngram_range=(1, 1),
    stop_words=stopwords,
    top_n=20
)

# Store just the keyword strings
global_keywords = set([kw[0].lower() for kw in keywords])

print("Top extracted keywords from transcript:")
for kw in global_keywords:
    print("-", kw)

Top extracted keywords from transcript:
- organization
- researchers
- developments
- research
- technological
- innovative
- organizations
- ais
- insights
- minds
- breakthroughs
- elite
- mismanagement
- industrial
- deepmind
- microsoft
- intellectual
- agis
- ai
- openai


In [28]:
def score_utterance(text, keyword_set):
    text_lower = text.lower()
    score = 0
    for keyword in keyword_set:
        if keyword in text_lower:
            score += 1  # You can weight this higher if needed
    return score + len(text.split()) * 0.1  # small bonus for length

# Score each utterance
for utt in utterances:
    utt["score"] = score_utterance(utt["text"], global_keywords)

# Sort by score descending
utterances_sorted = sorted(utterances, key=lambda x: x["score"], reverse=True)

# Preview top 5 important utterances
top_n = 20
important_segments = utterances_sorted[:top_n]

print("\nTop Important Segments:")
for seg in important_segments:
    print(f"[{seg['start']}s - {seg['end']}s] {seg['text']} (score: {seg['score']:.2f})")



Top Important Segments:
[32.4s - 42.44s] them time of day. And it's like that was the level of like pettiness and rancor in the field at a new group of people saying we're going to try to build AGI. So OpenAI and DeepMind was a (score: 6.90)
[2228.96s - 2239.12s] much if that's applied to an AI system. You know, we've talked about putting out the base model as, at least for researchers or something, but it's not very easy to use everyone's like, give me the base model. (score: 6.90)
[7040.55s - 7051.79s] tweeted something that he also was really kind to send me to communicate with me, send me a long email describing the history of OpenAI, all the different developments. He (score: 6.00)
[2608.23s - 2618.35s] thing to get from 3 to 3.5 to 4. It's like hundreds of complicated things. So tiny little thing with the training, with the like everything with the data organization, how we like collect the data, how (score: 5.70)
[4389.54s - 4400.5s] saying we're going to try to build AGI. So O

In [None]:
# Create output path
output_path = "../data/summaries/key_topics.txt"

# Save top 20 segments to file
with open(output_path, "w", encoding="utf-8") as f:
    for seg in important_segments[:20]:
        f.write(f"[{seg['start']}s - {seg['end']}s] {seg['text']}\n")

print(f"Saved top 20 important segments to: {output_path}")

In [5]:
# Define the context window (±2 minutes)
context_window = 120  

# Store final contextual segments
contextual_paragraphs = []

for seg in important_segments:
    start_context = seg["start"] - context_window
    end_context = seg["end"] + context_window

    # Get all utterances within this window
    context_utts = [
        utt["text"]
        for utt in utterances
        if utt["start"] >= start_context and utt["end"] <= end_context
    ]

    # Join into a single paragraph
    paragraph = " ".join(context_utts)

    # Store with timestamp info
    contextual_paragraphs.append({
        "start": start_context,
        "end": end_context,
        "paragraph": paragraph
    })

# Print all contextual paragraphs
for idx, para in enumerate(contextual_paragraphs, 1):
    start_min = int(para["start"] // 60)
    start_sec = int(para["start"] % 60)
    end_min = int(para["end"] // 60)
    end_sec = int(para["end"] % 60)
    print(f"\nSegment {idx} [{start_min:02}:{start_sec:02} - {end_min:02}:{end_sec:02}]:\n{para['paragraph']}")



Segment 1 [-2:32 - 02:42]:
We have been a misunderstood and badly mocked org for a long time. Like when we started, we like announced the org at the end of 2015 and said we were going to work on AGI. Like people thought we were batshit insane. Yeah, you know, like I, I remember at the time a eminent AI scientist at a large industrial AI lab was like dming individual reporters being like, you know, these people aren't very good and it's ridiculous to talk about AGI and I can't believe you're giving them time of day. And it's like that was the level of like pettiness and rancor in the field at a new group of people saying we're going to try to build AGI. So OpenAI and DeepMind was a small collection of folks who are brave enough to talk about AGI in the face of mockery. We don't get mocked as much now and don't get mocked as much now. The following is a conversation with Sam Altman, CEO of OpenAI, the company behind GPT4, ChatGPT, Dall? E, Codex, and many other AI technologies which, bo

In [9]:
# Define output path
output_path = "../data/summaries/contextual_key_topics.txt"

# Save contextual paragraphs in [start - end] format in seconds
with open(output_path, "w", encoding="utf-8") as f:
    for para in contextual_paragraphs:
        start_sec = round(para["start"], 2)
        end_sec = round(para["end"], 2)
        f.write(f"[{start_sec}s - {end_sec}s] {para['paragraph']}\n")

print(f"Saved contextual segments in seconds format to: {output_path}")


Saved contextual segments in seconds format to: ../data/summaries/contextual_key_topics.txt
