In [1]:
import os
import nltk
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer

In [None]:
# Path to the transcript file from previous step
transcript_path = "../data/transcriptions/transcript.txt"

# Load transcript lines
with open(transcript_path, "r", encoding="utf-8") as f:
    lines = f.readlines()

# Separate timestamps and text
timestamps = []
texts = []

for line in lines:
    if "]" in line:
        try:
            time_part, text = line.strip().split("] ")
            timestamps.append(time_part[1:])  # Remove leading '['
            texts.append(text)
        except ValueError:
            continue  # skip any malformed lines

# Confirm data was loaded
print(f"Loaded {len(texts)} transcript segments.")

In [None]:
# Load pre-trained sentence embedding model
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Fast, good balance of speed/quality

# Generate embeddings for all transcript segments
embeddings = embedder.encode(texts, show_progress_bar=True)

# Convert to NumPy array
embeddings = np.array(embeddings)

print(f"Generated embeddings for {len(embeddings)} segments.")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/26 [00:00<?, ?it/s]

✅ Generated embeddings for 819 segments.


In [None]:
# Choose number of clusters (topics) — you can tune this
num_clusters = 8

# Apply KMeans clustering
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
labels = kmeans.fit_predict(embeddings)

print(f"Clustered transcript into {num_clusters} topics.")

✅ Clustered transcript into 8 topics.


In [5]:
# Combine cluster labels, timestamps, and texts into a DataFrame
df = pd.DataFrame({
    "cluster": labels,
    "timestamp": timestamps,
    "text": texts
})

# Sort by cluster and then by timestamp (optional for better readability)
df_grouped = df.sort_values(by=["cluster", "timestamp"])

# Show a few sample entries from each cluster
for cluster_id in range(num_clusters):
    print(f"\n🧠 Topic Cluster {cluster_id}")
    sample = df_grouped[df_grouped["cluster"] == cluster_id].head(3)
    for _, row in sample.iterrows():
        print(f"  [{row['timestamp']}] {row['text']}")


🧠 Topic Cluster 0
  [0.16s - 10.92s] We have been a misunderstood and badly mocked org for a long time. Like when we started, we like announced the org at the end of 2015 and
  [1112.21s - 1122.77s] Second, we are building in public and we are putting out technology because we think it is important for the world to get access to this early to shape the way it's going to be developed,
  [1153.83s - 1164.26s] the technology and shape it with us and provide feedback we believe is really important. The trade off of that is the trade off of building in public, which is we put out things that are going to be deeply

🧠 Topic Cluster 1
  [1080.61s - 1090.81s] in a sequence of prompts how to understand that it failed to do so previously and where it succeeded. And all of those like multi, like parallel reasonings
  [1090.81s - 1101.01s] that it's doing, it just seems like it's struggling. So two separate things going on here. Number one, some of the things that seem like they should be obvious

In [None]:
# Save the clustered data to a CSV for use in 04_summarization.ipynb
output_path = "../data/transcriptions/clustered_segments.csv"
df_grouped.to_csv(output_path, index=False, encoding="utf-8")

print(f"Clustered segments saved to: {output_path}")

✅ Clustered segments saved to: ../data/transcriptions/clustered_segments.csv
