In [1]:
import os
import pandas as pd
from transformers import pipeline

In [2]:
# Read the clustered transcript segments produced by the clustering step
input_path = "../data/transcriptions/clustered_segments.csv"
df = pd.read_csv(input_path)

# Show the first rows as a quick sanity check of the loaded columns
print("Loaded clustered segments:", df.head(), sep="\n")


Loaded clustered segments:
   cluster            timestamp  \
0        0       0.16s - 10.92s   
1        0  1112.21s - 1122.77s   
2        0  1153.83s - 1164.26s   
3        0    125.81s - 136.06s   
4        0  1344.76s - 1355.68s   

                                                text  
0  We have been a misunderstood and badly mocked ...  
1  Second, we are building in public and we are p...  
2  the technology and shape it with us and provid...  
3  humans to create, to flourish, to escape the w...  
4  and I also like, I get why this is such an imp...  


In [3]:
# Build the Hugging Face summarization pipeline backed by the
# facebook/bart-large-cnn checkpoint (fetched on first use).
summarizer = pipeline(
    task="summarization",
    model="facebook/bart-large-cnn",
)

print("Summarization model loaded successfully.")


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


Summarization model loaded successfully.


In [4]:
# Group by cluster, concatenate the text within each group and summarize it.
#
# Segments are first ordered by their numeric start time. The CSV stores
# timestamps as strings such as "125.81s - 136.06s", and string ordering puts
# "1112.21s" before "125.81s", which would feed the model a transcript whose
# segments are out of chronological order.
segments = df
if "timestamp" in segments.columns:
    start_seconds = pd.to_numeric(
        segments["timestamp"].astype(str).str.extract(r"^\s*(\d+(?:\.\d+)?)", expand=False),
        errors="coerce",
    )
    # Stable sort: ties and rows with an unparsable timestamp (placed last)
    # keep their original relative order. `df` itself is not modified.
    segments = segments.assign(_start_seconds=start_seconds).sort_values(
        "_start_seconds", kind="stable"
    )

summaries = []
for cluster_id, group in segments.groupby("cluster"):
    # Drop missing values so a NaN cell cannot break the string join
    combined_text = " ".join(str(text) for text in group["text"].dropna())

    if not combined_text.strip():
        # Nothing to summarize; still record the cluster so none goes missing
        summaries.append({
            "cluster": cluster_id,
            "summary": ""
        })
        continue

    # Generate summary. truncation=True makes the tokenizer cut over-long input
    # at its configured maximum length in tokens; slicing the string at 1024
    # characters discarded most of the text and could cut in the middle of a word.
    summary = summarizer(
        combined_text,
        max_length=100,
        min_length=25,
        do_sample=False,
        truncation=True,
    )[0]["summary_text"]

    summaries.append({
        "cluster": cluster_id,
        "summary": summary
    })

print("Summaries generated for each topic cluster.")

Summaries generated for each topic cluster.


In [5]:
# Convert the summaries to a DataFrame. The columns are given explicitly so
# the CSV still gets a "cluster,summary" header when no summaries were produced.
summary_df = pd.DataFrame(summaries, columns=["cluster", "summary"])

# Save to CSV. The target directory is derived from output_path so the two
# cannot drift apart if the path is changed later.
output_path = "../data/summaries/topic_summaries.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
summary_df.to_csv(output_path, index=False, encoding="utf-8")

print(f"Topic summaries saved to: {output_path}")


Topic summaries saved to: ../data/summaries/topic_summaries.csv
