In [1]:
import os
import re
import json
import time
import pickle
import pandas as pd
from openai import OpenAI
import os
import pandas as pd
import json
import time

API_KEY = 'sk-bedceae2ceba437f944db22706354095'

In [2]:
def summarize_creator_audience_interaction(api_client, all_comments, creator_id):
    """
    all_comments: list of comment texts from latest videos of a creator
    """

    comment_sample = all_comments

    print(f"Number of comments considered for {creator_id} = {len(comment_sample)}")

    user_prompt = '\n'.join(comment_sample)
    print(f'user prompt is {user_prompt}')

    system_prompt = """
You are an expert in social behavior analysis.

The user will provide a list of YouTube comments taken from the most recent videos of a single creator.

Your task is to summarize the overall tone, quality, diversity, and dynamics of the audience interaction with the creator’s content **across these videos**.

Your output must be in JSON format with the following keys:

{
  "audience_sentiment_overview": {"positive": X, "neutral": Y, "negative": Z},
  "common_emotions_expressed": [...],
  "overall_audience_behavior_summary": "...",
  "recurrent_themes": [...],
  "bias_or_group_mentions": [...],
  "is_sarcasm_common": true/false,
  "languages_used": [...],
  "spam_or_toxicity_prevalence": "low/medium/high",
  "concluding_summary": "..."
}

Keep it balanced and based on what is observable from the comments. Avoid making assumptions beyond the text.
"""

    response = api_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        response_format={'type': 'json_object'}
    )

    result = json.loads(response.choices[0].message.content)
    result["creator_id"] = creator_id
    return result




In [3]:


ANALYZED_COMMENTS_DIR = 'analyzed_comments'
analyzed_comments_file_path = f'{ANALYZED_COMMENTS_DIR}/analyzed_comments.csv'
df_analysis=pd.read_csv(analyzed_comments_file_path)    



SUMMARIZED_PATH = "summarized_analyzed_comments/summarized_analyzed.csv"

# Load both datasets
df_summary = pd.read_csv(SUMMARIZED_PATH) if os.path.exists(SUMMARIZED_PATH) else pd.DataFrame(columns=["timestamp", "creator_id", "summarized_video_ids", "summary_analysis"])



client = OpenAI(api_key=API_KEY, base_url="https://api.deepseek.com")
# Process each creator
new_entries = []
for creator_id in df_analysis["creator_id"].unique():
    df_creator = df_analysis[df_analysis["creator_id"] == creator_id].sort_values("timestamp", ascending=False)
    print(creator_id,df_creator.shape)
    latest_n = min(10, int(0.9 * len(df_creator)))
    df_latest = df_creator.head(latest_n)
    print(df_latest.shape)

    current_ids = sorted(df_latest["id"].tolist())

    # Check if already summarized with this ID set
    if not df_summary[df_summary["creator_id"] == creator_id].empty:
        existing = list(df_summary[df_summary["creator_id"] == creator_id]["summarized_video_ids"])[0]
        if existing=='_*_'.join(current_ids):
            print(f"✅ Already summarized for same videos for creator: {creator_id}")
            continue

    # Gather comments
    all_comments = []
    for analysis_json in df_latest["analysis"]:
        try:
            parsed = json.loads(analysis_json)            
            all_comments.append(json.dumps(parsed))
        except Exception as e:
            print(f"Error parsing comments for creator {creator_id}: {e}")

    if not all_comments:
        print(f"⚠️ No comments found for creator {creator_id}, skipping.")
        continue

    # Summarize
    try:
        summary = summarize_creator_audience_interaction(client, all_comments, creator_id)
        
        current_ids_str='_*_'.join(current_ids)
        new_entries.append({
            "timestamp": time.time(),
            "creator_id": creator_id,
            "summarized_video_ids": current_ids_str,
            "summary_analysis": json.dumps(summary)
        })
    except Exception as e:
        print(f"❌ Failed summarization for {creator_id}: {e}")

# Save updates
if new_entries:
    df_new = pd.DataFrame(new_entries)
    df_combined = pd.concat([df_summary, df_new], ignore_index=True)
    df_combined.to_csv(SUMMARIZED_PATH, index=False)
    print(f"✅ Saved to {SUMMARIZED_PATH}")
else:
    print("No new summaries generated.")

@smiletojannah (72, 5)
(10, 5)
Number of comments considered for @smiletojannah = 10
user prompt is {"overall_sentiment_distribution": {"positive": 2, "neutral": 1, "negative": 22}, "dominant_emotions": ["anger", "sarcasm", "disgust"], "toxic_comment_count": 18, "controversy_score": 0.9, "key_topics": ["religious intolerance", "nationalism", "interfaith relations"], "frequent_bias_or_group_mentions": ["Hindus", "Jews", "Muslims", "Indians", "Israel"], "sarcasm_detected": true, "languages_detected": ["English", "Hindi", "Urdu"], "spam_comment_count": 0, "summary": "The comment section is highly polarized and toxic, with a strong presence of sarcasm and anger. Comments frequently target Hindus and Jews, reflecting deep-seated religious and nationalistic tensions. The discourse is marked by a high level of controversy and a lack of constructive dialogue.", "title": "Indian Found A New Idol \ud83d\uddff", "uploader_id": "@smiletojannah"}
{"overall_sentiment_distribution": {"positive": 0, "

In [4]:
df_analysis.shape

(291, 5)