In [1]:
import os
import re
import json
import time
import pickle
import pandas as pd
from openai import OpenAI


API_KEY = 'sk-bedceae2ceba437f944db22706354095'

In [2]:


def to_process(video_id, reference_file_path):
    if not os.path.exists(reference_file_path):
        return True
    df = pd.read_csv(reference_file_path)
    #print(df['id'].astype(str).values)
    return video_id not in df['id'].astype(str).values



In [3]:

def summarize_comments(api_client, comments, title, uploader_id, video_id):
    top_comments = [c['text'] for c in comments if not c['is_reply']][:max(1, int(0.5 * len(comments)))]
    
    print(f'number of comments to be considered={len(top_comments)}')
    user_prompt = '\n'.join(top_comments)
    system_prompt = """
    The user will provide a list of YouTube comments. Please analyze all the comments together and generate a single, structured JSON object summarizing the overall qualitative dynamics.
    
    EXAMPLE INPUT:
    ["Great job! This video really opened my eyes.", "What a biased take. Shameful.", "😂😂 you're so clueless it's funny.", "Pakistan zindabad!", "Link to giveaway 👉 http://spamlink"]
    
    EXAMPLE JSON OUTPUT:
    {
      "overall_sentiment_distribution": {"positive": 1, "neutral": 1, "negative": 3},
      "dominant_emotions": ["anger", "sarcasm", "joy"],
      "toxic_comment_count": 2,
      "controversy_score": 0.75,
      "key_topics": ["bias in media", "nationalism", "truth and misinformation"],
      "frequent_bias_or_group_mentions": ["Pakistan", "India", "YouTube creators"],
      "sarcasm_detected": true,
      "languages_detected": ["English", "Urdu"],
      "spam_comment_count": 1,
      "summary": "The comment section is emotionally charged with a mix of national pride, strong criticism, and sarcasm. There's significant polarization, and a moderate amount of toxicity and spam."
    }
    """
    response = api_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        response_format={'type': 'json_object'}
    )

    
    summary = json.loads(response.choices[0].message.content)
    summary['title'] = title
    summary['uploader_id'] = uploader_id

    return summary

In [4]:
PROCESSED_COMMENTS_DIR = 'processed_comments'
df_comments=pd.read_csv(f'{PROCESSED_COMMENTS_DIR}/comments.csv')

df_comments['comments'] = df_comments['comments'].apply(json.loads)



ANALYZED_COMMENTS_DIR = 'analyzed_comments'
analyzed_comments_file_path = f'{ANALYZED_COMMENTS_DIR}/analyzed_comments.csv'


client = OpenAI(api_key=API_KEY, base_url="https://api.deepseek.com")
data_res = []
for index, row in df_comments.iterrows():
    try:
        #print(row)
        if not to_process(row['id'],analyzed_comments_file_path):
            print(f"⏩ Skipping already processed video ID: {row['id']}")
            continue
        print(f"Analyzing video ID: {row['id']}")
        analysis=summarize_comments(client, row['comments'], row['title'] ,row['creator_id'], row['id'])
        data_res.append({
                'creator_id': row['creator_id'],
                'id': row['id'],
                'title': row['title'],
                'timestamp': str(time.time()),
                'analysis': json.dumps(analysis)
            })
        # Append to CSV immediately to avoid reprocessing on crash
        df = pd.DataFrame(data_res)
        if os.path.exists(analyzed_comments_file_path):
            df.to_csv(analyzed_comments_file_path, mode='a', index=False, header=False)
        else:
            df.to_csv(analyzed_comments_file_path, index=False)
        data_res.clear()        
    except Exception as e:
        print(f"❌ Failed to process video ID: {row['id']}")
        print(e)
    
    

⏩ Skipping already processed video ID: dua91-ntjgU
⏩ Skipping already processed video ID: M88UpaT_iio
⏩ Skipping already processed video ID: yn-IiJODZLc
⏩ Skipping already processed video ID: cwOPw3fBMs4
⏩ Skipping already processed video ID: 6DTys4OCPhs
⏩ Skipping already processed video ID: mGjeWrERxPg
⏩ Skipping already processed video ID: x55BVjUo77U
⏩ Skipping already processed video ID: wvBLz5ViSvs
⏩ Skipping already processed video ID: APA0htT-AV0
⏩ Skipping already processed video ID: jiLTkhkVwP4
⏩ Skipping already processed video ID: dzi66yHoOyA
⏩ Skipping already processed video ID: SyB6dj0kkxU
⏩ Skipping already processed video ID: yzkkzt595WE
⏩ Skipping already processed video ID: FfV434-TN7Q
⏩ Skipping already processed video ID: g0QYWoh8hsM
⏩ Skipping already processed video ID: 7_5WdJlXKLo
⏩ Skipping already processed video ID: pOIEdCZvbrQ
⏩ Skipping already processed video ID: -VK7GF3bgnA
⏩ Skipping already processed video ID: 0UtHedAXIkA
⏩ Skipping already processed vi

In [5]:
df.tail()

Unnamed: 0,creator_id,id,title,timestamp,analysis
0,@thedeshbhakt,jiWuSvwxxF4,Is India Ready For The Drone Age? | Ukraine St...,1749044883.099927,"{""overall_sentiment_distribution"": {""positive""..."
