In [1]:
import os
import re
import json
import time
import pickle
import subprocess
from datetime import datetime
from collections import defaultdict
import pandas as pd
from openai import OpenAI
import hashlib

API_KEY = 'sk-bedceae2ceba437f944db22706354095'


COMMENTS_DIR = "comments_scraped"
CSV_DIR = "comments_scraped_csv"
PKL_DIR = "comments_scraped_pkl"
INSIGHTS_DIR = "insights"
HASH_DIR = "hashes"



def save_latest_youtube_urls(creator_id, num_videos=15, output_dir="URLs"):
    """
    Fetches the latest YouTube video URLs from a given creator's channel
    and appends new ones to the existing file, avoiding duplicates.

    Args:
        creator_id (str): YouTube channel handle (e.g., "@smiletojannah")
        num_videos (int): Number of latest video URLs to fetch
        output_dir (str): Folder to save the CSV file with URLs

    Returns:
        List[str]: List of all URLs saved in the CSV (old + new, unique)
    """
    os.makedirs(output_dir, exist_ok=True)
    channel_url = f"https://www.youtube.com/{creator_id}"
    output_path = os.path.join(output_dir, f"{creator_id.strip('@')}.csv")

    try:
        result = subprocess.run(
            ["yt-dlp", "--dump-json", "--flat-playlist", "--playlist-end", str(num_videos), channel_url],
            capture_output=True, text=True, check=True
        )

        video_entries = [json.loads(line) for line in result.stdout.strip().split('\n')]
        new_urls = [f"https://www.youtube.com/watch?v={entry['id']}" for entry in video_entries]

        # Load existing if available
        if os.path.exists(output_path):
            existing_df = pd.read_csv(output_path)
            all_urls = pd.Series(existing_df['urls'].tolist() + new_urls).drop_duplicates().tolist()
        else:
            all_urls = new_urls

        # Save updated list
        pd.DataFrame({'urls': all_urls}).to_csv(output_path, index=False)
        print(f"✅ Total {len(all_urls)} URLs saved to {output_path}")        

    except subprocess.CalledProcessError as e:
        print(f"❌ Failed to fetch videos for {creator_id}")
        print(e.stderr)
        return []
        

def fetch_latest_youtube_urls(creator_id, dir_="URLs"):
    CHANNEL_URLS = pd.read_csv(f'{dir_}/{creator_id}.csv')['urls']
    return CHANNEL_URLS






def compute_hash(top_comments):
    joined = '\n'.join(top_comments).strip()
    return hashlib.sha256(joined.encode('utf-8')).hexdigest()

def get_saved_hash(video_id):
    path = os.path.join(HASH_DIR, f"{video_id}.hash")
    if os.path.exists(path):
        with open(path, "r") as f:
            return f.read().strip()
    return None

def save_hash(video_id, hash_str):
    with open(os.path.join(HASH_DIR, f"{video_id}.hash"), "w") as f:
        f.write(hash_str)

os.makedirs(COMMENTS_DIR, exist_ok=True)
os.makedirs(CSV_DIR, exist_ok=True)
os.makedirs(PKL_DIR, exist_ok=True)
os.makedirs(INSIGHTS_DIR, exist_ok=True)
os.makedirs(HASH_DIR, exist_ok=True)

def get_video_id(url):
    return url.split('/watch?v=')[-1]

def run_yt_dlp(video_id):
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filepath = os.path.join(COMMENTS_DIR, f"{video_id}_at_{timestamp}")
    command = ["yt-dlp", "--skip-download", "--write-comments", "--no-warnings", "--output", filepath, f"https://www.youtube.com/watch?v={video_id}"]
    subprocess.run(command, check=True, text=True)



    
def get_latest_files():
    files = [f for f in os.listdir(COMMENTS_DIR) if f.endswith(".info.json")]
    latest = {}
    for f in files:
        match = re.match(r"(.+?)_at_(\d+)_\d+\.info\.json", f)
        if match:
            vid, ts_str = match.groups()
            ts = datetime.strptime(ts_str, "%Y%m%d")
            if vid not in latest or ts > latest[vid][0]:
                latest[vid] = (ts, f)
    return [f for _, f in latest.values()]

def parse_comments(json_file):
    with open(os.path.join(COMMENTS_DIR, json_file), "r", encoding="utf-8") as f:
        data = json.load(f)
    comments = data['comments']
    title = data['title']
    uploader = data['uploader_id']
    
    comment_dict = {}
    replies = defaultdict(list)
    for c in comments:
        if c.get("parent") and c["parent"] != "root":
            replies[c["parent"]].append({**c, "is_reply": True})
        else:
            comment_dict[c["id"]] = {**c, "replies": [], "is_reply": False}
    for pid, rep in replies.items():
        if pid in comment_dict:
            comment_dict[pid]["replies"] = rep
    return list(comment_dict.values()), title, uploader

def save_comments(video_id, comments):
    rows = []
    for c in comments:
        rows.append({**c, "parent_id": None})
        for r in c.get("replies", []):
            rows.append({**r, "parent_id": c["id"]})
    df = pd.DataFrame(rows)
    df.to_csv(f"{CSV_DIR}/{video_id}.csv", index=False)
    with open(f"{PKL_DIR}/{video_id}.pkl", "wb") as f:
        pickle.dump(comments, f)
    return df

def summarize_comments(api_client, comments, title, uploader_id, video_id):
    top_comments = [c['text'] for c in comments if not c['is_reply']][:max(1, int(0.05 * len(comments)))]
    comment_hash = compute_hash(top_comments)

    # Check if same hash already processed
    if get_saved_hash(video_id) == comment_hash:
        print(f"🔁 Skipping {video_id}: already summarized with same top comments.")
        return

    user_prompt = '\n'.join(top_comments)
    system_prompt = """
    The user will provide a list of YouTube comments. Please analyze all the comments together and generate a single, structured JSON object summarizing the overall qualitative dynamics.
    
    EXAMPLE INPUT:
    ["Great job! This video really opened my eyes.", "What a biased take. Shameful.", "😂😂 you're so clueless it's funny.", "Pakistan zindabad!", "Link to giveaway 👉 http://spamlink"]
    
    EXAMPLE JSON OUTPUT:
    {
      "overall_sentiment_distribution": {"positive": 1, "neutral": 1, "negative": 3},
      "dominant_emotions": ["anger", "sarcasm", "joy"],
      "toxic_comment_count": 2,
      "controversy_score": 0.75,
      "key_topics": ["bias in media", "nationalism", "truth and misinformation"],
      "frequent_bias_or_group_mentions": ["Pakistan", "India", "YouTube creators"],
      "sarcasm_detected": true,
      "languages_detected": ["English", "Urdu"],
      "spam_comment_count": 1,
      "summary": "The comment section is emotionally charged with a mix of national pride, strong criticism, and sarcasm. There's significant polarization, and a moderate amount of toxicity and spam."
    }
    """
    response = api_client.chat.completions.create(
        model="deepseek-chat",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        response_format={'type': 'json_object'}
    )

    
    summary = json.loads(response.choices[0].message.content)
    summary['title'] = title
    summary['uploader_id'] = uploader_id

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    with open(f"{INSIGHTS_DIR}/{video_id}_insight1_{timestamp}.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    # Save hash
    save_hash(video_id, comment_hash)
    print(f"✅ Summarized {video_id}")

    
 

In [3]:
creator='@smiletojannah'


save_latest_youtube_urls(creator, num_videos=15, output_dir="URLs")


✅ Total 35 URLs saved to URLs/smiletojannah.csv


In [4]:
CHANNEL_URLS=fetch_latest_youtube_urls(creator, dir_="URLs")
print(CHANNEL_URLS)

0     https://www.youtube.com/watch?v=0UtHedAXIkA
1     https://www.youtube.com/watch?v=7tqYgfhCwb4
2     https://www.youtube.com/watch?v=1suta8hYDxo
3     https://www.youtube.com/watch?v=Ojm4C0qwhMM
4     https://www.youtube.com/watch?v=ItK0890LP-g
5     https://www.youtube.com/watch?v=dIAMoGzeut0
6     https://www.youtube.com/watch?v=feMoy5ioaeU
7     https://www.youtube.com/watch?v=Slx5ZH5NZBM
8     https://www.youtube.com/watch?v=aGkNouIMzFY
9     https://www.youtube.com/watch?v=4XaMJrS9DvY
10    https://www.youtube.com/watch?v=6xhUFLlG_58
11    https://www.youtube.com/watch?v=x84vLClrXHw
12    https://www.youtube.com/watch?v=YpZnd1xmP_4
13    https://www.youtube.com/watch?v=irE6mn-dDZk
14    https://www.youtube.com/watch?v=RPd-Buti7gk
15    https://www.youtube.com/watch?v=yn-IiJODZLc
16    https://www.youtube.com/watch?v=cwOPw3fBMs4
17    https://www.youtube.com/watch?v=JTcydTXWh6A
18    https://www.youtube.com/watch?v=IsqAu9bt9tI
19    https://www.youtube.com/watch?v=jGSm0wWxhxw


In [None]:
for url in CHANNEL_URLS:
    video_id = get_video_id(url)
    try:
        run_yt_dlp(video_id)
    except subprocess.CalledProcessError as e:
        print(f"Failed for {video_id}:", e.stderr)

[youtube] Extracting URL: https://www.youtube.com/watch?v=0UtHedAXIkA
[youtube] 0UtHedAXIkA: Downloading webpage
[youtube] 0UtHedAXIkA: Downloading tv client config
[youtube] 0UtHedAXIkA: Downloading tv player API JSON
[youtube] 0UtHedAXIkA: Downloading ios player API JSON
[youtube] 0UtHedAXIkA: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~2053 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~2053)
[youtube]     Downloading comment API JSON reply thread 1 (1/~2053)
[youtube]        Downloading comment replies API JSON page 1 (11/~2053)
[youtube] Downloading comment API JSON page 2 (33/~2053)
[youtube] Downloading comment API JSON page 3 (53/~2053)
[youtube]     Downloading comment API JSON reply thread 1 (61/~2053)
[youtube] Downloading comment API JSON page 4 (74/~2053)
[youtube]     Downloading comment API JSON reply thread 1 (81/~2053)
[youtube] Downloading comment API JSON pag

In [None]:

client = OpenAI(api_key=API_KEY, base_url="https://api.deepseek.com")

for json_file in get_latest_files():
    match = re.match(r"(.+?)_at_\d+_\d+\.info\.json", json_file)
    if not match:
        continue
    video_id = match.group(1)
    comments, title, uploader = parse_comments(json_file)
    df = save_comments(video_id, comments)
    summarize_comments(client, comments, title, uploader, video_id)
    print(f"Completed {video_id}\n{'*'*10}")

