In [1]:
import pandas as pd
import os
import time
import subprocess
import json
from collections import defaultdict

TEMP_DIR = 'scratchpad'
PROCESSED_COMMENTS_DIR = 'processed_comments'
COMMENTS_CSV = f'{PROCESSED_COMMENTS_DIR}/comments.csv'

os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(PROCESSED_COMMENTS_DIR, exist_ok=True)

def run_yt_dlp(video_id, max_comments=100):
    filepath = os.path.join(TEMP_DIR, f"{video_id}")
    command = [
        "yt-dlp",
        "--skip-download",
        "--write-comments",
        "--no-warnings",
        "--output", filepath,
        "--extractor-args", f"youtube:max_comments={max_comments}",
        f"https://www.youtube.com/watch?v={video_id}"
    ]
    subprocess.run(command, check=True, text=True)


def parse_comments(video_id):
    with open(os.path.join(TEMP_DIR, f'{video_id}.info.json'), "r", encoding="utf-8") as f:
        data = json.load(f)
    comments = data['comments']
    title = data['title']
    uploader = data['uploader_id']

    comment_dict = {}
    replies = defaultdict(list)
    for c in comments:
        if c.get("parent") and c["parent"] != "root":
            replies[c["parent"]].append({**c, "is_reply": True})
        else:
            comment_dict[c["id"]] = {**c, "replies": [], "is_reply": False}
    for pid, rep in replies.items():
        if pid in comment_dict:
            comment_dict[pid]["replies"] = rep
    return list(comment_dict.values()), title, uploader


def to_process(video_id):
    if not os.path.exists(COMMENTS_CSV):
        return True
    df = pd.read_csv(COMMENTS_CSV)
    return video_id not in df['id'].astype(str).values


# Main loop
df_url_details = pd.read_csv('URLs/url_channel_id.csv')
data_res = []

for video_id in df_url_details.id.astype(str):
    if not to_process(video_id):
        #print(f"⏩ Skipping already processed video ID: {video_id}")
        continue

    try:
        run_yt_dlp(video_id, max_comments=100)
        comments, title, uploader = parse_comments(video_id)

        data_res.append({
            'creator_id': uploader,
            'id': video_id,
            'title': title,
            'timestamp': str(time.time()),
            'comments': json.dumps(comments)
        })

        # Append to CSV immediately to avoid reprocessing on crash
        df = pd.DataFrame(data_res)
        if os.path.exists(COMMENTS_CSV):
            df.to_csv(COMMENTS_CSV, mode='a', index=False, header=False)
        else:
            df.to_csv(COMMENTS_CSV, index=False)
        data_res.clear()

    except Exception as e:
        print(f"❌ Failed to process video ID: {video_id}")
        print(e)

[youtube] Extracting URL: https://www.youtube.com/watch?v=4rvFm18eyfU
[youtube] 4rvFm18eyfU: Downloading webpage
[youtube] 4rvFm18eyfU: Downloading tv client config
[youtube] 4rvFm18eyfU: Downloading tv player API JSON
[youtube] 4rvFm18eyfU: Downloading ios player API JSON
[youtube] 4rvFm18eyfU: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[info] 4rvFm18eyfU: Downloading 1 format(s): 616+251
[info] Writing video metadata as JSON to: scratchpad/4rvFm18eyfU.info.json
❌ Failed to process video ID: 4rvFm18eyfU
'comments'
[youtube] Extracting URL: https://www.youtube.com/watch?v=IlEav0f3tcc
[youtube] IlEav0f3tcc: Downloading webpage
[youtube] IlEav0f3tcc: Downloading tv client config
[youtube] IlEav0f3tcc: Downloading tv player API JSON
[youtube] IlEav0f3tcc: Downloading ios player API JSON
[youtube] IlEav0f3tcc: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~104 comments
[youtube] Sorting comments by newest 

[download] Got error: HTTP Error 403: Forbidden
ERROR: fragment 1 not found, unable to continue


[info] Unable to download format 616. Skipping...
[info] l3bNZpckIg0: Downloading 1 format(s): 248+251
[info] Writing video metadata as JSON to: scratchpad/l3bNZpckIg0.info.json
❌ Failed to process video ID: l3bNZpckIg0
[youtube] Extracting URL: https://www.youtube.com/watch?v=A2uVpFRuHLc
[youtube] A2uVpFRuHLc: Downloading webpage
[youtube] A2uVpFRuHLc: Downloading tv client config
[youtube] A2uVpFRuHLc: Downloading tv player API JSON
[youtube] A2uVpFRuHLc: Downloading ios player API JSON
[youtube] A2uVpFRuHLc: Downloading m3u8 information
[youtube] A2uVpFRuHLc: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Extracted 0 comments
[info] A2uVpFRuHLc: Downloading 1 format(s): 96
[info] Writing video metadata as JSON to: scratchpad/A2uVpFRuHLc.info.json
[youtube] Extracting URL: https://www.youtube.com/watch?v=ZHuZ_8VYCWA
[youtube] ZHuZ_8VYCWA: Downloading webpage
[youtube] ZHuZ_8VYCWA: Downloading tv client config
[youtube] ZHuZ_8VYCWA: Downloading t

[download] Got error: HTTP Error 403: Forbidden
ERROR: fragment 1 not found, unable to continue


[info] Unable to download format 616. Skipping...
[info] kHGqEXQ_9OM: Downloading 1 format(s): 399+251
[info] Writing video metadata as JSON to: scratchpad/kHGqEXQ_9OM.info.json
❌ Failed to process video ID: kHGqEXQ_9OM
[youtube] Extracting URL: https://www.youtube.com/watch?v=1CjmqK55KS4
[youtube] 1CjmqK55KS4: Downloading webpage
[youtube] 1CjmqK55KS4: Downloading tv client config
[youtube] 1CjmqK55KS4: Downloading tv player API JSON
[youtube] 1CjmqK55KS4: Downloading ios player API JSON
[youtube] 1CjmqK55KS4: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~53 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~53)
[youtube]     Downloading comment API JSON reply thread 1 (5/~53)
[youtube]     Downloading comment API JSON reply thread 2 (8/~53)
[youtube]     Downloading comment API JSON reply thread 3 (11/~53)
[youtube]     Downloading comment API JSON reply thread 4 (17/~53)
[youtube]

[download] Got error: HTTP Error 403: Forbidden
ERROR: fragment 1 not found, unable to continue


[info] Unable to download format 616. Skipping...
[info] 1CjmqK55KS4: Downloading 1 format(s): 248+251
[info] Writing video metadata as JSON to: scratchpad/1CjmqK55KS4.info.json
❌ Failed to process video ID: 1CjmqK55KS4
[youtube] Extracting URL: https://www.youtube.com/watch?v=4-HIh3eMulE
[youtube] 4-HIh3eMulE: Downloading webpage
[youtube] 4-HIh3eMulE: Downloading tv client config
[youtube] 4-HIh3eMulE: Downloading tv player API JSON
[youtube] 4-HIh3eMulE: Downloading ios player API JSON
[youtube] 4-HIh3eMulE: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~52 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~52)
[youtube]     Downloading comment API JSON reply thread 1 (9/~52)
[youtube]     Downloading comment API JSON reply thread 2 (16/~52)
[youtube]     Downloading comment API JSON reply thread 3 (18/~52)
[youtube] Downloading comment API JSON page 2 (23/~52)
[youtube]     Downlo

[download] Got error: HTTP Error 403: Forbidden
ERROR: fragment 1 not found, unable to continue


[info] Unable to download format 616. Skipping...
[info] 4-HIh3eMulE: Downloading 1 format(s): 399+251
[info] Writing video metadata as JSON to: scratchpad/4-HIh3eMulE.info.json
❌ Failed to process video ID: 4-HIh3eMulE
[youtube] Extracting URL: https://www.youtube.com/watch?v=RwlTxj32w5E
[youtube] RwlTxj32w5E: Downloading webpage
[youtube] RwlTxj32w5E: Downloading tv client config
[youtube] RwlTxj32w5E: Downloading tv player API JSON
[youtube] RwlTxj32w5E: Downloading ios player API JSON
[youtube] RwlTxj32w5E: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~3061 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~3061)
[youtube] Downloading comment API JSON page 2 (20/~3061)
[youtube] Downloading comment API JSON page 3 (40/~3061)
[youtube] Downloading comment API JSON page 4 (60/~3061)
[youtube] Downloading comment API JSON page 5 (80/~3061)
[youtube] Extracted 100 comments
[info] Tes

[download] Got error: HTTP Error 403: Forbidden
ERROR: fragment 1 not found, unable to continue


[info] Unable to download format 616. Skipping...
[info] RwlTxj32w5E: Downloading 1 format(s): 399+251
[info] Writing video metadata as JSON to: scratchpad/RwlTxj32w5E.info.json
❌ Failed to process video ID: RwlTxj32w5E
[youtube] Extracting URL: https://www.youtube.com/watch?v=iYV2EX1-i40
[youtube] iYV2EX1-i40: Downloading webpage
[youtube] iYV2EX1-i40: Downloading tv client config
[youtube] iYV2EX1-i40: Downloading tv player API JSON
[youtube] iYV2EX1-i40: Downloading ios player API JSON
[youtube] iYV2EX1-i40: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~158 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~158)
[youtube] Downloading comment API JSON page 2 (20/~158)
[youtube] Downloading comment API JSON page 3 (40/~158)
[youtube]     Downloading comment API JSON reply thread 1 (53/~158)
[youtube] Downloading comment API JSON page 4 (61/~158)
[youtube]     Downloading comment API

[download] Got error: HTTP Error 403: Forbidden
ERROR: fragment 1 not found, unable to continue


[info] Unable to download format 616. Skipping...
[info] mUrOy2D7ck4: Downloading 1 format(s): 399+251
[info] Writing video metadata as JSON to: scratchpad/mUrOy2D7ck4.info.json
❌ Failed to process video ID: mUrOy2D7ck4
[youtube] Extracting URL: https://www.youtube.com/watch?v=WWPvrkezWqQ
[youtube] WWPvrkezWqQ: Downloading webpage
[youtube] WWPvrkezWqQ: Downloading tv client config
[youtube] WWPvrkezWqQ: Downloading tv player API JSON
[youtube] WWPvrkezWqQ: Downloading ios player API JSON
[youtube] WWPvrkezWqQ: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~242 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~242)
[youtube] Downloading comment API JSON page 2 (20/~242)
[youtube] Downloading comment API JSON page 3 (40/~242)
[youtube] Downloading comment API JSON page 4 (60/~242)
[youtube]     Downloading comment API JSON reply thread 1 (63/~242)
[youtube]     Downloading comment API

In [3]:
df_test=pd.read_csv(f'{PROCESSED_COMMENTS_DIR}/comments.csv')
df_test['comments'] = df_test['comments'].apply(json.loads)

In [4]:
df_test.shape

(291, 5)

In [5]:
df_test.tail()

Unnamed: 0,creator_id,id,title,timestamp,comments
286,@smiletojannah,IlEav0f3tcc,This Is Why Trump Will Join Israel’s War,1750366000.0,"[{'id': 'UgyLj4QyX4SNwrpHHzF4AaABAg', 'parent'..."
287,@MuslimSkeptic,A2uVpFRuHLc,🌽 Woman NEEDS a Male Guardian! Post Debate Ana...,1750366000.0,[]
288,@TheDiaryOfACEO,ZHuZ_8VYCWA,Has Feminism Betrayed Women? THE SEX REVOLUTIO...,1750366000.0,"[{'id': 'UgyM0IupJujXiBgl6hl4AaABAg', 'parent'..."
289,@TheDailyShow,iYV2EX1-i40,Trump is working hard to come up with a soluti...,1750366000.0,"[{'id': 'UgzJ6IocmWOmY9SvYwJ4AaABAg', 'parent'..."
290,@TheDailyShow,WWPvrkezWqQ,MTG has a reasonable take on U.S. involvement ...,1750366000.0,"[{'id': 'UgxtMZ0AJn9LknZqszF4AaABAg', 'parent'..."
