In [1]:
import pandas as pd
import os
import time
import subprocess
import json
from collections import defaultdict

# Working locations: yt-dlp output is written under TEMP_DIR, and the
# aggregated results CSV lives inside PROCESSED_COMMENTS_DIR.
TEMP_DIR = 'scratchpad'
PROCESSED_COMMENTS_DIR = 'processed_comments'
COMMENTS_CSV = PROCESSED_COMMENTS_DIR + '/comments.csv'

# Create both directories up front; exist_ok keeps re-running this cell safe.
for _required_dir in (TEMP_DIR, PROCESSED_COMMENTS_DIR):
    os.makedirs(_required_dir, exist_ok=True)

def run_yt_dlp(video_id, max_comments=100):
    """Invoke the ``yt-dlp`` CLI to fetch comment metadata for one video.

    The video itself is not downloaded (``--skip-download``); yt-dlp is asked
    to write comments (``--write-comments``) using ``TEMP_DIR/<video_id>`` as
    the output path.

    Args:
        video_id: YouTube video ID, interpolated into the watch URL.
        max_comments: Value passed to the ``youtube:max_comments`` extractor
            argument.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status
            (``check=True``).
    """
    output_base = os.path.join(TEMP_DIR, f"{video_id}")
    comment_limit = f"youtube:max_comments={max_comments}"
    video_url = f"https://www.youtube.com/watch?v={video_id}"

    flags = ["--skip-download", "--write-comments", "--no-warnings"]
    argv = [
        "yt-dlp",
        *flags,
        "--output", output_base,
        "--extractor-args", comment_limit,
        video_url,
    ]
    subprocess.run(argv, check=True, text=True)


def parse_comments(video_id):
    """Load ``TEMP_DIR/<video_id>.info.json`` and thread its comments.

    Each comment whose ``parent`` is missing, empty, or ``"root"`` becomes a
    top-level entry (a copy of the original dict plus ``"replies"`` and
    ``"is_reply": False``). Every other comment is treated as a reply, copied
    with ``"is_reply": True`` and attached to its parent's ``"replies"`` list.
    Replies whose parent is not among the top-level comments in the file are
    dropped.

    Args:
        video_id: ID used to locate the ``.info.json`` file.

    Returns:
        A ``(comments, title, uploader)`` tuple: the list of threaded
        top-level comment dicts, the video title, and the ``uploader_id``
        value (``None`` when the metadata does not contain one).

    Raises:
        FileNotFoundError: If the info file does not exist.
        KeyError: If the metadata has no ``title``.
    """
    info_path = os.path.join(TEMP_DIR, f'{video_id}.info.json')
    with open(info_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Metadata fields other than the title are treated as optional: a video
    # with no uploader handle or no extracted comments should still yield a
    # record instead of failing the whole video with a KeyError/TypeError.
    comments = data.get('comments') or []
    title = data['title']
    uploader = data.get('uploader_id')

    top_level = {}
    replies = defaultdict(list)
    for c in comments:
        parent_id = c.get("parent")
        if parent_id and parent_id != "root":
            replies[parent_id].append({**c, "is_reply": True})
        else:
            top_level[c["id"]] = {**c, "replies": [], "is_reply": False}

    # Attach collected replies; orphans (parent not present) are skipped.
    for parent_id, thread in replies.items():
        if parent_id in top_level:
            top_level[parent_id]["replies"] = thread
    return list(top_level.values()), title, uploader


def to_process(video_id):
    """Return True when ``video_id`` is not yet recorded in ``COMMENTS_CSV``.

    Only the ``id`` column is read, and it is read as text. Loading the whole
    file would parse the large JSON ``comments`` column on every call, and
    letting pandas infer the dtype can turn an all-digit ID column into
    integers, stripping leading zeros so the membership test never matches.

    Args:
        video_id: Video ID to look up; compared as a string.

    Returns:
        True if the CSV does not exist or has no row with this ID, else False.
    """
    if not os.path.exists(COMMENTS_CSV):
        return True
    seen_ids = pd.read_csv(COMMENTS_CSV, usecols=['id'], dtype={'id': str})['id']
    return str(video_id) not in set(seen_ids)


# Main loop: fetch and store comments for every video listed in the URL file,
# skipping videos that already have a row in COMMENTS_CSV.
df_url_details = pd.read_csv('URLs/url_channel_id.csv')
data_res = []

# Load the already-processed IDs once instead of re-reading the growing CSV
# (including its large JSON column) for every video. IDs are read as text so
# all-digit IDs keep their exact spelling.
if os.path.exists(COMMENTS_CSV):
    processed_ids = set(
        pd.read_csv(COMMENTS_CSV, usecols=['id'], dtype={'id': str})['id']
    )
else:
    processed_ids = set()

for video_id in df_url_details.id.astype(str):
    if video_id in processed_ids:
        print(f"⏩ Skipping already processed video ID: {video_id}")
        continue

    try:
        run_yt_dlp(video_id, max_comments=100)
        comments, title, uploader = parse_comments(video_id)

        data_res.append({
            'creator_id': uploader,
            'id': video_id,
            'title': title,
            'timestamp': str(time.time()),
            'comments': json.dumps(comments)
        })

        # Append to CSV immediately to avoid reprocessing on crash.
        # The header is written only when the file is being created.
        df = pd.DataFrame(data_res)
        df.to_csv(
            COMMENTS_CSV,
            mode='a',
            index=False,
            header=not os.path.exists(COMMENTS_CSV),
        )
        processed_ids.add(video_id)

    except Exception as e:
        # Best effort: report the failure and move on to the next video.
        print(f"❌ Failed to process video ID: {video_id}")
        print(e)

    finally:
        # Always reset the buffer so a row whose write failed is not written
        # again together with the next video's row.
        data_res.clear()

[youtube] Extracting URL: https://www.youtube.com/watch?v=EYvhkTYkBgc
[youtube] EYvhkTYkBgc: Downloading webpage
[youtube] EYvhkTYkBgc: Downloading tv client config
[youtube] EYvhkTYkBgc: Downloading player 9fe2e06e-main
[youtube] EYvhkTYkBgc: Downloading tv player API JSON
[youtube] EYvhkTYkBgc: Downloading ios player API JSON
[youtube] EYvhkTYkBgc: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~356 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~356)
[youtube] Downloading comment API JSON page 2 (20/~356)
[youtube] Downloading comment API JSON page 3 (40/~356)
[youtube]     Downloading comment API JSON reply thread 1 (60/~356)
[youtube] Downloading comment API JSON page 4 (61/~356)
[youtube] Downloading comment API JSON page 5 (81/~356)
[youtube]     Downloading comment API JSON reply thread 1 (85/~356)
[youtube]     Downloading comment API JSON reply thread 2 (89/~356)
[youtube]

In [2]:
# Reload the accumulated results and decode each row's JSON-encoded comments
# back into Python objects.
df_test = pd.read_csv(COMMENTS_CSV).assign(
    comments=lambda frame: frame['comments'].map(json.loads)
)

In [3]:
# Sanity check: (rows, columns) of the reloaded results frame.
df_test.shape

(286, 5)

In [4]:
# Inspect the last rows of the reloaded frame (the most recently appended videos).
df_test.tail()

Unnamed: 0,creator_id,id,title,timestamp,comments
281,@TheDailyShow,lbyVqqZng-I,"Tighten your belts, kids! Trump needs his big,...",1750282000.0,"[{'id': 'Ugw1UyUEMgqm7KzqT-Z4AaABAg', 'parent'..."
282,@TheDailyShow,KyucS9eSB24,How kind of American taxpayers to throw Trump ...,1750282000.0,"[{'id': 'UgwmO8-wbXcxbigw7eh4AaABAg', 'parent'..."
283,@TheDailyShow,XRaS6tDHUI4,Did Trump really institute a “no fatties” poli...,1750282000.0,"[{'id': 'UgwQP_7-U38S8leq5_V4AaABAg', 'parent'..."
284,@TheDailyShow,DxDCugpYO-8,Is that hot reporter with CBS? Because Desi Ly...,1750282000.0,"[{'id': 'Ugyom7U6LRwEjtDqNb14AaABAg', 'parent'..."
285,@TheDailyShow,jj11N-0rIQg,"Is Trump violating the Posse Comitatus Act, a ...",1750282000.0,"[{'id': 'Ugw-y2RfycN0pVk3Dt94AaABAg', 'parent'..."
