In [1]:
import pandas as pd
import os
import time
import subprocess
import json
from collections import defaultdict

TEMP_DIR = 'scratchpad'
PROCESSED_COMMENTS_DIR = 'processed_comments'
COMMENTS_CSV = f'{PROCESSED_COMMENTS_DIR}/comments.csv'

os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(PROCESSED_COMMENTS_DIR, exist_ok=True)

def run_yt_dlp(video_id, max_comments=100):
    filepath = os.path.join(TEMP_DIR, f"{video_id}")
    command = [
        "yt-dlp",
        "--skip-download",
        "--write-comments",
        "--no-warnings",
        "--output", filepath,
        "--extractor-args", f"youtube:max_comments={max_comments}",
        f"https://www.youtube.com/watch?v={video_id}"
    ]
    subprocess.run(command, check=True, text=True)


def parse_comments(video_id):
    with open(os.path.join(TEMP_DIR, f'{video_id}.info.json'), "r", encoding="utf-8") as f:
        data = json.load(f)
    comments = data['comments']
    title = data['title']
    uploader = data['uploader_id']

    comment_dict = {}
    replies = defaultdict(list)
    for c in comments:
        if c.get("parent") and c["parent"] != "root":
            replies[c["parent"]].append({**c, "is_reply": True})
        else:
            comment_dict[c["id"]] = {**c, "replies": [], "is_reply": False}
    for pid, rep in replies.items():
        if pid in comment_dict:
            comment_dict[pid]["replies"] = rep
    return list(comment_dict.values()), title, uploader


def to_process(video_id):
    if not os.path.exists(COMMENTS_CSV):
        return True
    df = pd.read_csv(COMMENTS_CSV)
    return video_id not in df['id'].astype(str).values


# Main loop
df_url_details = pd.read_csv('URLs/url_channel_id.csv')
data_res = []

for video_id in df_url_details.id.astype(str):
    if not to_process(video_id):
        #print(f"⏩ Skipping already processed video ID: {video_id}")
        continue

    try:
        run_yt_dlp(video_id, max_comments=100)
        comments, title, uploader = parse_comments(video_id)

        data_res.append({
            'creator_id': uploader,
            'id': video_id,
            'title': title,
            'timestamp': str(time.time()),
            'comments': json.dumps(comments)
        })

        # Append to CSV immediately to avoid reprocessing on crash
        df = pd.DataFrame(data_res)
        if os.path.exists(COMMENTS_CSV):
            df.to_csv(COMMENTS_CSV, mode='a', index=False, header=False)
        else:
            df.to_csv(COMMENTS_CSV, index=False)
        data_res.clear()

    except Exception as e:
        print(f"❌ Failed to process video ID: {video_id}")
        print(e)

[youtube] Extracting URL: https://www.youtube.com/watch?v=4rvFm18eyfU
[youtube] 4rvFm18eyfU: Downloading webpage
[youtube] 4rvFm18eyfU: Downloading tv client config
[youtube] 4rvFm18eyfU: Downloading tv player API JSON
[youtube] 4rvFm18eyfU: Downloading ios player API JSON
[youtube] 4rvFm18eyfU: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[info] 4rvFm18eyfU: Downloading 1 format(s): 616+251
[info] There are no subtitles for the requested languages
Deleting existing file scratchpad/4rvFm18eyfU.webp
[info] Downloading video thumbnail 44 ...
[info] Writing video thumbnail 44 to: scratchpad/4rvFm18eyfU.webp
[info] Writing video metadata as JSON to: scratchpad/4rvFm18eyfU.info.json
❌ Failed to process video ID: 4rvFm18eyfU
'comments'
[youtube] Extracting URL: https://www.youtube.com/watch?v=2-WBcZeTY8k
[youtube] 2-WBcZeTY8k: Downloading webpage
[youtube] 2-WBcZeTY8k: Downloading tv client config
[youtube] 2-WBcZeTY8k: Downloading tv player API JSON
[youtube] 



ERROR: Did not get any data blocks


[info] Writing video thumbnail 41 to: scratchpad/2-WBcZeTY8k.webp
[info] Writing video metadata as JSON to: scratchpad/2-WBcZeTY8k.info.json
❌ Failed to process video ID: 2-WBcZeTY8k
[youtube] Extracting URL: https://www.youtube.com/watch?v=QhevOHvbPMM
[youtube] QhevOHvbPMM: Downloading webpage
[youtube] QhevOHvbPMM: Downloading tv client config
[youtube] QhevOHvbPMM: Downloading tv player API JSON
[youtube] QhevOHvbPMM: Downloading ios player API JSON
[youtube] QhevOHvbPMM: Downloading m3u8 information
[info] QhevOHvbPMM: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~4882 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~4882)
[youtube]     Downloading comment API JSON reply thread 1 (1/~4882)
[youtube]        Downloading comment replies API JSON page 1 (11/~4882)
[youtube] Downloading comment API JSON page 2 (69/~4882)
[youtube] Downloading comment API JSON page 3 (89/~4882)
[yout



ERROR: Did not get any data blocks


[info] Writing video metadata as JSON to: scratchpad/QhevOHvbPMM.info.json
❌ Failed to process video ID: QhevOHvbPMM
[youtube] Extracting URL: https://www.youtube.com/watch?v=uCk5k6t8Afo
[youtube] uCk5k6t8Afo: Downloading webpage
[youtube] uCk5k6t8Afo: Downloading tv client config
[youtube] uCk5k6t8Afo: Downloading tv player API JSON
[youtube] uCk5k6t8Afo: Downloading ios player API JSON
[youtube] uCk5k6t8Afo: Downloading m3u8 information
[info] uCk5k6t8Afo: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~8337 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~8337)
[youtube]     Downloading comment API JSON reply thread 1 (19/~8337)
[youtube] Downloading comment API JSON page 2 (21/~8337)
[youtube] Downloading comment API JSON page 3 (41/~8337)
[youtube]     Downloading comment API JSON reply thread 1 (42/~8337)
[youtube]     Downloading comment API JSON reply thread 2 (50/~8337)
[you



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=zOflJPIo3Bs
[youtube] zOflJPIo3Bs: Downloading webpage
[youtube] zOflJPIo3Bs: Downloading tv client config
[youtube] zOflJPIo3Bs: Downloading tv player API JSON
[youtube] zOflJPIo3Bs: Downloading ios player API JSON
[youtube] zOflJPIo3Bs: Downloading m3u8 information
[info] zOflJPIo3Bs: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~3347 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~3347)
[youtube]     Downloading comment API JSON reply thread 1 (3/~3347)
[youtube]     Downloading comment API JSON reply thread 2 (19/~3347)
[youtube] Downloading comment API JSON page 2 (22/~3347)
[youtube]     Downloading comment API JSON reply thread 1 (32/~3347)
[youtube]     Downloading comment API JSON reply thread 2 (38/~3347)
[youtube] Downloading comment API JSON page 3 (44/~3347)
[youtube]     Downloading comment API JSON reply thre



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=7jrbnq79UZk
[youtube] 7jrbnq79UZk: Downloading webpage
[youtube] 7jrbnq79UZk: Downloading tv client config
[youtube] 7jrbnq79UZk: Downloading tv player API JSON
[youtube] 7jrbnq79UZk: Downloading ios player API JSON
[youtube] 7jrbnq79UZk: Downloading m3u8 information
[info] 7jrbnq79UZk: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~6928 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~6928)
[youtube]     Downloading comment API JSON reply thread 1 (18/~6928)
[youtube] Downloading comment API JSON page 2 (21/~6928)
[youtube] Downloading comment API JSON page 3 (41/~6928)
[youtube] Downloading comment API JSON page 4 (61/~6928)
[youtube]     Downloading comment API JSON reply thread 1 (66/~6928)
[youtube] Downloading comment API JSON page 5 (82/~6928)
[youtube]     Downloading comment API JSON reply thread 1 (85/~6928)
[youtub



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=LSFWYCwirtw
[youtube] LSFWYCwirtw: Downloading webpage
[youtube] LSFWYCwirtw: Downloading tv client config
[youtube] LSFWYCwirtw: Downloading tv player API JSON
[youtube] LSFWYCwirtw: Downloading ios player API JSON
[youtube] LSFWYCwirtw: Downloading m3u8 information
[info] LSFWYCwirtw: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~7739 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~7739)
[youtube]     Downloading comment API JSON reply thread 1 (1/~7739)
[youtube]        Downloading comment replies API JSON page 1 (11/~7739)
[youtube]        Downloading comment replies API JSON page 2 (61/~7739)
[youtube] Extracted 100 comments
[info] LSFWYCwirtw: Downloading 1 format(s): 616+251
[info] Writing video subtitles to: scratchpad/LSFWYCwirtw.en-IN.vtt

Deleting existing file scratchpad/LSFWYCwirtw.webp
[info] Downloading video



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=LUs4luvC4JQ
[youtube] LUs4luvC4JQ: Downloading webpage
[youtube] LUs4luvC4JQ: Downloading tv client config
[youtube] LUs4luvC4JQ: Downloading tv player API JSON
[youtube] LUs4luvC4JQ: Downloading ios player API JSON
[youtube] LUs4luvC4JQ: Downloading m3u8 information
[info] LUs4luvC4JQ: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~5664 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~5664)
[youtube] Downloading comment API JSON page 2 (20/~5664)
[youtube] Downloading comment API JSON page 3 (40/~5664)
[youtube]     Downloading comment API JSON reply thread 1 (47/~5664)
[youtube]     Downloading comment API JSON reply thread 2 (52/~5664)
[youtube]     Downloading comment API JSON reply thread 3 (57/~5664)
[youtube] Downloading comment API JSON page 4 (64/~5664)
[youtube]     Downloading comment API JSON reply thread 1 (73/~5



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=a5YpJqSTqZ4
[youtube] a5YpJqSTqZ4: Downloading webpage
[youtube] a5YpJqSTqZ4: Downloading tv client config
[youtube] a5YpJqSTqZ4: Downloading tv player API JSON
[youtube] a5YpJqSTqZ4: Downloading ios player API JSON
[youtube] a5YpJqSTqZ4: Downloading m3u8 information
[info] a5YpJqSTqZ4: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~4290 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~4290)
[youtube]     Downloading comment API JSON reply thread 1 (1/~4290)
[youtube]        Downloading comment replies API JSON page 1 (11/~4290)
[youtube]        Downloading comment replies API JSON page 2 (61/~4290)
[youtube] Extracted 100 comments
[info] a5YpJqSTqZ4: Downloading 1 format(s): 616+251
[info] Writing video subtitles to: scratchpad/a5YpJqSTqZ4.en-IN.vtt

Deleting existing file scratchpad/a5YpJqSTqZ4.webp
[info] Downloading video



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=HsBDTHGW6Yc
[youtube] HsBDTHGW6Yc: Downloading webpage
[youtube] HsBDTHGW6Yc: Downloading tv client config
[youtube] HsBDTHGW6Yc: Downloading tv player API JSON
[youtube] HsBDTHGW6Yc: Downloading ios player API JSON
[youtube] HsBDTHGW6Yc: Downloading m3u8 information
[info] HsBDTHGW6Yc: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~7856 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~7856)
[youtube]     Downloading comment API JSON reply thread 1 (1/~7856)
[youtube]     Downloading comment API JSON reply thread 2 (18/~7856)
[youtube] Downloading comment API JSON page 2 (29/~7856)
[youtube] Downloading comment API JSON page 3 (49/~7856)
[youtube] Downloading comment API JSON page 4 (69/~7856)
[youtube]     Downloading comment API JSON reply thread 1 (85/~7856)
[youtube] Downloading comment API JSON page 5 (90/~7856)
[youtube



ERROR: Did not get any data blocks


[info] Writing video metadata as JSON to: scratchpad/HsBDTHGW6Yc.info.json
❌ Failed to process video ID: HsBDTHGW6Yc
[youtube] Extracting URL: https://www.youtube.com/watch?v=Co7B4-Lx-kI
[youtube] Co7B4-Lx-kI: Downloading webpage
[youtube] Co7B4-Lx-kI: Downloading tv client config
[youtube] Co7B4-Lx-kI: Downloading tv player API JSON
[youtube] Co7B4-Lx-kI: Downloading ios player API JSON
[youtube] Co7B4-Lx-kI: Downloading m3u8 information
[info] Co7B4-Lx-kI: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~8486 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~8486)
[youtube] Downloading comment API JSON page 2 (20/~8486)
[youtube] Downloading comment API JSON page 3 (40/~8486)
[youtube]     Downloading comment API JSON reply thread 1 (53/~8486)
[youtube] Downloading comment API JSON page 4 (64/~8486)
[youtube]     Downloading comment API JSON reply thread 1 (74/~8486)
[youtube]     Do



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=uMZvylF4CUE
[youtube] uMZvylF4CUE: Downloading webpage
[youtube] uMZvylF4CUE: Downloading tv client config
[youtube] uMZvylF4CUE: Downloading tv player API JSON
[youtube] uMZvylF4CUE: Downloading ios player API JSON
[youtube] uMZvylF4CUE: Downloading m3u8 information
[info] uMZvylF4CUE: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~7102 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~7102)
[youtube]     Downloading comment API JSON reply thread 1 (1/~7102)
[youtube]        Downloading comment replies API JSON page 1 (11/~7102)
[youtube]        Downloading comment replies API JSON page 2 (61/~7102)
[youtube] Extracted 100 comments
[info] uMZvylF4CUE: Downloading 1 format(s): 616+251
[info] Writing video subtitles to: scratchpad/uMZvylF4CUE.en-IN.vtt

Deleting existing file scratchpad/uMZvylF4CUE.webp
[info] Downloading video



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=l3bNZpckIg0
[youtube] l3bNZpckIg0: Downloading webpage
[youtube] l3bNZpckIg0: Downloading tv client config
[youtube] l3bNZpckIg0: Downloading tv player API JSON
[youtube] l3bNZpckIg0: Downloading ios player API JSON
[youtube] l3bNZpckIg0: Downloading m3u8 information
[info] l3bNZpckIg0: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~1534 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~1534)
[youtube]     Downloading comment API JSON reply thread 1 (1/~1534)
[youtube]        Downloading comment replies API JSON page 1 (11/~1534)
[youtube] Downloading comment API JSON page 2 (80/~1534)
[youtube] Extracted 100 comments
[info] l3bNZpckIg0: Downloading 1 format(s): 616+251
[info] Writing video subtitles to: scratchpad/l3bNZpckIg0.en-IN.vtt

Deleting existing file scratchpad/l3bNZpckIg0.webp
[info] Downloading video thumbnail 41 .



ERROR: Did not get any data blocks


[info] Writing video thumbnail 41 to: scratchpad/l3bNZpckIg0.webp
[info] Writing video metadata as JSON to: scratchpad/l3bNZpckIg0.info.json
❌ Failed to process video ID: l3bNZpckIg0
[youtube] Extracting URL: https://www.youtube.com/watch?v=_qq2E8_LB98
[youtube] _qq2E8_LB98: Downloading webpage
[youtube] _qq2E8_LB98: Downloading tv client config
[youtube] _qq2E8_LB98: Downloading tv player API JSON
[youtube] _qq2E8_LB98: Downloading ios player API JSON
[youtube] _qq2E8_LB98: Downloading m3u8 information
[info] _qq2E8_LB98: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~6144 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~6144)
[youtube]     Downloading comment API JSON reply thread 1 (1/~6144)
[youtube]        Downloading comment replies API JSON page 1 (11/~6144)
[youtube] Downloading comment API JSON page 2 (79/~6144)
[youtube] Downloading comment API JSON page 3 (99/~6144)
[yout



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=oJ69Yy3epBU
[youtube] oJ69Yy3epBU: Downloading webpage
[youtube] oJ69Yy3epBU: Downloading tv client config
[youtube] oJ69Yy3epBU: Downloading tv player API JSON
[youtube] oJ69Yy3epBU: Downloading ios player API JSON
[youtube] oJ69Yy3epBU: Downloading m3u8 information
[info] oJ69Yy3epBU: Downloading subtitles: en-IN
[youtube] Downloading comment section API JSON
[youtube] Downloading ~2029 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~2029)
[youtube]     Downloading comment API JSON reply thread 1 (1/~2029)
[youtube]        Downloading comment replies API JSON page 1 (11/~2029)
[youtube] Downloading comment API JSON page 2 (50/~2029)
[youtube] Downloading comment API JSON page 3 (70/~2029)
[youtube] Downloading comment API JSON page 4 (90/~2029)
[youtube] Extracted 100 comments
[info] oJ69Yy3epBU: Downloading 1 format(s): 616+251
[info] Writing video subtitles to: scratchpad



ERROR: Did not get any data blocks


[youtube] Extracting URL: https://www.youtube.com/watch?v=QECHcQyQhN0
[youtube] QECHcQyQhN0: Downloading webpage
[youtube] QECHcQyQhN0: Downloading tv client config
[youtube] QECHcQyQhN0: Downloading tv player API JSON
[youtube] QECHcQyQhN0: Downloading ios player API JSON
[youtube] QECHcQyQhN0: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~964 comments
[youtube] Sorting comments by newest first
[youtube] Downloading comment API JSON page 1 (0/~964)
[youtube] Downloading comment API JSON page 2 (20/~964)
[youtube] Downloading comment API JSON page 3 (40/~964)
[youtube] Downloading comment API JSON page 4 (60/~964)
[youtube]     Downloading comment API JSON reply thread 1 (80/~964)
[youtube] Downloading comment API JSON page 5 (81/~964)
[youtube]     Downloading comment API JSON reply thread 1 (83/~964)
[youtube]     Downloading comment API JSON reply thread 2 (87/~964)
[youtube] Extracted 100 comments
[info] QECHcQyQhN0: Downloading 

In [2]:
df_test=pd.read_csv(f'{PROCESSED_COMMENTS_DIR}/comments.csv')
df_test['comments'] = df_test['comments'].apply(json.loads)

In [3]:
df_test.shape

(291, 5)

In [4]:
df_test.tail()

Unnamed: 0,creator_id,id,title,timestamp,comments
286,@VanessaWingårdh,een8H8TIDF4,Retro Tech Revival: Trading AI for Nostalgia &...,1757256000.0,"[{'id': 'Ugwf5IosRA1xwsvLBSN4AaABAg', 'parent'..."
287,@VanessaWingårdh,PUV5Wjmh_CM,Boring Games are Making Millions,1757256000.0,"[{'id': 'Ugy_NeOm9JQie0lxANx4AaABAg', 'parent'..."
288,@VanessaWingårdh,CqSA-nTpkb0,How Instagram Stories hack your brain's reward...,1757256000.0,"[{'id': 'UgzylCkQp-hIlrqup7Z4AaABAg', 'parent'..."
289,@VanessaWingårdh,QQ71Y9cOemM,How a Software Engineer Stays Consistent | My ...,1757256000.0,"[{'id': 'UgxSVaT3iSuK4jthI3J4AaABAg', 'parent'..."
290,@VanessaWingårdh,wSxxlXC9nsM,Why Swedish Nature Laws are Like Open Source #...,1757256000.0,"[{'id': 'UgyU7WkIj3lBFriB4ft4AaABAg', 'parent'..."
