In [1]:
import pandas as pd
import os
import time
import subprocess
import json
from collections import defaultdict

TEMP_DIR = 'scratchpad'
PROCESSED_COMMENTS_DIR = 'processed_comments'
COMMENTS_CSV = f'{PROCESSED_COMMENTS_DIR}/comments.csv'

os.makedirs(TEMP_DIR, exist_ok=True)
os.makedirs(PROCESSED_COMMENTS_DIR, exist_ok=True)

def run_yt_dlp(video_id, max_comments=100):
    filepath = os.path.join(TEMP_DIR, f"{video_id}")
    command = [
        "yt-dlp",
        "--skip-download",
        "--write-comments",
        "--no-warnings",
        "--output", filepath,
        "--extractor-args", f"youtube:max_comments={max_comments}",
        f"https://www.youtube.com/watch?v={video_id}"
    ]
    subprocess.run(command, check=True, text=True)


def parse_comments(video_id):
    with open(os.path.join(TEMP_DIR, f'{video_id}.info.json'), "r", encoding="utf-8") as f:
        data = json.load(f)
    comments = data['comments']
    title = data['title']
    uploader = data['uploader_id']

    comment_dict = {}
    replies = defaultdict(list)
    for c in comments:
        if c.get("parent") and c["parent"] != "root":
            replies[c["parent"]].append({**c, "is_reply": True})
        else:
            comment_dict[c["id"]] = {**c, "replies": [], "is_reply": False}
    for pid, rep in replies.items():
        if pid in comment_dict:
            comment_dict[pid]["replies"] = rep
    return list(comment_dict.values()), title, uploader


def to_process(video_id):
    if not os.path.exists(COMMENTS_CSV):
        return True
    df = pd.read_csv(COMMENTS_CSV)
    return video_id not in df['id'].astype(str).values


# Main loop
df_url_details = pd.read_csv('URLs/url_channel_id.csv')
data_res = []

for video_id in df_url_details.id.astype(str):
    if not to_process(video_id):
        #print(f"⏩ Skipping already processed video ID: {video_id}")
        continue

    try:
        run_yt_dlp(video_id, max_comments=100)
        comments, title, uploader = parse_comments(video_id)

        data_res.append({
            'creator_id': uploader,
            'id': video_id,
            'title': title,
            'timestamp': str(time.time()),
            'comments': json.dumps(comments)
        })

        # Append to CSV immediately to avoid reprocessing on crash
        df = pd.DataFrame(data_res)
        if os.path.exists(COMMENTS_CSV):
            df.to_csv(COMMENTS_CSV, mode='a', index=False, header=False)
        else:
            df.to_csv(COMMENTS_CSV, index=False)
        data_res.clear()

    except Exception as e:
        print(f"❌ Failed to process video ID: {video_id}")
        print(e)

[youtube] Extracting URL: https://www.youtube.com/watch?v=4rvFm18eyfU
[youtube] 4rvFm18eyfU: Downloading webpage
[youtube] 4rvFm18eyfU: Downloading tv client config
[youtube] 4rvFm18eyfU: Downloading tv player API JSON
[youtube] 4rvFm18eyfU: Downloading ios player API JSON
[youtube] 4rvFm18eyfU: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[info] 4rvFm18eyfU: Downloading 1 format(s): 616+251
[info] Writing video metadata as JSON to: scratchpad/4rvFm18eyfU.info.json
❌ Failed to process video ID: 4rvFm18eyfU
'comments'
[youtube] Extracting URL: https://www.youtube.com/watch?v=l3bNZpckIg0
[youtube] l3bNZpckIg0: Downloading webpage
[youtube] l3bNZpckIg0: Downloading tv client config
[youtube] l3bNZpckIg0: Downloading tv player API JSON
[youtube] l3bNZpckIg0: Downloading ios player API JSON
[youtube] l3bNZpckIg0: Downloading m3u8 information
[youtube] Downloading comment section API JSON
[youtube] Downloading ~1561 comments
[youtube] Sorting comments by newest

In [5]:
df_test=pd.read_csv(f'{PROCESSED_COMMENTS_DIR}/comments.csv')
df_test['comments'] = df_test['comments'].apply(json.loads)

In [6]:
df_test.shape

(334, 5)

In [7]:
df_test.tail()

Unnamed: 0,creator_id,id,title,timestamp,comments
329,@SajiSharma,LrmmTHPUGdk,A Stream Full of Twists and Turns,1750535000.0,"[{'id': 'UgxS6qhbuJ284d9OI5J4AaABAg', 'parent'..."
330,@SajiSharma,WK1M8BFldv8,Watching an Alpha Male Podcast Live Because I ...,1750535000.0,"[{'id': 'UgyOMc-Jf7cOWiHTnU14AaABAg', 'parent'..."
331,@SajiSharma,tVgxXOzpN10,"AI Just Became Terrifying, new video as soon a...",1750535000.0,"[{'id': 'UgzbrCmqMh2Beyp7ijh4AaABAg', 'parent'..."
332,@SajiSharma,G66P7X8wQuI,Charlie Kirk thinks we should stop talking dow...,1750535000.0,"[{'id': 'UgwPgvPYslcAaJ7HjcN4AaABAg', 'parent'..."
333,@SajiSharma,aaUn3rrZs-U,"Posting shorts now, does this make me a sellou...",1750535000.0,"[{'id': 'UgwJ-IQWWyl6N1Kk4nB4AaABAg', 'parent'..."
