In [81]:
import pandas as pd
import os
import time
import subprocess
import json
from collections import defaultdict

# Working locations: run_yt_dlp writes into TEMP_DIR and parse_comments reads the
# resulting .info.json files from it; the aggregated CSV lives in PROCESSED_COMMENTS_DIR.
TEMP_DIR = 'scratchpad'
PROCESSED_COMMENTS_DIR = 'processed_comments'
COMMENTS_CSV = PROCESSED_COMMENTS_DIR + '/comments.csv'

# Create both directories up front; ones that already exist are left untouched.
for _directory in (TEMP_DIR, PROCESSED_COMMENTS_DIR):
    os.makedirs(_directory, exist_ok=True)

def run_yt_dlp(video_id, max_comments=100):
    """Invoke the ``yt-dlp`` command-line tool for a single YouTube video.

    The call passes ``--skip-download`` and ``--write-comments`` and uses
    ``TEMP_DIR/<video_id>`` as the output template; ``parse_comments`` later
    reads ``<video_id>.info.json`` from that same directory.

    Args:
        video_id: YouTube video ID, interpolated into the watch URL and the
            output template.
        max_comments: Value passed as the ``youtube:max_comments`` extractor
            argument.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status
            (``check=True``).
    """
    output_template = os.path.join(TEMP_DIR, f"{video_id}")
    extractor_args = f"youtube:max_comments={max_comments}"
    watch_url = f"https://www.youtube.com/watch?v={video_id}"

    # Same argument order as a hand-written command line: flags, options, URL.
    flags = ["--skip-download", "--write-comments", "--no-warnings"]
    options = ["--output", output_template, "--extractor-args", extractor_args]
    subprocess.run(["yt-dlp", *flags, *options, watch_url], check=True, text=True)


def parse_comments(video_id, temp_dir=None):
    """Load a video's yt-dlp ``.info.json`` and nest replies under their parents.

    Args:
        video_id: ID used to locate ``<video_id>.info.json``.
        temp_dir: Directory containing the info file. When omitted, the
            module-level ``TEMP_DIR`` is used, so one-argument calls behave
            as before.

    Returns:
        A ``(threads, title, uploader)`` tuple. ``threads`` is a list of the
        top-level comment dicts, each a copy of the JSON entry with two extra
        keys: ``"is_reply": False`` and ``"replies"`` (a list of reply dicts,
        each carrying ``"is_reply": True``). A comment counts as top-level when
        its ``"parent"`` is missing, falsy, or ``"root"``. Replies whose parent
        is not among the top-level comments are dropped.

    Raises:
        FileNotFoundError: If the info file does not exist.
        KeyError: If ``title`` or ``uploader_id`` is missing from the JSON.
    """
    if temp_dir is None:
        temp_dir = TEMP_DIR
    info_path = os.path.join(temp_dir, f'{video_id}.info.json')
    with open(info_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Treat an absent or null "comments" field as "no comments" rather than
    # raising: the caller swallows exceptions, so a failure here would leave the
    # video unrecorded and cause it to be fetched again on every run.
    comments = data.get('comments') or []
    title = data['title']
    uploader = data['uploader_id']

    comment_dict = {}
    replies = defaultdict(list)
    for c in comments:
        parent_id = c.get("parent")
        if parent_id and parent_id != "root":
            replies[parent_id].append({**c, "is_reply": True})
        else:
            comment_dict[c["id"]] = {**c, "replies": [], "is_reply": False}

    # Attach each group of replies to its parent; orphaned groups are skipped.
    for pid, rep in replies.items():
        if pid in comment_dict:
            comment_dict[pid]["replies"] = rep
    return list(comment_dict.values()), title, uploader


def to_process(video_id, csv_path=None):
    """Return True when ``video_id`` is not yet recorded in the comments CSV.

    Args:
        video_id: Video ID to look up; compared as a string.
        csv_path: CSV file to inspect. When omitted, the module-level
            ``COMMENTS_CSV`` is used, so one-argument calls behave as before.

    Returns:
        ``True`` if the file is missing, is completely empty, or has no row
        whose ``id`` equals ``video_id``; ``False`` otherwise.

    Raises:
        ValueError: If the CSV has a header but no ``id`` column.
    """
    if csv_path is None:
        csv_path = COMMENTS_CSV
    if not os.path.exists(csv_path):
        return True
    try:
        # Load only the id column, and load it as text: dtype inference would
        # turn numeric-looking IDs (e.g. with leading zeros) into numbers that
        # no longer match, and skipping the other columns avoids converting the
        # large JSON "comments" field on every call.
        ids = pd.read_csv(csv_path, usecols=['id'], dtype={'id': str})['id']
    except pd.errors.EmptyDataError:
        # A zero-byte file holds no rows, so nothing has been processed yet.
        return True
    return not ids.eq(str(video_id)).any()


# Main loop: fetch and store comments for every listed video not already in the CSV
df_url_details = pd.read_csv('URLs/url_channel_id.csv')
data_res = []

for video_id in df_url_details['id'].astype(str):
    if not to_process(video_id):
        print(f"⏩ Skipping already processed video ID: {video_id}")
        continue

    try:
        run_yt_dlp(video_id, max_comments=100)
        comments, title, uploader = parse_comments(video_id)

        # One CSV row per video; the nested comment threads are stored as a JSON string.
        record = {
            'creator_id': uploader,
            'id': video_id,
            'title': title,
            'timestamp': str(time.time()),
            'comments': json.dumps(comments),
        }
        data_res.append(record)

        # Write straight away so a crash later in the run does not force this
        # video to be fetched again. The header row is emitted only when the
        # file is being created.
        df = pd.DataFrame(data_res)
        csv_exists = os.path.exists(COMMENTS_CSV)
        df.to_csv(
            COMMENTS_CSV,
            mode='a' if csv_exists else 'w',
            index=False,
            header=not csv_exists,
        )
        data_res.clear()

    except Exception as err:
        # Best effort: report the failure and move on to the next video.
        print(f"❌ Failed to process video ID: {video_id}")
        print(err)

⏩ Skipping already processed video ID: dua91-ntjgU
⏩ Skipping already processed video ID: M88UpaT_iio
⏩ Skipping already processed video ID: yn-IiJODZLc
⏩ Skipping already processed video ID: cwOPw3fBMs4
⏩ Skipping already processed video ID: 6DTys4OCPhs
⏩ Skipping already processed video ID: mGjeWrERxPg
⏩ Skipping already processed video ID: x55BVjUo77U
⏩ Skipping already processed video ID: wvBLz5ViSvs
⏩ Skipping already processed video ID: APA0htT-AV0
⏩ Skipping already processed video ID: jiLTkhkVwP4
⏩ Skipping already processed video ID: dzi66yHoOyA
⏩ Skipping already processed video ID: SyB6dj0kkxU
⏩ Skipping already processed video ID: yzkkzt595WE
⏩ Skipping already processed video ID: FfV434-TN7Q
⏩ Skipping already processed video ID: g0QYWoh8hsM
⏩ Skipping already processed video ID: 7_5WdJlXKLo
⏩ Skipping already processed video ID: pOIEdCZvbrQ
⏩ Skipping already processed video ID: -VK7GF3bgnA
⏩ Skipping already processed video ID: 0UtHedAXIkA
⏩ Skipping already processed vi

In [82]:
# Reload the accumulated results and decode each row's JSON-encoded comment list
# back into Python objects.
df_test = pd.read_csv(COMMENTS_CSV)
df_test['comments'] = df_test['comments'].map(json.loads)

In [83]:
# Display the reloaded frame; the rendered output shows a truncated preview
# (first and last rows) rather than every row.
df_test

Unnamed: 0,creator_id,id,title,timestamp,comments
0,@smiletojannah,dua91-ntjgU,Irishman Gets Revenge on Israeli Soldier,1.748887e+09,"[{'id': 'UgxdGNAomfWHXVZoCvB4AaABAg', 'parent'..."
1,@smiletojannah,M88UpaT_iio,He Ran Over 72 — Media Defended Him #liverpool,1.748887e+09,"[{'id': 'Ugy7ZCIFDTQVmWWm43t4AaABAg', 'parent'..."
2,@smiletojannah,yn-IiJODZLc,LIVE Q&A With Smile2Jannah,1.748887e+09,"[{'id': 'UgyRvBg6srdrqo-Qzx14AaABAg', 'parent'..."
3,@smiletojannah,cwOPw3fBMs4,LIVE Q&A With Smile2Jannah (Iftaar Special),1.748887e+09,"[{'id': 'UgxT5ldlprckLSZLh714AaABAg', 'parent'..."
4,@thedeshbhakt,6DTys4OCPhs,Now Youtube Vs Youtubers? | How Content Creato...,1.748887e+09,"[{'id': 'Ugxe5Lr_jG2H2ZX3FRx4AaABAg', 'parent'..."
...,...,...,...,...,...
163,@MuslimSkeptic,WFusrTpA_cM,Khaled: Secularizing Islam Through Christmas,1.748970e+09,"[{'id': 'UgzQAB-7vGnp1pIbot14AaABAg', 'parent'..."
164,@smiletojannah,qXNUVNAxghk,Charlie Kirk Thought Islam Was Weak… Then the ...,1.748970e+09,"[{'id': 'Ugw0mtvHRmrF9c-ODI14AaABAg', 'parent'..."
165,@thedeshbhakt,UG0Niaf_t78,From 'Content Creators' To Tourists - How Gawa...,1.748970e+09,"[{'id': 'Ugx0CwLqtYdQRIu3m5l4AaABAg', 'parent'..."
166,@smiletojannah,0PZq_OY28aM,Surprise! Christians Do Too 👀,1.749045e+09,"[{'id': 'UgwNHqOT17NfCIGH2HV4AaABAg', 'parent'..."
