In [None]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm   

# ------------------------------------------------------
# Load API key from .env
# ------------------------------------------------------
load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY")
BASE_URL = "https://www.googleapis.com/youtube/v3"

# ------------------------------------------------------
# 1. Fetch video metadata (title, views, likes, channel info)
# ------------------------------------------------------
def get_video_metadata(video_id):
    """Fetch title, channel and engagement statistics for one YouTube video.

    Queries the ``videos`` endpoint with ``part=snippet,statistics``.

    Args:
        video_id: ID of the video to look up.

    Returns:
        dict with the keys ``channel_name``, ``channel_id``, ``video_id``,
        ``video_title``, ``video_published_at``, ``view_count`` and
        ``like_count``. Both counts are ints (0 when the statistic is absent
        from the response).

    Raises:
        requests.HTTPError: if the API answers with an error status
            (bad key, exhausted quota, ...).
        IndexError: if the response holds no item for ``video_id``.
    """
    params = {
        "id": video_id,
        "key": API_KEY,
        "part": "snippet,statistics"
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(f"{BASE_URL}/videos", params=params, timeout=30)
    # Surface API errors directly instead of a confusing KeyError on "items".
    response.raise_for_status()

    items = response.json().get("items", [])
    if not items:
        raise IndexError(f"No video found for id {video_id!r}")
    item = items[0]
    snippet = item["snippet"]
    stats = item["statistics"]

    # The counts are cast to int so the value has one type whether it came
    # from the response (presumably a numeric string -- see the API
    # reference) or from the 0 fallback.
    meta = {
        "channel_name": snippet["channelTitle"],
        "channel_id": snippet["channelId"],
        "video_id": video_id,
        "video_title": snippet["title"],
        "video_published_at": snippet["publishedAt"],
        "view_count": int(stats.get("viewCount", 0)),
        "like_count": int(stats.get("likeCount", 0))
    }
    return meta

# ------------------------------------------------------
# 2. Get all top-level comments for the video
# ------------------------------------------------------
def get_comment_threads(video_id):
    """Collect every comment thread of a video, following pagination.

    Requests the ``commentThreads`` endpoint 100 threads at a time, ordered
    by time and with plain-text bodies, until the response carries no
    ``nextPageToken``.

    Args:
        video_id: ID of the video whose threads are fetched.

    Returns:
        list of the raw thread items from all pages, in response order.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    # Everything except the page token is constant across pages.
    params = {
        "part": "snippet",
        "videoId": video_id,
        "maxResults": 100,
        "order": "time",
        "textFormat": "plainText",
        "key": API_KEY,
    }
    threads = []

    while True:
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(f"{BASE_URL}/commentThreads", params=params, timeout=30)
        # An error payload has neither "items" nor "nextPageToken"; without
        # this check the loop would end quietly with a truncated result.
        response.raise_for_status()
        page = response.json()

        threads.extend(page.get("items", []))
        next_token = page.get("nextPageToken")
        if not next_token:
            return threads
        params["pageToken"] = next_token

# ------------------------------------------------------
# 3. Get all replies for a given top-level comment
# ------------------------------------------------------
def get_replies(parent_id):
    """Collect every reply to one top-level comment, following pagination.

    Requests the ``comments`` endpoint 100 replies at a time with plain-text
    bodies until the response carries no ``nextPageToken``.

    Args:
        parent_id: ID of the top-level comment whose replies are fetched.

    Returns:
        list of the raw reply items from all pages, in response order.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    # Everything except the page token is constant across pages.
    params = {
        "part": "snippet",
        "parentId": parent_id,
        "maxResults": 100,
        "textFormat": "plainText",
        "key": API_KEY,
    }
    replies = []

    while True:
        # A timeout keeps a stalled connection from hanging the run forever.
        response = requests.get(f"{BASE_URL}/comments", params=params, timeout=30)
        # An error payload has neither "items" nor "nextPageToken"; without
        # this check replies would be dropped without any signal.
        response.raise_for_status()
        page = response.json()

        replies.extend(page.get("items", []))
        next_token = page.get("nextPageToken")
        if not next_token:
            return replies
        params["pageToken"] = next_token

# ------------------------------------------------------
# 4. Normalize a single comment/reply
# ------------------------------------------------------
def extract_comment_data(item, is_reply=False, parent_id=None, author_channel_id=None, meta=None, is_pinned=False):
    """Flatten one comment or reply into a single row dict.

    Args:
        item: mapping with an ``"id"`` and a ``"snippet"`` mapping; optional
            snippet fields fall back to ``""`` (text) or ``0`` (like count).
        is_reply: stored as-is in the ``is_reply`` column.
        parent_id: stored as-is in the ``parent_comment_id`` column.
        author_channel_id: accepted for call compatibility; not used here.
        meta: video-level dict with the keys ``channel_name``,
            ``channel_id``, ``video_id``, ``video_title``,
            ``video_published_at``, ``view_count`` and ``like_count``.
            When omitted, the video-level columns are filled with ``None``.
        is_pinned: stored as-is in the ``is_pinned`` column.

    Returns:
        dict with 16 keys: the video-level columns first, then the
        comment-level columns.

    Raises:
        KeyError: if ``item`` lacks ``"id"``/``"snippet"`` or a supplied
            ``meta`` lacks one of its keys.
    """
    snippet = item["snippet"]

    if meta is None:
        # The signature advertises meta=None, so honour it rather than
        # failing with a TypeError on the first subscript.
        meta = dict.fromkeys((
            "channel_name", "channel_id", "video_id", "video_title",
            "video_published_at", "view_count", "like_count",
        ))

    # `or {}` also covers an explicit null authorChannelId.
    author_channel = snippet.get("authorChannelId") or {}

    data = {
        "channel_name": meta["channel_name"],
        "channel_id": meta["channel_id"],
        "video_id": meta["video_id"],
        "video_title": meta["video_title"],
        "video_published_at": meta["video_published_at"],
        "view_count": meta["view_count"],
        "video_like_count": meta["like_count"],
        "comment_id": item["id"],
        "comment_text": snippet.get("textDisplay", ""),
        "author": snippet.get("authorDisplayName", ""),
        "author_id": author_channel.get("value", ""),
        "comment_like_count": snippet.get("likeCount", 0),
        "comment_published_at": snippet.get("publishedAt", ""),
        "is_pinned": is_pinned,
        "is_reply": is_reply,
        "parent_comment_id": parent_id,
    }
    return data

# ------------------------------------------------------
# 5. Fetch all comments and replies (with tqdm progress bar)
# ------------------------------------------------------
def fetch_all_comments(video_id):
    """Build one row per comment and reply on a video, skipping the uploader's own.

    Fetches the video metadata and all comment threads, then, for every
    thread with a non-zero reply count, its replies. Rows whose author is the
    video's channel are left out, except top-level comments flagged as
    pinned. Progress is reported through tqdm bars.

    Args:
        video_id: ID of the video to harvest.

    Returns:
        list of row dicts as produced by ``extract_comment_data``; each
        thread's top-level comment comes before its replies.
    """
    meta = get_video_metadata(video_id)
    owner_id = meta["channel_id"]
    rows = []

    for thread in tqdm(get_comment_threads(video_id), desc="Processing threads"):
        thread_info = thread["snippet"]
        top_level = thread_info["topLevelComment"]
        top_comment_id = top_level["id"]
        # NOTE(review): confirm the API actually returns a "pinned" key on
        # the thread snippet -- if it never does, this is always False.
        pinned = thread_info.get("pinned", False)

        top_row = extract_comment_data(
            {"id": top_comment_id, "snippet": top_level["snippet"]},
            is_reply=False,
            parent_id=None,
            author_channel_id=owner_id,
            meta=meta,
            is_pinned=pinned,
        )
        # The uploader's own comment is kept only when it is pinned.
        if pinned or top_row["author_id"] != owner_id:
            rows.append(top_row)

        # Skip the extra request for threads that have no replies.
        if thread_info.get("totalReplyCount", 0) <= 0:
            continue

        reply_rows = (
            extract_comment_data(
                reply,
                is_reply=True,
                parent_id=top_comment_id,
                author_channel_id=owner_id,
                meta=meta,
            )
            for reply in tqdm(get_replies(top_comment_id), desc=f"Replies to {top_comment_id}", leave=False)
        )
        rows.extend(row for row in reply_rows if row["author_id"] != owner_id)

    return rows

# ------------------------------------------------------
# Example run
# ------------------------------------------------------
video_id = "zECoaEZRRFU"
comments = fetch_all_comments(video_id)

df = pd.DataFrame(comments)

# Ensure channel_name is the first column
cols = ["channel_name"] + [c for c in df.columns if c != "channel_name"]
# reindex instead of df[cols]: when no comments were collected the frame has
# no columns at all and df[cols] would raise KeyError; reindex yields an
# empty table instead. For a non-empty frame the result is the same.
df = df.reindex(columns=cols)


Processing threads: 100%|██████████| 4285/4285 [00:55<00:00, 77.30it/s] 


In [31]:
import os

# Project root = parent of the current working directory
# (assumes the notebook is run from inside "notebooks/").
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Create <project_root>/data/raw if it is not there yet.
data_dir = os.path.join(project_root, "data", "raw")
os.makedirs(data_dir, exist_ok=True)

# Write the table as UTF-8 with a BOM ("utf-8-sig"), without the index column.
csv_name = "2025_Pod_30_No.1_Nitric_Oxide_Expert.csv"
output_path = os.path.join(data_dir, csv_name)
df.to_csv(output_path, encoding="utf-8-sig", index=False)

print(f"✅ Saved {len(df)} comments to {output_path}")

✅ Saved 6940 comments to /Users/riadanas/Desktop/MLE Diary of a CEO/data/raw/2025_Pod_30_No.1_Nitric_Oxide_Expert.csv


In [32]:
# Sanity check: (number of rows, number of columns) of df.
df.shape

(6940, 16)

In [4]:
# Reorder df in place so the rows with the highest comment_like_count come first.
df.sort_values("comment_like_count", ascending=False, inplace=True)

In [5]:
# Preview the first two rows of df.
df.head(2)

Unnamed: 0,channel_name,channel_id,video_id,video_title,video_published_at,view_count,video_like_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id
2130,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,G4hkYDjPSFs,Secret To Living Without Fear & Anxiety Foreve...,2025-03-13T08:00:09Z,2611959,57541,Ugzqd6_jsnewWeYx6Yh4AaABAg,most people stay stuck because they avoid book...,@Michael_Saidon9988,UC0FUO57nnAV49MbQ8ex8LwA,2300,2025-03-13T23:30:22Z,False,False,
2056,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,G4hkYDjPSFs,Secret To Living Without Fear & Anxiety Foreve...,2025-03-13T08:00:09Z,2611959,57541,Ugzp0f2sPaNXupqTS6J4AaABAg,Awesome video. Here are few things that helped...,@Loreta04-t3b,UCnztzfJWxq63j1Ikj6M0Wtg,1693,2025-03-14T00:07:23Z,False,False,
