In [79]:
import os
import requests
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm   

# ------------------------------------------------------
# Load API key from .env
# ------------------------------------------------------
# Pull variables from a local .env file (if present) into the process environment.
load_dotenv()

# Sent as the `key` query parameter on every request made by this notebook.
API_KEY = os.getenv("YOUTUBE_API_KEY")
if not API_KEY:
    # Fail fast with an actionable message instead of letting the first API
    # call blow up later with an opaque KeyError on the response payload.
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set; add it to your .env file or environment."
    )

# Root URL shared by the videos, commentThreads and comments endpoints below.
BASE_URL = "https://www.googleapis.com/youtube/v3"

# ------------------------------------------------------
# 1. Fetch video metadata (title, views, likes, channel info, + description)
# ------------------------------------------------------
def get_video_metadata(video_id):
    """Fetch channel info, title, description and statistics for one video.

    Args:
        video_id: YouTube video ID, sent as the ``id`` parameter of the
            ``videos`` endpoint.

    Returns:
        dict with keys ``channel_name``, ``channel_id``, ``video_id``,
        ``video_title``, ``video_description`` (newlines replaced by spaces,
        stripped, truncated to 500 characters), ``video_published_at``,
        ``view_count``, ``like_count`` and ``comment_count``. All three
        counts are ints and fall back to 0 when the response omits them.

    Raises:
        RuntimeError: if the response body carries an ``error`` payload.
        LookupError: if the response contains no item for ``video_id``.
    """
    params = {
        "id": video_id,
        "key": API_KEY,
        "part": "snippet,statistics",
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    payload = requests.get(f"{BASE_URL}/videos", params=params, timeout=30).json()

    # Report API failures from the JSON body; this avoids a bare KeyError on
    # "items" and keeps the request URL (which contains the key) out of the message.
    if "error" in payload:
        error = payload["error"]
        detail = error.get("message", error) if isinstance(error, dict) else error
        raise RuntimeError(f"YouTube API error for video {video_id!r}: {detail}")

    items = payload.get("items") or []
    if not items:
        raise LookupError(f"No video found for id {video_id!r}")

    item = items[0]
    snippet = item["snippet"]
    stats = item.get("statistics", {})

    # Truncate description to first 500 characters (clean line breaks)
    description = snippet.get("description", "").replace("\n", " ").strip()[:500]

    return {
        "channel_name": snippet["channelTitle"],
        "channel_id": snippet["channelId"],
        "video_id": video_id,
        "video_title": snippet["title"],
        "video_description": description,
        "video_published_at": snippet["publishedAt"],
        # The API delivers counts as strings; cast all three so the columns
        # are numeric and consistent with each other.
        "view_count": int(stats.get("viewCount", 0)),
        "like_count": int(stats.get("likeCount", 0)),
        "comment_count": int(stats.get("commentCount", 0)),
    }


# ------------------------------------------------------
# 2. Get all top-level comments for the video
# ------------------------------------------------------
def get_comment_threads(video_id):
    """Return every comment-thread item for a video, following pagination.

    Requests pages of up to 100 threads from the ``commentThreads`` endpoint
    (``order=time``, ``textFormat=plainText``) until the response no longer
    carries a ``nextPageToken``.

    Args:
        video_id: YouTube video ID whose threads are listed.

    Returns:
        list of the raw ``items`` dicts from all pages, in response order.

    Raises:
        RuntimeError: if any page comes back with an ``error`` payload.
            Previously such a page was treated as "no more items", which
            silently returned a truncated list.
    """
    # Built once; only the page token changes between requests.
    params = {
        "part": "snippet",
        "videoId": video_id,
        "maxResults": 100,
        "order": "time",
        "textFormat": "plainText",
        "key": API_KEY,
    }
    threads = []

    while True:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        payload = requests.get(
            f"{BASE_URL}/commentThreads", params=params, timeout=30
        ).json()

        # Report failures from the JSON body so the message never includes the
        # request URL (which contains the API key).
        if "error" in payload:
            error = payload["error"]
            detail = error.get("message", error) if isinstance(error, dict) else error
            raise RuntimeError(
                f"YouTube API error while listing threads for {video_id!r}: {detail}"
            )

        threads.extend(payload.get("items", []))

        next_token = payload.get("nextPageToken")
        if not next_token:
            return threads
        params["pageToken"] = next_token

# ------------------------------------------------------
# 3. Get all replies for a given top-level comment
# ------------------------------------------------------
def get_replies(parent_id):
    """Return every reply item under one top-level comment, following pagination.

    Requests pages of up to 100 replies from the ``comments`` endpoint
    (``textFormat=plainText``) until the response no longer carries a
    ``nextPageToken``.

    Args:
        parent_id: ID of the top-level comment, sent as ``parentId``.

    Returns:
        list of the raw ``items`` dicts from all pages, in response order.

    Raises:
        RuntimeError: if any page comes back with an ``error`` payload.
            Previously such a page was treated as "no more items", which
            silently returned a truncated list.
    """
    # Built once; only the page token changes between requests.
    params = {
        "part": "snippet",
        "parentId": parent_id,
        "maxResults": 100,
        "textFormat": "plainText",
        "key": API_KEY,
    }
    replies = []

    while True:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        payload = requests.get(
            f"{BASE_URL}/comments", params=params, timeout=30
        ).json()

        # Report failures from the JSON body so the message never includes the
        # request URL (which contains the API key).
        if "error" in payload:
            error = payload["error"]
            detail = error.get("message", error) if isinstance(error, dict) else error
            raise RuntimeError(
                f"YouTube API error while listing replies to {parent_id!r}: {detail}"
            )

        replies.extend(payload.get("items", []))

        next_token = payload.get("nextPageToken")
        if not next_token:
            return replies
        params["pageToken"] = next_token

# ------------------------------------------------------
# 4. Normalize a single comment/reply
# ------------------------------------------------------
def extract_comment_data(item, is_reply=False, parent_id=None, author_channel_id=None, meta=None, is_pinned=False):
    """Flatten one comment item and its video's metadata into a single row dict.

    Args:
        item: comment dict providing ``id`` and ``snippet``.
        is_reply: stored as-is under ``is_reply``.
        parent_id: stored as-is under ``parent_comment_id``.
        author_channel_id: accepted for call-site compatibility; not read here.
        meta: video metadata dict (as built by ``get_video_metadata``).
        is_pinned: stored as-is under ``is_pinned``.

    Returns:
        dict whose keys appear in a fixed order: video-level fields first,
        then comment-level fields, then the three flags/links.
    """
    comment = item["snippet"]

    # Video-level columns, copied from the metadata dict.
    row = {key: meta[key] for key in ("channel_name", "channel_id", "video_id", "video_title")}
    row["video_description"] = meta.get("video_description", "")
    row["video_published_at"] = meta["video_published_at"]
    row["view_count"] = meta["view_count"]
    row["video_like_count"] = meta["like_count"]  # renamed to avoid clashing with the comment's likes
    row["comment_count"] = meta["comment_count"]

    # Comment-level columns; optional snippet fields fall back to "" or 0.
    row["comment_id"] = item["id"]
    row["comment_text"] = comment.get("textDisplay", "")
    row["author"] = comment.get("authorDisplayName", "")
    row["author_id"] = comment.get("authorChannelId", {}).get("value", "")
    row["comment_like_count"] = comment.get("likeCount", 0)
    row["comment_published_at"] = comment.get("publishedAt", "")

    # Thread-position flags supplied by the caller.
    row["is_pinned"] = is_pinned
    row["is_reply"] = is_reply
    row["parent_comment_id"] = parent_id
    return row

# ------------------------------------------------------
# 5. Fetch all comments and replies (with tqdm progress bar)
# ------------------------------------------------------
def fetch_all_comments(video_id):
    """Collect flattened rows for the top-level comments and replies of a video.

    Fetches the video metadata, then every comment thread, and for threads
    reporting at least one reply also fetches the replies. Rows authored by
    the video's own channel are dropped, except top-level comments whose
    thread carries a truthy ``pinned`` flag.

    NOTE(review): confirm that thread snippets actually expose ``pinned``;
    when the key is absent it defaults to False and the exception never applies.

    Args:
        video_id: YouTube video ID.

    Returns:
        list of row dicts from ``extract_comment_data``, each top-level
        comment followed by its retained replies.
    """
    meta = get_video_metadata(video_id)
    owner_id = meta["channel_id"]
    rows = []

    for thread in tqdm(get_comment_threads(video_id), desc="Processing threads"):
        thread_info = thread["snippet"]
        top_level = thread_info["topLevelComment"]
        thread_id = top_level["id"]
        pinned = thread_info.get("pinned", False)

        # Hand over only the id and snippet of the top-level comment.
        top_row = extract_comment_data(
            {"id": thread_id, "snippet": top_level["snippet"]},
            is_reply=False,
            parent_id=None,
            author_channel_id=owner_id,
            meta=meta,
            is_pinned=pinned,
        )
        written_by_owner = top_row["author_id"] == owner_id
        if not written_by_owner or pinned:
            rows.append(top_row)

        # Skip the extra replies request when the thread reports no replies.
        if thread_info.get("totalReplyCount", 0) <= 0:
            continue

        reply_bar = tqdm(get_replies(thread_id), desc=f"Replies to {thread_id}", leave=False)
        for reply in reply_bar:
            reply_row = extract_comment_data(
                reply,
                is_reply=True,
                parent_id=thread_id,
                author_channel_id=owner_id,
                meta=meta,
            )
            if reply_row["author_id"] == owner_id:
                continue
            rows.append(reply_row)

    return rows

# ------------------------------------------------------
# Example run
# ------------------------------------------------------
# Video whose comment section is collected in this run.
video_id = "ZHuZ_8VYCWA"
comments = fetch_all_comments(video_id)

# One row per retained comment or reply.
df = pd.DataFrame(comments)

# Ensure channel_name is the first column. The membership check matters when
# no comments were retained: the frame then has no columns at all, and
# selecting "channel_name" unconditionally would raise a KeyError.
cols = [c for c in df.columns if c != "channel_name"]
if "channel_name" in df.columns:
    cols = ["channel_name"] + cols
df = df[cols]

Processing threads: 100%|██████████| 15083/15083 [03:56<00:00, 63.77it/s] 


In [80]:
# Sanity check: (rows, columns) of the assembled comment table.
df.shape

(24551, 18)

In [81]:
# Rank comments from most to least liked. Reassigning the result (rather than
# inplace=True) keeps the call chainable and avoids mutating the frame in place.
df = df.sort_values(by="comment_like_count", ascending=False)

In [82]:
# Peek at the first two rows; after the descending sort on comment_like_count
# these are the two most-liked comments.
df.head(2)

Unnamed: 0,channel_name,channel_id,video_id,video_title,video_description,video_published_at,view_count,video_like_count,comment_count,comment_id,comment_text,author,author_id,comment_like_count,comment_published_at,is_pinned,is_reply,parent_comment_id
19317,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,ZHuZ_8VYCWA,DEBATE: Feminist Women Vs Non-Feminist Women,Has modern feminism betrayed the very women it...,2025-06-19T07:00:24Z,1968583,46664,24218,UgwTyiHkmp_0h1qTNjh4AaABAg,The lady in red is using her personal experien...,@coachschuman,UCIK83dbsz-IacK7h3wcdlIw,16113,2025-06-19T13:17:54Z,False,False,
20889,The Diary Of A CEO,UCGq-a57w-aPwyi3pW7XLiHw,ZHuZ_8VYCWA,DEBATE: Feminist Women Vs Non-Feminist Women,Has modern feminism betrayed the very women it...,2025-06-19T07:00:24Z,1968583,46664,24218,UgwbTDzrIWyv3LoacTx4AaABAg,2 are focused on the long term negative effect...,@Real_Rizz,UCxfyMXaizMdJa6FQIvsG27Q,9882,2025-06-19T12:43:39Z,False,False,


In [83]:
# Display the stored description of the first row in full, to eyeball the
# cleaned-up, 500-character-capped text produced by get_video_metadata.
df["video_description"].iloc[0]

"Has modern feminism betrayed the very women it promised to empower? Deborah France-White (Guilty Feminist), Louise Perry, and Erica Komisar go head-to-head on sexual freedom.  Deborah Frances-White is a bestselling author and host of The Guilty Feminist podcast, Louise Perry is a journalist and author of The Case Against the Sexual Revolution, and Erica Komisar is a clinical social worker, psychoanalyst, and author of books such as, ‘Chicken Little the Sky Isn't Falling: Raising Resilient Adoles"

In [84]:
import os

# The notebook is assumed to run from "<project>/notebooks", so the project
# root is the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Raw exports go to <project>/data/raw.
data_dir = os.path.join(project_root, "data", "raw")
output_path = os.path.join(data_dir, "2025_53_Feminism.csv")

# Create the target folder (and any missing parents) before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# utf-8-sig prepends a BOM to the file (presumably so spreadsheet tools
# detect UTF-8 -- confirm); the DataFrame index is not written.
df.to_csv(output_path, encoding="utf-8-sig", index=False)

print(f"✅ Saved {len(df)} comments to {output_path}")

✅ Saved 24551 comments to /Users/riadanas/Desktop/steven_bartlett_project/data/raw/2025_53_Feminism.csv
