In [1]:
import os
import pandas as pd
from googleapiclient.discovery import build

# Read the YouTube Data API key from the YOUTUBE_API_KEY environment variable
# so the secret is never stored in the notebook (None when the variable is unset).
API_KEY = os.getenv("YOUTUBE_API_KEY")



In [5]:
import os
import pandas as pd
from googleapiclient.discovery import build
from textblob import TextBlob
import googleapiclient.http

# Read the API key from the environment instead of embedding it in the notebook.
# NOTE(review): the key that used to be hardcoded on this line has been exposed
# to everyone with access to this file and should be revoked and rotated.
API_KEY = os.getenv("YOUTUBE_API_KEY")
if not API_KEY:
    # Fail here with a clear message rather than later inside the API client.
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set; export it before running this notebook."
    )

# Remove any service-account pointer from this process's environment so that
# only the API key above is used for authentication.
os.environ.pop('GOOGLE_APPLICATION_CREDENTIALS', None)

# Build the YouTube service
youtube = build(
    "youtube",
    "v3",
    developerKey=API_KEY,
    credentials=None,  # Explicitly tell it to use API key, not service account
    cache_discovery=False,  # Skip the discovery-document cache (original note: helps avoid ADC issues)
)


# -----------------------------
# Video Search
# -----------------------------
def search_videos(query, max_results=10):
    """Return the IDs of videos matching ``query``.

    Issues a single ``search().list`` request restricted to ``type="video"``
    and collects ``item["id"]["videoId"]`` from every returned item.

    Args:
        query: Free-text search string, passed to the API as ``q``.
        max_results: Value forwarded as ``maxResults``.

    Returns:
        A list of video ID strings in the order the API returned them; empty
        when the response carries no ``items``.
    """
    request = youtube.search().list(
        part="snippet",
        q=query,
        type="video",
        maxResults=max_results,
    )
    payload = request.execute()

    found_ids = []
    for hit in payload.get("items", []):
        found_ids.append(hit["id"]["videoId"])
    return found_ids


# -----------------------------
# Video Metadata
# -----------------------------
def get_video_metadata(video_id):
    """Fetch title, description, channel and counters for one video.

    Calls ``videos().list`` with ``part="snippet,statistics"`` and flattens the
    first returned item into a dict whose keys are prefixed with ``video_``.

    Args:
        video_id: ID of the video to look up.

    Returns:
        An empty dict when the response has no items. Otherwise a dict with
        ``video_title``, ``video_description``, ``video_publishedAt`` and
        ``video_channel`` (each ``None`` when absent from the snippet) plus
        ``video_viewCount``, ``video_likeCount`` and ``video_commentCount``
        converted to ``int`` (``0`` when absent from the statistics).
    """
    request = youtube.videos().list(part="snippet,statistics", id=video_id)
    found = request.execute().get("items")
    if not found:
        return {}

    first = found[0]
    info = first.get("snippet", {})
    counters = first.get("statistics", {})

    text_fields = (
        ("video_title", "title"),
        ("video_description", "description"),
        ("video_publishedAt", "publishedAt"),
        ("video_channel", "channelTitle"),
    )
    count_fields = (
        ("video_viewCount", "viewCount"),
        ("video_likeCount", "likeCount"),
        ("video_commentCount", "commentCount"),
    )

    metadata = {}
    for column, api_field in text_fields:
        metadata[column] = info.get(api_field)
    for column, api_field in count_fields:
        # Counters are converted with int(); a missing counter becomes 0.
        metadata[column] = int(counters.get(api_field, 0))
    return metadata


# -----------------------------
# Comment Extraction
# -----------------------------
def get_video_comments(video_id, max_comments=200):
    """Collect up to ``max_comments`` top-level comments for a single video.

    Pages through ``commentThreads().list`` and, for every thread, records the
    top-level comment's text, its whitespace-separated word count, TextBlob
    polarity and subjectivity, like and reply counts, the comment timestamp,
    the video ID, and the video-level fields from ``get_video_metadata``.

    Args:
        video_id: ID of the video whose comment threads are fetched.
        max_comments: Upper bound on the number of comments returned. Zero or
            a negative value returns an empty list without calling the API.

    Returns:
        A list of flat dicts, one per comment, in the order the API returned
        them. It may be shorter than ``max_comments`` when the video has fewer
        comments.

    Raises:
        Any exception raised by the API client's ``execute()`` propagates
        unchanged. NOTE(review): presumably this includes an HTTP error for
        videos with comments disabled - confirm against the API docs.
    """
    # Nothing can be collected, so skip the metadata request as well.
    if max_comments <= 0:
        return []

    page_limit = 100  # largest page size commentThreads.list accepts

    video_meta = get_video_metadata(video_id)
    comments = []
    next_page_token = None

    while len(comments) < max_comments:
        remaining = max_comments - len(comments)
        response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            # Request only what is still needed instead of a full page.
            maxResults=min(page_limit, remaining),
            pageToken=next_page_token,
            textFormat="plainText"
        ).execute()

        # Slice defensively in case a page holds more items than requested.
        for item in response.get("items", [])[:remaining]:
            thread = item["snippet"]
            snippet = thread["topLevelComment"]["snippet"]
            text = snippet.get("textDisplay", "")
            sentiment = TextBlob(text).sentiment

            comments.append({
                # --- TEXT & ANALYSIS (matched to Reddit) ---
                "text": text,
                "comment_length": len(text.split()),
                "sentiment_polarity": sentiment.polarity,
                "sentiment_subjectivity": sentiment.subjectivity,

                # --- ENGAGEMENT ---
                "likes": snippet.get("likeCount", 0),
                "num_replies": thread.get("totalReplyCount", 0),

                # --- TIME ---
                "created_utc": snippet.get("publishedAt"),

                # --- VIDEO-LEVEL METADATA ---
                "video_id": video_id,
                **video_meta
            })

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return comments


# -----------------------------
# Full Pipeline
# -----------------------------
def scrape_youtube_comments(query="AI artificial intelligence", num_videos=2, comments_per_video=10):
    """Search for videos and gather their comments into one DataFrame.

    Looks up ``num_videos`` video IDs with ``search_videos``, fetches up to
    ``comments_per_video`` comments for each with ``get_video_comments``, and
    prints a progress line per step.

    Args:
        query: Search string handed to ``search_videos``.
        num_videos: Passed to ``search_videos`` as ``max_results``.
        comments_per_video: Passed to ``get_video_comments`` as ``max_comments``.

    Returns:
        A ``pandas.DataFrame`` with one row per collected comment, in the
        order the videos and comments were returned.
    """
    print(f"Searching for videos about: {query}")

    rows = []
    for current_id in search_videos(query, max_results=num_videos):
        print(f"Scraping comments for video: {current_id}")
        batch = get_video_comments(current_id, max_comments=comments_per_video)
        print(f"Collected {len(batch)} comments")
        rows += batch

    return pd.DataFrame(rows)


# -----------------------------
# Main Execution
# -----------------------------
if __name__ == "__main__":
    # Small demo scrape: two videos with ten comments each. The resulting
    # frame is bound to the module-level name `df`.
    df = scrape_youtube_comments(
        "AI opinions ethical AI artificial intelligence",
        num_videos=2, comments_per_video=10,
    )


Searching for videos about: AI opinions ethical AI artificial intelligence
Scraping comments for video: eXdVDhOGqoE
Collected 10 comments
Scraping comments for video: KiT0T12Yyno
Collected 10 comments


In [6]:
# Preview the first rows of the scraped comments DataFrame.
df.head()

Unnamed: 0,text,comment_length,sentiment_polarity,sentiment_subjectivity,likes,num_replies,created_utc,video_id,video_title,video_description,video_publishedAt,video_channel,video_viewCount,video_likeCount,video_commentCount
0,Important ethical talk from November 2023 arou...,242,0.119231,0.658974,0,0,2025-12-02T16:20:06Z,eXdVDhOGqoE,"AI Is Dangerous, but Not for the Reasons You T...",AI won't kill us all — but that doesn't make i...,2023-11-06T12:00:37Z,TED,1758425,31933,2686
1,"Ai sucks... it is limited and sterile, it will...",26,-0.342857,0.391964,0,0,2025-12-02T16:14:42Z,eXdVDhOGqoE,"AI Is Dangerous, but Not for the Reasons You T...",AI won't kill us all — but that doesn't make i...,2023-11-06T12:00:37Z,TED,1758425,31933,2686
2,Woke narrative. Boring.,3,-1.0,1.0,0,0,2025-11-28T06:25:42Z,eXdVDhOGqoE,"AI Is Dangerous, but Not for the Reasons You T...",AI won't kill us all — but that doesn't make i...,2023-11-06T12:00:37Z,TED,1758425,31933,2686
3,Copyright is an interesting phenomenon. Litera...,135,0.146491,0.383772,0,1,2025-11-27T21:07:04Z,eXdVDhOGqoE,"AI Is Dangerous, but Not for the Reasons You T...",AI won't kill us all — but that doesn't make i...,2023-11-06T12:00:37Z,TED,1758425,31933,2686
4,AI allows the ability to monitor every single ...,25,-0.030357,0.278571,0,0,2025-11-27T21:01:01Z,eXdVDhOGqoE,"AI Is Dangerous, but Not for the Reasons You T...",AI won't kill us all — but that doesn't make i...,2023-11-06T12:00:37Z,TED,1758425,31933,2686
