In [3]:
import os
import sys
from pathlib import Path
import asyncio
import pandas as pd
from datetime import datetime, timedelta

# Add project root to Python path.
# The notebook is expected to run from a sub-folder of the project, so the
# parent of the working directory is treated as the project root.
project_root = Path.cwd().parent
# Guard the append so re-running this cell does not stack duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# Import collectors
from collectors.collectors import (
    collect_youtube_data,
)

# Load environment variables (python-dotenv)
from dotenv import load_dotenv
load_dotenv()

# Make sure the raw-data output folder exists before any cell writes to it;
# an already existing folder is left untouched.
os.makedirs("../data/raw", exist_ok=True)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## YouTube Data Collection

In [5]:
# Target videos grouped by category; each entry is (video id, video title).
_VIDEOS_BY_CATEGORY = {
    # Official Content
    "official": [
        ("HfWLgELllZs", "Kendrick Lamar - luther (Official Audio)"),
        ("sNY_2TEmzho", "Kendrick Lamar & SZA - luther (Official Music Video)"),
    ],
    # Repost Content
    "repost": [
        ("DH5oKK2KRsU", "Kendrick Lamar - luther (Letra/Lyrics)"),
        ("l0wJqJT3gh8", "Kendrick Lamar, SZA - luther (Lyrics)"),
    ],
    # Reaction Videos
    "reaction": [
        ("himeAlEJXf4", "Kendrick Lamar - luther (Ft. SZA) - FANTANO REACTION"),
        ("Kn8DZJOgg-M", "KENDRICK LAMAR & SZA DROPPED A MOVIE! Luther (Music Video) REACTION"),
        ("FtUZj2bqoSU", "DRAKE'S LANE BUT BETTER?! | Rapper Reacts to Kendrick Lamar - luther FIRST REACTION"),
        ("x8yBQHKaf28", "ImDOntai Reacts TO Kendrick - Luther Music Video"),
        ("Ix9vchZExWE", "REAL BLACK LOVE! | Kendrick Lamar & SZA - luther (REACTION!!!)"),
        ("YwtwK-itEyU", "Kendrick Lamar - luther - UNCUT REACTION MASHUP"),
        ("AiTnshH_v3A", "Kendrick Lamar & SZA - luther (Official Music Video) (FIRST REACTION)"),
    ],
    # Analysis and Review Videos
    "analysis": [
        ("3KLG3Q53B7s", "The artistic triumph of the 'Luther' Video - Kendrick Lamar ft. SZA analysis"),
        ("a4d9IcaYpVM", "Kendrick's 'Luther': A Cinematic Masterpiece About Love & Betrayal"),
        ("OcC5nNgNHX8", "Kendrick Lamar's Luther: The Most Underwhelming song from GNX"),
    ],
}

# Flatten into the list of {"id", "title", "category"} records consumed by the
# collection cells; dict insertion order keeps the categories in the order above.
VIDEOS = [
    {"id": video_id, "title": video_title, "category": category_name}
    for category_name, entries in _VIDEOS_BY_CATEGORY.items()
    for video_id, video_title in entries
]

In [6]:
def _save_records(records, stem, noun, timestamp):
    """Write ``records`` to ``../data/raw/{stem}_{timestamp}.csv`` and print the row count."""
    frame = pd.DataFrame(records)
    csv_path = f"../data/raw/{stem}_{timestamp}.csv"
    frame.to_csv(csv_path, index=False)
    print(f"Saved {len(frame)} {noun} to {csv_path}")


async def get_youtube_data(videos=None, delay_seconds=10):
    """Collect data and comments for each video and snapshot progress to CSV.

    For every entry, ``collect_youtube_data`` is awaited with the video title
    as the query. A result without an ``'error'`` key is treated as a success:
    its ``'data'`` record and ``'comments'`` list are tagged with the entry's
    metadata and accumulated. A result with an ``'error'`` key, or any raised
    exception, is recorded as a failure and the loop continues.

    After each video the accumulated videos, comments and failures are written
    to timestamped CSV files under ``../data/raw``. Failures are written even
    when nothing has been collected successfully yet.

    Args:
        videos: Iterable of dicts with ``'id'``, ``'title'`` and ``'category'``
            keys. ``None`` (the default) means the module-level ``VIDEOS``
            list, looked up at call time so that re-running the ``VIDEOS``
            cell is picked up without re-defining this function.
        delay_seconds: Pause between consecutive videos to avoid rate
            limiting. No pause is made after the last video.

    Returns:
        None. Results are written to disk and summarised on stdout.
    """
    if videos is None:
        videos = VIDEOS
    # Materialise once so the length is known (needed to skip the final delay).
    videos = list(videos)

    api_key = os.getenv('YOUTUBE_API_KEY')

    all_videos = []
    all_comments = []
    failed_videos = []

    for index, video in enumerate(videos):
        video_id = video['id']
        title = video['title']
        category = video['category']

        print(f"Collecting YouTube data for {title} ({category})...")
        try:
            # NOTE(review): the collector is queried by title, not by video id —
            # confirm it resolves to the intended video.
            youtube_result = await collect_youtube_data(
                query=title,
                api_key=api_key,
            )

            if 'error' not in youtube_result:
                # Read both payload parts before touching the accumulators, so a
                # malformed result cannot end up in both the success and the
                # failure lists.
                video_data = youtube_result['data']
                comments = youtube_result['comments']

                # Add metadata to video data
                video_data['category'] = category
                video_data['title'] = title

                # Add metadata to comments
                for comment in comments:
                    comment['video_id'] = video_id
                    comment['video_title'] = title
                    comment['category'] = category

                all_videos.append(video_data)
                all_comments.extend(comments)

                print(f"✓{len(comments)} comments")
            else:
                print(f"Error collecting data for {title}: {youtube_result['error']}")
                failed_videos.append({
                    'id': video_id,
                    'title': title,
                    'category': category,
                    'error': youtube_result['error']
                })
        except Exception as e:
            # Best effort: one failing video must not abort the whole run.
            print(f"Exception occurred for {title}: {str(e)}")
            failed_videos.append({
                'id': video_id,
                'title': title,
                'category': category,
                'error': str(e)
            })

        # Save progress after each video. Each list is written independently so
        # the failure log is kept even if no video has succeeded so far.
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        if all_videos:
            _save_records(all_videos, "youtube_videos", "videos", timestamp)
        if all_comments:
            _save_records(all_comments, "youtube_comments", "comments", timestamp)
        if failed_videos:
            _save_records(failed_videos, "failed_videos", "failed videos info", timestamp)

        # Add a small delay between requests to avoid rate limiting; there is
        # nothing left to throttle after the last video.
        if delay_seconds and index < len(videos) - 1:
            await asyncio.sleep(delay_seconds)

    # Final summary
    print("\nCollection Summary:")
    print(f"Total videos collected: {len(all_videos)}")
    print(f"Total comments collected: {len(all_comments)}")
    print(f"Total failed videos: {len(failed_videos)}")

    if failed_videos:
        print("\nFailed videos:")
        for video in failed_videos:
            print(f"- {video['title']}: {video['error']}")

#### *Scrape data from target videos*

In [4]:
# Full collection over the default video list; the call is commented out, so
# running this cell does nothing.
# NOTE(review): the saved output below lists a title ("... luther | Spider-Verse")
# that is not in the current VIDEOS list, so it appears to come from an earlier
# version of the list — confirm before relying on it.
#await get_youtube_data()

Collecting YouTube data for Kendrick Lamar - luther (Official Audio) (official)...
✓5469 comments
Saved 1 videos to ../data/raw/youtube_videos_20250521_195905.csv
Saved 5469 comments to ../data/raw/youtube_comments_20250521_195905.csv
Collecting YouTube data for Kendrick Lamar & SZA - luther (Official Music Video) (official)...
✓7564 comments
Saved 2 videos to ../data/raw/youtube_videos_20250521_200420.csv
Saved 13033 comments to ../data/raw/youtube_comments_20250521_200420.csv
Collecting YouTube data for Kendrick Lamar - luther (Letra/Lyrics) (repost)...
✓365 comments
Saved 3 videos to ../data/raw/youtube_videos_20250521_200514.csv
Saved 13398 comments to ../data/raw/youtube_comments_20250521_200514.csv
Collecting YouTube data for Kendrick Lamar & SZA - luther | Spider-Verse (repost)...
✓7500 comments
Saved 4 videos to ../data/raw/youtube_videos_20250521_201034.csv
Saved 20898 comments to ../data/raw/youtube_comments_20250521_201034.csv
Collecting YouTube data for Kendrick Lamar - lut

#### *Handle failed videos*

In [7]:
# Failure log to reload; the file name is pinned to one specific run's timestamp.
FAILED_VIDEOS_CSV = "../data/raw/failed_videos_20250522_235227.csv"
failed_videos_df = pd.read_csv(FAILED_VIDEOS_CSV)
failed_videos_df

Unnamed: 0,id,title,category,error
0,Ix9vchZExWE,REAL BLACK LOVE! | Kendrick Lamar & SZA - luth...,reaction,timed out


In [8]:
# Turn the failure log back into VIDEOS-style records, keeping only the three
# fields the collector loop reads (the 'error' column is dropped).
failed_videos_array = failed_videos_df[['id', 'title', 'category']].to_dict('records')

# Also queue the lyrics repost, which is not part of the failure log.
failed_videos_array.append(
    {"id": "l0wJqJT3gh8", "title": "Kendrick Lamar, SZA - luther (Lyrics)", "category": "repost"}
)

In [9]:
# Re-run the collection for the retry list only; calling get_youtube_data with
# no argument would process every entry in VIDEOS instead.
await get_youtube_data(failed_videos_array)

Collecting YouTube data for REAL BLACK LOVE! | Kendrick Lamar & SZA - luther (REACTION!!!) (reaction)...
✓1369 comments
Saved 1 videos to ../data/raw/youtube_videos_20250524_061743.csv
Saved 1369 comments to ../data/raw/youtube_comments_20250524_061743.csv
Collecting YouTube data for Kendrick Lamar, SZA - luther (Lyrics) (repost)...
✓1349 comments
Saved 2 videos to ../data/raw/youtube_videos_20250524_062019.csv
Saved 2718 comments to ../data/raw/youtube_comments_20250524_062019.csv

Collection Summary:
Total videos collected: 2
Total comments collected: 2718
Total failed videos: 0
