# 02 - YouTube API Data Collection

This notebook demonstrates collecting YouTube video data using the official YouTube Data API v3.

**Required:** a YouTube Data API key, set as `YOUTUBE_API_KEY` in your `.env` file.

In [None]:
import sys
sys.path.append('../src')

from api_collector import YouTubeAPICollector
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
# Build the collector object that the later cells call to fetch and save data.
# Prerequisite: YOUTUBE_API_KEY must be defined in your .env file.
collector = YouTubeAPICollector()

In [None]:
# Search terms to collect for, one per line. The list is kept identical to the
# scraper's so the two collection methods can be compared fairly.
search_queries = [
    'python tutorial',
    'machine learning',
    'cooking recipes',
    'travel vlog',
    'music video',
    'gaming',
    'movie trailers',
    'sports highlights',
    'tech reviews',
    'comedy sketches',
    'news today',
    'fitness workout',
]

print(f"Will collect from {len(search_queries)} queries")

In [None]:
# Run every search query through the collector, asking for 50 videos per query,
# then report how many rows came back and the quota counter the collector tracks.
df = collector.collect_from_queries(
    search_queries,
    videos_per_query=50,
)
print(f"\nCollected {len(df)} videos")
print(f"API quota used: {collector.quota_used} units")

In [None]:
# First look at the collected frame: list its column names, then display the
# leading rows as the cell's output.
print(f"Columns: {list(df.columns)}")
df.head()

In [None]:
# Descriptive statistics for the collected frame, shown as the cell's output
df.describe()

In [None]:
# Videos per category -- only shown when the frame has a 'category_name' column
if 'category_name' in df.columns:
    print("\nVideos by Category:")
    category_counts = df['category_name'].value_counts()
    print(category_counts)

In [None]:
# Quick visualizations: a 2x2 grid with three histograms (views, likes,
# duration) and a bar chart of the most frequent categories.


def _plot_histogram(ax, values, xlabel, title, log_y=False, **hist_kwargs):
    """Draw a labelled 50-bin histogram of ``values`` on ``ax``.

    Args:
        ax: Matplotlib axes to draw on.
        values: Data to bin (e.g. a DataFrame column).
        xlabel: Label for the x axis.
        title: Title for the panel.
        log_y: When True, put the y axis on a log scale.
        **hist_kwargs: Extra keyword arguments forwarded to ``ax.hist``
            (e.g. ``color``).
    """
    ax.hist(values, bins=50, edgecolor='black', **hist_kwargs)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    if log_y:
        ax.set_yscale('log')


fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Views distribution (log-scaled counts)
_plot_histogram(axes[0, 0], df['view_count'], 'Views', 'Views Distribution',
                log_y=True)

# Likes distribution (log-scaled counts)
_plot_histogram(axes[0, 1], df['like_count'], 'Likes', 'Likes Distribution',
                log_y=True, color='coral')

# Duration distribution, converted from seconds to minutes
_plot_histogram(axes[1, 0], df['duration_seconds'] / 60, 'Duration (minutes)',
                'Duration Distribution', color='green')

# Category distribution
if 'category_name' in df.columns:
    df['category_name'].value_counts().head(10).plot(kind='barh', ax=axes[1, 1])
    axes[1, 1].set_xlabel('Count')
    axes[1, 1].set_title('Top 10 Categories')
else:
    # No category column: hide the fourth panel instead of leaving blank,
    # unlabeled axes in the figure.
    axes[1, 1].set_axis_off()

plt.tight_layout()
plt.show()

In [None]:
# Save data
output_path = '../data/raw/api_data.csv'

# Make sure the target directory exists before writing, so the cell also works
# on a fresh checkout. Whether save_data creates it is not visible from this
# notebook; exist_ok=True makes this a no-op when the directory is already there.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

collector.save_data(df, output_path)
print(f"Data saved to {output_path}")

In [None]:
# Final recap of the collected dataset: row count, distinct channels, distinct
# categories (or 'N/A' when that column is absent), and mean engagement figures.
print(f"\nAPI Data Collection Summary:")
print(f"- Total videos: {len(df)}")
print(f"- Unique channels: {df['channel_id'].nunique()}")
if 'category_name' in df.columns:
    category_total = df['category_name'].nunique()
else:
    category_total = 'N/A'
print(f"- Categories: {category_total}")
print(f"- Average views: {df['view_count'].mean():,.0f}")
print(f"- Average likes: {df['like_count'].mean():,.0f}")
print(f"- Average comments: {df['comment_count'].mean():,.0f}")