# 01 - Web Scraping YouTube Videos

This notebook demonstrates how to scrape YouTube video data using Selenium and BeautifulSoup.

**Data collected:**
- Video ID
- Title
- Channel name
- View count
- Upload date
- Duration
- Video URL

In [None]:
# Import required libraries
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# Make the project's src/ directory importable before loading local modules
sys.path.append('../src')
from scraper import YouTubeScraper

## Initialize the Scraper

In [None]:
# Create the scraper instance that the rest of the notebook uses.
# headless=False is passed here; set headless=True to run without a browser window.
scraper = YouTubeScraper(headless=False)

## Define Search Queries

We use a diverse set of queries so that the collected videos span several different categories.

In [None]:
# One search term per content category we want represented in the dataset.
search_queries = [
    'python tutorial',
    'machine learning',
    'cooking recipes',
    'travel vlog',
    'music video',
    'gaming',
    'movie trailers',
    'sports highlights',
    'tech reviews',
    'comedy sketches',
]

# Report the size of the job up front (100 videos are requested per query).
query_count = len(search_queries)
print(f"Will scrape {query_count} queries")
print(f"Target: {query_count * 100} videos")

## Scrape Data

In [None]:
# Scrape videos from all queries, requesting 100 videos per query.
# NOTE(review): presumably drives a live browser session and can take a long
# time — confirm before re-running this cell.
# The result is kept in `df`, which the following cells inspect, plot and save.
df = scraper.scrape_multiple_queries(search_queries, videos_per_query=100)

## Explore Scraped Data

In [None]:
# First look at the result: row count, column names, then the leading rows.
row_total = len(df)
column_names = list(df.columns)
print(f"Total videos collected: {row_total}")
print(f"Columns: {column_names}")
df.head()

In [None]:
# Basic statistics for the scraped table.
# NOTE(review): assumes df is a pandas DataFrame, where describe() summarises
# the numeric columns by default — confirm.
df.describe()

In [None]:
# Count the missing entries in each column
df.isna().sum()

## Quick Visualizations

In [None]:
# Two histograms side by side: view counts (left) and video length in minutes (right)
fig, (views_ax, duration_ax) = plt.subplots(1, 2, figsize=(12, 5))

views_ax.hist(df['views'], bins=50, edgecolor='black')
views_ax.set_xlabel('Views')
views_ax.set_ylabel('Frequency')
views_ax.set_title('Distribution of Views')
views_ax.set_yscale('log')  # logarithmic count axis for the views panel only

# duration_seconds is divided by 60 so the axis reads in minutes
duration_minutes = df['duration_seconds'] / 60
duration_ax.hist(duration_minutes, bins=50, edgecolor='black')
duration_ax.set_xlabel('Duration (minutes)')
duration_ax.set_ylabel('Frequency')
duration_ax.set_title('Distribution of Video Duration')

fig.tight_layout()
plt.show()

## Save Raw Data

In [None]:
# Save the raw scraped data to CSV.
output_path = '../data/raw/scraped_data.csv'

# Make sure the destination folder exists before saving, so the save cannot
# fail on a fresh checkout where ../data/raw/ has not been created yet.
# NOTE(review): save_data may already create the folder — this is harmless if so.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

scraper.save_data(df, output_path)
print(f"Data saved to {output_path}")

## Summary

- Total videos collected: reported by the summary cell below
- Data quality check: see the summary statistics and missing-value counts above
- Next steps: Data preprocessing

In [None]:
# Headline figures for the collected dataset, assembled once and printed together.
# NOTE(review): "unique" assumes df was already de-duplicated upstream — confirm.
summary_lines = [
    "\nData Collection Summary:",
    f"- Total unique videos: {len(df)}",
    f"- Date range: {df['scraped_at'].min()} to {df['scraped_at'].max()}",
    f"- Average views: {df['views'].mean():,.0f}",
    f"- Median views: {df['views'].median():,.0f}",
    # duration_seconds is converted to minutes for display
    f"- Average duration: {df['duration_seconds'].mean()/60:.1f} minutes",
]
print("\n".join(summary_lines))