# YouTube Scraper for Video Classification

## Import Libraries and Set Up API Key
> Import the required libraries, read the API key from the `API_KEY` environment variable (loaded from a `.env` file), and build the YouTube Data API client.

In [None]:
# Import necessary libraries
from googleapiclient.discovery import build
import pandas as pd
import os
import re
import time  # Import the time module for rate limiting
from dotenv import load_dotenv # import .env file for API key

# Pull in environment variables via python-dotenv before reading the key
load_dotenv()

# Build the YouTube Data API v3 client, authenticated with the API_KEY variable
API_KEY = os.environ.get("API_KEY")
youtube = build(serviceName='youtube', version='v3', developerKey=API_KEY)

## Define Parameters and Functions
> Define categories, maximum results, minimum samples per category, and functions for sanitizing descriptions and scraping YouTube data.

In [None]:
# Category labels; each one is used to build the search queries and to tag
# the records scraped for it
categories = [
    'Travel Blogs',
    'Science and Technology',
    'Food',
    'Manufacturing',
    'History',
    'Art and Music',
]

# Value passed as maxResults to every search request
max_results = 50

# Number of records to aim for in each category
min_samples_per_category = 1700

# Accumulator for the records of all categories
all_videos = list()

# Function to sanitize video descriptions
def sanitize_description(description):
    """Return *description* with e-mail-like tokens stripped out.

    Every run of non-whitespace characters that has an ``@`` with at least
    one character on each side is deleted. The whitespace around a removed
    token is left untouched. Extend the pattern here if other kinds of
    unwanted text (phone numbers, etc.) need to be removed as well.
    """
    email_like_token = re.compile(r'\S+@\S+')
    return email_like_token.sub('', description)

# Function to scrape YouTube data for a given category
def scrape_youtube_data(category):
    """Collect title/description records for videos matching *category*.

    Two search queries are issued per category (``"<category> videos"`` and
    ``"<category> documentary"``). Each query is paged through using the
    ``nextPageToken`` of the previous search response, and the full
    descriptions for one page of hits are fetched with a single batched
    ``videos().list`` request.

    Collection stops once ``min_samples_per_category`` records have been
    gathered, or earlier when every query has either run out of pages or
    failed, so the result may hold fewer records than requested.

    Args:
        category: Category label; used to build the search queries and
            stored as the ``'Category'`` value of every returned record.

    Returns:
        A list of dicts with the keys ``'Video id'``, ``'Title'``,
        ``'Description'`` (sanitized) and ``'Category'``. Each video id
        appears at most once.
    """
    videos = []
    seen_ids = set()
    # Pagination cursor per query; None requests the first page. A query is
    # removed from this mapping once it is exhausted or its request fails,
    # which guarantees that the outer loop terminates.
    page_tokens = {
        f'{category} videos': None,
        f'{category} documentary': None,
    }

    while page_tokens and len(videos) < min_samples_per_category:
        # Iterate over a copy because entries are deleted inside the loop
        for query in list(page_tokens):
            if len(videos) >= min_samples_per_category:
                break

            search_kwargs = {
                'q': query,
                'part': 'id,snippet',
                'type': 'video',
                'maxResults': max_results,
            }
            if page_tokens[query] is not None:
                search_kwargs['pageToken'] = page_tokens[query]

            try:
                # Make a YouTube API search request for the next page
                response = youtube.search().list(**search_kwargs).execute()
                items = response.get('items', [])

                # Titles of the hits that have not been collected yet,
                # in the order the search returned them
                titles = {}
                for item in items:
                    video_id = item['id']['videoId']
                    if video_id not in seen_ids:
                        titles.setdefault(video_id, item['snippet']['title'])

                # Fetch the descriptions of the whole page in one request
                # instead of one request per video
                descriptions = {}
                if titles:
                    video_response = youtube.videos().list(
                        part='snippet',
                        id=','.join(titles)
                    ).execute()
                    descriptions = {
                        video['id']: video['snippet']['description']
                        for video in video_response.get('items', [])
                    }
            except Exception as e:
                print(f"Error in API request: {str(e)}")
                # Give up on this query rather than retrying it forever
                del page_tokens[query]
            else:
                seen_ids.update(titles)
                for video_id, title in titles.items():
                    # Hits missing from the details response are skipped
                    if video_id not in descriptions:
                        continue
                    videos.append({
                        'Video id': video_id,
                        'Title': title,
                        'Description': sanitize_description(descriptions[video_id]),
                        'Category': category,
                    })

                # Advance the cursor, or retire the query when there are
                # no more results
                next_token = response.get('nextPageToken')
                if items and next_token:
                    page_tokens[query] = next_token
                else:
                    del page_tokens[query]

            # Add rate limiting to avoid hitting API rate limits
            time.sleep(1)  # Sleep for 1 second between requests

    return videos

## Scrape YouTube Data
> Loop through each category and scrape YouTube data.

In [None]:
# Scrape every category in turn and pool the records in all_videos
for category in categories:
    videos = scrape_youtube_data(category)
    all_videos += videos

## Create DataFrame and Save to CSV
> Create a DataFrame and save the data to a CSV file.

In [None]:
# Turn the collected records into a table and write it to disk without
# the index column
df = pd.DataFrame(data=all_videos)
df.to_csv(path_or_buf='youtube_data.csv', index=False)

## Conclusion

After all cells above have run, the scraped data is stored in a CSV file (`youtube_data.csv`) with the columns `Video id`, `Title`, `Description`, and `Category`. This dataset can be used for text classification tasks based on video descriptions.