Installing the required libraries

In [None]:
!pip install google-api-python-client
!pip install --upgrade google-api-python-client google-auth google-auth-oauthlib google-auth-httplib2 tqdm pandas

Confirming the Google API Client installation

In [None]:
# Smoke test: the import below raises ImportError when the client library is
# not installed, so the message is only printed if the package is importable.
import googleapiclient.discovery
print("Google API Client Installed Successfully!")

Initializing the YouTube API client with an API key generated from Google Cloud Platform

In [None]:
import os

import googleapiclient.discovery

# Read the API key from the environment instead of hardcoding it: a key stored
# in the notebook is exposed to everyone who can read the file.
# NOTE(review): any key that was previously committed in this notebook should
# be treated as leaked and revoked/rotated in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Export your YouTube Data API key as the "
        "YOUTUBE_API_KEY environment variable before running this cell."
    )

# Build the YouTube API client
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

print("YouTube API client initialized successfully!")


YouTube-8M dataset features fetched from the Google YouTube API.
3287 rows have been fetched through the API, with the following features:
('video_id', 'title', 'description', 'published_at', 'view_count', 'like_count', 'comment_count')

In [None]:
import os
import time
import pandas as pd
from googleapiclient.discovery import build
import requests

# ===========================
# ✅ 1. SETUP YOUTUBE API
# ===========================
# The key is read from the environment rather than hardcoded, so the notebook
# can be shared or committed without exposing credentials.
# NOTE(review): any key that was previously committed in this notebook should
# be treated as leaked and revoked/rotated in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Export your YouTube Data API key as the "
        "YOUTUBE_API_KEY environment variable before running this cell."
    )
youtube = build("youtube", "v3", developerKey=API_KEY)

# ===========================
# ✅ 2. DEFINE SEARCH QUERIES
# ===========================
# Search phrases sent to the YouTube search endpoint. Each phrase is searched
# separately and the resulting video IDs are merged into one de-duplicated set.
queries = [
    "Copyright Claims",
    "Copyright Strike YouTube",
    "YouTube Fair Use",
    "Copyright Infringement",
    "YouTube Monetization Issues",
    "YouTube Copyright Takedown",
    "DMCA Takedown YouTube"
]

# ===========================
# ✅ 3. FETCH VIDEO IDS (2,500+)
# ===========================
def get_video_ids(query, max_results=50, total_videos=500, client=None, delay=1):
    """Collect up to ``total_videos`` unique video IDs matching ``query``.

    Pages through ``search().list`` results until enough IDs have been
    gathered or the response no longer carries a ``nextPageToken``.

    Args:
        query: Search phrase passed as ``q``.
        max_results: Page size requested per call. Values outside 1..50 are
            clamped, since the search endpoint documents 50 as its maximum.
            ``None`` is passed through unchanged.
        total_videos: Maximum number of IDs to return.
        client: Object exposing ``search().list(...).execute()``. Defaults to
            the module-level ``youtube`` client.
        delay: Seconds to sleep between consecutive page requests; a falsy
            value disables the pause.

    Returns:
        list[str]: Unique video IDs in the order the API returned them, so the
        truncation to ``total_videos`` is deterministic.
    """
    api = youtube if client is None else client
    page_size = max_results if max_results is None else max(1, min(max_results, 50))

    # A dict de-duplicates like a set but keeps insertion order.
    ordered_ids = {}
    next_page_token = None

    while len(ordered_ids) < total_videos:
        response = api.search().list(
            q=query, part="id", type="video",
            maxResults=page_size, pageToken=next_page_token
        ).execute()

        for item in response.get("items", []):
            video_id = item.get("id", {}).get("videoId")
            if video_id:  # skip malformed items instead of raising KeyError
                ordered_ids[video_id] = None

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break  # Stop if no more pages available

        # Pause only when another request will actually be issued.
        if delay and len(ordered_ids) < total_videos:
            time.sleep(delay)

    return list(ordered_ids)[:total_videos]

# Run every search phrase and pool the IDs; the set drops videos that were
# returned by more than one query.
video_ids = set()
for query in queries:
    video_ids |= set(get_video_ids(query, max_results=50, total_videos=500))

# Convert to a list so the IDs can be sliced into batches later on.
video_ids = list(video_ids)
print(f"✅ Collected {len(video_ids)} unique video IDs!")

# ===========================
# ✅ 4. FETCH VIDEO DETAILS
# ===========================
def get_video_details(video_ids, client=None, delay=1):
    """Fetch snippet and statistics fields for the given video IDs.

    IDs are requested in batches of 50 through ``videos().list``.

    Args:
        video_ids: Iterable of video ID strings.
        client: Object exposing ``videos().list(...).execute()``. Defaults to
            the module-level ``youtube`` client.
        delay: Seconds to sleep between consecutive batch requests; a falsy
            value disables the pause.

    Returns:
        pandas.DataFrame with the columns ``video_id``, ``title``,
        ``description``, ``published_at``, ``view_count``, ``like_count`` and
        ``comment_count``. The three counts are integers; a count missing from
        the response is stored as 0. The columns are present even when no
        rows were fetched, so the saved CSV always has a header.
    """
    api = youtube if client is None else client
    columns = [
        "video_id", "title", "description", "published_at",
        "view_count", "like_count", "comment_count",
    ]
    video_ids = list(video_ids)
    video_data = []

    for start in range(0, len(video_ids), 50):  # YouTube API allows max 50 videos per request
        batch = video_ids[start:start + 50]
        response = api.videos().list(
            part="snippet,statistics",
            id=",".join(batch)
        ).execute()

        for item in response.get("items", []):
            snippet = item["snippet"]
            stats = item.get("statistics", {})
            video_data.append({
                "video_id": item["id"],
                "title": snippet["title"],
                "description": snippet["description"],
                "published_at": snippet["publishedAt"],
                # Present counts arrive as strings; int() keeps the column
                # dtype uniform with the integer default used for absent ones.
                "view_count": int(stats.get("viewCount", 0)),
                "like_count": int(stats.get("likeCount", 0)),
                "comment_count": int(stats.get("commentCount", 0)),
            })

        # Pause only when another batch will actually be requested.
        if delay and start + 50 < len(video_ids):
            time.sleep(delay)

    return pd.DataFrame(video_data, columns=columns)

# Fetch the per-video details for every collected ID into one DataFrame.
df = get_video_details(video_ids)
print(f"✅ Collected {len(df)} rows!")

# Save dataset
# index=False keeps the DataFrame index out of the CSV file.
df.to_csv("youtube_copyright_data.csv", index=False)
print("✅ YouTube dataset saved successfully!")

# ===========================
# ✅ 5. DOWNLOAD YOUTUBE UGC DATASET
# ===========================
UGC_DATASET_URL = "https://storage.googleapis.com/ugc-dataset-public/YouTube-UGC.zip"
UGC_DATASET_PATH = "YouTube-UGC.zip"

print("📥 Downloading YouTube-UGC dataset...")
download_ok = False

# The context manager releases the streamed connection even if writing fails;
# the timeout stops a stalled server from hanging the notebook indefinitely.
with requests.get(UGC_DATASET_URL, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(UGC_DATASET_PATH, "wb") as f:
            # 1 MiB chunks: far fewer write calls than 1 KiB for an archive download.
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
        download_ok = True
        print("✅ YouTube-UGC dataset downloaded successfully!")
    else:
        print("❌ Failed to download YouTube-UGC dataset.")

# Only report overall success when the download actually succeeded.
if download_ok:
    print("🎯 All tasks completed successfully!")

Installing other required libraries

In [None]:
!pip install vaderSentiment

Fetching additional features from the YouTube API and merging them with the existing dataset to make it a stronger dataset.

In [None]:
import pandas as pd
import requests
import numpy as np
import librosa
from io import BytesIO
from PIL import Image
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import os

# Load the previously saved video dataset.
df = pd.read_csv("youtube_copyright_data.csv")

# Initialize sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# YouTube API key, read from the environment rather than hardcoded so the
# notebook can be shared or committed without exposing credentials.
# NOTE(review): any key that was previously committed in this notebook should
# be treated as leaked and revoked/rotated in the Google Cloud console.
API_KEY = os.environ.get("YOUTUBE_API_KEY")
if not API_KEY:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set. Export your YouTube Data API key as the "
        "YOUTUBE_API_KEY environment variable before running this cell."
    )

# Function to fetch video metadata
def get_video_metadata(video_id):
    """Fetch tags, category, duration and channel name for one video.

    Calls the ``videos`` endpoint of the YouTube Data API v3 with the
    ``snippet`` and ``contentDetails`` parts, authenticating with the
    module-level ``API_KEY``.

    Args:
        video_id: ID of the video to look up.

    Returns:
        dict with the keys ``tags`` (comma-separated string), ``category``,
        ``duration`` and ``channel_name``. When the response contains no
        items, every value is an empty string.
    """
    # Passing values through `params` lets requests URL-encode them, and the
    # timeout keeps one stalled request from hanging a long enrichment loop.
    response = requests.get(
        "https://www.googleapis.com/youtube/v3/videos",
        params={
            "part": "snippet,contentDetails",
            "id": video_id,
            "key": API_KEY,
        },
        timeout=10,
    ).json()

    items = response.get("items") or []
    if items:
        video = items[0]
        snippet = video.get("snippet", {})
        return {
            "tags": ", ".join(snippet.get("tags", [])),
            "category": snippet.get("categoryId", ""),
            "duration": video.get("contentDetails", {}).get("duration", "N/A"),
            "channel_name": snippet.get("channelTitle", ""),
        }
    return {"tags": "", "category": "", "duration": "", "channel_name": ""}

# Function to extract image features from thumbnail
def extract_image_features(thumbnail_url):
    """Return the mean pixel intensity of the image at ``thumbnail_url``.

    The image is downloaded, converted to RGB, and averaged over every
    channel value.

    Args:
        thumbnail_url: URL of the image to download.

    Returns:
        The mean of all RGB values, or ``None`` when the download fails,
        the server answers with an HTTP error, or the body cannot be decoded
        as an image.
    """
    try:
        response = requests.get(thumbnail_url, timeout=10)
        # Reject HTTP error responses instead of measuring whatever body came
        # with them.
        # NOTE(review): YouTube appears to serve a placeholder image alongside
        # a 404 for unknown IDs — confirm.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert("RGB")
        return np.mean(np.array(img))  # Average pixel intensity
    except Exception:
        # Best-effort feature: a failure yields a missing value. Catching
        # Exception (not a bare except) still lets KeyboardInterrupt and
        # SystemExit stop a long-running loop.
        return None

# Function for sentiment analysis of comments
def get_sentiment_score(comment_text):
    """Return the ``compound`` polarity score of ``comment_text``.

    The input is converted with ``str()`` before it is handed to the
    module-level ``analyzer``.
    """
    text = str(comment_text)
    polarity = analyzer.polarity_scores(text)
    return polarity['compound']

# Function to extract audio features
def extract_audio_features(audio_file):
    """Return the mean spectral centroid of an audio file.

    Args:
        audio_file: Path or file object accepted by ``librosa.load``.

    Returns:
        The mean of ``librosa.feature.spectral_centroid`` over the loaded
        signal, or ``None`` when loading or analysis fails.
    """
    try:
        y, sr = librosa.load(audio_file)
        # This value is a spectral centroid, not a pitch estimate, so the
        # local is named for what it holds.
        mean_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
        return mean_centroid
    except Exception:
        # Best-effort feature: a failure yields a missing value. Catching
        # Exception (not a bare except) still lets KeyboardInterrupt and
        # SystemExit propagate.
        return None

# Add new features to the dataset
metadata_list = []
brightness_list = []
sentiment_scores = []

for index, row in df.iterrows():
    video_id = row["video_id"]

    # Extra metadata (tags, category, duration, channel) from the API.
    metadata_list.append(get_video_metadata(video_id))

    # Mean pixel intensity of the video's thumbnail.
    thumbnail_url = f"https://img.youtube.com/vi/{video_id}/hqdefault.jpg"
    brightness_list.append(extract_image_features(thumbnail_url))

    # The dataset holds no comment text, only comment_count, and scoring a
    # number carries no sentiment signal. Score the text that is available
    # (title + description) instead. Missing values are skipped so the
    # literal string "nan" is never analysed.
    text_parts = [
        str(row[column])
        for column in ("title", "description")
        if pd.notna(row[column])
    ]
    sentiment_scores.append(get_sentiment_score(" ".join(text_parts)))

# Convert metadata list to DataFrame and merge with original dataset.
# Reusing df's index guarantees the column-wise concat lines rows up one-to-one.
metadata_df = pd.DataFrame(metadata_list, index=df.index)
df = pd.concat([df, metadata_df], axis=1)

# Add image brightness and sentiment score
df["brightness"] = brightness_list
df["sentiment_score"] = sentiment_scores

# Save enhanced dataset
df.to_csv("enhanced_youtube_dataset.csv", index=False)

print("✅ Enhanced dataset saved successfully!")


In [None]:
# Show the resulting DataFrame as this cell's output.
df