In [21]:
import requests
import time
import pandas as pd
from datetime import datetime, timezone


API_KEY = ""

PODCAST_QUERIES = [
    "podcast", "talk show", "discussion", "roundtable", "conversation","ngobrol","bincang","diskusi","obrolan"
]

def get_videos(query, published_after, published_before, max_results=1000):
    videos = []
    next_page_token = None
    base_url = "https://www.googleapis.com/youtube/v3/search"
    
    while len(videos) < max_results:
        params = {
            "part": "snippet",
            "q": query,
            "type": "video",
            "publishedAfter": published_after,
            "publishedBefore": published_before,
            "videoDuration": "medium,long",  # Filter hanya video > 4 menit
            "maxResults": min(50, max_results - len(videos)),
            "key": API_KEY,
            "pageToken": next_page_token,
        }
        
        response = requests.get(base_url, params=params).json()
        time.sleep(1)
        
        if "items" in response:
            for item in response["items"]:
                video_id = item["id"]["videoId"]
                title = item["snippet"]["title"]
                description = item["snippet"]["description"]
                published_at = item["snippet"]["publishedAt"]
                channel_id = item["snippet"]["channelId"]
                videos.append((video_id, title, description, published_at, channel_id))
        
        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break
    
    return videos

def get_video_stats(video_ids):
    stats = []
    base_url = "https://www.googleapis.com/youtube/v3/videos"
    
    for i in range(0, len(video_ids), 50):  
        params = {
            "part": "statistics,contentDetails,snippet",
            "id": ",".join(video_ids[i:i+50]),
            "key": API_KEY,
        }
        response = requests.get(base_url, params=params).json()
        time.sleep(1)
        
        if "items" in response:
            for item in response["items"]:
                video_id = item["id"]
                category_id = item["snippet"].get("categoryId", "Unknown")
                duration = item["contentDetails"].get("duration", "Unknown")
                view_count = int(item["statistics"].get("viewCount", 0))
                like_count = int(item["statistics"].get("likeCount", 0))
                comment_count = int(item["statistics"].get("commentCount", 0))
                if duration != "Unknown":  # Filter hanya video dengan durasi valid
                    stats.append((video_id, category_id, duration, view_count, like_count, comment_count))
    
    return stats

def get_video_category_name(category_id):
    url = f"https://youtube.googleapis.com/youtube/v3/videoCategories?part=snippet&id={category_id}&key={API_KEY}"
    response = requests.get(url).json()
    time.sleep(1)
    if "items" in response and len(response["items"]) > 0:
        return response["items"][0]["snippet"]["title"]
    return "Unknown"

start_date = "2020-01-01T00:00:00Z"
today = "2021-01-01T00:00:00Z"

all_videos = []
for query in PODCAST_QUERIES:
    all_videos.extend(get_videos(query, start_date, today, max_results=5000))  # Batas 5000 per query

video_ids = [v[0] for v in all_videos]
stats = get_video_stats(video_ids)

video_df = pd.DataFrame(all_videos, columns=["video_id", "title", "description", "published_at", "channel_id"])
stats_df = pd.DataFrame(stats, columns=["video_id", "category_id", "duration", "views", "likes", "comments"])

stats_df["category_name"] = stats_df["category_id"].apply(get_video_category_name)
final_df = video_df.merge(stats_df, on="video_id")

VALID_CATEGORIES = ["Podcast", "Education", "News & Politics"]
final_df = final_df[final_df["category_name"].isin(VALID_CATEGORIES)]

final_df = final_df.drop_duplicates(subset=["video_id"])

final_df.to_csv("youtube_3days_stats.csv", index=False)
print(f"✅ Data berhasil disimpan! Jumlah record: {len(final_df)}")


✅ Data berhasil disimpan! Jumlah record: 0
