In [28]:
import requests
import time
import os
import csv
import re
import pandas as pd
from datetime import datetime, timedelta, timezone
from urllib.parse import urlencode

API_KEY = os.getenv("YOUTUBE_API_KEY")


In [29]:
# USER SETTINGS
# it searches for videos whose title, description, or tags contain that keyword — not the channel name.
KEYWORDS = [    "podcast", "interview", "talk", "conversation", 
    "discussion", "life podcast", "tech podcast", "data podcast"]

SUB_MIN = 400       # minimum subscribers
SUB_MAX = 3000      # maximum subscribers
MIN_VIDEOS = 15          # minimum total uploads on the channel
MAX_VIDEOS = 60          # maximum total uploads on the channel

RECENT_DAYS_FOR_ACTIVITY = 15   # must have uploaded within last 10 days
DAYS_FOR_FREQ = 30             # look at uploads in past 30 days
MIN_UPLOADS_IN_30_DAYS = 2     # roughly 1 video/week

# === Quota-saving options ===
MAX_CHANNELS = 100        # analyze max 100 channels per run
MAX_SEARCH_PAGES = 5      # fetch only 5 pages of search results (≈250 videos)
CHECK_VIDEO_DURATION = False  # set True to use videos.list for exact duration, False to skip (uses only #shorts filter)

In [30]:
def iso8601(dt):
    """Return datetime in YouTube API compatible UTC format (YYYY-MM-DDTHH:MM:SSZ)."""
    # ensure it's UTC and strip microseconds + timezone offset
    return dt.replace(microsecond=0, tzinfo=None).isoformat("T") + "Z"



def search_recent_videos_for_keyword(keyword, published_after_iso, max_pages=MAX_SEARCH_PAGES):
    """Search recent videos by keyword and return a set of channel IDs that uploaded them."""
    channel_ids = set()
    params = {
        "part": "snippet",
        "q": keyword,
        "type": "video",
        "order": "date",
        "publishedAfter": published_after_iso,
        "maxResults": 50,
        "key": API_KEY
    }

    nextPageToken = None
    page = 0
    while page < max_pages:
        if nextPageToken:
            params["pageToken"] = nextPageToken

        try:
            r = requests.get("https://www.googleapis.com/youtube/v3/search", params=params)
            r.raise_for_status()
        except requests.exceptions.HTTPError as e:
            print(f"⚠️ API error while searching '{keyword}': {e}")
            break

        data = r.json()
        for item in data.get("items", []):
            channel_ids.add(item["snippet"]["channelId"])

        nextPageToken = data.get("nextPageToken")
        if not nextPageToken:
            break
        page += 1
        time.sleep(0.1)

    return channel_ids

- This step gives us the pool of candidates.
- They’re all recent, relevant, and active in your niche — the next step is to check which ones fit your target (subscriber count, total videos, etc.).

In [31]:
def get_channel_info(channel_ids):
    """
    Get subscriber count, total videos, handle, and basic info for each channel.
    """
    result = {}
    ids = list(channel_ids)

    for i in range(0, len(ids), 50):  # can fetch up to 50 channels per API call
        chunk = ids[i:i + 50]
        params = {
            "part": "snippet,statistics,contentDetails",
            "id": ",".join(chunk),
            "maxResults": 50,
            "key": API_KEY
        }

        r = requests.get("https://www.googleapis.com/youtube/v3/channels", params=params)
        r.raise_for_status()
        data = r.json()

        for ch in data.get("items", []):
            cid = ch["id"]
            snippet = ch.get("snippet", {})
            stats = ch.get("statistics", {})
            content = ch.get("contentDetails", {})

            # customUrl often returns '@handle' (or legacy custom URL). Use if present.
            handle = snippet.get("customUrl") or None

            result[cid] = {
                "title": snippet.get("title"),
                "description": snippet.get("description"),
                "publishedAt": snippet.get("publishedAt"),
                "subscriberCount": int(stats.get("subscriberCount") or 0),
                "videoCount": int(stats.get("videoCount") or 0),
                "uploadsPlaylistId": content.get("relatedPlaylists", {}).get("uploads"),
                "handle": handle
            }

        time.sleep(0.1)

    return result


what we got now are details for each of those channels — like subscriber count, video count, and when they started — so we can filter by the chosen range.

In [32]:
def duration_to_seconds(iso_duration: str) -> int:
    """
    Convert 'PT#H#M#S' to seconds. Returns 0 if unknown.
    """
    if not iso_duration:
        return 0
    m = re.fullmatch(r"PT(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?", iso_duration)
    if not m:
        return 0
    h = int(m.group(1) or 0)
    mm = int(m.group(2) or 0)
    s = int(m.group(3) or 0)
    return h * 3600 + mm * 60 + s

In [33]:
def count_recent_longform_uploads(channel_id, published_after_iso, max_pages=3, min_seconds=120):
    """
    Count long-form uploads after the given date.
    Uses #shorts filter only if CHECK_VIDEO_DURATION=False to save quota.
    """
    total = 0
    params = {
        "part": "snippet",
        "channelId": channel_id,
        "type": "video",
        "order": "date",
        "publishedAfter": published_after_iso,
        "maxResults": 50,
        "key": API_KEY
    }

    nextPageToken = None
    page = 0
    while page < max_pages:
        if nextPageToken:
            params["pageToken"] = nextPageToken

        r = requests.get("https://www.googleapis.com/youtube/v3/search", params=params)
        if r.status_code != 200:
            print(f"⚠️ Skipping channel {channel_id} (quota or request error)")
            break
        data = r.json()
        items = data.get("items", [])

        # collect IDs excluding Shorts by title
        ids = []
        for it in items:
            title = (it["snippet"]["title"] or "").lower()
            if "#shorts" in title:
                continue
            vid = it.get("id", {}).get("videoId")
            if vid:
                ids.append(vid)

        if CHECK_VIDEO_DURATION:
            # heavier version: check duration
            for i in range(0, len(ids), 50):
                chunk = ids[i:i+50]
                vr = requests.get(
                    "https://www.googleapis.com/youtube/v3/videos",
                    params={"part": "contentDetails", "id": ",".join(chunk), "key": API_KEY}
                )
                if vr.status_code != 200:
                    continue
                vdata = vr.json()
                for v in vdata.get("items", []):
                    dur = v.get("contentDetails", {}).get("duration")
                    if duration_to_seconds(dur) >= min_seconds:
                        total += 1
        else:
            # light version: just count non-shorts
            total += len(ids)

        nextPageToken = data.get("nextPageToken")
        if not nextPageToken:
            break
        page += 1
        time.sleep(0.1)

    return total

## Filtering

In [34]:
def filter_channels(channels_info, category_label="podcast"):
    """
    Apply filters and return list of dicts with these keys:
    Youtuber, handle, category, subs, lifetime_uploads, latest_upload
    """
    print("\n🛠️ Filtering channels based on activity and size...")

    published_after_freq = iso8601(datetime.now(timezone.utc) - timedelta(days=DAYS_FOR_FREQ))
    published_after_activity = iso8601(datetime.now(timezone.utc) - timedelta(days=RECENT_DAYS_FOR_ACTIVITY))
    rows = []

    for cid, info in list(channels_info.items())[:MAX_CHANNELS]:
        subs = info["subscriberCount"]
        vids = info["videoCount"]

        if (subs < SUB_MIN or subs > SUB_MAX) or (vids < MIN_VIDEOS or vids > MAX_VIDEOS):
            continue


        # long-form uploads in last 30 days
        uploads_30 = count_recent_longform_uploads(cid, published_after_freq)

        # latest upload date (any video, but you can also force long-form with a heavier call)
        r = requests.get(
            "https://www.googleapis.com/youtube/v3/search",
            params={
                "part": "snippet",
                "channelId": cid,
                "type": "video",
                "order": "date",
                "maxResults": 1,
                "key": API_KEY
            }
        )
        latest_items = r.json().get("items", [])
        latest_date = latest_items[0]["snippet"]["publishedAt"] if latest_items else None

        # active in last X days?
        active_ok = False
        if latest_date:
            dt_latest = datetime.fromisoformat(latest_date.replace("Z", "+00:00"))
            active_ok = (datetime.now(timezone.utc) - dt_latest).days <= RECENT_DAYS_FOR_ACTIVITY

        if active_ok and uploads_30 >= MIN_UPLOADS_IN_30_DAYS:
            rows.append({
                "Youtuber": info["title"],
                "handle": info.get("handle"),
                "category": category_label,
                "subs": subs,
                "lifetime_uploads": vids,
                "latest_upload": latest_date,
                # keep internal fields if you still want them later:
                "_uploads_last30_longform": uploads_30,
                "_channelId": cid,
                "_channelUrl": f"https://www.youtube.com/channel/{cid}"
            })

        time.sleep(0.1)

    print(f"✅ Found {len(rows)} matching channels.")
    return rows


In [35]:
def to_dataframe(rows):
    """
    Turn results into a clean DataFrame with the requested columns.
    """
    if not rows:
        return pd.DataFrame(columns=["Youtuber", "handle", "category", "subs", "lifetime_uploads", "latest_upload"])
    df = pd.DataFrame(rows)
    # Only keep the public columns (you can drop the underscored ones)
    return df[["Youtuber", "handle", "category", "subs", "lifetime_uploads", "latest_upload"]]


In [36]:
if __name__ == "__main__":
    # 1️⃣ Find channels from all keywords
    published_after = iso8601(datetime.now(timezone.utc) - timedelta(days=30))
    all_channels = set()

    print(f"🎯 Searching YouTube for {len(KEYWORDS)} keywords...")
    for kw in KEYWORDS:
        print(f"\n🔍 Searching for keyword: {kw}")
        found = search_recent_videos_for_keyword(kw, published_after)
        print(f"  ➜ Found {len(found)} channels for '{kw}'")
        all_channels |= found  # merge into global set

    print(f"\n📊 Total unique channels found: {len(all_channels)}")

    # 2️⃣ Get channel info (with caching)
    import json
    from pathlib import Path

    cache_file = Path("channels_cache.json")

    if cache_file.exists():
        print("📦 Loading cached channel info...")
        with open(cache_file, "r", encoding="utf-8") as f:
            info = json.load(f)
    else:
        print("📡 Fetching new channel info...")
        info = get_channel_info(all_channels)
        with open(cache_file, "w", encoding="utf-8") as f:
            json.dump(info, f, ensure_ascii=False, indent=2)

    # 3️⃣ Filter and tag results by keyword category
    combined_rows = []
    for kw in KEYWORDS:
        filtered = filter_channels(info, category_label=kw)
        print(f"✅ {len(filtered)} channels matched after filtering for '{kw}'")
        combined_rows.extend(filtered)

    # 4️⃣ Combine into one DataFrame
    df = to_dataframe(combined_rows)
    print("\n🧾 Final DataFrame preview:")
    print(df.head(20).to_string(index=False))

    # 5️⃣ Save (optional)
    #df.to_csv("found_channels.csv", index=False)
    #print("\n💾 Saved all channels to found_channels.csv")


🎯 Searching YouTube for 8 keywords...

🔍 Searching for keyword: podcast
  ➜ Found 216 channels for 'podcast'

🔍 Searching for keyword: interview
  ➜ Found 194 channels for 'interview'

🔍 Searching for keyword: talk
  ➜ Found 180 channels for 'talk'

🔍 Searching for keyword: conversation
  ➜ Found 206 channels for 'conversation'

🔍 Searching for keyword: discussion
  ➜ Found 201 channels for 'discussion'

🔍 Searching for keyword: life podcast
  ➜ Found 212 channels for 'life podcast'

🔍 Searching for keyword: tech podcast
  ➜ Found 177 channels for 'tech podcast'

🔍 Searching for keyword: data podcast
  ➜ Found 207 channels for 'data podcast'

📊 Total unique channels found: 1537
📦 Loading cached channel info...

🛠️ Filtering channels based on activity and size...
✅ Found 2 matching channels.
✅ 2 channels matched after filtering for 'podcast'

🛠️ Filtering channels based on activity and size...
✅ Found 2 matching channels.
✅ 2 channels matched after filtering for 'interview'

🛠️ Filterin