In [1]:
import os
import time
from urllib.parse import urlsplit

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By


In [2]:
def setup_driver(driver_path=None):
    """Create a headless Chrome WebDriver.

    Args:
        driver_path: Optional path to the chromedriver executable. When
            omitted, the ``CHROMEDRIVER_PATH`` environment variable is used
            if it is set; otherwise the original project-local path is used,
            so existing ``setup_driver()`` calls keep working.

    Returns:
        The ``webdriver.Chrome`` instance. The caller is responsible for
        calling ``quit()`` on it.
    """
    options = webdriver.ChromeOptions()
    for flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        options.add_argument(flag)

    # Resolution order: explicit argument, then environment, then legacy default.
    resolved_path = (
        driver_path
        or os.environ.get('CHROMEDRIVER_PATH')
        or 'C:/Users/user/Desktop/WebScrapingProject/chromedriver.exe'
    )
    service = Service(resolved_path)
    return webdriver.Chrome(service=service, options=options)

In [3]:
def get_top_twitch_categories(k=10):
    """Scrape the Twitch directory page and return up to ``k`` categories.

    The directory is requested with ``sort=VIEWER_COUNT`` and categories are
    returned in the order their links appear on the page.

    Args:
        k: Maximum number of categories to return.

    Returns:
        A list of ``(name, link)`` tuples, where ``name`` is the text of the
        link's ``h2`` element and ``link`` is the anchor's ``href``.
    """
    driver = setup_driver()
    top_k_categories = []
    try:
        url = 'https://www.twitch.tv/directory?sort=VIEWER_COUNT'
        driver.get(url)
        time.sleep(3)  # Wait for page to load

        # Anchors that carry a category title in a direct <h2> child.
        categories = driver.find_elements(
            By.XPATH, '//a[contains(@class, "ScCoreLink-sc-16kq0mq-0") and h2]')

        # Walk every candidate and stop once k valid ones are collected, so
        # entries skipped below do not reduce the number of results.
        for cat in categories:
            if len(top_k_categories) >= k:
                break
            try:
                name = cat.find_element(By.XPATH, './/h2').text
                if "viewers" in name.lower():
                    continue  # Heading holds a viewer count, not a category name
                link = cat.get_attribute('href')
            except Exception as exc:
                # Best effort: one unreadable element must not abort the scrape.
                print(f"Skipping category element: {type(exc).__name__}")
                continue
            print(f"Category: {name}, Link: {link}")
            top_k_categories.append((name, link))
    finally:
        # Always release the browser, even if the page load or lookup fails.
        driver.quit()
    return top_k_categories


In [4]:
def _channel_id_from_href(href):
    """Return the last non-empty path segment of a channel URL, or '' if none."""
    segments = [segment for segment in urlsplit(href).path.split('/') if segment]
    return segments[-1] if segments else ''


def scrape_channels_from_categories(top_categories, max_channels_per_category=20):
    """Collect channel links from each category page.

    For every category the page is opened with ``sort=VIEWER_COUNT``, an
    attempt is made to tick the "English" language filter (best effort), and
    the page is scrolled until enough channel cards are present or the page
    height stops growing.

    Args:
        top_categories: Iterable of ``(name, link)`` pairs, as returned by
            ``get_top_twitch_categories``.
        max_channels_per_category: Maximum number of channels recorded per
            category.

    Returns:
        A list of dicts with the keys ``'category'``, ``'href'`` and
        ``'channelId'``, one per channel, in page order within each category.
    """
    channel_link_xpath = '//a[@data-a-target="preview-card-channel-link"]'
    driver = setup_driver()
    all_channels = []

    try:
        for name, link in top_categories:
            print(f"Scraping category: {name}")
            driver.get(link + "?sort=VIEWER_COUNT")
            time.sleep(3)  # Wait for page to load

            # Set language filter to English if the option is available
            try:
                # Open the language filter menu first
                filter_button = driver.find_element(
                    By.XPATH,
                    '//button[.//div[@data-a-target="tw-core-button-label-text" and contains(text(), "Language")]]'
                )
                filter_button.click()
                time.sleep(1)
                english_label = driver.find_element(
                    By.XPATH,
                    '//label[.//div[text()="English"]]'
                )
                # Find the associated checkbox input
                checkbox = english_label.find_element(
                    By.XPATH,
                    './preceding-sibling::input[@type="checkbox"]'
                )
                # Only click if not already checked
                if not checkbox.is_selected():
                    english_label.click()
                    time.sleep(1)  # Wait for the page to reload with the filter applied
                else:
                    print("English language filter already checked, skipping click.")
                print("Set language filter to English.")
            except Exception:
                # Deliberately best effort: scrape unfiltered if the menu is missing.
                print("English language filter not found or could not be set.")

            # Scroll only while more channels are needed; stop as soon as the
            # page height stops growing (nothing further to load).
            last_height = driver.execute_script("return document.body.scrollHeight")
            while len(driver.find_elements(By.XPATH, channel_link_xpath)) < max_channels_per_category:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(2)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            collected = 0
            for elem in driver.find_elements(By.XPATH, channel_link_xpath):
                if collected >= max_channels_per_category:
                    break
                href = elem.get_attribute('href')
                if not href:
                    continue  # Anchor without a usable link
                # Parse the URL path so trailing slashes and query strings
                # do not end up in (or empty out) the channel id.
                channelId = _channel_id_from_href(href)
                if not channelId:
                    continue
                print(f"Category: {name}, Channel ID: {channelId}, URL: {href}")
                all_channels.append({'category': name, 'href': href, 'channelId': channelId})
                collected += 1
    finally:
        # Always release the browser, even if a category page fails.
        driver.quit()
    return all_channels

In [5]:
# Scrape parameters: how many categories to rank, and how many channels to keep from each.
k_categories = 10
k_channels_per_category = 5

# Step 1: collect the top categories from the directory page.
top_categories = get_top_twitch_categories(k=k_categories)

# Step 2: collect channel links for each of those categories.
channels_by_category = scrape_channels_from_categories(
    top_categories,
    max_channels_per_category=k_channels_per_category,
)

Category: Just Chatting, Link: https://www.twitch.tv/directory/category/just-chatting
Category: Grand Theft Auto V, Link: https://www.twitch.tv/directory/category/grand-theft-auto-v
Category: League of Legends, Link: https://www.twitch.tv/directory/category/league-of-legends
Category: Call of Duty: Black Ops 7, Link: https://www.twitch.tv/directory/category/call-of-duty-black-ops-7
Category: Minecraft, Link: https://www.twitch.tv/directory/category/minecraft
Category: Fortnite, Link: https://www.twitch.tv/directory/category/fortnite
Category: EA Sports FC 26, Link: https://www.twitch.tv/directory/category/ea-sports-fc-26
Category: IRL, Link: https://www.twitch.tv/directory/category/irl
Category: VALORANT, Link: https://www.twitch.tv/directory/category/valorant
Category: Cloverpit, Link: https://www.twitch.tv/directory/category/cloverpit
Scraping category: Just Chatting
Set language filter to English.
Category: Just Chatting, Channel ID: xqc, URL: https://www.twitch.tv/xqc
Category: Jus

In [6]:
# Persist the scraped channels as CSV.
output_path = '../data/channels_by_category.csv'

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Naming the columns keeps the CSV header present even when nothing was scraped.
channels_df = pd.DataFrame(channels_by_category, columns=['category', 'href', 'channelId'])
channels_df.to_csv(output_path, index=False)