In [1]:
import os
import requests
import selenium
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
import webdriver_manager
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import re
import time
import tqdm
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


In [2]:
def get_AppList(version):
    """Download the Steam application catalogue from ISteamApps/GetAppList.

    Parameters:
        version (str): API version segment inserted into the URL
            (this notebook calls it with 'v1' and 'v2').

    Returns:
        The value stored under ``applist.apps`` for 'v2', or under
        ``applist.apps.app`` for any other version. None when the request
        fails or the payload does not have that layout.
    """
    url = f"https://api.steampowered.com/ISteamApps/GetAppList/{version}/"
    try:
        # A timeout keeps a stalled connection from hanging the notebook forever.
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        apps = r.json()['applist']['apps']
        # 'v2' exposes the records directly; other versions nest them under 'app'.
        return apps if version == 'v2' else apps['app']
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch app list ({version}): {e}")
        return None
    except (KeyError, TypeError, ValueError) as e:
        # The response was received but is not shaped the way this code expects.
        print(f"Unexpected app list payload ({version}): {e}")
        return None

In [3]:
v1, v2 = 'v1', 'v2'

# Query both API versions; each response is turned into its own frame.
df1 = pd.DataFrame(get_AppList(v1))
apps = get_AppList(v2)
df2 = pd.DataFrame(apps)

# Stack the two frames, keep the first row seen for each appid, and persist the result.
app_ids_df = (
    pd.concat([df1, df2], ignore_index=True)
    .drop_duplicates(subset='appid')
)
app_ids_df.to_csv('all_steam_games.csv', index=False)

In [4]:
# Preview the last rows of the combined, de-duplicated app list.
app_ids_df.tail()

Unnamed: 0,appid,name
283650,70,Half-Life
283651,8,winui2
283652,80,Counter-Strike: Condition Zero
283653,90,Half-Life Dedicated Server
283654,92,Codename Gordon


In [5]:
# df1.head()

In [6]:
# df2.head()

In [7]:
def get_AppData(url):
    """Download a page and parse it with BeautifulSoup.

    Parameters:
        url (str): Address of the page to download.

    Returns:
        BeautifulSoup: The parsed document (built with the 'lxml' parser), or
        None when the request raises a ``requests`` exception, including an
        HTTP error status.
    """
    try:
        page = requests.get(url, timeout=10)
        # Turn 4xx / 5xx answers into an exception so they share the failure path.
        page.raise_for_status()
        parsed = BeautifulSoup(page.text, 'lxml')
        return parsed
    except requests.exceptions.RequestException as err:
        print(f"Failed to fetch data for URL {url}: {err}")
        return None


In [8]:
def _text_or_none(node):
    """Return the stripped text of a parsed element, or None when the element is missing."""
    return node.get_text(strip=True) if node is not None else None


def find_general_details(soup):
    """Extract general details such as title, description, genre, and tags.

    Every field is looked up independently, so one missing element only
    leaves that field as None instead of discarding the fields after it.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of the game page, or None
            when the page could not be fetched.

    Returns:
        tuple: (title, description, content, genre, player, tags_list, release_date).
        ``genre`` and ``tags_list`` are lists of link texts; the other entries
        are strings or None. All seven entries are None when ``soup`` is None.
    """
    if soup is None:
        return None, None, None, None, None, None, None

    title = _text_or_none(soup.find('div', class_='apphub_AppName'))
    description = _text_or_none(soup.find('div', class_='game_description_snippet'))

    # The rating text is read from the first <p> inside the rating container.
    content_div = soup.find('div', class_='shared_game_rating')
    content = _text_or_none(content_div.find('p')) if content_div is not None else None

    # Text of every link inside the details block / the tag strip.
    genre = [g.get_text(strip=True) for g in soup.select('div.details_block a')]
    tags_list = [tag.get_text(strip=True) for tag in soup.select('div.glance_tags a')]

    # Commas are removed from the date text (e.g. "Nov 8, 1998" -> "Nov 8 1998").
    date_text = _text_or_none(soup.find('div', class_='date'))
    release_date = date_text.replace(',', '') if date_text is not None else None

    # Only the first spec entry's label is reported.
    spec = soup.find('a', class_='game_area_details_specs_ctn')
    player = _text_or_none(spec.find('div', class_='label')) if spec is not None else None

    return title, description, content, genre, player, tags_list, release_date


In [9]:
def extract_details(div, index):
    """Read the link and display name from one developer/publisher row.

    Parameters:
        div (list): Sequence of row elements holding developer/publisher info.
        index (int): Position of the row to read.

    Returns:
        tuple: (link, name) taken from the row's 'summary column' element, or
        (None, None) when the row, the summary element, or its anchor is missing.
    """
    try:
        summary = div[index].find('div', class_='summary column')
        # href of the first anchor, then the visible text of the whole summary.
        return summary.find('a').get('href'), summary.get_text(strip=True)
    except (AttributeError, IndexError):
        return None, None


In [10]:
def fetch_followers(link):
    """Fetch number of followers from a given developer/publisher link.

    Parameters:
        link (str): URL of the developer/publisher page. Empty or None means
            there is nothing to fetch.

    Returns:
        str: Text of the 'num_followers' element with commas removed, or None
        when the link is empty, the page could not be fetched, or the element
        is not on the page.
    """
    if not link:
        return None

    soup = get_AppData(link)
    if soup is None:
        return None

    # Explicit None checks replace the former bare `except:`, which also
    # swallowed KeyboardInterrupt and SystemExit.
    followers_div = soup.find('div', class_="num_followers")
    if followers_div is None:
        return None
    return followers_div.get_text(strip=True).replace(',', '')


In [11]:
def find_developer_publisher_details(soup):
    """Extract developer and publisher details, along with follower counts.

    The developer is read from the first 'dev_row' element and the publisher
    from the second. The two follower lookups are independent: a failure in
    one is reported and does not cancel the other.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of the game page, or None
            when the page could not be fetched.

    Returns:
        tuple: (developer, publisher, dev_followers, pub_followers); any entry
        that could not be determined is None.
    """
    developer = publisher = dev_followers = pub_followers = None
    if soup is None:
        return developer, publisher, dev_followers, pub_followers

    def _followers_or_none(link):
        # Keep the lookup best-effort, but say what went wrong instead of hiding it.
        try:
            return fetch_followers(link)
        except Exception as exc:
            print(f"Failed to fetch followers for {link}: {exc}")
            return None

    try:
        divs = soup.find_all('div', class_='dev_row')
        dev_link, developer = extract_details(divs, 0)
        pub_link, publisher = extract_details(divs, 1)
    except Exception as exc:
        print(f"Failed to parse developer/publisher rows: {exc}")
        return developer, publisher, dev_followers, pub_followers

    dev_followers = _followers_or_none(dev_link)
    pub_followers = _followers_or_none(pub_link)
    return developer, publisher, dev_followers, pub_followers


In [12]:
def find_price(soup):
    """Extract price and discount prices from the game page.

    The regular price and the discounted prices are read independently, so a
    page without a 'game_purchase_price' element still reports its discounted
    prices.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of the game page, or None
            when the page could not be fetched.

    Returns:
        tuple: (price, discount_prices). ``price`` is the text of the first
        'game_purchase_price' element with '$' removed, or None.
        ``discount_prices`` is the list of 'discount_final_price' texts with
        '$' removed, or None when there are none.
    """
    if soup is None:
        return None, None

    price_div = soup.find('div', class_='game_purchase_price')
    price = price_div.get_text(strip=True).replace('$', '') if price_div is not None else None

    discount_divs = soup.find_all('div', class_='discount_final_price')
    # An empty list is reported as None, matching the "not found" convention.
    discount_prices = [dp.get_text(strip=True).replace('$', '') for dp in discount_divs] or None

    return price, discount_prices


In [13]:
def find_review_count(soup):
    """Read review totals and positive percentages from the review summary spans.

    The first matching span is treated as the monthly summary and the second
    as the all-time summary. In each span, the first number found is used as
    the positive ratio and the second as the review count.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of the game page.

    Returns:
        tuple: (month_reviews, positive_ratio_month, total_reviews,
        positive_ratio_all_time) as integers, or four Nones when either span
        or any of its numbers is missing.
    """
    number_pattern = r'\d{1,3}(?:,\d{3})*'

    def _ratio_and_count(span):
        # First two numbers in the span text, with thousands separators dropped.
        found = re.findall(number_pattern, span.get_text())
        return int(found[0].replace(',', '')), int(found[1].replace(',', ''))

    try:
        spans = soup.find_all('span', class_="nonresponsive_hidden responsive_reviewdesc")
        ratio_month, count_month = _ratio_and_count(spans[0])
        ratio_all, count_all = _ratio_and_count(spans[1])
    except (AttributeError, IndexError, ValueError):
        return None, None, None, None

    return count_month, ratio_month, count_all, ratio_all


In [14]:
def find_media_links(soup):
    """Extract media links, including header image, screenshots, and videos.

    Each kind of media is collected independently, so a page without a header
    image still reports its screenshots and videos. Elements that lack the
    expected attribute are skipped instead of raising.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of the game page, or None
            when the page could not be fetched.

    Returns:
        tuple: (header_url, image_url_list, image_small_url_list,
        video_urls_hd_list, video_urls_480p_list). ``header_url`` is a string
        or None; the remaining entries are lists (possibly empty). All five
        entries are None when ``soup`` is None.
    """
    if soup is None:
        return None, None, None, None, None

    header = soup.find('img', class_='game_header_image_full')
    header_url = header.get('src') if header is not None else None

    # Screenshot URLs come from the anchors' href attribute.
    image_url_list = [
        anchor.get('href')
        for anchor in soup.find_all('a', class_='highlight_screenshot_link')
        if anchor.get('href')
    ]

    # Thumbnails come from the <img> nested in each screenshot strip item.
    image_small_url_list = []
    for item in soup.find_all('div', class_='highlight_strip_item highlight_strip_screenshot'):
        thumb = item.find('img')
        if thumb is not None and thumb.get('src'):
            image_small_url_list.append(thumb.get('src'))

    video_links = soup.find_all('div', class_='highlight_player_item highlight_movie')
    video_urls_hd_list = [v_link['data-mp4-hd-source'] for v_link in video_links if 'data-mp4-hd-source' in v_link.attrs]
    video_urls_480p_list = [v_link['data-mp4-source'] for v_link in video_links if 'data-mp4-source' in v_link.attrs]

    return header_url, image_url_list, image_small_url_list, video_urls_hd_list, video_urls_480p_list


In [15]:
def find_requirements(soup):
    """Extract system requirements.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of the game page, or None
            when the page could not be fetched.

    Returns:
        list: Non-empty, stripped text lines of the 'sysreq_tabs' element, or
        None if the element is not found.
    """
    if soup is None:
        return None
    block = soup.find('div', class_='sysreq_tabs')
    if block is None:
        return None
    # With strip=True and the default empty separator, get_text() glues all
    # text nodes together, leaving no node boundaries to split on. Joining
    # them with '\n' makes the split below meaningful.
    lines = block.get_text(separator='\n', strip=True).split('\n')
    return [item.strip() for item in lines if item.strip()]


In [16]:
def find_languages(soup):
    """Extract list of supported languages.

    Parameters:
        soup (BeautifulSoup): Parsed HTML content of the game page, or None
            when the page could not be fetched.

    Returns:
        list: Stripped text of every ``td.ellipsis`` cell; an empty list when
        there are none or when ``soup`` is None.
    """
    # A failed page fetch yields soup=None. Return an empty list rather than
    # raising AttributeError, like the other finders tolerate a missing page.
    if soup is None:
        return []
    return [td.get_text(strip=True) for td in soup.select("td.ellipsis")]


In [17]:
def extract_data(appid):
    """Scrape one store page and collect its fields into a flat record.

    Sleeps for one second, downloads the store page of ``appid``, then runs
    each ``find_*`` helper on the parsed page.

    Parameters:
        appid: Steam application id used to build the store URL.

    Returns:
        dict: One record per app; the keys become the CSV columns.
    """
    # Fixed one-second pause before every page request.
    time.sleep(1)
    soup = get_AppData(f"https://store.steampowered.com/app/{appid}/")

    general = find_general_details(soup)
    people = find_developer_publisher_details(soup)
    pricing = find_price(soup)
    reviews = find_review_count(soup)
    media = find_media_links(soup)
    software = find_requirements(soup)
    languages = find_languages(soup)

    title, description, content, genre, player, tags_list, release_date = general
    developer, publisher, dev_followers, pub_followers = people
    price, discount_prices = pricing
    month_reviews, pos_ratio_month, total_reviews, pos_ratio_all = reviews
    header_url, image_url_list, image_small_url_list, video_urls_hd_list, video_urls_480p_list = media

    record = {}
    record['app_id'] = appid
    record['title'] = title
    record['description'] = description
    record['content'] = content
    record['developer'] = developer
    record['publisher'] = publisher
    record['dev_followers'] = dev_followers
    record['pub_followers'] = pub_followers
    record['genre'] = genre
    record['release_date'] = release_date
    record['price_usd'] = price
    record['discount_price'] = discount_prices
    record['software'] = software
    record['player'] = player
    record['languages'] = languages
    record['month_reviews'] = month_reviews
    record['positive_ratio_month'] = pos_ratio_month
    record['total_reviews'] = total_reviews
    record['positive_ratio_all'] = pos_ratio_all
    record['tags_list'] = tags_list
    record['header_url'] = header_url
    record['image_url_list'] = image_url_list
    record['image_small_url_list'] = image_small_url_list
    record['video_urls_hd_list'] = video_urls_hd_list
    record['video_urls_480p_list'] = video_urls_480p_list
    return record
    
    


In [18]:
# test_id = ['2878980', '10', '1293830', '306130']
# for i in test_id:
#     display(extract_data(i))

In [19]:
def save_batch_to_csv(batch_number, df, directory='scraped_data_1'):
    """Write one batch of scraped rows to its own CSV file.

    The output directory is created when it does not exist yet, and the file
    is named ``steam_games_data_batch_<batch_number>.csv``.

    Parameters:
        batch_number (int): Batch identifier used in the file name.
        df (pd.DataFrame): Rows belonging to the batch.
        directory (str): Folder that receives the file. Defaults to 'scraped_data_1'.
    """
    target_name = f'steam_games_data_batch_{batch_number}.csv'
    os.makedirs(directory, exist_ok=True)
    destination = os.path.join(directory, target_name)
    # The index is left out; only the data columns are written.
    df.to_csv(destination, index=False)
    print(f"Batch {batch_number} saved successfully to {destination}")


In [20]:
def process_batch(batch_number, app_ids, max_workers=10):
    """Process a batch of app IDs and save the results to a CSV.

    Each app id is scraped on a worker thread. An app whose scrape raises is
    reported and stored as a row holding only its ``app_id``, so one bad page
    no longer discards the whole batch.

    Parameters:
        batch_number (int): Batch number for file naming.
        app_ids (list): List of app IDs to process.
        max_workers (int): Maximum number of threads for concurrent execution. Defaults to 10.
    """
    list_of_data = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its app id so failures can be attributed.
        futures = {executor.submit(extract_data, app_id): app_id for app_id in app_ids}
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Processing Batch {batch_number}"):
            try:
                list_of_data.append(future.result())
            except Exception as exc:
                # future.result() re-raises whatever the worker raised.
                failed_id = futures[future]
                print(f"Failed to process app {failed_id}: {exc}")
                list_of_data.append({'app_id': failed_id})
    # Rows are in completion order, which is not necessarily the input order.
    df_scraped = pd.DataFrame(list_of_data)
    save_batch_to_csv(batch_number, df_scraped)

In [21]:
def count_files(dir_path):
    """Count the number of files in a given directory.

    Only direct children that are regular files are counted; sub-directories
    are ignored.

    Parameters:
        dir_path (str): Path to the directory.

    Returns:
        int: Number of files in the directory, or 0 when the directory does
        not exist (for example before the first batch has been saved).
    """
    if not os.path.isdir(dir_path):
        return 0
    with os.scandir(dir_path) as entries:
        return sum(1 for entry in entries if entry.is_file())

In [None]:
# Resume point: the number of files already in ./scraped_data_1 is used as the
# index of the first batch to run (this assumes one file per finished batch).
batch_size = 1000
starting_batch = count_files('./scraped_data_1')
a, b = starting_batch * batch_size, len(app_ids_df)

# Walk the app-id column in windows of `batch_size` rows, starting at offset `a`.
for batch_start in range(a, b, batch_size):
    batch_number = batch_start // batch_size
    batch_end = batch_start + batch_size
    ids = app_ids_df['appid'][batch_start:batch_end].to_list()
    process_batch(batch_number, ids)


Processing Batch 9:   2%|█▏                                                          | 20/1000 [00:55<14:01,  1.16it/s]

Failed to fetch data for URL https://store.steampowered.com/developer/ArcForged?snr=1_5_9__2000: HTTPSConnectionPool(host='store.steampowered.com', port=443): Read timed out. (read timeout=10)


Processing Batch 9:   2%|█▍                                                          | 24/1000 [01:17<52:14,  3.21s/it]

Failed to fetch data for URL https://store.steampowered.com/app/1617948/: HTTPSConnectionPool(host='store.steampowered.com', port=443): Read timed out. (read timeout=10)
Failed to fetch data for URL https://store.steampowered.com/search/?publisher=Wankil%20Studio&snr=1_5_9__422: HTTPSConnectionPool(host='store.steampowered.com', port=443): Max retries exceeded with url: /search/?publisher=Wankil%20Studio&snr=1_5_9__422 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000021CBE90E880>, 'Connection to store.steampowered.com timed out. (connect timeout=10)'))





Failed to fetch data for URL https://store.steampowered.com/app/1617949/: HTTPSConnectionPool(host='store.steampowered.com', port=443): Max retries exceeded with url: /app/1617949/ (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000021CC378A490>, 'Connection to store.steampowered.com timed out. (connect timeout=10)'))Failed to fetch data for URL https://store.steampowered.com/search/?publisher=Wankil%20Studio&snr=1_5_9__422: HTTPSConnectionPool(host='store.steampowered.com', port=443): Max retries exceeded with url: /search/?publisher=Wankil%20Studio&snr=1_5_9__422 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x0000021CC3796A90>, 'Connection to store.steampowered.com timed out. (connect timeout=10)'))

Failed to fetch data for URL https://store.steampowered.com/search/?publisher=Wankil%20Studio&snr=1_5_9__422: HTTPSConnectionPool(host='store.steampowered.com', port=443): Max retries exceeded with url: /search/?publisher=Wa

In [None]:
# Open the global top-sellers chart in a Chrome session; the matching driver
# binary is resolved by webdriver_manager.
url = "https://store.steampowered.com/charts/topselling/global"
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)
driver.get(url)


In [None]:
# The table may not be populated yet right after driver.get(), so block (up to
# 15 s) until at least one row exists inside a <tbody> before reading it.
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "tbody tr"))
)
tbody_element = driver.find_element(By.TAG_NAME, "tbody")
tbody_html = tbody_element.get_attribute("outerHTML")
# Hand the rendered rows to BeautifulSoup so they can be parsed offline.
soup = BeautifulSoup(tbody_html, 'lxml')

In [None]:
app_ids = []
ranks = []
urls = []

# Loop through each row in the <tbody> tag.
# Every row contributes exactly one entry to each list (None when a piece is
# missing) so the three columns always have the same length.
for row in soup.find_all('tr', class_="_2-RN6nWOY56sNmcDHu069P"):
    # Extract the rank
    rank_cell = row.find('td', class_="_34h48M_x9S-9Q2FFPX_CcU")
    ranks.append(rank_cell.get_text(strip=True) if rank_cell is not None else None)

    # Extract the URL
    link = row.find('a', class_="_2C5PJOUH6RqyuBNEwaCE9X")
    url = link.get('href') if link is not None else None
    urls.append(url)

    # Extract app id from the URL using regex
    app_id = re.search(r'/app/(\d+)/', url) if url else None
    app_ids.append(app_id.group(1) if app_id else None)

# Create a DataFrame
df_top_100 = pd.DataFrame({
    'appid': app_ids,
    'rank': ranks,
    'url': urls
})

# Save the DataFrame to a CSV file
df_top_100.to_csv("top_100_games.csv", index=False)

print("Data saved to top_100_games.csv")

In [None]:
# Preview the last rows of the scraped chart table.
df_top_100.tail()

In [None]:
# Load the chart table from top_100_games.csv.
df_100 = pd.read_csv('top_100_games.csv')

In [None]:
# One row per appid on each side, then keep only the apps present in both
# tables, ordered by ascending chart rank.
df1_unique = app_ids_df.drop_duplicates(subset='appid')
df2_unique = df_100.drop_duplicates(subset='appid')
common_apps = (
    pd.merge(df1_unique, df2_unique, how='inner', on='appid')
    .sort_values(by='rank', ascending=True)
)

In [None]:
# Display the merged table of apps found in both the catalogue and the chart.
common_apps