In [8]:
import os
import requests
import selenium
from selenium import webdriver 
from selenium.webdriver.chrome.service import Service
import webdriver_manager
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import json
import re
import time
import tqdm
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed


In [9]:
def get_AppList(version, timeout=30):
    """Fetch the Steam application catalogue from ISteamApps/GetAppList.

    Args:
        version: API version segment inserted into the URL (this notebook
            calls it with 'v1' and 'v2').
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The list of app records found in the response, or None when the
        request fails or the payload does not have the expected layout.
    """
    url = f"https://api.steampowered.com/ISteamApps/GetAppList/{version}/"
    try:
        # Without a timeout a stalled connection would block this cell forever.
        r = requests.get(url, timeout=timeout)
        r.raise_for_status()
        apps = r.json()['applist']['apps']
        # One layout wraps the records as {'app': [...]}, the other returns the
        # list directly. Branch on the payload shape instead of the version
        # string so that other version spellings do not index a list with 'app'.
        if isinstance(apps, dict):
            return apps['app']
        return apps
    except (requests.exceptions.RequestException, KeyError, TypeError, ValueError) as e:
        # Still best-effort (callers get None), but say why instead of failing silently.
        print(f"GetAppList {version} failed: {e!r}")
        return None

In [10]:
v1, v2 = 'v1', 'v2'

# Pull the catalogue through both API versions; each call yields one frame.
df1 = pd.DataFrame(get_AppList(v1))

apps = get_AppList(v2)
df2 = pd.DataFrame(apps)

# Stack both frames, keep the first row seen for every appid, and persist the result.
combined_apps = pd.concat([df1, df2], ignore_index=True)
app_ids_df = combined_apps.drop_duplicates(subset='appid')
app_ids_df.to_csv('all_steam_games.csv', index=False)

In [11]:
# Preview the last rows of the de-duplicated app catalogue.
app_ids_df.tail()

Unnamed: 0,appid,name
219689,3133560,Demon Quest
219690,3230280,Horizons: The End Of Words
219691,738700,Secret of the Rendrasha Blade
219692,2435600,The Dark Tones: Loss
219693,2580020,THRESHOLD


In [12]:
# df1.head()

In [13]:
# df2.head()

In [14]:
def get_AppData(url, timeout=30):
    """Download a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The parsed document, or None when the download or parsing raises.
        The HTTP status code is not checked, so an error page is parsed and
        returned like any other page.
    """
    try:
        # A timeout keeps one stalled connection from blocking a worker thread forever.
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')
        return soup
    except Exception as e:
        # Best-effort scraper: report which URL failed and let the caller see None.
        print(f"Failed to fetch {url}: {e}")
        return None

In [15]:
def find_general_details(soup):
    """Extract the headline fields of a store page.

    Args:
        soup: Parsed store page, or None when the page could not be fetched.

    Returns:
        A 7-tuple ``(title, description, content, genre, player, tags_list,
        release_date)``. Every element is None when its markup is missing;
        all seven are None when ``soup`` is None.
    """
    if soup is None:
        # get_AppData returns None on a failed download; report "nothing found"
        # instead of raising AttributeError on the first lookup.
        return None, None, None, None, None, None, None

    title_div = soup.find('div', class_='apphub_AppName')
    title = title_div.text if title_div is not None else None

    snippet_div = soup.find('div', class_='game_description_snippet')
    description = snippet_div.text.strip() if snippet_div is not None else None

    # Each chained lookup below fails with AttributeError as soon as one
    # element in the chain is absent (find() returned None).
    try:
        content = soup.find('div', class_='shared_game_rating').find('p').text.strip()
    except AttributeError:
        content = None

    # NOTE(review): this collects every link inside the first 'details_block',
    # which may include more than genre links - confirm against a live page.
    details_block = soup.find('div', class_='details_block')
    if details_block is not None:
        genre = [link.text for link in details_block.find_all('a')]
    else:
        genre = None

    try:
        release_date = soup.find('div', class_='date').text.strip().replace(',', '')
    except AttributeError:
        release_date = None

    tags_div = soup.find('div', class_='glance_tags popular_tags')
    tag_links = tags_div.find_all('a') if tags_div is not None else []
    # An empty tag list is reported as None, matching the other "missing" fields.
    tags_list = [tag.text.strip() for tag in tag_links] or None

    try:
        player = soup.find('a', class_='game_area_details_specs_ctn').find('div', class_='label').text.strip()
    except AttributeError:
        player = None

    return title, description, content, genre, player, tags_list, release_date

In [16]:
def extract_details(div, index):
    """Return ``(link, name)`` for the row at ``index``, or ``(None, None)``.

    The row's 'summary column' div supplies both values: the href of its
    first anchor and its own stripped text. A missing row, div or anchor
    yields ``(None, None)``.
    """
    try:
        summary = div[index].find('div', class_='summary column')
        anchor = summary.find('a')
        return anchor.get('href'), summary.text.strip()
    except (AttributeError, IndexError):
        return None, None

def fetch_followers(link):
    """Download ``link`` and return its follower count as a digit string.

    Returns None for an empty link, and also when the page cannot be loaded
    or has no 'num_followers' div. Thousands separators are removed.
    """
    if link:
        try:
            followers_div = get_AppData(link).find('div', class_="num_followers")
            return followers_div.text.strip().replace(',', '')
        except Exception:
            return None
    return None

def find_developer_publisher_details(soup):
    """Return ``(developer, publisher, dev_followers, pub_followers)``.

    The first 'dev_row' inside the 'glance_ctn_responsive_left' div is read as
    the developer and the second as the publisher; each one's link is then
    downloaded to read its follower count. Any failure while locating the
    rows makes all four values None.
    """
    try:
        glance_column = soup.find('div', class_="glance_ctn_responsive_left")
        dev_rows = glance_column.find_all('div', 'dev_row')

        dev_link, developer = extract_details(dev_rows, 0)
        pub_link, publisher = extract_details(dev_rows, 1)

        followers = [fetch_followers(link) for link in (dev_link, pub_link)]
    except Exception:
        return None, None, None, None

    dev_followers, pub_followers = followers
    return developer, publisher, dev_followers, pub_followers


In [17]:
def find_price(soup):
    """Return ``(price, discount_prices)`` scraped from a store page.

    ``price`` is the cleaned text of the first 'game_purchase_price price'
    div, or None when it is absent. ``discount_prices`` lists the cleaned
    text of every 'discount_final_price' div, or is None when there are none.
    Cleaning removes '$', ',' and 'USD' and trims whitespace; the values stay
    strings.
    """
    def _clean(raw):
        return raw.replace('$', '').replace(',', '').replace('USD', '').strip()

    # Regular price
    try:
        price = _clean(soup.find('div', class_='game_purchase_price price').text)
    except AttributeError:
        price = None

    # Discounted prices
    discount_prices = []
    try:
        discount_prices = [
            _clean(tag.text)
            for tag in soup.find_all('div', class_='discount_final_price')
        ]
    except Exception as e:
        print(f"Error extracting discount prices: {e}")

    if not discount_prices:
        return price, None
    return price, discount_prices


In [18]:
def _parse_review_summary(span):
    """Return ``(positive_percent, review_count)`` read from one summary span.

    The first two comma-grouped integers in the span's text are taken as the
    positive percentage and the review count, in that order. Returns
    ``(None, None)`` when the span is missing or holds fewer than two numbers.
    """
    try:
        numbers = re.findall(r'\d{1,3}(?:,\d{3})*', span.text.strip())
        parsed_numbers = [int(num.replace(',', '')) for num in numbers]
        return parsed_numbers[0], parsed_numbers[1]
    except (AttributeError, IndexError):
        return None, None


def find_review_count(soup):
    """Extract recent and all-time review statistics from a store page.

    Args:
        soup: Parsed store page, or None when the page could not be fetched.

    Returns:
        ``(month_reviews, positive_review_ratio_month, total_reviews,
        positive_review_ratio_all_time)``. A value is None when its summary
        is absent or cannot be parsed.
    """
    if soup is None:
        return None, None, None, None

    review_spans = soup.find_all('span', class_="nonresponsive_hidden responsive_reviewdesc")

    if len(review_spans) >= 2:
        month_span, all_time_span = review_spans[0], review_spans[1]
    elif len(review_spans) == 1:
        # Previously a single span raised IndexError and discarded everything.
        # NOTE(review): a lone span is assumed to be the all-time summary
        # (no recent-reviews row on the page) - confirm against live pages.
        month_span, all_time_span = None, review_spans[0]
    else:
        month_span, all_time_span = None, None

    # Parse each summary independently so a failure in one does not wipe the other.
    positive_review_ratio_month, month_reviews = _parse_review_summary(month_span)
    positive_review_ratio_all_time, total_reviews = _parse_review_summary(all_time_span)

    return month_reviews, positive_review_ratio_month, total_reviews, positive_review_ratio_all_time

In [19]:
def find_media_links(soup):
    """Collect header, screenshot and trailer URLs from a store page.

    Args:
        soup: Parsed store page, or None when the page could not be fetched.

    Returns:
        ``(header_url, image_url_list, image_small_url_list,
        video_urls_hd_list, video_urls_480p_list)``. Each element is None when
        nothing was found for it. The two video lists have one entry per
        'highlight_movie' element; an entry is None when that element lacks
        the corresponding ``data-mp4-*`` attribute.
    """
    if soup is None:
        # A failed download must not crash the caller.
        return None, None, None, None, None

    # Errors that mean "the markup is not there": attribute access on None,
    # a missing tag attribute, or subscripting None.
    lookup_errors = (AttributeError, KeyError, TypeError)

    try:
        header_url = soup.find('img', class_='game_header_image_full')['src']
    except lookup_errors:
        header_url = None

    try:
        image_link = soup.find_all('a', class_='highlight_screenshot_link')
        image_url_list = [img['href'] for img in image_link] if image_link else None
    except lookup_errors:
        image_url_list = None

    try:
        image_link_bar = soup.find_all('div', class_='highlight_strip_item highlight_strip_screenshot')
        image_small_url_list = [img.find('img')['src'] for img in image_link_bar] if image_link_bar else None
    except lookup_errors:
        image_small_url_list = None

    # This section used to sit outside any try block and index the attributes
    # directly, so one trailer without a data-mp4-* attribute raised KeyError.
    try:
        video_link = soup.find_all('div', class_='highlight_player_item highlight_movie')
        video_urls_hd_list = [v_link.get('data-mp4-hd-source') for v_link in video_link] if video_link else None
        video_urls_480p_list = [v_link.get('data-mp4-source') for v_link in video_link] if video_link else None
    except lookup_errors:
        video_urls_hd_list = None
        video_urls_480p_list = None

    return header_url, image_url_list, image_small_url_list, video_urls_hd_list, video_urls_480p_list

In [20]:
def find_requirements(soup):
    """Return the stripped text lines of the 'sysreq_tabs' div, or None.

    The div's text is trimmed of surrounding apostrophes and whitespace,
    split on newlines, and each piece stripped. Any failure (for example a
    page without that div) yields None.
    """
    try:
        tabs_text = soup.find('div', class_='sysreq_tabs').text
        software = [entry.strip() for entry in tabs_text.strip("'").strip().split('\n')]
    except Exception:
        software = None

    return software

In [21]:
def find_languages(soup):
    """Return the stripped text of every ``td.ellipsis`` cell on the page.

    Args:
        soup: Parsed store page, or None when the page could not be fetched.

    Returns:
        A list of strings; empty when no matching cell exists or when
        ``soup`` is None (previously None raised AttributeError).
    """
    if soup is None:
        return []
    return [td.get_text(strip=True) for td in soup.select("td.ellipsis")]

In [22]:
def extract_data(appid):
    """Scrape one store page and return its fields as a flat dict.

    Args:
        appid: Application id placed in the store URL.

    Returns:
        A dict with the same keys, in the same order, on every call. When the
        page cannot be downloaded, every field except 'app_id' is None.
    """
    time.sleep(1)  # fixed pause before every request, applied in each worker thread
    url = f"https://store.steampowered.com/app/{appid}/"
    soup = get_AppData(url)

    if soup is None:
        # get_AppData returns None on a failed download. Emit an empty row for
        # this id rather than letting a helper raise and abort the whole batch.
        columns = (
            'app_id', 'title', 'description', 'content', 'developer', 'publisher',
            'dev_followers', 'pub_followers', 'genre', 'release_date', 'price_usd',
            'discount_price', 'software', 'player', 'languages', 'month_reviews',
            'positive_ratio_month', 'total_reviews', 'positive_ratio_all',
            'tags_list', 'header_url', 'image_url_list', 'image_small_url_list',
            'video_urls_hd_list', 'video_urls_480p_list',
        )
        row = dict.fromkeys(columns)
        row['app_id'] = appid
        return row

    title, description, content, genre, player, tags_list, release_date = find_general_details(soup)
    developer, publisher, dev_followers, pub_followers = find_developer_publisher_details(soup)
    price, discount_prices = find_price(soup)
    month_reviews, pos_ratio_month, total_reviews, pos_ratio_all = find_review_count(soup)
    header_url, image_url_list, image_small_url_list, video_urls_hd_list, video_urls_480p_list = find_media_links(soup)
    software = find_requirements(soup)
    languages = find_languages(soup)

    return {
        'app_id': appid,
        'title': title,
        'description': description,
        'content': content,
        'developer': developer,
        'publisher': publisher,
        'dev_followers': dev_followers,
        'pub_followers': pub_followers,
        'genre': genre,
        'release_date': release_date,
        'price_usd': price,
        'discount_price': discount_prices,
        'software': software,
        'player': player,
        'languages': languages,
        'month_reviews': month_reviews,
        'positive_ratio_month': pos_ratio_month,
        'total_reviews': total_reviews,
        'positive_ratio_all': pos_ratio_all,
        'tags_list': tags_list,
        'header_url': header_url,
        'image_url_list': image_url_list,
        'image_small_url_list': image_small_url_list,
        'video_urls_hd_list': video_urls_hd_list,
        'video_urls_480p_list': video_urls_480p_list,
    }
    
    


In [23]:
#test_id = ['2878980', '10', '1293830', '306130']
# for i in test_id:
#     display(extract_data(i))

In [None]:
def save_batch_to_csv(batch_number, df):
    """Write one batch frame to ``scraped_data_1/`` and report the path.

    The output directory is created on demand; the file name embeds
    ``batch_number`` and the frame is written without its index.
    """
    os.makedirs('scraped_data_1', exist_ok=True)
    csv_path = f'scraped_data_1/steam_games_data_batch_{batch_number}.csv'
    df.to_csv(csv_path, index=False)
    print(f"Batch {batch_number} saved successfully to {csv_path}")

def process_batch(batch_number, app_ids):
    """Scrape every id in ``app_ids`` concurrently and save the batch as CSV.

    Args:
        batch_number: Number used in the progress label and the output file name.
        app_ids: Iterable of application ids to scrape.

    Rows are collected in completion order, not input order. If scraping an
    id raises, the error is printed and a row holding only that 'app_id' is
    recorded, so one bad page no longer discards the rest of the batch.
    """
    list_of_data = []

    with ThreadPoolExecutor(max_workers=10) as executor:
        # Remember which id each future belongs to so failures can be attributed.
        future_to_id = {executor.submit(extract_data, app_id): app_id for app_id in app_ids}
        for future in tqdm(as_completed(future_to_id), total=len(future_to_id), desc=f"Processing Batch {batch_number}"):
            app_id = future_to_id[future]
            try:
                list_of_data.append(future.result())
            except Exception as e:
                print(f"app {app_id} failed: {e!r}")
                list_of_data.append({'app_id': app_id})

    df_scraped = pd.DataFrame(list_of_data)
    save_batch_to_csv(batch_number, df_scraped)
def count_files(dir_path):
    """Count the regular files directly inside ``dir_path``.

    Sub-directories are not counted and not descended into. A directory that
    does not exist counts as 0 instead of raising FileNotFoundError; the
    output folder is only created when the first batch is saved, so it is
    absent on a fresh run.
    """
    if not os.path.isdir(dir_path):
        return 0
    with os.scandir(dir_path) as entries:
        return sum(1 for entry in entries if entry.is_file())

# Resume point: the number of files already in the output folder is taken as
# the number of finished batches.
batch_size = 1000
starting_batch = count_files('./scraped_data_1')
a = starting_batch * batch_size
b = len(app_ids_df)

total_batches = -(-b // batch_size)  # ceiling division: batches needed to cover b ids
for batch_number in range(starting_batch, total_batches):
    batch_start = batch_number * batch_size
    ids = app_ids_df['appid'].iloc[batch_start:batch_start + batch_size].to_list()
    process_batch(batch_number, ids)


Processing Batch 1:   8%|█████                                                       | 84/1000 [01:22<12:05,  1.26it/s]

In [None]:
# Launch Chrome through a driver binary resolved by webdriver_manager and open
# the global top-sellers chart.
url = "https://store.steampowered.com/charts/topselling/global"
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)
driver.get(url)


In [None]:
# Wait until the table has at least one row before reading it; looking the
# element up right after driver.get() can run before the chart is in the DOM.
# NOTE(review): assumes the chart rows are filled in by page scripts - confirm.
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "tbody tr"))
)
tbody_element = driver.find_element(By.TAG_NAME, "tbody")
tbody_html = tbody_element.get_attribute("outerHTML")
# Hand the captured markup to BeautifulSoup so later cells parse it offline.
soup = BeautifulSoup(tbody_html, 'lxml')

In [None]:
app_ids = []
ranks = []
urls = []

# Walk the chart rows. Exactly one entry is appended to each list per row, so
# the three lists always have equal length; appending the rank unconditionally
# but the url/app id only when a link existed could make pd.DataFrame raise.
# NOTE(review): the class names look machine-generated - verify they still match.
for row in soup.find_all('tr', class_="_2-RN6nWOY56sNmcDHu069P"):
    rank_cell = row.find('td', class_="_34h48M_x9S-9Q2FFPX_CcU")
    link = row.find('a', class_="_2C5PJOUH6RqyuBNEwaCE9X")

    # A separate name keeps the loop from overwriting the module-level `url`.
    href = link.get('href') if link else None
    # Extract the app id from the URL using regex
    id_match = re.search(r'/app/(\d+)/', href) if href else None

    ranks.append(rank_cell.get_text(strip=True) if rank_cell else None)
    urls.append(href)
    app_ids.append(id_match.group(1) if id_match else None)

# Create a DataFrame
df_top_100 = pd.DataFrame({
    'appid': app_ids,
    'rank': ranks,
    'url': urls
})

# Save the DataFrame to a CSV file
df_top_100.to_csv("top_100_games.csv", index=False)

print("Data saved to top_100_games.csv")

In [None]:
# Preview the last rows of the top-sellers table.
df_top_100.tail()

In [None]:
# Load the top-sellers table from top_100_games.csv; column dtypes are whatever read_csv infers.
df_100 = pd.read_csv('top_100_games.csv')

In [None]:
# Keep one row per appid on each side, then keep only the apps present in
# both tables and order them by chart rank.
df1_unique = app_ids_df.drop_duplicates(subset='appid')
df2_unique = df_100.drop_duplicates(subset='appid')
common_apps = (
    df1_unique
    .merge(df2_unique, on='appid', how='inner')
    .sort_values(by='rank')
)

In [None]:
# Display the catalogue entries that also appear in the top-sellers table, ordered by rank.
common_apps