# This notebook scrapes the top sellers among the special deals on Steam

The resulting dataset contains the game name, the original and discounted prices, the discount percentage, the game rating and number of votes, the game's tags, and the release year.

https://youtu.be/oKk3dplKLVg
https://stackoverflow.com/questions/69425134/i-am-trying-to-scrape-multiple-pages-using-beautfiul-soup-but-the-code-keeps-ret
https://stackoverflow.com/questions/48490940/beautifulsoup-steam-market-web-scraping-errors

In [1]:
import requests
import csv
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import concurrent.futures
import threading
import time
import json

In [2]:
# Paginated JSON endpoint for Steam's "specials / TopSellers" listing.
# The query string asks for start=0, count=15, cc=CA and l=english.
api_url = (
    "https://store.steampowered.com/contenthub/querypaginated/specials/TopSellers/render/?query=&start=0&count=15&cc=CA&l=english&v=4&tag="
)

In [3]:
# Get the data (only the resulting html page from JSON)
def get_data(url, timeout=30):
    """Fetch one page of the paginated listing and return its HTML fragment.

    Parameters
    ----------
    url : str
        Full URL of the JSON endpoint to query.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    str
        The value stored under the ``results_html`` key of the JSON payload.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    KeyError
        If the payload has no ``results_html`` key.
    """
    # Without a timeout a stalled connection would block the calling
    # (worker) thread indefinitely.
    r = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to decode an error page as JSON
    r.raise_for_status()
    # r.json() already returns the decoded payload; no extra dict() copy needed
    return r.json()['results_html']

# Sanity check: download the first page and show the raw HTML fragment
first_page_html = get_data(api_url)
print(first_page_html)

	<a href="https://store.steampowered.com/app/597180/Old_World/?snr=1_2300_4__104_1" class="tab_item  "  data-ds-appid="597180" data-ds-itemkey="App_597180" data-ds-tagids="[9,4364,1670,1741,3987,1678,1708]" data-ds-crtrids="[33019866,38719421]" onmouseover="GameHover( this, event, 'global_hover', {&quot;type&quot;:&quot;app&quot;,&quot;id&quot;:597180,&quot;params&quot;:{&quot;bDisableHover&quot;:false},&quot;public&quot;:1,&quot;v6&quot;:1} );" onmouseout="HideGameHover( this, event, 'global_hover' )">
		<div class="tab_item_cap">
			<img class="tab_item_cap_img" src="https://cdn.cloudflare.steamstatic.com/steam/apps/597180/capsule_184x69.jpg?t=1653035882" >
		</div>
				<div class="discount_block tab_item_discount" data-price-final="4499"><div class="discount_pct">-10%</div><div class="discount_prices"><div class="discount_original_price">CDN$ 49.99</div><div class="discount_final_price">CDN$ 44.99</div></div></div>		<div class="tab_item_content">
			<div class="tab_item_name">O

In [4]:
# Get the total number of results
def get_total_results(url, timeout=30):
    """Return the ``total_count`` field of the listing's JSON payload as an int.

    Parameters
    ----------
    url : str
        Full URL of the JSON endpoint to query.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    KeyError
        If the payload has no ``total_count`` key.
    """
    r = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing JSON/KeyError later
    r.raise_for_status()
    return int(r.json()['total_count'])

# The helper has its own name, so binding the count to `total_results`
# no longer overwrites the function that produced it.
total_results = get_total_results(api_url)
total_results

1579

In [5]:
# Get the number of games per page
def page_size(url, timeout=30):
    """Return the ``pagesize`` field of the listing's JSON payload as an int.

    Parameters
    ----------
    url : str
        Full URL of the JSON endpoint to query.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    KeyError
        If the payload has no ``pagesize`` key.
    """
    r = requests.get(url, timeout=timeout)
    # Surface HTTP errors here rather than as a confusing JSON/KeyError later
    r.raise_for_status()
    # Return directly; a local named `page_size` would shadow this function
    return int(r.json()['pagesize'])

# Page size reported by the API for `api_url`; used below as the pagination step
games_per_page = page_size(api_url)
games_per_page

15

In [6]:
# Parse to get all the game details
def parse(url):
    """Extract one row per game from a page's HTML fragment.

    Parameters
    ----------
    url : str
        Despite its name, this is the HTML markup of one result page (the
        string returned by ``get_data``), not a URL. The parameter name is
        kept so existing callers continue to work.

    Returns
    -------
    list of list
        One ``[title, price_before, discount_pct, price_after, link]`` entry
        per game. Prices and the discount percentage are strings with the
        ``CDN$`` prefix, ``-``/``%`` signs and surrounding whitespace removed.
        ``discount_pct`` is ``"0"`` and ``price_after == price_before`` when
        the entry has no complete discount block; both prices are ``None``
        when the entry shows no price at all.
    """
    game_list = []

    soup = BeautifulSoup(url, 'lxml')
    # Each game in the fragment is wrapped in its own <a> element
    for game in soup.find_all('a'):
        name_div = game.find('div', class_='tab_item_name')
        if name_div is None:
            # Anchor without a title block is not a game entry; skip it
            continue
        # Colons are dropped from titles, as in the original implementation
        title = name_div.text.replace(":", "")

        cur_before_disc = _clean_price(game.find('div', class_='discount_original_price'))
        cur_after_disc = _clean_price(game.find('div', class_='discount_final_price'))
        pct_div = game.find('div', class_='discount_pct')

        if cur_before_disc is None or cur_after_disc is None or pct_div is None:
            # Incomplete discount block: treat the game as not discounted and
            # use whichever price is present for both columns.
            cur_discount_pct = "0"
            if cur_before_disc is None:
                cur_before_disc = cur_after_disc
            cur_after_disc = cur_before_disc
        else:
            cur_discount_pct = pct_div.text.replace("-", "").replace("%", "").strip()

        link = game['href']
        game_list.append([title, cur_before_disc, cur_discount_pct, cur_after_disc, link])

    return game_list


def _clean_price(price_div):
    """Return the price text of ``price_div`` without the ``CDN$`` prefix, or None if the div is missing."""
    if price_div is None:
        return None
    # Strip AFTER removing the prefix so the space that follows "CDN$" goes too
    return price_div.text.replace("CDN$", "").strip()

In [7]:
# Build one API query per result page. `count` is tied to `games_per_page`
# so the requested page size can never drift from the range step.
page_urls = [
    f"https://store.steampowered.com/contenthub/querypaginated/specials/TopSellers/render/?query=&start={start}&count={games_per_page}&cc=CA&l=english&v=4&tag="
    for start in range(0, total_results, games_per_page)
]

# Download every page concurrently. executor.map() yields results in the same
# order as `page_urls`; list() collects them (and re-raises any worker
# exception) while the pool is still open.
with concurrent.futures.ThreadPoolExecutor() as executor:
    pages = list(executor.map(get_data, page_urls))
    

In [8]:
# Run parse() over every downloaded page using a thread pool
with concurrent.futures.ThreadPoolExecutor() as executor:
    results = executor.map(parse, pages)

# Flatten the per-page row lists into a single list of rows
game_rows = [row for page_rows in results for row in page_rows]

# Turn the rows into the final table and display it
final_list = pd.DataFrame(
    game_rows,
    columns=['Name', 'Before Discount $', "Discount Percentage %", 'After Discount $', 'Link'],
)
final_list

Unnamed: 0,Name,Before Discount $,Discount Percentage %,After Discount $,Link
0,Old World,49.99,10,44.99,https://store.steampowered.com/app/597180/Old_...
1,Little Witch in the Woods,18.49,10,16.64,https://store.steampowered.com/app/1594940/Lit...
2,Expeditions Rome,44.99,25,33.74,https://store.steampowered.com/app/987840/Expe...
3,State of Decay 2 Juggernaut Edition,39.99,67,13.19,https://store.steampowered.com/app/495420/Stat...
4,Crusader Kings III,56.99,20,45.59,https://store.steampowered.com/app/1158310/Cru...
...,...,...,...,...,...
1573,Rift's Cave,1.19,50,0.59,https://store.steampowered.com/app/335210/Rift...
1574,Tetropunk,3.99,85,0.59,https://store.steampowered.com/app/540190/Tetr...
1575,Zombie Killer Drift - Racing Survival,5.69,90,0.56,https://store.steampowered.com/app/1720950/Zom...
1576,MechDefender - Tower Defense,5.69,90,0.56,https://store.steampowered.com/app/562730/Mech...


In [9]:
# Write the scraped table to a CSV file, leaving out the row index
csv_path = 'Games Special.csv'
final_list.to_csv(csv_path, index=False)