# 🎮 Steam Scraper (robust fallbacks + retries)

Scrape 1k rows (sample) and 30–50k rows (full) with safe defaults (None → NaN).


In [4]:
!pip install requests beautifulsoup4 pandas tqdm



In [None]:
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm

# --- Request tuning knobs ---
REQUEST_TIMEOUT = 15          # per-request timeout passed to session.get, in seconds
RETRY_ATTEMPTS = 3            # maximum number of GET attempts per URL
DELAY_BETWEEN_REQUESTS = 0.7  # pause after each detail-page scrape, in seconds (be polite)

# --- Shared HTTP session carrying a browser-like User-Agent ---
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0 Safari/537.36"

def get_url(url):
    """Fetch ``url`` through the shared session, retrying on failure.

    Makes up to ``RETRY_ATTEMPTS`` GET requests, each bounded by
    ``REQUEST_TIMEOUT`` seconds. An attempt succeeds only when the response
    has status 200 and a non-empty body; any other response, or a
    ``requests.RequestException``, counts as a failed attempt. Failed
    attempts are separated by a pause that grows by 0.5 s each time
    (0.8 s, 1.3 s, ...).

    Args:
        url: URL to request.

    Returns:
        The successful response object, or ``None`` once every attempt has
        failed.
    """
    for attempt in range(RETRY_ATTEMPTS):
        try:
            resp = session.get(url, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 200 and resp.text:
                return resp
        except requests.RequestException:
            # Best-effort fetch: a network error is just another failed attempt.
            pass
        # Back off only when another attempt follows; sleeping after the last
        # failure would delay the caller without any chance of success.
        if attempt < RETRY_ATTEMPTS - 1:
            time.sleep(0.8 + attempt * 0.5)
    return None

def safe_text(node, selector=None):
    """Return stripped text from ``node`` or from its first ``selector`` match.

    With a truthy ``selector`` the text of ``node.select_one(selector)`` is
    returned; otherwise the text of ``node`` itself. Returns ``None`` when
    the target element is missing/falsy or when any exception is raised
    while looking it up.
    """
    try:
        if not selector:
            return node.get_text(strip=True) if node else None
        match = node.select_one(selector)
        if match:
            return match.get_text(strip=True)
        return None
    except Exception:
        # Deliberate catch-all: callers expect None instead of a parse error.
        return None

def safe_attr(node, selector, attr):
    """Return attribute ``attr`` of the first ``selector`` match under ``node``.

    Returns ``None`` when nothing matches, when the matched element does not
    carry the attribute, or when any exception is raised during the lookup.
    """
    try:
        match = node.select_one(selector)
        if match and match.has_attr(attr):
            return match.get(attr)
        return None
    except Exception:
        # Deliberate catch-all: callers expect None instead of a parse error.
        return None

def scrape_listing_page(start):
    """Scrape one top-sellers search page and return its rows as dicts.

    Args:
        start: Value placed in the ``start`` query parameter of the search
            URL (``count`` is fixed at 50).

    Returns:
        A list of dicts with keys ``title``, ``release_date``, ``price``,
        ``is_free`` and ``link``, one per ``.search_result_row`` element.
        Returns an empty list when the page could not be fetched. A row whose
        parsing raises is skipped.
    """
    url = f"https://store.steampowered.com/search/?filter=topsellers&start={start}&count=50"
    resp = get_url(url)
    if not resp:
        return []

    soup = BeautifulSoup(resp.text, 'html.parser')
    items = []
    for row in soup.select('.search_result_row'):
        try:
            items.append(_listing_row_to_item(row))
        except Exception:
            # Skip a malformed row but keep processing the rest of the page.
            continue
    return items


def _listing_row_to_item(row):
    """Extract the listing-level fields from a single search result row."""
    # Platform extraction (class names of '.search_name .platform_img'
    # elements) is currently disabled, so no 'platforms' key is produced.
    price = safe_text(row, '.discount_final_price')
    return {
        'title': safe_text(row, '.title'),
        'release_date': safe_text(row, '.search_released'),
        'price': price,
        # Free titles are recognised by the word "Free" in the price text.
        'is_free': (price is not None and 'Free' in price),
        'link': row.get('href') if row else None,
    }

def scrape_detail_fields(link):
    """Scrape publisher, developer, tags and review summary from a detail page.

    Args:
        link: URL of the detail page; a falsy value short-circuits.

    Returns:
        A dict with keys ``publisher``, ``developer``, ``tags`` and
        ``review_summary``. Every value defaults to ``None`` and stays
        ``None`` when the link is missing, the page cannot be fetched, the
        matching element is absent, or extracting that section raises.
    """
    result = dict.fromkeys(('publisher', 'developer', 'tags', 'review_summary'))
    if not link:
        return result
    resp = get_url(link)
    if not resp:
        return result
    soup = BeautifulSoup(resp.text, 'html.parser')

    # Developer / publisher: each '.dev_row' has a '.subtitle' label and one
    # link per name.
    try:
        for block in soup.select('.dev_row'):
            header = safe_text(block, '.subtitle') or ''
            anchor_texts = [a.get_text(strip=True) for a in block.select('a')]
            names = ', '.join(anchor_texts) or None
            for label, key in (('Developer', 'developer'), ('Publisher', 'publisher')):
                if label in header:
                    result[key] = names
    except Exception:
        # Best effort: keep whatever was collected before the failure.
        pass

    # Tags, joined into one comma-separated string.
    try:
        tag_elems = soup.select('.glance_tags.popular_tags a')
        if tag_elems:
            result['tags'] = ', '.join(t.get_text(strip=True) for t in tag_elems)
        else:
            result['tags'] = None
    except Exception:
        pass

    # Review summary, taken from the tooltip HTML attribute with <br> flattened.
    try:
        review = soup.select_one('.user_reviews_summary_row')
        if review and review.has_attr('data-tooltip-html'):
            tooltip = review.get('data-tooltip-html')
            if tooltip:
                result['review_summary'] = tooltip.replace('<br>', ' ')
    except Exception:
        pass

    return result

def scrape_block(start_range_end):
    """Scrape listing pages plus per-item detail pages into flat row dicts.

    Walks offsets ``0, 50, 100, ...`` below ``start_range_end`` (with a tqdm
    progress bar), scrapes each listing page, then scrapes the detail page
    of every listed item and merges both into one row. Sleeps
    ``DELAY_BETWEEN_REQUESTS`` seconds after each item.

    Args:
        start_range_end: Exclusive upper bound for the listing offset.

    Returns:
        A list of dicts with keys ``title``, ``release_date``, ``price``,
        ``is_free``, ``tags``, ``publisher``, ``developer`` and
        ``review_summary``; missing values are ``None``.
    """
    listing_keys = ('title', 'release_date', 'price', 'is_free')
    detail_keys = ('tags', 'publisher', 'developer', 'review_summary')

    collected = []
    for offset in tqdm(range(0, start_range_end, 50)):
        for listing in scrape_listing_page(offset):
            # Detail scraping has its own fallbacks and always returns a dict.
            detail = scrape_detail_fields(listing.get('link'))
            record = {key: listing.get(key) for key in listing_keys}
            record.update((key, detail.get(key)) for key in detail_keys)
            collected.append(record)
            time.sleep(DELAY_BETWEEN_REQUESTS)
    return collected

def to_csv_safe(rows, path):
    """Build a DataFrame from ``rows``, write it to ``path`` as CSV, return it.

    Args:
        rows: Row records passed straight to ``pd.DataFrame``.
        path: Destination of the CSV file; the index is not written.

    Returns:
        The DataFrame that was written.
    """
    frame = pd.DataFrame(data=rows)
    # No explicit None/NaN conversion happens here: missing values are
    # written using pandas' default CSV representation.
    frame.to_csv(path, index=False)
    return frame


## 🔹 Scrape 1k rows (fast sample)


In [6]:
# Sample run: listing offsets 0..950 in steps of 50, then save the rows to CSV.
sample_rows = scrape_block(1000)  # 20 pages × 50
df_sample = to_csv_safe(sample_rows, 'sample_data.csv')
df_sample.head()

100%|██████████| 20/20 [24:06<00:00, 72.30s/it]


Unnamed: 0,title,release_date,price,is_free,tags,publisher,developer,review_summary,platforms
0,PUBG: BATTLEGROUNDS,"21 Dec, 2017",Free,True,"Survival, Shooter, Battle Royale, Multiplayer,...","KRAFTON, Inc.",PUBG Corporation,"69% of the 14,331 user reviews in the last 30 ...",
1,Europa Universalis V,"4 Nov, 2025","₹3,259.00",False,"Resource Management, 4X, Grand Strategy, Tradi...",Paradox Interactive,Paradox Tinto,"77% of the 1,695 user reviews in your language...",
2,ARC Raiders,"30 Oct, 2025","₹2,467.00",False,"Extraction Shooter, PvP, Third-Person Shooter,...",Embark Studios,Embark Studios,"90% of the 30,400 user reviews in your languag...",
3,Counter-Strike 2,"21 Aug, 2012",Free,True,"FPS, Shooter, Multiplayer, Competitive, Action...",Valve,Valve,"81% of the 83,999 user reviews in the last 30 ...",
4,Apex Legends™,"4 Nov, 2020",Free,True,"Free to Play, Multiplayer, Battle Royale, FPS,...",Electronic Arts,Respawn,"66% of the 4,652 user reviews in the last 30 d...",


In [8]:
import pandas as pd

# Load the scraped sample written to sample_data.csv
df = pd.read_csv("sample_data.csv")

# Take a random sample of up to 200 rows (change number as you like).
# The size is capped at len(df): DataFrame.sample raises ValueError when n
# exceeds the number of rows and replace=False, which would happen if the
# scrape returned fewer rows than expected.
df_sample = df.sample(n=min(200, len(df)), random_state=42)

# OR: take the first 200 rows instead of random
# df_sample = df.head(200)

# Save to a new CSV
df_sample.to_csv("sample_200.csv", index=False)

print("Sample created with shape:", df_sample.shape)


Sample created with shape: (200, 9)


## 🔹 Scrape 30–50k rows (slow full dataset)

Adjust the range end: 30000 for ~30k or 50000 for ~50k.


In [9]:
# Exclusive upper bound on the listing offset; scrape_block steps through it 50 at a time.
# Currently 10000, i.e. 200 listing pages (~10k rows).
# Set to 30000 for ~30k rows; 50000 for ~50k rows
FULL_RANGE_END = 10000
full_rows = scrape_block(FULL_RANGE_END)
df_full = to_csv_safe(full_rows, 'raw_data.csv')
df_full.head()

100%|██████████| 200/200 [3:36:15<00:00, 64.88s/it]   


Unnamed: 0,title,release_date,price,is_free,tags,publisher,developer,review_summary,platforms
0,PUBG: BATTLEGROUNDS,"21 Dec, 2017",Free,True,"Survival, Shooter, Battle Royale, Multiplayer,...","KRAFTON, Inc.",PUBG Corporation,"69% of the 14,380 user reviews in the last 30 ...",
1,Europa Universalis V,"4 Nov, 2025","₹3,259.00",False,"Resource Management, 4X, Grand Strategy, Tradi...",Paradox Interactive,Paradox Tinto,"77% of the 1,695 user reviews in your language...",
2,ARC Raiders,"30 Oct, 2025","₹2,467.00",False,"Extraction Shooter, PvP, PvE, Third-Person Sho...",Embark Studios,Embark Studios,"90% of the 30,474 user reviews in your languag...",
3,Counter-Strike 2,"21 Aug, 2012",Free,True,"FPS, Shooter, Multiplayer, Competitive, Action...",Valve,Valve,"81% of the 83,999 user reviews in the last 30 ...",
4,Apex Legends™,"4 Nov, 2020",Free,True,"Free to Play, Multiplayer, Battle Royale, FPS,...",Electronic Arts,Respawn,"66% of the 4,670 user reviews in the last 30 d...",
