In [1]:
# standard library imports
import csv
import datetime as dt
import json
import os
import statistics
import time

# third-party imports
import numpy as np
import pandas as pd
import requests

# customisations - ensure tables show all columns
pd.set_option("max_columns", 100)

from config import api_key

First step is to get a list of all relevant appids on steam using steam spy.

We begin by testing connection to api

In [None]:
url = "https://steamspy.com/api.php"

In [None]:
# Make a get request to get the latest position of the international space station from the opennotify api.
response = requests.get(url)
# Print the status code of the response.
print(response.status_code)

In [14]:
def get_request(url, parameters=None):
    """Return json-formatted response of a get request using optional parameters.
    
    Parameters
    ----------
    url : string
    parameters : {'parameter': 'value'}
        parameters to pass as part of get request
    
    Returns
    -------
    json_data
        json-formatted response (dict-like)
    """
    try:
        response = requests.get(url=url, params=parameters)
    except SSLError as s:
        print('SSL Error:', s)
        
        for i in range(5, 0, -1):
            print('\rWaiting... ({})'.format(i), end='')
            time.sleep(1)
        print('\rRetrying.' + ' '*10)
        
        # recusively try again
        return get_request(url, parameters)
    
    if response:
        return response.json()
    else:
        # response is none usually means too many requests. Wait and try again 
        print('No response, waiting 30 seconds...')
        time.sleep(30)
        print('Retrying.')
        return get_request(url, parameters)

In [15]:
url = "https://steamspy.com/api.php"
parameters = {"request": "all"}

# request 'all' from steam spy and parse into dataframe
json_data = get_request(url, parameters=parameters)
steam_spy_all = pd.DataFrame.from_dict(json_data, orient='index')

# generate sorted app_list from steamspy data
app_list = steam_spy_all[['appid', 'name']].sort_values('appid').reset_index(drop=True)



In [5]:
# export disabled to keep consistency across download sessions
app_list.to_csv('../data/download/app_list_SS.csv', index=False)

In [7]:
app_list.info

<bound method DataFrame.info of          appid                          name
0           10                Counter-Strike
1           20         Team Fortress Classic
2           30                 Day of Defeat
3           40            Deathmatch Classic
4           50     Half-Life: Opposing Force
...        ...                           ...
37679  1376260  HTML5 Javascript Game Engine
37680  1376940                Monster Killer
37681  1378290                   The Citadel
37682  1380110    Sprite Basic 2 Game Engine
37683  1384770                   Scrutinized

[37684 rows x 2 columns]>

In [8]:
# instead read from stored csv
app_list = pd.read_csv('../data/download/app_list_SS.csv')

# display first few rows
app_list.head()

Unnamed: 0,appid,name
0,10,Counter-Strike
1,20,Team Fortress Classic
2,30,Day of Defeat
3,40,Deathmatch Classic
4,50,Half-Life: Opposing Force


In [9]:
def get_app_data(start, stop, parser, pause):
    """Return list of app data generated from parser.
    
    parser : function to handle request
    """
    app_data = []
    
    # iterate through each row of app_list, confined by start and stop
    for index, row in app_list[start:stop].iterrows():
        print('Current index: {}'.format(index), end='\r')
        
        appid = row['appid']
        name = row['name']

        # retrive app data for a row, handled by supplied parser, and append to list
        data = parser(appid, name)
        app_data.append(data)

        time.sleep(pause) # prevent overloading api with requests
    
    return app_data

def process_batches(parser, app_list, download_path, data_filename, index_filename,
                    columns, begin=0, end=-1, batchsize=100, pause=1):
    """Process app data in batches, writing directly to file.
    
    parser : custom function to format request
    app_list : dataframe of appid and name
    download_path : path to store data
    data_filename : filename to save app data
    index_filename : filename to store highest index written
    columns : column names for file
    
    Keyword arguments:
    
    begin : starting index (get from index_filename, default 0)
    end : index to finish (defaults to end of app_list)
    batchsize : number of apps to write in each batch (default 100)
    pause : time to wait after each api request (defualt 1)
    
    returns: none
    """
    print('Starting at index {}:\n'.format(begin))
    
    # by default, process all apps in app_list
    if end == -1:
        end = len(app_list) + 1
    
    # generate array of batch begin and end points
    batches = np.arange(begin, end, batchsize)
    batches = np.append(batches, end)
    
    apps_written = 0
    batch_times = []
    
    for i in range(len(batches) - 1):
            start_time = time.time()

            start = batches[i]
            stop = batches[i+1]

            app_data = get_app_data(start, stop, parser, pause)

            rel_path = os.path.join(download_path, data_filename)

            # writing app data to file
            with open(rel_path, 'a', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=columns, extrasaction='ignore')

                for j in range(3,0,-1):
                    print("\rAbout to write data, don't stop script! ({})".format(j), end='')
                    time.sleep(0.5)

                writer.writerows(app_data)
                print('\rExported lines {}-{} to {}.'.format(start, stop-1, data_filename), end=' ')

            apps_written += len(app_data)

            idx_path = os.path.join(download_path, index_filename)

            # writing last index to file
            with open(idx_path, 'w') as f:
                index = stop
                print(index, file=f)

            # logging time taken
            end_time = time.time()
            time_taken = end_time - start_time

            batch_times.append(time_taken)
            mean_time = statistics.mean(batch_times)

            est_remaining = (len(batches) - i - 2) * mean_time

            remaining_td = dt.timedelta(seconds=round(est_remaining))
            time_td = dt.timedelta(seconds=round(time_taken))
            mean_td = dt.timedelta(seconds=round(mean_time))

            print('Batch {} time: {} (avg: {}, remaining: {})'.format(i, time_td, mean_td, remaining_td))

    print('\nProcessing batches complete. {} apps written'.format(apps_written))

    

In [10]:
def reset_index(download_path, index_filename):
    """Reset index in file to 0."""
    rel_path = os.path.join(download_path, index_filename)
    
    with open(rel_path, 'w') as f:
        print(0, file=f)
        

def get_index(download_path, index_filename):
    """Retrieve index from file, returning 0 if file not found."""
    try:
        rel_path = os.path.join(download_path, index_filename)

        with open(rel_path, 'r') as f:
            index = int(f.readline())
    
    except FileNotFoundError:
        index = 0
        
    return index


def prepare_data_file(download_path, filename, index, columns):
    """Create file and write headers if index is 0."""
    if index == 0:
        rel_path = os.path.join(download_path, filename)

        with open(rel_path, 'w', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=columns)
            writer.writeheader()

In [18]:
def parse_steam_request(appid, name):
    """Unique parser to handle data from Steam Store API.
    
    Returns : json formatted data (dict-like)
    """
    url = "http://store.steampowered.com/api/appdetails/"
    parameters = {"appids": appid}
    
    json_data = get_request(url, parameters=parameters)
    json_app_data = json_data[str(appid)]
    
    if json_app_data['success']:
        data = json_app_data['data']
    else:
        data = {'name': name, 'steam_appid': appid}
        
    return data


# Set file parameters
download_path = '../data/download'
steam_app_data = 'steam_app_data.csv'
steam_index = 'steam_index.txt'

steam_columns = [
    'type', 'name', 'steam_appid', 'required_age', 'is_free', 'controller_support',
    'dlc', 'detailed_description', 'about_the_game', 'short_description', 'fullgame',
    'supported_languages', 'header_image', 'website', 'pc_requirements', 'mac_requirements',
    'linux_requirements', 'legal_notice', 'drm_notice', 'ext_user_account_notice',
    'developers', 'publishers', 'demos', 'price_overview', 'packages', 'package_groups',
    'platforms', 'metacritic', 'reviews', 'categories', 'genres', 'screenshots',
    'movies', 'recommendations', 'achievements', 'release_date', 'support_info',
    'background', 'content_descriptors'
]

# Overwrites last index for demonstration (would usually store highest index so can continue across sessions)
reset_index(download_path, steam_index)

# Retrieve last index downloaded from file
index = get_index(download_path, steam_index)

# Wipe or create data file and write headers if index is 0
prepare_data_file(download_path, steam_app_data, index, steam_columns)

Starting at index 9700:

Exported lines 9700-9799 to steam_app_data.csv. Batch 0 time: 0:02:17 (avg: 0:02:17, remaining: 10:36:30)
Exported lines 9800-9899 to steam_app_data.csv. Batch 1 time: 0:02:19 (avg: 0:02:18, remaining: 10:38:00)
Exported lines 9900-9999 to steam_app_data.csv. Batch 2 time: 0:02:16 (avg: 0:02:17, remaining: 10:33:36)
No response, waiting 30 seconds...
Retrying.
Exported lines 10000-10099 to steam_app_data.csv. Batch 3 time: 0:02:47 (avg: 0:02:25, remaining: 11:05:06)
Exported lines 10100-10199 to steam_app_data.csv. Batch 4 time: 0:02:16 (avg: 0:02:23, remaining: 10:55:03)
No response, waiting 30 seconds...
Retrying.
Exported lines 10200-10299 to steam_app_data.csv. Batch 5 time: 0:02:47 (avg: 0:02:27, remaining: 11:11:10)
Exported lines 10300-10399 to steam_app_data.csv. Batch 6 time: 0:02:17 (avg: 0:02:26, remaining: 11:02:22)
No response, waiting 30 seconds...
Retrying.
Exported lines 10400-10499 to steam_app_data.csv. Batch 7 time: 0:02:46 (avg: 0:02:28, rem

No response, waiting 30 seconds...
Retrying.
Exported lines 16000-16099 to steam_app_data.csv. Batch 63 time: 0:02:45 (avg: 0:02:32, remaining: 9:05:48)
Exported lines 16100-16199 to steam_app_data.csv. Batch 64 time: 0:02:17 (avg: 0:02:31, remaining: 9:02:29)
No response, waiting 30 seconds...
Retrying.
Exported lines 16200-16299 to steam_app_data.csv. Batch 65 time: 0:02:50 (avg: 0:02:32, remaining: 9:00:56)
Exported lines 16300-16399 to steam_app_data.csv. Batch 66 time: 0:02:16 (avg: 0:02:31, remaining: 8:57:36)
No response, waiting 30 seconds...
Retrying.
Exported lines 16400-16499 to steam_app_data.csv. Batch 67 time: 0:02:46 (avg: 0:02:32, remaining: 8:55:51)
Exported lines 16500-16599 to steam_app_data.csv. Batch 68 time: 0:02:17 (avg: 0:02:31, remaining: 8:52:34)
No response, waiting 30 seconds...
Retrying.
Exported lines 16600-16699 to steam_app_data.csv. Batch 69 time: 0:02:45 (avg: 0:02:32, remaining: 8:50:44)
Exported lines 16700-16799 to steam_app_data.csv. Batch 70 time:

Exported lines 22200-22299 to steam_app_data.csv. Batch 125 time: 0:02:48 (avg: 0:02:32, remaining: 6:29:28)
Exported lines 22300-22399 to steam_app_data.csv. Batch 126 time: 0:02:16 (avg: 0:02:32, remaining: 6:26:37)
No response, waiting 30 seconds...
Retrying.
Exported lines 22400-22499 to steam_app_data.csv. Batch 127 time: 0:02:46 (avg: 0:02:32, remaining: 6:24:23)
Exported lines 22500-22599 to steam_app_data.csv. Batch 128 time: 0:02:17 (avg: 0:02:32, remaining: 6:21:33)
No response, waiting 30 seconds...
Retrying.
Exported lines 22600-22699 to steam_app_data.csv. Batch 129 time: 0:02:47 (avg: 0:02:32, remaining: 6:19:20)
Exported lines 22700-22799 to steam_app_data.csv. Batch 130 time: 0:02:16 (avg: 0:02:32, remaining: 6:16:30)
No response, waiting 30 seconds...
Retrying.
Exported lines 22800-22899 to steam_app_data.csv. Batch 131 time: 0:02:48 (avg: 0:02:32, remaining: 6:14:16)
Exported lines 22900-22999 to steam_app_data.csv. Batch 132 time: 0:02:17 (avg: 0:02:32, remaining: 6:

Exported lines 28400-28499 to steam_app_data.csv. Batch 187 time: 0:02:46 (avg: 0:02:32, remaining: 3:53:13)
Exported lines 28500-28599 to steam_app_data.csv. Batch 188 time: 0:02:27 (avg: 0:02:32, remaining: 3:50:39)
No response, waiting 30 seconds...
Retrying.
Exported lines 28600-28699 to steam_app_data.csv. Batch 189 time: 0:02:47 (avg: 0:02:32, remaining: 3:48:14)
Exported lines 28700-28799 to steam_app_data.csv. Batch 190 time: 0:02:22 (avg: 0:02:32, remaining: 3:45:37)
No response, waiting 30 seconds...
Retrying.
Exported lines 28800-28899 to steam_app_data.csv. Batch 191 time: 0:02:47 (avg: 0:02:32, remaining: 3:43:12)
Exported lines 28900-28999 to steam_app_data.csv. Batch 192 time: 0:02:18 (avg: 0:02:32, remaining: 3:40:34)
No response, waiting 30 seconds...
Retrying.
Exported lines 29000-29099 to steam_app_data.csv. Batch 193 time: 0:02:52 (avg: 0:02:32, remaining: 3:38:10)
Exported lines 29100-29199 to steam_app_data.csv. Batch 194 time: 0:02:19 (avg: 0:02:32, remaining: 3:

Exported lines 34600-34699 to steam_app_data.csv. Batch 249 time: 0:02:48 (avg: 0:02:33, remaining: 1:16:18)
Exported lines 34700-34799 to steam_app_data.csv. Batch 250 time: 0:02:18 (avg: 0:02:33, remaining: 1:13:44)
No response, waiting 30 seconds...
Retrying.
Exported lines 34800-34899 to steam_app_data.csv. Batch 251 time: 0:02:48 (avg: 0:02:33, remaining: 1:11:13)
Exported lines 34900-34999 to steam_app_data.csv. Batch 252 time: 0:02:16 (avg: 0:02:33, remaining: 1:08:39)
No response, waiting 30 seconds...
Retrying.
Exported lines 35000-35099 to steam_app_data.csv. Batch 253 time: 0:02:48 (avg: 0:02:33, remaining: 1:06:08)
Exported lines 35100-35199 to steam_app_data.csv. Batch 254 time: 0:02:18 (avg: 0:02:33, remaining: 1:03:34)
No response, waiting 30 seconds...
Retrying.
Exported lines 35200-35299 to steam_app_data.csv. Batch 255 time: 0:02:48 (avg: 0:02:33, remaining: 1:01:02)
Exported lines 35300-35399 to steam_app_data.csv. Batch 256 time: 0:02:19 (avg: 0:02:33, remaining: 0:

In [None]:
# Set end and chunksize for demonstration - remove to run through entire app list
process_batches(
    parser=parse_steam_request,
    app_list=app_list,
    download_path=download_path,
    data_filename=steam_app_data,
    index_filename=steam_index,
    columns=steam_columns,
    begin=index,
    #end=10,
    #batchsize=5

In [39]:
df = pd.read_csv('../data/download/steam_app_data.csv', delimiter=';', encoding='latin1')

ParserError: Error tokenizing data. C error: Expected 1 fields in line 43, saw 2


In [40]:
def parse_steamspy_request(appid, name):
    """Parser to handle SteamSpy API data."""
    url = "https://steamspy.com/api.php"
    parameters = {"request": "appdetails", "appid": appid}
    
    json_data = get_request(url, parameters)
    return json_data


# set files and columns
download_path = '../data/download'
steamspy_data = 'steamspy_data.csv'
steamspy_index = 'steamspy_index.txt'

steamspy_columns = [
    'appid', 'name', 'developer', 'publisher', 'score_rank', 'positive',
    'negative', 'userscore', 'owners', 'average_forever', 'average_2weeks',
    'median_forever', 'median_2weeks', 'price', 'initialprice', 'discount',
    'languages', 'genre', 'ccu', 'tags'
]

reset_index(download_path, steamspy_index)
index = get_index(download_path, steamspy_index)

# Wipe data file if index is 0
prepare_data_file(download_path, steamspy_data, index, steamspy_columns)

process_batches(
    parser=parse_steamspy_request,
    app_list=app_list,
    download_path=download_path, 
    data_filename=steamspy_data,
    index_filename=steamspy_index,
    columns=steamspy_columns,
    begin=index,
    end=20,
    batchsize=5,
    pause=0.3
)

Starting at index 0:

Exported lines 0-4 to steamspy_data.csv. Batch 0 time: 0:00:05 (avg: 0:00:05, remaining: 0:00:14)
Exported lines 5-9 to steamspy_data.csv. Batch 1 time: 0:00:04 (avg: 0:00:05, remaining: 0:00:09)
Exported lines 10-14 to steamspy_data.csv. Batch 2 time: 0:00:04 (avg: 0:00:04, remaining: 0:00:04)
Exported lines 15-19 to steamspy_data.csv. Batch 3 time: 0:00:04 (avg: 0:00:04, remaining: 0:00:00)

Processing batches complete. 20 apps written


In [41]:
pd.read_csv('../data/download/steamspy_data.csv').head()


Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,,159654,4174,0,"10,000,000 .. 20,000,000",19751,1498,181,264,999,999,0,"English, French, German, Italian, Spanish - Sp...",Action,12975,"{'Action': 5314, 'FPS': 4718, 'Multiplayer': 3..."
1,20,Team Fortress Classic,Valve,Valve,,4333,771,0,"2,000,000 .. 5,000,000",797,0,9,0,499,499,0,"English, French, German, Italian, Spanish - Sp...",Action,114,"{'Action': 731, 'FPS': 297, 'Multiplayer': 251..."
2,30,Day of Defeat,Valve,Valve,,4322,483,0,"5,000,000 .. 10,000,000",882,0,19,0,499,499,0,"English, French, German, Italian, Spanish - Spain",Action,139,"{'FPS': 777, 'World War II': 242, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,,1591,345,0,"5,000,000 .. 10,000,000",1187,0,8,0,499,499,0,"English, French, German, Italian, Spanish - Sp...",Action,6,"{'Action': 624, 'FPS': 135, 'Classic': 103, 'M..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,8886,475,0,"5,000,000 .. 10,000,000",1112,391,181,391,499,499,0,"English, French, German, Korean",Action,117,"{'FPS': 869, 'Action': 310, 'Classic': 237, 'S..."
