导入必要包
这里有个小提示，当aiohttp包因为vpn等原因无法正确处理session时候，可以使用httpx等不同包处理

In [35]:
import httpx
import asyncio
import json
import pandas as pd
import os

异步抓取游戏信息

In [37]:
def get_available_filename(base_path, base_name, extension):
    index = 1
    while True:
        new_filename = f"{base_path}/{base_name}{index}.{extension}"
        if not os.path.exists(new_filename):
            return new_filename
        index += 1

async def fetch_game_data(client, app_id):
    url = f"https://store.steampowered.com/api/appdetails?appids={app_id}"
    try:
        resp = await client.get(url)
        resp.raise_for_status()
        return resp.json()
    except httpx.HTTPStatusError as e:
        if e.response.status_code == 429:
            return 'rate_limit'
        else:
            print(f"Failed to fetch {app_id}: {str(e)}")
            return None

async def fetch_batch(client, app_ids, sleep_time):
    tasks = [fetch_game_data(client, app_id) for app_id in app_ids]
    results = await asyncio.gather(*tasks)
    return results, sleep_time

async def main(app_ids, batch_size=10):
    sleep_time = 7  # Start with 7 second sleep
    client = httpx.AsyncClient()

    file_name = get_available_filename('../../../data/processed/json_datas', 'game_data', 'json')

    try:
        for i in range(0, len(app_ids), batch_size):
            batch = app_ids[i:i+batch_size]
            results, sleep = await fetch_batch(client, batch, sleep_time)
            if any(result == 'rate_limit' for result in results):
                sleep_time += 2
            else:
                sleep_time = sleep_time - 3 if sleep_time > 10 else max(7, sleep_time - 0.1)
            
            with open(file_name, 'a') as f:  # Open the file once determined
                json.dump([result for result in results if result is not None and result != 'rate_limit'], f)
                f.write('\n')
            
            print(f"Batch completed. Sleeping for {sleep_time} seconds.")
            await asyncio.sleep(sleep_time)
    finally:
        await client.aclose()


[](../../data/processed/)

In [3]:
df = pd.read_csv("../../../data/processed/game_list.csv")
app_ids = df['game_id'].tolist()

In [39]:
'''运行异步代码，抓取游戏信息'''
await main(app_ids)

Batch completed. Sleeping for 7 seconds.
Batch completed. Sleeping for 7 seconds.
Batch completed. Sleeping for 7 seconds.
Batch completed. Sleeping for 7 seconds.
Batch completed. Sleeping for 7 seconds.
Batch completed. Sleeping for 7 seconds.
Batch completed. Sleeping for 7 seconds.


当出现网络问题中断，或者跑完发现其中有未抓取到（大概率是429错误）情况，请完成下面几个步骤：
1. 请先使用当前文件夹下的[json_checker](json_checker.ipynb)修复文件【../../data/processed/json_datas/game_data**注意后缀**.json】
2. 使用下面的代码更新[文件](../../data/processed/game_list.csv)【game_list.csv】，用于删除列表中已抓取的游戏
3. 最后run👆上面的await main（app_ids）代码

In [40]:
'''读取文件信息'''
searched_game_data = pd.read_json('../../../data/processed/json_datas/game_data7.json') # 每次注意修改game_data的后缀对应你要读取的最新数据
all_game_id_data = pd.read_csv('../../../data/processed/game_list.csv')['game_id']

'''定义提取游戏id的函数'''
def find_already_fetched_ids(cell):
    try:
        game_id = list(cell.keys())[0]
        return game_id
    except:
        return None

'''提取游戏id'''
game_id_data = searched_game_data.applymap(find_already_fetched_ids)

'''将提取的游戏id整理成一列'''
game_id_data = game_id_data.stack().reset_index(drop=True).astype(int) # 这里需要调整下数据类型成为int保持同步

'''提取出不相同的id'''
filtered_series = all_game_id_data[~all_game_id_data.isin(game_id_data)]

'''覆盖原game_list文件'''
filtered_series.to_csv('../../../data/processed/game_list.csv')

  game_id_data = searched_game_data.applymap(find_already_fetched_ids)


In [41]:
filtered_series

Series([], Name: game_id, dtype: int64)