## 随机抽样

## 开始抓取

In [1]:
import asyncio
import aiohttp
import json
from pathlib import Path
from collections import deque
import aiofiles
import get_api

In [3]:
class SteamDataFetcher:
    '''初始化参数'''
    def __init__(self, api_key, initial_friend_id, max_requests, concurrent_limit=20):

        self.api_key = api_key
        self.initial_friend_id = initial_friend_id # 初始的Steam用户ID，从列表用户开始抓取好友数据
        self.max_requests = max_requests # 允许最大API请求数
        self.concurrent_limit = concurrent_limit # 并发请求的限制，使用asyncio.Semaphore控制同一时间信号数量
        self.num_requests = 0 # 记录已发出的请求次数
        self.data_path = Path(r"try\steam_data.json") # 存储抓取的数据
        self.visited_path = Path(r"try\visited_ids.json") # 已访问的用户ID的本地文件路径，避免多次重复查询
        self.session = None # 用于发起HTTP请求
        self.visited_ids = set() # 已访问的Steam用户ID集合
        self.semaphore = asyncio.Semaphore(self.concurrent_limit) # 控制同一时间信号数量，避免频繁请求造成的“429”错误

    '''初始化HTTP客户端会话，从文件加载已经访问过的用户ID列表，如果文件存在的话'''
    async def initialize(self):
        self.session = aiohttp.ClientSession() # 用于创建一个会话对象，用来发送多种类型的HTTP请求（GET、POST、PUT、DELETE）等
        if self.visited_path.exists():
            async with aiofiles.open(self.visited_path, "r") as file:
                data = await file.read()
                self.visited_ids = set(json.loads(data)) if data else set()

    async def fetch_friends(self, steam_id):
        async with self.semaphore:
            if self.num_requests >= self.max_requests or steam_id in self.visited_ids:
                return None
            self.visited_ids.add(steam_id)
            self.num_requests += 1
            url = f"http://api.steampowered.com/ISteamUser/GetFriendList/v0001/?key={self.api_key}&steamid={steam_id}&relationship=friend"
            async with self.session.get(url) as response:
                if response.status == 200:
                    return await response.json()
                return {}

    async def save_visited_ids(self):
        async with aiofiles.open(self.visited_path, "w") as file:
            await file.write(json.dumps(list(self.visited_ids)))

    async def save_data(self, data):
        existing_data = await self.load_data()
        existing_data.update(data)
        async with aiofiles.open(self.data_path, "w") as file:
            await file.write(json.dumps(existing_data, indent=4))

    async def load_data(self):
        if self.data_path.exists():
            async with aiofiles.open(self.data_path, 'r') as file:
                data = await file.read()
                return json.loads(data) if data else {}
        return {}

    async def close(self):
        if self.session:
            await self.session.close()

    async def run(self):
        await self.initialize()
        friends_data = await self.load_data()
        queue = deque([self.initial_friend_id])
        save_threshold = 50
        count = 0
        batch_data = {}

        while queue and (self.num_requests < self.max_requests):
            current_id = queue.popleft()
            if current_id not in self.visited_ids:
                friends = await self.fetch_friends(current_id)
                if friends:
                    friends_data[current_id] = friends
                    batch_data[current_id] = friends
                    for friend in friends.get('friendslist', {}).get('friends', []):
                        friend_id = friend.get('steamid')
                        if friend_id and friend_id not in self.visited_ids:
                            queue.append(friend_id)

                count += 1
                if count >= save_threshold:
                    await self.save_data(batch_data)
                    batch_data = {}
                    count = 0
                    await self.save_visited_ids()

        if batch_data:
            await self.save_data(batch_data)
        await self.save_visited_ids()
        await self.close()

# 示例运行代码，确保在异步环境中调用
async def main():
    """Run one crawl from a fixed seed account and always release the fetcher.

    Must be awaited from an already running event loop.
    """
    crawler = SteamDataFetcher(
        api_key=get_api.steam_api_key,
        initial_friend_id="76561199054306973",
        max_requests=10000,
        concurrent_limit=20,
    )
    try:
        await crawler.run()
    finally:
        # Runs on success and on failure alike.
        await crawler.close()

# 运行 main 函数
await(main())

FileNotFoundError: [Errno 2] No such file or directory: 'try\\visited_ids.json'