## 随机抽样

**广度优先搜索**需要考虑几个因素：
- 网络连通性：在Steam社区网络中，用户之间的连接密度如何？如果每个用户有很多朋友，那么少量的初始点可能就足够覆盖大量的用户。
- 样本多样性：从多个起始点开始可以增加样本的多样性，减少因起始点选择导致的偏差。
- 计算资源和时间：更多的初始点意味着更大的并行处理能力需求和可能更长的数据收集时间。

<p align="center">
  <img src="../../images/Breadth-first-tree.jpg" alt="BFT,path中../表示回到上一级" width="200"/>
</p>

<div style="text-align: center; color: gray; font-size: 14px; font-style: italic;">
  广度优先树模型
</div>

本次计划选取3000名玩家作为样本来分析整体Steam市场，这3000名玩家需要满足以下条件：
- 基础信息可见
- 朋友信息可见
- 游戏游玩信息可见

本次随机起始搜索点设置为50个，以保证样本的多样性。下面是随机抽样代码：

In [32]:
'''加载工作包'''
import aiohttp
import asyncio
import aiofiles
import random
import os
import csv
import json
import pandas as pd
from pathlib import Path
from collections import deque

In [83]:
""" 异步获取有效的Steam ID ，返回值steam_id和他的friends信息，如出现错误返回报错信息"""
async def fetch_friends(session, steam_api_key, steam_id):
    """Fetch the friend list of a single Steam account.

    Args:
        session: An open HTTP client session whose ``get(url)`` returns an
            async context manager yielding a response with ``status`` and an
            awaitable ``json()`` (the notebook passes ``aiohttp.ClientSession``).
        steam_api_key: Steam Web API key.
        steam_id: Steam ID to query.

    Returns:
        ``(steam_id, friends)`` when the API answers 200 with a non-empty
        friend list; ``(None, None)`` for a non-200 status, a missing or empty
        friend list, or a failed request (the error is printed, not raised).
    """
    # Use HTTPS so the API key in the query string is not sent in plaintext.
    url = (
        "https://api.steampowered.com/ISteamUser/GetFriendList/v0001/"
        f"?key={steam_api_key}&steamid={steam_id}&relationship=friend"
    )
    try:
        async with session.get(url) as response:
            if response.status == 200:
                data = await response.json()
                # Tolerate a payload with no 'friendslist' or no 'friends' key
                # instead of raising KeyError and logging a misleading error.
                friends = (data.get('friendslist') or {}).get('friends')
                if friends:
                    return steam_id, friends
    except Exception as e:
        # Best-effort probe: a failed lookup just means "not a usable seed".
        print(f"Error fetching friends for {steam_id}: {str(e)}")
    return None, None

""" 使用并行请求寻找具有朋友列表的Steam用户ID，返回 """
async def collect_ids(steam_api_key, target_count=50, batch_size=10, max_batches=None):
    """Probe random Steam IDs until enough accounts with a friend list are found.

    Each round fires ``batch_size`` concurrent ``fetch_friends`` lookups on
    random IDs and keeps every ID that came back with a non-empty friend list.

    Args:
        steam_api_key: Steam Web API key.
        target_count: Number of distinct valid IDs to collect.
        batch_size: Number of concurrent lookups per round (must be >= 1).
        max_batches: Optional cap on the number of rounds. ``None`` (default)
            keeps probing until ``target_count`` is reached; set a number to
            avoid looping forever when every request fails (e.g. a bad key).

    Returns:
        List of collected Steam IDs, possibly shorter than ``target_count`` if
        an error occurred or ``max_batches`` was exhausted.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1 (the loop could never
            make progress).
    """
    if target_count <= 0:
        return []
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Random IDs are drawn from a 50,000,000-wide window starting at this
    # value (presumably the base offset of individual SteamID64s — confirm).
    id_low = 76561197960265728
    id_high = id_low + 50000000

    collected_ids = []
    seen = set()  # O(1) duplicate check; collected_ids keeps discovery order
    batches_run = 0
    try:
        async with aiohttp.ClientSession() as session:
            while len(collected_ids) < target_count:
                if max_batches is not None and batches_run >= max_batches:
                    print(f"Stopped after {batches_run} batches with {len(collected_ids)} valid IDs.")
                    break
                batches_run += 1

                tasks = [
                    fetch_friends(session, steam_api_key, random.randint(id_low, id_high))
                    for _ in range(batch_size)
                ]
                # One (steam_id, friends) tuple per task, in task order.
                results = await asyncio.gather(*tasks)

                for steam_id, friends in results:
                    # Failed lookups come back as (None, None) and are skipped.
                    if friends and steam_id not in seen:
                        seen.add(steam_id)
                        collected_ids.append(steam_id)
                        print(f"Found valid ID: {steam_id} with {len(friends)} friends.")
                        if len(collected_ids) >= target_count:
                            break
    except Exception as e:
        # Keep whatever was collected so far instead of losing it.
        print(f"An error occurred: {str(e)}")
    return collected_ids

配置API信息，调用collect_ids函数

In [84]:
# Use your own Steam Web API key.
# The key is read from the STEAM_API_KEY environment variable. You could assign
# it explicitly instead, but keep it private: the Steam API is extensive and
# covers various kinds of transactions.
steam_api_key = os.getenv('STEAM_API_KEY')

async def main():
    """Collect the seed Steam IDs and store them in a one-column CSV file.

    In async code ``main`` is typically the coroutine that coordinates the
    concurrent tasks; here it awaits ``collect_ids`` and then writes a
    'Steam ID' header followed by one ID per row.
    """
    valid_ids = await collect_ids(steam_api_key)

    # Header row first, then one single-cell row per collected ID.
    rows = [['Steam ID'], *([steam_id] for steam_id in valid_ids)]
    with open('../../data/raw/vilid_ids.csv', 'w', newline='') as file:
        csv.writer(file).writerows(rows)

    print(f"Collected {len(valid_ids)} valid Steam IDs with friends.")

# Run the async entry point. Top-level `await` works inside a Jupyter/IPython
# cell; a plain Python script would need asyncio.run(main()) instead.
await main()

Found valid ID: 76561198008732451 with 44 friends.
Found valid ID: 76561197992537552 with 19 friends.
Found valid ID: 76561198009550897 with 33 friends.
Found valid ID: 76561197998636571 with 1 friends.
Found valid ID: 76561198000080278 with 1 friends.
Found valid ID: 76561197971637289 with 14 friends.
Found valid ID: 76561197981600058 with 30 friends.
Found valid ID: 76561197982240245 with 6 friends.
Found valid ID: 76561197979962491 with 93 friends.
Found valid ID: 76561197999475452 with 1 friends.
Found valid ID: 76561197982199822 with 37 friends.
Found valid ID: 76561197995231721 with 5 friends.
Found valid ID: 76561197990325414 with 1 friends.
Found valid ID: 76561198006975782 with 1 friends.
Found valid ID: 76561198007047021 with 1 friends.
Found valid ID: 76561197970316965 with 7 friends.
Found valid ID: 76561197960945147 with 6 friends.
Found valid ID: 76561198002801681 with 117 friends.
Found valid ID: 76561197995699851 with 1 friends.
Found valid ID: 76561197991415823 with 1 

## 开始抓取

利用抓取的50个初始ID进行广度优先搜索，原始数据文件见👉[初始IDs](../../data/raw/vilid_ids.csv)。<br>
计划至少需要抓取30000个用户ID。

In [49]:
class SteamDataFetcher:
    """Breadth-first crawler over Steam friend lists with JSON checkpoints.

    Starting from a list of seed Steam IDs, ``run()`` fetches each account's
    friend list, enqueues the friends it has not visited yet, and periodically
    writes the collected responses and the visited-ID set to disk.

    Steam IDs are handled as strings throughout: the API returns them as
    strings, while seeds loaded from CSV may be integers. Mixing both types
    would defeat the visited-set de-duplication and produce duplicate keys in
    the saved JSON.
    """

    def __init__(self, api_key, initial_friend_ids, max_requests, concurrent_limit=20):
        """Store the crawl configuration.

        Args:
            api_key: Steam Web API key.
            initial_friend_ids: Iterable of seed Steam IDs (int or str).
            max_requests: Maximum number of API requests to issue.
            concurrent_limit: Maximum number of requests in flight at once.
        """
        self.api_key = api_key
        # Seed IDs the crawl starts from, normalised to str (see class docstring).
        self.initial_friend_ids = [str(steam_id) for steam_id in initial_friend_ids]
        self.max_requests = max_requests  # upper bound on API requests
        self.concurrent_limit = concurrent_limit  # size of the request semaphore
        self.num_requests = 0  # requests issued so far
        # JSON file holding the fetched responses, keyed by Steam ID.
        self.data_path = Path(r"../../data/raw/steam_datas.json")
        # JSON list of already-visited Steam IDs, used to skip repeat queries.
        self.visited_path = Path(r"../../data/raw/visited_datas.json")
        self.session = None  # HTTP client session, created by initialize()
        self.visited_ids = set()  # Steam IDs already requested
        # Caps concurrent requests; intended to reduce HTTP 429 responses.
        self.semaphore = asyncio.Semaphore(self.concurrent_limit)

    async def load_data(self):
        """Return the saved responses as a dict, or ``{}`` if none exist.

        An absent or empty data file both yield an empty dict.
        """
        if self.data_path.exists():
            async with aiofiles.open(self.data_path, 'r') as file:
                data = await file.read()
                return json.loads(data) if data else {}
        return {}

    async def save_data(self, data):
        """Merge ``data`` into the on-disk JSON file.

        The whole file is re-read, updated with ``data`` and rewritten.
        """
        existing_data = await self.load_data()
        existing_data.update(data)
        async with aiofiles.open(self.data_path, "w") as file:
            await file.write(json.dumps(existing_data, indent=4))

    async def save_visited_ids(self):
        """Overwrite the visited-ID file with the current visited set."""
        async with aiofiles.open(self.visited_path, "w") as file:
            await file.write(json.dumps(list(self.visited_ids)))

    async def initialize(self):
        """Open the HTTP session and restore visited IDs from disk if present."""
        if self.session is None:  # avoid leaking a session on repeated calls
            self.session = aiohttp.ClientSession()
        if self.visited_path.exists():
            async with aiofiles.open(self.visited_path, "r") as file:
                data = await file.read()
                # Older checkpoints may contain ints; normalise them to str.
                self.visited_ids = {str(steam_id) for steam_id in json.loads(data)} if data else set()

    async def fetch_friends(self, steam_id):
        """Request the friend list of ``steam_id``.

        Returns:
            ``(payload, steam_id)`` where ``steam_id`` is the ID as a string
            and ``payload`` is:
            - ``None`` if the request budget is exhausted or the ID was
              already visited (no request is made),
            - the decoded JSON response on HTTP 200,
            - ``{}`` on any other status or when the request fails.
        """
        steam_id = str(steam_id)
        async with self.semaphore:
            if (self.num_requests >= self.max_requests) or (steam_id in self.visited_ids):
                return None, steam_id
            # Mark and count before awaiting so concurrent tasks see the update.
            self.visited_ids.add(steam_id)
            self.num_requests += 1
            # HTTPS keeps the API key in the query string out of plaintext.
            url = (
                "https://api.steampowered.com/ISteamUser/GetFriendList/v0001/"
                f"?key={self.api_key}&steamid={steam_id}&relationship=friend"
            )
            try:
                async with self.session.get(url) as response:
                    if response.status == 200:
                        return await response.json(), steam_id
            except Exception as e:
                # One failed request must not abort the whole crawl.
                print(f"Error fetching friends for {steam_id}: {str(e)}")
            return {}, steam_id

    async def close(self):
        """Close the HTTP session if one is open; safe to call repeatedly."""
        if self.session:
            await self.session.close()
            self.session = None

    async def run(self):
        """Run the breadth-first crawl until the queue or request budget ends.

        Calls ``initialize``, then repeatedly schedules up to
        ``concurrent_limit`` ``fetch_friends`` tasks, enqueues unvisited
        friends, and checkpoints via ``save_data`` / ``save_visited_ids``
        after every 50 processed results. Pending data is flushed and the
        session closed even if the crawl is interrupted by an exception.
        """
        # Kept outside the try block: if restoring the visited set fails, the
        # cleanup below must not overwrite the checkpoint with an empty set.
        await self.initialize()
        queue = deque(self.initial_friend_ids)
        save_threshold = 50  # results processed between two checkpoints
        processed = 0
        batch_data = {}  # responses not yet written to disk

        try:
            while queue and (self.num_requests < self.max_requests):
                tasks = []
                while (queue and len(tasks) < self.concurrent_limit) and (self.num_requests < self.max_requests):
                    current_id = queue.popleft()  # FIFO order gives breadth-first traversal
                    if current_id not in self.visited_ids:
                        tasks.append(self.fetch_friends(current_id))

                results = await asyncio.gather(*tasks)

                for friends, current_id in results:
                    if friends:
                        batch_data[current_id] = friends
                        # Each entry looks like {"steamid": "...", "relationship": "friend"}.
                        for friend in friends.get('friendslist', {}).get('friends', []):
                            friend_id = friend.get('steamid')
                            if friend_id and friend_id not in self.visited_ids:
                                queue.append(friend_id)

                    processed += 1
                    if processed >= save_threshold:
                        await self.save_data(batch_data)
                        batch_data = {}  # release the batch once it is on disk
                        processed = 0
                        await self.save_visited_ids()
        finally:
            try:
                if batch_data:
                    await self.save_data(batch_data)
                await self.save_visited_ids()
            finally:
                await self.close()

加载初始搜索IDs并储存至变量vilid_ids

In [50]:
def read_all_steam_ids(file_path):
    """Read every Steam ID from the 'Steam ID' column of a CSV file.

    Returns the column values as a list, or an empty list when the column is
    missing or holds no rows.
    """
    frame = pd.read_csv(file_path)
    if 'Steam ID' not in frame.columns:
        return []
    id_column = frame['Steam ID']
    return [] if id_column.empty else id_column.tolist()

# Load the seed Steam IDs from the CSV file into a deque.
file_path = "../../data/raw/vilid_ids.csv"
vilid_ids = deque(read_all_steam_ids(file_path))

调用SteamDataFetcher类，网络正常情况下运行10分钟，即可得到足够数量的ID。

In [None]:
# Example driver code; it must be executed in an async-capable environment.
# The Steam Web API key is read from the STEAM_API_KEY environment variable.
steam_api_key = os.getenv('STEAM_API_KEY')

async def main():
    """Crawl friend lists breadth-first, starting from the IDs in ``vilid_ids``."""
    fetcher = SteamDataFetcher(
        api_key=steam_api_key,
        initial_friend_ids=list(vilid_ids),
        max_requests=10000,
        concurrent_limit=20,
    )
    try:
        await fetcher.run()
    finally:
        # Make sure the HTTP session is released even when run() raises.
        await fetcher.close()

# Run main(). Top-level `await` works inside a Jupyter/IPython cell; a plain
# Python script would need asyncio.run(main()) instead.
await(main())