### 导入必要包

In [1]:
import pandas as pd
import json 

### 读取数据

In [4]:
# Location, base name and number of the raw JSON shards.
data_location = '../../data/raw/'
base_name = 'all_steam_data'
file_count = 7  # assumes 7 shard files (<base_name>1.json ... <base_name>7.json); adjust as needed

# Sample payload kept around for interactively inspecting the JSON structure.
# The context manager closes the handle, which the bare open() call never did.
with open('data.json', 'r') as file:
    test_data = json.load(file)

# Load every shard into its own DataFrame first and concatenate once at the
# end; growing a DataFrame with pd.concat inside the loop copies all the data
# accumulated so far on every iteration.
frames = []
for i in range(1, file_count + 1):
    file_name = f"{data_location}{base_name}{i}.json"  # build the shard path
    with open(file_name, 'r') as file:
        data = json.load(file)
    df = pd.DataFrame(data)  # one DataFrame per shard
    frames.append(df)

# pd.concat defaults to axis=0, so the shards are appended as rows.
# pd.concat([]) raises, hence the explicit empty fallback for file_count == 0.
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
all_data

Unnamed: 0,0,1,2,3,4
0,{'76561197960269909': {'player_info': {'respon...,{'76561197960270577': {'player_info': {'respon...,{'76561197960272861': {'player_info': {'respon...,{'76561197960272949': {'player_info': {'respon...,{'76561197960275607': {'player_info': {'respon...
1,{'76561197960280440': {'player_info': {'respon...,{'76561197960280876': {'player_info': {'respon...,{'76561197960283131': {'player_info': {'respon...,{'76561197960286225': {'player_info': {'respon...,{'76561197960288216': {'player_info': {'respon...
2,{'76561197960290472': {'player_info': {'respon...,{'76561197960296208': {'player_info': {'respon...,{'76561197960310011': {'player_info': {'respon...,{'76561197960311652': {'player_info': {'respon...,{'76561197960312502': {'player_info': {'respon...
3,{'76561197960315957': {'player_info': {'respon...,{'76561197960321844': {'player_info': {'respon...,{'76561197960321926': {'player_info': {'respon...,{'76561197960329729': {'player_info': {'respon...,{'76561197960331066': {'player_info': {'respon...
4,{'76561197960331205': {'player_info': {'respon...,{'76561197960331712': {'player_info': {'respon...,{'76561197960337084': {'player_info': {'respon...,{'76561197960337842': {'player_info': {'respon...,{'76561197960338139': {'player_info': {'respon...
...,...,...,...,...,...
4143,{'76561198011445620': {'player_info': {'respon...,{'76561198121843777': {'player_info': {'respon...,{'76561199178102171': {'player_info': {'respon...,{'76561198016465459': {'player_info': {'respon...,{'76561198130132230': {'player_info': {'respon...
4144,{'76561197984619081': {'player_info': {'respon...,{'76561198261550186': {'player_info': {'respon...,{'76561197994687631': {'player_info': {'respon...,{'76561198024291390': {'player_info': {'respon...,{'76561198981187936': {'player_info': {'respon...
4145,{'76561197987720434': {'player_info': {'respon...,{'76561198261859448': {'player_info': {'respon...,{'76561198090836870': {'player_info': {'respon...,{'76561197990627157': {'player_info': {'respon...,{'76561198286511601': {'player_info': {'respon...
4146,{'76561198020983212': {'player_info': {'respon...,{'76561198018893187': {'player_info': {'respon...,{'76561198089061950': {'player_info': {'respon...,{'76561197976357098': {'player_info': {'respon...,{'76561198106511894': {'player_info': {'respon...


In [5]:
# Stack every column of all_data end to end so that all records sit in one
# column (labelled 0) with a fresh 0..n-1 index.
df = pd.concat(
    [all_data[label] for label in all_data.columns],
    ignore_index=True,
).to_frame()
df

Unnamed: 0,0
0,{'76561197960269909': {'player_info': {'respon...
1,{'76561197960280440': {'player_info': {'respon...
2,{'76561197960290472': {'player_info': {'respon...
3,{'76561197960315957': {'player_info': {'respon...
4,{'76561197960331205': {'player_info': {'respon...
...,...
20735,{'76561198130132230': {'player_info': {'respon...
20736,{'76561198981187936': {'player_info': {'respon...
20737,{'76561198286511601': {'player_info': {'respon...
20738,{'76561198106511894': {'player_info': {'respon...


### 提取数据函数定义

可以调用 `test_data` 进行调试、观察 JSON 数据结构；如需查看，请去掉下方代码行开头的注释符。

In [6]:
# Uncomment any one of these lines to inspect the JSON structure of test_data.
# test_data # the raw payload
# test_data[list(test_data)[0]] # everything stored under the first steam_id key
# test_data[list(test_data)[0]]['player_info']['response']['players'][0] # all basic profile fields of one player
# test_data[list(test_data)[0]]['player_games']['response'] # games owned by the player
# test_data[list(test_data)[0]]['recently_played_games']['response'] # games the player played in the last two weeks

定义函数

In [6]:
def extract_player_info(player_data):
    """Extract the basic profile fields of one player as a pandas Series.

    Parameters
    ----------
    player_data : dict
        Mapping whose first key is the player's steam id; the profile is read
        from ``[steam_id]['player_info']['response']['players'][0]``.

    Returns
    -------
    pandas.Series
        One entry per wanted profile field. Fields missing from the profile
        are ``None``; if anything at all goes wrong while reading the record,
        every field is ``None``.
    """
    # Profile keys to pull out of the JSON record.
    wanted_fields = [
        'steamid', 'communityvisibilitystate', 'profilestate', 'personaname',
        'profileurl', 'avatar', 'avatarmedium', 'avatarfull', 'avatarhash',
        'personastate', 'primaryclanid', 'timecreated', 'personastateflags',
        'loccountrycode',
    ]

    try:
        first_id = next(iter(player_data.keys()))
        profile = player_data[first_id]['player_info']['response']['players'][0]
        found = {}
        for field in wanted_fields:
            found[field] = profile.get(field)
        return pd.Series(found)
    except Exception:
        # Best effort: an unreadable record becomes a row of missing values.
        return pd.Series(dict.fromkeys(wanted_fields))

def extract_player_games(player_data):
    """Flatten a player's five most-played owned games into one pandas Series.

    Parameters
    ----------
    player_data : dict
        Mapping whose first key is the player's steam id; the owned games are
        read from ``[steam_id]['player_games']['response']``, which must hold
        a ``'games'`` list and a ``'game_count'`` value.

    Returns
    -------
    pandas.Series
        ``appid_game{i}``, ``name_game{i}`` and ``playtime_forever_game{i}``
        for the up-to-five games with the largest ``playtime_forever``
        (``i`` starts at 1 for the most played game), followed by
        ``game_count``. If the record cannot be read, the Series holds only
        ``None`` values: just ``game_count`` when the failure happens before
        the top games were selected, otherwise one entry per selected game
        field plus ``game_count``.
    """
    fields = ['appid', 'name', 'playtime_forever']

    # Stays empty unless the top games are selected successfully, so the
    # fallback below always has something to derive its keys from.
    top_games = pd.DataFrame()

    try:
        steam_id = list(player_data.keys())[0]
        player_games = player_data[steam_id]['player_games']['response']
        owned = pd.DataFrame(player_games['games'])
        top_games = owned.sort_values(
            by='playtime_forever', ascending=False, ignore_index=True
        ).head(5)[fields]

        # Collect the labelled values in a plain dict and build the Series
        # once, instead of concatenating onto a dtype-less empty pd.Series()
        # per game (that pattern is deprecated in recent pandas releases).
        flattened = {}
        for rank in range(1, len(top_games) + 1):
            for field, value in top_games.iloc[rank - 1].items():
                flattened[f'{field}_game{rank}'] = value
        flattened['game_count'] = player_games['game_count']
        return pd.Series(flattened, dtype=object)
    except Exception:
        # Best effort: an unreadable record becomes a row of missing values.
        fallback_keys = [
            f'{field}_game{rank}'
            for rank in range(1, len(top_games) + 1)
            for field in top_games.columns
        ] + ['game_count']
        return pd.Series(dict.fromkeys(fallback_keys), dtype=object)

def extract_recently_played_games(player_data):
    """Flatten a player's recently played games into one pandas Series.

    Parameters
    ----------
    player_data : dict
        Mapping whose first key is the player's steam id; the games are read
        from ``[steam_id]['recently_played_games']['response']['games']``.

    Returns
    -------
    pandas.Series
        ``appid_game{i}``, ``name_game{i}``, ``playtime_2weeks_game{i}`` and
        ``playtime_forever_game{i}`` for every listed game, in the order the
        games appear in the record (``i`` starts at 1). If the record cannot
        be read before the games table is built, the Series is empty.
    """
    fields = ['appid', 'name', 'playtime_2weeks', 'playtime_forever']

    # Stays empty unless the games table is built successfully, so the
    # fallback below always has something to derive its keys from.
    recent = pd.DataFrame()

    try:
        player_id = list(player_data.keys())[0]
        games = player_data[player_id]['recently_played_games']['response']['games']
        recent = pd.DataFrame(games)[fields]

        # Collect the labelled values in a plain dict and build the Series
        # once, instead of concatenating onto a dtype-less empty pd.Series()
        # per game (that pattern is deprecated in recent pandas releases).
        flattened = {}
        for rank in range(1, len(recent) + 1):
            for field, value in recent.iloc[rank - 1].items():
                flattened[f'{field}_game{rank}'] = value
        return pd.Series(flattened, dtype=object)
    except Exception:
        # Best effort for unreadable records. `Exception` (rather than a bare
        # `except:`) lets KeyboardInterrupt / SystemExit propagate, so a long
        # `.apply` over many players can still be interrupted.
        fallback_keys = [
            f'{field}_game{rank}'
            for rank in range(1, len(recent) + 1)
            for field in recent.columns
        ]
        return pd.Series(dict.fromkeys(fallback_keys), dtype=object)


### 查询并整合

In [7]:
# The recently-played extraction is not run here: the next cell builds
# player_recently_played_games_df together with the other two tables, and
# computing it in this cell as well ran the same full pass over every record twice.

In [9]:
# Run each extractor over the single column of raw player records; every
# call yields one row per player.
player_records = df[0]
player_info_df = player_records.apply(extract_player_info)
player_games_df = player_records.apply(extract_player_games)
player_recently_played_games_df = player_records.apply(extract_recently_played_games)

In [14]:
# Combine the three per-player tables into one wide table.
#
# Columns are selected BY NAME rather than relabelled by position: the column
# order produced by `.apply` depends on which record happens to be extracted
# first (e.g. whether `game_count` comes before or after the per-game
# columns), so overwriting `new_df.columns` with a fixed list could silently
# attach labels to the wrong data.
info_columns = [
    'steamid', 'communityvisibilitystate', 'profilestate', 'personaname',
    'profileurl', 'avatar', 'avatarmedium', 'avatarfull', 'avatarhash',
    'personastate', 'primaryclanid', 'timecreated', 'personastateflags',
    'loccountrycode',
]
owned_columns = ['game_count'] + [
    f'{field}_game{rank}'
    for rank in range(1, 6)
    for field in ('appid', 'name', 'playtime_forever')
]
recent_columns = [
    f'{field}_game{rank}'
    for rank in range(1, 4)
    for field in ('appid', 'name', 'playtime_2weeks', 'playtime_forever')
]

# Recently-played columns get a `_2weeks` suffix so they cannot clash with the
# owned-games columns of the same name.
recent_rename = {name: f'{name}_2weeks' for name in recent_columns}
# Existing output name kept as-is (no suffix) so the saved CSV schema does not change.
recent_rename['playtime_2weeks_game1'] = 'playtime_2weeks_game1'

# Fail loudly instead of silently dropping data if a table carries a column
# this layout does not know about (for instance a fourth recently played game).
for table_name, table, allowed in (
    ('player_info_df', player_info_df, info_columns),
    ('player_games_df', player_games_df, owned_columns),
    ('player_recently_played_games_df', player_recently_played_games_df, recent_columns),
):
    unexpected = [name for name in table.columns if name not in allowed]
    if unexpected:
        raise ValueError(f'{table_name} has unexpected columns: {unexpected}')

# reindex puts the columns in the fixed output order and fills any column
# that is absent from the data with missing values.
new_df = pd.concat(
    [
        player_info_df.reindex(columns=info_columns),
        player_games_df.reindex(columns=owned_columns),
        player_recently_played_games_df.reindex(columns=recent_columns).rename(columns=recent_rename),
    ],
    ignore_index=False,
    axis=1,
)
new_df

Unnamed: 0,steamid,communityvisibilitystate,profilestate,personaname,profileurl,avatar,avatarmedium,avatarfull,avatarhash,personastate,...,playtime_2weeks_game1,playtime_forever_game1_2weeks,appid_game2_2weeks,name_game2_2weeks,playtime_2weeks_game2_2weeks,playtime_forever_game2_2weeks,appid_game3_2weeks,name_game3_2weeks,playtime_2weeks_game3_2weeks,playtime_forever_game3_2weeks
0,76561197960269909,3.0,1.0,ツxxツ,https://steamcommunity.com/id/xcari/,https://avatars.steamstatic.com/c8499ee4d5ebde...,https://avatars.steamstatic.com/c8499ee4d5ebde...,https://avatars.steamstatic.com/c8499ee4d5ebde...,c8499ee4d5ebdebd78f07fc3fa19ce5370da82be,0.0,...,,,,,,,,,,
1,76561197960280440,3.0,1.0,recon,https://steamcommunity.com/id/pzrecon/,https://avatars.steamstatic.com/628974cb0fcec1...,https://avatars.steamstatic.com/628974cb0fcec1...,https://avatars.steamstatic.com/628974cb0fcec1...,628974cb0fcec15a07cd1601fdadc7aa44ac245d,0.0,...,,,,,,,,,,
2,76561197960290472,3.0,1.0,JKBe,https://steamcommunity.com/profiles/7656119796...,https://avatars.steamstatic.com/c698ae39dd85c1...,https://avatars.steamstatic.com/c698ae39dd85c1...,https://avatars.steamstatic.com/c698ae39dd85c1...,c698ae39dd85c1a1567e184a4b1735e9077a475f,0.0,...,,,,,,,,,,
3,76561197960315957,3.0,1.0,chuNami5000,https://steamcommunity.com/id/chuNk--/,https://avatars.steamstatic.com/b16314c3aff86b...,https://avatars.steamstatic.com/b16314c3aff86b...,https://avatars.steamstatic.com/b16314c3aff86b...,b16314c3aff86bf70b0bb5e54570fe8aa3efdd5d,3.0,...,1189.0,371092.0,1877600.0,MidnightGuns,82.0,82.0,952060.0,ResidentEvil3,67.0,272.0
4,76561197960331205,3.0,1.0,Mikki,https://steamcommunity.com/profiles/7656119796...,https://avatars.steamstatic.com/fef49e7fa7e199...,https://avatars.steamstatic.com/fef49e7fa7e199...,https://avatars.steamstatic.com/fef49e7fa7e199...,fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20735,76561198130132230,3.0,1.0,Fishvit,https://steamcommunity.com/id/hackeraim007/,https://avatars.steamstatic.com/18cf9e93edbe79...,https://avatars.steamstatic.com/18cf9e93edbe79...,https://avatars.steamstatic.com/18cf9e93edbe79...,18cf9e93edbe79213f24fd277a0c8fc05e9ec68f,1.0,...,3644.0,26382.0,1172470.0,ApexLegends,215.0,56888.0,,,,
20736,76561198981187936,3.0,1.0,juhobossu,https://steamcommunity.com/id/zeihou16/,https://avatars.steamstatic.com/3fe6469f4eb139...,https://avatars.steamstatic.com/3fe6469f4eb139...,https://avatars.steamstatic.com/3fe6469f4eb139...,3fe6469f4eb13922612c7bf2a66d7df43ac47c5e,0.0,...,,,,,,,,,,
20737,76561198286511601,3.0,1.0,OhNo|Zigludo,https://steamcommunity.com/profiles/7656119828...,https://avatars.steamstatic.com/d472abf1cd758e...,https://avatars.steamstatic.com/d472abf1cd758e...,https://avatars.steamstatic.com/d472abf1cd758e...,d472abf1cd758e4e47c0b03ed915300ad5ca5fa4,0.0,...,48.0,23865.0,,,,,,,,
20738,76561198106511894,3.0,1.0,MUNK,https://steamcommunity.com/profiles/7656119810...,https://avatars.steamstatic.com/3604ac34b47c87...,https://avatars.steamstatic.com/3604ac34b47c87...,https://avatars.steamstatic.com/3604ac34b47c87...,3604ac34b47c87e187d151f22aa17e107253ce34,1.0,...,1037.0,6228.0,553850.0,HELLDIVERS™2,51.0,7962.0,1938090.0,CallofDuty®,5.0,6860.0


### 提取所有涉及的游戏信息，用于后续用户画像分类

提取游戏信息的列

In [15]:
# App-id columns of the five most-played owned games ...
column_names1 = [f'appid_game{rank}' for rank in range(1, 6)]
# ... and of the three recently played games (the `_2weeks` columns).
column_names2 = [f'appid_game{rank}_2weeks' for rank in range(1, 4)]
all_columns = [*column_names1, *column_names2]
selected_columns = new_df.loc[:, all_columns]
selected_columns  # two families of appid_game columns: plain ones and `_2weeks` ones

Unnamed: 0,appid_game1,appid_game2,appid_game3,appid_game4,appid_game5,appid_game1_2weeks,appid_game2_2weeks,appid_game3_2weeks
0,,,,,,,,
1,,,,,,,,
2,,,,,,,,
3,730.0,10.0,782330.0,391220.0,228380.0,730.0,1877600.0,952060.0
4,,,,,,,,
...,...,...,...,...,...,...,...,...
20735,,,,,,1811260.0,1172470.0,
20736,,,,,,,,
20737,,,,,,1384160.0,,
20738,,,,,,2054970.0,553850.0,1938090.0


将所有游戏 ID 列整理成一列

In [16]:
# Stack every selected app-id column end to end into a single column
# (labelled 0) with a fresh 0..n-1 index. One concat over the list of columns
# replaces growing a DataFrame with pd.concat inside a loop, which re-copied
# the accumulated data on every pass and relied on pandas implicitly turning
# each Series into a one-column frame.
null_df = pd.concat(
    [selected_columns[column_name] for column_name in selected_columns.columns],
    ignore_index=True,
).to_frame()


In [18]:
# Name the single column, drop missing ids, keep one row per distinct id and
# cast the ids to integers.
null_df = (
    null_df
    .set_axis(['game_id'], axis=1)
    .dropna()
    .drop_duplicates()
    .astype({'game_id': int})
)
null_df

Unnamed: 0,game_id
3,730
10,252950
11,782330
13,252490
16,10
...,...
165640,1715130
165732,948740
165839,216110
165870,2381620


### 保存所有数据用于后续分析

In [19]:
# Persist the combined player table and the distinct game-id list as CSV files
# (the DataFrame index is written as the first column).
for frame, target in (
    (new_df, '../../data/processed/all_steam_data.csv'),
    (null_df, '../../data/processed/game_list.csv'),
):
    frame.to_csv(target)