# 数据预处理

In [None]:
# 导入预处理过程需要的所有软件包
import os                  # 一些操作系统提供的 API
from tqdm import notebook  # 为循环或其他迭代操作添加进度条
import numpy as np
import pandas as pd
import ujson as json       # 用于读入 .json 文件
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

## Part0. 分析特征

In [None]:
def read_matches(matches_file):
    """Yield the matches stored in a JSON Lines file, one decoded record per line.

    The iteration is wrapped in a tqdm notebook progress bar. For the two
    known dataset file names the bar gets a fixed total; for any other name
    the total is ``None``.

    Args:
        matches_file: Path to a file holding one JSON document per line.

    Yields:
        The object decoded from each line by ``json.loads``.
    """
    # Known line counts, looked up by bare file name, used to size the progress bar.
    MATCHES_COUNT = {
        'test_matches.jsonl': 10000,
        'train_matches.jsonl': 39675
    }
    _, filename = os.path.split(matches_file)
    total_matches = MATCHES_COUNT.get(filename)

    # Decode explicitly as UTF-8 rather than with the platform's default
    # encoding, so the same file reads identically on every machine.
    with open(matches_file, encoding='utf-8') as fin:
        for line in notebook.tqdm(fin, total=total_matches):
            yield json.loads(line)

def extract_keys(data, parent_key=None, res=None):
    """Collect (parent path, key) pairs for every dict key nested inside ``data``.

    Paths are dot-joined key names. Keys of the outermost dict get the given
    ``parent_key`` (``None`` by default) as their parent. Lists add no path
    segment of their own: their items are attributed to the list's parent.

    Args:
        data: Decoded JSON value (dict, list or scalar).
        parent_key: Path under which ``data`` itself sits.
        res: Optional set to add the pairs to; a new set is used when ``None``.

    Returns:
        The set holding all collected pairs (``res`` itself when one was given).
    """
    collected = set() if res is None else res

    # Explicit work list of (value, path of that value) still to be visited.
    pending = [(data, parent_key)]
    while pending:
        node, prefix = pending.pop()
        if isinstance(node, dict):
            for name, child in node.items():
                collected.add((prefix, name))
                pending.append((child, f"{prefix}.{name}" if prefix else name))
        elif isinstance(node, list):
            pending.extend((element, prefix) for element in node)

    return collected

In [None]:
# Collect every distinct (parent path, key) pair seen in the training matches.
keys_rel_set = set()

for match in read_matches('../data/train_matches.jsonl'):
    keys_rel_set.update(extract_keys(match))

# Hand pandas a list rather than the set itself; the rows are ordered
# explicitly further down, so the set's arbitrary order does not matter.
keys_rel_df = pd.DataFrame(list(keys_rel_set), columns=["parent", "child"])

# Top-level keys have no parent (None); mark them with the "NULL" sentinel.
keys_rel_df = keys_rel_df.fillna("NULL")

# Last segment of each parent path, computed once and shared by the filters.
parent_leaf = keys_rel_df["parent"].str.split(".").str[-1]

# Parents whose children are dropped as unimportant.
ignored_parents = [
    "ability_uses",
    "item_uses",
    "damage_inflictor",
    "hero_hits",
    "damage_inflictor_received",
    "purchase",
]

keep_row = (
    ~keys_rel_df["child"].str.isnumeric()                # child is not a number
    & ~parent_leaf.str.isnumeric()                       # parent's last segment is not a number
    & ~parent_leaf.isin(ignored_parents)                 # parent is not in the ignore list
    & ~keys_rel_df["parent"].str.contains("npc_dota")    # no "npc_dota" in the parent path
    & ~keys_rel_df["child"].str.contains("npc_dota")     # no "npc_dota" in the key
)

# Sort the surviving rows so that everything derived from them (the feature
# tree and the files written from it) comes out in the same order on every run.
keys_rel_df = keys_rel_df[keep_row].sort_values(["parent", "child"])

keys_rel = keys_rel_df.apply(tuple, axis=1).values

In [None]:
def build_tree(tuples):
    """Assemble a nested dict from (parent path, key) pairs.

    Each pair gives a key and the dot-separated path of its ancestors; the
    literal path "NULL" marks a key that sits at the root. Ancestors that do
    not exist yet are created on the way down. Every node is a dict of its
    children, so leaves are empty dicts.

    Args:
        tuples: Iterable of ``(path, node)`` pairs.

    Returns:
        The root dict of the tree.
    """
    root = {}

    for parent_path, name in tuples:
        # "NULL" means "no ancestors", so the descent below is skipped.
        ancestors = [] if parent_path == "NULL" else parent_path.split('.')

        cursor = root
        for step in ancestors:
            # Step into the child, creating it first when it is missing.
            cursor = cursor.setdefault(step, {})

        # Add the key itself without touching an already existing subtree.
        cursor.setdefault(name, {})

    return root

# Turn the filtered (parent, child) pairs into a nested feature tree.
keys_rel_tree = build_tree(keys_rel)

# Save the tree as indented, human-readable JSON.
with open("./data/all_features_train.json", "w") as tree_file:
    tree_file.write(json.dumps(keys_rel_tree, indent=4))

In [None]:
def print_tree(tree, depth=0, file=None):
    """Print a nested dict as an indented outline, depth first.

    Each key is printed on its own line, indented four spaces per nesting
    level, and is immediately followed by the outline of its children.

    Args:
        tree: Dict mapping each key to the dict of its children.
        depth: Nesting level of ``tree``; sets the indentation of its keys.
        file: Stream passed to ``print``; ``None`` means standard output.
    """
    indent = '    ' * depth
    for name, children in tree.items():
        print(f"{indent}{name}", file=file)
        print_tree(children, depth=depth + 1, file=file)

# Write the outline inside a `with` block so the file handle is flushed and
# closed as soon as printing finishes.
with open("./data/features_tree_train.txt", "w") as fout:
    print_tree(keys_rel_tree, file=fout)

In [None]:
def update_json_with_descriptions(json_file, excel_file, output_file):
    """Attach descriptions from a spreadsheet to the empty leaves of a JSON feature tree.

    Every key of the tree whose value is an empty dict and whose name appears
    in the spreadsheet is replaced by
    ``{"description": ..., "priority": ...}``. Matching is done on the key
    name alone, not on its full path in the tree.

    Args:
        json_file: Path of the JSON feature tree to read.
        excel_file: Path of the spreadsheet. Its first column is read as the
            feature name, its second as the description, and the column
            named ``priority`` as the priority label.
        output_file: Path the annotated tree is written to.

    Priority labels are converted to integer ranks (T1=5 ... T5=1, 0=0). Any
    other label, including an empty cell, becomes ``None`` (JSON ``null``).
    Ranks stay integers, so they are serialized as ``5`` rather than ``5.0``.
    """
    # priority: T1 > T2 > T3 > T4 > T5 > 0
    priority_ranks = {'T1': 5, 'T2': 4, 'T3': 3, 'T4': 2, 'T5': 1, 0: 0}

    # Read the JSON feature tree.
    with open(json_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Read the spreadsheet with the feature annotations.
    df = pd.read_excel(excel_file)

    # feature -> {"description": ..., "priority": ...}
    # A plain dict lookup is used for the rank: unknown labels map to None
    # instead of NaN, which would turn every rank of the column into a float.
    feature_descriptions = {
        feature: {"description": description, "priority": priority_ranks.get(priority)}
        for feature, description, priority in zip(df.iloc[:, 0], df.iloc[:, 1], df['priority'])
    }

    def annotate(node):
        """Recursively replace described empty leaves below ``node``."""
        if isinstance(node, dict):
            for key, value in node.items():
                if key in feature_descriptions and value == {}:
                    # Only the value of an existing key changes, so the dict
                    # keeps its size and can be updated while iterating.
                    node[key] = feature_descriptions[key]
                else:
                    annotate(value)
        elif isinstance(node, list):
            for item in node:
                annotate(item)

    annotate(data)

    # Save the annotated tree to the output file.
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(data, f_out, indent=4, ensure_ascii=False)

# Inputs and output of the annotation step.
# NOTE(review): the tree-building cell of this notebook writes
# all_features_train.json; confirm where all_features_test.json comes from.
json_file_path = "./data/all_features_test.json"   # JSON feature tree to annotate
excel_file_path = "./data/dota-feature.xlsx"       # spreadsheet with the descriptions
output_file_path = "./data/updated_features.json"  # destination of the annotated tree

update_json_with_descriptions(
    json_file=json_file_path,
    excel_file=excel_file_path,
    output_file=output_file_path,
)

# Strip the trailing ".0" from whole-number priority values (e.g. 5.0 -> 5).
# The pattern is anchored to the "priority" key so that a digit followed by
# ".0" in any other text, such as "2.05" inside a description, is left alone.
!sed -i 's/\("priority": *[0-9][0-9]*\)\.0\b/\1/g' ./data/updated_features.json

接下来对 `updated_features.json` 进行手工标注。

## Step1. 特征选择

理解特征含义后，选择需要的特征并从原始数据中提取所需数据。

In [None]:
# How to access the raw data: each match is read as one record and its
# top-level fields are looked up by name.
for match in read_matches('../data/train_matches.jsonl'):
    match_id_hash, game_time, teamfights = match['match_id_hash'], match['game_time'], match['teamfights']
    objectives, game_mode, players = match['objectives'], match['game_mode'], match['players']
    # Expected output, one type per line: class 'dict', class 'list', class 'dict'.
    print(type(match), type(players), type(players[0]), sep='\n')
    # To be continued...
    break  # only the first match is inspected

In [None]:
# Scan the team fights of every training match and print recorded death
# positions together with the hash of the match they belong to.
# NOTE(review): `break` only leaves the player loop, so one line is printed
# per team fight that has such a player and the scan still runs through all
# matches - confirm this is intended rather than stopping at the first hit.
for match in read_matches('../data/train_matches.jsonl'):
    for teamfight in match['teamfights']:
        for player in teamfight['players']:
            if len(player['deaths_pos']) == 0:
                continue
            print(player['deaths_pos'], match['match_id_hash'])
            break

In [None]:
def add_new_features(df_features, matches_file):
    """Add tower-kill features to ``df_features`` in place.

    For every match read from ``matches_file`` the objectives of type
    ``CHAT_MESSAGE_TOWER_KILL`` are counted per team, and four values are
    written to the row labelled with the match's ``match_id_hash``:
    ``radiant_tower_kills``, ``dire_tower_kills``, their difference
    ``diff_tower_kills`` and ``ratio_tower_kills``.

    Args:
        df_features: DataFrame that receives the new columns.
        matches_file: Path handed to ``read_matches``.
    """
    for match in read_matches(matches_file):
        row_label = match['match_id_hash']  # one jsonl line corresponds to one row

        # Team codes of all tower-kill objectives of this match.
        tower_kill_teams = [
            objective['team']
            for objective in match['objectives']
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL'
        ]
        # Team code 2 is counted for Radiant, team code 3 for Dire.
        radiant_tower_kills = tower_kill_teams.count(2)
        dire_tower_kills = tower_kill_teams.count(3)

        new_values = {
            'radiant_tower_kills': radiant_tower_kills,
            'dire_tower_kills': dire_tower_kills,
            'diff_tower_kills': radiant_tower_kills - dire_tower_kills,
            # The 0.01 offset keeps the ratio defined when Dire has no kills.
            'ratio_tower_kills': radiant_tower_kills / (0.01+dire_tower_kills),
        }
        for column, value in new_values.items():
            df_features.loc[row_label, column] = value
        # ... here you can add more features ...

In [None]:
import collections

# Match-level features as (column name, extractor) pairs; every extractor
# receives the whole match dict.
MATCH_FEATURES = [
    ('game_time', lambda match: match['game_time']),
    ('game_mode', lambda match: match['game_mode']),
    ('lobby_type', lambda match: match['lobby_type']),
    ('objectives_len', lambda match: len(match['objectives'])),
    ('chat_len', lambda match: len(match['chat'])),
]

# Per-player fields copied into the feature row, in this order.
PLAYER_FIELDS = [
    # hero
    'hero_id',
    # score line
    'kills', 'deaths', 'assists', 'denies',
    # resources and stats
    'gold', 'lh', 'xp', 'health', 'max_health', 'max_mana', 'level',
    # position
    'x', 'y',
    # miscellaneous counters
    'stuns', 'creeps_stacked', 'camps_stacked', 'rune_pickups',
    'firstblood_claimed', 'teamfight_participation',
    'towers_killed', 'roshans_killed',
    'obs_placed', 'sen_placed',
]

def extract_features_csv(match):
    """Flatten one match into an ordered mapping of column name to value.

    The row starts with ``match_id_hash``, continues with the match-level
    values listed in ``MATCH_FEATURES`` and ends with every field of
    ``PLAYER_FIELDS`` for each player. Player columns are prefixed ``r1``..
    ``r5`` for the first five entries of ``match['players']`` and ``d1``,
    ``d2``, ... for the entries after them.

    Args:
        match: Dict of one match.

    Returns:
        ``collections.OrderedDict`` holding the row.
    """
    record = collections.OrderedDict()
    record['match_id_hash'] = match['match_id_hash']

    for name, getter in MATCH_FEATURES:
        record[name] = getter(match)

    for index, player in enumerate(match['players']):
        # Slots 0-4 get the "r" prefix, later slots the "d" prefix.
        prefix = f"r{index + 1}" if index < 5 else f"d{index - 4}"
        for field in PLAYER_FIELDS:
            record[f"{prefix}_{field}"] = player[field]

    return record

def extract_targets_csv(match, targets):
    """Build the target row of one match.

    Args:
        match: Dict of one match; only ``match_id_hash`` is read.
        targets: Dict holding the target values of that match.

    Returns:
        ``collections.OrderedDict`` with ``match_id_hash`` first, followed by
        ``game_time``, ``radiant_win``, ``duration``, ``time_remaining`` and
        ``next_roshan_team`` taken from ``targets``.
    """
    target_fields = ('game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team')

    record = collections.OrderedDict()
    record['match_id_hash'] = match['match_id_hash']
    for field in target_fields:
        record[field] = targets[field]
    return record

In [None]:
# Extract one feature row and one target row per training match.
df_new_features, df_new_targets = [], []

for match in read_matches('../data/train_matches.jsonl'):
    match_id_hash = match['match_id_hash']
    df_new_features.append(extract_features_csv(match))
    df_new_targets.append(extract_targets_csv(match, match['targets']))

# Turn both row lists into DataFrames indexed by the match hash.
df_new_features, df_new_targets = (
    pd.DataFrame.from_records(rows).set_index('match_id_hash')
    for rows in (df_new_features, df_new_targets)
)

df_new_features.head()

## Step2. 数据清洗与加工

类别编码

可能的数据标准化 / 归一化等等

## Step3. 特征工程



In [None]:
# Time-series analysis.
# Some fields of the raw data end in _t: they are recorded at successive
# points in time of a match and can be used to study how the match evolves.
# Examples: gold_t (gold), lh_t (last hits), dn_t (denies), xp_t (experience).

for match in read_matches('../data/train_matches.jsonl'):
    # One gold-over-time curve per player, for the first match only.
    for player in match['players']:
        timestamps, gold = player['times'], player['gold_t']
        plt.plot(timestamps, gold)
    break

plt.title('Gold change for all players')

### 3.1 计算随机森林的特征重要性

使用 `scikit-learn` 库的随机森林模型时，计算特征重要性非常简单。

可以直接通过模型的 `feature_importances_` 属性来获取每个特征的重要性得分。

这个属性会返回一个数组，数组中的每个元素对应特征的重要性分数，分数越高，表示该特征对模型越重要。

#### 3.1.1 基于 Gini 不纯度或信息增益评估

In [None]:
# 导入所需的库并加载数据
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the example dataset (Iris) and separate features from labels.
data = load_iris()
X, y = data.data, data.target

# Hold out 30% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define the random forest and fit it on the training split.
clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Importance scores exposed by the fitted model, one per feature.
importances = clf.feature_importances_

# Print the score of every feature.
for i, importance in enumerate(importances):
    print(f"Feature {i}: Importance {importance}")

# 使用 matplotlib 对特征重要性进行可视化，以帮助理解哪些特征对模型贡献最大
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Names of the features, in dataset order.
feature_names = data.feature_names

# Feature indices ordered from the highest to the lowest importance.
indices = np.argsort(importances)[::-1]

# Bar chart of the importances, most important feature first.
bar_positions = range(X_train.shape[1])
sorted_names = [feature_names[i] for i in indices]

plt.figure(figsize=(10, 6))
plt.title("Feature Importance")
plt.bar(bar_positions, importances[indices], align="center")
plt.xticks(bar_positions, sorted_names, rotation=45)
plt.show()

#### 3.1.2 基于 OOB 评估

> OOB (Out of Bag)

随机森林在训练过程中会为每棵树随机抽取部分样本（bootstrap采样），这意味着一部分样本不会被用于训练当前的树，这些样本称为 **OOB样本**。

可以通过将某个特征在 OOB 样本中 **打乱**，然后观察模型的性能变化，来评估该特征对模型的影响。

**实现步骤**：

Step1. 随机森林模型的 OOB 设置

在使用`scikit-learn`训练随机森林模型时，可以通过将`oob_score`参数设置为`True`来启用 OOB 误差计算。

Step2. 特征打乱与性能对比

对于每个特征，可以按照以下步骤来计算OOB误差变化：

1. 基准 OOB 性能：首先在原始数据上计算 OOB 误差，作为基准。

2. 特征打乱：将某个特征的值随机打乱，用打乱后的数据重新拟合模型，再计算新的 OOB 误差。

3. 误差变化：比较打乱特征前后的 OOB 误差差值，误差增大越多，说明该特征对模型性能的贡献越大。

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the dataset and separate features from labels.
data = load_iris()
X = data.data
y = data.target

# Hold out 30% of the samples as a test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit a random forest with the OOB score enabled.
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf.fit(X_train, y_train)

# rf.oob_score_ is the baseline performance of the model on the OOB samples.
print(f"OOB Score: {rf.oob_score_}")

# 计算特征打乱后的 OOB 误差变化，观察模型性能的变化。
from sklearn.metrics import accuracy_score

def permutation_importance_oob(model, X_train, y_train, random_state=None):
    """Score each feature by the OOB-score drop caused by shuffling it.

    For every column, a copy of the training data with that column shuffled
    is used to refit a copy of ``model``. The importance of the column is the
    baseline OOB score minus the OOB score obtained from that refit, so a
    larger value means a more important feature.

    Neither ``model`` nor ``X_train`` is modified: all refits happen on a
    deep copy of the model and all shuffles on a copy of the data.

    Args:
        model: Fitted estimator exposing ``oob_score_`` and ``fit(X, y)``.
        X_train: 2-D array-like of training features (one column per feature).
        y_train: Training labels.
        random_state: Optional seed for the shuffles. With ``None`` the
            global NumPy random state is used.

    Returns:
        1-D NumPy array with one importance value per column of ``X_train``.
    """
    import copy  # local import: needed only here to detach from the caller's model

    # np.array() makes a copy and also accepts lists and other array-likes.
    features = np.array(X_train)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    base_oob_score = model.oob_score_  # baseline OOB score

    # Refit a private copy, so the caller's fitted model and its oob_score_
    # are still valid after this function returns.
    scratch_model = copy.deepcopy(model)

    n_features = features.shape[1]
    feature_importances = np.zeros(n_features)

    for col in range(n_features):
        permuted = features.copy()
        rng.shuffle(permuted[:, col])  # shuffles this one column in place

        # Refit on the data with the shuffled column and read the new OOB score.
        scratch_model.fit(permuted, y_train)

        # The further the score drops, the more important the feature.
        feature_importances[col] = base_oob_score - scratch_model.oob_score_

    return feature_importances

# Compute the OOB-based importance of every feature.
oob_importances = permutation_importance_oob(rf, X_train, y_train)

# Print each feature's index and name with its importance.
for i, (name, importance) in enumerate(zip(data.feature_names, oob_importances)):
    print(f"Feature {i} ({name}): OOB Importance {importance}")

# 可视化特征重要性
import matplotlib.pyplot as plt

# Bar chart of the OOB score decrease per feature, in dataset order.
bar_positions = range(X_train.shape[1])

plt.figure(figsize=(10, 6))
plt.title("OOB Feature Importance")
plt.bar(bar_positions, oob_importances, align="center")
plt.xticks(bar_positions, data.feature_names, rotation=45)
plt.ylabel("OOB Score Decrease")
plt.show()

优点：

- **适用范围广**：虽然我们在这里使用的是随机森林模型，但这种OOB误差打乱方法同样适用于其他基于 bootstrap 采样（bagging）的集成模型。

- **直接评估**：相比于仅通过分裂节点的纯度变化来评估，基于OOB误差的方法能够直接反映特征对模型整体预测能力的影响。

挑战：

- **计算成本**：每次打乱特征后，都需要重新拟合模型并计算OOB分数。这对于大规模数据集或模型复杂度较高的情况，计算开销会较大。

- **特征相关性问题**：如果两个特征高度相关，打乱其中一个特征后，另一个特征可能会“补偿”它的影响，这使得模型的性能下降幅度不明显，从而低估该特征的重要性。

可以将OOB误差的重要性与**Gini不纯度减少**的特征重要性评分结合起来，综合评估模型中各特征的影响力。

**基于 OOB 误差的特征重要性评估** 方法通过打乱某个特征并观察模型 OOB 得分的变化来衡量该特征的重要性。

这种方法能直接反映特征对模型预测性能的影响，尤其适合随机森林模型。虽然计算复杂度较高，但它能提供更直观的特征评估方式，特别是在高维数据集中。