# 数据预处理

In [None]:
# 导入预处理过程需要的所有软件包
import os                  # 一些操作系统提供的 API
import csv
from tqdm import tqdm      # 为循环或其他迭代操作添加进度条
import numpy as np
import pandas as pd
import ujson as json       # 用于读入 .json 文件
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Path constants used by the cells below
PATH_TO_RAW_DATA = "../data"
PATH_TO_PROCESSED_DATA = "./data"

## Step0. 分析特征

In [None]:
from utils.readjsonl import read_matches
from utils.getfeaturetree import get_keys_relation, build_tree, print_tree

In [None]:
# Collect the relations between all keys found in the training set and build a tree from them
key_rel_train = get_keys_relation(os.path.join(PATH_TO_RAW_DATA, "train_matches.jsonl"))
tree = build_tree(key_rel_train)

# Write the printed tree to a text file
feature_tree_train_path = os.path.join(PATH_TO_PROCESSED_DATA, "feature_tree_train.txt")
with open(feature_tree_train_path, "w") as f:
    print_tree(tree, file=f)

# Store the tree as a .json file
feature_tree_train_json_path = os.path.join(PATH_TO_PROCESSED_DATA, "feature_tree_train.json")
with open(feature_tree_train_json_path, "w") as f:
    json.dump(tree, f)

In [None]:
# Collect the relations between all keys found in the test set and build a tree from them
key_rel_test = get_keys_relation(os.path.join(PATH_TO_RAW_DATA, "test_matches.jsonl"))
tree = build_tree(key_rel_test)

# Write the printed tree to a text file
feature_tree_test_path = os.path.join(PATH_TO_PROCESSED_DATA, "feature_tree_test.txt")
with open(feature_tree_test_path, "w") as f:
    print_tree(tree, file=f)

# Store the tree as a .json file
feature_tree_test_json_path = os.path.join(PATH_TO_PROCESSED_DATA, "feature_tree_test.json")
with open(feature_tree_test_json_path, "w") as f:
    json.dump(tree, f)

In [None]:
def update_json_with_descriptions(json_file, excel_file, output_file):
    """Attach the feature descriptions from an .xlsx sheet to the matching JSON features.

    The JSON document is loaded from ``json_file``. Every key whose value is
    an empty dict and whose name appears in the sheet is replaced by
    ``{"description": ...}``. The result is written to ``output_file``.
    """
    with open(json_file, 'r', encoding='utf-8') as src:
        data = json.load(src)

    # Each sheet row is unpacked as (feature, description, third column);
    # the third column is ignored.
    sheet = pd.read_excel(excel_file)
    feature_descriptions = {}
    for feature, description, _ in sheet.values:
        feature_descriptions[feature] = {"description": description}

    def update_feature_descriptions(node):
        """Walk dicts and lists recursively, filling in empty feature entries."""
        if isinstance(node, list):
            for element in node:
                update_feature_descriptions(element)
            return
        if not isinstance(node, dict):
            return
        for name, child in node.items():
            if name in feature_descriptions and child == {}:
                # Known feature with no content yet: attach its description.
                node[name] = feature_descriptions[name]
            else:
                update_feature_descriptions(child)

    update_feature_descriptions(data)

    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(data, f_out, indent=4, ensure_ascii=False)

json_file_path = "./data/feature_tree_test.json"   # path of the original JSON file
excel_file_path = "./data/dota-feature.xlsx"       # path of the Excel file
output_file_path = "./data/updated_feature.json"  # path of the output JSON file

update_json_with_descriptions(json_file_path, excel_file_path, output_file_path)

手工标注 `updated_feature.json`，得到 `feature_description.json`。

## Step1. 特征选择

理解特征含义后，选择需要的特征并从原始数据中提取所需数据。

In [None]:
# Make sure the output directory exists
extracted_data_path = os.path.join(PATH_TO_RAW_DATA, 'extracted_data')
os.makedirs(extracted_data_path, exist_ok=True)  # create the directory if it does not exist yet

In [None]:
print('1.一级标签提取 —— main table')

def extract_non_nested_and_count_nested_keys(json_object):
    """Summarise the top-level attributes of one JSON object.

    Scalar attributes are copied as they are. An attribute holding a list or
    a dict is replaced by a ``<key>_number`` entry with its length.
    """
    summary = {}
    for name, content in json_object.items():
        if isinstance(content, (dict, list)):
            # Nested container: keep only how many items it holds.
            summary[f"{name}_number"] = len(content)
        else:
            summary[name] = content
    return summary

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """Write the main table: one CSV row per JSONL line.

    Each row holds the scalar top-level attributes plus the ``*_number``
    counts of nested ones, minus the counts for objectives, players and
    targets. The CSV header is taken from the first processed line.
    """
    dropped_columns = ('objectives_number', 'players_number', 'targets_number')

    with open(jsonl_file_path, 'r') as source, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as target:
        writer = None
        for raw_line in source:
            record = extract_non_nested_and_count_nested_keys(json.loads(raw_line.strip()))

            # Drop the counts that are not wanted in the main table.
            for column in dropped_columns:
                record.pop(column, None)

            if writer is None:
                # First line: its keys define the CSV columns.
                writer = csv.DictWriter(target, fieldnames=record.keys())
                writer.writeheader()

            writer.writerow(record)

# Build the main table CSV from the training matches
jsonl_file_path = os.path.join(PATH_TO_RAW_DATA, "train_matches.jsonl")
csv_file_path = os.path.join(PATH_TO_RAW_DATA, 'extracted_data/main_table_deleted.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)

print('1.一级标签提取——main table提取成功')

In [None]:
print('2.提取objectives表格')

def find_max_objectives(jsonl_file_path):
    """Return the largest number of objectives found on any line of the JSONL file.

    A line without an ``objectives`` entry counts as zero. The result is
    also printed before it is returned.
    """
    max_objectives = 0
    with open(jsonl_file_path, 'r') as source:
        for raw_line in source:
            record = json.loads(raw_line.strip())
            max_objectives = max(max_objectives, len(record.get("objectives", [])))
    print(max_objectives)
    return max_objectives

def extract_objectives(json_object, max_objectives):
    """Spread the objectives of one match over ``objective-<i>-<field>`` columns.

    Columns are produced for slots 1..max_objectives. Slots beyond the
    objectives actually present, and fields an objective lacks, hold "".
    """
    fields = ("time", "type", "player_slot", "team", "key", "slot")
    objectives = json_object.get("objectives", [])
    available = len(objectives)

    row = {}
    for slot in range(1, max_objectives + 1):
        # An empty dict stands in for a missing objective, so every field falls back to "".
        source = objectives[slot - 1] if slot <= available else {}
        for field in fields:
            row[f"objective-{slot}-{field}"] = source.get(field, "")
    return row

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """Write the objectives table: one CSV row per JSONL line.

    A preliminary pass over the file determines the maximum number of
    objectives, so that every row is expanded to the same set of columns.
    """
    widest = find_max_objectives(jsonl_file_path)

    with open(jsonl_file_path, 'r') as source, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as target:
        writer = None
        for raw_line in source:
            row = extract_objectives(json.loads(raw_line.strip()), max_objectives=widest)
            if writer is None:
                # The first row's keys become the CSV header.
                writer = csv.DictWriter(target, fieldnames=row.keys())
                writer.writeheader()
            writer.writerow(row)

# Build the objectives table CSV from the training matches
jsonl_file_path = os.path.join(PATH_TO_RAW_DATA, "train_matches.jsonl")
csv_file_path = os.path.join(PATH_TO_RAW_DATA, 'extracted_data/objective_table_deleted.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)

print('2.提取objectives表格提取成功')

In [None]:
print('3.提取targets表格')

def extract_radiant_win(json_object):
    """Return ``{"radiant_win": ...}`` read from the object's ``targets`` entry.

    The value is "" when ``targets`` or its ``radiant_win`` field is absent.
    """
    targets = json_object.get("targets", {})
    return {"radiant_win": targets.get("radiant_win", "")}

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """Write the targets table: the ``radiant_win`` value of every JSONL line."""
    with open(jsonl_file_path, 'r') as source, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as target:
        writer = None
        for raw_line in source:
            row = extract_radiant_win(json.loads(raw_line.strip()))
            if writer is None:
                # Header is written once, from the first row's keys.
                writer = csv.DictWriter(target, fieldnames=row.keys())
                writer.writeheader()
            writer.writerow(row)

# Build the targets table CSV (radiant_win only) from the training matches
jsonl_file_path = os.path.join(PATH_TO_RAW_DATA, "train_matches.jsonl")
csv_file_path = os.path.join(PATH_TO_RAW_DATA, 'extracted_data/target_table_radiantwin.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)
print('3.targets表格提取成功')

In [None]:
print('4.提取teamfights表格')

def extract_teamfights(json_object):
    """Spread the teamfights of one match over flat columns.

    Every teamfight ``i`` yields ``teamfights-<i>-<field>`` for end, start,
    deaths and last_death. Every player ``j`` of that fight yields
    ``teamfights-<i>-player-<j>-<field>`` for xp_delta, damage, gold_delta,
    healing and buybacks. Other player fields are not copied, and missing
    fields hold "".
    """
    fight_fields = ("end", "start", "deaths", "last_death")
    player_fields = ("xp_delta", "damage", "gold_delta", "healing", "buybacks")

    row = {}
    for fight_no, fight in enumerate(json_object.get("teamfights", []), start=1):
        fight_prefix = f"teamfights-{fight_no}"
        for field in fight_fields:
            row[f"{fight_prefix}-{field}"] = fight.get(field, "")

        for player_no, player in enumerate(fight.get("players", []), start=1):
            player_prefix = f"{fight_prefix}-player-{player_no}"
            for field in player_fields:
                row[f"{player_prefix}-{field}"] = player.get(field, "")
    return row

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """Write the teamfights table: one CSV row per JSONL line.

    The file is read twice. The first pass gathers the union of all column
    names produced by ``extract_teamfights``. The second pass writes the
    rows under that sorted header. Columns a match does not have are left
    empty by ``csv.DictWriter``.
    """
    # First pass: collect every column name that occurs in any match.
    all_fieldnames = set()
    with open(jsonl_file_path, 'r') as jsonl_file:
        for line in jsonl_file:
            json_object = json.loads(line.strip())
            all_fieldnames.update(extract_teamfights(json_object))

    # Sort the names so the column order is consistent.
    all_fieldnames = sorted(all_fieldnames)

    # Second pass: write the data to the CSV file.
    with open(jsonl_file_path, 'r') as jsonl_file, open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=all_fieldnames)
        csv_writer.writeheader()
        for line in jsonl_file:
            json_object = json.loads(line.strip())
            # extract_teamfights accepts the match object only; passing a
            # second argument raised TypeError on the first row.
            processed_data = extract_teamfights(json_object)
            csv_writer.writerow(processed_data)

# Build the teamfights table CSV from the training matches
jsonl_file_path = os.path.join(PATH_TO_RAW_DATA, "train_matches.jsonl")
csv_file_path = os.path.join(PATH_TO_RAW_DATA, 'extracted_data/teamfights_table_deleted.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)

print('4.teamfights表格提取成功')

In [None]:
print('5.提取players表格')

def flatten_dict(data, prefix=""):
    """Flatten one level of nested dicts into ``parent-child`` keys.

    Every key is first prefixed with ``prefix`` (joined by "-") when a
    prefix is given. A dict value contributes one entry per sub-key, named
    ``<key>-<sub_key>``; anything nested deeper is stored as it is. Any
    other value is stored unchanged under the (prefixed) key.
    """
    flattened = {}
    for name, content in data.items():
        full_name = name if not prefix else f"{prefix}-{name}"
        if not isinstance(content, dict):
            flattened[full_name] = content
            continue
        # Expand exactly one level; the sub-values are not descended into.
        flattened.update(
            (f"{full_name}-{child_name}", child) for child_name, child in content.items()
        )
    return flattened

def extract_players(json_object):
    """Spread the players of one match over ``players-<i>-<field>`` columns.

    Only the fields listed in ``required_fields`` are kept, and only when
    the player has them. ``max_hero_hit`` is reduced to its ``value`` entry
    when it is a dict. Remaining dict-valued fields are expanded by
    ``flatten_dict``.
    """
    required_fields = (
        "sen_placed", "sen_left_log", "kills", "obs_left_log",
        "max_hero_hit", "obs_log", "max_mana", "creeps_stacked",
        "xp_reasons", "randomed", "towers_killed", "health",
        "rune_pickups", "level", "stuns", "deaths", "gold",
        "nearby_creep_death_count", "denies", "observers_placed",
        "sen_log", "hero_id", "max_health",
    )

    data = {}
    for position, player in enumerate(json_object.get("players", []), start=1):
        kept = {}
        for field in required_fields:
            if field not in player:
                continue
            value = player[field]
            if field == "max_hero_hit" and isinstance(value, dict):
                # Keep only the hit value, not the rest of the record.
                value = value.get("value", "")
            kept[field] = value

        data.update(flatten_dict(kept, prefix=f"players-{position}"))

    return data

def jsonl_to_csv(jsonl_file_path, csv_file_path):
    """Write the players table: one CSV row per JSONL line.

    The file is read twice: once to gather the union of all column names,
    once to write the rows under that sorted header.
    """
    # Pass 1: union of the column names over all matches.
    column_names = set()
    with open(jsonl_file_path, 'r') as source:
        for raw_line in source:
            column_names.update(extract_players(json.loads(raw_line.strip())))

    # Sorted so the column order is consistent.
    header = sorted(column_names)

    # Pass 2: write header and rows.
    with open(jsonl_file_path, 'r') as source, \
            open(csv_file_path, 'w', newline='', encoding='utf-8') as target:
        writer = csv.DictWriter(target, fieldnames=header)
        writer.writeheader()
        for raw_line in source:
            writer.writerow(extract_players(json.loads(raw_line.strip())))

# Build the players table CSV from the training matches
jsonl_file_path = os.path.join(PATH_TO_RAW_DATA, "train_matches.jsonl")
csv_file_path = os.path.join(PATH_TO_RAW_DATA, 'extracted_data/players_table_deleted.csv')
jsonl_to_csv(jsonl_file_path, csv_file_path)

print('5.players表格提取成功')

---

To be continued...

添加新的特征

In [None]:
def add_new_features(df_features, matches_file):
    """Add tower-kill features to ``df_features`` in place.

    For every match read from ``matches_file``, the tower-kill objectives
    are counted per team and four columns are written to the row labelled
    by the match's ``match_id_hash``: ``radiant_tower_kills``,
    ``dire_tower_kills``, ``diff_tower_kills`` and ``ratio_tower_kills``.
    Nothing is returned.
    """
    for match in read_matches(matches_file):
        row_label = match['match_id_hash']

        # Teams credited with each tower kill: team 2 is counted for
        # radiant, team 3 for dire.
        tower_kill_teams = [
            objective['team']
            for objective in match['objectives']
            if objective['type'] == 'CHAT_MESSAGE_TOWER_KILL'
        ]
        radiant_tower_kills = tower_kill_teams.count(2)
        dire_tower_kills = tower_kill_teams.count(3)

        # Write the counts and their difference into new columns.
        df_features.loc[row_label, 'radiant_tower_kills'] = radiant_tower_kills
        df_features.loc[row_label, 'dire_tower_kills'] = dire_tower_kills
        df_features.loc[row_label, 'diff_tower_kills'] = radiant_tower_kills - dire_tower_kills

        # The 0.01 offset keeps the ratio defined when dire has no tower kills.
        df_features.loc[row_label, 'ratio_tower_kills'] = radiant_tower_kills / (0.01+dire_tower_kills)
        # ... here you can add more features ...

In [None]:
import collections

# Match-level features: (column name, function computing the value from a match dict)
MATCH_FEATURES = [
    ('game_time', lambda m: m['game_time']),
    ('game_mode', lambda m: m['game_mode']),
    ('lobby_type', lambda m: m['lobby_type']),
    ('objectives_len', lambda m: len(m['objectives'])),
    ('chat_len', lambda m: len(m['chat'])),
]

# Per-player fields that extract_features_csv copies into the row for every player
PLAYER_FIELDS = [
    'hero_id',

    'kills',
    'deaths',
    'assists',
    'denies',

    'gold',
    'lh',
    'xp',
    'health',
    'max_health',
    'max_mana',
    'level',

    'x',
    'y',

    'stuns',
    'creeps_stacked',
    'camps_stacked',
    'rune_pickups',
    'firstblood_claimed',
    'teamfight_participation',
    'towers_killed',
    'roshans_killed',
    'obs_placed',
    'sen_placed',
]

def extract_features_csv(match):
    """Flatten one match into an ordered feature row.

    Columns, in order: ``match_id_hash``, every entry of ``MATCH_FEATURES``,
    then every ``PLAYER_FIELDS`` value of each player. Players in slots 0-4
    are prefixed ``r1``..``r5``; later slots become ``d1``, ``d2``, ...
    """
    row = collections.OrderedDict()
    row['match_id_hash'] = match['match_id_hash']

    for name, getter in MATCH_FEATURES:
        row[name] = getter(match)

    for slot, player in enumerate(match['players']):
        # The first five slots get the "r" prefix, the rest get "d".
        side, number = ('r', slot + 1) if slot < 5 else ('d', slot - 4)
        for field in PLAYER_FIELDS:
            row[f'{side}{number}_{field}'] = player[field]

    return row

def extract_targets_csv(match, targets):
    """Build the ordered target row of one match.

    The row starts with the match's ``match_id_hash``, followed by the
    ``game_time``, ``radiant_win``, ``duration``, ``time_remaining`` and
    ``next_roshan_team`` values taken from ``targets``.
    """
    row = collections.OrderedDict()
    row['match_id_hash'] = match['match_id_hash']
    for field in ('game_time', 'radiant_win', 'duration', 'time_remaining', 'next_roshan_team'):
        row[field] = targets[field]
    return row

In [None]:
# Collect one feature row and one target row per training match
df_new_features = []
df_new_targets = []

for match in read_matches('../data/train_matches.jsonl'):
    match_id_hash = match['match_id_hash']
    features = extract_features_csv(match)
    targets = extract_targets_csv(match, match['targets'])

    df_new_features.append(features)
    df_new_targets.append(targets)

# Turn the collected rows into DataFrames indexed by match_id_hash
df_new_features = pd.DataFrame.from_records(df_new_features).set_index('match_id_hash')
df_new_targets = pd.DataFrame.from_records(df_new_targets).set_index('match_id_hash')

df_new_features.head()

## Step2. 数据清洗与加工

类别编码

可能的数据标准化 / 归一化等等

In [None]:
# Load the data
file_path_player_table = 'player_table_cleaning.csv'
player_table_data = pd.read_csv(file_path_player_table)

# Fill missing values with 0
player_table_data.fillna(0, inplace=True)

# Replace boolean values with 0 / 1
player_table_data.replace({False: 0, True: 1}, inplace=True)


# Process the xp_reasons columns of each player
for player_num in range(1, 11):
    # Build the xp_reasons column names of the current player (suffixes 0-3)
    reason_cols = [f'players-{player_num}-xp_reasons-{i}' for i in range(4)]
    new_col_name = f'players-{player_num}-xp'  # name of the new column

    # Only proceed when all of these columns exist
    if all(col in player_table_data.columns for col in reason_cols):
        # Compute the row-wise sum
        player_table_data[new_col_name] = player_table_data[reason_cols].sum(axis=1)

        # Remember the position of the first original column
        first_col_index = player_table_data.columns.get_loc(reason_cols[0])

        # Drop the original xp_reasons columns
        player_table_data.drop(columns=reason_cols, inplace=True)

        # Reorder the columns so the new column sits at the original position
        cols = player_table_data.columns.tolist()
        cols.insert(first_col_index, cols.pop(cols.index(new_col_name)))
        player_table_data = player_table_data[cols]

## Step3. 特征工程



In [None]:
# Analyse the time-series data
# Some fields in the raw dataset end in _t and hold time-series data
# They are recorded at different points in time during a match, so they can be used to analyse how a match evolves
# Examples: gold_t, lh_t, dn_t, xp_t
# gold_t => gold, lh_t => last hits, dn_t => denies, xp_t => experience

for match in read_matches('../data/train_matches.jsonl'):
    # One line per player: gold_t plotted against times
    for player in match['players']:
        plt.plot(player['times'], player['gold_t'])
    # Stop after the first match, so only that match is plotted
    break

plt.title('Gold change for all players')

### 3.1 计算随机森林的特征重要性

使用 `scikit-learn` 库的随机森林模型时，计算特征重要性非常简单。

可以直接通过模型的 `feature_importances_` 属性来获取每个特征的重要性得分。

这个属性会返回一个数组，数组中的每个元素对应特征的重要性分数，分数越高，表示该特征对模型越重要。

#### 3.1.1 基于 Gini 不纯度或信息增益评估

In [None]:
# Import the required libraries and load the data
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the example dataset (Iris)
data = load_iris()
X = data.data
y = data.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Define and train the random forest model
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Get the feature importances
importances = clf.feature_importances_

# Print the feature importances
for i, importance in enumerate(importances):
    print(f"Feature {i}: Importance {importance}")

# Visualise the feature importances with matplotlib to see which features contribute most to the model
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# Feature names
feature_names = data.feature_names

# Sort the features by importance, highest first
indices = np.argsort(importances)[::-1]

# Plot the feature importances
plt.figure(figsize=(10, 6))
plt.title("Feature Importance")
plt.bar(range(X_train.shape[1]), importances[indices], align="center")
plt.xticks(range(X_train.shape[1]), [feature_names[i] for i in indices], rotation=45)
plt.show()

#### 3.1.2 基于 OOB 评估

> OOB (Out of Bag)

随机森林在训练过程中会为每棵树随机抽取部分样本（bootstrap采样），这意味着一部分样本不会被用于训练当前的树，这些样本称为 **OOB样本**。

可以通过将某个特征在 OOB 样本中 **打乱**，然后观察模型的性能变化，来评估该特征对模型的影响。

**实现步骤**：

Step1. 随机森林模型的 OOB 设置

在使用`scikit-learn`训练随机森林模型时，可以通过将`oob_score`参数设置为`True`来启用 OOB 误差计算。

Step2. 特征打乱与性能对比

对于每个特征，可以按照以下步骤来计算OOB误差变化：

1. 基准 OOB 性能：首先在原始数据上计算 OOB 误差，作为基准。

2. 特征打乱：将某个特征的值随机打乱，用打乱后的数据重新拟合模型，再计算新的 OOB 误差。

3. 误差变化：比较打乱特征前后的 OOB 误差差值，误差增大越多，说明该特征对模型性能的贡献越大。

In [None]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load the dataset
data = load_iris()
X, y = data.data, data.target

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train the random forest model with OOB scoring enabled
rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=42)
rf.fit(X_train, y_train)

# Show the OOB-based performance
print(f"OOB Score: {rf.oob_score_}")  # rf.oob_score_ is the model's baseline performance on the OOB samples.

# Compute the change in OOB score after shuffling each feature, to observe how model performance changes.
from sklearn.metrics import accuracy_score

def permutation_importance_oob(model, X_train, y_train, random_state=None):
    """Estimate feature importance from the drop in OOB score after shuffling a feature.

    For each feature column, a copy of the training data is made with that
    column shuffled, a copy of ``model`` is refitted on it, and the decrease
    of the OOB score relative to ``model.oob_score_`` is recorded. The model
    passed in is never refitted, so it stays usable after the call.

    parameter:
        1. model        : fitted model exposing ``fit`` and ``oob_score_``
                          (e.g. a random forest trained with ``oob_score=True``)
        2. X_train      : training features, 2-D array-like (samples x features)
        3. y_train      : training labels
        4. random_state : optional seed for the shuffling; ``None`` keeps using
                          NumPy's global random state

    return:
        1-D numpy array with one value per feature: the baseline OOB score
        minus the OOB score obtained with that feature shuffled. A larger
        value means a more important feature.
    """
    import copy  # local import: only this function needs it

    # np.asarray lets DataFrames and nested lists be column-indexed like arrays.
    features = np.asarray(X_train)
    shuffler = np.random if random_state is None else np.random.RandomState(random_state)

    base_oob_score = model.oob_score_  # baseline OOB score
    feature_importances = np.zeros(features.shape[1])

    for col in range(features.shape[1]):
        permuted = features.copy()          # never touch the caller's data
        shuffler.shuffle(permuted[:, col])  # shuffle this feature column in place

        # Refit a copy: fitting `model` itself would overwrite the caller's
        # trained estimator with one fitted on shuffled data.
        scratch_model = copy.deepcopy(model)
        scratch_model.fit(permuted, y_train)

        # The larger the drop in score, the more important the feature.
        feature_importances[col] = base_oob_score - scratch_model.oob_score_

    return feature_importances

# Call the function to compute the feature importances
oob_importances = permutation_importance_oob(rf, X_train, y_train)

# Print the results
for i, importance in enumerate(oob_importances):
    print(f"Feature {i} ({data.feature_names[i]}): OOB Importance {importance}")

# Plot the feature importances
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.title("OOB Feature Importance")
plt.bar(range(X_train.shape[1]), oob_importances, align="center")
plt.xticks(range(X_train.shape[1]), data.feature_names, rotation=45)
plt.ylabel("OOB Score Decrease")
plt.show()

优点：

- **适用范围广**：虽然我们在这里使用的是随机森林模型，但这种 OOB 误差打乱方法可以应用于任何基于 bootstrap 采样、能够计算 OOB 得分的集成模型（例如 Bagging）。

- **直接评估**：相比于仅通过分裂节点的纯度变化来评估，基于OOB误差的方法能够直接反映特征对模型整体预测能力的影响。

挑战：

- **计算成本**：每次打乱特征后，都需要重新拟合模型并计算OOB分数。这对于大规模数据集或模型复杂度较高的情况，计算开销会较大。

- **特征相关性问题**：如果两个特征高度相关，打乱其中一个特征后，另一个特征可能会“补偿”它的影响，这使得模型的性能下降幅度不明显，从而低估该特征的重要性。

可以将OOB误差的重要性与**Gini不纯度减少**的特征重要性评分结合起来，综合评估模型中各特征的影响力。

**基于 OOB 误差的特征重要性评估** 方法通过打乱某个特征并观察模型 OOB 得分的变化来衡量该特征的重要性。

这种方法能直接反映特征对模型预测性能的影响，尤其适合随机森林模型。虽然计算复杂度较高，但它能提供更直观的特征评估方式，特别是在高维数据集中。