下一阶段的基本任务为：在前期获取到的数据上初步对随机森林模型进行超参数调优，使用调好的超参数进行特征重要性评估，选取 top k 个重要性较高的特征进行训练，观察结果和使用所有特征进行训练是否有较大差别。

其次，进行特征组合：对已有数据做进一步计算（例如 KDA），据此添加一系列新特征。



In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
%matplotlib inline

In [None]:
# Directory that holds the input CSV file.
PATH_TO_DATA = './data/'

# Load the training table with 'match_id_hash' as the row index.
# The 'radiant_win' column is used as the label further down.
train_data_path = os.path.join(PATH_TO_DATA, 'Dota_data_v1.0.csv')
df_train = pd.read_csv(train_data_path, index_col='match_id_hash')

In [None]:
# Show (n_rows, n_columns) of the loaded training table.
print(df_train.shape)

In [None]:
# Split the label column 'radiant_win' off the feature table.
if 'radiant_win' in df_train:
    # pop() returns the column and removes it from df_train in one step.
    df_y = df_train.pop('radiant_win')
else:
    print('No target')
    df_y = None

# Only convert when a label column was found; previously a missing target
# crashed here with AttributeError on None.
y = df_y.to_numpy() if df_y is not None else None

In [None]:
# Remove the 'game_time.1' column, then list all remaining column names.
# errors='ignore' keeps the cell re-runnable: a second run no longer fails
# with KeyError once the column is already gone.
df_train.drop(columns=['game_time.1'], errors='ignore', inplace=True)
df_train.columns

In [None]:
# Keep only the columns that come before 'total_teamfight_time'; that column
# and everything after it are dropped from the feature set.
_first_dropped_col = df_train.columns.get_loc('total_teamfight_time')
df_X = df_train.iloc[:, :_first_dropped_col]

# Plain NumPy view of the features for scikit-learn.
X = df_X.to_numpy()

In [None]:
# Search space explored by the Bayesian optimiser (integer ranges).
param_dist = dict(
    n_estimators=Integer(10, 200),
    max_depth=Integer(1, 200),
    min_samples_split=Integer(2, 10),
    min_samples_leaf=Integer(1, 10),
)

# Stratified, shuffled 5-fold cross-validation with a fixed seed.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=17,
)

# Base random forest whose hyper-parameters are tuned.
rf = RandomForestClassifier(
    criterion='log_loss',
    class_weight='balanced',
    random_state=17,
    n_jobs=-1,
)

# Bayesian optimisation: 20 iterations, scored by ROC AUC.
opt = BayesSearchCV(
    estimator=rf,
    search_spaces=param_dist,
    n_iter=20,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
    random_state=17,
)
opt.fit(X, y)

In [None]:
# Save the Bayesian-optimisation history, best configuration first.
results = pd.DataFrame(opt.cv_results_)

# The search is scored with 'roc_auc', a higher-is-better metric, so the
# scores are kept as reported. Negating them (as one would for a loss) made
# the descending sort below rank the worst configuration first.
# 'mean_train_score' is not touched: it only exists when the search is run
# with return_train_score=True, and reading it otherwise raises KeyError.

# Expose the searched hyper-parameters under their plain names.
_param_names = ['n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf']
for _name in _param_names:
    results[_name] = results[f'param_{_name}']

results = results.sort_values('mean_test_score', ascending=False)
results = results.drop(columns=[f'param_{_name}' for _name in _param_names])
results = results.reset_index(drop=True)

# Make sure the output directory exists before writing.
os.makedirs('./results', exist_ok=True)
results.to_csv('./results/RF_bayes_search_results.csv', index=False)

In [None]:
# Report the best hyper-parameters found by the Bayesian search and their CV score.
print('Best params & Best score', opt.best_params_, opt.best_score_)

In [None]:
# Random forest with a fixed set of hyper-parameters.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=28,
    min_samples_split=6,
    min_samples_leaf=8,
    criterion='log_loss',
    class_weight='balanced',
    n_jobs=-1,
    random_state=17,
)

# Evaluate the model with cross-validated ROC AUC on all features.
cv_scores = cross_val_score(estimator=rf, X=X, y=y, scoring='roc_auc', cv=cv)
print('CV scores', cv_scores)
print(f'CV mean: {cv_scores.mean()}, CV std: {cv_scores.std()}')

In [None]:
# Fit on the full data and read scikit-learn's feature_importances_ attribute.
rf.fit(X, y)
importances = rf.feature_importances_

# Print every feature name next to its importance.
for feature_name, imp in zip(df_X.columns, importances):
    print(f'{feature_name}: {imp}')

# Keep the 150 most important features, highest first, and plot them.
df_fi = (
    pd.DataFrame({'Feature': df_X.columns, 'importance': importances})
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
    .head(150)
)

plt.figure(figsize=(14, 28))
sns.barplot(data=df_fi, x="importance", y="Feature")
plt.title('Features importance')
plt.tight_layout()
plt.savefig('FI-Simple.png')

In [None]:
# Train on the 150 features with the highest importance.
# NOTE(review): the importances were computed on the full data set, so the
# CV estimate below may be optimistic - confirm whether that is acceptable.
_ranked_desc = np.argsort(importances)[::-1]
index = _ranked_desc[:150]
X_new = X[:, index]

cv_scores = cross_val_score(estimator=rf, X=X_new, y=y, scoring='roc_auc', cv=cv)
print('CV scores', cv_scores)
print(f'CV mean: {cv_scores.mean()}, CV std: {cv_scores.std()}')

In [None]:
# Train on the 200 features with the highest importance.
_ranked_desc = np.argsort(importances)[::-1]
index = _ranked_desc[:200]
X_new = X[:, index]

cv_scores = cross_val_score(estimator=rf, X=X_new, y=y, scoring='roc_auc', cv=cv)
print('CV scores', cv_scores)
print(f'CV mean: {cv_scores.mean()}, CV std: {cv_scores.std()}')

In [None]:
# Train on the features whose importance is greater than 0.001.
index = np.flatnonzero(importances > 0.001)
X_new = X[:, index]

cv_scores = cross_val_score(estimator=rf, X=X_new, y=y, scoring='roc_auc', cv=cv)
print('CV scores', cv_scores)
print(f'CV mean: {cv_scores.mean()}, CV std: {cv_scores.std()}')

In [None]:
# Feature importance based on the out-of-bag (OOB) score.
# Same forest settings as before, with OOB scoring switched on.
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=28,
    min_samples_split=6,
    min_samples_leaf=8,
    criterion='log_loss',
    class_weight='balanced',
    oob_score=True,
    n_jobs=-1,
    random_state=17,
)

rf.fit(X, y)

# rf.oob_score_ is the model's baseline performance on the OOB samples.
print(f"OOB Score: {rf.oob_score_}")

def permutation_importance_oob(model, X_train, y_train, random_state=None):
    """Estimate feature importance from the drop in OOB score.

    For each feature, that column is shuffled, a copy of the model is refitted
    on the shuffled data, and the importance is the baseline OOB score minus
    the OOB score of the refitted copy. A larger drop means a more important
    feature.

    The model passed in is never refitted: all refits happen on a deep copy,
    so the caller's fitted estimator stays usable afterwards.

    Parameters:
        model        : fitted estimator exposing ``oob_score_`` and ``fit``
                       (here a random forest created with ``oob_score=True``)
        X_train      : 2-D NumPy array of training features
        y_train      : training labels
        random_state : optional seed for the shuffles; ``None`` (default)
                       gives non-reproducible shuffles

    Returns:
        1-D NumPy array with one importance value per column of ``X_train``.
    """
    import copy

    base_oob_score = model.oob_score_  # baseline OOB score of the fitted model
    n_features = X_train.shape[1]
    feature_importances = np.zeros(n_features)

    rng = np.random.default_rng(random_state)
    # Refit a private copy so the caller's model keeps its original fit.
    work_model = copy.deepcopy(model)
    # One working copy of the data; each column is permuted and then restored,
    # instead of copying the whole matrix once per feature.
    X_work = X_train.copy()

    for col in tqdm(range(n_features)):
        original_column = X_work[:, col].copy()
        X_work[:, col] = rng.permutation(original_column)

        # Refit on the data with this one feature shuffled and read its OOB score.
        work_model.fit(X_work, y_train)
        oob_score_permuted = work_model.oob_score_

        # The bigger the score drop, the more important the feature.
        feature_importances[col] = base_oob_score - oob_score_permuted

        # Restore the column before moving on to the next feature.
        X_work[:, col] = original_column

    return feature_importances

# Call the function to compute the OOB-based feature importances.
oob_importances = permutation_importance_oob(rf, X, y)

In [None]:
# Print the OOB-based importance of every feature, by column position.
for i, importance in enumerate(oob_importances):
    print(f"Feature {i}: OOB Importance {importance}")


# Keep the 150 features with the highest OOB-based importance and plot them.
# The table is built from `oob_importances`; it previously reused the
# impurity-based `importances`, so the saved figure repeated the earlier chart.
df_fi = pd.DataFrame({'Feature': df_X.columns, 'importance': oob_importances})
df_fi = df_fi.sort_values('importance', ascending=False).reset_index(drop=True)
df_fi = df_fi.iloc[:150]

plt.figure(figsize=(14, 28))
sns.barplot(x="importance", y="Feature", data=df_fi)
plt.title('Features importance')
plt.tight_layout()
plt.savefig('FI-OBB.png')

In [None]:
# Hyper-parameter tuning with an exhaustive grid search.
param_grid = dict(
    n_estimators=[10, 50, 100],
    max_depth=[None, 10, 15, 20, 30, 40],
    min_samples_split=[20, 25, 30, 35],
    min_samples_leaf=[20, 25, 30, 35, 40],
)

# Measure how long the search takes.
start_time = datetime.now()
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)
grid_search.fit(X, y)
end_time = datetime.now()

print('Training took: ', end_time - start_time)
print('Best params & Best score', grid_search.best_params_, grid_search.best_score_)

In [None]:
# Heatmap of the grid-search results: mean ROC AUC for every
# (min_samples_split, min_samples_leaf) pair, averaged over the other
# searched parameters.
results = pd.DataFrame(grid_search.cv_results_)

# Aggregate by parameter value rather than reshaping the flat score vector:
# the old reshape(6, 4, 5, 5) asked for 600 entries while the grid has
# 3 * 6 * 4 * 5 = 360 candidates, and it assumed an axis order that the
# result rows are not guaranteed to follow.
scores = results.pivot_table(
    index='param_min_samples_split',
    columns='param_min_samples_leaf',
    values='mean_test_score',
    aggfunc='mean',
)
# Put rows and columns in the same order as the tick labels below.
scores = scores.reindex(index=param_grid['min_samples_split'],
                        columns=param_grid['min_samples_leaf'])

plt.figure(figsize=(10, 5))
sns.heatmap(scores, annot=True, fmt='.4f', xticklabels=param_grid['min_samples_leaf'],
            yticklabels=param_grid['min_samples_split'])

plt.xlabel('min_samples_leaf')
plt.ylabel('min_samples_split')
plt.title('ROC_AUC score')
plt.tight_layout()
plt.savefig('GridSearch.png')

In [None]:
# Hyper-parameter tuning with a randomised search.
# The max_depth candidates are None plus 10 integers drawn from
# sp_randint(10, 20). The draw is seeded so the candidate list, and with it
# the whole search, is reproducible from run to run; it was unseeded before.
param_dist = {
    'n_estimators': sp_randint(50, 200),
    'max_depth': [None] + list(sp_randint(10, 20).rvs(10, random_state=17)),
    'min_samples_leaf': sp_randint(1, 5)
}

random_search = RandomizedSearchCV(rf, param_distributions=param_dist, n_iter=20,
                                   cv=cv, scoring='roc_auc', n_jobs=-1, random_state=17)
random_search.fit(X, y)

print(random_search.best_params_, random_search.best_score_)

In [None]:
# Heatmap of the random-search results: mean ROC AUC for every sampled
# (min_samples_leaf, max_depth) pair.
results = pd.DataFrame(random_search.cv_results_)

# The 20 sampled candidates do not form a full grid, so the old
# reshape(10, 5, 4) (200 entries) could not work. Aggregate by parameter
# value instead; pairs that were never sampled stay empty in the heatmap.
# max_depth is turned into a string label so the None setting gets its own
# column instead of being dropped as a missing key.
_depth_labels = results['param_max_depth'].map(
    lambda depth: 'None' if pd.isna(depth) else str(int(depth))
)
_leaf_values = results['param_min_samples_leaf'].astype(int)

scores = (
    results.assign(max_depth=_depth_labels, min_samples_leaf=_leaf_values)
    .pivot_table(index='min_samples_leaf', columns='max_depth',
                 values='mean_test_score', aggfunc='mean')
)

plt.figure(figsize=(10, 5))
# Tick labels come from the pivot table's own index and columns.
sns.heatmap(scores, annot=True, fmt='.4f')

plt.xlabel('max_depth')
plt.ylabel('min_samples_leaf')
plt.title('ROC_AUC score')
plt.tight_layout()
plt.savefig('RandomSearch.png')