In [None]:
import os
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
%matplotlib inline

In [None]:
# Folder that holds the input CSV files.
PATH_TO_DATA = './data/'

# Read the match table: 'match_id_hash' becomes the row index, while
# 'radiant_win' is the label column that later cells split off.
train_data_path = os.path.join(PATH_TO_DATA, 'Dota_data_v2.0.csv')
df_train = pd.read_csv(
    train_data_path,
    index_col='match_id_hash',
)

In [None]:
# Drop every column from 'total_teamfight_time' onward (currently disabled):
#df_train = df_train.iloc[:, 0:df_train.columns.get_loc('total_teamfight_time')]

In [None]:
# Partition the matches on the 'teamfights_number' column:
# rows where it equals 0 versus rows where it does not.
zero = df_train.loc[df_train['teamfights_number'].eq(0)]
non_zero = df_train.loc[df_train['teamfights_number'].ne(0)]

In [None]:
# Show (rows, columns) of each partition, one per line.
print(zero.shape, non_zero.shape, sep='\n')

In [None]:

def split_features_target(frame, target='radiant_win'):
    """Split a DataFrame into a feature matrix and its label column.

    The input frame is not modified, so the cell can be re-run without
    losing the label column (the previous in-place ``del`` made a second
    run fall into the 'No target' branch).

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the feature columns and, optionally, ``target``.
    target : str, default 'radiant_win'
        Name of the label column.

    Returns
    -------
    features : numpy.ndarray
        Values of every column of ``frame`` except ``target``.
    labels : pandas.Series or None
        The ``target`` column, or None (after printing 'No target') when
        ``frame`` has no such column.
    """
    if target not in frame:
        print('No target')
        return frame.to_numpy(), None
    labels = frame[target]
    features = frame.drop(columns=target)  # returns a new frame; `frame` keeps its columns
    return features.to_numpy(), labels


# Subset with teamfights_number == 0.
Zx, zero_y = split_features_target(zero)
Zy = None if zero_y is None else zero_y.to_numpy()

# Subset with teamfights_number != 0.
NZx, non_zero_y = split_features_target(non_zero)
NZy = None if non_zero_y is None else non_zero_y.to_numpy()

# Shapes of features and labels; prints None for the labels when the
# target column was missing instead of raising on `None.shape`.
print(NZx.shape, getattr(NZy, 'shape', None))
print(Zx.shape, getattr(Zy, 'shape', None))

In [None]:
# Search space for the Bayesian optimiser: only the tree depth is tuned.
# Dimensions kept here but disabled:
#   'n_estimators': Integer(10, 1000)
#   'max_depth': Integer(10, 100)
#   'min_samples_split': Integer(2, 10)
#   'min_samples_leaf': Integer(1, 10)
param_dist = {'max_depth': Integer(30, 50)}

# Cross-validation: stratified 5-fold with a seeded shuffle.
cv = StratifiedKFold(random_state=17, shuffle=True, n_splits=5)

# Random forest for the zero-teamfight subset (Zx, Zy).
# max_depth is not fixed here; it is supplied by the search below.
Zrf = RandomForestClassifier(
    n_estimators=200,
    criterion='log_loss',
    class_weight='balanced',
    max_features='log2',
    min_samples_leaf=8,
    min_samples_split=2,
    random_state=17,
    n_jobs=-1,
)

# Bayesian optimisation over param_dist, scored by ROC AUC.
Zopt = BayesSearchCV(
    estimator=Zrf,
    search_spaces=param_dist,
    scoring='roc_auc',
    cv=cv,
    n_iter=20,
    random_state=17,
    n_jobs=-1,
)
Zopt.fit(Zx, Zy)

In [None]:
# Best parameter setting found by Zopt and its best cross-validated score.
print(
    'Best params & Best score',
    Zopt.best_params_,
    Zopt.best_score_,
)

In [None]:
# Search space for the Bayesian optimiser: only the tree depth is tuned.
# Dimensions kept here but disabled:
#   'n_estimators': Integer(10, 1000)
#   'max_depth': Integer(10, 100)
#   'min_samples_split': Integer(2, 10)
#   'min_samples_leaf': Integer(1, 10)
# NOTE(review): if Integer bounds are inclusive (as the skopt docs state),
# this range has 11 values while n_iter below is 20, so some depths would be
# evaluated more than once - confirm whether that is intended.
param_dist = {'max_depth': Integer(35, 45)}

# Cross-validation: stratified 5-fold with a seeded shuffle.
cv = StratifiedKFold(random_state=17, shuffle=True, n_splits=5)

# Random forest for the non-zero-teamfight subset (NZx, NZy).
# max_depth=40 is only the constructor value; the search below sets
# max_depth from param_dist for every candidate it evaluates.
NZrf = RandomForestClassifier(
    n_estimators=200,
    max_depth=40,
    criterion='log_loss',
    class_weight='balanced',
    max_features='log2',
    min_samples_leaf=8,
    min_samples_split=2,
    random_state=17,
    n_jobs=-1,
)

# Bayesian optimisation over param_dist, scored by ROC AUC.
NZopt = BayesSearchCV(
    estimator=NZrf,
    search_spaces=param_dist,
    scoring='roc_auc',
    cv=cv,
    n_iter=20,
    random_state=17,
    n_jobs=-1,
)
NZopt.fit(NZx, NZy)

In [None]:

# Best parameter setting found by NZopt and its best cross-validated score.
print(
    'Best params & Best score',
    NZopt.best_params_,
    NZopt.best_score_,
)

In [None]:
# Hyper-parameter tuning with grid search on the zero-teamfight subset (Zx, Zy).

# Only one value is listed, so the "search" amounts to a single
# cross-validation run of min_samples_leaf=2.
param_grid = {
    'min_samples_leaf': [2],
}

# Build the splitter in this cell instead of reusing the `cv` object left
# behind by an earlier tuning cell, so this cell does not depend on hidden
# kernel state. Settings match the earlier cells: stratified 5-fold with a
# seeded shuffle.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

# Random forest; min_samples_leaf is omitted here because param_grid provides it.
Zrf = RandomForestClassifier(n_jobs=-1,
                            random_state=17,
                            criterion='log_loss',
                            class_weight='balanced',
                            max_depth=40,
                            min_samples_split=2,
                            max_features='log2',
                            n_estimators=2000
                            )

# Measure the wall-clock time of the search.
start_time = datetime.now()
grid_search = GridSearchCV(Zrf,
                           param_grid,
                           cv=cv,
                           scoring='roc_auc',
                           n_jobs=-1,
                           )
grid_search.fit(Zx, Zy)
end_time = datetime.now()

print('Training took: ', end_time - start_time)
print('Best params & Best score', grid_search.best_params_, grid_search.best_score_)