In [None]:
import os #to access files
import pandas as pd #to work with dataframes
import numpy as np #just a tradition
from sklearn.model_selection import StratifiedKFold,train_test_split #for cross-validation
from sklearn.metrics import roc_auc_score #this is we are trying to increase
import matplotlib.pyplot as plt #we will plot something at the end)
import seaborn as sns #same reason
import lightgbm as lgb #the model we gonna use
import optuna #超参数调优库

In [None]:
PATH_TO_DATA = './data/'

# Locations of the training and test tables inside the data directory.
train_data_path = os.path.join(PATH_TO_DATA, 'Dota_data_v1.0.csv')
test_data_path = os.path.join(PATH_TO_DATA, 'test_data_v1.1.csv')

# Both tables are indexed by 'match_id_hash'; the 'radiant_win' column they
# carry is split off as the label in later cells.
df_train = pd.read_csv(train_data_path, index_col='match_id_hash')
df_test = pd.read_csv(test_data_path, index_col='match_id_hash')

# Drop the redundant 'game_time.1' column from the training table.
# NOTE(review): looks like pandas' auto-renamed duplicate of 'game_time' - confirm.
df_train = df_train.drop(columns=['game_time.1'])

# Separate feature table, also keyed by 'match_id_hash'.
df_test_features = pd.read_csv(
    os.path.join(PATH_TO_DATA, 'test_features.csv'),
    index_col='match_id_hash',
)


In [None]:
# Report the (rows, columns) shape of the training table, then the test table.
for frame in (df_train, df_test):
    print(frame.shape)

训练集数据

In [None]:
# Split the training table into a label vector and a feature matrix.
#
# The label column is read without deleting it from `df_train`, so this cell
# produces the same result every time it is run (deleting the column made a
# second run take the "No target" path and crash).
if 'radiant_win' in df_train:
    df_y_train = df_train['radiant_win']
    y_train = df_y_train.to_numpy()
else:
    print('No target')
    df_y_train = None
    # Nothing to convert: calling .to_numpy() on None raised AttributeError.
    y_train = None

# Feature frame = every column except the label (ignored if it is absent).
df_X_train = df_train.drop(columns=['radiant_win'], errors='ignore')

# Keep only the columns located before 'total_teamfight_time'; that column
# and everything after it are discarded.
df_X_train = df_X_train.iloc[:, :df_X_train.columns.get_loc('total_teamfight_time')]
X_train = df_X_train.to_numpy()

测试集数据

In [None]:
# Split the test table into a label vector and a feature matrix.
#
# The label column is read without deleting it from `df_test`, so this cell
# produces the same result every time it is run (deleting the column made a
# second run take the "No target" path and crash).
if 'radiant_win' in df_test:
    df_y_test = df_test['radiant_win']
    y_test = df_y_test.to_numpy()
else:
    print('No target')
    df_y_test = None
    # Nothing to convert: calling .to_numpy() on None raised AttributeError.
    # NOTE(review): later cells score predictions against y_test, so they
    # still need a labelled test table to run.
    y_test = None

# Feature frame = every column except the label (ignored if it is absent).
df_X_test = df_test.drop(columns=['radiant_win'], errors='ignore')

# Keep only the columns located before 'total_teamfight_time'; that column
# and everything after it are discarded.
df_X_test = df_X_test.iloc[:, :df_X_test.columns.get_loc('total_teamfight_time')]
X_test = df_X_test.to_numpy()

In [None]:
# Shapes of the feature matrices and label vectors.
print(X_test.shape)
print(X_train.shape)
print(y_test.shape)
print(y_train.shape)

# Missing-label check. pd.isna is used instead of np.isnan because np.isnan
# raises TypeError on object-dtype arrays, which is what a boolean label
# column containing missing values turns into.
missing_train = pd.isna(y_train)
missing_test = pd.isna(y_test)

# Label arrays with every missing entry replaced by the string 'NaN'.
nan_positions_train = np.where(missing_train, 'NaN', y_train)
print(nan_positions_train)
nan_positions = np.where(missing_test, 'NaN', y_test)
print(nan_positions)

# Long arrays are abbreviated when printed, so also report explicit counts.
print('Missing labels - train: {}, test: {}'.format(int(missing_train.sum()),
                                                    int(missing_test.sum())))

In [None]:
# No train_test_split is performed here: the training and evaluation arrays
# come from the two separately loaded tables prepared in the cells above.

# Wrap the arrays in LightGBM Dataset objects; the evaluation set is created
# with the training set as its reference.
train_data = lgb.Dataset(data=X_train, label=y_train)
test_data = lgb.Dataset(data=X_test, label=y_test, reference=train_data)

使用超参数调优库 Optuna 对 LightGBM 进行调参：

In [None]:

# 定义目标函数
def objective(trial):
    """Optuna objective: train one LightGBM model and return its ROC AUC.

    Hyper-parameters are sampled from `trial`, a booster is trained on the
    module-level `train_data` with early stopping (30 rounds) monitored on
    `test_data`, and the ROC AUC of its predictions on `X_test` / `y_test`
    is returned so the study can maximise it.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        ROC AUC of the trained booster on (`X_test`, `y_test`).

    Notes
    -----
    NOTE(review): the same test split drives early stopping, hyper-parameter
    selection and the final reported score, so that score is optimistic.
    Consider a separate validation split.
    """
    params = {
        'boosting_type': 'gbdt',
        'objective': 'binary',
        'metric': 'auc',
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'max_depth': trial.suggest_int('max_depth', -1, 10),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 1.0),
        # LightGBM only applies bagging_fraction when bagging_freq is non-zero;
        # without it the bagging_fraction dimension of the search had no effect.
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'verbosity': -1,  # silence LightGBM's training log
    }

    # The study parameter keeps its original name ('num_round') so code reading
    # study.best_trial.params still finds it, but the value is passed explicitly
    # instead of through a params alias.
    num_boost_round = trial.suggest_int('num_round', 200, 800)

    # Train with early stopping monitored on the evaluation set.
    bst = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[test_data],
        callbacks=[lgb.early_stopping(30)],
    )

    # Predict with the best iteration found by early stopping and score it.
    y_pred_prob = bst.predict(X_test, num_iteration=bst.best_iteration)
    return roc_auc_score(y_test, y_pred_prob)

# Create the Optuna study; the objective's return value (ROC AUC) is maximised.
# The sampler is seeded so a re-run proposes the same sequence of trials.
study = optuna.create_study(direction='maximize',
                            sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=200)

# Report the best trial.
print("Best trial: score {},\nparams {}".format(study.best_trial.value, study.best_trial.params))

# Build the parameter set for the final model. `best_trial.params` contains
# only the values sampled through trial.suggest_*, so the fixed settings used
# inside `objective` must be added back; otherwise LightGBM would train with
# its default objective and metric rather than the tuned binary/AUC setup.
best_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'verbosity': -1,  # silence LightGBM's training log
    **study.best_trial.params,
}

# Number of boosting rounds chosen by the study (100 only if it was not tuned).
# It is passed explicitly so it cannot conflict with a params alias.
num_round = best_params.pop('num_round', 100)

# Train the final model with early stopping monitored on the evaluation set.
bst = lgb.train(best_params, train_data, num_boost_round=num_round,
                valid_sets=[test_data], callbacks=[lgb.early_stopping(30)])

# Predict on the test features with the best iteration.
y_pred_prob = bst.predict(X_test, num_iteration=bst.best_iteration)

# Compute the ROC AUC score.
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f'ROC AUC Score with best params: {roc_auc}')

LightGBM:

In [None]:
# LightGBM parameters.
#
# Experiment log kept from earlier runs ("score" as noted by the author; the
# metric is not stated - presumably ROC AUC, confirm):
#   score=0.769  learning_rate=0.01, num_leaves=31, max_depth=-1,
#                min_data_in_leaf=20, feature_fraction=0.9,
#                bagging_fraction=0.8, verbosity=1
#   score=0.812  learning_rate=0.023508897569229955, num_leaves=82,
#                max_depth=-1, feature_fraction=0.5206648048699497,
#                bagging_fraction=0.6432060922386999, num_round=499
#   score=0.814  the configuration below
#
# NOTE(review): per the LightGBM docs bagging_fraction only takes effect when
# bagging_freq is also set; it is not set here - confirm whether intended.
params = dict(
    boosting_type='gbdt',   # gradient-boosted decision trees
    objective='binary',     # binary classification
    metric='auc',           # evaluation metric
    learning_rate=0.0531121462757666,
    num_leaves=67,
    max_depth=6,
    feature_fraction=0.5208432697923645,
    bagging_fraction=0.8069692225513229,
    num_round=664,
)

# Train, stopping early after 30 rounds without improvement on the evaluation set.
early_stop = lgb.early_stopping(30)
bst = lgb.train(params, train_data, valid_sets=[test_data], callbacks=[early_stop])

# Predict on the test features using the best iteration.
y_pred_prob = bst.predict(X_test, num_iteration=bst.best_iteration)

In [None]:
# Score the predicted probabilities against the true test labels (ROC AUC).
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred_prob)
print(f'ROC AUC Score: {roc_auc}')

打包数据

In [None]:
# Build the submission table. y_pred_prob was predicted from X_test, which is
# df_X_test converted to an array, so df_X_test.index is the index guaranteed
# to line up row-for-row with the predictions. Indexing with a different table
# either fails on a length mismatch or silently attaches predictions to the
# wrong matches.
df_submission = pd.DataFrame({'radiant_win_prob': y_pred_prob},
                             index=df_X_test.index)

# Write next to the input data, using the configured data directory.
df_submission.to_csv(os.path.join(PATH_TO_DATA, 'submission.csv'))