# 模型训练

In [1]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import train_test_split

## Step1. 加载数据

In [2]:
# Load the preprocessed data sets.
PATH_TO_DATA = '../data/'  # the preprocessed CSV files are read from this directory


def _read_indexed_csv(file_name):
    """Read ``file_name`` from PATH_TO_DATA, using 'match_id_hash' as the index."""
    csv_path = os.path.join(PATH_TO_DATA, file_name)
    return pd.read_csv(csv_path, index_col='match_id_hash')


df_train_features = _read_indexed_csv('train_features.csv')
df_train_targets = _read_indexed_csv('train_targets.csv')
df_test_features = _read_indexed_csv('test_features.csv')

# Convert the frames to plain NumPy arrays; row and column labels are dropped.
X_train = df_train_features.to_numpy()
X_test = df_test_features.to_numpy()
# Only the 'radiant_win' column of the targets file is used as the label.
y_train = df_train_targets['radiant_win'].to_numpy()

In [3]:
# Sanity check: show the dimensions of the training matrix.
train_shape = X_train.shape
print('X_train.shape:', train_shape)

X_train.shape: (39675, 245)


## Step2. 模型参数调优

### 随机森林超参数简介

随机森林模型有以下超参数：

1. n_estimators: 森林中树的数量
   - int = 100
2. criterion: 分裂节点的标准
   - Literal['gini', 'entropy', 'log_loss'] = "gini"
3. max_depth: 树的最大深度
   - int | None = None
4. min_samples_split: 内部节点再划分所需的最小样本数
   - float | int = 2
5. min_samples_leaf: 叶子节点最少样本数
   - float | int = 1
6. min_weight_fraction_leaf: 叶子节点最小的样本权重和
   - float = 0.0
7. max_features: 划分时考虑的最大特征数
   - float | int | Literal['sqrt', 'log2'] = "sqrt"
8. max_leaf_nodes: 叶子节点最大数
   - int | None = None
9. min_impurity_decrease: 分裂节点的不纯度减少值
   - float = 0.0
10. bootstrap: 是否使用bootstrap样本
    - bool = True
11. oob_score: 是否使用袋外估计
    - bool = False
12. n_jobs: 并行处理的数量
    - int | None = None
13. random_state: 随机种子
    - int | RandomState | None = None
14. verbose: 控制输出
    - int = 0
15. warm_start: 是否热启动
    - bool = False
16. class_weight: 类别权重
    - Mapping | Sequence[Mapping] | Literal['balanced', 'balanced_subsample'] | None = None
17. ccp_alpha: 最小成本复杂度剪枝
    - float = 0.0
18. max_samples: 每棵树的最大样本数
    - float | int | None = None

In [None]:
""" Split the data into train and validation sets """
# ShuffleSplit only describes the splitting scheme (5 random splits, 30% held
# out in each); the actual splitting happens when it is passed as `cv` below.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=17)

# Baseline random forest: all CPU cores, fixed seed for reproducibility,
# log-loss split criterion and class weights balanced by label frequency.
rf = RandomForestClassifier(n_jobs=-1, random_state=17, criterion='log_loss', class_weight='balanced')

# Score the baseline on every split of `cv`. cross_val_score fits clones of
# `rf`, so `rf` itself is still unfitted after this call.
# NOTE(review): ROC-AUC is assumed to be the target metric because the
# submission is a probability column -- confirm and change `scoring` if not.
cv_scores = cross_val_score(rf, X_train, y_train, cv=cv, scoring='roc_auc')
print(f'CV ROC-AUC: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}')

# Fit on the full training set. Without this, the later
# rf.predict_proba(X_test) call raises NotFittedError.
rf.fit(X_train, y_train)

## Step3. 预测并打包数据

In [None]:
# predict_proba returns one column per class in rf.classes_ order; column 1 is
# taken as the probability of the positive class.
# NOTE(review): assumes 'radiant_win' is a binary 0/1 (or False/True) label -- confirm.
y_test_pred = rf.predict_proba(X_test)[:, 1]

# Keep the test-set index so every prediction stays attached to its match id.
df_submission = pd.DataFrame({'radiant_win_prob': y_test_pred},
                             index=df_test_features.index)

# Write the submission into the same directory the input CSVs were read from.
# The previous hard-coded "./data/" did not match PATH_TO_DATA ('../data/'),
# and to_csv does not create missing directories.
df_submission.to_csv(os.path.join(PATH_TO_DATA, 'submission.csv'))