# 模型训练

In [4]:
import os
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.model_selection import cross_val_score

## Step1. 加载数据

In [None]:
# Load the preprocessed data.
PATH_TO_DATA = './data/'  # the preprocessed files live in a directory next to this notebook


def _load_indexed_csv(filename):
    """Read `filename` from PATH_TO_DATA, using `match_id_hash` as the row index."""
    return pd.read_csv(os.path.join(PATH_TO_DATA, filename),
                       index_col='match_id_hash')


df_train_features = _load_indexed_csv('my_train_features.csv')
df_train_targets = _load_indexed_csv('my_train_targets.csv')
df_test_features = _load_indexed_csv('my_test_features.csv')

# Convert the frames to plain NumPy arrays; the axis labels are dropped here,
# so the original DataFrames are kept around for their indexes.
X_train = df_train_features.to_numpy()
y_train = df_train_targets['radiant_win'].to_numpy()  # only the target column is needed
X_test = df_test_features.to_numpy()

## Step2. 使用交叉验证的方法调节模型参数

In [None]:
""" Split the data into train and validation sets """
# Validation scheme: 5 random shuffles, each holding out 30% of the rows.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=17)

# Random forest hyperparameters, gathered in one place so they are easy to tune.
# TODO: consider whether the model needs regularization.
rf_params = dict(
    n_estimators=100,
    n_jobs=4,
    max_depth=None,
    min_samples_leaf=3,
    random_state=17,
)
rf = RandomForestClassifier(**rf_params)

# Evaluate the forest on every split with ROC AUC as the metric.
cv_scores = cross_val_score(
    rf,
    X_train,
    y_train,
    cv=cv,
    scoring='roc_auc',
    n_jobs=-1,
)

## Step3. 预测并打包数据

In [None]:
# cross_val_score only fits internal clones of `rf`, so `rf` itself is still
# unfitted at this point. Train it on the full training set first; otherwise
# predict_proba raises NotFittedError.
rf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the second class in
# rf.classes_ (presumably "radiant wins", assuming a 0/1 or False/True target).
y_test_pred = rf.predict_proba(X_test)[:, 1]

# Re-attach the match ids (dropped by to_numpy) as the index of the submission.
df_submission = pd.DataFrame({'radiant_win_prob': y_test_pred},
                             index=df_test_features.index)

df_submission.to_csv("./data/submission.csv")