In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [16]:
# Single random seed reused by the CV splitter and by every RandomForestClassifier
# built in this notebook.
seed = 356_437_451

In [17]:
# Read the training table from the working directory.
train = pd.read_csv('data_train.csv')
# Features: every column from position 2 onwards.
X_ini = train.iloc[:,2:]
# Target: the column at position 1.
# NOTE(review): presumably position 0 is the 'ID' column that is read later via
# train['ID'] — confirm against the CSV layout.
y_ini = train.iloc[:,1]

In [18]:
# Stratified 5-fold cross-validation repeated 3 times (15 splits in total),
# seeded so the folds are the same on every run.
sp = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=seed,
)

In [19]:
import hyperopt
from hyperopt import hp

In [20]:
def objective(param):
    """Hyperopt objective: negated mean cross-validated ROC AUC of a random forest.

    Parameters
    ----------
    param : dict
        Must contain 'n_estimators', 'max_depth', 'min_samples_split' and
        'min_samples_leaf'; each value is forwarded unchanged to
        RandomForestClassifier.

    Returns
    -------
    float
        Minus the mean validation AUC over all splits yielded by ``sp``.
        The sign is flipped because hyperopt minimises the objective.
    """
    aucs = []
    for train_index,test_index in sp.split(X_ini,y_ini):
        X_train = X_ini.iloc[train_index,:]
        X_vali = X_ini.iloc[test_index,:]
        # sp.split yields positional indices, so the labels must be selected
        # with .iloc as well. Plain y_ini[...] is label-based and only agrees
        # with .iloc when the index is a default 0..n-1 RangeIndex.
        y_train = y_ini.iloc[train_index]
        y_vali = y_ini.iloc[test_index]
        model = RandomForestClassifier(random_state=seed,
                                       n_estimators=param['n_estimators'],
                                       max_depth=param['max_depth'],
                                       min_samples_split=param['min_samples_split'],
                                       min_samples_leaf=param['min_samples_leaf'])
        model.fit(X_train,y_train)
        # Column 1 of predict_proba is the probability of the second class in model.classes_.
        pro_vali = model.predict_proba(X_vali)[:,1]
        auc_vali = roc_auc_score(y_vali,pro_vali)
        aucs.append(auc_vali)
    return -np.mean(aucs)

In [21]:
# Hyper-parameter search ranges; adjust them to suit the dataset.
# Every dimension is an hp.choice over a range of integers, so hyperopt reports
# the *index* of the chosen option rather than the value itself.
space = {
    name: hp.choice(name, options)
    for name, options in (
        ('n_estimators', range(2,50)),
        ('max_depth', range(1,3)),
        ('min_samples_split', range(2,50)),
        ('min_samples_leaf', range(2,50)),
    )
}

In [22]:
# Run 100 TPE trials minimising `objective` over `space`.
# rstate seeds hyperopt's own sampler; without it every run explores a different
# sequence of candidates and the reported optimum is not reproducible.
# NOTE: hyperopt >= 0.2.7 expects a numpy Generator here; older releases expect
# np.random.RandomState(seed) instead.
best_param = hyperopt.fmin(
    fn=objective,
    space=space,
    algo=hyperopt.tpe.suggest,
    max_evals=100,
    rstate=np.random.default_rng(seed),
)

100%|████████████████████████████████████████████████████████████| 100/100 [00:47<00:00,  2.11trial/s, best loss: -1.0]


In [23]:
# Show the optimum returned by hyperopt.fmin.
# For hp.choice dimensions each entry is the index of the selected option, not
# the option value, so it has to be mapped back through the search space
# before being passed to the model.
best_param

{'max_depth': np.int64(1),
 'min_samples_leaf': np.int64(14),
 'min_samples_split': np.int64(42),
 'n_estimators': np.int64(9)}

In [24]:
# fmin reports hp.choice results as option indices. space_eval maps them back to
# the actual hyper-parameter values using the search space itself, so no range()
# literal has to be duplicated here and kept in sync with `space` by hand.
best_values = hyperopt.space_eval(space, best_param)
model = RandomForestClassifier(random_state=seed,
                               n_estimators=best_values['n_estimators'],
                               max_depth=best_values['max_depth'],
                               min_samples_split=best_values['min_samples_split'],
                               min_samples_leaf=best_values['min_samples_leaf'])
# Refit on the complete training set with the selected hyper-parameters.
model.fit(X_ini,y_ini)

In [25]:
# Probability of the second class in model.classes_ for every training row.
pro_train = model.predict_proba(X_ini)[:, 1]
# AUC on the same rows the model was fitted on (resubstitution estimate).
auc_train = roc_auc_score(y_ini, pro_train)
print('训练集AUC={:.3f}'.format(auc_train))

训练集AUC=1.000


In [26]:
# Put the training-set predictions next to each row's ID and true label,
# then write them to RF_train.csv without the DataFrame index.
df_train = pd.DataFrame({
    'ID':train['ID'],
    'True':y_ini,
    'Pre':pro_train
})
df_train.to_csv('RF_train.csv',index=False)

In [27]:
# Score the fitted model on each external test file and export the predictions.
test_files = [
    'data_test1.csv', 'data_test2.csv', 'data_test3.csv', 'data_test4.csv',
    'data_test5.csv', 'data_test6.csv', 'data_test7.csv', 'data_test8.csv',
    'data_test9.csv', 'data_test10.csv', 'data_test11.csv', 'data_test12.csv',
]
for test_file in test_files:
    test = pd.read_csv(test_file)
    # Same positional layout as the training table: label at position 1,
    # features from position 2 onwards.
    y_test = test.iloc[:, 1]
    X_test = test.iloc[:, 2:]

    # Probability of the second class in model.classes_.
    pro_test = model.predict_proba(X_test)[:, 1]
    print(f'{test_file} 测试集AUC={roc_auc_score(y_test, pro_test):.3f}')

    # One prediction file per test set. The output name embeds the full input
    # file name, including its '.csv' suffix.
    df_test = pd.DataFrame({'ID': test['ID'], 'True': y_test, 'Pre': pro_test})
    df_test.to_csv(f'RF_{test_file}_predictions.csv', index=False)

data_test1.csv 测试集AUC=0.170
data_test2.csv 测试集AUC=1.000
data_test3.csv 测试集AUC=1.000
data_test4.csv 测试集AUC=1.000
data_test5.csv 测试集AUC=1.000
data_test6.csv 测试集AUC=1.000
data_test7.csv 测试集AUC=1.000
data_test8.csv 测试集AUC=1.000
data_test9.csv 测试集AUC=1.000
data_test10.csv 测试集AUC=1.000
data_test11.csv 测试集AUC=1.000
data_test12.csv 测试集AUC=1.000


In [28]:
import os

import joblib
#save model
# joblib.dump does not create missing parent directories, so make sure the
# target folder exists; otherwise this cell fails on a fresh checkout.
os.makedirs('saved_model', exist_ok=True)
joblib.dump(model, 'saved_model/RF.pkl')

['saved_model/RF.pkl']