In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

In [30]:
# Single random seed for the notebook; passed as random_state to the CV splitter.
seed = 3_563_451

In [31]:
# Read the training table and split it by column position:
# column index 1 is used as the target, every column from index 2 onward as a feature.
train = pd.read_csv('data_train.csv')
y_ini = train.iloc[:, 1]
X_ini = train.iloc[:, 2:]

In [32]:
# Stratified 5-fold cross-validation repeated 3 times (15 splits), seeded so the folds are repeatable.
sp = RepeatedStratifiedKFold(
    n_splits=5,
    n_repeats=3,
    random_state=seed,
)

In [33]:
import hyperopt
from hyperopt import hp

In [34]:
def objective(param):
    """Hyperopt loss for KNN: negated mean validation AUC over the CV splits.

    Parameters
    ----------
    param : dict
        Sampled hyperparameters; only ``param['n_neighbors']`` is read.

    Returns
    -------
    float
        ``-mean(AUC)`` over every split yielded by ``sp``. The sign is
        flipped because ``fmin`` minimizes its objective.
    """
    aucs = []
    for train_index, test_index in sp.split(X_ini, y_ini):
        # split() yields positional indices, so X and y must both be sliced
        # with .iloc. Plain y_ini[...] is a label lookup and is only correct
        # while the Series happens to carry a default RangeIndex.
        X_train = X_ini.iloc[train_index, :]
        X_vali = X_ini.iloc[test_index, :]
        y_train = y_ini.iloc[train_index]
        y_vali = y_ini.iloc[test_index]
        model = KNeighborsClassifier(n_neighbors=param['n_neighbors'])
        model.fit(X_train, y_train)
        # Column 1 is the probability of the second class in model.classes_.
        pro_vali = model.predict_proba(X_vali)[:, 1]
        aucs.append(roc_auc_score(y_vali, pro_vali))
    return -np.mean(aucs)

In [35]:
# Hyperparameter search space; adjust the range to suit the dataset.
# hp.choice picks one of the candidate n_neighbors values 2..29.
space = dict(
    n_neighbors=hp.choice('n_neighbors', range(2,30)),
)

In [36]:
# Minimize the CV loss with TPE over 100 trials. rstate pins hyperopt's
# sampler to the notebook seed so the trial sequence (and best_param) is
# reproducible across reruns; without it every run searches differently.
# NOTE: hyperopt >= 0.2.7 expects a numpy Generator here; older releases
# took np.random.RandomState(seed) instead.
best_param = hyperopt.fmin(
    objective,
    space,
    hyperopt.tpe.suggest,
    max_evals=100,
    rstate=np.random.default_rng(seed),
)

100%|█████████████████████████████████████████████| 100/100 [00:05<00:00, 16.85trial/s, best loss: -0.9986187455954898]


In [37]:
# For an hp.choice dimension, fmin reports the index of the winning option, not the option's value.
best_param

{'n_neighbors': np.int64(3)}

In [38]:
# fmin returns the *index* of the chosen hp.choice option. space_eval maps it
# back to the actual value through the search space itself, so there is no
# second hard-coded range that has to be kept in sync with `space`.
best_n_neighbors = int(hyperopt.space_eval(space, best_param)['n_neighbors'])
model = KNeighborsClassifier(n_neighbors=best_n_neighbors)
# Refit on the full training set with the selected neighbourhood size.
model.fit(X_ini,y_ini)

In [39]:
# Resubstitution score: the model is evaluated on the same rows it was fit on,
# so this AUC is an optimistic figure rather than a generalization estimate.
pro_train = model.predict_proba(X_ini)[:, 1]
auc_train = roc_auc_score(y_ini, pro_train)
print('训练集AUC={:.3f}'.format(auc_train))

训练集AUC=1.000


In [40]:
# Persist per-sample training predictions: identifier, true label, predicted probability.
train_columns = {
    'ID': train['ID'],
    'True': y_ini,
    'Pre': pro_train,
}
df_train = pd.DataFrame(train_columns)
df_train.to_csv('KNN_train.csv', index=False)

In [41]:
# Score each test file with the fitted model and save its predictions.
# Each file is read with the same positional layout as the training table:
# an 'ID' column, the label at column index 1, features from column index 2 onward.
# NOTE(review): the AUCs recorded below are far lower than the cross-validated
# estimate; features are fed to KNN unscaled and no train/test comparability
# check is done here — worth confirming before relying on this model.
test_files = ['data_test1.csv', 'data_test2.csv', 'data_test3.csv']
for test_file in test_files:
    test = pd.read_csv(test_file)
    y_test = test.iloc[:, 1]
    X_test = test.iloc[:, 2:]

    # Column 1 is the probability of the second class in model.classes_.
    pro_test = model.predict_proba(X_test)[:,1]

    print(f'{test_file} 测试集AUC={roc_auc_score(y_test, pro_test):.3f}')

    # One predictions CSV per test file. The input name is embedded verbatim,
    # so the output looks like 'KNN_data_test1.csv_predictions.csv'.
    test_columns = {
        'ID': test['ID'],
        'True': y_test,
        'Pre': pro_test,
    }
    df_test = pd.DataFrame(test_columns)
    df_test.to_csv(f'KNN_{test_file}_predictions.csv', index=False)

data_test1.csv 测试集AUC=0.437
data_test2.csv 测试集AUC=0.643
data_test3.csv 测试集AUC=0.448


In [42]:
import os
import joblib

# Save the fitted model. joblib.dump does not create missing parent
# directories, so make sure the target folder exists before writing.
os.makedirs('saved_model', exist_ok=True)
joblib.dump(model, 'saved_model/KNN.pkl')

['saved_model/KNN.pkl']