In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns

In [None]:
# Load the dataset from disk; the first CSV column becomes the row index.
df = pd.read_csv(
    './data/gerd.csv',
    index_col=0,
)

In [None]:
# Build the feature matrix X and the target vector y.
# The columns listed in `drop` are excluded from the features; 'label' is
# excluded because it is the target (see `y` below).
# The textual placeholders are then mapped to numbers, one substitution at a
# time and in this exact order.
# NOTE(review): '1.#J' looks like a truncated rendering of a non-finite float;
# confirm that mapping it to 1.0 is really intended.
X = (
    df.drop(columns=[
        'Name', 'gender', 'age', 'birthday', '检查时间', '主诉', '诊断结果',
        'LES上缘位置cm', 'LES位置(距鼻孔)cm', 'LES下缘位置cm',
        'UES静息压mmHg', 'UES残余压mmHg', '大缺损收缩(次)',
        'UES上缘位置cm', '小缺损收缩(次)', 'UES下缘位置cm',
        'IEC', 'UES长度cm', 'MRS-DCI',
        'UES位置(距鼻孔)cm', 'label',
    ])
    .replace('YES', 1)
    .replace('NO', 0)
    .replace('1.#J', 1.0)
    .replace('--', 0)
    .replace('-', 0)
)
y = df['label']

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on the label
# and seeded so that it is reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=123,
    stratify=y,
)

In [None]:
# Build four alternative feature subsets (PCCs, RFE, LASSO, expert).
# Each subset is sliced out of both the training and the test split using one
# shared column list / mask, so the two slices can never drift apart.
from sklearn.datasets import make_friedman1  # not used in this cell; kept in case other cells rely on it
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR  # not used in this cell; kept in case other cells rely on it
from sklearn.ensemble import RandomForestClassifier

# 1) "PCCs" subset: fixed list of columns.
PCCs_col = ['LES-IRP','LESP','PUT','DMS','TRE','PTT','LRE','TRAC','LES-length','HH']
X_pccs = X_train.loc[:, PCCs_col]
pccs_test = X_test.loc[:, PCCs_col]

# 2) RFE subset: recursive feature elimination with 5-fold CV, driven by a
#    random forest and fitted on the training split only. The resulting boolean
#    mask (`selector.support_`) is applied to both splits.
estimator = RandomForestClassifier(oob_score=True, random_state=123, bootstrap=True)
selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(X_train, y_train)
X_rfe = X_train.loc[:, selector.support_]
rfe_test = X_test.loc[:, selector.support_]

# 3) "LASSO" subset: fixed list of columns.
lasso_col = ['TRAC','RE-L-Ac','RE-L-Ak','RE-M-Wa','LESP','LES-length','IBP','HH']
X_lasso = X_train.loc[:, lasso_col]
lasso_test = X_test.loc[:, lasso_col]

# 4) "Expert" subset: fixed list of columns.
expert_col = ['LESP','LES-IRP','LES-length','HH','PTT','DMS']
X_expert = X_train.loc[:, expert_col]
expert_test = X_test.loc[:, expert_col]

In [None]:
# Tune one random-forest classifier per feature subset (PCCs, RFE, LASSO,
# expert) with an exhaustive, 10-fold cross-validated grid search.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV  # not used in this cell; kept in case other cells rely on it

# One shared base estimator and one shared grid for all four searches.
# GridSearchCV clones the estimator before fitting, so `rf_clf` itself is never
# fitted and the four searches stay independent of each other.
rf_clf = RandomForestClassifier(oob_score=True, random_state=321, bootstrap=True)

# The grid has 11 * 9 * 7 * 7 * 7 * 3 = 101,871 combinations; with cv=10 that is
# over one million forest fits per feature subset.
# NOTE(review): integer `max_samples` values are per-tree sample counts; confirm
# every CV training fold has at least 140 rows.
# NOTE(review): criterion='log_loss' needs a scikit-learn version that accepts it.
params_rf = {
    'n_estimators': [10, 20, 30, 50, 70, 100, 125, 150, 175, 200, 250],
    'max_samples': [40, 50, 60, 70, 80, 90, 100, 120, 140],
    'max_depth': list(range(3, 10)),
    'min_samples_split': list(range(3, 10)),
    'min_samples_leaf': list(range(3, 10)),
    'criterion': ['gini', 'entropy', 'log_loss'],
}


def _fit_rf_grid(features, labels):
    """Run the shared random-forest grid search on one feature subset.

    Parameters
    ----------
    features : training feature matrix for one subset (e.g. ``X_pccs``).
    labels : training labels aligned with ``features``.

    Returns
    -------
    GridSearchCV
        The fitted search object, built from ``rf_clf`` and ``params_rf``
        with ``n_jobs=-1`` and ``cv=10``.
    """
    search = GridSearchCV(
        estimator=rf_clf,
        param_grid=params_rf,
        n_jobs=-1,
        cv=10,
    )
    search.fit(features, labels)
    return search


grid_pccs = _fit_rf_grid(X_pccs, y_train)
grid_rfe = _fit_rf_grid(X_rfe, y_train)
grid_lasso = _fit_rf_grid(X_lasso, y_train)
# The original cell fitted on `X_export`, which is never defined; the expert
# training subset is `X_expert`. The result name `grid_export` is kept as-is so
# that any code referring to it keeps working.
grid_export = _fit_rf_grid(X_expert, y_train)
grid_export

In [None]:
# Fuse the four models' predicted probabilities with fixed linear weights.
# The coefficients sum to about 1.3476, i.e. they are not normalised to 1.
# NOTE(review): `export`, `lasso`, `pccs` and `ref` are not defined in the cells
# shown here; presumably they hold the class-probability outputs of the expert,
# LASSO, PCCs and RFE models. Confirm where they are computed.
pro = (
    1.26782345 * export
    + 0.69267517 * lasso
    - 0.66314059 * pccs
    + 0.0502186 * ref
) / 1

In [None]:
# Turn the fused scores into hard labels: 0 when column 0 is strictly larger
# than column 1, otherwise 1 (so a tie yields 1).
pre_label = [
    0 if pro[i, 0] > pro[i, 1] else 1
    for i in range(pro.shape[0])
]