In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
# Load the dataset from the CSV file sitting next to the notebook.
df = pd.read_csv(
    'gerd.csv',
    index_col=0,  # the first CSV column becomes the row index
)

In [None]:
# Build the feature matrix X and the target y.
# X is every column of df except the ones dropped below (which include 'label');
# the textual placeholders left in the features are then recoded to numbers.
X = (
    df.drop(columns=[
        'Name', 'gender', 'LES上缘位置cm', 'LES位置(距鼻孔)cm', 'LES下缘位置cm',
        'UES静息压mmHg', 'UES残余压mmHg', '大缺损收缩(次)', 'UES上缘位置cm',
        '小缺损收缩(次)', 'UES下缘位置cm', 'UES长度cm', 'UES位置(距鼻孔)cm',
        'age', 'birthday', '检查时间', '主诉', '诊断结果', 'PIP', 'label',
    ])
    .replace('YES', 1)
    .replace('NO', 0)
    # NOTE(review): '1.#J' looks like a numeric print artefact; confirm that 1.0
    # is the intended substitute.
    .replace('1.#J', 1.0)
    # Dash placeholders are recoded as 0.
    .replace('--', 0)
    .replace('-', 0)
)
y = df['label']

In [None]:
# Feature selection
# Recursive feature elimination with 5-fold cross-validation (RFECV), driven by a
# random forest; X is then reduced to the columns the selector keeps.
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

estimator = RandomForestClassifier(oob_score=True, random_state=123, bootstrap=True)

# The original cell fitted on X_train / y_train, which are only created by the
# "dataset split" cell further down, so it raised NameError on a fresh kernel.
# Derive the training rows here instead, with the same arguments as that later
# split (train_size=0.8, random_state=123, stratify=y). The row partition should
# therefore match the later one, and the held-out rows stay out of feature selection.
X_fs_train, X_fs_holdout, y_fs_train, y_fs_holdout = train_test_split(
    X, y, train_size=0.8, random_state=123, stratify=y
)

selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(X_fs_train, y_fs_train)

# selector.support_ is the boolean keep-mask; selector.ranking_ holds the
# per-feature rank. Inspect them in a separate cell if needed.
X = X.loc[:, selector.support_]

In [None]:
# Train/test partition: 80% of the rows go to training, the split is stratified
# on y, and the seed is fixed so the same partition is produced on every run.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=123,
    stratify=y,
)

In [None]:
# Standardise the features. The scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions.
from sklearn.preprocessing import StandardScaler
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
standard_scaler = StandardScaler().fit(X_train)
# From here on X_train / X_test hold the scaler's transformed output rather than
# the original frames.
X_train = standard_scaler.transform(X_train)
X_test = standard_scaler.transform(X_test)

In [None]:
# Modeling with different models
'''KNN'''
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Exhaustive 10-fold search over neighbourhood size (1..49), vote weighting and
# the Minkowski power p (1..6).
params = dict(
    n_neighbors=list(range(1, 50)),
    weights=['uniform', 'distance'],
    p=list(range(1, 7)),
)
grid_knn = GridSearchCV(
    KNeighborsClassifier(),
    params,
    cv=10,
    n_jobs=-1,  # use every available core
)
grid_knn.fit(X_train, y_train)

'''LR'''
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
# Polynomial feature expansion followed by logistic regression; the step names
# are the prefixes used in the search grid below.
pipeline_lr = Pipeline(steps=[
    ('PolynomialFeatures', PolynomialFeatures()),
    ('LR', LogisticRegression()),
])
# 10-fold grid over the regularisation strength C, the penalty type and the
# polynomial degree (1 or 2); solver and iteration cap are held fixed.
params_lr = {
    'LR__C': [0.0001, 0.001, 0.01, 0.1, 2, 3, 5, 10, 15, 20,
              25, 30, 40, 50, 60, 70, 80, 90, 100, 1000],
    'LR__penalty': ['l2', 'l1'],
    'LR__solver': ['liblinear'],
    'LR__max_iter': [10000],
    'PolynomialFeatures__degree': list(range(1, 3)),
}
grid_lr = GridSearchCV(pipeline_lr, params_lr, cv=10, n_jobs=-1)
grid_lr.fit(X_train, y_train)


'''SVM'''
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
# Polynomial feature expansion followed by an SVC that is built with
# probability=True.
pipeline_svc = Pipeline(steps=[
    ('PolynomialFeatures', PolynomialFeatures()),
    ('SVC', SVC(probability=True)),
])
# 10-fold grid over C, the RBF gamma and the polynomial degree (1 or 2).
params_svm = {
    'SVC__C': [0.01, 0.1, 2, 3, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 90, 100],
    'SVC__gamma': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
    'SVC__kernel': ['rbf'],
    'PolynomialFeatures__degree': list(range(1, 3)),
}
grid_svm = GridSearchCV(pipeline_svc, params_svm, cv=10, n_jobs=-1)
grid_svm.fit(X_train, y_train)


'''Voting'''
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import RandomizedSearchCV
# DecisionTreeClassifier was used below without ever being imported (NameError).
from sklearn.tree import DecisionTreeClassifier

# Soft-voting ensemble of four base classifiers. The names given in `estimators`
# ('knn', 'lr', 'dt', 'svm') are the prefixes used in the search space below.
clf = [
    KNeighborsClassifier(),
    LogisticRegression(),
    DecisionTreeClassifier(),
    SVC(probability=True)
]
vclf = VotingClassifier(
    estimators = [
        ('knn', clf[0]),
        ('lr', clf[1]),
        ('dt', clf[2]),
        ('svm',clf[3])
    ],
    voting = 'soft'
)
# Search space for the ensemble members.
# 'lr__multi_class' was removed: 'multinomial' is not supported by the liblinear
# solver (the only solver searched here), so those candidates failed to fit.
params = {
    'knn__n_neighbors' : list(range(1, 50)),
    'knn__weights' : ['uniform', 'distance'],
    'knn__p': list(range(1, 7)),
    'lr__C': [0.0001,0.001,0.01,0.1,2,3,5,10,15,20,25,30,40,50,60,70,80,90,100,1000],
    'lr__penalty': ['l2','l1'],
    'lr__solver': ['liblinear'],
    'lr__max_iter':[10000],
    'svm__C': [0.01,0.1,2,3,5,10,15,20,25,30,40,50,60,70,80,90,100],
    'svm__gamma': [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000],
    'svm__kernel': ['rbf'],
    'dt__max_depth': list(range(3, 10)),
    'dt__min_samples_split' : list(range(3, 10)),
    'dt__min_samples_leaf': list(range(3, 10)),
    'dt__criterion':['gini', 'entropy', 'log_loss']
    }

# 10000 sampled candidates x 10 folds = 100000 ensemble fits; expect a long run.
# random_state pins which candidates are drawn so the search can be repeated.
random_cv = RandomizedSearchCV(
    vclf,
    params,
    n_iter=10000,
    cv=10,
    scoring="neg_log_loss",
    n_jobs=-1,
    random_state=123,
)
random_cv.fit(X_train,y_train)

'''RF'''
from sklearn.ensemble import RandomForestClassifier
# Bootstrapped forest with out-of-bag scoring enabled and a fixed seed.
rf_clf = RandomForestClassifier(bootstrap=True, oob_score=True, random_state=321)
# 10-fold exhaustive grid over forest size, per-tree sample count and the
# tree-shape limits (each of the three limits ranges over 3..9).
params_rf = {
    'n_estimators': [10, 20, 30, 50, 70, 100, 125, 150, 175, 200, 250],
    'max_samples': [40, 50, 60, 70, 80, 90, 100, 120, 140],
    'max_depth': list(range(3, 10)),
    'min_samples_split': list(range(3, 10)),
    'min_samples_leaf': list(range(3, 10)),
    'criterion': ['gini', 'entropy', 'log_loss'],
}
grid_rf = GridSearchCV(rf_clf, params_rf, cv=10, n_jobs=-1)
grid_rf.fit(X_train, y_train)

'''Bagging'''
from sklearn.ensemble import BaggingClassifier
# DecisionTreeClassifier is one of the candidate base estimators below but was
# never imported in this notebook (NameError); import it here.
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 200 members with bootstrapped rows and features,
# out-of-bag scoring enabled and a fixed seed.
bagging = BaggingClassifier(
    n_estimators = 200,
    bootstrap=True,
    oob_score=True,
    bootstrap_features=True,
    random_state=123
)
# Grid over the base estimator type, the number of features drawn per member
# (1..9) and the number of rows drawn per member.
params_bagging = {
    'estimator':[DecisionTreeClassifier(),SVC(probability=True),LogisticRegression(),KNeighborsClassifier()],
    'max_features': list(range(1, 10)),
    'max_samples':[1,3,5,7,10,20,30,40,50,60,70,80,90,100,120],
}
# Despite the variable name this is an exhaustive GridSearchCV (10-fold); the name
# is kept because the plotting cell refers to it.
random_bagging = GridSearchCV(
    bagging, params_bagging,  cv=10,  n_jobs=16
)
random_bagging.fit(X_train,y_train)

In [None]:
# graphing
# ROC curves of the six tuned models on the held-out test rows, all on one axes.
fig, ax = plt.subplots()

bwith = 1
plt.rcParams['font.sans-serif'] = 'Arial'

# Frame: all four borders visible, solid black, line width `bwith`.
for side in ('bottom', 'left', 'top', 'right'):
    spine = ax.spines[side]
    spine.set_visible(True)
    spine.set_linewidth(bwith)
    spine.set_linestyle("-")
    spine.set_color('black')
ax.set_facecolor("white")

# Outward-pointing major (length 5) and minor (length 3) ticks on both axes.
ax.minorticks_on()
for which, length in (("major", 5), ("minor", 3)):
    ax.tick_params(axis="both", which=which, direction="out", width=1, length=length)


def _roc_trace(scores, label_fmt):
    """Plot one ROC curve on the current axes and return its ingredients.

    scores: per-sample scores for the test rows (a positive-class probability
        column or a decision_function output).
    label_fmt: legend text containing one '%' float placeholder, filled with the AUC.

    Returns (auc, fpr, tpr, thresholds), computed against the global y_test.
    """
    auc = roc_auc_score(y_test, scores)
    fpr, tpr, thres = roc_curve(y_test, scores)
    plt.plot(fpr, tpr, label=label_fmt % auc)
    return auc, fpr, tpr, thres


# Curves are added in the same order as before so colours and legend order match.
y_score_knn = grid_knn.best_estimator_.predict_proba(X_test)
auc_knn, fpr_knn, tpr_knn, thres_knn = _roc_trace(y_score_knn[:, 1], 'KNN AUC = %0.3f')

# LR and SVM are scored with decision_function; the others use the second
# predict_proba column.
y_decition_lr = grid_lr.best_estimator_.decision_function(X_test)
auc_log, fpr_log, tpr_log, thres_log = _roc_trace(y_decition_lr, 'LR AUC = %0.3f')

y_decition_vo = random_cv.best_estimator_.predict_proba(X_test)
auc_vo, fpr_vo, tpr_vo, thres_vo = _roc_trace(y_decition_vo[:, 1], 'Voting AUC = %0.3f')

y_decition_svm = grid_svm.best_estimator_.decision_function(X_test)
auc_svm, fpr_svm, tpr_svm, thres_svm = _roc_trace(y_decition_svm, 'SVM AUC = %0.3f')

y_decition_rf = grid_rf.best_estimator_.predict_proba(X_test)
auc_rf, fpr_rf, tpr_rf, thres_rf = _roc_trace(y_decition_rf[:, 1], 'Random Forest AUC = %0.3f')

y_decition_bagging = random_bagging.best_estimator_.predict_proba(X_test)
auc_bagging, fpr_bagging, tpr_bagging, thres_bagging = _roc_trace(
    y_decition_bagging[:, 1], 'Bagging AUC = %0.3f'
)

ax.grid(False)
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.legend()
ax.set_title('RFE')
plt.show()