In [1]:
'''
1. Load the package required by the program
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_curve, auc 

In [2]:
'''
2. Read the training set
'''
HC_SP_train_data = pd.read_csv(r"E:\DIY_feature\LAtest\HC_SP\HC_SP_train_features.csv")
SP_BP_train_data = pd.read_csv(r"E:\DIY_feature\LAtest\SP_BP\SP_BP_train_features.csv")

HC_SP_train_features = HC_SP_train_data.drop(columns="disease")
SP_BP_train_features = SP_BP_train_data.drop(columns="disease")
HC_SP_train_labels = pd.DataFrame(HC_SP_train_data["disease"])
SP_BP_train_labels = pd.DataFrame(SP_BP_train_data["disease"])

In [3]:
'''
3. Read the test set
'''
HC_SP_test_data = pd.read_csv(r"E:\DIY_feature\LAtest\HC_SP\HC_SP_features.csv")
SP_BP_test_data = pd.read_csv(r"E:\DIY_feature\LAtest\SP_BP\SP_BP_features.csv")

HC_SP_test_features = HC_SP_test_data.drop(columns="disease")
SP_BP_test_features = SP_BP_test_data.drop(columns="disease")
HC_SP_test_labels = pd.DataFrame(HC_SP_test_data["disease"])
SP_BP_test_labels = pd.DataFrame(SP_BP_test_data["disease"])

In [4]:
'''
4. Read the validation set
'''
HC_SP_vaild_data = pd.read_csv(r"C:\Users\Yunheng\Desktop\paper\data\new\Healthy Control and Schizophrenia\Healthy Control and Schizophrenia vaild features.csv")
SP_BP_vaild_data = pd.read_csv(r"C:\Users\Yunheng\Desktop\paper\data\new\Schizophrenia and Bipolar Disorder\Schizophrenia and Bipolar Disorder vaild features.csv")

HC_SP_vaild_features = HC_SP_vaild_data.drop(columns="disease")
SP_BP_vaild_features = SP_BP_vaild_data.drop(columns="disease")
HC_SP_vaild_labels = pd.DataFrame(HC_SP_vaild_data["disease"])
SP_BP_vaild_labels = pd.DataFrame(SP_BP_vaild_data["disease"])

In [5]:
'''
5. To construct a differential diagnosis model between healthy controls and schizophrenia based on RandomForestClassifier
'''
HC_SP_RFC = RFC(n_estimators = 366,random_state = 5)
HC_SP_RFC.fit(HC_SP_train_features, np.array(HC_SP_train_labels).reshape((-1,)))
RF_score = HC_SP_RFC.score(HC_SP_test_features, np.array(HC_SP_test_labels).reshape((-1,)))
RF_vaild_score = HC_SP_RFC.score(HC_SP_vaild_features, np.array(HC_SP_vaild_labels).reshape((-1,)))


HC_SP_RFC_pre = HC_SP_RFC.predict(HC_SP_test_features)
HC_SP_RFC_val = HC_SP_RFC.predict(HC_SP_vaild_features)


fpr,tpr,threshold = roc_curve(np.array(HC_SP_vaild_labels).reshape((-1,)), HC_SP_RFC_val)
roc_auc = auc(fpr,tpr)
print("Test set score:",RF_score)
print("Validation set score",RF_vaild_score)
print("AUC score:", roc_auc)


Test set score: 0.9166666666666666
Validation set score 0.95
AUC score: 0.9444444444444444


In [6]:
'''
6. To construct a differential diagnosis model between healthy controls and schizophrenia based on KNeighborsClassifier
'''
knn = KNeighborsClassifier()
knn.fit(HC_SP_train_features, np.array(HC_SP_train_labels).reshape((-1,)))
y_predict_knn = knn.predict(HC_SP_test_features)
KNN_score = accuracy_score(np.array(HC_SP_test_labels).reshape((-1,)), y_predict_knn)
fpr,tpr,threshold = roc_curve(np.array(HC_SP_test_labels).reshape((-1,)), y_predict_knn)
roc_auc = auc(fpr,tpr)
print("Test set score:",KNN_score)
print("AUC score:",roc_auc)

Test set score: 0.8
AUC score: 0.8042269187986651


In [7]:
'''
7. To construct a differential diagnosis model between healthy controls and schizophrenia based on GradientBoostingClassifier
'''
GB = GradientBoostingClassifier()
GB.fit(HC_SP_train_features, np.array(HC_SP_train_labels).reshape((-1,)))
y_predict_gb = GB.predict(HC_SP_test_features)
GB_score = accuracy_score(np.array(HC_SP_test_labels).reshape((-1,)), y_predict_gb)
fpr,tpr,threshold = roc_curve(np.array(HC_SP_test_labels).reshape((-1,)), y_predict_gb)
roc_auc = auc(fpr,tpr)
print("Test set score:",GB_score)
print("AUC score:",roc_auc)

Test set score: 0.8666666666666667
AUC score: 0.8665183537263625


In [8]:
'''
8. To construct a differential diagnosis model between healthy controls and schizophrenia based on DecisionTreeClassifier
'''
clf = DecisionTreeClassifier()
clf.fit(HC_SP_train_features, np.array(HC_SP_train_labels).reshape((-1,)))
DT_score = clf.score(HC_SP_test_features, np.array(HC_SP_test_labels).reshape((-1,)))
y_predict_DT = clf.predict(HC_SP_test_features)

fpr,tpr,threshold = roc_curve(np.array(HC_SP_test_labels).reshape((-1,)), y_predict_DT)
roc_auc = auc(fpr,tpr)

print("Test set score:",DT_score)
print("AUC score:",roc_auc)

Test set score: 0.7
AUC score: 0.7018909899888764


In [9]:
'''
9. To construct a differential diagnosis model between healthy controls and schizophrenia based on SVM
'''
model = SVC()
model.fit(HC_SP_train_features, np.array(HC_SP_train_labels).reshape((-1,)))
prediction = model.predict(HC_SP_test_features)
SVM_score = accuracy_score(np.array(HC_SP_test_labels).reshape((-1,)), prediction)
fpr,tpr,threshold = roc_curve(np.array(HC_SP_test_labels).reshape((-1,)), prediction)
roc_auc = auc(fpr,tpr)

print("Test set score:",SVM_score)
print("AUC score:",roc_auc)

Test set score: 0.7833333333333333
AUC score: 0.7869855394883203


In [10]:
'''
10. 10-fold cross-validation was performed between healthy controls and schizophrenia differential diagnosis models
'''
HC_SP_cross_vaild_data = pd.read_csv(r"E:\DIY_feature\LAtest\HC_SP\HC_SP_cross_vaild_features.csv")
HC_SP_cross_vaild_features = HC_SP_cross_vaild_data.drop(columns="disease")
HC_SP_cross_vaild_labels = pd.DataFrame(HC_SP_cross_vaild_data["disease"])

HC_SP_RFC_10Fold= RFC(n_estimators = 366,random_state = 5)
score = cross_val_score(HC_SP_RFC_10Fold, HC_SP_cross_vaild_features, np.array(HC_SP_cross_vaild_labels).reshape((-1,)), cv=10)
print("10-fold cross validation score:",score.mean())

10-fold cross validation score: 0.865


In [11]:
'''
11. Change the 1 in the test set label to 0 and the 3 to 1 in order to calculate the AUC
'''
labels = []
test_labels = np.array(SP_BP_test_labels).reshape((-1,))
for i in range(len(test_labels)):
    if test_labels[i] == 1:
        labels.append(0)
    elif test_labels[i] == 3:
        labels.append(1)

In [12]:
'''
12. To construct a differential diagnosis model for schizophrenia and bipolar disorder based on RandomForestClassifier
'''
SP_BP_RFC = RFC(n_estimators = 460,random_state = 5)
SP_BP_RFC.fit(SP_BP_train_features, np.array(SP_BP_train_labels).reshape((-1,)))
RF_score = SP_BP_RFC.score(SP_BP_test_features, np.array(SP_BP_test_labels).reshape((-1,)))

RF_vaild_score = SP_BP_RFC.score(SP_BP_vaild_features, np.array(SP_BP_vaild_labels).reshape((-1,)))


SP_BP_RFC_pre = SP_BP_RFC.predict(SP_BP_test_features)
SP_BP_RFC_val = SP_BP_RFC.predict(SP_BP_vaild_features)


val_labels = []
vallabels = np.array(SP_BP_vaild_labels).reshape((-1,))
for i in range(len(vallabels)):
    if vallabels[i] == 1:
        val_labels.append(0)
    elif vallabels[i] == 3:
        val_labels.append(1)


predict_result = []
for i in range(len(SP_BP_RFC_val)):
    if SP_BP_RFC_val[i] == 1:
        predict_result.append(0)
    elif SP_BP_RFC_val[i] == 3:
        predict_result.append(1)

fpr,tpr,threshold = roc_curve(val_labels, predict_result)
roc_auc = auc(fpr,tpr)

print("Test set score:",RF_score)
print("Validation set score",RF_vaild_score)
print("AUC score:",roc_auc)

Test set score: 0.85
Validation set score 0.7
AUC score: 0.696969696969697


In [13]:
'''
13. To construct a differential diagnosis model for schizophrenia and bipolar disorder based on KNeighborsClassifier
'''

knn = KNeighborsClassifier()
knn.fit(SP_BP_train_features, np.array(SP_BP_train_labels).reshape((-1,)))
y_predict_knn = knn.predict(SP_BP_test_features)
KNN_score = accuracy_score(np.array(SP_BP_test_labels).reshape((-1,)), y_predict_knn)

predict_result = []
for i in range(len(y_predict_knn)):
    if y_predict_knn[i] == 1:
        predict_result.append(0)
    elif y_predict_knn[i] == 3:
        predict_result.append(1)

fpr,tpr,threshold = roc_curve(labels, predict_result)
roc_auc = auc(fpr,tpr)

print("Test set score:",KNN_score)
print("AUC score:",roc_auc)

Test set score: 0.5833333333333334
AUC score: 0.5878754171301446


In [14]:
'''
14. To construct a differential diagnosis model for schizophrenia and bipolar disorder based on GradientBoostingClassifier
'''

GB = GradientBoostingClassifier()
GB.fit(SP_BP_train_features, np.array(SP_BP_train_labels).reshape((-1,)))
y_predict_gb = GB.predict(SP_BP_test_features)
GB_score = accuracy_score(np.array(SP_BP_test_labels).reshape((-1,)), y_predict_gb)

predict_result = []
for i in range(len(y_predict_gb)):
    if y_predict_gb[i] == 1:
        predict_result.append(0)
    elif y_predict_gb[i] == 3:
        predict_result.append(1)

fpr,tpr,threshold = roc_curve(labels, predict_result)
roc_auc = auc(fpr,tpr)

print("Test set score:",GB_score)
print("AUC score:",roc_auc)

Test set score: 0.7166666666666667
AUC score: 0.7180200222469412


In [15]:
'''
15. To construct a differential diagnosis model for schizophrenia and bipolar disorder based on DecisionTreeClassifier
'''


clf = DecisionTreeClassifier()
clf.fit(SP_BP_train_features, np.array(SP_BP_train_labels).reshape((-1,)))
DT_score = clf.score(SP_BP_test_features, np.array(SP_BP_test_labels).reshape((-1,)))
y_predict_DT = clf.predict(SP_BP_test_features)

predict_result = []
for i in range(len(y_predict_DT)):
    if y_predict_DT[i] == 1:
        predict_result.append(0)
    elif y_predict_DT[i] == 3:
        predict_result.append(1)

fpr,tpr,threshold = roc_curve(labels, predict_result)
roc_auc = auc(fpr,tpr)

print("Test set score:",DT_score)
print("AUC score:",roc_auc)

Test set score: 0.5833333333333334
AUC score: 0.5845383759733036


In [16]:
'''
16. To construct a differential diagnosis model for schizophrenia and bipolar disorder based on SVM
'''

model = SVC()
model.fit(SP_BP_train_features, np.array(SP_BP_train_labels).reshape((-1,)))
prediction = model.predict(SP_BP_test_features)
#     confusion_matrix = confusion_matrix(y_test,prediction)
SVM_score = accuracy_score(np.array(SP_BP_test_labels).reshape((-1,)), prediction)

predict_result = []
for i in range(len(prediction)):
    if prediction[i] == 1:
        predict_result.append(0)
    elif prediction[i] == 3:
        predict_result.append(1)

fpr,tpr,threshold = roc_curve(labels, predict_result)
roc_auc = auc(fpr,tpr)

print("Test set score:",SVM_score)
print("AUC score:",roc_auc)

Test set score: 0.7333333333333333
AUC score: 0.728587319243604


In [17]:
'''
17. 10-fold cross-validation of the differential diagnosis model between schizophrenia and bipolar disorder
'''
SP_BP_cross_vaild_data = pd.read_csv(r"E:\DIY_feature\LAtest\SP_BP\SP_BP_cross_vaild_features.csv")
SP_BP_cross_vaild_features = SP_BP_cross_vaild_data.drop(columns="disease")
SP_BP_cross_vaild_labels = pd.DataFrame(SP_BP_cross_vaild_data["disease"])


SP_BP_RFC_10Fold  = RFC(n_estimators = 460,random_state = 5)
score = cross_val_score(SP_BP_RFC_10Fold, SP_BP_cross_vaild_features, np.array(SP_BP_cross_vaild_labels).reshape((-1,)), cv=10)
print("10-fold cross validation score:",score.mean())

10-fold cross validation score: 0.85
