In [None]:
import numpy as np
import pandas as pd
import warnings,re,datetime
import matplotlib
import matplotlib.pyplot as plt
from pandas import Series, DataFrame
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, LeaveOneOut
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, matthews_corrcoef, roc_curve, roc_auc_score

from sklearn.tree import DecisionTreeClassifier



In [None]:
def show_metrics(y_true, y_pred):
    """Print confusion-matrix counts and summary metrics for a binary classifier.

    Labels are expected to be coded 0 (negative) and 1 (positive); entries with
    any other value are not counted in tp/tn/fp/fn.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels (list, NumPy array or pandas Series).
    y_pred : array-like
        Predicted labels, same length as ``y_true``.

    Returns
    -------
    None
        All results are written to stdout.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of entries.
    """
    # Pair the two inputs by position. ``series[i]`` on a pandas Series is a
    # label lookup, so it raises KeyError (or pairs the wrong rows) whenever
    # the index is not exactly 0..n-1, e.g. after filtering or shuffling.
    true_arr = np.asarray(y_true).ravel()
    pred_arr = np.asarray(y_pred).ravel()
    if true_arr.shape[0] != pred_arr.shape[0]:
        raise ValueError(
            "y_true and y_pred must have the same length (got %d and %d)"
            % (true_arr.shape[0], pred_arr.shape[0])
        )

    true_pos, true_neg = (true_arr == 1), (true_arr == 0)
    pred_pos, pred_neg = (pred_arr == 1), (pred_arr == 0)
    tp = int(np.sum(true_pos & pred_pos))
    fn = int(np.sum(true_pos & pred_neg))
    fp = int(np.sum(true_neg & pred_pos))
    tn = int(np.sum(true_neg & pred_neg))

    # Sensitivity/specificity are undefined when a class is absent; report nan
    # instead of aborting with ZeroDivisionError.
    se = tp / (tp + fn) if (tp + fn) else float("nan")
    sp = tn / (tn + fp) if (tn + fp) else float("nan")

    print("tp=%d, tn=%d, fp=%d, fn=%d" % (tp, tn, fp, fn))
    print("sensitivity = %.2f%%, specificity = %.2f%%" % (100*se, 100*sp))
    print("accuracy=%.2f%%, precision=%.2f%%, recall=%.2f%%, MCC=%.4f" % (100*accuracy_score(y_true, y_pred), 100*precision_score(y_true, y_pred), 100*recall_score(y_true, y_pred), matthews_corrcoef(y_true, y_pred)))

In [None]:
# Load a pre-split training/test pair. Use this cell for groups 1-4 (the four
# SOM-based splits), where the activity label is the LAST column of each CSV.
# NOTE: the next cell assigns the same variable names (tr_x, te_x, tr_y, te_y);
# whichever loading cell runs last determines the data used downstream.
tr_csv = r"Q:\ALK_1_Wild\3.ALK_300nM_RANDOM_ECFP4\ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387-ran-111_tr_1212.csv"
te_csv = r"Q:\ALK_1_Wild\3.ALK_300nM_RANDOM_ECFP4\ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387-ran-111_te_598.csv"

# Parse each file once and slice features and label from the same frame
# (previously every CSV was read from disk twice).
tr_df = pd.read_csv(tr_csv)
te_df = pd.read_csv(te_csv)

# Label position -1 = last column; adjust the slices if the file layout differs.
tr_x, tr_y = tr_df.iloc[:, :-1], tr_df.iloc[:, -1]
te_x, te_y = te_df.iloc[:, :-1], te_df.iloc[:, -1]
tr_x

In [None]:
# Load a pre-split training/test pair. Use this cell for groups 9-12 (the four
# random splits, files with an index), where the activity label is the
# SECOND-TO-LAST column of each CSV. The last column is excluded from the
# features as well (presumably the index column -- confirm against the file).
# NOTE: this cell overwrites tr_x, te_x, tr_y and te_y from the previous
# loading cell if both are executed.
tr_csv = r"Q:\ALK_1_Wild\3.ALK_300nM_RANDOM_ECFP4\index_ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387-ran-111_tr_1212.csv"
te_csv = r"Q:\ALK_1_Wild\3.ALK_300nM_RANDOM_ECFP4\index_ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387-ran-111_te_598.csv"

# Parse each file once and slice features and label from the same frame
# (previously every CSV was read from disk twice).
tr_df = pd.read_csv(tr_csv)
te_df = pd.read_csv(te_csv)

# Label position -2 = second-to-last column; adjust if the layout differs.
tr_x, tr_y = tr_df.iloc[:, :-2], tr_df.iloc[:, -2]
te_x, te_y = te_df.iloc[:, :-2], te_df.iloc[:, -2]
tr_x

In [None]:
# Data processing: feature scaling.
# The training features are used as-is when their overall minimum is exactly 0
# and their overall maximum is exactly 1 (the check is over all columns
# together, not per column). Otherwise a MinMaxScaler mapping to [0.1, 0.9] is
# fitted on the training set and applied to both sets.
tr_summary = tr_x.describe()
already_unit_range = (tr_summary.loc['min', :].min() == 0
                      and tr_summary.loc['max', :].max() == 1)

if not already_unit_range:
    scaler = MinMaxScaler(feature_range=(0.1, 0.9))
    tr_feature = scaler.fit_transform(tr_x)
    te_feature = scaler.transform(te_x)
else:
    # No scaling needed: features stay pandas DataFrames in this branch.
    tr_feature, te_feature = tr_x, te_x

tr_label, te_label = tr_y, te_y
tr_feature

In [None]:
# Hyper-parameter search: exhaustive grid over decision-tree settings, scored
# by the Matthews correlation coefficient with stratified 5-fold CV.
criterion_list = ['gini', 'entropy']
max_depth_list = range(1, 30, 2)
max_features_list = [None, 'sqrt', 'log2']
# scikit-learn only accepts max_leaf_nodes=None or an integer >= 2. The former
# grid started at 1, so every combination with max_leaf_nodes=1 failed to fit
# and was scored NaN (hidden by the warning filter below). Starting at 11 keeps
# all previously valid candidates (11, 21, ..., 91) and skips the wasted fits.
max_leaf_nodes_list = range(11, 101, 10)

# NOTE: this silences ALL warnings for the rest of the kernel session.
warnings.filterwarnings("ignore")

grid_dict = {
    'criterion': criterion_list,
    'max_depth': max_depth_list,
    'max_features': max_features_list,
    'max_leaf_nodes': max_leaf_nodes_list,
}

grid_score_mcc = make_scorer(matthews_corrcoef, greater_is_better=True)

grid_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)

grid_dtc = DecisionTreeClassifier(random_state=3)

Grid = GridSearchCV(grid_dtc, grid_dict, scoring=grid_score_mcc, cv=grid_cv)

Grid.fit(tr_feature, tr_label)
print(Grid.best_params_)

In [None]:
# Hyper-parameter search (intermediate results): print every parameter
# combination with its mean cross-validated score and plot the scores in the
# order the grid was evaluated.
score = []
for params, mean_score in zip(Grid.cv_results_['params'],
                              Grid.cv_results_['mean_test_score']):
    print(params, mean_score)
    # Previously nothing was ever appended, so the saved figure was empty.
    score.append(mean_score)
plt.plot(score)
plt.xlabel("Parameter combination index")
plt.ylabel("Mean cross-validated score")
plt.savefig(r"Q:\ALK_1_Wild\ALK_300nM_RANDOM_ECFP4\ECFP4_filter387_DT\ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387_DT-16.png", dpi=1000, bbox_inches="tight" )
plt.show()

In [None]:
# Hyper-parameter search (best result): display the best estimator found by the grid search
Grid.best_estimator_

In [None]:
# Modelling results (model with the tuned hyper-parameters).
#
# The hyper-parameters are entered by hand -- remember to update them whenever
# the search is re-run! To take them from the search directly instead, pass
# Grid.best_params_['criterion'], ['max_depth'], ['max_features'] and
# ['max_leaf_nodes'] here (keeping random_state=3).
dtc_params = dict(
    criterion='gini',
    max_depth=9,
    max_features=None,
    max_leaf_nodes=41,
    random_state=3,
)
model_dtc = DecisionTreeClassifier(**dtc_params)  # decision-tree classifier (DT)
model_dtc.fit(tr_feature, tr_label)

# Predicted labels for the training set and the test set, in that order.
tr_pre_label, te_pre_label = (
    model_dtc.predict(features) for features in (tr_feature, te_feature)
)

print("training set:\n")
show_metrics(tr_label, tr_pre_label)

print("\n\ntest set:\n")
show_metrics(te_label, te_pre_label)

In [None]:
# Save the fitted model to disk with joblib.
import joblib

model_file_path = r"Q:\ALK_1_Wild\ALK_300nM_RANDOM_ECFP4\ECFP4_filter387_DT\ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387_DT-16.pkl"
joblib.dump(model_dtc, model_file_path)

In [None]:
# Export the training-set predictions (true label, predicted label) to CSV.
# The path is a raw string: in a normal literal the backslash sequences of this
# Windows path ("\A", "\E", "\C") are invalid escapes, which recent Python
# versions report with a SyntaxWarning. The resulting path is unchanged.
train_file_path = r"Q:\ALK_1_Wild\ALK_300nM_RANDOM_ECFP4\ECFP4_filter387_DT\ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387_DT-16_training_results.csv"
with open(train_file_path, "w") as train_file:
    train_file.write("tr_label,tr_pre_label\n")

    # One "true,predicted" row per training sample.
    train_file.writelines(
        f"{true_label},{predicted_label}\n"
        for true_label, predicted_label in zip(tr_label, tr_pre_label)
    )

In [None]:
# Export the test-set predictions (true label, predicted label) to CSV.
# The path is a raw string: in a normal literal the backslash sequences of this
# Windows path ("\A", "\E", "\C") are invalid escapes, which recent Python
# versions report with a SyntaxWarning. The resulting path is unchanged.
test_file_path = r"Q:\ALK_1_Wild\ALK_300nM_RANDOM_ECFP4\ECFP4_filter387_DT\ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387_DT-16_testing_results.csv"
with open(test_file_path, "w") as test_file:
    test_file.write("te_label,te_pre_label\n")

    # One "true,predicted" row per test sample.
    test_file.writelines(
        f"{true_label},{predicted_label}\n"
        for true_label, predicted_label in zip(te_label, te_pre_label)
    )

In [None]:
# External test set validation

def load_external_test_data(file_path):
    """Read an external test set CSV and split it into features and labels.

    Parameters
    ----------
    file_path : str or path-like
        CSV file that contains a column named ``label``.

    Returns
    -------
    tuple
        ``(features, labels)``: a DataFrame holding every column except
        ``label``, and the ``label`` column as a Series.
    """
    external_df = pd.read_csv(file_path)
    feature_frame = external_df.drop(columns='label')  # feature columns
    label_series = external_df['label']  # class labels
    return feature_frame, label_series

# Path of the external test set CSV (raw string so the backslashes of the
# Windows path are never read as escape sequences).
# Alternative external file (without the "_filter387" suffix):
# external_test_file_path = r"Q:\ALK_1_Wild\ALK_300nM_External\ChEMBL_TargetID_CHEMBL4247_1810_ExternalECFP4.csv"
external_test_file_path = r"Q:\ALK_1_Wild\ALK_300nM_External\ChEMBL_TargetID_CHEMBL4247_1810_ExternalECFP4_filter387.csv"

# Load the external test set.
ext_te_feature, ext_te_label = load_external_test_data(external_test_file_path)

# Apply the SAME preprocessing the model was trained with. The former code
# fitted a brand-new MinMaxScaler() on the external set, which (a) used the
# default (0, 1) range instead of the training (0.1, 0.9) range, (b) scaled
# with external-set statistics instead of training-set statistics, and
# (c) mapped any constant column (e.g. a bit set in every compound) to 0.
# The rule below mirrors the scaling cell: features whose overall range is
# exactly [0, 1] are used unscaled, otherwise a scaler fitted on the TRAINING
# features is applied.
tr_x_summary = tr_x.describe()
ext_te_feature = ext_te_feature.to_numpy()  # positional columns, as before
if not (tr_x_summary.loc['min', :].min() == 0 and tr_x_summary.loc['max', :].max() == 1):
    ext_scaler = MinMaxScaler(feature_range=(0.1, 0.9))
    ext_scaler.fit(tr_x.to_numpy())
    ext_te_feature = ext_scaler.transform(ext_te_feature)

# Predict the external test set with the trained model.
ext_te_pre_label = model_dtc.predict(ext_te_feature)

# Print the metrics for the external test set.
print("\nexternal test set predictions:\n")
show_metrics(ext_te_label, ext_te_pre_label)

In [None]:
# Sanity check: the number of features used for training must equal the number
# of features passed to the model for the external test set.
n_train_features = tr_feature.shape[1]
print("训练集特征数量:", n_train_features)
n_external_features = ext_te_feature.shape[1]
print("外部测试集特征数量:", n_external_features)

In [None]:
# Internal validation on the training set: mean accuracy under stratified
# 5-fold CV, stratified 10-fold CV and leave-one-out.
#
# The CV estimator is built from the hyper-parameters of the final model
# (model_dtc) instead of a second hand-typed copy, so the two can no longer
# drift apart when the parameters are changed. get_params() returns the
# constructor arguments only, so cv_dtc is a fresh, unfitted tree.
cv_dtc = DecisionTreeClassifier(**model_dtc.get_params())
cv_score_accuracy = make_scorer(accuracy_score)
cv_cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cv_cv10 = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
cv_cvloo = LeaveOneOut()  # one fit per training sample -- the slowest of the three
print("5-fold: %.2f%%" % (100 * np.mean(cross_val_score(cv_dtc, tr_feature, tr_label, scoring=cv_score_accuracy, cv=cv_cv5))))
print("10-fold: %.2f%%" % (100 * np.mean(cross_val_score(cv_dtc, tr_feature, tr_label, scoring=cv_score_accuracy, cv=cv_cv10))))
print("leave-one-out: %.2f%%" % (100 * np.mean(cross_val_score(cv_dtc, tr_feature, tr_label, scoring=cv_score_accuracy, cv=cv_cvloo))))

In [None]:
# ROC curves and AUC values for the training and test sets.
# Column 1 of predict_proba is taken as the positive-class score
# (assumes model_dtc.classes_ is ordered [0, 1] -- TODO confirm).
tr_decision_score, te_decision_score = (
    model_dtc.predict_proba(features)[:, 1] for features in (tr_feature, te_feature)
)

# roc_curve returns (fpr, tpr, thresholds); every threshold is kept.
tr_roc_inform = roc_curve(tr_label, tr_decision_score, drop_intermediate=False)
te_roc_inform = roc_curve(te_label, te_decision_score, drop_intermediate=False)

tr_auc = roc_auc_score(tr_label, tr_decision_score)
te_auc = roc_auc_score(te_label, te_decision_score)
tr_auc_str = "AUC (Training set): " + str(round(tr_auc, 3))
te_auc_str = "AUC (Test set): " + str(round(te_auc, 3))

plt.clf()
matplotlib.rcParams['font.family'] = 'Times New Roman'

tr_fpr, tr_tpr = tr_roc_inform[0], tr_roc_inform[1]
te_fpr, te_tpr = te_roc_inform[0], te_roc_inform[1]
plt.plot(tr_fpr, tr_tpr, 'b', label="Training set")
plt.plot(te_fpr, te_tpr, 'r-.', label="Test set")

# Both AUC values as a two-line annotation inside the axes.
plt.text(0.6, 0.8, '\n'.join([tr_auc_str, te_auc_str]))
plt.xlabel("False positive rate")
plt.ylabel("True positive rate")
plt.legend(loc="best")

roc_figure_path = r"C:\Users\lab408\Desktop\iii.tif"
plt.savefig(roc_figure_path, dpi=1000, bbox_inches="tight")
plt.show()

In [None]:
# Write the training-set class probabilities (tr_proba) to a CSV file.
# The two column names assume predict_proba returns the classes in the order
# 0, 1 -- TODO confirm against model_dtc.classes_.
tr_proba = model_dtc.predict_proba(tr_feature)
proba_columns = ['Class_0_Proba', 'Class_1_Proba']
tr_decision_score_df = pd.DataFrame(data=tr_proba, columns=proba_columns)

tr_proba_csv = r"Q:\ALK_1_Wild\3.ALK_300nM_RANDOM_ECFP4\ECFP4_filter387_DT\ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387_DT-16_tr_proba.csv"
tr_decision_score_df.to_csv(tr_proba_csv, index=False)

In [None]:
# Write the test-set class probabilities (te_proba) to a CSV file.
# The two column names assume predict_proba returns the classes in the order
# 0, 1 -- TODO confirm against model_dtc.classes_.
te_proba = model_dtc.predict_proba(te_feature)
proba_columns = ['Class_0_Proba', 'Class_1_Proba']
te_decision_score_df = pd.DataFrame(data=te_proba, columns=proba_columns)

te_proba_csv = r"Q:\ALK_1_Wild\3.ALK_300nM_RANDOM_ECFP4\ECFP4_filter387_DT\ChEMBL_TargetID_CHEMBL4247_1810ECFP4_filter387_DT-16_te_proba.csv"
te_decision_score_df.to_csv(te_proba_csv, index=False)