In [None]:
#QSAR MODEL

In [None]:
#VIVO
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, roc_curve, auc, confusion_matrix
import joblib
import os

# Matplotlib defaults for every figure drawn in this cell: Arial text,
# 330 dpi for on-screen and saved figures, and explicit font sizes.
plt.rcParams.update({
    'font.family': 'Arial',
    'figure.dpi': 330,
    'savefig.dpi': 330,
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
})

# Load the in-vivo fingerprint table. Columns "0".."1023" are used as features
# and "Model6" as the class label.
data_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\rdkit_vivo_fingerprints.csv"
df = pd.read_csv(data_path)

X = df[[str(i) for i in range(1024)]]
y = df["Model6"]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Hyper-parameter grid; random_state is pinned inside the grid so every
# candidate forest (and the refit best estimator) is seeded identically.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, 20, None],
    'random_state': [42]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_
print(f"最佳参数: {best_params}")

# Score all four metrics from ONE pass over the five folds. Calling
# cross_val_score once per metric refits the same seeded forests four times
# and yields the same numbers.
from sklearn.model_selection import cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_validate(
    best_rf, X_train, y_train, cv=cv,
    scoring=["accuracy", "balanced_accuracy", "roc_auc", "f1"],
)
CV_acc = cv_scores["test_accuracy"].mean()
CV_bal_acc = cv_scores["test_balanced_accuracy"].mean()
CV_roc_auc = cv_scores["test_roc_auc"].mean()
CV_f1 = cv_scores["test_f1"].mean()
# NOTE(review): these CV scores use hyper-parameters that were selected on the
# same training data, so they may be optimistic — confirm this is acceptable.

print(f"5 折交叉验证 - 准确率: {CV_acc:.4f}")
print(f"5 折交叉验证 - 平衡准确率: {CV_bal_acc:.4f}")
print(f"5 折交叉验证 - ROC AUC: {CV_roc_auc:.4f}")
print(f"5 折交叉验证 - F1 分数: {CV_f1:.4f}")

# GridSearchCV (refit=True by default) has already fitted best_rf on X_train;
# the explicit refit with the same seed is kept so this step reads on its own.
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)[:, 1]  # probability column for classes_[1]

test_acc = accuracy_score(y_test, y_pred)
test_bal_acc = balanced_accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)
test_f1 = f1_score(y_test, y_pred)

print(f"测试集 - 准确率: {test_acc:.4f}")
print(f"测试集 - 平衡准确率: {test_bal_acc:.4f}")
print(f"测试集 - ROC AUC: {test_roc_auc:.4f}")
print(f"测试集 - F1 分数: {test_f1:.4f}")

# Out-of-fold probabilities (column 1 of predict_proba) for every training row,
# obtained with the same CV splitter used for the CV metrics.
y_train_prob = cross_val_predict(best_rf, X_train, y_train, cv=cv, method="predict_proba")[:, 1]

# ROC from the pooled out-of-fold predictions. This AUC is computed on the
# pooled curve, so it can differ from the mean of per-fold AUCs (CV_roc_auc).
fpr_cv, tpr_cv, _ = roc_curve(y_train, y_train_prob)
roc_auc_cv = auc(fpr_cv, tpr_cv)

# ROC on the held-out test set.
fpr_test, tpr_test, _ = roc_curve(y_test, y_prob)
roc_auc_test = auc(fpr_test, tpr_test)

# Draw both curves plus the chance diagonal on one square figure.
plt.figure(figsize=(7, 7))
plt.plot(fpr_cv, tpr_cv, label=f'5-Fold CV ROC (AUC = {roc_auc_cv:.3f})', linestyle='--', color='blue', linewidth=2)
plt.plot(fpr_test, tpr_test, label=f'Test ROC (AUC = {roc_auc_test:.3f})', color='red', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlabel('False Positive Rate (FPR)', fontname='Arial')
plt.ylabel('True Positive Rate (TPR)', fontname='Arial')
plt.title('In Vivo Dataset ROC Curve of RandomForest Classifier', fontname='Arial', fontweight='bold')
plt.legend(loc='lower right', prop={'family': 'Arial'})
plt.grid(True, alpha=0.3)
plt.tight_layout()

# Make sure the output folder exists, then save before show().
save_dir = r"D:/个人/课题/NEU/数据/神经毒性体内数据/体内数据"
os.makedirs(save_dir, exist_ok=True)
plt.savefig(os.path.join(save_dir, "vivo_roc_curve.png"), bbox_inches='tight')
plt.show()

# Compute the confusion matrices: one from out-of-fold CV predictions on the
# training set, one from the held-out test predictions.
conf_matrix_val = confusion_matrix(y_train, cross_val_predict(best_rf, X_train, y_train, cv=cv))
conf_matrix_test = confusion_matrix(y_test, y_pred)

# The summed matrix only drives the cell colours; the text in each cell
# reports the CV and test counts separately.
total_matrix = conf_matrix_val + conf_matrix_test

fig, ax = plt.subplots(figsize=(6, 6))
cax = ax.matshow(total_matrix, cmap="Blues") 

cmap = plt.get_cmap("Blues")

for (i, j), value in np.ndenumerate(total_matrix):
    val_count = conf_matrix_val[i, j]
    test_count = conf_matrix_test[i, j]
   
    # Recover the colour matshow used for this cell and estimate its brightness
    # as the mean of the R, G, B channels.
    normalized_value = cax.norm(value)  
    rgba_color = cmap(normalized_value)  
    brightness = sum(rgba_color[:3]) / 3  

    # White text on dark cells, black text on light cells.
    text_color = "white" if brightness < 0.5 else "black"

    ax.text(j, i, f"CV: {val_count}\nTest: {test_count}", 
            ha='center', va='center', fontsize=10, color=text_color, fontname='Arial')

plt.xticks([0, 1], ["Negative", "Positive"], fontname='Arial')
plt.yticks([0, 1], ["Negative", "Positive"], fontname='Arial')
plt.xlabel("Predicted Label", fontname='Arial')
plt.ylabel("True Label", fontname='Arial')
plt.title("In Vivo Dataset Combined Confusion Matrix (CV + Test)", fontname='Arial', fontweight='bold')
plt.tight_layout()

# save_dir is defined earlier in this cell (ROC section).
plt.savefig(os.path.join(save_dir, "vivo_combined_conf_matrix.png"), bbox_inches='tight')
plt.show()

In [None]:
# 谷本系数相似性分析
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
from scipy.spatial.distance import pdist, squareform
from scipy.stats import gaussian_kde
import matplotlib.gridspec as gridspec
from matplotlib.ticker import MultipleLocator, AutoMinorLocator, AutoLocator  # 添加AutoLocator
import time

# Figure styling for the similarity analysis: Arial 10 pt text, thin black
# axes, slightly thinner minor ticks, 300 dpi figures.
plt.rcParams.update({
    'font.family': 'Arial',
    'font.size': 10,
    'axes.linewidth': 0.8,
    'axes.edgecolor': 'black',
    'xtick.major.width': 0.8,
    'ytick.major.width': 0.8,
    'xtick.minor.width': 0.6,
    'ytick.minor.width': 0.6,
    'figure.dpi': 300,
})

def load_data(file_path):
    """Read a fingerprint CSV and split it by column position.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        (fingerprints, smiles): the first 1024 columns as a 2-D array, and the
        column at position 1024 as a 1-D array (named ``smiles`` here;
        presumably the SMILES strings — confirm against the CSV layout).
    """
    print("Loading data...")
    table = pd.read_csv(file_path)

    bit_block = table.iloc[:, :1024]
    label_column = table.iloc[:, 1024]
    fingerprints, smiles = bit_block.values, label_column.values

    n_compounds, n_features = len(fingerprints), fingerprints.shape[1]
    print(f"Loaded {n_compounds} compounds with {n_features} features")
    return fingerprints, smiles

def _tanimoto_from_bits(bits):
    """Vectorised Tanimoto matrix for a 2-D boolean array (fallback path)."""
    as_int = bits.astype(np.int64)
    on_counts = as_int.sum(axis=1)
    shared = as_int @ as_int.T                      # |A & B| for every pair
    union = on_counts[:, None] + on_counts[None, :] - shared
    # Pairs with an empty union (two all-zero fingerprints) get 1.0, which is
    # what `1 - jaccard distance` gives on the SciPy path (distance 0).
    sim = np.ones(union.shape, dtype=float)
    np.divide(shared, union, out=sim, where=union > 0)
    return sim


def compute_tanimoto_similarity(fingerprints):
    """Compute the pairwise Tanimoto similarity matrix of binary fingerprints.

    The similarity is ``1 - Jaccard distance``. SciPy's ``pdist`` is tried
    first; if it raises, a vectorised NumPy computation is used instead. Both
    paths treat any non-zero entry as an "on" bit and return the same values,
    including 1.0 on the diagonal and for pairs of all-zero fingerprints.

    Args:
        fingerprints: 2-D array-like of shape (n_compounds, n_bits).

    Returns:
        A symmetric (n_compounds, n_compounds) float array. An input with zero
        rows yields an empty (0, 0) array.
    """
    print("Computing Tanimoto similarity matrix using optimized NumPy method...")
    start_time = time.time()

    # Binarise once so the SciPy path and the fallback see identical input.
    bits = np.asarray(fingerprints) != 0
    n = len(bits)
    if n == 0:
        return np.zeros((0, 0))

    try:
        jaccard_dist = pdist(bits, 'jaccard')
        tanimoto_sim = 1 - squareform(jaccard_dist)
        method_used = "scipy optimized method"
    except Exception as e:
        # Deliberate best-effort: report the failure and fall back.
        print(f"Scipy method failed: {e}, using NumPy method...")
        tanimoto_sim = _tanimoto_from_bits(bits)
        method_used = "NumPy optimized method"

    elapsed_time = time.time() - start_time
    print(f"Similarity matrix computed in {elapsed_time:.2f} seconds using {method_used}")

    return tanimoto_sim

def save_histogram_only(similarities, output_path):
    """Draw the similarity histogram with summary overlays and save it as an image.

    Args:
        similarities: 1-D array of pairwise similarity values.
        output_path: File path passed to ``plt.savefig``.

    Returns:
        ``output_path`` unchanged.
    """
    print("Creating and saving histogram plot...")
    
    fig, ax = plt.subplots(figsize=(10, 6))
    
    # Summary statistics are computed over ALL values, not just the plotted range.
    mean_sim = np.mean(similarities)
    median_sim = np.median(similarities)
    std_sim = np.std(similarities)
    q1 = np.percentile(similarities, 25)
    q3 = np.percentile(similarities, 75)
    
   
    # 60 equal-width bins over [0, 0.3]; values outside these edges are not
    # counted by the histogram.
    n_bins = 60
    hist_bins = np.linspace(0, 0.3, n_bins + 1)
    
    
    n, bins, patches = ax.hist(similarities, bins=hist_bins, 
                              color='#2E86AB', alpha=0.85, 
                              edgecolor='white', linewidth=0.8,
                              density=False)
    
    # KDE fitted on all values, then rescaled from density to expected counts
    # per bin (density * sample size * bin width) so it overlays the histogram.
    # The legend still labels this curve 'Probability Density'.
    kde = gaussian_kde(similarities)
    x_kde = np.linspace(0, 0.3, 500)
    y_kde = kde(x_kde) * len(similarities) * (bins[1] - bins[0])
    ax.plot(x_kde, y_kde, color='#D1495B', linewidth=2.5, 
           label='Probability Density', zorder=5)
    

    # Vertical markers for mean and median.
    ax.axvline(mean_sim, color='#EDAE49', linestyle='--', linewidth=2.5,
              label=f'Mean: {mean_sim:.3f}', zorder=6)
    ax.axvline(median_sim, color='#66A182', linestyle='-', linewidth=2.5,
              label=f'Median: {median_sim:.3f}', zorder=6)
    

    # Shaded band between the first and third quartiles.
    ax.axvspan(q1, q3, alpha=0.2, color='#FF6B6B', label='IQR (25%-75%)')
    

    ax.set_xlim(0, 0.3)
    ax.set_xlabel('Tanimoto Similarity Coefficient', fontsize=12, fontweight='bold', labelpad=10)
    ax.set_ylabel('Frequency', fontsize=12, fontweight='bold', labelpad=10)
    

    # Major x ticks every 0.05, minor every 0.01; at most 10 major y ticks.
    ax.xaxis.set_major_locator(MultipleLocator(0.05))
    ax.xaxis.set_minor_locator(MultipleLocator(0.01))
    ax.yaxis.set_major_locator(plt.MaxNLocator(10))  
    ax.yaxis.set_minor_locator(AutoMinorLocator())
    
    ax.grid(True, which='major', linestyle='--', alpha=0.4, color='gray', linewidth=0.8)
    ax.grid(True, which='minor', linestyle=':', alpha=0.2, color='lightgray', linewidth=0.6)
    
    ax.legend(loc='upper right', fontsize=10, frameon=True, 
             fancybox=True, framealpha=0.95, edgecolor='black', shadow=True)
    
    # Text box with the statistics, anchored at the top-left in axes coordinates.
    stats_text = (f'Total Pairs: {len(similarities):,}\n'
                 f'Mean: {mean_sim:.4f}\n'
                 f'Median: {median_sim:.4f}\n'
                 f'Std Dev: {std_sim:.4f}\n'
                 f'Q1: {q1:.4f}, Q3: {q3:.4f}\n'
                 f'Range: [{np.min(similarities):.4f}, {np.max(similarities):.4f}]')
    
    ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=9,
           verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white',
                                             alpha=0.95, edgecolor='black', linewidth=1))
    
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
 
    plt.title('Molecular Fingerprint Similarity Analysis\n(Tanimoto Coefficient Distribution)', 
             fontsize=14, fontweight='bold', pad=20)
 
    plt.tight_layout()

    plt.savefig(output_path, dpi=300, bbox_inches='tight', 
               facecolor='white', edgecolor='none')
    print(f"Histogram saved to: {output_path}")
    
    # Close the figure so it is not displayed and its memory is released.
    plt.close(fig) 
    
    return output_path

def additional_analysis(similarities, fingerprints, smiles):
    """Print two extra diagnostics about the similarity data.

    Reports how many similarity values exceed 0.6, and the fraction of zero
    entries in the fingerprint matrix (computed as ``1 - mean``).

    Args:
        similarities: Array of pairwise similarity values.
        fingerprints: Fingerprint matrix.
        smiles: Accepted for interface compatibility; not used here.
    """
    print("\n=== Additional Analysis ===")

    high_sim_threshold = 0.6
    n_high_pairs = np.count_nonzero(similarities > high_sim_threshold)

    if n_high_pairs == 0:
        print(f"No pairs found with similarity > {high_sim_threshold}")
    else:
        print(f"Found {n_high_pairs} pairs with similarity > {high_sim_threshold}")

    zero_fraction = 1 - np.mean(fingerprints)
    print(f"Fingerprint sparsity: {zero_fraction:.3f} ({zero_fraction*100:.1f}% zeros)")

def main():
    """Run the similarity workflow end to end.

    Loads the fingerprints, computes and saves the similarity matrix, prints
    summary statistics of the unique compound pairs (strict upper triangle),
    runs the additional diagnostics and writes the histogram image.
    """
    source_csv = "D:/个人/课题/NEU/数据/神经毒性体内数据/体内数据/rdkit_vivo_fingerprints.csv"
    histogram_path = r'D:/个人/课题/NEU/数据/神经毒性体内数据/体内数据/tanimoto_histogram.png'

    fp_matrix, smiles_column = load_data(source_csv)
    sim_matrix = compute_tanimoto_similarity(fp_matrix)

    # The matrix is written to the current working directory.
    np.save("tanimoto_similarity_matrix.npy", sim_matrix)
    print("Similarity matrix saved.")

    # k=1 skips the diagonal, so every pair is counted exactly once.
    pair_rows, pair_cols = np.triu_indices_from(sim_matrix, k=1)
    similarities = sim_matrix[pair_rows, pair_cols]

    print(f"\n=== Statistical Summary ===")
    print(f"Total compound pairs: {len(similarities):,}")
    print(f"Mean similarity: {np.mean(similarities):.4f}")
    print(f"Median similarity: {np.median(similarities):.4f}")
    print(f"Standard deviation: {np.std(similarities):.4f}")
    print(f"Minimum similarity: {np.min(similarities):.4f}")
    print(f"Maximum similarity: {np.max(similarities):.4f}")

    additional_analysis(similarities, fp_matrix, smiles_column)

    print(f"\nSaving histogram separately...")
    save_histogram_only(similarities, histogram_path)

    print(f"\n=== Analysis Completed ===")
    print(f"Histogram saved to: {histogram_path}")
    print(f"Similarity matrix saved to: tanimoto_similarity_matrix.npy")


if __name__ == "__main__":
    main()

In [None]:
#二维t-sne分析
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from rdkit import Chem
from rdkit.Chem import AllChem
import seaborn as sns
import re

df = pd.read_csv(r'D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\rdkit_vivo_fingerprints.csv')

print("检查数据质量...")
print(f"数据形状: {df.shape}")

# The first 1024 columns (by position) are treated as the feature columns.
features_columns = df.columns[:1024]
print(f"特征列数量: {len(features_columns)}")

feature_frame = df[features_columns]

nan_count = feature_frame.isna().sum().sum()
print(f"NaN值总数: {nan_count}")

# Counted for the cleaning decision below; not printed.
inf_count = np.isinf(feature_frame.values).sum()

max_values = feature_frame.max().max()
min_values = feature_frame.min().min()
print(f"最大值: {max_values}, 最小值: {min_values}")

# Replace NaN and +/-inf with 0, but only when at least one was found.
needs_cleaning = nan_count > 0 or inf_count > 0
if needs_cleaning:
    print("正在进行数据清理...")
    df[features_columns] = feature_frame.fillna(0).replace([np.inf, -np.inf], 0)
    print("数据清理完成")


def preprocess_smiles(smiles):
    """Return ``smiles`` as text with every backslash replaced by 'BACKSLASH'.

    Missing values (as judged by ``pd.isna``) become the empty string.
    """
    return "" if pd.isna(smiles) else str(smiles).replace('\\', 'BACKSLASH')


def restore_smiles(processed_smiles):
    """Turn every 'BACKSLASH' token in ``processed_smiles`` back into a backslash.

    Missing values (as judged by ``pd.isna``) become the empty string.
    """
    if not pd.isna(processed_smiles):
        return str(processed_smiles).replace('BACKSLASH', '\\')
    return ""


# Store the backslash-escaped form of each SMILES in a new 'Processed_SMILES' column.
print("预处理SMILES字符串...")
df['Processed_SMILES'] = df['SMILES'].apply(preprocess_smiles)


# (category, SMARTS alternatives) in priority order: the first category with a
# matching pattern wins. The broad classes are written so they do not swallow
# the more specific classes listed after them:
#   * Alcohols exclude the OH of a carboxylic acid (otherwise every acid is an alcohol).
#   * Amines exclude nitrogen bonded to a carbonyl carbon (otherwise every amide is an amine).
#   * Ketones need carbon on BOTH sides of C=O, and Aldehydes need a carbon
#     neighbour (otherwise aldehydes, acids and esters all match as ketones).
_CATEGORY_SMARTS = (
    ("Benzenoids", ("c1ccccc1",)),
    ("Alcohols", ("[OX2H;!$(OC=O)]",)),
    ("Halides", ("[F]", "[Cl]", "[Br]", "[I]")),
    ("Amines", ("[N;!$(NC=O)]",)),
    ("Amides", ("C(=O)N",)),
    ("Heterocycles", (
        "[#7]1~[#6]~[#6]~[#6]~[#6]~1",
        "[#8]1~[#6]~[#6]~[#6]~[#6]~1",
        "[#16]1~[#6]~[#6]~[#6]~[#6]~1",
    )),
    ("Ketones", ("[#6][CX3](=O)[#6]",)),
    ("Aldehydes", ("[CX3H1](=O)[#6]",)),
    ("Carboxylic_Acids", ("C(=O)[OH]",)),
    ("Esters", ("C(=O)O[#6]",)),
    ("Ethers", ("[#6][O][#6]",)),
)

# Filled on first use so the SMARTS are parsed once, not once per molecule.
_COMPILED_CATEGORY_PATTERNS = []


def _category_patterns():
    """Return the compiled (category, [patterns]) table, compiling it on first call."""
    if not _COMPILED_CATEGORY_PATTERNS:
        for label, smarts_list in _CATEGORY_SMARTS:
            _COMPILED_CATEGORY_PATTERNS.append(
                (label, [Chem.MolFromSmarts(smarts) for smarts in smarts_list])
            )
    return _COMPILED_CATEGORY_PATTERNS


def classify_compound(processed_smiles):
    """Assign a compound to the first matching structural category.

    Args:
        processed_smiles: SMILES text as produced by ``preprocess_smiles``.

    Returns:
        One of the category names in ``_CATEGORY_SMARTS`` (checked in that
        order), or "Others" when nothing matches, "Unknown" for an empty or
        missing input, "Invalid_SMILES" when RDKit cannot parse the string,
        or "Error" when an exception is raised during classification.
    """
    if not processed_smiles:
        return "Unknown"

    smiles = restore_smiles(processed_smiles)

    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return "Invalid_SMILES"

        for label, patterns in _category_patterns():
            if any(mol.HasSubstructMatch(pattern) for pattern in patterns):
                return label

        return "Others"

    except Exception as e:
        # Best-effort: report the offending SMILES and keep processing the rest.
        print(f"处理SMILES时出错: {smiles}, 错误: {e}")
        return "Error"


# Classify every compound from its processed SMILES.
print("开始分类化合物...")
df['Category'] = df['Processed_SMILES'].apply(classify_compound)

# Drop rows whose SMILES was missing or unparsable. Rows labelled 'Error'
# are not in this list and are therefore kept.
print("过滤无效化合物...")
initial_count = len(df)
df = df[~df['Category'].isin(['Unknown', 'Invalid_SMILES'])]
filtered_count = len(df)
print(f"过滤后化合物数量: {filtered_count}/{initial_count} (保留了 {filtered_count/initial_count*100:.2f}%)")

category_counts = df['Category'].value_counts()
print("化合物分类统计:")
print(category_counts)

# Save the classified table without the helper 'Processed_SMILES' column.
output_df = df.drop('Processed_SMILES', axis=1)
output_df.to_csv(r'D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\classified_compounds.csv', index=False)
print("已保存分类后的表格: classified_compounds.csv")

print("开始t-SNE降维...")
# First 1024 columns (by position) are the features; 'Model6' and 'Category'
# are carried along for colouring and marker choice in the plot.
features = df.iloc[:, :1024].values
labels = df['Model6'].values
categories = df['Category'].values

print("再次检查特征数据质量...")
print(f"特征矩阵形状: {features.shape}")
print(f"NaN值: {np.isnan(features).sum()}")
print(f"无穷大值: {np.isinf(features).sum()}")

# Final safety net: t-SNE cannot handle NaN or infinite values.
features = np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)

# Run the t-SNE embedding.
print("执行t-SNE降维...")
# scikit-learn renamed TSNE's `n_iter` to `max_iter` (deprecated in 1.5,
# removed in 1.7). Pick whichever keyword the installed version accepts so the
# cell runs on both old and new releases.
import inspect

tsne_iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE.__init__).parameters else "n_iter"
tsne = TSNE(n_components=2, random_state=42, perplexity=30, verbose=1, **{tsne_iter_kwarg: 1000})
tsne_results = tsne.fit_transform(features)


tsne_df = pd.DataFrame({
    'TSNE1': tsne_results[:, 0],
    'TSNE2': tsne_results[:, 1],
    'Category': categories,
    'Model6': labels
})

# Fixed colour per structural category; categories missing from this map fall
# back to black in the plotting loop below.
category_colors = {
    'Benzenoids': '#1f77b4',       # blue
    'Alcohols': '#ff7f0e',         # orange
    'Halides': '#2ca02c',          # green
    'Amines': '#d62728',           # red
    'Amides': '#9467bd',           # purple
    'Heterocycles': '#8c564b',     # brown
    'Ketones': '#e377c2',          # pink
    'Aldehydes': '#7f7f7f',        # grey
    'Carboxylic_Acids': '#bcbd22', # yellow-green
    'Esters': '#17becf',           # cyan
    'Ethers': '#ff9896',           # light red
    'Others': '#c5b0d5',           # light purple
}

plt.rcParams['font.sans-serif'] = ['Arial']
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams.update({'font.size': 20})

plt.figure(figsize=(16, 12))

# One scatter series per (category, Model6 value): colour encodes the
# category, marker shape encodes the label (circle = 0, square = 1).
for category in tsne_df['Category'].unique():
    for model6_val in [0, 1]:
        mask = (tsne_df['Category'] == category) & (tsne_df['Model6'] == model6_val)
        subset = tsne_df[mask]
        
        if len(subset) > 0:
            marker = 'o' if model6_val == 0 else 's'
            color = category_colors.get(category, '#000000')
            label = f'{category} (0)' if model6_val == 0 else f'{category} (1)'
            
            plt.scatter(subset['TSNE1'], subset['TSNE2'], 
                        c=color, marker=marker, s=60, alpha=0.7,
                        label=label, edgecolors='white', linewidth=0.8)

# Reorder legend entries so every 'Others' entry comes last.
# Note: this rebinds the name `labels`, which earlier in this cell held the
# Model6 values.
handles, labels = plt.gca().get_legend_handles_labels()
indexed_labels = list(enumerate(labels))
sorted_index = [i for i, label in indexed_labels if 'Others' not in label] + \
               [i for i, label in indexed_labels if 'Others' in label]
sorted_handles = [handles[i] for i in sorted_index]
sorted_labels = [labels[i] for i in sorted_index]

# Legend is placed outside the axes, to the right.
plt.legend(sorted_handles, sorted_labels,
           bbox_to_anchor=(1.02, 1), loc='upper left',
           fontsize=19, frameon=True, fancybox=True, shadow=False,
           ncol=1, borderpad=1, labelspacing=1.2, handletextpad=1.2)

plt.title('t-SNE Visualization of Compounds\n', fontsize=40,pad=20)
plt.xlabel('t-SNE Dimension 1', fontsize=35)
plt.ylabel('t-SNE Dimension 2', fontsize=35)
plt.grid(True, alpha=0.2)
plt.tight_layout()

plt.savefig(r'D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\tsne_visualization_filtered.png',
            dpi=300, bbox_inches='tight')
plt.show()


In [None]:
# SHAP
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

plt.rcParams['font.family'] = 'Arial'
plt.rcParams['axes.unicode_minus'] = False
from sklearn.model_selection import train_test_split
# scikit-learn's random forest; `randomforest.RFClassifier` does not name an
# importable module/class, so the cell could not start.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import shap
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv(r'D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\rdkit_vivo_fingerprints.csv')

# Identifier columns are dropped when present.
df = df.drop(columns=['Chemical name', 'CAS Number'], errors='ignore')

# Every remaining column except the label and the SMILES text is a feature.
X = df.drop(['Model6', 'SMILES'], axis=1)
y = df['Model6']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=df['Model6']
)


# Random forest with a fixed seed. XGBoost-only constructor arguments
# (use_label_encoder, eval_metric) are not accepted by RandomForestClassifier.
model_rf = RandomForestClassifier(random_state=8)


# Only parameters RandomForestClassifier actually has; learning_rate,
# subsample and colsample_bytree would make GridSearchCV raise
# "Invalid parameter".
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
}

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=8)

grid_search = GridSearchCV(
    estimator=model_rf, param_grid=param_grid, scoring='accuracy',
    cv=kfold, verbose=1, n_jobs=-1
)

grid_search.fit(X_train, y_train)


rf = grid_search.best_estimator_


y_pred = rf.predict(X_test)


accuracy = accuracy_score(y_test, y_pred)
print(f"模型准确度: {accuracy * 100:.2f}%")
print("\n分类报告:")
print(classification_report(y_test, y_pred))


# SHAP analysis
explainer = shap.TreeExplainer(rf)
raw_shap_values = explainer.shap_values(X_test)

# For a classifier, shap_values is per class: a list of (n_samples, n_features)
# arrays in older SHAP releases, a single (n_samples, n_features, n_classes)
# array in newer ones. The dot/bar summary plots need ONE 2-D matrix, so keep
# the values for the class at index 1 of rf.classes_.
if isinstance(raw_shap_values, list):
    shap_values = raw_shap_values[1]
elif np.ndim(raw_shap_values) == 3:
    shap_values = raw_shap_values[:, :, 1]
else:
    shap_values = raw_shap_values


# Bee-swarm plot drawn on the primary axes.
fig, ax1 = plt.subplots(figsize=(10, 8), dpi=300)
shap.summary_plot(
    shap_values, X_test, feature_names=X_test.columns, plot_type="dot",
    show=False, color_bar=True
)


ax1 = plt.gca()
for item in ([ax1.title, ax1.xaxis.label, ax1.yaxis.label] +
             ax1.get_xticklabels() + ax1.get_yticklabels()):
    item.set_fontname('Arial')
    item.set_fontsize(16)


# Twin x-axis: it becomes the current axes, so the bar summary is drawn on it
# and shares the feature rows of the bee-swarm plot.
ax2 = ax1.twiny()
shap.summary_plot(shap_values, X_test, plot_type="bar", show=False)


for item in ([ax2.title, ax2.xaxis.label, ax2.yaxis.label] +
             ax2.get_xticklabels() + ax2.get_yticklabels()):
    item.set_fontname('Arial')
    item.set_fontsize(12)


ax1.set_xlabel('Shapley Value Contribution (Bee Swarm)', fontname='Arial', fontsize=12)
ax2.set_xlabel('Mean Shapley Value (Feature Importance)', fontname='Arial', fontsize=12)
ax2.xaxis.set_label_position('top')
ax2.xaxis.tick_top()
ax1.set_ylabel('Features', fontname='Arial', fontsize=12)

# Fade the bars so the bee-swarm points remain visible through them.
bars = ax2.patches
for bar in bars:
    bar.set_alpha(0.2)

plt.tight_layout()
plt.savefig(r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\VIVO_SHAP.png", 
            format='png', bbox_inches='tight', dpi=300)
plt.close()

print("SHAP图已保存。")

In [None]:
#AOP MODEL

In [None]:
#AHR模型
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, roc_curve, auc, confusion_matrix
import joblib


# Matplotlib defaults for every figure drawn in this cell: Arial text,
# 330 dpi for on-screen and saved figures, and explicit font sizes.
plt.rcParams.update({
    'font.family': 'Arial',
    'figure.dpi': 330,
    'savefig.dpi': 330,
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
})

# Load the AHR table. Columns "0".."1023" are used as features and "Model1"
# as the class label.
data_path = r"D:/个人/课题/NEU/数据/实验/MODEL/neu_AHR_predict_X.csv"
df = pd.read_csv(data_path)


X = df[[str(i) for i in range(1024)]]
y = df["Model1"]


# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Hyper-parameter grid; random_state is pinned inside the grid so every
# candidate forest (and the refit best estimator) is seeded identically.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, 20, None],
    'random_state': [42]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_
print(f"最佳参数: {best_params}")


# Score all four metrics from ONE pass over the five folds. Calling
# cross_val_score once per metric refits the same seeded forests four times
# and yields the same numbers.
from sklearn.model_selection import cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_validate(
    best_rf, X_train, y_train, cv=cv,
    scoring=["accuracy", "balanced_accuracy", "roc_auc", "f1"],
)
CV_acc = cv_scores["test_accuracy"].mean()
CV_bal_acc = cv_scores["test_balanced_accuracy"].mean()
CV_roc_auc = cv_scores["test_roc_auc"].mean()
CV_f1 = cv_scores["test_f1"].mean()
# NOTE(review): these CV scores use hyper-parameters that were selected on the
# same training data, so they may be optimistic — confirm this is acceptable.

print(f"5 折交叉验证 - 准确率: {CV_acc:.4f}")
print(f"5 折交叉验证 - 平衡准确率: {CV_bal_acc:.4f}")
print(f"5 折交叉验证 - ROC AUC: {CV_roc_auc:.4f}")
print(f"5 折交叉验证 - F1 分数: {CV_f1:.4f}")

# GridSearchCV (refit=True by default) has already fitted best_rf on X_train;
# the explicit refit with the same seed is kept so this step reads on its own.
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)[:, 1]  # probability column for classes_[1]


test_acc = accuracy_score(y_test, y_pred)
test_bal_acc = balanced_accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)
test_f1 = f1_score(y_test, y_pred)

print(f"测试集 - 准确率: {test_acc:.4f}")
print(f"测试集 - 平衡准确率: {test_bal_acc:.4f}")
print(f"测试集 - ROC AUC: {test_roc_auc:.4f}")
print(f"测试集 - F1 分数: {test_f1:.4f}")


# Out-of-fold probabilities (column 1 of predict_proba) for every training row,
# obtained with the same CV splitter used for the CV metrics.
y_train_prob = cross_val_predict(best_rf, X_train, y_train, cv=cv, method="predict_proba")[:, 1]


# ROC from the pooled out-of-fold predictions. This AUC is computed on the
# pooled curve, so it can differ from the mean of per-fold AUCs (CV_roc_auc).
fpr_cv, tpr_cv, _ = roc_curve(y_train, y_train_prob)
roc_auc_cv = auc(fpr_cv, tpr_cv)


# ROC on the held-out test set.
fpr_test, tpr_test, _ = roc_curve(y_test, y_prob)
roc_auc_test = auc(fpr_test, tpr_test)


# Draw both curves plus the chance diagonal on one square figure.
plt.figure(figsize=(7, 7))
plt.plot(fpr_cv, tpr_cv, label=f'5-Fold CV ROC (AUC = {roc_auc_cv:.3f})', linestyle='--', color='blue', linewidth=2)
plt.plot(fpr_test, tpr_test, label=f'Test ROC (AUC = {roc_auc_test:.3f})', color='red', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlabel('False Positive Rate (FPR)', fontname='Arial')
plt.ylabel('True Positive Rate (TPR)', fontname='Arial')
plt.title('AHR ROC Curve of RandomForest Classifier', fontname='Arial', fontweight='bold')
plt.legend(loc='lower right', prop={'family': 'Arial'})
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(r"D:/个人/课题/NEU/数据/实验/MODEL/AHR_roc_curve.png", bbox_inches='tight')
plt.show()


# Confusion matrices: one from out-of-fold CV predictions on the training set,
# one from the held-out test predictions.
conf_matrix_val = confusion_matrix(y_train, cross_val_predict(best_rf, X_train, y_train, cv=cv))
conf_matrix_test = confusion_matrix(y_test, y_pred)

# The summed matrix only drives the cell colours; the text in each cell
# reports the CV and test counts separately.
total_matrix = conf_matrix_val + conf_matrix_test


fig, ax = plt.subplots(figsize=(6, 6))
cax = ax.matshow(total_matrix, cmap="Blues")
fig.colorbar(cax)


cmap = plt.get_cmap("Blues")

for (i, j), value in np.ndenumerate(total_matrix):
    val_count = conf_matrix_val[i, j]
    test_count = conf_matrix_test[i, j]

    
    # Recover the colour matshow used for this cell and estimate its brightness
    # as the mean of the R, G, B channels.
    normalized_value = cax.norm(value)  
    rgba_color = cmap(normalized_value)  
    brightness = sum(rgba_color[:3]) / 3 

    
    # White text on dark cells, black text on light cells.
    text_color = "white" if brightness < 0.5 else "black"

    ax.text(j, i, f"CV: {val_count}\nTest: {test_count}", 
            ha='center', va='center', fontsize=10, color=text_color, fontname='Arial')

plt.xticks([0, 1], ["Negative", "Positive"], fontname='Arial')
plt.yticks([0, 1], ["Negative", "Positive"], fontname='Arial')
plt.xlabel("Predicted Label", fontname='Arial')
plt.ylabel("True Label", fontname='Arial')
plt.title("AHR Combined Confusion Matrix (CV + Test)", fontname='Arial', fontweight='bold')
plt.tight_layout()
plt.savefig(r"D:/个人/课题/NEU/数据/实验/MODEL/AHR_combined_conf_matrix.png", bbox_inches='tight')
plt.show()

# Save the model fitted on the training split; a later cell loads this file to
# fill the "Model1" column of the in-vivo table.
model_path = r"D:\个人\课题\NEU\数据\实验\MODEL\neu_AHR_rf_model.pkl"
joblib.dump(best_rf, model_path)
print(f"模型已保存至: {model_path}")

In [None]:
# Nrf2模型
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, roc_curve, auc, confusion_matrix
import joblib

# Matplotlib defaults for every figure drawn in this cell: Arial text,
# 330 dpi for on-screen and saved figures, and explicit font sizes.
plt.rcParams.update({
    'font.family': 'Arial',
    'figure.dpi': 330,
    'savefig.dpi': 330,
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
})


# Load the Nrf2 table. Columns "0".."1023" are used as features and "Model3"
# as the class label.
data_path = r"D:/个人/课题/NEU/数据/实验/MODEL/neu_NRF2_predict_X.csv"
df = pd.read_csv(data_path)


X = df[[str(i) for i in range(1024)]]
y = df["Model3"]


# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Hyper-parameter grid; random_state is pinned inside the grid so every
# candidate forest (and the refit best estimator) is seeded identically.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, 20, None],
    'random_state': [42]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)


best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_
print(f"最佳参数: {best_params}")


# Score all four metrics from ONE pass over the five folds. Calling
# cross_val_score once per metric refits the same seeded forests four times
# and yields the same numbers.
from sklearn.model_selection import cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_validate(
    best_rf, X_train, y_train, cv=cv,
    scoring=["accuracy", "balanced_accuracy", "roc_auc", "f1"],
)
CV_acc = cv_scores["test_accuracy"].mean()
CV_bal_acc = cv_scores["test_balanced_accuracy"].mean()
CV_roc_auc = cv_scores["test_roc_auc"].mean()
CV_f1 = cv_scores["test_f1"].mean()
# NOTE(review): these CV scores use hyper-parameters that were selected on the
# same training data, so they may be optimistic — confirm this is acceptable.

print(f"5 折交叉验证 - 准确率: {CV_acc:.4f}")
print(f"5 折交叉验证 - 平衡准确率: {CV_bal_acc:.4f}")
print(f"5 折交叉验证 - ROC AUC: {CV_roc_auc:.4f}")
print(f"5 折交叉验证 - F1 分数: {CV_f1:.4f}")


# GridSearchCV (refit=True by default) has already fitted best_rf on X_train;
# the explicit refit with the same seed is kept so this step reads on its own.
best_rf.fit(X_train, y_train)
y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)[:, 1]  # probability column for classes_[1]

test_acc = accuracy_score(y_test, y_pred)
test_bal_acc = balanced_accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)
test_f1 = f1_score(y_test, y_pred)

print(f"测试集 - 准确率: {test_acc:.4f}")
print(f"测试集 - 平衡准确率: {test_bal_acc:.4f}")
print(f"测试集 - ROC AUC: {test_roc_auc:.4f}")
print(f"测试集 - F1 分数: {test_f1:.4f}")


# Out-of-fold probabilities (column 1 of predict_proba) for every training row,
# obtained with the same CV splitter used for the CV metrics.
y_train_prob = cross_val_predict(best_rf, X_train, y_train, cv=cv, method="predict_proba")[:, 1]


# ROC from the pooled out-of-fold predictions. This AUC is computed on the
# pooled curve, so it can differ from the mean of per-fold AUCs (CV_roc_auc).
fpr_cv, tpr_cv, _ = roc_curve(y_train, y_train_prob)
roc_auc_cv = auc(fpr_cv, tpr_cv)


# ROC on the held-out test set.
fpr_test, tpr_test, _ = roc_curve(y_test, y_prob)
roc_auc_test = auc(fpr_test, tpr_test)

# Plot the ROC curves (CV and test) plus the chance diagonal.
plt.figure(figsize=(7, 7))
plt.plot(fpr_cv, tpr_cv, label=f'5-Fold CV ROC (AUC = {roc_auc_cv:.3f})', linestyle='--', color='blue', linewidth=2)
plt.plot(fpr_test, tpr_test, label=f'Test ROC (AUC = {roc_auc_test:.3f})', color='red', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlabel('False Positive Rate (FPR)', fontname='Arial')
plt.ylabel('True Positive Rate (TPR)', fontname='Arial')
plt.title('Nrf2 ROC Curve of RandomForest Classifier', fontname='Arial', fontweight='bold')
plt.legend(loc='lower right', prop={'family': 'Arial'})
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.savefig(r"D:/个人/课题/NEU/数据/实验/MODEL/Nrf2_roc_curve.png", bbox_inches='tight')
plt.show()


# Confusion matrices: one from out-of-fold CV predictions on the training set,
# one from the held-out test predictions.
conf_matrix_val = confusion_matrix(y_train, cross_val_predict(best_rf, X_train, y_train, cv=cv))
conf_matrix_test = confusion_matrix(y_test, y_pred)


# The summed matrix only drives the cell colours; the text in each cell
# reports the CV and test counts separately.
total_matrix = conf_matrix_val + conf_matrix_test

fig, ax = plt.subplots(figsize=(6, 6))
cax = ax.matshow(total_matrix, cmap="Blues")
fig.colorbar(cax)


cmap = plt.get_cmap("Blues")


for (i, j), value in np.ndenumerate(total_matrix):
    val_count = conf_matrix_val[i, j]
    test_count = conf_matrix_test[i, j]

   
    # Recover the colour matshow used for this cell and estimate its brightness
    # as the mean of the R, G, B channels.
    normalized_value = cax.norm(value)  
    rgba_color = cmap(normalized_value)  
    brightness = sum(rgba_color[:3]) / 3  

    
    # White text on dark cells, black text on light cells.
    text_color = "white" if brightness < 0.5 else "black"

    ax.text(j, i, f"CV: {val_count}\nTest: {test_count}", 
            ha='center', va='center', fontsize=10, color=text_color, fontname='Arial')

plt.xticks([0, 1], ["Negative", "Positive"], fontname='Arial')
plt.yticks([0, 1], ["Negative", "Positive"], fontname='Arial')
plt.xlabel("Predicted Label", fontname='Arial')
plt.ylabel("True Label", fontname='Arial')
plt.title("Nrf2 Combined Confusion Matrix (CV + Test)", fontname='Arial', fontweight='bold')
plt.tight_layout()
plt.savefig(r"D:/个人/课题/NEU/数据/实验/MODEL/NRF2_combined_conf_matrix.png", bbox_inches='tight')
plt.show()

# Save the model fitted on the training split; a later cell loads this file to
# fill the "Model3" column of the in-vivo table.
model_path = r"D:\个人\课题\NEU\数据\实验\MODEL\neu_NRF2_rf_model.pkl"
joblib.dump(best_rf, model_path)
print(f"模型已保存至: {model_path}")

In [None]:
#预测体内数据集，填充数据MODEL1 MODEL3
import pandas as pd
import joblib


# Input table, the two saved classifiers, and the output table.
data_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\rdkit_vivo_fingerprints.csv"
ahr_model_path = r"D:\个人\课题\NEU\数据\实验\MODEL\neu_AHR_rf_model.pkl"
nrf2_model_path = r"D:\个人\课题\NEU\数据\实验\MODEL\neu_NRF2_rf_model.pkl"
output_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\rdkit_vivo_fingerprints_predicted.csv"

df = pd.read_csv(data_path)

# Fingerprint columns "0".."1023" are the only inputs of both models.
feature_cols = list(map(str, range(1024)))
df_features = df[feature_cols]

# joblib.load unpickles the file: only load model files you created yourself.
ahr_model = joblib.load(ahr_model_path)
nrf2_model = joblib.load(nrf2_model_path)

# Write each model's predicted class into its own column of the table.
for target_column, fitted_model in (("Model1", ahr_model), ("Model3", nrf2_model)):
    df[target_column] = fitted_model.predict(df_features)

df.to_csv(output_path, index=False)

print(f"预测完成，结果已保存至: {output_path}")


In [None]:
#AOP模型  
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, roc_curve, auc, confusion_matrix
import joblib

# Figure defaults for this cell: Arial text, 330 dpi both on screen and when
# saving, and explicit font sizes for titles, axis labels, ticks and legends.
plt.rcParams.update({
    'font.family': 'Arial',
    'figure.dpi': 330,
    'savefig.dpi': 330,
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
})


# Table written by the preceding prediction cell: the 1024 numbered columns
# plus the Model1 / Model3 predictions; Model6 is the target.
data_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\rdkit_vivo_fingerprints_predicted.csv"
df = pd.read_csv(data_path)

feature_cols = [str(i) for i in range(1024)] + ["Model1", "Model3"]
X = df[feature_cols]
y = df["Model6"]

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Grid over forest size and depth; random_state is pinned through the grid so
# every candidate (and the refitted best estimator) is deterministic.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, 20, None],
    'random_state': [42]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

# With the default refit=True, best_estimator_ has already been refitted on
# the whole of X_train / y_train with the winning parameters.
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_
print(f"最佳参数: {best_params}")

# Score all four metrics in a single cross-validation pass (5 fits) instead of
# one cross_val_score call per metric (20 fits). The folds and the estimator
# seed are fixed, so the per-fold scores are the same as before.
from sklearn.model_selection import cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    best_rf, X_train, y_train, cv=cv,
    scoring=["accuracy", "balanced_accuracy", "roc_auc", "f1"],
)
CV_acc = cv_results["test_accuracy"].mean()
CV_bal_acc = cv_results["test_balanced_accuracy"].mean()
CV_roc_auc = cv_results["test_roc_auc"].mean()
CV_f1 = cv_results["test_f1"].mean()

print(f"5 折交叉验证 - 准确率: {CV_acc:.4f}")
print(f"5 折交叉验证 - 平衡准确率: {CV_bal_acc:.4f}")
print(f"5 折交叉验证 - ROC AUC: {CV_roc_auc:.4f}")
print(f"5 折交叉验证 - F1 分数: {CV_f1:.4f}")

# Hold-out evaluation. No extra best_rf.fit(...) is needed here: cross_validate
# works on clones, so best_rf is still the model GridSearchCV refitted on the
# full training split.
y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)[:, 1]  # column 1 = second class

test_acc = accuracy_score(y_test, y_pred)
test_bal_acc = balanced_accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)
test_f1 = f1_score(y_test, y_pred)

print(f"测试集 - 准确率: {test_acc:.4f}")
print(f"测试集 - 平衡准确率: {test_bal_acc:.4f}")
print(f"测试集 - ROC AUC: {test_roc_auc:.4f}")
print(f"测试集 - F1 分数: {test_f1:.4f}")


# Out-of-fold class probabilities for the training split; column 1 (the second
# class) is the score used for the cross-validation ROC curve.
oof_proba = cross_val_predict(best_rf, X_train, y_train, cv=cv, method="predict_proba")
y_train_prob = oof_proba[:, 1]

# ROC points and area for the cross-validated and the hold-out predictions.
fpr_cv, tpr_cv, _ = roc_curve(y_train, y_train_prob)
fpr_test, tpr_test, _ = roc_curve(y_test, y_prob)
roc_auc_cv = auc(fpr_cv, tpr_cv)
roc_auc_test = auc(fpr_test, tpr_test)

roc_fig, roc_ax = plt.subplots(figsize=(7, 7))
roc_ax.plot(
    fpr_cv, tpr_cv,
    label=f'5-Fold CV ROC (AUC = {roc_auc_cv:.3f})',
    linestyle='--', color='blue', linewidth=2,
)
roc_ax.plot(
    fpr_test, tpr_test,
    label=f'Test ROC (AUC = {roc_auc_test:.3f})',
    color='red', linewidth=2,
)
# Diagonal reference line (chance level).
roc_ax.plot([0, 1], [0, 1], 'k--', lw=2)
roc_ax.set_xlabel('False Positive Rate (FPR)', fontname='Arial')
roc_ax.set_ylabel('True Positive Rate (TPR)', fontname='Arial')
roc_ax.set_title('AOP ROC Curve of RandomForest Classifier', fontname='Arial', fontweight='bold')
roc_ax.legend(loc='lower right', prop={'family': 'Arial'})
roc_ax.grid(True, alpha=0.3)
roc_fig.tight_layout()
roc_fig.savefig(r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\AOP_roc_curve.png", bbox_inches='tight')
plt.show()


# Confusion matrices: out-of-fold predictions on the training split, and the
# hold-out predictions already stored in y_pred.
conf_matrix_val = confusion_matrix(y_train, cross_val_predict(best_rf, X_train, y_train, cv=cv))
conf_matrix_test = confusion_matrix(y_test, y_pred)

# The heat-map colours encode the element-wise sum of the two matrices.
total_matrix = conf_matrix_val + conf_matrix_test


fig, ax = plt.subplots(figsize=(6, 6))
cax = ax.matshow(total_matrix, cmap="Blues")
fig.colorbar(cax)

# Bind the colormap to a name: the loop below looks colours up through `cmap`.
# Previously the result of get_cmap was discarded, so this cell only ran when
# some earlier cell had left a `cmap` variable behind in the kernel.
cmap = plt.get_cmap("Blues")


for (i, j), value in np.ndenumerate(total_matrix):
    val_count = conf_matrix_val[i, j]
    test_count = conf_matrix_test[i, j]

    # Reproduce the colour drawn for this cell: normalise the count the same
    # way the image does, map it through the colormap, and average R, G, B.
    normalized_value = cax.norm(value)
    rgba_color = cmap(normalized_value)
    brightness = sum(rgba_color[:3]) / 3

    # White text on dark cells, black text on light ones.
    text_color = "white" if brightness < 0.5 else "black"

    ax.text(j, i, f"CV: {val_count}\nTest: {test_count}",
            ha='center', va='center', fontsize=10, color=text_color, fontname='Arial')

plt.xticks([0, 1], ["Negative", "Positive"], fontname='Arial')
plt.yticks([0, 1], ["Negative", "Positive"], fontname='Arial')
plt.xlabel("Predicted Label", fontname='Arial')
plt.ylabel("True Label", fontname='Arial')
plt.title("AOP Combined Confusion Matrix (CV + Test)", fontname='Arial', fontweight='bold')
plt.tight_layout()
plt.savefig(r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\AOP_combined_conf_matrix.png", bbox_inches='tight')
plt.show()

# Save the fitted model.
model_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\neu_AOP_rf_model.pkl"
joblib.dump(best_rf, model_path)
print(f"模型已保存至: {model_path}")

In [None]:
#VIRTUAL SCREENING
import pandas as pd
import joblib


# Screening input, the three fitted classifiers, and the Excel output.
# NOTE(review): the training cells visible in this notebook save their models
# as neu_NRF2_rf_model.pkl / neu_AOP_rf_model.pkl, whereas the files loaded
# here have no "neu_" prefix -- confirm these are the intended models.
data_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\数据集\虚拟筛选\TOX21化合物\TOX21化合物_VIVO\predict_X.csv"
ahr_model_path = r"D:\个人\课题\NEU\数据\实验\MODEL\AHR_rf_model.pkl"
nrf2_model_path = r"D:\个人\课题\NEU\数据\实验\MODEL\NRF2_rf_model.pkl"
aop_model_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\AOP_rf_model.pkl"
output_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\数据集\虚拟筛选\TOX21化合物\TOX21_AOP筛选结果.xlsx"

df = pd.read_csv(data_path)

# The first two models see only the 1024 columns named "0" .. "1023".
feature_cols = list(map(str, range(1024)))
X = df.loc[:, feature_cols]

# Stage 1: Model1 label from the AHR classifier.
ahr_model = joblib.load(ahr_model_path)
model1_labels = ahr_model.predict(X)
df["Model1"] = model1_labels

# Stage 2: Model3 label from the NRF2 classifier.
nrf2_model = joblib.load(nrf2_model_path)
model3_labels = nrf2_model.predict(X)
df["Model3"] = model3_labels

# Stage 3: the AOP classifier takes the same 1024 columns followed by the two
# predicted labels, in that order.
X_aop = df[[*feature_cols, "Model1", "Model3"]]
aop_model = joblib.load(aop_model_path)
df["Model6"] = aop_model.predict(X_aop)

df.to_excel(output_path, index=False)
print(f"预测结果已保存至: {output_path}")


In [None]:
#BBB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score, roc_auc_score, f1_score, roc_curve, auc, confusion_matrix
import joblib
import os

# Figure defaults for this cell: Arial text, 330 dpi both on screen and when
# saving, and explicit font sizes for titles, axis labels, ticks and legends.
plt.rcParams.update({
    'font.family': 'Arial',
    'figure.dpi': 330,
    'savefig.dpi': 330,
    'font.size': 12,
    'axes.titlesize': 14,
    'axes.labelsize': 12,
    'xtick.labelsize': 10,
    'ytick.labelsize': 10,
    'legend.fontsize': 10,
})


# Training table for the BBB classifier: the 1024 numbered columns are the
# features and Model5 is the target.
data_path = r"D:/个人/课题/NEU/数据/实验/MODEL/neu_BBB_predict_X.csv"
df = pd.read_csv(data_path)

X = df[[str(i) for i in range(1024)]]
y = df["Model5"]  # target column

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Grid over forest size and depth; random_state is pinned through the grid so
# every candidate (and the refitted best estimator) is deterministic.
param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'max_depth': [5, 10, 20, None],
    'random_state': [42]
}

rf = RandomForestClassifier()
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters. With the default refit=True, best_estimator_ has already
# been refitted on the whole of X_train / y_train.
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_
print(f"最佳参数: {best_params}")


# Score all four metrics in a single cross-validation pass (5 fits) instead of
# one cross_val_score call per metric (20 fits). The folds and the estimator
# seed are fixed, so the per-fold scores are the same as before.
from sklearn.model_selection import cross_validate

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_results = cross_validate(
    best_rf, X_train, y_train, cv=cv,
    scoring=["accuracy", "balanced_accuracy", "roc_auc", "f1"],
)
CV_acc = cv_results["test_accuracy"].mean()
CV_bal_acc = cv_results["test_balanced_accuracy"].mean()
CV_roc_auc = cv_results["test_roc_auc"].mean()
CV_f1 = cv_results["test_f1"].mean()

print(f"5 折交叉验证 - 准确率: {CV_acc:.4f}")
print(f"5 折交叉验证 - 平衡准确率: {CV_bal_acc:.4f}")
print(f"5 折交叉验证 - ROC AUC: {CV_roc_auc:.4f}")
print(f"5 折交叉验证 - F1 分数: {CV_f1:.4f}")


# Hold-out evaluation. No extra best_rf.fit(...) is needed here: cross_validate
# works on clones, so best_rf is still the model GridSearchCV refitted on the
# full training split.
y_pred = best_rf.predict(X_test)
y_prob = best_rf.predict_proba(X_test)[:, 1]  # column 1 = second class


test_acc = accuracy_score(y_test, y_pred)
test_bal_acc = balanced_accuracy_score(y_test, y_pred)
test_roc_auc = roc_auc_score(y_test, y_prob)
test_f1 = f1_score(y_test, y_pred)

print(f"测试集 - 准确率: {test_acc:.4f}")
print(f"测试集 - 平衡准确率: {test_bal_acc:.4f}")
print(f"测试集 - ROC AUC: {test_roc_auc:.4f}")
print(f"测试集 - F1 分数: {test_f1:.4f}")


# Out-of-fold class probabilities for the training split; column 1 (the second
# class) is the score used for the cross-validation ROC curve.
oof_proba = cross_val_predict(best_rf, X_train, y_train, cv=cv, method="predict_proba")
y_train_prob = oof_proba[:, 1]

# ROC points and area for the cross-validated and the hold-out predictions.
fpr_cv, tpr_cv, _ = roc_curve(y_train, y_train_prob)
fpr_test, tpr_test, _ = roc_curve(y_test, y_prob)
roc_auc_cv = auc(fpr_cv, tpr_cv)
roc_auc_test = auc(fpr_test, tpr_test)

# Output folder shared by the figures and the model file of this cell.
save_dir = r"D:/个人/课题/NEU/数据/实验/MODEL/"
os.makedirs(save_dir, exist_ok=True)

roc_fig, roc_ax = plt.subplots(figsize=(7, 7))
roc_ax.plot(
    fpr_cv, tpr_cv,
    label=f'5-Fold CV ROC (AUC = {roc_auc_cv:.3f})',
    linestyle='--', color='blue', linewidth=2,
)
roc_ax.plot(
    fpr_test, tpr_test,
    label=f'Test ROC (AUC = {roc_auc_test:.3f})',
    color='red', linewidth=2,
)
# Diagonal reference line (chance level).
roc_ax.plot([0, 1], [0, 1], 'k--', lw=2)
roc_ax.set_xlabel('False Positive Rate (FPR)', fontname='Arial')
roc_ax.set_ylabel('True Positive Rate (TPR)', fontname='Arial')
roc_ax.set_title('BBB ROC Curve of RandomForest Classifier', fontname='Arial', fontweight='bold')
roc_ax.legend(loc='lower right', prop={'family': 'Arial'})
roc_ax.grid(True, alpha=0.3)
roc_fig.tight_layout()
roc_fig.savefig(os.path.join(save_dir, "BBB_roc_curve.png"), bbox_inches='tight')
plt.show()


# Confusion matrices from the out-of-fold training predictions and from the
# hold-out predictions; their element-wise sum drives the heat-map colours.
cv_pred_labels = cross_val_predict(best_rf, X_train, y_train, cv=cv)
conf_matrix_val = confusion_matrix(y_train, cv_pred_labels)
conf_matrix_test = confusion_matrix(y_test, y_pred)
total_matrix = conf_matrix_val + conf_matrix_test

fig, ax = plt.subplots(figsize=(6, 6))
cax = ax.matshow(total_matrix, cmap="Blues")
fig.colorbar(cax)

cmap = plt.get_cmap("Blues")

# Annotate every cell with its CV and test counts. The text colour is chosen
# from the mean of the R, G, B components of the colour drawn for that cell.
for row, col in np.ndindex(*total_matrix.shape):
    red, green, blue = cmap(cax.norm(total_matrix[row, col]))[:3]
    if (red + green + blue) / 3 < 0.5:
        text_color = "white"
    else:
        text_color = "black"

    ax.text(
        col, row,
        f"CV: {conf_matrix_val[row, col]}\nTest: {conf_matrix_test[row, col]}",
        ha='center', va='center', fontsize=10, color=text_color, fontname='Arial',
    )

class_names = ["Negative", "Positive"]
ax.set_xticks([0, 1])
ax.set_xticklabels(class_names, fontname='Arial')
ax.set_yticks([0, 1])
ax.set_yticklabels(class_names, fontname='Arial')
ax.set_xlabel("Predicted Label", fontname='Arial')
ax.set_ylabel("True Label", fontname='Arial')
ax.set_title("BBB Combined Confusion Matrix (CV + Test)", fontname='Arial', fontweight='bold')
fig.tight_layout()

fig.savefig(os.path.join(save_dir, "BBB_combined_conf_matrix.png"), bbox_inches='tight')
plt.show()

# Save the fitted model next to the figures.
model_path = os.path.join(save_dir, "neu_BBB_rf_model.pkl")
joblib.dump(best_rf, model_path)
print(f"模型已保存至: {model_path}")

In [None]:
#BBB筛选
import pandas as pd
import joblib
# Input: the Excel table written by the virtual-screening cell; output: the
# same table with the BBB prediction columns appended.
# NOTE(review): the BBB training cell saves its model as neu_BBB_rf_model.pkl,
# whereas the file loaded here is BBB_rf_model.pkl -- confirm this is the
# intended model.
data_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\数据集\虚拟筛选\TOX21化合物\TOX21_AOP筛选结果.xlsx"
model_path = r"D:\个人\课题\NEU\数据\实验\MODEL\BBB_rf_model.pkl"
output_path = r"D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\数据集\虚拟筛选\TOX21化合物\TOX21_AOP筛选结果_BBB预测.xlsx"

df = pd.read_excel(data_path)

# The classifier is fed the 1024 columns named "0" .. "1023".
feature_columns = list(map(str, range(1024)))
X = df.loc[:, feature_columns]

model = joblib.load(model_path)

# Predicted label first, then the probability of the second class, so the
# column order of the output file is unchanged.
predicted_labels = model.predict(X)
class_probabilities = model.predict_proba(X)
df["Model5"] = predicted_labels
df["Model5_Probability"] = class_probabilities[:, 1]

# Save the prediction results.
df.to_excel(output_path, index=False)

print(f"预测完成，结果已保存至: {output_path}")


In [None]:
#AD
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib
from sklearn.preprocessing import StandardScaler
from numpy.linalg import inv

plt.rcParams['font.family'] = 'Arial'
plt.rcParams['font.size'] = 14

# Screening table and the fitted AOP forest.
df = pd.read_excel(r'D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\数据集\虚拟筛选\TOX21化合物\TOX21_终筛结果_fingerprints.xlsx')
model = joblib.load(r'D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\AOP_rf_model.pkl')


# Every column except the identifier / label columns is treated as a feature.
# NOTE(review): the remaining columns must match, in number and order, the
# features the model was fitted on -- confirm against the spreadsheet.
X = df.drop(columns=['Model6', 'CAS Number', 'Chemical name', 'SMILES', 'Model5'])


# Predict on the RAW feature values. The forests in this notebook are fitted
# and applied on unscaled inputs; feeding standardised values would compare
# them against split thresholds learned on a different scale and change the
# predicted probabilities. A plain array is passed (as before), so column
# names are not checked against those seen during fitting.
y_prob = model.predict_proba(X.to_numpy())[:, 1]

# Standardised copy of the features, used only for the leverage calculation.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_mat = np.asarray(X_scaled)

# Leverage h_i = x_i (X'X)^+ x_i', i.e. the diagonal of the hat matrix.
#  * The pseudo-inverse is used because X'X is singular whenever a column is
#    constant (it becomes all zeros after scaling) or there are fewer rows
#    than columns; a plain inverse then fails or is numerically meaningless.
#  * Only the diagonal is needed, so it is computed row by row instead of
#    materialising the full n x n hat matrix.
# NOTE(review): X'X is built from the screening set itself; an applicability
# domain is usually defined against the training-set descriptor matrix --
# confirm this is intended.
xtx_pinv = np.linalg.pinv(X_mat.T @ X_mat)
leverage = ((X_mat @ xtx_pinv) * X_mat).sum(axis=1)

# Warning leverage h* = 3 (p + 1) / n, with p features and n compounds.
p = X.shape[1]
n = X.shape[0]
leverage_threshold = 3 * (p + 1) / n

# Flag each compound as inside / outside the applicability domain.
in_ad = leverage <= leverage_threshold

df['AD_Status'] = ['Inside AD' if status else 'Outside AD' for status in in_ad]
df['Predicted_Probability'] = y_prob
df['Leverage'] = leverage

output_path = r'D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\TOX21_virtualscreen_with_AD.xlsx'
df.to_excel(output_path, index=False)
print(f"已保存带有AD状态的数据表格到: {output_path}")

# Compound counts on each side of the leverage threshold, reused for the
# jitter sizes, the legend labels and the printed summary.
inside_count = np.sum(in_ad)
outside_count = np.sum(~in_ad)

fig, ax = plt.subplots(figsize=(10, 6))

ax.axvline(x=leverage_threshold, color='purple', linestyle='--', linewidth=2,
           label=f'Leverage Threshold: {leverage_threshold:.4f}')

# The vertical position carries no data: each group is spread around +0.5 or
# -0.5 with seeded Gaussian jitter so overlapping points remain visible.
np.random.seed(42)
y_jitter_ad = np.random.normal(0.5, 0.1, inside_count)
y_jitter_out = np.random.normal(-0.5, 0.1, outside_count)

# Draw the inside-AD and outside-AD groups.
ax.scatter(leverage[in_ad], y_jitter_ad,
           c='green', s=30, alpha=0.6, edgecolors='black', linewidth=0.5,
           label=f'Inside AD ({inside_count})')
ax.scatter(leverage[~in_ad], y_jitter_out,
           c='red', s=100, alpha=0.8, edgecolors='black', linewidth=1,
           label=f'Outside AD ({outside_count})')

ax.set_xlabel("Leverage", fontsize=14)
ax.set_ylabel("Distribution (jittered)", fontsize=14)
ax.set_title("Williams Plot for Virtual Screening Dataset", fontsize=16)
ax.legend()
ax.grid(True, linestyle='--', alpha=0.3)
ax.set_ylim(-1.5, 1.5)

fig.tight_layout()
fig.savefig(r'D:\个人\课题\NEU\数据\神经毒性体内数据\体内数据\williams_plot_virtualscreen.png', dpi=300)
plt.show()

# Text summary: share of compounds inside the domain, threshold, and counts.
ad_coverage = np.mean(in_ad)
print(f"\n虚拟筛选数据集 AD 内占比: {ad_coverage*100:.2f}%")
print(f"Leverage 阈值: {leverage_threshold:.4f}")
print(f"AD内化合物数量: {inside_count}")
print(f"AD外化合物数量: {outside_count}")
