In [None]:
import os

import catboost
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import shap
from catboost import CatBoostClassifier, Pool
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [None]:
# CatBoost classifier used for the global SHAP analysis.
# The hyper-parameters are collected in one mapping and unpacked into the constructor.
cat_model_params = {
    "depth": 6,
    "learning_rate": 0.04768599666111316,
    "n_estimators": 113,
    "l2_leaf_reg": 5.29032308340007,
    "random_state": 0,  # fixed seed
    "verbose": 0,       # no training log output
}
cat_model = CatBoostClassifier(**cat_model_params)

In [None]:
# SHAP-based feature-importance ranking for the 54 features, computed from the optimal CatBoost (CAT) model.

In [None]:
# Load the descriptor table: the first 54 columns are used as features and
# the column at position 54 (the 55th) as the target.
ML_data = pd.read_csv("DES CAT.csv", sep=',')
X = ML_data.iloc[:, 0:54]
feature_names = ML_data.columns[0:54]
y = ML_data.iloc[:, 54]

# Fit on the full table; the fitted model is only used here to obtain SHAP values.
cat_model.fit(X, y)

# Compute SHAP values for every row of X.
explainer = shap.TreeExplainer(cat_model)
shap_values = explainer.shap_values(X)

# Reduce multi-output results to the positive class (index 1) so that the
# per-feature means below are 1-D. Older SHAP releases return a list with one
# array per class; SHAP >= 0.45 returns a single array of shape
# (n_samples, n_features, n_outputs) instead, which the list check alone misses.
if isinstance(shap_values, list):
    shap_values = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values = shap_values[:, :, 1]

# Per-feature summary: mean |SHAP| (magnitude) and mean SHAP (signed direction).
shap_df = pd.DataFrame({
    'feature': feature_names,
    'shap_abs': np.mean(np.abs(shap_values), axis=0),
    'shap_mean': np.mean(shap_values, axis=0)
})

# Rank features by magnitude, largest first.
shap_df = shap_df.sort_values(by='shap_abs', ascending=False)

# Save the ranked table (the DataFrame index is written as the first column).
shap_df.to_csv("CAT shap.csv", sep=',')

In [None]:
# Order the features by signed mean SHAP value, largest first.
df_sorted = shap_df.sort_values("shap_mean", ascending=False)

# Split the ordered table by sign; a mean of exactly zero goes to the positive group.
signed_mean = df_sorted["shap_mean"]
positive_df = df_sorted.loc[signed_mean.ge(0)]
negative_df = df_sorted.loc[signed_mean.lt(0)]

In [None]:
# Write the table ordered by signed mean SHAP value to disk (index column included).
sorted_csv_path = "CAT shap sorted.csv"
df_sorted.to_csv(sorted_csv_path, sep=',')

In [None]:
# Diverging bar chart of the signed mean SHAP value per feature, in df_sorted order.
# Use Arial for all text drawn from here on.
plt.rcParams.update({'font.family': 'Arial'})

# Wide canvas: one vertical bar per feature along the x axis.
fig, ax = plt.subplots(figsize=(14, 7))

bar_width = 0.8
mean_shap = df_sorted['shap_mean']
indices = np.arange(len(df_sorted))

# One bar group per sign: (rows of that sign, positional mask, colour, legend label).
bar_groups = (
    (positive_df, mean_shap >= 0, "#FF0D57", 'Positive Impact'),
    (negative_df, mean_shap < 0, "#1E88E5", 'Negative Impact'),
)
for group_df, group_mask, group_color, group_label in bar_groups:
    if group_df.empty:
        continue  # nothing to draw for this sign
    ax.bar(
        indices[group_mask],  # x positions of this group's features within df_sorted
        group_df['shap_mean'],
        color=group_color,
        width=bar_width,
        alpha=1,
        label=group_label,
    )

# One tick per feature, labelled with the feature name and rotated to fit.
ax.set_xticks(range(len(df_sorted)))
ax.set_xticklabels(df_sorted['feature'], rotation=90, ha='center', fontsize=15)
plt.yticks(fontsize=15)  # larger y tick labels

# Title and axis labels.
ax.set_title('Feature Importance', fontsize=20, weight="bold")
ax.set_ylabel('SHAP value', fontsize=20, weight="bold")
ax.set_xlabel('Descriptors', fontsize=20, weight="bold")

# Dashed reference line at zero.
ax.axhline(y=0, color='red', linestyle='--', alpha=0.7)

# Symmetric y range around zero with a 5% margin on each side.
lowest_mean = min(mean_shap)
highest_mean = max(mean_shap)
max_abs_value = max(abs(lowest_mean), abs(highest_mean))
y_margin = max_abs_value * 0.05
ax.set_ylim(-max_abs_value - y_margin, max_abs_value + y_margin)
ax.grid(axis='y', linestyle='--', alpha=0.3)

ax.legend(fontsize=20, loc='upper right')

# Layout: the explicit margins leave room at the bottom for the rotated labels.
fig.tight_layout()
fig.subplots_adjust(left=0.1, right=0.95, top=0.9, bottom=0.3)

# Save a high-resolution PNG, then display the figure.
plt.savefig("Feature Importance (SHAP value).png", dpi=900, format="png")
plt.show()

In [None]:
# SHAP local explanations: one force plot per sample

In [None]:
# Load the samples to explain. The column at position 54 is taken as the label
# (the original note assumes it is the last column); every other column is a feature.
data = pd.read_csv("Prediction.csv")  # replace with the path to your data file
label_column = data.columns[54]
X = data.drop(columns=[label_column])
y = data[label_column]

# CatBoost classifier with fixed hyper-parameters.
best_model = CatBoostClassifier(
    depth=6,
    learning_rate=0.04768599666111316,
    n_estimators=113,
    l2_leaf_reg=5.29032308340007,
    random_state=0,
    verbose=0  # no training log output
)

# Fit on these samples so SHAP values can be computed for them.
best_model.fit(X, y)

# Report the feature columns fed to the model.
selected_features = X.columns
print("被选择的特征：", selected_features)

# Build the explainer and compute SHAP values once. An earlier
# shap.Explainer(best_model, X) pass whose results were immediately overwritten
# by these two assignments has been removed; it only cost time.
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X)

# Keep the positive class (index 1) if SHAP reports one result per class, either as
# the legacy list or as an (n_samples, n_features, n_outputs) array, so that
# shap_values[i] below is the 1-D contribution vector of sample i.
base_value = explainer.expected_value
if isinstance(shap_values, list):
    shap_values = shap_values[1]
    base_value = base_value[1]
elif np.ndim(shap_values) == 3:
    shap_values = shap_values[:, :, 1]
    base_value = base_value[1]

# Write one interactive force plot per sample as a standalone HTML file.
feature_name_list = X.columns.tolist()  # loop-invariant, computed once
for i in range(len(X)):
    force_plot = shap.force_plot(
        base_value=base_value,
        shap_values=shap_values[i],
        features=X.iloc[i],
        feature_names=feature_name_list,
        matplotlib=False,  # HTML/JS output rather than a matplotlib figure
        show=False
    )
    shap.save_html(f"force_plot_sample_{i}.html", force_plot)

print(f"成功生成 {len(X)} 个SHAP力图")

In [None]:
# Per-sample SHAP bar plots, saved as "feature importance/SHAP Local Feature Importance for Sample {i}.png"

In [None]:
# Render one SHAP bar chart per sample and save each as a PNG under "feature importance/".
# savefig does not create missing directories, so make sure the folder exists first.
os.makedirs("feature importance", exist_ok=True)

# Calling the explainer returns an indexable result; element i is passed to the bar plot for sample i.
shap_values_explanation = explainer(X)
for i in range(len(shap_values_explanation)):
    fig = plt.figure(figsize=(10, 6))
    # show=False keeps the figure open so it can be titled and saved below.
    shap.plots.bar(shap_values_explanation[i], show_data=True, show=False)
    plt.title(f'SHAP Local Feature Importance for Sample {i}', fontsize=14)
    plt.xlabel('SHAP Value', fontsize=12)
    plt.ylabel('Features', fontsize=12)
    plt.savefig(f"feature importance/SHAP Local Feature Importance for Sample {i}.png", dpi=900, format='png', bbox_inches='tight')
    plt.show()
    # Release the figure so open figures do not accumulate across samples.
    plt.close(fig)

In [None]:
# SHAP beeswarm (summary) plot

In [None]:
# Beeswarm (dot) summary of the SHAP values, showing up to 54 features.
plt.figure(figsize=(12, 8))
plt.rcParams.update({'font.family': 'Arial'})
shap.summary_plot(
    shap_values,
    X,
    feature_names=selected_features,
    plot_type="dot",
    show=False,  # keep the figure open so it can be titled and saved below
    max_display=54,
)

# Title and axis labels go on whichever axes is current after the summary plot.
plt.gca().set_title('SHAP Values - Bee Swarm Plot', fontsize=14)
plt.gca().set_xlabel('SHAP Value', fontsize=12)
plt.gca().set_ylabel('Features', fontsize=12)

# Save as TIFF, then display.
plt.savefig('shap_beeswarm_CAT 54_2.tiff', dpi=300, format='tiff', bbox_inches='tight')
plt.show()