In [2]:
# import necessary libraries 
import os 
import pickle
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")


In [4]:
# set variables
dir_prefix = '0.processed data/'
base_dir = dir_prefix
if not os.path.exists(dir_prefix):
    os.makedirs(dir_prefix)
    print(f"已创建目录: {dir_prefix}")
else:
    print(f"目录已存在: {dir_prefix}")

# use glob to get all the csv files 
# in the folder 
omics_files = [ch for ch in os.listdir(os.getcwd()+'/'+base_dir) if '.csv' in ch]
omics_files = sorted(omics_files)

# we don't use the RNA data for healthy samples

omics_files

目录已存在: 0.processed data/


['LSCC_combined_proteomics_processed.csv',
 'LSCC_control_proteomics_processed.csv',
 'LSCC_tumor_proteomics_processed.csv']

In [6]:
import pandas as pd
import numpy as np

def normalize_columns(df, col1='prot1', col2='prot2'):
    """
    规范化prot1和prot2列：确保每行中prot1 <= prot2
    这样 (A, B) 和 (B, A) 会被视为相同的连接
    """
    df_normalized = df.copy()
    
    # 确保prot1和prot2列存在
    if col1 not in df.columns or col2 not in df.columns:
        raise ValueError(f"数据框必须包含 '{col1}' 和 '{col2}' 列")
    
    # 创建规范化版本
    prot1_norm = np.where(df[col1] <= df[col2], df[col1], df[col2])
    prot2_norm = np.where(df[col1] <= df[col2], df[col2], df[col1])
    
    # 添加规范化列
    df_normalized[f'{col1}_norm'] = prot1_norm
    df_normalized[f'{col2}_norm'] = prot2_norm
    
    return df_normalized

def merge_preserve_left(left_df, right_df, col1='prot1', col2='prot2'):
    """
    拼接两个数据框，保留左侧数据，并处理prot1和prot2列的顺序问题
    """
    # 规范化两个数据框的prot1和prot2列
    left_norm = normalize_columns(left_df, col1, col2)
    right_norm = normalize_columns(right_df, col1, col2)
    
    # 执行左连接合并，基于规范化后的列
    merged_df = pd.merge(
        left_norm,
        right_norm,
        left_on=[f'{col1}_norm', f'{col2}_norm'],
        right_on=[f'{col1}_norm', f'{col2}_norm'],
        how='left',
        suffixes=('_left', '_right')
    )
    
    # 删除规范化过程中添加的临时列
    merged_df.drop([f'{col1}_norm', f'{col2}_norm'], axis=1, inplace=True)
    
    # 处理prot1和prot2列 - 保留左侧的原始值
    # 首先删除右侧的prot1和prot2列
    if f'{col1}_right' in merged_df.columns:
        merged_df.drop(f'{col1}_right', axis=1, inplace=True)
    if f'{col2}_right' in merged_df.columns:
        merged_df.drop(f'{col2}_right', axis=1, inplace=True)
    
    # 重命名左侧的prot1和prot2列，去掉后缀
    if f'{col1}_left' in merged_df.columns:
        merged_df.rename(columns={f'{col1}_left': col1}, inplace=True)
    if f'{col2}_left' in merged_df.columns:
        merged_df.rename(columns={f'{col2}_left': col2}, inplace=True)
    
    # 处理其他重复列（除了prot1和prot2列）
    left_cols = set(left_df.columns)
    right_cols = set(right_df.columns)
    common_cols = left_cols.intersection(right_cols) - {col1, col2}
    
    # 对于重复列，优先保留左侧的值
    for col in common_cols:
        left_col = f'{col}_left'
        right_col = f'{col}_right'
        
        if left_col in merged_df.columns and right_col in merged_df.columns:
            # 如果左侧列有值，优先使用左侧列
            merged_df[col] = merged_df[left_col].combine_first(merged_df[right_col])
            
            # 删除临时列
            merged_df.drop([left_col, right_col], axis=1, inplace=True)
    
    return merged_df

In [8]:
##############
# 1. 计算 Pearson & Partial correlations
##############

import os, pickle
import numpy as np
import pandas as pd
from sklearn.covariance import LedoitWolf
from sklearn.preprocessing import StandardScaler
GO=pd.read_csv('0.Data/Go_similarity.csv', sep=',')
min_nr_samples = 30
base_dir_out = '1.cor_mat/'
os.makedirs(base_dir_out, exist_ok=True)

for file_name in omics_files:
    cur_cohort = file_name.split('_')[0]+'_'+file_name.split('_')[1]
    prot_data = pd.read_csv(base_dir+file_name, sep=',', index_col=0)

    if prot_data.shape[1] >= min_nr_samples:  # 至少 30 个样本
        prot_data.sort_index(inplace=True)

        # X: 样本 × 基因
        X = prot_data.T.values
        X = StandardScaler().fit_transform(X)
        genes = prot_data.index

        ###### 1. Pearson correlation ######
        pearson_mat = np.corrcoef(X, rowvar=False)
        pearson_mat = pd.DataFrame(pearson_mat, index=genes, columns=genes)

        ###### 2. Partial correlation (LedoitWolf 协方差收缩) ######
        try:
            lw = LedoitWolf().fit(X)
            prec_mat = np.linalg.inv(lw.covariance_)
            D = np.diag(1 / np.sqrt(np.diag(prec_mat)))
            pcorr_mat = -D @ prec_mat @ D
            np.fill_diagonal(pcorr_mat, 1)
            pcorr_mat = pd.DataFrame(pcorr_mat, index=genes, columns=genes)
        except Exception as e:
            print(f"LedoitWolf failed on {cur_cohort}: {e}")
            pcorr_mat = pd.DataFrame(np.nan, index=genes, columns=genes)

        ###### 上三角展开（Series） ######
        def stack_upper(mat):
            iu = np.triu_indices_from(mat, k=1)
            idx = pd.MultiIndex.from_arrays(
                [mat.index[iu[0]], mat.columns[iu[1]]],
                names=["prot1", "prot2"]
            )
            return pd.Series(mat.values[iu], index=idx)

        pearson_list = stack_upper(pearson_mat)
        pcorr_list   = stack_upper(pcorr_mat)

        # 合并
        correl_list = pd.concat([pearson_list, pcorr_list], axis=1)
        correl_list.columns = [f"{cur_cohort}_pearson", f"{cur_cohort}_pcorr"]

        # reset_index: 让 prot1/prot2 永远是显式列
        correl_list = correl_list.reset_index()
        merged_df=merge_preserve_left(correl_list, GO)
        file_out = file_name.split('_idfix')[0]+'_corr.obj'
        with open(base_dir_out+file_out, "wb") as file_handler:
            pickle.dump(merged_df.round(6), file_handler)

        print(f"{cur_cohort}: 保存 {merged_df.shape[0]} 对蛋白质关系")

LSCC_combined: 保存 8094276 对蛋白质关系
LSCC_control: 保存 8094276 对蛋白质关系
LSCC_tumor: 保存 8094276 对蛋白质关系


In [9]:
import pandas as pd
dir_prefix = '0.Data/'
# conversion tool for CORUM proteins to the standard gene set we use
conversion_df = pd.read_csv(dir_prefix+'[fig1a]_conversion_df_240827.csv')
conversion_df.set_index('from_id',inplace=True)

# load human interactions from CORUM database
#file_name =
temp_df = pd.read_csv(dir_prefix+'corum_humanComplexes.txt',sep='\t')
#temp_df = temp_df.loc[temp_df.loc[:,'Organism']=='Human',:]
CORUM_complexes = temp_df['subunits_gene_name'].reset_index(drop=True)

corum_set = []
for complex_nr in range(len(CORUM_complexes)):
    # get CORUM complex members and number of parts
    cur_compl = CORUM_complexes[complex_nr].split(';')
    cur_compl = [x.strip() for x in cur_compl if x]
    cur_compl = [x.upper() for x in cur_compl if 'orf' not in x]+[x for x in cur_compl if 'orf' in x]
    cur_compl = [x for x in cur_compl if 'NONE' not in x]
    cur_compl = [x for x in cur_compl if 'B-RAF' not in x]+['BRAF' for x in cur_compl if 'B-RAF' in x]
    
    # store all combinations of protein pairs in the complex
    n_parts = len(cur_compl)
    for i in range(n_parts):
        if ('variant' not in cur_compl[i]) & ('None' not in cur_compl[i]):
            for j in range(n_parts):
                if ('variant' not in cur_compl[j]) & ('None' not in cur_compl[j]):
                    if i != j:
                        corum_set.append([cur_compl[i],cur_compl[j]])


corum_db = (pd.DataFrame(corum_set))
corum_db=corum_db.drop_duplicates()
corum_db.columns=['prot1','prot2']

corum_db = corum_db.loc[corum_db['prot1'].isin(conversion_df.index) & corum_db['prot2'].isin(conversion_df.index)]
corum_db.loc[:,'prot1']=list(conversion_df.loc[corum_db['prot1'],'to_id'])
corum_db.loc[:,'prot2']=list(conversion_df.loc[corum_db['prot2'],'to_id'])
corum_db.set_index(['prot1','prot2'],inplace=True)
corum_db

prot1,prot2
BCL6,HDAC4
HDAC4,BCL6
BCL6,HDAC5
HDAC5,BCL6
BCL6,HDAC7
...,...
QRFPR,HCRTR1
LRP4,SOST
SOST,LRP4
HCRTR2,QRFPR


In [10]:
# set variables
base_dir = '1.cor_mat/'

# use glob to get all the csv files 
# in the folder 
omics_files = [ch for ch in os.listdir(os.getcwd()+'/'+base_dir) if '.obj' in ch]
omics_files = sorted(omics_files)

omics_files

['LSCC_combined_proteomics_processed.csv_corr.obj',
 'LSCC_control_proteomics_processed.csv_corr.obj',
 'LSCC_tumor_proteomics_processed.csv_corr.obj']

In [11]:

string=pd.read_table("9606 link merged.txt",index_col=[0,1])

##############
# 2. 训练 Logistic Regression 并输出概率
##############

import numpy as np
from sklearn.linear_model import LogisticRegression

base_dir_out = '2.probabilities/'
os.makedirs(base_dir_out, exist_ok=True)

for file_name in omics_files:
    # load correlation file
    with open('1.cor_mat/'+file_name, "rb") as file_handler:
        cur_correl = pickle.load(file_handler)

    # 设置 index = (prot1, prot2)，方便后续匹配 corum_db
    cur_correl = cur_correl.set_index(['prot1','prot2'])

    # annotate with CORUM
    cur_correl.loc[~cur_correl.index.isin(string.index), 'corum'] = 0
    cur_correl.loc[cur_correl.index.isin(corum_db.index), 'corum'] = 1
    cur_correl.loc[cur_correl['GO:MF'].isna(), 'GO:MF'] = 0
    cur_correl.loc[cur_correl['GO:BP'].isna(), 'GO:BP'] = 0
    cur_correl.loc[cur_correl['GO:CC'].isna(), 'GO:CC'] = 0
    cur_correl=cur_correl.loc[~cur_correl['corum'].isna(),]
    # 
    X = cur_correl.iloc[:, :5].values   # 前两列是 pearson 和 pcorr
    y = cur_correl['corum'].values

    clf = LogisticRegression(
        penalty=None, class_weight='balanced',
        fit_intercept=True, solver='newton-cholesky'
    ).fit(X, y)

    # 预测概率（正类）
    cur_prob = pd.DataFrame(
        clf.predict_proba(X)[:, 1],
        index=cur_correl.index,
        columns=[file_name.split('_idfix')[0]]
    )

    file_out = file_name.split('_idfix')[0]+'_prob.obj'
    with open(base_dir_out+file_out, "wb") as file_handler:
        pickle.dump(cur_prob.round(7), file_handler)

    print(f"{file_name}: 已保存概率 {cur_prob.shape[0]} 对蛋白质")


LSCC_combined_proteomics_processed.csv_corr.obj: 已保存概率 7632795 对蛋白质
LSCC_control_proteomics_processed.csv_corr.obj: 已保存概率 7632795 对蛋白质
LSCC_tumor_proteomics_processed.csv_corr.obj: 已保存概率 7632795 对蛋白质


In [12]:
import os, pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

##############
# 3. XGBoost版本训练并输出概率（完整版）
##############

# ==== 路径设置 ====
string = pd.read_table("9606 link merged.txt", index_col=[0, 1])
xgb_dir_out = '3.xgb_probabilities/'
xgb_eval_dir_out = '3.xgb_evaluation_plots/'
os.makedirs(xgb_dir_out, exist_ok=True)
os.makedirs(xgb_eval_dir_out, exist_ok=True)


# ==== 采样函数 ====
def sample_negatives_with_na(df, ratio=2.0, label_col="corum"):
    positives = df[df[label_col] == 1]
    negatives = df[df[label_col] == 0]
    na_samples = df[df[label_col].isna()]

    n_pos = len(positives)
    n_neg_sample = int(min(len(negatives), ratio * n_pos))

    neg_sample = negatives.sample(n=n_neg_sample, random_state=42)
    return pd.concat([positives, neg_sample])


# ==== 模型训练 + 超参搜索 ====
def train_xgb_model(X_train, y_train):
    param_dist = {
        "max_depth": [1,2,3,4],
        "learning_rate": [0.01, 0.05, 0.1],
        "subsample": [0.6, 0.8],
        "colsample_bytree": [0.6, 0.8],
        "n_estimators": [100, 300, 500],
        "min_child_weight": [1, 3, 5],
        'reg_alpha': [0,1, 3, 5],
        'reg_lambda': [1,3,2]
    }

    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='aucpr',
        tree_method='hist',
        random_state=42,
        n_jobs=-1,
        scale_pos_weight=8
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        scoring="average_precision",
        cv=cv,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )

    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_


# ==== 主循环 ====
all_predictions = []   # 保存所有文件的预测结果

for file_name in omics_files:
    # 1. 加载特征数据
    with open(f'1.cor_mat/{file_name}', "rb") as file_handler:
        cur_correl = pickle.load(file_handler)

    cur_correl = cur_correl.set_index(['prot1', 'prot2'])

    # 2. 标注 CORUM
    cur_correl.loc[cur_correl.index.isin(corum_db.index), 'corum'] = 1
    cur_correl.loc[~cur_correl.index.isin(string.index), 'corum'] = 0
    # NA 保留

    for go_term in ['GO:MF', 'GO:BP', 'GO:CC']:
        cur_correl.loc[cur_correl[go_term].isna(), go_term] = 0

    # 3. 采样（训练用）
    cur_sampled = sample_negatives_with_na(cur_correl, ratio=4)

    # 4. 提取特征和标签 (仅 0/1)
    X = cur_sampled.iloc[:, :5]
    y = cur_sampled['corum']
    print(sum(y == 0) / sum(y == 1))
    # 5. 拆分训练/测试
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # 6. 超参搜索 + 模型训练
    best_model, best_params = train_xgb_model(X_train, y_train)

    # 7. 在测试集上评估
    y_test_pred = best_model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_test_pred)
    roc_auc = auc(fpr, tpr)
    precision, recall, _ = precision_recall_curve(y_test, y_test_pred)
    avg_precision = average_precision_score(y_test, y_test_pred)

    # 绘制评估图
    prefix = file_name.split('_idfix')[0]
    prefix = prefix.split("_pro")[0]
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

    ax1.plot(fpr, tpr, color='darkorange', lw=2,
             label=f'ROC curve (AUC = {roc_auc:.3f})')
    ax1.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    ax1.set_xlim([0.0, 1.0]); ax1.set_ylim([0.0, 1.05])
    ax1.set_xlabel('False Positive Rate'); ax1.set_ylabel('True Positive Rate')
    ax1.set_title(f'{prefix} - XGBoost ROC Curve'); ax1.legend(loc="lower right")

    ax2.plot(recall, precision, color='blue', lw=2,
             label=f'PR curve (AP = {avg_precision:.3f})')
    ax2.set_xlim([0.0, 1.0]); ax2.set_ylim([0.0, 1.05])
    ax2.set_xlabel('Recall'); ax2.set_ylabel('Precision')
    ax2.set_title(f'{prefix} - XGBoost Precision-Recall Curve')
    ax2.legend(loc="lower left")

    plt.tight_layout()
    plt.savefig(os.path.join(xgb_eval_dir_out, f"{prefix}_xgb_eval.png"), dpi=300)
    plt.close()

    print(f"[{prefix}] XGBoost 已完成 | AUC={roc_auc:.3f} | AP={avg_precision:.3f}")
    print(f"最佳参数: {best_params}")

    # 8. 对整体数据做预测（包括 NA 和未采样负样本）
    X_all = cur_correl.iloc[:, :5]
    y_all_pred = best_model.predict_proba(X_all)[:, 1]

    cur_all_prob = pd.DataFrame({
        "prot1": X_all.index.get_level_values(0),
        "prot2": X_all.index.get_level_values(1),
        "probability": y_all_pred
    }).set_index(["prot1", "prot2"])
    prefix = file_name.split('_idfix')[0]
    cur_all_prob.to_csv(os.path.join(xgb_dir_out, f"{prefix}_predictions.csv"))
    #保存模型
    
    model_filename = os.path.join(xgb_dir_out, f"{prefix}_model.pkl")
    with open(model_filename, 'wb') as model_file:
        pickle.dump(best_model, model_file)
    print(f"模型已保存到: {model_filename}")


4.0
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LSCC_combined] XGBoost 已完成 | AUC=0.964 | AP=0.921
最佳参数: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
模型已保存到: 3.xgb_probabilities/LSCC_combined_proteomics_processed.csv_corr.obj_model.pkl
4.0
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LSCC_control] XGBoost 已完成 | AUC=0.964 | AP=0.912
最佳参数: {'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 500, 'min_child_weight': 3, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_bytree': 0.6}
模型已保存到: 3.xgb_probabilities/LSCC_control_proteomics_processed.csv_corr.obj_model.pkl
4.0
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LSCC_tumor] XGBoost 已完成 | AUC=0.961 | AP=0.911
最佳参数: {'subsample': 0.6, 'reg_lambda': 3, 'reg_alpha': 1, 'n_estimators': 300, 'min_child_weight': 3, 'max_depth': 2, 'learning_rate': 0.05, 'colsampl

In [25]:
6240/5

1248.0

In [13]:
import os, pickle
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from itertools import cycle

from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

##############
# 3. XGBoost Model Training with Probability Output (Full Version)
##############

# ==== Path Settings ====
string = pd.read_table("9606 link merged.txt", index_col=[0, 1])
xgb_dir_out = '3.xgb_probabilities/'
xgb_eval_dir_out = '3.xgb_evaluation_plots/'
os.makedirs(xgb_dir_out, exist_ok=True)
os.makedirs(xgb_eval_dir_out, exist_ok=True)

# ==== Feature Set Definitions ====
FEATURE_SETS = {
    "Correlation": [0],
    "Correlation+Partial": [0, 1],
    "Correlation+Partial+GO:MF": [0, 1, 2],
    "Correlation+Partial+GO:BP": [0, 1, 3],
    "Correlation+Partial+GO:CC": [0, 1, 4],
    "Correlation+Partial+All GO": [0, 1, 2, 3, 4]
}

# ==== Sampling Function ====
def sample_negatives_with_na(df, ratio=2.0, label_col="corum"):
    positives = df[df[label_col] == 1]
    negatives = df[df[label_col] == 0]
    na_samples = df[df[label_col].isna()]

    n_pos = len(positives)
    n_neg_sample = int(min(len(negatives), ratio * n_pos))

    neg_sample = negatives.sample(n=n_neg_sample, random_state=42)
    return pd.concat([positives, neg_sample])

# ==== Model Training + Hyperparameter Tuning ====
def train_xgb_model(X_train, y_train):
    param_dist = {
        "max_depth": [1,2,3,4],
        "learning_rate": [0.01, 0.05, 0.1],
        "subsample": [0.6, 0.8],
        "colsample_bytree": [0.6, 0.8],
        "n_estimators": [100, 300, 500],
        "min_child_weight": [1, 3, 5],
        'reg_alpha': [0,1, 3, 5],
        'reg_lambda': [1,3,2]
    }

    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='aucpr',
        tree_method='hist',
        random_state=42,
        n_jobs=-1,
        scale_pos_weight=8
    )

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    search = RandomizedSearchCV(
        model,
        param_distributions=param_dist,
        n_iter=30,
        scoring="average_precision",
        cv=cv,
        verbose=1,
        n_jobs=-1,
        random_state=42
    )

    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_

# ==== Main Loop ====
all_predictions = []   # Save predictions for all files

for file_name in omics_files:
    # 1. Load feature data
    with open(f'1.cor_mat/{file_name}', "rb") as file_handler:
        cur_correl = pickle.load(file_handler)

    cur_correl = cur_correl.set_index(['prot1', 'prot2'])

    # 2. Label CORUM
    cur_correl.loc[cur_correl.index.isin(corum_db.index), 'corum'] = 1
    cur_correl.loc[~cur_correl.index.isin(string.index), 'corum'] = 0
    # NA remains

    for go_term in ['GO:MF', 'GO:BP', 'GO:CC']:
        cur_correl.loc[cur_correl[go_term].isna(), go_term] = 0

    # 3. Sampling (for training)
    cur_sampled = sample_negatives_with_na(cur_correl, ratio=4)

    # 4. Extract features and labels (only 0/1)
    X = cur_sampled.iloc[:, :5]
    y = cur_sampled['corum']
    print(f"Positive/Negative ratio: {sum(y == 1)}:{sum(y == 0)} = 1:{sum(y == 0)/sum(y == 1):.1f}")
    
    # 5. Split train/test
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Store results for each feature set
    results = {}
    
    # 6. Train and evaluate each feature set
    for feature_name, feature_indices in FEATURE_SETS.items():
        print(f"\nTraining feature set: {feature_name}")
        
        # Select feature subset
        X_train_sub = X_train.iloc[:, feature_indices]
        X_test_sub = X_test.iloc[:, feature_indices]
        
        # Hyperparameter tuning + model training
        best_model, best_params = train_xgb_model(X_train_sub, y_train)
        
        # Evaluate on test set
        y_test_pred = best_model.predict_proba(X_test_sub)[:, 1]
        
        # Calculate ROC curve
        fpr, tpr, _ = roc_curve(y_test, y_test_pred)
        roc_auc = auc(fpr, tpr)
        
        # Calculate PR curve
        precision, recall, _ = precision_recall_curve(y_test, y_test_pred)
        avg_precision = average_precision_score(y_test, y_test_pred)
        
        # Store results
        results[feature_name] = {
            'fpr': fpr,
            'tpr': tpr,
            'roc_auc': roc_auc,
            'precision': precision,
            'recall': recall,
            'avg_precision': avg_precision,
            'model': best_model,
            'params': best_params
        }
        
        print(f"[{feature_name}] AUC={roc_auc:.3f} | AP={avg_precision:.3f}")
        print(f"Best parameters: {best_params}")

    # 7. Create comparison plots
    prefix = file_name.split('_idfix')[0]
    prefix = prefix.split("_pro")[0]
    
    # Create plots
    plt.figure(figsize=(15, 12))
    
    # ROC curve
    plt.subplot(2, 1, 1)
    colors = cycle(['blue', 'green', 'red', 'purple', 'orange', 'brown'])
    for i, (feature_name, result) in enumerate(results.items()):
        plt.plot(result['fpr'], result['tpr'], color=next(colors), lw=2,
                 label=f'{feature_name} (AUC = {result["roc_auc"]:.3f})')
    
    plt.plot([0, 1], [0, 1], 'k--', lw=2)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{prefix} - ROC Curve Comparison')
    plt.legend(loc="lower right")
    
    # PR curve
    plt.subplot(2, 1, 2)
    colors = cycle(['blue', 'green', 'red', 'purple', 'orange', 'brown'])
    for i, (feature_name, result) in enumerate(results.items()):
        plt.plot(result['recall'], result['precision'], color=next(colors), lw=2,
                 label=f'{feature_name} (AP = {result["avg_precision"]:.3f})')
    
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'{prefix} - Precision-Recall Curve Comparison')
    plt.legend(loc="lower left")
    
    plt.tight_layout()
    plt.savefig(os.path.join(xgb_eval_dir_out, f"{prefix}_feature_comparison.png"), dpi=300)
    plt.close()
    
    # 8. Feature importance plot (for full feature set)
    full_model = results['Correlation+Partial+All GO']['model']
    plt.figure(figsize=(10, 6))
    xgb.plot_importance(full_model, max_num_features=10)
    plt.title(f'{prefix} - Feature Importance')
    plt.tight_layout()
    plt.savefig(os.path.join(xgb_eval_dir_out, f"{prefix}_feature_importance.png"), dpi=300)
    plt.close()
    
    # 9. Performance metric comparison plot
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))
    
    # AUC comparison
    feature_names = list(results.keys())
    auc_scores = [results[name]['roc_auc'] for name in feature_names]
    ap_scores = [results[name]['avg_precision'] for name in feature_names]
    
    ax1.bar(feature_names, auc_scores, color='skyblue')
    ax1.set_title('AUC Comparison')
    ax1.set_ylabel('AUC')
    ax1.set_ylim([0.5, 1.0])
    ax1.tick_params(axis='x', rotation=45)
    
    # AP comparison
    ax2.bar(feature_names, ap_scores, color='lightgreen')
    ax2.set_title('Average Precision (AP) Comparison')
    ax2.set_ylabel('Average Precision')
    ax2.set_ylim([0.0, 1.0])
    ax2.tick_params(axis='x', rotation=45)
    
    plt.tight_layout()
    plt.savefig(os.path.join(xgb_eval_dir_out, f"{prefix}_performance_comparison.png"), dpi=300)
    plt.close()
    
    # 10. Predict on all data (using full feature model)
    X_all = cur_correl.iloc[:, :5]
    y_all_pred = full_model.predict_proba(X_all)[:, 1]

    cur_all_prob = pd.DataFrame({
        "prot1": X_all.index.get_level_values(0),
        "prot2": X_all.index.get_level_values(1),
        "probability": y_all_pred
    }).set_index(["prot1", "prot2"])
    
    cur_all_prob.to_csv(os.path.join(xgb_dir_out, f"{prefix}_predictions.csv"))
    
    # 11. Save full feature model
    model_filename = os.path.join(xgb_dir_out, f"{prefix}_model.pkl")
    with open(model_filename, 'wb') as model_file:
        pickle.dump(full_model, model_file)
    print(f"Model saved to: {model_filename}")

    print(f"[{prefix}] Evaluation of all feature sets completed")

Positive/Negative ratio: 1248:4992 = 1:4.0

Training feature set: Correlation
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[Correlation] AUC=0.752 | AP=0.556
Best parameters: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 300, 'min_child_weight': 3, 'max_depth': 2, 'learning_rate': 0.01, 'colsample_bytree': 0.8}

Training feature set: Correlation+Partial
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[Correlation+Partial] AUC=0.776 | AP=0.607
Best parameters: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 0, 'n_estimators': 300, 'min_child_weight': 3, 'max_depth': 2, 'learning_rate': 0.01, 'colsample_bytree': 0.8}

Training feature set: Correlation+Partial+GO:MF
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[Correlation+Partial+GO:MF] AUC=0.878 | AP=0.749
Best parameters: {'subsample': 0.6, 'reg_lambda': 1, 'reg_alpha': 3, 'n_estimators': 500, 'min_child_weight': 5, 'max_depth': 4, 'learning_rate': 0.01, 'colsample_

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>