# Predicting on Unseen Gene Pairs Using Multiple Feature Sets and Labelled Datasets 

**Training and Prediction:**
- A separate classifier is trained for each combination of a `feature set` and a `labelled dataset` (e.g., PPI bioplex, SL).
- Each classifier is trained on known labelled data (using a specific set of features) and then used to predict on unseen gene pairs.


**Reporting Predictions:**
- For gene pairs that are already labelled, the known label is reported directly as `True` or `False`.
- For unseen gene pairs, the classifier’s `prediction score (a float value)` is provided.


**Ranking:**
- The gene pairs are sorted based on the `sum of all prediction scores`.

In [7]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.base import clone

## Define Feature Lists

In [2]:
# Feature-column groups used by the classifiers.
# Each public list is assembled from the shared building blocks below; the
# resulting column order is the same as when every list was spelled out by hand.

# --- building blocks -------------------------------------------------------
_MIN_IDENTITY = ['min_sequence_identity']

# Pairwise structural alignment scores
_STRUCT_PAIRWISE = [
    'fident_struct', 'bits_struct', 'alntmscore_struct',
    'qtmscore_struct', 'ttmscore_struct',
    'alnlen_struct', 'evalue_struct', 'prob_struct', 'lddt_struct',
]

# Similarity-search context columns (structure-based / sequence-based)
_STRUCT_CONTEXT = ['rank_struct', 'selfSP_struct', 'taxid_struct']
_SEQ_CONTEXT = ['rank_seq', 'selfSP_seq', 'taxid_seq']

# Embedding-distance columns: one column per (pooling, metric) combination
_DIST_METRICS = ('cosine', 'euclidean', 'manhattan', 'ts_ss')
_ESM2_POOLINGS = (
    'beginning_of_sequence',
    'end_of_sequence',
    'mean_of_residue_tokens',
    'mean_of_special_tokens',
)
_ESM2_DISTANCES = [
    f'esm2_{pooling}_{metric}'
    for pooling in _ESM2_POOLINGS
    for metric in _DIST_METRICS
]
_PROTT5_DISTANCES = [f'ProtT5_per-protein_{metric}' for metric in _DIST_METRICS]

# --- public feature lists --------------------------------------------------

# Structural pair-wise features + min. sequence identity
pairStruct_features = _STRUCT_PAIRWISE + _MIN_IDENTITY

# Sequence context features (similarity searches) + min. sequence identity
contSeq_features = _SEQ_CONTEXT + _MIN_IDENTITY

# Structural context features (similarity searches) + min. sequence identity
contStruct_features = _STRUCT_CONTEXT + _MIN_IDENTITY

# Structural and sequence context features (similarity searches) + min. sequence identity
contSeqStruct_features = _STRUCT_CONTEXT + _SEQ_CONTEXT + _MIN_IDENTITY

# Protein Language Model embedding distance features + min. sequence identity
plm_features = _MIN_IDENTITY + _ESM2_DISTANCES + _PROTT5_DISTANCES

# All pairwise features: PLM embedding distances + pairwise structural scores
# + min. sequence identity
pairwise_features = plm_features + _STRUCT_PAIRWISE

# Every sequence- and structure-derived feature + min. sequence identity
news_features = (
    _MIN_IDENTITY
    + _STRUCT_CONTEXT
    + _STRUCT_PAIRWISE
    + _SEQ_CONTEXT
    + _ESM2_DISTANCES
    + _PROTT5_DISTANCES
)



## Make predictions for every `train_dataset` X `classifier` for every pair

### 1) For Human pairs

In [6]:
####################################################
# 1) Setup Datasets, Feature Sets, and Classifier
####################################################
# Choose classifier type: 'XGB', 'RF', or 'LR'
class_mod = 'XGB'
if class_mod == 'XGB':
    base_classifier = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', XGBClassifier(
            n_estimators=600,
            random_state=8,
            learning_rate=0.1,
            colsample_bytree=0.5,
            # NOTE(review): `use_label_encoder` is reported as deprecated in
            # newer xgboost releases; kept unchanged here -- confirm against
            # the installed version.
            use_label_encoder=False,
            eval_metric='logloss'
        ))
    ])
elif class_mod == 'RF':
    base_classifier = RandomForestClassifier(
        n_estimators=600, random_state=8,
        max_features=0.5, max_depth=3, min_samples_leaf=8
    )
elif class_mod == 'LR':
    base_classifier = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000))
    ])
else:
    # Without this branch an unknown value would only surface later as a
    # NameError on `base_classifier`.
    raise ValueError(f"Unknown classifier type {class_mod!r}; expected 'XGB', 'RF', or 'LR'.")

# Human training datasets: mapping dataset names to file paths.
hs_train_datasets = {
    'PPI Bioplex': './data/ens111_human_BioPlex.csv',
    'SL': './data/ens111_human_SL.csv',
    'GO BPO': './data/ens111_human_BPO.csv',
    'GO MFO': './data/ens111_human_MFO.csv',
    'GO CCO': './data/ens111_human_CCO.csv',
    'PPI Biogrid Phys.': './data/ens111_human_Biogrid.csv',
    'PPI Biogrid Y2H': './data/ens111_human_Biogrid-Y2H.csv',
    'SL-Lenient': './data/ens111_human_SL-lenient.csv',
}

# Unseen pairs dataset (human) path
hs_all_pairs_path = './data/ens111_human_allFeatures.csv'

# Define feature sets: key is descriptive name, value is the list of feature columns.
# The key becomes part of the prediction column name ("<dataset> | <feature set>").
feature_sets = {
    f'Minimum Sequence Identity {class_mod}': ['min_sequence_identity'],
    f'Pairwise structure {class_mod}': pairStruct_features,
    f'PLM emb. dist. {class_mod}': plm_features,
    f'Pairwise {class_mod}': pairwise_features,
    f'Similarity Search {class_mod}': contSeqStruct_features,
    f'All {len(news_features)} features {class_mod}': news_features,
}

####################################################
# 2) Load All-Pairs Data (Using sorted_gene_pair as ID)
####################################################
all_pairs_df = pd.read_csv(hs_all_pairs_path)
if 'sorted_gene_pair' not in all_pairs_df.columns:
    raise ValueError("The unseen pairs dataset must contain the 'sorted_gene_pair' column.")

# The all-pairs table does not change between datasets, so its feature columns
# are validated once, before any model is trained.
all_pairs_columns = set(all_pairs_df.columns)
for feat_set_name, feat_columns in feature_sets.items():
    missing_all = set(feat_columns) - all_pairs_columns
    if missing_all:
        raise ValueError(f"Missing features {missing_all} in unseen pairs dataset for feature set {feat_set_name}.")

# Create two DataFrames with the common identifier.
mask_df = all_pairs_df[['sorted_gene_pair']].copy()  # for training labels (mask)
pred_df = all_pairs_df[['sorted_gene_pair']].copy()  # for predictions

####################################################
# 3) Process Each Training Dataset
####################################################
for dataset_name, train_path in hs_train_datasets.items():
    # Load training dataset
    df_train = pd.read_csv(train_path)

    # Use the available label: if 'same_func_ppi' exists, use it; otherwise, assume 'SL' is present.
    if 'same_func_ppi' in df_train.columns:
        df_train['SL'] = df_train['same_func_ppi']
    elif 'SL' not in df_train.columns:
        raise ValueError(f"Dataset {dataset_name} must contain a label column 'same_func_ppi' or 'SL'.")

    # Ensure the training dataset has the 'sorted_gene_pair' column.
    if 'sorted_gene_pair' not in df_train.columns:
        raise ValueError(f"Dataset {dataset_name} must contain the 'sorted_gene_pair' column.")

    # Validate every feature set for this dataset before fitting anything, so a
    # missing column is reported before any training time is spent on it.
    train_columns = set(df_train.columns)
    for feat_set_name, feat_columns in feature_sets.items():
        missing_train = set(feat_columns) - train_columns
        if missing_train:
            raise ValueError(f"Missing features {missing_train} in training dataset {dataset_name} for feature set {feat_set_name}.")

    # Create a lookup for training pairs (using sorted_gene_pair as key).
    # If a pair occurs more than once, the last occurrence's label is kept.
    train_pairs_dict = dict(zip(df_train['sorted_gene_pair'], df_train['SL']))

    # ----------------------------
    # Create a mask column for this dataset (one column per dataset):
    # the training label where the pair is labelled, NaN otherwise.
    # ----------------------------
    mask_df[dataset_name] = all_pairs_df['sorted_gene_pair'].map(train_pairs_dict)

    # The label vector is the same for every feature set of this dataset.
    y_train = df_train['SL']

    # ----------------------------
    # Loop over feature sets to generate predictions.
    # ----------------------------
    for feat_set_name, feat_columns in feature_sets.items():
        col_name = f"{dataset_name} | {feat_set_name}"

        # Clone the classifier for a fresh start.
        classifier = clone(base_classifier)
        classifier.fit(df_train[feat_columns], y_train)

        # Predict probabilities on all pairs (regardless of training presence);
        # column 1 of predict_proba is the score of the second class.
        y_pred_proba = classifier.predict_proba(all_pairs_df[feat_columns])[:, 1]

        # Store predicted scores for all pairs.
        pred_df[col_name] = y_pred_proba

####################################################
# 4) Aggregate and Sort Predictions (Optional)
####################################################
# Sum every prediction column (everything except the identifier) and rank
# pairs from highest to lowest total score.
pred_cols = pred_df.columns.difference(['sorted_gene_pair'])
pred_df['sum_pred'] = pred_df[pred_cols].sum(axis=1)
pred_df.sort_values('sum_pred', ascending=False, inplace=True)

pred_df

Unnamed: 0,sorted_gene_pair,PPI Bioplex | Minimum Sequence Identity XGB,PPI Bioplex | Pairwise structure XGB,PPI Bioplex | PLM emb. dist. XGB,PPI Bioplex | Pairwise XGB,PPI Bioplex | Similarity Search XGB,PPI Bioplex | All 36 features XGB,SL | Minimum Sequence Identity XGB,SL | Pairwise structure XGB,SL | PLM emb. dist. XGB,...,PPI Biogrid Y2H | Pairwise XGB,PPI Biogrid Y2H | Similarity Search XGB,PPI Biogrid Y2H | All 36 features XGB,SL-Lenient | Minimum Sequence Identity XGB,SL-Lenient | Pairwise structure XGB,SL-Lenient | PLM emb. dist. XGB,SL-Lenient | Pairwise XGB,SL-Lenient | Similarity Search XGB,SL-Lenient | All 36 features XGB,sum_pred
73405,LDHA_LDHB,0.254615,0.087535,0.034228,0.038146,0.413065,0.409726,0.160722,0.793932,0.869031,...,0.547877,0.570757,0.795630,0.231632,0.694274,0.821961,0.886581,0.843417,0.924512,33.428799
13736,VPS4A_VPS4B,0.254615,0.586596,0.414461,0.603148,0.517870,0.433762,0.084927,0.884130,0.781749,...,0.633850,0.306285,0.572985,0.147255,0.806643,0.957078,0.946369,0.563756,0.957361,31.114931
100511,TBL1X_TBL1XR1,0.254615,0.525493,0.843817,0.628790,0.586236,0.891083,0.160722,0.888996,0.887073,...,0.045372,0.335257,0.150604,0.185290,0.807522,0.919275,0.947728,0.540039,0.959971,30.896423
9827,CUL4A_CUL4B,0.212583,0.714635,0.670831,0.759916,0.320984,0.683919,0.107251,0.889828,0.935878,...,0.705225,0.154531,0.723434,0.188775,0.866102,0.887229,0.939691,0.264237,0.960339,30.604008
74729,PAK1_PAK2,0.254615,0.692015,0.330796,0.415791,0.575830,0.585417,0.160722,0.893481,0.898271,...,0.332515,0.378904,0.568760,0.231632,0.719490,0.754514,0.892035,0.591907,0.937921,30.587427
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13452,BMP1_CUBN,0.005331,0.004585,0.004226,0.010684,0.008626,0.015507,0.001393,0.000223,0.000005,...,0.001773,0.002991,0.005881,0.028220,0.003486,0.004073,0.003669,0.003541,0.010602,1.518351
65706,PRDM2_ZNF155,0.016247,0.003486,0.001722,0.000647,0.005296,0.000796,0.001393,0.000435,0.000081,...,0.000319,0.000050,0.000064,0.028220,0.001008,0.001456,0.000678,0.008602,0.001069,1.515338
66339,ZNF547_ZNF644,0.008002,0.001797,0.005831,0.000373,0.005047,0.001044,0.001393,0.000634,0.000067,...,0.000929,0.001496,0.001251,0.028220,0.002180,0.004166,0.004759,0.013635,0.008718,1.489157
13670,CUBN_TLL2,0.005331,0.001409,0.002607,0.004550,0.009468,0.007691,0.001393,0.000044,0.000005,...,0.002138,0.006104,0.004104,0.028220,0.002992,0.004781,0.008210,0.000549,0.009740,1.343691


In [8]:
# Write the label-mask table and the prediction table to CSV.
# The row index is not written; each table carries its `sorted_gene_pair` column.
for table, out_name in (
    (mask_df, 'ens111_human_allMask.csv'),
    (pred_df, 'ens111_human_allPredictions.csv'),
):
    table.to_csv(out_name, index=False)

### 2) For Yeast pairs

In [9]:
####################################################
# 1) Setup Datasets, Feature Sets, and Classifier
####################################################
# Choose classifier type: 'XGB', 'RF', or 'LR'
class_mod = 'XGB'
if class_mod == 'XGB':
    base_classifier = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', XGBClassifier(
            n_estimators=600,
            random_state=8,
            learning_rate=0.1,
            colsample_bytree=0.5,
            # NOTE(review): `use_label_encoder` is reported as deprecated in
            # newer xgboost releases; kept unchanged here -- confirm against
            # the installed version.
            use_label_encoder=False,
            eval_metric='logloss'
        ))
    ])
elif class_mod == 'RF':
    base_classifier = RandomForestClassifier(
        n_estimators=600, random_state=8,
        max_features=0.5, max_depth=3, min_samples_leaf=8
    )
elif class_mod == 'LR':
    base_classifier = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000))
    ])
else:
    # Without this branch an unknown value would only surface later as a
    # NameError on `base_classifier`.
    raise ValueError(f"Unknown classifier type {class_mod!r}; expected 'XGB', 'RF', or 'LR'.")

# Yeast training datasets: mapping dataset names to file paths.
sc_train_datasets = {
    'PPI SocInt': './data/ens111_yeast_Interactome.csv',
    'SL': './data/ens111_yeast_SL-deadly.csv',
    'GO BPO': './data/ens111_yeast_BPO.csv',
    'GO MFO': './data/ens111_yeast_MFO.csv',
    'GO CCO': './data/ens111_yeast_CCO.csv',
    'PPI Biogrid Phys.': './data/ens111_yeast_Biogrid.csv',
    'PPI Biogrid Y2H': './data/ens111_yeast_Biogrid-Y2H.csv',
    'Neg. GI': './data/ens111_yeast_SL.csv',
}

# Unseen pairs dataset (yeast) path
sc_all_pairs_path = './data/ens111_yeast_allFeatures.csv'

# Define feature sets: key is descriptive name, value is the list of feature columns.
# The key becomes part of the prediction column name ("<dataset> | <feature set>").
feature_sets = {
    f'Minimum Sequence Identity {class_mod}': ['min_sequence_identity'],
    f'Pairwise structure {class_mod}': pairStruct_features,
    f'PLM emb. dist. {class_mod}': plm_features,
    f'Pairwise {class_mod}': pairwise_features,
    f'Similarity Search {class_mod}': contSeqStruct_features,
    f'All {len(news_features)} features {class_mod}': news_features,
}

####################################################
# 2) Load All-Pairs Data (Using sorted_gene_pair as ID)
####################################################
all_pairs_df = pd.read_csv(sc_all_pairs_path)
if 'sorted_gene_pair' not in all_pairs_df.columns:
    raise ValueError("The unseen pairs dataset must contain the 'sorted_gene_pair' column.")

# The all-pairs table does not change between datasets, so its feature columns
# are validated once, before any model is trained.
all_pairs_columns = set(all_pairs_df.columns)
for feat_set_name, feat_columns in feature_sets.items():
    missing_all = set(feat_columns) - all_pairs_columns
    if missing_all:
        raise ValueError(f"Missing features {missing_all} in unseen pairs dataset for feature set {feat_set_name}.")

# Create two DataFrames with the common identifier.
mask_df = all_pairs_df[['sorted_gene_pair']].copy()  # for training labels (mask)
pred_df = all_pairs_df[['sorted_gene_pair']].copy()  # for predictions

####################################################
# 3) Process Each Training Dataset
####################################################
for dataset_name, train_path in sc_train_datasets.items():
    # Load training dataset
    df_train = pd.read_csv(train_path)

    # Use the available label: if 'same_func_ppi' exists, use it; otherwise, assume 'SL' is present.
    if 'same_func_ppi' in df_train.columns:
        df_train['SL'] = df_train['same_func_ppi']
    elif 'SL' not in df_train.columns:
        raise ValueError(f"Dataset {dataset_name} must contain a label column 'same_func_ppi' or 'SL'.")

    # Ensure the training dataset has the 'sorted_gene_pair' column.
    if 'sorted_gene_pair' not in df_train.columns:
        raise ValueError(f"Dataset {dataset_name} must contain the 'sorted_gene_pair' column.")

    # Validate every feature set for this dataset before fitting anything, so a
    # missing column is reported before any training time is spent on it.
    train_columns = set(df_train.columns)
    for feat_set_name, feat_columns in feature_sets.items():
        missing_train = set(feat_columns) - train_columns
        if missing_train:
            raise ValueError(f"Missing features {missing_train} in training dataset {dataset_name} for feature set {feat_set_name}.")

    # Create a lookup for training pairs (using sorted_gene_pair as key).
    # If a pair occurs more than once, the last occurrence's label is kept.
    train_pairs_dict = dict(zip(df_train['sorted_gene_pair'], df_train['SL']))

    # ----------------------------
    # Create a mask column for this dataset (one column per dataset):
    # the training label where the pair is labelled, NaN otherwise.
    # ----------------------------
    mask_df[dataset_name] = all_pairs_df['sorted_gene_pair'].map(train_pairs_dict)

    # The label vector is the same for every feature set of this dataset.
    y_train = df_train['SL']

    # ----------------------------
    # Loop over feature sets to generate predictions.
    # ----------------------------
    for feat_set_name, feat_columns in feature_sets.items():
        col_name = f"{dataset_name} | {feat_set_name}"

        # Clone the classifier for a fresh start.
        classifier = clone(base_classifier)
        classifier.fit(df_train[feat_columns], y_train)

        # Predict probabilities on all pairs (regardless of training presence);
        # column 1 of predict_proba is the score of the second class.
        y_pred_proba = classifier.predict_proba(all_pairs_df[feat_columns])[:, 1]

        # Store predicted scores for all pairs.
        pred_df[col_name] = y_pred_proba

####################################################
# 4) Aggregate and Sort Predictions (Optional)
####################################################
# Sum every prediction column (everything except the identifier) and rank
# pairs from highest to lowest total score.
pred_cols = pred_df.columns.difference(['sorted_gene_pair'])
pred_df['sum_pred'] = pred_df[pred_cols].sum(axis=1)
pred_df.sort_values('sum_pred', ascending=False, inplace=True)

pred_df

Unnamed: 0,sorted_gene_pair,PPI SocInt | Minimum Sequence Identity XGB,PPI SocInt | Pairwise structure XGB,PPI SocInt | PLM emb. dist. XGB,PPI SocInt | Pairwise XGB,PPI SocInt | Similarity Search XGB,PPI SocInt | All 36 features XGB,SL | Minimum Sequence Identity XGB,SL | Pairwise structure XGB,SL | PLM emb. dist. XGB,...,PPI Biogrid Y2H | Pairwise XGB,PPI Biogrid Y2H | Similarity Search XGB,PPI Biogrid Y2H | All 36 features XGB,Neg. GI | Minimum Sequence Identity XGB,Neg. GI | Pairwise structure XGB,Neg. GI | PLM emb. dist. XGB,Neg. GI | Pairwise XGB,Neg. GI | Similarity Search XGB,Neg. GI | All 36 features XGB,sum_pred
638,YAL005C_YLL024C,0.424839,0.928102,0.959664,0.979023,0.581263,0.983361,0.668382,0.951772,0.911230,...,0.899025,0.326829,0.901938,0.891112,0.890851,0.985336,0.963614,0.937532,0.954989,42.640881
2606,YDR099W_YER177W,0.213385,0.991956,0.899115,0.985192,0.587640,0.962398,0.485062,0.961214,0.871580,...,0.930540,0.179636,0.942863,0.490078,0.989565,0.827585,0.917015,0.650468,0.939529,41.775993
2781,YLR264W_YOR167C,0.077056,0.945963,0.987936,0.992662,0.354175,0.994154,0.177421,0.980107,0.955561,...,0.898335,0.266609,0.892654,0.274276,0.975836,0.996145,0.985672,0.825838,0.993663,41.404217
103,YIL035C_YOR061W,0.072049,0.925203,0.863763,0.924792,0.866163,0.976096,0.076756,0.893639,0.905429,...,0.953894,0.662242,0.953303,0.328374,0.901778,0.951559,0.944670,0.915266,0.963467,40.692101
1420,YMR194W_YPL249C-A,0.424839,0.969320,0.928676,0.986770,0.581263,0.989952,0.668382,0.290740,0.547076,...,0.936937,0.326829,0.935763,0.891112,0.933840,0.978484,0.931943,0.937532,0.958751,40.639389
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2376,YLR240W_YNL267W,0.060084,0.002496,0.030444,0.005953,0.025825,0.003280,0.001046,0.000029,0.001468,...,0.000090,0.000409,0.000133,0.011500,0.001005,0.001805,0.000881,0.027653,0.000694,8.795201
2476,YLL010C_YPL063W,0.054346,0.072083,0.000239,0.002609,0.068601,0.001559,0.001353,0.001328,0.000107,...,0.000517,0.000822,0.001976,0.010101,0.013312,0.057685,0.011973,0.027760,0.024096,8.176650
1780,YDL126C_YKL197C,0.034371,0.008825,0.001306,0.001148,0.008116,0.005537,0.000968,0.003518,0.000350,...,0.011747,0.000585,0.002128,0.006900,0.013777,0.001350,0.006195,0.001845,0.024338,7.599596
2318,YAL021C_YMR285C,0.008277,0.000655,0.000281,0.000525,0.007823,0.001004,0.001046,0.001653,0.001868,...,0.000946,0.000208,0.002198,0.017424,0.008183,0.003547,0.003668,0.001929,0.002276,6.250986


In [10]:
# Write the label-mask table and the prediction table to CSV.
# The row index is not written; each table carries its `sorted_gene_pair` column.
for table, out_name in (
    (mask_df, 'ens111_yeast_allMask.csv'),
    (pred_df, 'ens111_yeast_allPredictions.csv'),
):
    table.to_csv(out_name, index=False)