# Predicting on Unseen Gene Pairs Using Multiple Feature Sets and Labelled Datasets 

**Training and Prediction:**
- A separate classifier is trained for each combination of a `feature set` and a `labelled dataset` (e.g., PPI bioplex, SL).
- Each classifier is trained on known labelled data (using a specific set of features) and then used to predict on unseen gene pairs.


**Reporting Predictions:**
- For gene pairs that are already labelled, the known label is directly reported as `True` or `False`.
- For unseen gene pairs, the classifier’s `prediction score (a float value)` is provided.


**Ranking:**
- The gene pairs are sorted in descending order of the `sum of all prediction scores` (a known `True` label counts as 1 and a known `False` label as 0).

## Define Feature Lists

In [2]:
# Feature-column groups used by the classifiers.
# Every public list is assembled from the shared building blocks below, so each
# column name is written only once. Element order is part of the definition.

# Minimum sequence identity: present in every feature set
_MIN_IDENTITY = ['min_sequence_identity']

# Structural pair-wise columns
_PAIR_STRUCT = [
    'fident_struct', 'bits_struct', 'alntmscore_struct', 'qtmscore_struct', 'ttmscore_struct',
    'alnlen_struct', 'evalue_struct', 'prob_struct', 'lddt_struct',
]

# Context columns from the similarity searches (sequence / structure)
_CONTEXT_SEQ = ['rank_seq', 'selfSP_seq', 'taxid_seq']
_CONTEXT_STRUCT = ['rank_struct', 'selfSP_struct', 'taxid_struct']

# Protein Language Model embedding distance columns:
# four metrics for each of the four esm2 embeddings, then four for ProtT5
_METRICS = ('cosine', 'euclidean', 'manhattan', 'ts_ss')
_ESM2_EMBEDDINGS = (
    'beginning_of_sequence', 'end_of_sequence',
    'mean_of_residue_tokens', 'mean_of_special_tokens',
)
_PLM_DISTANCES = (
    [f'esm2_{embedding}_{metric}' for embedding in _ESM2_EMBEDDINGS for metric in _METRICS]
    + [f'ProtT5_per-protein_{metric}' for metric in _METRICS]
)

# Structural pair-wise features + min. sequence identity
pairStruct_features = _PAIR_STRUCT + _MIN_IDENTITY

# Sequence context features (similarity searches) + min. sequence identity
contSeq_features = _CONTEXT_SEQ + _MIN_IDENTITY

# Structural context features (similarity searches) + min. sequence identity
contStruct_features = _CONTEXT_STRUCT + _MIN_IDENTITY

# Structural and sequence context features (similarity searches) + min. sequence identity
contSeqStruct_features = _CONTEXT_STRUCT + _CONTEXT_SEQ + _MIN_IDENTITY

# Min. sequence identity + Protein Language Model embedding distance features
plm_features = _MIN_IDENTITY + _PLM_DISTANCES

# Min. sequence identity + all pair-wise features
# (PLM embedding distances followed by the structural pair-wise columns)
pairwise_features = _MIN_IDENTITY + _PLM_DISTANCES + _PAIR_STRUCT

# Min. sequence identity + all new sequence and structure features
# (structural context, structural pair-wise, sequence context, PLM distances)
news_features = _MIN_IDENTITY + _CONTEXT_STRUCT + _PAIR_STRUCT + _CONTEXT_SEQ + _PLM_DISTANCES



## Make predictions for every `train_dataset` X `classifier` for every pair

### 1) For Human pairs

In [8]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.base import clone

####################################################
# 2) Setup Datasets, Feature Sets, and Classifier
####################################################
# Choose classifier type: 'XGB', 'RF', or 'LR'
class_mod = 'XGB'

# Unfitted template estimator; it is cloned for every
# (training dataset, feature set) combination further down.
if class_mod == 'XGB':
    base_classifier = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', XGBClassifier(
            n_estimators=600,
            random_state=8,
            learning_rate=0.1,
            colsample_bytree=0.5,
            use_label_encoder=False,
            eval_metric='logloss'
        ))
    ])
elif class_mod == 'RF':
    base_classifier = RandomForestClassifier(
        n_estimators=600, random_state=8,
        max_features=0.5, max_depth=3, min_samples_leaf=8
    )
elif class_mod == 'LR':
    base_classifier = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000))
    ])
else:
    # Without this guard an unknown value would leave `base_classifier`
    # undefined, or silently reuse one left over from an earlier cell run.
    raise ValueError(f"Unsupported class_mod {class_mod!r}; expected 'XGB', 'RF', or 'LR'.")

# Human training datasets: mapping dataset names to file paths.
hs_train_datasets = {
    'PPI Bioplex': './data/ens111_human_BioPlex.csv',
    'SL': './data/ens111_human_SL.csv',
    'GO BPO': './data/ens111_human_BPO.csv',
    'GO MFO': './data/ens111_human_MFO.csv',
    'GO CCO': './data/ens111_human_CCO.csv',
    'PPI Biogrid Phys.': './data/ens111_human_Biogrid.csv',
    'PPI Biogrid Y2H': './data/ens111_human_Biogrid-Y2H.csv',
    'SL-Lenient': './data/ens111_human_SL-lenient.csv',
}

# Unseen pairs dataset (human) path
hs_all_pairs_path = './data/ens111_human_allFeatures.csv'

# Define feature sets: key is descriptive name, value is the list of feature columns
feature_sets = {
    f'Minimum Sequence Identity {class_mod}': ['min_sequence_identity'],
    f'Pairwise structure {class_mod}': pairStruct_features,
    f'PLM emb. dist. {class_mod}': plm_features,
    f'Pairwise {class_mod}': pairwise_features,
    f'Similarity Search {class_mod}': contSeqStruct_features,
    f'All {len(news_features)} features {class_mod}': news_features,
}

####################################################
# 3) Load All-Pairs Data (Using sorted_gene_pair as ID)
####################################################
all_pairs_df = pd.read_csv(hs_all_pairs_path)
if 'sorted_gene_pair' not in all_pairs_df.columns:
    raise ValueError("The unseen pairs dataset must contain the 'sorted_gene_pair' column.")

# Fail fast: this check does not depend on the training dataset, so run it once
# before any classifier is trained.
for feat_set_name, feat_columns in feature_sets.items():
    missing_all = set(feat_columns) - set(all_pairs_df.columns)
    if missing_all:
        raise ValueError(f"Missing features {missing_all} in unseen pairs dataset for feature set {feat_set_name}.")

# Prepare a DataFrame to store predictions (using sorted_gene_pair as the identifier)
predictions_df = all_pairs_df[['sorted_gene_pair']].copy()

####################################################
# 4) Train Classifiers, Predict, and Store Results
####################################################
for dataset_name, train_path in hs_train_datasets.items():
    # Load training dataset
    df_train = pd.read_csv(train_path)

    # Normalise the label column name to 'SL': 'same_func_ppi' takes precedence
    # when present, otherwise an existing 'SL' column is used as is.
    if 'same_func_ppi' in df_train.columns:
        df_train['SL'] = df_train['same_func_ppi']
    elif 'SL' not in df_train.columns:
        raise ValueError(f"Dataset {dataset_name} must contain a label column 'same_func_ppi' or 'SL'")

    # Ensure the training dataset has the 'sorted_gene_pair' column
    if 'sorted_gene_pair' not in df_train.columns:
        raise ValueError(f"Dataset {dataset_name} must contain the 'sorted_gene_pair' column.")

    # Lookup of known labels keyed by sorted_gene_pair
    # (if a pair occurs more than once, the last occurrence wins)
    train_pairs_dict = dict(zip(df_train['sorted_gene_pair'], df_train['SL']))

    # Loop over each feature set for the current training dataset
    for feat_set_name, feat_columns in feature_sets.items():
        col_name = f"{dataset_name} | {feat_set_name}"

        # Check that the training dataset contains the required features
        missing_train = set(feat_columns) - set(df_train.columns)
        if missing_train:
            raise ValueError(f"Missing features {missing_train} in training dataset {dataset_name} for feature set {feat_set_name}.")

        # Clone the template so every combination starts from an unfitted estimator
        classifier = clone(base_classifier)

        # Train the classifier using the selected feature set
        X_train = df_train[feat_columns]
        y_train = df_train['SL']
        classifier.fit(X_train, y_train)

        # Probability of the positive class for every row of the all-pairs data
        X_all = all_pairs_df[feat_columns]
        y_pred_proba = classifier.predict_proba(X_all)[:, 1]

        # Known label if the pair is in the training data, predicted probability
        # otherwise. Pairs and probabilities are matched by row position, so the
        # result does not depend on the DataFrame's index labels.
        final_preds = [
            train_pairs_dict.get(pair, proba)
            for pair, proba in zip(all_pairs_df['sorted_gene_pair'], y_pred_proba)
        ]

        # Store the results in the final DataFrame
        predictions_df[col_name] = final_preds

####################################################
# 5) Aggregate and Sort Predictions
####################################################
# Sum across all classifier columns to form a summary score
# (a True label adds 1 and a False label adds 0)
pred_cols = predictions_df.columns.difference(['sorted_gene_pair'])
predictions_df['sum_pred'] = predictions_df[pred_cols].sum(axis=1)

# Sort the DataFrame so that pairs with the highest overall score are at the top
predictions_df.sort_values('sum_pred', ascending=False, inplace=True)

# Optionally, drop the summary column if not needed
# predictions_df.drop(columns='sum_pred', inplace=True)
predictions_df


Unnamed: 0,sorted_gene_pair,PPI Bioplex | Minimum Sequence Identity XGB,PPI Bioplex | Pairwise structure XGB,PPI Bioplex | PLM emb. dist. XGB,PPI Bioplex | Pairwise XGB,PPI Bioplex | Similarity Search XGB,PPI Bioplex | All 36 features XGB,SL | Minimum Sequence Identity XGB,SL | Pairwise structure XGB,SL | PLM emb. dist. XGB,...,PPI Biogrid Y2H | Pairwise XGB,PPI Biogrid Y2H | Similarity Search XGB,PPI Biogrid Y2H | All 36 features XGB,SL-Lenient | Minimum Sequence Identity XGB,SL-Lenient | Pairwise structure XGB,SL-Lenient | PLM emb. dist. XGB,SL-Lenient | Pairwise XGB,SL-Lenient | Similarity Search XGB,SL-Lenient | All 36 features XGB,sum_pred
73405,LDHA_LDHB,0.254615,0.087535,0.034228,0.038146,0.413065,0.409726,True,True,True,...,True,True,True,True,True,True,True,True,True,42.730586
74729,PAK1_PAK2,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,42.521324
34795,CREBBP_EP300,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,42.209763
9827,CUL4A_CUL4B,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,41.741009
27324,CCND1_CCND2,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,41.333923
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2265,CES3_NLGN1,False,False,False,False,False,False,0.000405,0.000111,0.00049,...,False,False,False,False,False,False,False,False,False,0.00112
2292,CES3_NLGN4X,False,False,False,False,False,False,0.000314,0.000245,0.000247,...,False,False,False,False,False,False,False,False,False,0.001102
2263,CES1_NLGN1,False,False,False,False,False,False,0.000466,0.000029,0.000026,...,False,False,False,False,False,False,False,False,False,0.000901
2280,CES1_NLGN3,False,False,False,False,False,False,0.000202,0.00005,0.000045,...,False,False,False,False,False,False,False,False,False,0.000504


In [9]:
# Save the human predictions table to CSV (the row index is not written)
predictions_df.to_csv('ens111_human_predictions.csv', index=False)

### 2) For Yeast pairs

In [14]:
####################################################
# 2) Setup Datasets, Feature Sets, and Classifier
####################################################
# Choose classifier type: 'XGB', 'RF', or 'LR'
class_mod = 'XGB'

# Unfitted template estimator; it is cloned for every
# (training dataset, feature set) combination further down.
if class_mod == 'XGB':
    base_classifier = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', XGBClassifier(
            n_estimators=600,
            random_state=8,
            learning_rate=0.1,
            colsample_bytree=0.5,
            use_label_encoder=False,
            eval_metric='logloss'
        ))
    ])
elif class_mod == 'RF':
    base_classifier = RandomForestClassifier(
        n_estimators=600, random_state=8,
        max_features=0.5, max_depth=3, min_samples_leaf=8
    )
elif class_mod == 'LR':
    base_classifier = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=1000))
    ])
else:
    # Without this guard an unknown value would leave `base_classifier`
    # undefined, or silently reuse one left over from an earlier cell run.
    raise ValueError(f"Unsupported class_mod {class_mod!r}; expected 'XGB', 'RF', or 'LR'.")

# Yeast training datasets: mapping dataset names to file paths.
sc_train_datasets = {
    'PPI SocInt': './data/ens111_yeast_Interactome.csv',
    'SL': './data/ens111_yeast_SL-deadly.csv',
    'GO BPO': './data/ens111_yeast_BPO.csv',
    'GO MFO': './data/ens111_yeast_MFO.csv',
    'GO CCO': './data/ens111_yeast_CCO.csv',
    'PPI Biogrid Phys.': './data/ens111_yeast_Biogrid.csv',
    'PPI Biogrid Y2H': './data/ens111_yeast_Biogrid-Y2H.csv',
    'Neg. GI': './data/ens111_yeast_SL.csv',
}

# Unseen pairs dataset (yeast) path
sc_all_pairs_path = './data/ens111_yeast_allFeatures.csv'


# Define feature sets: key is descriptive name, value is the list of feature columns
feature_sets = {
    f'Minimum Sequence Identity {class_mod}': ['min_sequence_identity'],
    f'Pairwise structure {class_mod}': pairStruct_features,
    f'PLM emb. dist. {class_mod}': plm_features,
    f'Pairwise {class_mod}': pairwise_features,
    f'Similarity Search {class_mod}': contSeqStruct_features,
    f'All {len(news_features)} features {class_mod}': news_features,
}

####################################################
# 3) Load All-Pairs Data (Using sorted_gene_pair as ID)
####################################################
all_pairs_df = pd.read_csv(sc_all_pairs_path)
if 'sorted_gene_pair' not in all_pairs_df.columns:
    raise ValueError("The unseen pairs dataset must contain the 'sorted_gene_pair' column.")

# Fail fast: this check does not depend on the training dataset, so run it once
# before any classifier is trained.
for feat_set_name, feat_columns in feature_sets.items():
    missing_all = set(feat_columns) - set(all_pairs_df.columns)
    if missing_all:
        raise ValueError(f"Missing features {missing_all} in unseen pairs dataset for feature set {feat_set_name}.")

# Prepare a DataFrame to store predictions (using sorted_gene_pair as the identifier)
predictions_df = all_pairs_df[['sorted_gene_pair']].copy()

####################################################
# 4) Train Classifiers, Predict, and Store Results
####################################################
for dataset_name, train_path in sc_train_datasets.items():
    # Load training dataset
    df_train = pd.read_csv(train_path)

    # Normalise the label column name to 'SL': 'same_func_ppi' takes precedence
    # when present, otherwise an existing 'SL' column is used as is.
    if 'same_func_ppi' in df_train.columns:
        df_train['SL'] = df_train['same_func_ppi']
    elif 'SL' not in df_train.columns:
        raise ValueError(f"Dataset {dataset_name} must contain a label column 'same_func_ppi' or 'SL'")

    # Ensure the training dataset has the 'sorted_gene_pair' column
    if 'sorted_gene_pair' not in df_train.columns:
        raise ValueError(f"Dataset {dataset_name} must contain the 'sorted_gene_pair' column.")

    # Lookup of known labels keyed by sorted_gene_pair
    # (if a pair occurs more than once, the last occurrence wins)
    train_pairs_dict = dict(zip(df_train['sorted_gene_pair'], df_train['SL']))

    # Loop over each feature set for the current training dataset
    for feat_set_name, feat_columns in feature_sets.items():
        col_name = f"{dataset_name} | {feat_set_name}"

        # Check that the training dataset contains the required features
        missing_train = set(feat_columns) - set(df_train.columns)
        if missing_train:
            raise ValueError(f"Missing features {missing_train} in training dataset {dataset_name} for feature set {feat_set_name}.")

        # Clone the template so every combination starts from an unfitted estimator
        classifier = clone(base_classifier)

        # Train the classifier using the selected feature set
        X_train = df_train[feat_columns]
        y_train = df_train['SL']
        classifier.fit(X_train, y_train)

        # Probability of the positive class for every row of the all-pairs data
        X_all = all_pairs_df[feat_columns]
        y_pred_proba = classifier.predict_proba(X_all)[:, 1]

        # Known label if the pair is in the training data, predicted probability
        # otherwise. Pairs and probabilities are matched by row position, so the
        # result does not depend on the DataFrame's index labels.
        final_preds = [
            train_pairs_dict.get(pair, proba)
            for pair, proba in zip(all_pairs_df['sorted_gene_pair'], y_pred_proba)
        ]

        # Store the results in the final DataFrame
        predictions_df[col_name] = final_preds

####################################################
# 5) Aggregate and Sort Predictions
####################################################
# Sum across all classifier columns to form a summary score
# (a True label adds 1 and a False label adds 0)
pred_cols = predictions_df.columns.difference(['sorted_gene_pair'])
predictions_df['sum_pred'] = predictions_df[pred_cols].sum(axis=1)

# Sort the DataFrame so that pairs with the highest overall score are at the top
predictions_df.sort_values('sum_pred', ascending=False, inplace=True)

# Optionally, drop the summary column if not needed
# predictions_df.drop(columns='sum_pred', inplace=True)
predictions_df


Unnamed: 0,sorted_gene_pair,PPI SocInt | Minimum Sequence Identity XGB,PPI SocInt | Pairwise structure XGB,PPI SocInt | PLM emb. dist. XGB,PPI SocInt | Pairwise XGB,PPI SocInt | Similarity Search XGB,PPI SocInt | All 36 features XGB,SL | Minimum Sequence Identity XGB,SL | Pairwise structure XGB,SL | PLM emb. dist. XGB,...,PPI Biogrid Y2H | Pairwise XGB,PPI Biogrid Y2H | Similarity Search XGB,PPI Biogrid Y2H | All 36 features XGB,Neg. GI | Minimum Sequence Identity XGB,Neg. GI | Pairwise structure XGB,Neg. GI | PLM emb. dist. XGB,Neg. GI | Pairwise XGB,Neg. GI | Similarity Search XGB,Neg. GI | All 36 features XGB,sum_pred
2781,YLR264W_YOR167C,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,47.848488
2606,YDR099W_YER177W,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,47.835587
1906,YDR312W_YHR066W,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,47.663315
638,YAL005C_YLL024C,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,47.579895
3573,YKL129C_YMR109W,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,46.906975
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2376,YLR240W_YNL267W,False,False,False,False,False,False,0.001046,0.000029,0.001468,...,False,False,False,0.0115,0.001005,0.001805,0.000881,0.027653,0.000694,5.537943
2476,YLL010C_YPL063W,False,False,False,False,False,False,0.001353,0.001328,0.000107,...,False,False,False,0.010101,0.013312,0.057685,0.011973,0.02776,0.024096,5.47057
1780,YDL126C_YKL197C,False,False,False,False,False,False,0.000968,0.003518,0.00035,...,False,False,False,0.0069,0.013777,0.00135,0.006195,0.001845,0.024338,5.254545
2318,YAL021C_YMR285C,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,3.99298


In [15]:
# Save the yeast predictions table to CSV (the row index is not written)
predictions_df.to_csv('ens111_yeast_predictions.csv', index=False)