# PHIStruct: Improving phage-host interaction prediction at low sequence similarity settings using structure-aware protein embeddings

<b>Mark Edward M. Gonzales<sup>1, 2</sup>, Jennifer C. Ureta<sup>1, 2, 3</sup> & Anish M.S. Shrestha<sup>1, 2</sup></b>

<sup>1</sup> Bioinformatics Lab, Advanced Research Institute for Informatics, Computing and Networking, De La Salle University, Manila 1004, Philippines <br>
<sup>2</sup> Department of Software Technology, College of Computer Studies, De La Salle University, Manila 1004, Philippines <br>
<sup>3</sup> Walter and Eliza Hall Institute of Medical Research, Melbourne, Victoria, 3052, Australia

✉️ gonzales.markedward@gmail.com, jennifer.ureta@gmail.com, anish.shrestha@dlsu.edu.ph

<hr>

# 💡 Prerequisites

The prerequisite files (i.e., the results of evaluating the models' performance) &mdash; which are saved in `temp/results` &mdash; are already included in the cloned repository. <br>

<hr>

# Part I: Preliminaries

Import the necessary libraries and modules.

In [7]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import ConstantsUtil

%load_ext autoreload
%autoreload 2

In [8]:
# Display configuration: never truncate rows, and show up to 50 columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = 50

# Take no action (no warning, no error) on chained assignment.
pd.set_option("mode.chained_assignment", None)

In [9]:
# Project helper exposing shared constants; `constants.TEMP_RESULTS` is used below
# as the directory that holds the pickled evaluation results.
constants = ConstantsUtil.ConstantsUtil()

<hr>

# Part II: Compare the performance of the classifiers

Declare constant variables for readability of subsequent code.

In [13]:
# Positions used when indexing a model's stored results:
# `model_results[i][threshold][CLASS][metric]` holds the per-class scores,
# where `metric` is one of PRECISION, RECALL, or F1.
CLASS = 0
PRECISION, RECALL, F1 = range(3)

# NOTE(review): presumably negative offsets locating the true and predicted labels
# within a stored result; they are not referenced in this part of the notebook, so
# confirm against the code that writes the pickles.
Y_TEST, Y_PRED = -3, -1

#### Change the value of `SIMILARITY` as needed.

`SIMILARITY` refers to the maximum train-versus-test sequence similarity threshold.

In [15]:
SIMILARITY = 40


def _experiment_names(*prefixes):
    """Return each prefix with the shared `-eskapee-smotetomek-<SIMILARITY>` suffix.

    The resulting names are used later as the file stems of the pickled results.
    """
    return [f"{prefix}-eskapee-smotetomek-{SIMILARITY}" for prefix in prefixes]


# State-of-the-art phage-host interaction prediction tools
SOTA = _experiment_names(
    "boeckaerts",
    "phiembed",
    "badam",
    "blast",
    "psiblast",
    "saprot_relaxed_r3-mlp",
)

# Sequence-only protein language models (SaProt is listed last for comparison)
PLM = _experiment_names(
    "prott5-mlp",
    "esm2-mlp",
    "esm1b-mlp",
    "seqvec-mlp",
    "saprot_relaxed_r3-mlp",
)

# Structure-aware protein language models
STRUCT = _experiment_names(
    "prostt5_relaxed_r3-mlp",
    "prostt5_3di_relaxed_r3-mlp",
    "pst_relaxed_r3-mlp",
    "saprot_relaxed_r3-mlp",
)

# Masking strategies with respect to the SaProt encoding
MASKING = _experiment_names(
    "saprot_seq_mask_relaxed_r3-mlp",
    "saprot_struct_mask_relaxed_r3-mlp",
    "saprot_mask_relaxed_r3-mlp",
    "saprot_relaxed_r3-mlp",
)

# Downstream classifiers trained on the SaProt embeddings
ML = _experiment_names(
    "saprot_relaxed_r3-rf",
    "saprot_relaxed_r3-svm",
    "saprot_relaxed_r3-mlp",
)

#### Change the value of `models` as needed.

Choose from the constants defined in the code block above<br> (these constants pertain to our benchmarking experiments):
- `SOTA` - state-of-the-art phage-host interaction prediction tools
- `PLM` - sequence-only protein language models
- `STRUCT` - structure-aware protein language models
- `MASKING` - masking strategy (with respect to the SaProt encoding)
- `ML` - downstream classifier

In [17]:
# Benchmark group to tabulate; swap in PLM, STRUCT, MASKING, or ML as needed.
models = SOTA

Load the pickled files storing the results of evaluating the models' performance.

In [19]:
# One unpickled result object per entry of `models`, in the same order, so that
# `model_results[models.index(name)]` retrieves the results for `name`.
# NOTE: `pickle.load` can execute arbitrary code; only load result files you trust
# (e.g., the ones shipped with the repository).
model_results = []
for model in models:
    results_path = f"{constants.TEMP_RESULTS}/{model}.pickle"
    with open(results_path, "rb") as results_file:
        loaded = pickle.load(results_file)
    model_results.append(loaded)

In [20]:
# Class labels in alphabetical order.
# NOTE(review): the ordering presumably mirrors the class order of the stored
# per-class metric arrays, since labels are matched to scores by position; confirm.
labels = sorted(
    [
        "enterococcus",
        "staphylococcus",
        "klebsiella",
        "acinetobacter",
        "pseudomonas",
        "enterobacter",
        "escherichia",
        "others",
    ]
)


def get_macro_no_others(model, threshold, metric):
    """Macro-average one metric over every label except "others".

    Reads the module-level `models`, `model_results`, `labels`, and `CLASS`.

    Args:
        model: Name of the model; must be an entry of `models`.
        threshold: Index of the threshold entry within that model's results.
        metric: Index of the metric to average (PRECISION, RECALL, or F1).

    Returns:
        The sum of the per-label scores for all labels other than "others",
        divided by `len(labels) - 1` (i.e., "others" is assumed to appear
        exactly once in `labels`).
    """
    # Per-label scores of the requested metric, positionally aligned with `labels`.
    per_label_scores = model_results[models.index(model)][threshold][CLASS][metric]

    total = 0
    for position, label in enumerate(labels):
        if label == "others":
            continue
        total += per_label_scores[position]

    return total / (len(labels) - 1)

The shaded cells in the subsequent tables correspond to the highest scores (i.e., best performance in terms of the specified evaluation metric). 

⚠️ **Caveat**: The highest score is determined via lexicographical ordering of the values (i.e., the scores are treated as strings) &mdash; so do a quick sanity check!

In [22]:
print("Macro Recall")

# Keep the scores numeric so that `highlight_max` compares numbers rather than
# strings (as strings, "8.70%" would outrank "58.52%"). The percent formatting
# is applied only at display time via `Styler.format`.
# Rows follow `models`; the ten threshold indices map to the 0%..90% columns.
results = [
    [get_macro_no_others(model, threshold, RECALL) * 100 for threshold in range(0, 10)]
    for model in models
]

results_df = pd.DataFrame(
    results, columns=[f"{pct}%" for pct in range(0, 91, 10)], index=models
)
results_df.style.format("{:.2f}%").highlight_max(color="lightgreen", axis=0)

Macro Recall


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%
boeckaerts-eskapee-smotetomek-40,64.12%,48.78%,37.91%,32.31%,25.38%,20.86%,16.05%,13.36%,6.53%,2.55%
phiembed-eskapee-smotetomek-40,52.43%,39.69%,29.86%,21.97%,14.23%,8.70%,5.31%,2.54%,1.70%,1.24%
badam-eskapee-smotetomek-40,59.29%,58.45%,57.24%,56.69%,55.63%,54.87%,54.10%,53.03%,51.67%,49.70%
blast-eskapee-smotetomek-40,58.14%,58.14%,58.14%,58.14%,58.14%,58.14%,58.14%,58.14%,58.14%,58.14%
psiblast-eskapee-smotetomek-40,46.06%,46.06%,46.06%,46.06%,46.06%,46.06%,46.06%,46.06%,46.06%,46.06%
saprot_relaxed_r3-mlp-eskapee-smotetomek-40,63.09%,62.60%,61.09%,60.23%,59.30%,58.52%,57.69%,56.35%,55.00%,51.95%


In [23]:
print("Macro Precision")

# Keep the scores numeric so that `highlight_max` compares numbers rather than
# strings (string comparison ranks, e.g., "9.90%" above "57.48%"). The percent
# formatting is applied only at display time via `Styler.format`.
# Rows follow `models`; the ten threshold indices map to the 0%..90% columns.
results = [
    [get_macro_no_others(model, threshold, PRECISION) * 100 for threshold in range(0, 10)]
    for model in models
]

results_df = pd.DataFrame(
    results, columns=[f"{pct}%" for pct in range(0, 91, 10)], index=models
)
results_df.style.format("{:.2f}%").highlight_max(color="lightgreen", axis=0)

Macro Precision


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%
boeckaerts-eskapee-smotetomek-40,52.07%,63.82%,79.03%,85.65%,89.71%,89.68%,69.14%,70.45%,56.88%,42.86%
phiembed-eskapee-smotetomek-40,43.18%,53.66%,64.31%,72.68%,79.75%,83.93%,98.56%,85.71%,71.43%,42.86%
badam-eskapee-smotetomek-40,46.83%,47.26%,47.82%,48.60%,49.08%,50.25%,51.62%,52.62%,54.78%,58.35%
blast-eskapee-smotetomek-40,52.97%,52.97%,52.97%,52.97%,52.97%,52.97%,52.97%,52.97%,52.97%,52.97%
psiblast-eskapee-smotetomek-40,43.96%,43.96%,43.96%,43.96%,43.96%,43.96%,43.96%,43.96%,43.96%,43.96%
saprot_relaxed_r3-mlp-eskapee-smotetomek-40,56.62%,57.98%,58.31%,59.13%,59.90%,61.45%,62.79%,65.12%,67.50%,69.43%


In [24]:
print("Macro F1")

# Keep the scores numeric so that `highlight_max` compares numbers rather than
# strings (as strings, "9.90%" would outrank "57.48%"). The percent formatting
# is applied only at display time via `Styler.format`.
# Rows follow `models`; the ten threshold indices map to the 0%..90% columns.
results = [
    [get_macro_no_others(model, threshold, F1) * 100 for threshold in range(0, 10)]
    for model in models
]

results_df = pd.DataFrame(
    results, columns=[f"{pct}%" for pct in range(0, 91, 10)], index=models
)
results_df.style.format("{:.2f}%").highlight_max(color="lightgreen", axis=0)

Macro F1


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%
boeckaerts-eskapee-smotetomek-40,55.55%,52.63%,45.49%,40.69%,33.29%,28.17%,22.70%,19.77%,11.02%,4.65%
phiembed-eskapee-smotetomek-40,43.60%,42.91%,39.27%,31.87%,23.20%,15.46%,9.90%,4.78%,3.23%,2.33%
badam-eskapee-smotetomek-40,48.12%,47.93%,47.78%,47.96%,47.86%,48.18%,48.61%,48.68%,49.03%,49.46%
blast-eskapee-smotetomek-40,51.35%,51.35%,51.35%,51.35%,51.35%,51.35%,51.35%,51.35%,51.35%,51.35%
psiblast-eskapee-smotetomek-40,39.67%,39.67%,39.67%,39.67%,39.67%,39.67%,39.67%,39.67%,39.67%,39.67%
saprot_relaxed_r3-mlp-eskapee-smotetomek-40,57.15%,57.61%,57.22%,57.09%,57.09%,57.42%,57.48%,57.67%,57.44%,56.27%
