# PHIStruct: Improving phage-host interaction prediction at low sequence similarity settings using structure-aware protein embeddings

<b>Mark Edward M. Gonzales<sup>1, 2</sup>, Jennifer C. Ureta<sup>1, 2, 3</sup> & Anish M.S. Shrestha<sup>1, 2</sup></b>

<sup>1</sup> Bioinformatics Lab, Advanced Research Institute for Informatics, Computing and Networking, De La Salle University, Manila 1004, Philippines <br>
<sup>2</sup> Department of Software Technology, College of Computer Studies, De La Salle University, Manila 1004, Philippines <br>
<sup>3</sup> Walter and Eliza Hall Institute of Medical Research, Melbourne, Victoria, 3052, Australia

✉️ gonzales.markedward@gmail.com, jennifer.ureta@gmail.com, anish.shrestha@dlsu.edu.ph

<hr>

# 💡 Prerequisites

The prerequisite files (i.e., the results of evaluating the models' performance) &mdash; which are saved in `temp/results` &mdash; are already included in the cloned repository. <br>

<hr>

# Part I: Preliminaries

Import the necessary libraries and modules.

In [7]:
import pickle

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import ConstantsUtil

%load_ext autoreload
%autoreload 2

In [8]:
# Display every row of a DataFrame, and at most 50 of its columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = 50

# Turn off the chained-assignment check.
pd.set_option("mode.chained_assignment", None)

In [9]:
# Project-wide constants; `constants.TEMP_RESULTS` is read when loading the pickled results.
constants = ConstantsUtil.ConstantsUtil()

<hr>

# Part II: Compare the performance of the classifiers

Declare constant variables for readability of subsequent code.

In [13]:
# Position of the per-class scores within each per-threshold result entry.
CLASS = 0

# Positions of the individual metrics within the per-class scores.
PRECISION, RECALL, F1, SUPPORT = range(4)

# Negative offsets into a result entry; presumably the ground-truth and
# predicted labels, respectively -- TODO confirm against the evaluation code.
Y_TEST, Y_PRED = -3, -1

#### Change the value of `SIMILARITY` as needed.

`SIMILARITY` refers to the maximum train-versus-test sequence similarity threshold.

In [15]:
# Maximum train-versus-test sequence similarity threshold.
SIMILARITY = 40


def _experiment_names(prefixes):
    """Append the shared dataset/resampling/similarity suffix to each prefix."""
    return [f"{prefix}-eskapee-smotetomek-{SIMILARITY}" for prefix in prefixes]


# Each group ends with the same SaProt + MLP entry.
SOTA = _experiment_names(
    [
        "boeckaerts",
        "phiembed",
        "badam",
        "blast",
        "psiblast",
        "saprot_relaxed_r3-mlp",
    ]
)

PLM = _experiment_names(
    [
        "prott5-mlp",
        "esm2-mlp",
        "esm1b-mlp",
        "seqvec-mlp",
        "saprot_relaxed_r3-mlp",
    ]
)

STRUCT = _experiment_names(
    [
        "prostt5_relaxed_r3-mlp",
        "prostt5_3di_relaxed_r3-mlp",
        "pst_relaxed_r3-mlp",
        "saprot_relaxed_r3-mlp",
    ]
)

MASKING = _experiment_names(
    [
        "saprot_seq_mask_relaxed_r3-mlp",
        "saprot_struct_mask_relaxed_r3-mlp",
        "saprot_mask_relaxed_r3-mlp",
        "saprot_relaxed_r3-mlp",
    ]
)

ML = _experiment_names(
    [
        "saprot_relaxed_r3-rf",
        "saprot_relaxed_r3-svm",
        "saprot_relaxed_r3-mlp",
    ]
)

#### Change the value of `models` as needed.

Choose from the constants defined in the code block above<br> (these constants pertain to our benchmarking experiments):
- `SOTA` - state-of-the-art phage-host interaction prediction tools
- `PLM` - sequence-only protein language models
- `STRUCT` - structure-aware protein language models
- `MASKING` - masking strategy (with respect to the SaProt encoding)
- `ML` - downstream classifier

In [17]:
# Group of experiments to compare: one of SOTA, PLM, STRUCT, MASKING, or ML.
models = SOTA

Load the pickled files storing the results of evaluating the models' performance.

In [19]:
# Unpickle one results object per experiment, in the same order as `models`,
# so that position i of `model_results` belongs to `models[i]`.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# result files that come from a trusted source.
model_results = []
for model in models:
    results_path = f"{constants.TEMP_RESULTS}/{model}.pickle"
    with open(results_path, "rb") as results_file:
        loaded = pickle.load(results_file)
    model_results.append(loaded)

In [20]:
# Class labels in alphabetical order; the position of a label in this list is
# used as the class index when reading the per-class scores.
labels = sorted(
    [
        "enterococcus",
        "staphylococcus",
        "klebsiella",
        "acinetobacter",
        "pseudomonas",
        "enterobacter",
        "escherichia",
        "others",
    ]
)


def get_weighted_no_others(model, threshold, metric):
    """Return the support-weighted average of a metric, ignoring "others".

    Args:
        model: Name of the experiment; it must be an element of ``models``,
            whose position selects the matching entry of ``model_results``.
        threshold: Index of the threshold entry within that model's results.
        metric: Index of the metric within the per-class scores
            (e.g., ``PRECISION``, ``RECALL``, or ``F1``).

    Returns:
        The sum of ``score * support`` over every label except ``"others"``,
        divided by the sum of the supports of those labels.
    """
    # Look the model up once instead of on every access.
    per_class = model_results[models.index(model)][threshold][CLASS]
    scores = per_class[metric]
    supports = per_class[SUPPORT]

    weighted_sum = 0
    support_sum = 0
    for position, genus in enumerate(labels):
        if genus == "others":
            continue

        weighted_sum += scores[position] * supports[position]
        support_sum += supports[position]

    return weighted_sum / support_sum

The shaded cells in the subsequent tables correspond to the highest scores (i.e., best performance in terms of the specified evaluation metric). 

⚠️ **Caveat**: When the scores are stored as strings (e.g., `"7.97%"`), the highest score is determined via lexicographical ordering rather than by numeric value (so `"7.97%"` would rank above `"63.66%"`) &mdash; so do a quick sanity check!

In [22]:
print("Weighted Recall")

# One row per model and one column per threshold index (0 to 9).
# The scores are kept numeric (rounded to the two decimal places that are
# displayed) so that highlight_max compares values rather than strings;
# comparing formatted strings would rank "7.97%" above "63.66%".
results = [
    [
        round(get_weighted_no_others(model, threshold, RECALL) * 100, 2)
        for threshold in range(0, 10)
    ]
    for model in models
]

results_df = pd.DataFrame(
    results, columns=[str(_) + "%" for _ in range(0, 91, 10)], index=models
)
# The percent sign is added only at display time.
results_df.style.format("{:.2f}%").highlight_max(color="lightgreen", axis=0)

Weighted Recall


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%
boeckaerts-eskapee-smotetomek-40,63.66%,42.16%,26.03%,17.87%,13.00%,10.31%,7.97%,6.42%,3.29%,1.30%
phiembed-eskapee-smotetomek-40,46.62%,31.21%,23.37%,15.84%,10.75%,7.65%,3.89%,1.93%,0.95%,0.51%
badam-eskapee-smotetomek-40,48.77%,47.15%,45.64%,44.66%,43.33%,42.03%,40.64%,38.87%,36.94%,32.76%
blast-eskapee-smotetomek-40,55.66%,55.66%,55.66%,55.66%,55.66%,55.66%,55.66%,55.66%,55.66%,55.66%
psiblast-eskapee-smotetomek-40,42.50%,42.50%,42.50%,42.50%,42.50%,42.50%,42.50%,42.50%,42.50%,42.50%
saprot_relaxed_r3-mlp-eskapee-smotetomek-40,64.23%,63.06%,62.02%,61.16%,59.96%,58.67%,57.69%,56.23%,53.83%,50.03%


In [23]:
print("Weighted Precision")

# One row per model and one column per threshold index (0 to 9).
# The scores are kept numeric (rounded to the two decimal places that are
# displayed) so that highlight_max compares values rather than strings;
# comparing formatted strings would rank "7.97%" above "63.66%".
results = [
    [
        round(get_weighted_no_others(model, threshold, PRECISION) * 100, 2)
        for threshold in range(0, 10)
    ]
    for model in models
]

results_df = pd.DataFrame(
    results, columns=[str(_) + "%" for _ in range(0, 91, 10)], index=models
)
# The percent sign is added only at display time.
results_df.style.format("{:.2f}%").highlight_max(color="lightgreen", axis=0)

Weighted Precision


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%
boeckaerts-eskapee-smotetomek-40,66.88%,73.83%,81.90%,87.07%,94.22%,94.22%,89.87%,91.20%,34.89%,32.26%
phiembed-eskapee-smotetomek-40,56.71%,67.11%,77.37%,83.98%,88.16%,93.50%,96.28%,98.70%,88.87%,25.24%
badam-eskapee-smotetomek-40,63.56%,63.88%,64.41%,65.07%,65.49%,66.20%,66.95%,67.28%,69.17%,70.06%
blast-eskapee-smotetomek-40,67.96%,67.96%,67.96%,67.96%,67.96%,67.96%,67.96%,67.96%,67.96%,67.96%
psiblast-eskapee-smotetomek-40,57.37%,57.37%,57.37%,57.37%,57.37%,57.37%,57.37%,57.37%,57.37%,57.37%
saprot_relaxed_r3-mlp-eskapee-smotetomek-40,64.99%,65.58%,65.91%,66.64%,67.23%,68.21%,69.23%,70.59%,72.83%,74.50%


In [24]:
print("Weighted F1")

# One row per model and one column per threshold index (0 to 9).
# The scores are kept numeric (rounded to the two decimal places that are
# displayed) so that highlight_max compares values rather than strings;
# comparing formatted strings would rank "7.97%" above "63.66%".
results = [
    [
        round(get_weighted_no_others(model, threshold, F1) * 100, 2)
        for threshold in range(0, 10)
    ]
    for model in models
]

results_df = pd.DataFrame(
    results, columns=[str(_) + "%" for _ in range(0, 91, 10)], index=models
)
# The percent sign is added only at display time.
results_df.style.format("{:.2f}%").highlight_max(color="lightgreen", axis=0)

Weighted F1


Unnamed: 0,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%
boeckaerts-eskapee-smotetomek-40,64.27%,51.83%,35.44%,24.19%,17.37%,14.03%,11.33%,9.53%,5.55%,2.39%
phiembed-eskapee-smotetomek-40,49.01%,40.85%,35.05%,26.03%,18.68%,13.93%,7.40%,3.71%,1.83%,0.96%
badam-eskapee-smotetomek-40,50.98%,49.91%,49.04%,48.51%,47.76%,47.04%,46.18%,44.83%,43.70%,39.93%
blast-eskapee-smotetomek-40,59.87%,59.87%,59.87%,59.87%,59.87%,59.87%,59.87%,59.87%,59.87%,59.87%
psiblast-eskapee-smotetomek-40,46.73%,46.73%,46.73%,46.73%,46.73%,46.73%,46.73%,46.73%,46.73%,46.73%
saprot_relaxed_r3-mlp-eskapee-smotetomek-40,63.41%,63.05%,62.72%,62.55%,62.19%,61.89%,61.70%,61.32%,60.46%,58.38%
