In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import glob
import uuid
import shutil
import os
from pathlib import Path

import numpy as np
import pandas as pd

from plaid.evaluation._structure_metrics import calculate_rmsd
from plaid.evaluation._tmalign import run_tmalign
from plaid.evaluation._perplexity import RITAPerplexity

from plaid.utils._misc import (
    extract_avg_b_factor_per_residue,
    parse_sequence_from_structure,
)
from plaid.utils._protein_properties import calculate_df_protein_property_mp

In [37]:
sample_dir = Path("/data/lux70/plaid/artifacts/samples/5j007z42/val_dist/f989_o1326_l144_s3")
os.listdir(sample_dir)

['inverse_generated',
 'mmseqs_easycluster.m8_all_seqs.fasta',
 'phantom_generated',
 'latent.npz',
 'sample.yaml',
 'generated',
 'mmseqs_easycluster.m8_rep_seq.fasta',
 'mmseqs_easycluster.m8_cluster.tsv',
 'sample.log',
 'mmseqs_easysearch.m8']

In [50]:
os.listdir(sample_dir / "inverse_generated")

def sort_dict_values_by_key(d):
    # d: header : sequence
    idx = np.argsort(np.array(list(d.keys())))
    return np.array(list(d.values()))[idx]

In [51]:
from plaid.utils import read_sequences_from_fasta


gen_seqs = read_sequences_from_fasta(sample_dir / "generated" / "sequences.fasta")
inv_gen_seqs = read_sequences_from_fasta(sample_dir / "inverse_generated" / "sequences.fasta")
phan_gen_seqs = read_sequences_from_fasta(sample_dir / "phantom_generated" / "sequences.fasta")

gen_seqs = sort_dict_values_by_key(gen_seqs)
inv_gen_seqs = sort_dict_values_by_key(inv_gen_seqs)
phan_gen_seqs = sort_dict_values_by_key(phan_gen_seqs)


In [52]:
from plaid.utils import calc_sequence_recovery

ccsr = [calc_sequence_recovery(gen, inv) for (gen, inv) in zip(gen_seqs, inv_gen_seqs)]
scsr = [calc_sequence_recovery(gen, phan) for (gen, phan) in zip(gen_seqs, phan_gen_seqs)]

In [54]:
print(np.mean(ccsr), np.mean(scsr))

0.1474609375 0.15570746527777776


In [12]:
import warnings

warnings.filterwarnings("ignore")

# Gather paths; sort should guarantee that samples are in the same order
generated_pdb_paths = glob.glob(str(sample_dir / "generated/structures/*pdb"))
inverse_generated_pdb_paths = glob.glob(
    str(sample_dir / "inverse_generated/structures/*pdb")
)

generated_pdb_paths.sort()
inverse_generated_pdb_paths.sort()
assert (
    len(generated_pdb_paths) == len(inverse_generated_pdb_paths)
)

# maybe run self-consistency (if phantom generated structures were generated)
phantom_generated_pdb_paths = glob.glob(
    str(sample_dir / "phantom_generated/structures/*pdb")
)
run_self_consistency = len(phantom_generated_pdb_paths) > 0

if run_self_consistency:
    phantom_generated_pdb_paths.sort()
    assert (
        len(generated_pdb_paths) == len(phantom_generated_pdb_paths)
    )


# Initialize dataframe
d = {
    "pdb_paths": [],
    "sequences": [],
    "inverse_generated_pdb_paths": [],
}



if run_self_consistency:
    d["phantom_generated_pdb_paths"] = []

# parse sequence directly from structure to make sure there are no mismatches
print("Parsing sequences from structures")
for i, p in enumerate(generated_pdb_paths):
    d["pdb_paths"].append(p)
    
    d["inverse_generated_pdb_paths"].append(inverse_generated_pdb_paths[i])

    if run_self_consistency:
        d["phantom_generated_pdb_paths"].append(phantom_generated_pdb_paths[i])

    with open(p, "r") as f:
        pdbstr = f.read()

    sequence = parse_sequence_from_structure(pdbstr)
    d["sequences"].append(sequence)


Parsing sequences from structures


In [None]:
from plaid.utils import  read_sequences_from_fasta



In [15]:
df = pd.DataFrame(d)
df.head()

print("Calculating average pLDDT")
df["plddt"] = df.apply(
    lambda row: np.mean(extract_avg_b_factor_per_residue(row["pdb_paths"])),
    axis=1,
)

print("Calculating ccRMSD")
df["ccrmsd"] = df.apply(
    lambda row: calculate_rmsd(
        row["pdb_paths"], row["inverse_generated_pdb_paths"]
    ),
    axis=1,
)

print("Calculating cctm")
df["cctm"] = df.apply(
    lambda row: run_tmalign(
        row["pdb_paths"], row["inverse_generated_pdb_paths"]
    ),
    axis=1,
)

df["designable"] = df.ccrmsd < 2

# run self-consistency metrics, if applicable:
if run_self_consistency:
    print("Calculating scRMSD")
    df["scrmsd"] = df.apply(
        lambda row: calculate_rmsd(
            row["pdb_paths"], row["phantom_generated_pdb_paths"]
        ),
        axis=1,
    )

    print("Calculating sctm")
    df["sctm"] = df.apply(
        lambda row: run_tmalign(
            row["pdb_paths"], row["phantom_generated_pdb_paths"]
        ),
        axis=1,
    )


Calculating average pLDDT
Calculating ccRMSD
Calculating cctm
Calculating scRMSD
Calculating sctm


In [16]:
from plaid.utils import pdb_path_to_biotite_atom_array
from biotite.application.dssp import DsspApp

structure_atom_arrays = [pdb_path_to_biotite_atom_array(p) for p in df.pdb_paths]

In [21]:
import os
import shutil
from pathlib import Path
from typing import Optional


In [23]:
DSSP_PATH = get_mkdssp_path()

In [27]:
out = DsspApp.annotate_sse(structure_atom_arrays[0], DSSP_PATH)

In [29]:
dssp_annotation = "".join(out)

In [30]:
dssp_annotation

'CCCCCCCCCCCCCCCTTTGGGGSCCSSCCCCCCCCCCCCSSCCCCCCCCCCSSCCGGGSTTTTTTHHHHHHGGGCCCHHHHHHHHHHHHHHHHGGGSCSCCCCCHHHHHHHHHHHCGGGGCCCCCCCCCCCSCSCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCSCCCCCTTTTSCCCCCCCCCCSCGGGSCSCCCSSCCCCSCCCTTSCSSCCCCCCCCCTTHHHHHTTCCCCCHHHHHHHHHTTTCCCCC'

In [34]:
alpha_percentage = np.array([x == "C" for x in dssp_annotation]).sum() / len(dssp_annotation)
beta_percentage = np.array([x == "E" for x in dssp_annotation]).sum() / len(dssp_annotation)

In [35]:
print(coil_percentage, alpha_percentage)

0.6111111111111112 0.6111111111111112


In [None]:
beta_percentage = [c == "E" metrics["sample_pct_beta"] = mean([c == "E" for c in dssp_sample])
metrics["sample_pct_alpha"] = mean([c == "H" for c in dssp_sample])

In [36]:
df

Unnamed: 0,pdb_paths,sequences,inverse_generated_pdb_paths,phantom_generated_pdb_paths,plddt,ccrmsd,cctm,designable,scrmsd,sctm
0,/data/lux70/plaid/artifacts/samples/5j007z42/v...,ISKPSFQKAETSRQPSWWRAKRSCTGASAARRRVALASSSSCVHPT...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,34.819364,28.417275,0.24189,False,42.390841,0.16123
1,/data/lux70/plaid/artifacts/samples/5j007z42/v...,PSIDYSSARRQFERARKEKALASSTQAKPTDPMTTSKSPSTNLTQE...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,43.874387,22.557421,0.37681,False,38.96341,0.22098
2,/data/lux70/plaid/artifacts/samples/5j007z42/v...,QGLLKAKKCKAGFARGPCGEQHPKMEDLHNLSPENTRRIALPGRLP...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,42.526181,24.994765,0.30356,False,35.247336,0.19507
3,/data/lux70/plaid/artifacts/samples/5j007z42/v...,QVTMDTKAKLRRGLQRTPRGAKQEPAVRDPYAYNVERDHVIVPKSS...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,38.467386,26.599901,0.26867,False,51.295589,0.14298
4,/data/lux70/plaid/artifacts/samples/5j007z42/v...,DQPDEPPEEEGAPLKRRHLKPSPEELERELEAYCEGTSGMMEMVTG...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,33.902052,19.069064,0.38486,False,31.539156,0.21688
5,/data/lux70/plaid/artifacts/samples/5j007z42/v...,AYILPVEGPELKHRPQSLLTWTPQPTTVKHFPQSTWFMDRQHLATG...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,28.127815,15.904109,0.48461,False,31.062879,0.24741
6,/data/lux70/plaid/artifacts/samples/5j007z42/v...,TSLHAYETERFIRNKQEGYVQSPNGCNGSSGNGNPSNRGEQMLRPG...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,43.450781,20.468548,0.33499,False,19.364089,0.33499
7,/data/lux70/plaid/artifacts/samples/5j007z42/v...,MGAGHTKRPANSIGPISIPVGHSVFLHPRHNKNMRRRERFPSPFIA...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,28.882349,17.838684,0.31614,False,36.605656,0.1611
8,/data/lux70/plaid/artifacts/samples/5j007z42/v...,RGRNSQNKRRPAGFVVEVTGLSTTVASPEVKTGDQNQLHKQEEDPG...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,36.106119,25.021768,0.27634,False,34.074209,0.20795
9,/data/lux70/plaid/artifacts/samples/5j007z42/v...,FSSTEMGVVEPSPLPPGPSRGVRRDRRRDTLPRPIIVAPTPLSSDH...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,/data/lux70/plaid/artifacts/samples/5j007z42/v...,34.667633,21.954948,0.30571,False,26.24312,0.24404
