In [4]:
from __future__ import annotations

import sys
# Extend the module search path so the `cbutils` import below can be resolved.
# The path is relative, so it depends on the notebook's working directory.
sys.path.append("../utilities/")

from cbutils import aa_code, get_mpnn_seq, make_consensus_sequence, setup_aligner, alignment_to_mapping, mapping_to_sequence, mpnn_score

import os

# Set before colabdesign is imported below.
# NOTE(review): presumably this stops JAX/XLA from preallocating GPU memory at
# start-up; confirm against the JAX GPU memory allocation docs. If `cbutils`
# already initialises JAX, the setting may come too late - verify.
os.environ["XLA_PYTHON_CLIENT_PREALLOCATE"] = "false"

from tqdm.notebook import tqdm
import pandas as pd
from colabdesign.mpnn import mk_mpnn_model

In [5]:
# Input PDB files, keyed by a short label for each structure.
pdbs = dict(
    open="../pdbs/lpla/3a7r.pdb",
    closed="../pdbs/lpla/1x2g.pdb",
)

# Chain to use for scoring in each structure (same keys as `pdbs`).
chains = dict(
    open="A",
    closed="A",
)

In [None]:
# Input protein sequence (one-letter amino-acid codes). It is combined with
# the sequences read from the PDB files to build the consensus sequence.
# Counting from 1, residue 37 of this string is V, consistent with the
# "w37v" in the variable name.
lpla_w37v = "STLRLLISDSYDPWFNLAVEECIFRQMPATQRVLFLVRNADTVVIGRAQNPWKECNTRRMEEDNVRLARRSSGGGAVFHDLGNTCFTFMAGKPEYDKTISTSIVLNALNALGVSAEASGRNDLVVKTVEGDRKVSGSAYRETKDRGFHHGTLLLNADLSRLANYLNPDKKKLAAKGITSVRSRVTNLTELLPGITHEQVCEAITEAFFAHYGERVEAEIISPNKTPDLPNFAETFARQSSWEWNFGQAPAFSHLLDERFTWGGVELHFDVEKGHITRAQVFTDSLNPAPLEALAGRLQGCLYRADMLQQECEALLVDFPEQEKELRELSAWMAGAVR"

# Read the selected chain's sequence from every PDB file, then build one
# consensus sequence from the input sequence plus those structure sequences.
seqs = {label: get_mpnn_seq(path, chains[label]) for label, path in pdbs.items()}
con_seq = make_consensus_sequence([lpla_w37v, *seqs.values()])

# Align each structure sequence to the consensus; only the first alignment
# returned by the aligner is kept.
aligner = setup_aligner()
alignments = {
    label: aligner.align(con_seq, structure_seq)[0]
    for label, structure_seq in seqs.items()
}

# Per structure: mapping of positions from the consensus sequence to that
# structure's sequence, derived from its alignment.
mappings = {
    label: alignment_to_mapping(aln) for label, aln in alignments.items()
}

# Enumerate every single-residue substitution of the consensus sequence.
# `muts` holds labels of the form <old residue><1-based consensus position><new residue>;
# `mut_seqs` holds the corresponding full-length sequences, in the same order.
substitutions = [
    (pos, old, new)
    for pos, old in enumerate(con_seq)
    for new in aa_code
    if new != old
]
muts = [f"{old}{pos + 1}{new}" for pos, old, new in substitutions]
mut_seqs = [
    con_seq[:pos] + new + con_seq[pos + 1 :] for pos, _old, new in substitutions
]

# ProteinMPNN input options; these are passed to prep_inputs() for every structure.
homooligomer = False  # if structure is a homooligomer
fix_pos = None  # don't fix any positions
inverse = True  # whether to invert the fix_pos selection
model_name = "v_48_020"  # use default model checkpoint

# Initialise the ProteinMPNN model once per kernel so that re-running this cell
# does not reload the weights. The name of the loaded checkpoint is remembered
# in `_mpnn_model_name`; if `model_name` is edited and the cell is re-run, the
# model is rebuilt instead of silently reusing the previously loaded checkpoint.
if "mpnn_model" not in globals() or globals().get("_mpnn_model_name") != model_name:
    mpnn_model = mk_mpnn_model(model_name)
    _mpnn_model_name = model_name

# One row per single mutation; one score column is added per structure below.
output_data = pd.DataFrame({"mut": muts, "seq": mut_seqs})

# For each PDB file: prepare the model on that structure, score the consensus
# sequence mapped onto it (the "WT" reference), then score every mutant and
# store the difference from the WT score.
for structure in pdbs:
    mpnn_model.prep_inputs(
        pdb_filename=pdbs[structure],
        chain=chains[structure],
        homooligomer=homooligomer,
        fix_pos=fix_pos,
        inverse=inverse,
        verbose=True,
    )

    # Sequence and consensus->structure mapping for this structure.
    structure_seq = seqs[structure]
    structure_mapping = mappings[structure]

    wt_seq = mapping_to_sequence(con_seq, structure_seq, structure_mapping)
    wt_score = mpnn_score(wt_seq, mpnn_model)

    # Mutant score relative to WT, in the same order as `mut_seqs`.
    output_scores = [
        mpnn_score(
            mapping_to_sequence(mut_seq, structure_seq, structure_mapping),
            mpnn_model,
        )
        - wt_score
        for mut_seq in tqdm(mut_seqs)
    ]

    output_data[f"pmpnn_{structure}"] = output_scores

# Save output data (disabled; uncomment the next line to write the scores to CSV)
#output_data.to_csv("cb_lpla.csv", index=False)