In [1]:
from protein_sequence import ProteinSequence
from mutation_strategy import MutationStrategy, MinLogitPosSub, BlosumWeightedSub
from evolution import Evolution
import esm
from Bio.Align import substitution_matrices

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Source: https://github.com/facebookresearch/esm/tree/main?tab=readme-ov-file#esmfold
# NOTE(review): the link above anchors at the README's ESMFold section, while the
# checkpoint name loaded here is an ESM-2 one ("esm2_t33_650M_UR50D") -- confirm
# which section was meant.
model, alphabet = esm.pretrained.load_model_and_alphabet("esm2_t33_650M_UR50D")
# Put the network in inference mode, as the ESM README quick-start does, so that
# any train-only behaviour (e.g. dropout) cannot make repeated runs differ.
# Kept off the last line of the cell so the model repr is not echoed as output.
model.eval()
# Converter obtained from the alphabet; it is handed to Evolution together with
# the model and alphabet in the cells below.
batch_converter = alphabet.get_batch_converter()

In [3]:
# Starting amino-acid sequence (one-letter codes) that every evolution run in
# this notebook is seeded with via ProteinSequence("base", reference_seq).
# NOTE(review): the later variable name `fmd_sequence` suggests a
# foot-and-mouth-disease protein -- record the accession/source here.
reference_seq = "TTSAGESADPVTATVENYGGETQVQRRQHTDIAFILDRFVKVKPKEQVNVLDLMQIPAHTLVGALLRTATYYFSDLELAVKHEGDLTWVPNGAPETALDNTTNPTAYHKEPLTRLALPYTAPHRVLATVYNGSSKYGDTSTNNVRGDLQVLAQKAERTLPTSFNFGAIKATRVTELLYRMKRAETYCPRPLLAIQPSDARHKQRIVAPAKQ"

In [4]:
# Side-by-side look at the two alphabets involved: the ESM token set and the
# symbols indexed by the BLOSUM80 matrix. Both objects are later passed to
# BlosumWeightedSub.
blosum80 = substitution_matrices.load("BLOSUM80")

# `standard_toks` is joined into a single string purely for compact display.
print(f"ESM alphabet: {''.join(alphabet.standard_toks)}")

# The slice discards the last four symbols of the matrix alphabet; the recorded
# output shows 20 remain. Presumably the dropped ones are the ambiguity codes
# and the stop symbol (B, Z, X, *) -- TODO confirm against the loaded matrix.
# NOTE(review): the label says "alphabet" but the value printed is a count.
print(f"Blosum alphabet: {len(blosum80.alphabet[:-4])}")

ESM alphabet: LAGVSERTIDPKQNFYMHWCXBUZO.-
Blosum alphabet: 20


In [11]:
# Set up an evolution run whose substitutions are chosen by BlosumWeightedSub.
# NOTE(review): this cell carries execution count 11 but sits above the cells
# numbered 7 and 8, so the notebook was run out of order -- re-check with
# Restart Kernel -> Run All.

# Wrap the reference sequence under the label "base".
fmd_sequence = ProteinSequence("base", reference_seq)

# Reloaded here so the cell does not depend on the alphabet-inspection cell.
blosum80 = substitution_matrices.load("BLOSUM80")

# NOTE(review): multiplier=0.5 is an unexplained magic number; its effect is
# defined inside BlosumWeightedSub, which is not visible from this notebook.
mutation_strat = BlosumWeightedSub(
    alphabet=alphabet,
    blosum_matrix=blosum80,
    multiplier=0.5,
)

evolution = Evolution(
    protein_sequence=fmd_sequence,
    mutation_strategy=mutation_strat,
    model=model,
    alphabet=alphabet,
    batch_converter=batch_converter,
)

In [12]:
# Run the BLOSUM-weighted evolution configured in the previous cell with steps=8.
# The recorded output below shows the call logging, per mutation, the rejected
# same-residue candidate, the fallback substitution and the mutated position.
# NOTE(review): what one "step" covers is defined in Evolution -- confirm there.
evolution.evolve_sequence(steps=8) 

Invalid amino acid candidate for mutation as it is the same as the current amino acid: S>S
Using the second best fit amino acid for this position: S>A
Position mutated = 139
Invalid amino acid candidate for mutation as it is the same as the current amino acid: A>A
Using the second best fit amino acid for this position: A>Q
Position mutated = 139
Invalid amino acid candidate for mutation as it is the same as the current amino acid: T>T
Using the second best fit amino acid for this position: T>V
Position mutated = 140
Invalid amino acid candidate for mutation as it is the same as the current amino acid: T>T
Using the second best fit amino acid for this position: T>S
Position mutated = 138
Invalid amino acid candidate for mutation as it is the same as the current amino acid: S>S
Using the second best fit amino acid for this position: S>A
Position mutated = 138
Invalid amino acid candidate for mutation as it is the same as the current amino acid: Q>Q
Using the second best fit amino acid fo

In [7]:
# Set up a second evolution run from the same reference sequence, this time
# with the MinLogitPosSub strategy.
# NOTE(review): this rebinds `fmd_sequence`, `mutation_strat` and `evolution`,
# the same names the BLOSUM-weighted cell assigns, so whichever cell ran last
# determines what `evolution` refers to. Distinct names (or a small helper
# taking the strategy as a parameter) would make the comparison reproducible.

# Fresh wrapper around the reference sequence, labelled "base".
fmd_sequence = ProteinSequence("base", reference_seq)

# The strategy is given only the ESM alphabet (positional argument).
mutation_strat = MinLogitPosSub(alphabet)

evolution = Evolution(
    protein_sequence=fmd_sequence,
    mutation_strategy=mutation_strat,
    model=model,
    alphabet=alphabet,
    batch_converter=batch_converter,
)

In [8]:
# Run the MinLogitPosSub evolution configured in the previous cell with steps=8.
# The recorded output below has the same shape as the BLOSUM-weighted run:
# rejected same-residue candidate, fallback substitution, mutated position.
# NOTE(review): what one "step" covers is defined in Evolution -- confirm there.
evolution.evolve_sequence(steps=8)   

Invalid amino acid candidate for mutation as it is the same as the current amino acid: S>S
Using the second best fit amino acid for this position: S>K
Position mutated = 139
Invalid amino acid candidate for mutation as it is the same as the current amino acid: N>N
Using the second best fit amino acid for this position: N>T
Position mutated = 141
Invalid amino acid candidate for mutation as it is the same as the current amino acid: T>T
Using the second best fit amino acid for this position: T>V
Position mutated = 140
Invalid amino acid candidate for mutation as it is the same as the current amino acid: T>T
Using the second best fit amino acid for this position: T>N
Position mutated = 138
Invalid amino acid candidate for mutation as it is the same as the current amino acid: K>K
Using the second best fit amino acid for this position: K>Q
Position mutated = 139
Invalid amino acid candidate for mutation as it is the same as the current amino acid: Q>Q
Using the second best fit amino acid fo