In [4]:
from esm.models.esmc import ESMC
from esm.sdk.api import ESMProtein, LogitsConfig
import os
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

In [5]:
# Protein sequence string that is wrapped in ESMProtein and embedded in the next cell.
protein_sequence = "MGQPGNGSAFLLAPNGSHAPDHDVTQERDEVWVVGMGIVMSLIVLAIVFGNVLVITAIAKFERLQTVTNYFITSLACADLVMGLAVVPFGAAHILMKMWTFGNFWCEFWTSIDVLCVTASIETLCVIAVDRYFAITSPFKYQSLLTKNKARVIILMVWIVSGLTSFLPIQMHWYRATHQEAINCYANETCCDFFTNQAYAIASSIVSFYVPLVIMVFVYSRVFQEAKRQLQKIDKSEGRFHVQNLSQVEQDGRTGHGLRRSSKFCLKEHKALKTLGIIMGTFTLCWLPFFIVNIVHVIQDNLIRKEVYILLNWIGYVNSGFNPLIYCRSPDFRIAFQELLCLRRSSLKAYGNGYSSNGNTGEQSGYHVEQEKENKLLCEDLPGTEDFVGHQGTVPSDNIDSQGRNCSTNDSLL"

In [7]:
# Embed the single example protein and inspect the raw model outputs.
protein = ESMProtein(sequence=protein_sequence)
# Prefer the GPU, but fall back to the CPU so this cell still runs on machines
# without CUDA instead of raising when the model is moved to the device.
client = ESMC.from_pretrained("esmc_300m").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
protein_tensor = client.encode(protein)
# sequence=True requests the sequence-track logits; return_embeddings=True also
# returns the embeddings that the pooling cell below averages.
logits_output = client.logits(
    protein_tensor, LogitsConfig(sequence=True, return_embeddings=True)
)
print(logits_output.logits, logits_output.embeddings)

ForwardTrackData(sequence=tensor([[[-38.0000, -38.0000, -38.0000,  ..., -38.0000, -38.0000, -38.0000],
         [-42.2500, -42.2500, -42.2500,  ..., -42.0000, -42.0000, -42.2500],
         [-39.7500, -39.5000, -39.7500,  ..., -39.5000, -39.5000, -39.7500],
         ...,
         [-40.0000, -40.0000, -40.0000,  ..., -40.0000, -40.0000, -40.0000],
         [-36.0000, -36.0000, -36.2500,  ..., -36.0000, -36.2500, -36.2500],
         [-35.2500, -35.2500, -35.5000,  ..., -35.2500, -35.5000, -35.5000]]],
       device='cuda:0', dtype=torch.bfloat16), structure=None, secondary_structure=None, sasa=None, function=None) tensor([[[ 0.0043,  0.0037,  0.0008,  ...,  0.0041, -0.0008, -0.0168],
         [ 0.0073,  0.0539,  0.0495,  ..., -0.0092, -0.0329, -0.0049],
         [-0.0188,  0.0335,  0.0407,  ..., -0.0523, -0.0595,  0.0023],
         ...,
         [-0.0084,  0.0286,  0.0094,  ...,  0.0132,  0.0004,  0.0340],
         [-0.0276, -0.0355, -0.0080,  ...,  0.0339, -0.0392,  0.0007],
         [ 0

In [8]:
# Collapse the example protein's embedding to a single vector by averaging over
# the first axis of embeddings[0] (presumably the token/position axis -- confirm),
# then restore a leading axis of size 1.
col_means = logits_output.embeddings[0].mean(dim=0)
tensor = col_means[None]

# Persist the pooled embedding to disk.
torch.save(tensor, 'esmc_embedding')


In [None]:
# TODO: turn this into a pipeline class and run every single-amino-acid mutation through it

In [10]:
def read_fasta(file_path):
    """Parse a FASTA file into a ``{header: sequence}`` dictionary.

    Every line is whitespace-stripped before it is inspected. A line starting
    with ``>`` opens a new record whose key is the full remainder of that line;
    all following non-header lines are concatenated to form its sequence.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        dict mapping each header (without the leading ``>``) to its joined
        sequence string. Lines that appear before the first header, and records
        whose header is empty, are not stored. If a header occurs more than
        once, the last occurrence's sequence is kept.
    """
    records = {}
    header = ""
    pieces = []

    with open(file_path, 'r') as handle:
        for raw_line in handle:
            text = raw_line.strip()

            # Sequence (or blank) line: accumulate it for the current record.
            if not text.startswith('>'):
                pieces.append(text)
                continue

            # Header line: store the record collected so far, then start anew.
            if header:
                records[header] = ''.join(pieces)
            header, pieces = text[1:], []

    # The final record has no following header to trigger its storage.
    if header:
        records[header] = ''.join(pieces)

    return records

# Try the parser on the dataset FASTA; `test` holds the resulting {header: sequence} dict.
test = read_fasta('data/jones.fasta')


In [11]:
def esmc_process(fasta_file_path, output_file_path, model_name="esmc_300m", device="cpu"):
    """Embed every record of a FASTA file with ESM C and save the pooled embeddings as CSV.

    Each sequence is encoded, passed through the model, and its embedding is
    averaged over the first axis of ``embeddings[0]`` to give one vector per
    record. The vectors are collected into a DataFrame and written to
    ``output_file_path``. The layout is intended to match the EvolvePro
    embedding CSV: row labels are the FASTA headers (mutations), columns are
    the embedding dimensions.

    Args:
        fasta_file_path: Path of the input FASTA file (read with ``read_fasta``).
        output_file_path: Path of the CSV file to write. Missing parent
            directories are created.
        model_name: Name passed to ``ESMC.from_pretrained``. Defaults to
            ``"esmc_300m"``.
        device: Device the model is moved to. Defaults to ``"cpu"``; pass
            ``"cuda"`` to run on a GPU.

    Returns:
        pandas.DataFrame indexed by FASTA header with one column per embedding
        dimension (the same table that is written to disk).
    """
    fasta_dict = read_fasta(fasta_file_path)

    # Create the destination directory *before* the expensive embedding loop so a
    # bad output location fails immediately rather than after all the compute.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    model = ESMC.from_pretrained(model_name).to(device)
    logits_config = LogitsConfig(sequence=True, return_embeddings=True)  # loop-invariant

    embedding_dict = {}  # header -> pooled embedding as a list of floats

    # Gradients are never needed for feature extraction; this is harmless if the
    # model call already disables them itself.
    with torch.no_grad():
        # tqdm appends its own ": " after desc, so desc carries no trailing colon.
        for mutation, sequence in tqdm(fasta_dict.items(), desc="Creating embeddings"):
            protein = ESMProtein(sequence=sequence)
            protein_tensor = model.encode(protein)
            output = model.logits(protein_tensor, logits_config)
            position_embeddings = output.embeddings[0]

            # NOTE(review): the mean covers every position the model returns; if
            # that includes special start/end tokens they contribute as well --
            # confirm this matches the reference pipeline.
            # .float() is a no-op for float32 and avoids averaging in reduced
            # precision should the model return a lower-precision dtype.
            mean_embedding = torch.mean(position_embeddings.float(), dim=0)
            embedding_dict[mutation] = mean_embedding.tolist()

    # One row per FASTA header, one column per embedding dimension.
    df = pd.DataFrame.from_dict(embedding_dict, orient='index')
    df.to_csv(output_file_path, index=True)

    return df

In [12]:
# Long-running: the recorded run covered 7,800 records at roughly 3.7 it/s and was
# interrupted before finishing, so no CSV was written by that run.
df = esmc_process('data/jones.fasta', 'data/jones_esmc_embeddings.csv')

Creating embeddings::   5%|▍         | 370/7800 [01:39<33:10,  3.73it/s]


KeyboardInterrupt: 