# ESM Embeddings

Given a sequence such as ABCDE, use ESM to get the embedding of each amino acid. Assume that the mutBPE tokenizer segments this sequence into AB CDE and that we know our mutator is able to mutate AB into AN. Then get the embeddings for the mutated sequence ANCDE as well.
- Compare the embeddings of N and B. We expect the change in the embedding to be smaller than for any "non-mutBPE" mutation of B. It is also possible to consider the score or the observed frequency of this mutation during the training of mutBPE: do the mutation scores or frequencies correlate positively with the change in the embedding vectors?
- Observe the changes in the embeddings of the other amino acids after the mutation, possibly only "up to a range" around the mutated position. How local or how global is the impact of this particular mutation?

In [1]:
import os
# Expose only physical GPU 1 to this process (it then shows up as device 0 below).
# This cell runs before the cell that imports torch/transformers.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
# Shared Hugging Face home directory (presumably where downloaded checkpoints are cached — confirm).
os.environ["HF_HOME"] = "/cta/share/users/esm"

In [2]:
from time import time
import sqlite3
import pandas as pd
from tqdm import tqdm
import numpy as np
from tokenizers import Tokenizer
import json
from collections import Counter
from transformers import AutoTokenizer, AutoModel
import torch
from tqdm import tqdm
import random
from pandarallel import pandarallel
from protein_embedding_database import ProteinEmbeddingDatabase
from EfficientBPE.vocabulary_functions import get_mutated, get_parents, set_difference, set_intersection, load_tokenizers, calc_agreement, calc_dice_idx_only

In [3]:
# Start pandarallel with 20 workers and progress bars enabled.
pandarallel.initialize(progress_bar=True, nb_workers=20)

INFO: Pandarallel will run on 20 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [4]:
# Sanity check: CUDA availability, number of visible GPUs, current device index, and name of device 0.
torch.cuda.is_available(), torch.cuda.device_count(), torch.cuda.current_device(), torch.cuda.get_device_name(0)

(True, 1, 0, 'NVIDIA RTX A6000')

In [5]:
# Device string used for the ESM model and its inputs; falls back to the CPU when CUDA is unavailable.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cuda:0'

## Get Embeddings

In [6]:
# ESM-2 checkpoints; the model id passed to `from_pretrained` is "facebook/<checkpoint name>".
# "Dim count" is the embedding width (the 150M checkpoint selected below yields 640-dim vectors).
# "Have DB" presumably marks checkpoints with a prebuilt Faiss embedding database — TODO confirm.
# Checkpoint name	    Num layers  Num parameters  Dim count   Have DB
# esm2_t48_15B_UR50D	48	        15B             -           No
# esm2_t36_3B_UR50D	    36          3B              -           No
# esm2_t33_650M_UR50D	33          650M            1280        Yes
# esm2_t30_150M_UR50D	30          150M            640         Yes
# esm2_t12_35M_UR50D	12          35M             480         No
# esm2_t6_8M_UR50D	    6           8M              320         Yes
model_name = "facebook/esm2_t30_150M_UR50D"

In [7]:
# def get_embeddings(text, model_name="facebook/esm2_t6_8M_UR50D"):
#     """
#     Compute embeddings for each token in the text using a specified model.
    
#     Parameters:
#     - text (str): The input text for which embeddings need to be computed.
#     - model_name (str): The path to the pretrained model.
    
#     Returns:
#     - numpy.ndarray: A matrix where each row is the embedding of a token in the text.
#     """
#     tokenizer = AutoTokenizer.from_pretrained(model_name)
#     model = AutoModel.from_pretrained(model_name)

#     # inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=1024)
#     inputs = tokenizer(text, return_tensors="pt")
#     with torch.no_grad():
#         outputs = model(**inputs)

#     # Return embeddings after removing <cls> and <eos> tokens and converting to numpy.
#     return outputs.last_hidden_state[:, 1:-1, :].squeeze(0).numpy()

def get_esm_embeddings_model(model_name):
    """Load the pretrained tokenizer and model for one checkpoint.

    Args:
        model_name (str): Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        tuple: ``(tokenizer, model)`` created by ``AutoTokenizer`` and ``AutoModel``.
    """
    # The tokenizer is loaded first, then the model (same order as the returned pair).
    return AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name)

def get_esm_embeddings(text, tokenizer, model):
    """Compute the per-token embeddings of one sequence.

    No truncation or padding is requested from the tokenizer, so ``text`` is
    expected to be a single sequence that fits the model (the notebook filters
    proteins to fewer than 1024 residues before embedding them).

    Args:
        text (str): Sequence to embed.
        tokenizer: Tokenizer, called as ``tokenizer(text, return_tensors="pt")``.
        model: Model whose output exposes a ``last_hidden_state`` tensor.

    Returns:
        numpy.ndarray: The last hidden state with its first and last positions
        (the <cls> and <eos> tokens) removed and the leading batch axis
        squeezed away.
    """
    # Send the inputs to the device the model actually lives on, so the function
    # does not silently depend on the notebook-level `device` matching the model.
    # Objects without a `.device` attribute keep the previous behaviour.
    target_device = getattr(model, "device", None)
    if target_device is None:
        target_device = device

    inputs = tokenizer(text, return_tensors="pt").to(target_device)
    with torch.no_grad():
        outputs = model(**inputs)

    # Drop <cls>/<eos> before the host copy so only the needed rows are transferred.
    return outputs.last_hidden_state[:, 1:-1, :].squeeze(0).cpu().numpy()

In [8]:
# Load the tokenizer and model once, then move the model to the selected device
# (the trailing `;` suppresses the model repr in the cell output).
esm_tokenizer, esm_model = get_esm_embeddings_model(model_name)
esm_model.to(device);

Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t30_150M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Smoke test: a 12-character sequence should yield 12 embedding rows of the model's width.
get_esm_embeddings("PVNPCCYYPCXA", esm_tokenizer, esm_model).shape

(12, 640)

## Load ESM Embeddings Faiss DB

In [10]:
# def load_protein_embedding_db(model_name, root_path="/cta/share/users/uniprot/human/faiss"):
#     faiss_path = f"{root_path}/{model_name.replace('/', '_')}_protein_embeddings.faiss"
#     id_map_path = f"{root_path}/{model_name.replace('/', '_')}_id_mapping.csv"
#     loaded_db = ProteinEmbeddingDatabase.load_database(faiss_path, id_map_path, model_name)

#     return loaded_db

# loaded_db = load_protein_embedding_db(model_name)

In [11]:
# def get_aminoacid_embedding(uniprot_id, aa_index):
#     return loaded_db.get_amino_acid_embedding(f"{uniprot_id}_{aa_index}")

# def get_protein_embedding(uniprot_id, sequence_len):
#     return np.array([loaded_db.get_amino_acid_embedding(f"{uniprot_id}_{i}")for i in range(sequence_len)])

# def get_protein_embedding_slice(uniprot_id, start_index, end_index):
#     return np.array([loaded_db.get_amino_acid_embedding(f"{uniprot_id}_{i}")for i in range(start_index, end_index)])

In [None]:
# %timeit get_aminoacid_embedding('F8WC80', 3)

## Load Datasets

In [13]:
# Connect to DB
# Read proteins and domain annotations from the local SQLite database.
db_file = "/cta/share/users/uniprot/human/human.db"
uniref_id = '50'

conn = sqlite3.connect(db_file)
try:
    # `uniref_id` is a notebook constant that selects a table name; table names
    # cannot be bound as SQL parameters, hence the f-string.
    df_protein = pd.read_sql(f"""SELECT Entry as uniprot_id, Sequence as sequence
                          FROM proteins
                          WHERE Entry IN (SELECT uniprot_accession FROM uniref{uniref_id}_distilled)""", conn)

    # InterPro entries of type 'domain', and TED entries with plddt >= 70.
    df_interpro_domain = pd.read_sql("SELECT uniprot_id, interpro_id as source, start_index, end_index FROM interpro_entries_v2 WHERE type='domain'", conn)
    df_ted = pd.read_sql("SELECT uniprot_id, ted_id as source, start_index, end_index FROM ted_entries_summary WHERE plddt >= 70", conn)
finally:
    # Always release the connection, even when one of the queries fails.
    conn.close()

# Keep only sequences shorter than 1024 residues.
df_protein = df_protein[df_protein['sequence'].str.len() < 1024].reset_index(drop=True)

In [14]:
# Combine the InterPro domain annotations with the TED annotations.
df_domains = pd.concat([df_interpro_domain, df_ted])
interpro_ids = df_domains.loc[df_domains["source"].str.startswith("IPR"), "uniprot_id"].unique() # uniprot_ids that have at least one InterPro ("IPR...") entry
df_domains = df_domains[~((df_domains["uniprot_id"].isin(interpro_ids)) & (df_domains["source"].str.startswith("AF")))] # for those uniprot_ids, drop the rows whose source starts with "AF" (the TED entries)
# Attach the full sequence; the inner join keeps only proteins that are present in df_protein.
df_domains = df_protein.set_index('uniprot_id').join(df_domains.set_index('uniprot_id'), how='inner').reset_index()
# Slice the domain out of the sequence; start_index is treated as 1-based and end_index as inclusive.
df_domains['domain_sequence'] = df_domains.apply(lambda row: row['sequence'][row['start_index']-1: row['end_index']], axis=1)
# Discard annotations whose slice is empty.
df_domains = df_domains[df_domains['domain_sequence'].str.len()>0].reset_index(drop=True)

# df_domains = df_domains[df_domains['source'].str.startswith('IPR')] # just keep interpro entries
# df_domains = df_domains[['uniprot_id', 'source', 'domain_sequence']].reset_index(drop=True)
df_domains

Unnamed: 0,uniprot_id,sequence,source,start_index,end_index,domain_sequence
0,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,AF-A0A087X1C5-F1-model_v4_TED01,32,333,RYPPGPLPLPGLGNLLHVDFQNTPYCFDQLRRRFGDVFSLQLAWTP...
1,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,AF-A0A087X1C5-F1-model_v4_TED01,344,362,VCPVRVQQEIDDVIGQVRR
2,A0A087X1C5,MGLEALVPLAMIVAIFLLLVDLMHRHQRWAARYPPGPLPLPGLGNL...,AF-A0A087X1C5-F1-model_v4_TED01,376,515,AVIHEVQHFGDIVPLGVTHMTSRDIEVQGFRIPKGTTLITNLSSVL...
3,A0A087X296,MSRSLLLWFLLFLLLLPPLPVLLADPGAPTPVNPCCYYPCQHQGIC...,IPR000742,31,69,PVNPCCYYPCQHQGICVRFGLDRYQCDCTRTGYSGPNCT
4,A0A0B4J2F0,MFRRLTFAQLLFATVLGIAGGVYIFQPVFEQYAKDQKELKEKMQLV...,AF-A0A0B4J2F0-F1-model_v4_TED01,30,54,EQYAKDQKELKEKMQLVQESEEKKS
...,...,...,...,...,...,...
101662,X6RL26,MQPMSFGWDHSLHKRKRLPPVKRSLVYYLKNREVRLQNETSYSRVL...,IPR056151,54,189,LPSLLKEREFHLGTLNKVFASQWLNHRQVVCGTKCNTLFVVDVQTS...
101663,X6RL45,MVRCYVEIVEKLPERRPDPATIEGCAQLKPNNYLLAWHTPFNEKGS...,AF-X6RL45-F1-model_v4_TED01,1,161,MVRCYVEIVEKLPERRPDPATIEGCAQLKPNNYLLAWHTPFNEKGS...
101664,X6RL83,MLQEWLAAVGDDYAAVVWRPEGEPRFYPDEEGPKHWTKERHQFLME...,AF-X6RL83-F1-model_v4_TED01,2,218,LQEWLAAVGDDYAAVVWRPEGEPRFYPDEEGPKHWTKERHQFLMEL...
101665,X6RLN4,EVKGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEV...,AF-X6RLN4-F1-model_v4_TED01,3,54,KGLFKSENCPKVISCEFAHNSNWYITFQSDTDAQQAFKYLREEVKT...


## Load Tokenizers

In [15]:
# Option keys and the values listed for each of them:
# 'dataset': {'uniref50', 'uniref90'}
# 'is_pretokenizer': {True, False}
# 'subs_matrix': {'blosum45', 'blosum62', 'pam70', 'pam250'}
# 'mutation_cutoff': {0.7, 0.8, 0.9}
# 'min_mutation_freq': {0, 0.05, 0.005}
# 'min_mutation_len': {3}
# 'max_mutation_len': {12}
# 'vocab_size': list=[800, 1600, 3200, 6400, 12800, 25600, 51200]

vocab_sizes = [800, 3200, 12800]
uniref_id = "50"


def _mutbpe_opts(subs_matrix, is_pretokenizer):
    """Return one mutBPE option set; only the substitution matrix and the pre-tokenizer flag vary."""
    return {
        'is_mut': True,
        'dataset': f'uniref{uniref_id}',
        'is_pretokenizer': is_pretokenizer,
        'subs_matrix': subs_matrix,
        'mutation_cutoff': 0.7,
        'min_mutation_freq': 0.05,
        'min_mutation_len': 3,
        'max_mutation_len': 12,
        'vocab_size': vocab_sizes
    }


# Non-mutating entry, currently disabled:
# {
#     'is_mut': False,
#     'dataset': f'uniref{uniref_id}',
#     'is_pretokenizer': False,
#     'vocab_size': vocab_sizes
# },
tokenizer_opts_list = [
    _mutbpe_opts('blosum62', False),
    _mutbpe_opts('pam70', False),
    _mutbpe_opts('blosum62', True),
]

In [16]:
# Load every configured tokenizer twice: as tokenizer objects ('hf', used below through
# get_vocab / encode_batch) and as raw vocabularies ('vocab', from which the parent and
# mutated token tables are derived).
tokenizer_list = load_tokenizers(tokenizer_opts_list, 'hf')
inner_vocab_list = load_tokenizers(tokenizer_opts_list, 'vocab')

# Token strings per tokenizer, ordered by token id. get_vocab() maps token -> id, so the
# tokens are already unique; sorting by id gives the same order on every kernel start,
# which a round-trip through set() does not.
vocab_list = {}
for name, tokenizer in tokenizer_list.items():
    token_to_id = tokenizer.get_vocab()
    vocab_list[name] = sorted(token_to_id, key=token_to_id.get)


inner_vocab_parents_list = {}
inner_vocab_mutated_list = {}
inner_vocab_family_list = {}
for k, v in inner_vocab_list.items():
    inner_vocab_parents_list[k] = get_parents(v)
    inner_vocab_mutated_list[k] = get_mutated(v)
    # One zero-initialised counter per parent token.
    inner_vocab_family_list[k] = dict.fromkeys(inner_vocab_parents_list[k].keys(), 0)

In [17]:
# Record on every parent token the list of its mutated variants under the 'mutations' key.
# Parents without any mutated variant never receive the key.
for tokenizer_name in tokenizer_list.keys():
    parent_entries = inner_vocab_parents_list[tokenizer_name]
    for mutated_token, mutated_token_attr in inner_vocab_mutated_list[tokenizer_name].items():
        mutations = parent_entries[mutated_token_attr['parent']].setdefault('mutations', [])
        # Skip tokens that are already recorded so that re-running this cell does not
        # append every mutation a second time.
        if mutated_token not in mutations:
            mutations.append(mutated_token)

In [18]:
# Tokenize every protein sequence with each tokenizer; the token lists are stored in a
# new df_protein column named after the tokenizer.
for name, tokenizer in tqdm(list(tokenizer_list.items())):
    encodings = tokenizer.encode_batch(df_protein['sequence'])
    df_protein[name] = [encoding.tokens for encoding in encodings]

100%|██████████| 9/9 [00:18<00:00,  2.07s/it]


In [19]:
# df_protein_domain_sequences = df_domains[['uniprot_id', 'sequence']].drop_duplicates()
# for name, tokenizer in tqdm(list(tokenizer_list.items())):
#     df_protein_domain_sequences[name] = [enc.tokens for enc in tokenizer.encode_batch(df_protein_domain_sequences['sequence'])]
# df_domains = df_domains.set_index(['uniprot_id','sequence']).join(df_protein_domain_sequences.set_index(['uniprot_id','sequence']), how='inner').reset_index()
# df_domains.head()

In [20]:
# Seed NumPy's global RNG so the 1000-protein subsample is reproducible
# (DataFrame.sample draws from the global NumPy RNG when no random_state is given).
np.random.seed(1)
df_protein_main = df_protein.sample(1000).reset_index(drop=True)

In [21]:
import random

def generate_alternative_token(token: str, mutated_token: str, tabu_list: list, alphabet: set) -> str:
    """
    Generate an alternative token based on two input tokens, avoiding tokens in tabu list.
    
    Args:
        token (str): The original token
        mutated_token (str): The mutated version of the token
        tabu_list (list): List of tokens to avoid
        alphabet (set): Set containing all possible characters
    
    Returns:
        str: A valid alternative token
    
    Raises:
        ValueError: If token and mutated_token have different lengths
        ValueError: If no valid alternative token can be generated after 100 attempts
    """
    if len(token) != len(mutated_token):
        raise ValueError("Token and mutated_token must have the same length")
    
    max_attempts = 100
    attempt = 0
    
    while attempt < max_attempts:
        # Initialize list to store characters of alternative token
        alternative_chars = []
        
        # Generate alternative token character by character
        for t, m in zip(token, mutated_token):
            if t == m:
                # If characters are same in both tokens, use that character
                alternative_chars.append(t)
            else:
                # If characters differ, randomly choose from alphabet
                alternative_chars.append(random.choice(''.join(alphabet-{t,m})))
        
        # Convert character list to string
        alternative = ''.join(alternative_chars)
        
        # Check if generated token is not in tabu list
        if alternative not in tabu_list:
            return alternative
            
        attempt += 1
    
    return mutated_token
    raise ValueError("Could not generate a valid alternative token after 100 attempts")

# Example usage, with the alphabet restricted to six letters
token = "hello"
mutated_token = "heppo"
tabu_list = ["hello", "heppo", "helpo"]
alphabet = set("hepolj")

alternative = generate_alternative_token(token, mutated_token, tabu_list, alphabet)
# Only positions 2 and 3 differ (l vs p); each becomes one of h/e/o/j, e.g. "hehoo" or "hejeo"
print(alternative)

hehoo


In [22]:
def generate_mutated_alternative_token_set(token_set, tokenizer_name, random_seed=42):
    """
    Build a "mutated" and an "alternative" token list for one tokenized sequence.

    Per token:
      * Parent token with recorded mutations: the mutated token is its first
        mutation; the alternative comes from ``generate_alternative_token`` with
        the token and all of its mutations as tabu list.
      * Mutated token whose parent has more than one mutation: the mutated token
        is the first of the parent's first two mutations that differs from the
        token; the tabu list is the parent plus all of its mutations.
      * Anything else: the token is kept unchanged in both lists.

    The global ``random`` generator is re-seeded with ``random_seed`` on every call.

    Args:
        token_set: Tokens of one sequence, in order.
        tokenizer_name: Key into ``inner_vocab_parents_list`` / ``inner_vocab_mutated_list``.
        random_seed (int): Seed passed to ``random.seed``.

    Returns:
        tuple: ``(mutated_token_set, alternative_token_set)``, two lists with one
        entry per input token.
    """
    random.seed(random_seed)
    # NOTE(review): besides the 20 standard residues this includes U, O, X, B, Z and J —
    # confirm the downstream embedding tokenizer has a token for each of them.
    alphabet = set("ARNDCEQGHILKMFPSTWYVUOXBZJ")
    parent_entries = inner_vocab_parents_list[tokenizer_name]
    mutated_entries = inner_vocab_mutated_list[tokenizer_name]

    mutated_token_set, alternative_token_set = [], []
    for token in token_set:
        # Default: the token is left untouched in both outputs.
        mutated_token = alternative_token = token
        tabu_list = None

        if token in parent_entries:
            if 'mutations' in parent_entries[token]:
                own_mutations = parent_entries[token]['mutations']
                mutated_token = own_mutations[0]
                tabu_list = [token] + own_mutations
        elif token in mutated_entries:
            parent_token = mutated_entries[token]['parent']
            siblings = parent_entries[parent_token]['mutations']
            if len(siblings) > 1:
                # Swap in a sibling mutation, never the token itself.
                mutated_token = siblings[1] if siblings[0] == token else siblings[0]
                tabu_list = [parent_token] + siblings

        if tabu_list is not None:
            alternative_token = generate_alternative_token(token, mutated_token, tabu_list, alphabet)

        mutated_token_set.append(mutated_token)
        alternative_token_set.append(alternative_token)
    return mutated_token_set, alternative_token_set

In [23]:
# For every sampled protein and every tokenizer: the (mutated tokens, alternative tokens) pair.
mut_alt_by_row = df_protein_main.apply(
    lambda row: {
        tokenizer_name: generate_mutated_alternative_token_set(row[tokenizer_name], tokenizer_name)
        for tokenizer_name in tokenizer_list.keys()
    },
    axis=1,
)
mut_alt_pairs = pd.DataFrame.from_dict(list(mut_alt_by_row))

# Split each pair column into "<tokenizer> mutated" and "<tokenizer> alternative".
split_frames = []
for col in mut_alt_pairs.columns:
    pair_columns = mut_alt_pairs[col].apply(pd.Series).rename(columns={0:'mutated', 1:'alternative'})
    split_frames.append(pair_columns.add_prefix(f"{col} "))
df_protein_mut_alt = pd.concat(split_frames, axis=1)
df_protein_mut_alt.head()

Unnamed: 0,mutBPE blosum62 0.7 0.05 800 mutated,mutBPE blosum62 0.7 0.05 800 alternative,mutBPE blosum62 0.7 0.05 3200 mutated,mutBPE blosum62 0.7 0.05 3200 alternative,mutBPE blosum62 0.7 0.05 12800 mutated,mutBPE blosum62 0.7 0.05 12800 alternative,mutBPE pam70 0.7 0.05 800 mutated,mutBPE pam70 0.7 0.05 800 alternative,mutBPE pam70 0.7 0.05 3200 mutated,mutBPE pam70 0.7 0.05 3200 alternative,mutBPE pam70 0.7 0.05 12800 mutated,mutBPE pam70 0.7 0.05 12800 alternative,mutBPE pre blosum62 0.7 0.05 800 mutated,mutBPE pre blosum62 0.7 0.05 800 alternative,mutBPE pre blosum62 0.7 0.05 3200 mutated,mutBPE pre blosum62 0.7 0.05 3200 alternative,mutBPE pre blosum62 0.7 0.05 12800 mutated,mutBPE pre blosum62 0.7 0.05 12800 alternative
0,"[F, PH, I, GG, GL, Y, YG, F, FL, Y, SK, TW, NI...","[F, PH, I, GG, GL, Y, YG, F, FL, Y, SK, TW, NI...","[F, PHM, GG, SIY, YG, FLI, Y, SK, TW, NI, GLI,...","[F, PHT, GG, EQY, YG, FJG, Y, SK, TW, NI, GXI,...","[F, PHM, GG, SIY, YG, FLI, YKR, TW, NI, GLI, P...","[F, PHT, GG, EQY, YG, FJG, YGG, TW, NI, GYI, P...","[F, PH, I, GG, GL, Y, YG, F, FL, Y, SK, TW, NI...","[F, PH, I, GG, GL, Y, YG, F, FL, Y, SK, TW, NI...","[F, PH, I, GG, TLY, YG, FFM, YTK, TW, NI, GII,...","[F, PH, I, GG, CLY, YG, FFE, YQK, TW, NI, GII,...","[F, PHM, GG, TLY, YG, FFM, YTK, TW, NI, GII, P...","[F, PHT, GG, ELY, YG, FFQ, YLK, TW, NI, GII, P...","[F, PH, I, GG, GL, Y, YG, FLI, Y, SK, TW, NI, ...","[F, PH, I, GG, GL, Y, YG, FCE, Y, SK, TW, NI, ...","[F, PHM, GG, GL, Y, YG, FLI, YAQ, TW, NI, GLI,...","[F, PHT, GG, GL, Y, YG, FEQ, YLR, TW, NI, GXI,...","[F, PHM, GG, SIY, YG, FLI, YAQ, TW, NI, GLI, P...","[F, PHT, GG, EQY, YG, FJG, YXV, TW, NI, GYI, P..."
1,"[M, SAG, ST, H, AQ, TP, RL, PD, AV, A, PR, SG,...","[M, SCG, ST, H, AQ, TP, RL, PD, AV, A, PR, SG,...","[M, SAG, ST, H, AQ, TP, RL, PD, AIA, PR, SG, I...","[M, SCG, ST, H, AQ, TP, RL, PD, AEA, PR, SG, Q...","[M, SAG, ST, HTK, SPRI, PD, AIA, PKAG, ILQ, RQ...","[M, SCG, ST, HEN, LPRG, PD, AXA, PGYG, JLQ, RQ...","[M, TSG, ST, H, AQ, TP, RL, PD, AV, A, PR, SG,...","[M, CEG, ST, H, AQ, TP, RL, PD, AV, A, PR, SG,...","[M, TSG, ST, HTQ, TP, RL, PD, AIA, PR, SG, MLQ...","[M, CEG, ST, HQQ, TP, RL, PD, ALA, PR, SG, GLQ...","[M, TSG, ST, HTQ, TP, RL, PD, AIA, PKPG, MLQ, ...","[M, CEG, ST, HQQ, TP, RL, PD, ALA, PVGG, XLQ, ...","[M, SAG, ST, H, AQ, TP, RL, PD, AV, A, PR, SG,...","[M, SCG, ST, H, AQ, TP, RL, PD, AV, A, PR, SG,...","[M, SAG, ST, H, AQ, TP, RL, PD, AIA, PR, SSLI,...","[M, SCG, ST, H, AQ, TP, RL, PD, AEA, PR, SQLJ,...","[M, SAG, ST, HAE, SPRI, PD, AIA, PR, SSLI, QRE...","[M, SCG, ST, HAL, GPRX, PD, AXA, PR, SYLJ, QRD..."
2,"[M, SEI, TR, SLI, QR, W, GA, SF, RR, GA, D, FD...","[M, SET, TR, SLE, QR, W, GA, SF, RR, GA, D, FD...","[M, SEI, TR, SLI, QR, W, GA, SF, RR, GA, EYD, ...","[M, SET, TR, SLE, QR, W, GA, SF, RR, GA, QLD, ...","[M, SEI, TR, SLI, QR, WSA, SF, RRSA, EYD, AYG,...","[M, SET, TR, SLE, QR, WQA, SF, RRLA, RGD, XDG,...","[M, SDV, TR, TLL, QR, W, GA, SF, RR, GA, D, FD...","[M, STV, TR, ELL, QR, W, GA, SF, RR, GA, D, FD...","[M, SDV, TR, TLL, QR, WAT, SF, RR, GA, EFN, TW...","[M, STV, TR, ELL, QR, WQL, SF, RR, GA, RFV, XW...","[M, SDV, TR, TLL, QR, WAT, SF, RRAT, EFN, TWG,...","[M, STV, TR, ELL, QR, WQL, SF, RRVX, VFP, LWG,...","[M, SEI, TR, SLI, QR, W, GA, SF, RR, G, AD, FD...","[M, SET, TR, SLE, QR, W, GA, SF, RR, G, AD, FD...","[M, SEI, TR, SLI, QR, W, GA, SF, RKG, AD, FD, ...","[M, SET, TR, SLE, QR, W, GA, SF, RLG, AD, FD, ...","[M, SEI, TR, SLI, QR, WSA, SF, RKG, AD, FD, AY...","[M, SET, TR, SLE, QR, WQA, SF, RLG, AD, FD, GG..."
3,"[MA, ESN, PSI, DI, Q, VL, H, DL, RQ, RF, P, EI...","[MA, TSY, PKJ, DI, Q, VL, H, DL, RQ, RF, P, EI...","[MA, ESN, PSI, DI, QIL, HSI, RQ, RF, PQI, PDG,...","[MA, TSY, PKJ, DI, QGL, HGX, RQ, RF, PPI, PLG,...","[MA, ESN, PSI, DI, QIL, HSI, RQ, RF, PQI, PDG,...","[MA, TSY, PKJ, DI, QGL, HGX, RQ, RF, PPI, PLG,...","[MA, EST, P, QL, DI, DVL, H, DL, RQ, RF, P, EI...","[MA, TSE, P, QL, DI, KVL, H, DL, RQ, RF, P, EI...","[MA, EST, PQM, DI, DVL, QDM, RQ, RF, PEM, SDG,...","[MA, TSE, PQQ, DI, LVL, VDX, RQ, RF, PEX, YLG,...","[MA, EST, PQM, DI, DVL, QDM, RQ, RF, PEM, SDG,...","[MA, TSE, PQQ, DI, LVL, VDX, RQ, RF, PEX, YLG,...","[MA, ESN, PSI, DI, Q, VL, H, DL, RQ, R, FP, EI...","[MA, TSY, PKJ, DI, Q, VL, H, DL, RQ, R, FP, EI...","[MA, ESN, PSI, DI, QIL, HSI, RSK, FP, EI, PDG,...","[MA, TSY, PKJ, DI, QGL, HGX, RDL, FP, EI, PYG,...","[MA, ESN, PSI, DI, QIL, HSI, RSK, FP, EI, PDG,...","[MA, TSY, PKJ, DI, QGL, HGX, RDL, FP, EI, PYG,..."
4,"[MR, FT, F, PL, MA, I, VL, EI, A, MI, VL, F, G...","[MR, FT, F, PL, MA, I, VL, EI, A, MI, VL, F, G...","[MR, FT, F, PL, MAM, VL, EI, ALI, VL, FSI, FV,...","[MR, FT, F, PL, MAT, VL, EI, AEI, VL, FQJ, FV,...","[MR, FTY, PL, MAM, VL, EI, ALI, VL, FSI, FV, E...","[MR, FTT, PL, MAE, VL, EI, AQI, VL, FLG, FV, E...","[MR, FT, F, PL, MA, I, VL, EI, A, MI, VL, F, G...","[MR, FT, F, PL, MA, I, VL, EI, A, MI, VL, F, G...","[MR, FT, FTL, MAM, VL, DLA, MI, VL, FTL, FV, E...","[MR, FT, FCL, MAE, VL, QJA, MI, VL, FVL, FV, E...","[MR, FT, FTL, MAM, VL, DLA, MI, VL, FTL, FV, E...","[MR, FT, FCL, MAE, VL, QJA, MI, VL, FVL, FV, E...","[MR, FT, F, PL, MA, I, VL, EI, A, MI, VL, F, G...","[MR, FT, F, PL, MA, I, VL, EI, A, MI, VL, F, G...","[MR, FT, FPI, MAM, VL, EI, ALI, VL, FSI, FV, E...","[MR, FT, FPC, MAE, VL, EI, AQI, VL, FLG, FV, E...","[MR, FT, FPI, MAM, VL, EI, ALI, VL, FSI, FV, E...","[MR, FT, FPC, MAE, VL, EI, AQI, VL, FLG, FV, E..."


In [24]:
# Glue every token list back together into a plain sequence string.
df_protein_mut_alt_seqs = df_protein_mut_alt.map(''.join)
df_protein_mut_alt_seqs.head()

Unnamed: 0,mutBPE blosum62 0.7 0.05 800 mutated,mutBPE blosum62 0.7 0.05 800 alternative,mutBPE blosum62 0.7 0.05 3200 mutated,mutBPE blosum62 0.7 0.05 3200 alternative,mutBPE blosum62 0.7 0.05 12800 mutated,mutBPE blosum62 0.7 0.05 12800 alternative,mutBPE pam70 0.7 0.05 800 mutated,mutBPE pam70 0.7 0.05 800 alternative,mutBPE pam70 0.7 0.05 3200 mutated,mutBPE pam70 0.7 0.05 3200 alternative,mutBPE pam70 0.7 0.05 12800 mutated,mutBPE pam70 0.7 0.05 12800 alternative,mutBPE pre blosum62 0.7 0.05 800 mutated,mutBPE pre blosum62 0.7 0.05 800 alternative,mutBPE pre blosum62 0.7 0.05 3200 mutated,mutBPE pre blosum62 0.7 0.05 3200 alternative,mutBPE pre blosum62 0.7 0.05 12800 mutated,mutBPE pre blosum62 0.7 0.05 12800 alternative
0,FPHIGGGLYYGFFLYSKTWNIGIIPPVAKERTAFMGYAPPGGKKSFWG,FPHIGGGLYYGFFLYSKTWNIGIIPPVAKTYTAFMGYAPPGGKKSFWG,FPHMGGSIYYGFLIYSKTWNIGLIPPVAKERTAFIGYAPPGGKKSFWG,FPHTGGEQYYGFJGYSKTWNIGXIPPVAKVDTAFLGYAPPGGKKSFWG,FPHMGGSIYYGFLIYKRTWNIGLIPPSAKERTAFIGYAPPNGKKSFWG,FPHTGGEQYYGFJGYGGTWNIGYIPPLAKDZTAFLGYAPPHGKKSFWG,FPHIGGGLYYGFFLYSKTWNIGIIPPVAKKKTAFMGYAPPGGKKSFWG,FPHIGGGLYYGFFLYSKTWNIGIIPPVAKKKTAFMGYAPPGGKKSFWG,FPHIGGTLYYGFFMYTKTWNIGIIPPVARKRTAYLGYAPPGGKKSFWG,FPHIGGCLYYGFFEYQKTWNIGIIPPVALKVTAGXGYAPPGGKKSFWG,FPHMGGTLYYGFFMYTKTWNIGIIPPVARKRTAYLGYAPPGGKKSFWG,FPHTGGELYYGFFQYLKTWNIGIIPPVAVKGTAGYGYAPPGGKKSFWG,FPHIGGGLYYGFLIYSKTWNIGIIPPVAAKKTAFMGYAPPGGKKSFWG,FPHIGGGLYYGFCEYSKTWNIGIIPPVALKKTAFMGYAPPGGKKSFWG,FPHMGGGLYYGFLIYAQTWNIGLIPPVAAKKTAFIGYAPPGGKKSFWG,FPHTGGGLYYGFEQYLRTWNIGXIPPVALKKTAFEGYAPPGGKKSFWG,FPHMGGSIYYGFLIYAQTWNIGLIPPSAAKKTAFIGYAPPNGKKSFWG,FPHTGGEQYYGFJGYXVTWNIGYIPPLAYKKTAFZGYAPPLGKKSFWG
1,MSAGSTHAQTPRLPDAVAPRSGLLQRQSPIRDASRGSRWVEGVKKA...,MSCGSTHAQTPRLPDAVAPRSGLLQRQYPQRDASRGSRWVEGVKKA...,MSAGSTHAQTPRLPDAIAPRSGILQRQSPIRQAARGSRWVEGIRRA...,MSCGSTHAQTPRLPDAEAPRSGQLQRQLPGRVAXRGSRWVEGYLYA...,MSAGSTHTKSPRIPDAIAPKAGILQRQSPIRQAARGSRWVEGIRRA...,MSCGSTHENLPRGPDAXAPGYGJLQRQYPTRLAHRGSRWVEGNWIA...,MTSGSTHAQTPRLPDAVAPRSGLLQRQKAMRDASRGSRWVEGVKKA...,MCEGSTHAQTPRLPDAVAPRSGLLQRQKQJRDASRGSRWVEGVKKA...,MTSGSTHTQTPRLPDAIAPRSGMLQRQKAMRDATRGSRWVDAVKKT...,MCEGSTHQQTPRLPDALAPRSGGLQRQKGXRDAYRGSRWVLEVKKZ...,MTSGSTHTQTPRLPDAIAPKPGMLQRQKAMRNATRGSRWVDAVKKT...,MCEGSTHQQTPRLPDALAPVGGXLQRQKYJRYAZRGSRWVLHVKKN...,MSAGSTHAQTPRLPDAVAPRSGLLQRQSPIRDAARGSRWVEGVKKA...,MSCGSTHAQTPRLPDAVAPRSGLLQRQYPQRDALRGSRWVEGVKKA...,MSAGSTHAQTPRLPDAIAPRSSLIQRESPIRDIARGSRWVEGIKKS...,MSCGSTHAQTPRLPDAEAPRSQLJQRRGPXRDYLRGSRWVEGEKKZ...,MSAGSTHAESPRIPDAIAPRSSLIQRESPIRDIARGSRWVEGIKKS...,MSCGSTHALGPRXPDAXAPRSYLJQRDZPJRDHWRGSRWVEGSKKK...
2,MSEITRSLIQRWGASFRRGADFDSWGQLVEAMDEYQILARHLQKEA...,MSETTRSLEQRWGASFRRGADFDSWGQLVEAQDEYQILARHLQKEA...,MSEITRSLIQRWGASFRRGAEYDAYGQLIEAMEDYQVIARHLQKEA...,MSETTRSLEQRWGASFRRGAQLDGGGQLXEAYLYYQZJARHLQKEA...,MSEITRSLIQRWSASFRRSAEYDAYGQLIEAMEDYQVIARHLQKEA...,MSETTRSLEQRWQASFRRLARGDXDGQLLEAEZLYQHNARHLQKEA...,MSDVTRTLLQRWGASFRRGADFDSWGQLVEAIDEYQILARHLQKEA...,MSTVTRELLQRWGASFRRGADFDSWGQLVEAIDEYQILARHLQKEA...,MSDVTRTLLQRWATSFRRGAEFNTWGQLIEAIDEYQIMARHLQKEA...,MSTVTRELLQRWQLSFRRGARFVXWGQLYEAIDEYQIJARHLQKEA...,MSDVTRTLLQRWATSFRRATEFNTWGQLIEAIDEYQIMARHLQKEA...,MSTVTRELLQRWQLSFRRVXVFPLWGQLEEAIDEYQITARHLQKEA...,MSEITRSLIQRWGASFRRGADFDSWGQLVEAMDEYQILARHLQKEA...,MSETTRSLEQRWGASFRRGADFDSWGQLVEAQDEYQILARHLQKEA...,MSEITRSLIQRWGASFRKGADFDAYGQLIEAMDEYQVIARHLQKEA...,MSETTRSLEQRWGASFRLGADFDGGGQLXEAYDEYQLEARHLQKEA...,MSEITRSLIQRWSASFRKGADFDAYGQLIEAMDRFQVIARHLQKEA...,MSETTRSLEQRWQASFRLGADFDGGGQLXEAYDLEQZJARHLQKEA...
3,MAESNPSIDIQVLHDLRQRFPEIPEGVVSQCMLQNNNNLEACCRAP...,MATSYPKJDIQVLHDLRQRFPEIPEGVVSQCMLQNNNNLEACCRAP...,MAESNPSIDIQILHSIRQRFPQIPDGVVSQCLIQNNNNMEACCRNP...,MATSYPKJDIQGLHGXRQRFPPIPLGVVSQCETQNNNNJEACCRHP...,MAESNPSIDIQILHSIRQRFPQIPDGVVSQCLIENNNNMEACCRNP...,MATSYPKJDIQGLHGXRQRFPPIPLGVVSQCETLNNNNOEACCRWP...,MAESTPQLDIDVLHDLRQRFPEISDGVVSQCMLQNNNNLEACCRAP...,MATSEPQLDIKVLHDLRQRFPEILRGVVSQCMLQNNNNLEACCRAP...,MAESTPQMDIDVLQDMRQRFPEMSDGVVSQCLLENNDDLEACCRAT...,MATSEPQQDILVLVDXRQRFPEXYLGVVSQCELZNNLHLEACCRAN...,MAESTPQMDIDVLQDMRQRFPEMSDGVVADCLLENNDDLEACCRAT...,MATSEPQQDILVLVDXRQRFPEXYLGVVEZCJLHNNEWLEACCRAS...,MAESNPSIDIQVLHDLRQRFPEIPEGVVSQCMLQNNNNLEACCRAP...,MATSYPKJDIQVLHDLRQRFPEIPEGVVSQCMLQNNNNLEACCRAP...,MAESNPSIDIQILHSIRSKFPEIPDGVVSQCLIQNNNNLEACCRNP...,MATSYPKJDIQGLHGXRDLFPEIPYGVVSQCTJQNNNNLEACCRHP...,MAESNPSIDIQILHSIRSKFPEIPDGVVSQCLIQNNNNLEACCRNP...,MATSYPKJDIQGLHGXRDLFPEIPYGVVSQCTJQNNNNLEACCRHP...
4,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQTVLEEINITKPTDMGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQTVLETENITKPTDMGI...,MRFTFPLMAMVLEIALIVLFSIFVEYETDQTILEEINITKPTDIGI...,MRFTFPLMATVLEIAEIVLFQJFVEYETDQTGLEVXNITKPTDYGI...,MRFTYPLMAMVLEIALIVLFSIFVEYETDQTILEEINITKPTDIGI...,MRFTTPLMAEVLEIAQIVLFLGFVEYETDQTXLEVYNITKPTDLGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQSVMEDLNITKPTDMGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQCVEEKLNITKPTDMGI...,MRFTFTLMAMVLDLAMIVLFTLFVEYETDQSVMEDLNITKPTELGI...,MRFTFCLMAEVLQJAMIVLFVLFVEYETDQXVXEPLNITKPTLEGI...,MRFTFTLMAMVLDLAMIVLFTLFVEYETDQSVMEDLNITKPTELGI...,MRFTFCLMAEVLQJAMIVLFVLFVEYETDQXVXEPLNITKPTLEGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQTVLEEINITKPTDMGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQTVLETENITKPTDMGI...,MRFTFPIMAMVLEIALIVLFSIFVEYEADQTVLEEINITKPTDIGI...,MRFTFPCMAEVLEIAQIVLFLGFVEYEXDQTVLEVYNITKPTDLGI...,MRFTFPIMAMVLEIALIVLFSIFVEYEADKTVIEEINITKPTDIGI...,MRFTFPCMAEVLEIAQIVLFLGFVEYEXDVTVYELENITKPTDZGI...


In [25]:
# Put the original sequences next to their mutated/alternative variants (rows align on the reset 0..n-1 index).
df_protein_all = pd.concat([df_protein_main[['uniprot_id', 'sequence']], df_protein_mut_alt_seqs],axis=1)
df_protein_all.head()

Unnamed: 0,uniprot_id,sequence,mutBPE blosum62 0.7 0.05 800 mutated,mutBPE blosum62 0.7 0.05 800 alternative,mutBPE blosum62 0.7 0.05 3200 mutated,mutBPE blosum62 0.7 0.05 3200 alternative,mutBPE blosum62 0.7 0.05 12800 mutated,mutBPE blosum62 0.7 0.05 12800 alternative,mutBPE pam70 0.7 0.05 800 mutated,mutBPE pam70 0.7 0.05 800 alternative,mutBPE pam70 0.7 0.05 3200 mutated,mutBPE pam70 0.7 0.05 3200 alternative,mutBPE pam70 0.7 0.05 12800 mutated,mutBPE pam70 0.7 0.05 12800 alternative,mutBPE pre blosum62 0.7 0.05 800 mutated,mutBPE pre blosum62 0.7 0.05 800 alternative,mutBPE pre blosum62 0.7 0.05 3200 mutated,mutBPE pre blosum62 0.7 0.05 3200 alternative,mutBPE pre blosum62 0.7 0.05 12800 mutated,mutBPE pre blosum62 0.7 0.05 12800 alternative
0,A0T3B4,FPHIGGGLYYGFFLYSKTWNIGIIPPVAKKKTAFMGYAPPGGKKSFWG,FPHIGGGLYYGFFLYSKTWNIGIIPPVAKERTAFMGYAPPGGKKSFWG,FPHIGGGLYYGFFLYSKTWNIGIIPPVAKTYTAFMGYAPPGGKKSFWG,FPHMGGSIYYGFLIYSKTWNIGLIPPVAKERTAFIGYAPPGGKKSFWG,FPHTGGEQYYGFJGYSKTWNIGXIPPVAKVDTAFLGYAPPGGKKSFWG,FPHMGGSIYYGFLIYKRTWNIGLIPPSAKERTAFIGYAPPNGKKSFWG,FPHTGGEQYYGFJGYGGTWNIGYIPPLAKDZTAFLGYAPPHGKKSFWG,FPHIGGGLYYGFFLYSKTWNIGIIPPVAKKKTAFMGYAPPGGKKSFWG,FPHIGGGLYYGFFLYSKTWNIGIIPPVAKKKTAFMGYAPPGGKKSFWG,FPHIGGTLYYGFFMYTKTWNIGIIPPVARKRTAYLGYAPPGGKKSFWG,FPHIGGCLYYGFFEYQKTWNIGIIPPVALKVTAGXGYAPPGGKKSFWG,FPHMGGTLYYGFFMYTKTWNIGIIPPVARKRTAYLGYAPPGGKKSFWG,FPHTGGELYYGFFQYLKTWNIGIIPPVAVKGTAGYGYAPPGGKKSFWG,FPHIGGGLYYGFLIYSKTWNIGIIPPVAAKKTAFMGYAPPGGKKSFWG,FPHIGGGLYYGFCEYSKTWNIGIIPPVALKKTAFMGYAPPGGKKSFWG,FPHMGGGLYYGFLIYAQTWNIGLIPPVAAKKTAFIGYAPPGGKKSFWG,FPHTGGGLYYGFEQYLRTWNIGXIPPVALKKTAFEGYAPPGGKKSFWG,FPHMGGSIYYGFLIYAQTWNIGLIPPSAAKKTAFIGYAPPNGKKSFWG,FPHTGGEQYYGFJGYXVTWNIGYIPPLAYKKTAFZGYAPPLGKKSFWG
1,A4D1M9,MSTGSTHAQTPRLPDAVAPRSGLLQRQKPLRDASRGSRWVEGVKKA...,MSAGSTHAQTPRLPDAVAPRSGLLQRQSPIRDASRGSRWVEGVKKA...,MSCGSTHAQTPRLPDAVAPRSGLLQRQYPQRDASRGSRWVEGVKKA...,MSAGSTHAQTPRLPDAIAPRSGILQRQSPIRQAARGSRWVEGIRRA...,MSCGSTHAQTPRLPDAEAPRSGQLQRQLPGRVAXRGSRWVEGYLYA...,MSAGSTHTKSPRIPDAIAPKAGILQRQSPIRQAARGSRWVEGIRRA...,MSCGSTHENLPRGPDAXAPGYGJLQRQYPTRLAHRGSRWVEGNWIA...,MTSGSTHAQTPRLPDAVAPRSGLLQRQKAMRDASRGSRWVEGVKKA...,MCEGSTHAQTPRLPDAVAPRSGLLQRQKQJRDASRGSRWVEGVKKA...,MTSGSTHTQTPRLPDAIAPRSGMLQRQKAMRDATRGSRWVDAVKKT...,MCEGSTHQQTPRLPDALAPRSGGLQRQKGXRDAYRGSRWVLEVKKZ...,MTSGSTHTQTPRLPDAIAPKPGMLQRQKAMRNATRGSRWVDAVKKT...,MCEGSTHQQTPRLPDALAPVGGXLQRQKYJRYAZRGSRWVLHVKKN...,MSAGSTHAQTPRLPDAVAPRSGLLQRQSPIRDAARGSRWVEGVKKA...,MSCGSTHAQTPRLPDAVAPRSGLLQRQYPQRDALRGSRWVEGVKKA...,MSAGSTHAQTPRLPDAIAPRSSLIQRESPIRDIARGSRWVEGIKKS...,MSCGSTHAQTPRLPDAEAPRSQLJQRRGPXRDYLRGSRWVEGEKKZ...,MSAGSTHAESPRIPDAIAPRSSLIQRESPIRDIARGSRWVEGIKKS...,MSCGSTHALGPRXPDAXAPRSYLJQRDZPJRDHWRGSRWVEGSKKK...
2,B7ZW30,MSEVTRSLLQRWGASFRRGADFDSWGQLVEAIDEYQILARHLQKEA...,MSEITRSLIQRWGASFRRGADFDSWGQLVEAMDEYQILARHLQKEA...,MSETTRSLEQRWGASFRRGADFDSWGQLVEAQDEYQILARHLQKEA...,MSEITRSLIQRWGASFRRGAEYDAYGQLIEAMEDYQVIARHLQKEA...,MSETTRSLEQRWGASFRRGAQLDGGGQLXEAYLYYQZJARHLQKEA...,MSEITRSLIQRWSASFRRSAEYDAYGQLIEAMEDYQVIARHLQKEA...,MSETTRSLEQRWQASFRRLARGDXDGQLLEAEZLYQHNARHLQKEA...,MSDVTRTLLQRWGASFRRGADFDSWGQLVEAIDEYQILARHLQKEA...,MSTVTRELLQRWGASFRRGADFDSWGQLVEAIDEYQILARHLQKEA...,MSDVTRTLLQRWATSFRRGAEFNTWGQLIEAIDEYQIMARHLQKEA...,MSTVTRELLQRWQLSFRRGARFVXWGQLYEAIDEYQIJARHLQKEA...,MSDVTRTLLQRWATSFRRATEFNTWGQLIEAIDEYQIMARHLQKEA...,MSTVTRELLQRWQLSFRRVXVFPLWGQLEEAIDEYQITARHLQKEA...,MSEITRSLIQRWGASFRRGADFDSWGQLVEAMDEYQILARHLQKEA...,MSETTRSLEQRWGASFRRGADFDSWGQLVEAQDEYQILARHLQKEA...,MSEITRSLIQRWGASFRKGADFDAYGQLIEAMDEYQVIARHLQKEA...,MSETTRSLEQRWGASFRLGADFDGGGQLXEAYDEYQLEARHLQKEA...,MSEITRSLIQRWSASFRKGADFDAYGQLIEAMDRFQVIARHLQKEA...,MSETTRSLEQRWQASFRLGADFDGGGQLXEAYDLEQZJARHLQKEA...
3,Q5JPK0,MAQSSPQLDIQVLHDLRQRFPEIPEGVVSQCMLQNNNNLEACCRAP...,MAESNPSIDIQVLHDLRQRFPEIPEGVVSQCMLQNNNNLEACCRAP...,MATSYPKJDIQVLHDLRQRFPEIPEGVVSQCMLQNNNNLEACCRAP...,MAESNPSIDIQILHSIRQRFPQIPDGVVSQCLIQNNNNMEACCRNP...,MATSYPKJDIQGLHGXRQRFPPIPLGVVSQCETQNNNNJEACCRHP...,MAESNPSIDIQILHSIRQRFPQIPDGVVSQCLIENNNNMEACCRNP...,MATSYPKJDIQGLHGXRQRFPPIPLGVVSQCETLNNNNOEACCRWP...,MAESTPQLDIDVLHDLRQRFPEISDGVVSQCMLQNNNNLEACCRAP...,MATSEPQLDIKVLHDLRQRFPEILRGVVSQCMLQNNNNLEACCRAP...,MAESTPQMDIDVLQDMRQRFPEMSDGVVSQCLLENNDDLEACCRAT...,MATSEPQQDILVLVDXRQRFPEXYLGVVSQCELZNNLHLEACCRAN...,MAESTPQMDIDVLQDMRQRFPEMSDGVVADCLLENNDDLEACCRAT...,MATSEPQQDILVLVDXRQRFPEXYLGVVEZCJLHNNEWLEACCRAS...,MAESNPSIDIQVLHDLRQRFPEIPEGVVSQCMLQNNNNLEACCRAP...,MATSYPKJDIQVLHDLRQRFPEIPEGVVSQCMLQNNNNLEACCRAP...,MAESNPSIDIQILHSIRSKFPEIPDGVVSQCLIQNNNNLEACCRNP...,MATSYPKJDIQGLHGXRDLFPEIPYGVVSQCTJQNNNNLEACCRHP...,MAESNPSIDIQILHSIRSKFPEIPDGVVSQCLIQNNNNLEACCRNP...,MATSYPKJDIQGLHGXRDLFPEIPYGVVSQCTJQNNNNLEACCRHP...
4,M1SZX7,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQTVLEQLNITKPTDMGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQTVLEEINITKPTDMGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQTVLETENITKPTDMGI...,MRFTFPLMAMVLEIALIVLFSIFVEYETDQTILEEINITKPTDIGI...,MRFTFPLMATVLEIAEIVLFQJFVEYETDQTGLEVXNITKPTDYGI...,MRFTYPLMAMVLEIALIVLFSIFVEYETDQTILEEINITKPTDIGI...,MRFTTPLMAEVLEIAQIVLFLGFVEYETDQTXLEVYNITKPTDLGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQSVMEDLNITKPTDMGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQCVEEKLNITKPTDMGI...,MRFTFTLMAMVLDLAMIVLFTLFVEYETDQSVMEDLNITKPTELGI...,MRFTFCLMAEVLQJAMIVLFVLFVEYETDQXVXEPLNITKPTLEGI...,MRFTFTLMAMVLDLAMIVLFTLFVEYETDQSVMEDLNITKPTELGI...,MRFTFCLMAEVLQJAMIVLFVLFVEYETDQXVXEPLNITKPTLEGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQTVLEEINITKPTDMGI...,MRFTFPLMAIVLEIAMIVLFGLFVEYETDQTVLETENITKPTDMGI...,MRFTFPIMAMVLEIALIVLFSIFVEYEADQTVLEEINITKPTDIGI...,MRFTFPCMAEVLEIAQIVLFLGFVEYEXDQTVLEVYNITKPTDLGI...,MRFTFPIMAMVLEIALIVLFSIFVEYEADKTVIEEINITKPTDIGI...,MRFTFPCMAEVLEIAQIVLFLGFVEYEXDVTVYELENITKPTDZGI...


In [26]:
# Embed every sequence-valued cell (all columns after the first one) with ESM, then put the
# UniProt ids back in front so each row can still be identified.
# NOTE(review): the ids are taken from df_protein_main while the sequences come from
# df_protein_all; pd.concat(axis=1) aligns on the index, so this presumably relies on both
# frames sharing the same index -- confirm.
df_protein_all_embeddings = pd.concat(
    [
        df_protein_main[['uniprot_id']],
        df_protein_all.iloc[:, 1:].map(
            lambda seq: get_esm_embeddings(seq, esm_tokenizer, esm_model)
        ),
    ],
    axis=1,
)
df_protein_all_embeddings.head()

Unnamed: 0,uniprot_id,sequence,mutBPE blosum62 0.7 0.05 800 mutated,mutBPE blosum62 0.7 0.05 800 alternative,mutBPE blosum62 0.7 0.05 3200 mutated,mutBPE blosum62 0.7 0.05 3200 alternative,mutBPE blosum62 0.7 0.05 12800 mutated,mutBPE blosum62 0.7 0.05 12800 alternative,mutBPE pam70 0.7 0.05 800 mutated,mutBPE pam70 0.7 0.05 800 alternative,mutBPE pam70 0.7 0.05 3200 mutated,mutBPE pam70 0.7 0.05 3200 alternative,mutBPE pam70 0.7 0.05 12800 mutated,mutBPE pam70 0.7 0.05 12800 alternative,mutBPE pre blosum62 0.7 0.05 800 mutated,mutBPE pre blosum62 0.7 0.05 800 alternative,mutBPE pre blosum62 0.7 0.05 3200 mutated,mutBPE pre blosum62 0.7 0.05 3200 alternative,mutBPE pre blosum62 0.7 0.05 12800 mutated,mutBPE pre blosum62 0.7 0.05 12800 alternative
0,A0T3B4,"[[0.03542192, -0.20162101, 0.072748564, -0.390...","[[-0.0070584184, -0.16246861, 0.08035695, -0.3...","[[-0.11043366, -0.18711828, 0.13412131, -0.358...","[[-0.01443464, -0.16016142, 0.084358856, -0.40...","[[-0.02353886, -0.1887129, 0.045299236, -0.123...","[[-0.0063580023, -0.1938833, 0.056476638, -0.3...","[[0.045419626, -0.305488, -0.027887277, -0.186...","[[0.03542192, -0.20162101, 0.072748564, -0.390...","[[0.03542192, -0.20162101, 0.072748564, -0.390...","[[-0.019524762, -0.16899271, 0.06886673, -0.37...","[[-0.17647669, -0.029083593, 0.14986631, -0.22...","[[-0.005237334, -0.18229294, 0.05230611, -0.39...","[[0.017014656, -0.14356081, 0.05232837, -0.204...","[[0.006221322, -0.17739058, 0.08651216, -0.338...","[[0.047338374, -0.20401008, 0.07036892, -0.368...","[[0.039997935, -0.19516833, 0.09038662, -0.326...","[[0.063630246, -0.20090355, -0.0023157636, -0....","[[0.026538497, -0.2104627, 0.07958782, -0.3789...","[[0.0804156, -0.2786069, 0.031141065, -0.29633..."
1,A4D1M9,"[[-0.25708312, -0.05315833, 0.1431992, -0.7421...","[[-0.1885336, -0.04702764, 0.17660803, -0.6594...","[[-0.26630926, -0.07516795, 0.1761231, -0.7585...","[[-0.21502712, -0.13631645, 0.10181732, -0.439...","[[-0.20863134, -0.10367142, 0.2071627, -0.8265...","[[-0.21108943, -0.1136043, 0.07063229, -0.3603...","[[-0.042509854, -0.1180957, 0.26367345, -0.648...","[[-0.23611389, -0.10378755, 0.16048436, -0.563...","[[-0.17453872, -0.09483418, 0.1884631, -0.8016...","[[-0.15448728, -0.25334376, 0.25722936, -0.469...","[[-0.15730017, -0.0926227, 0.15085158, -0.6321...","[[-0.16676725, -0.23614542, 0.26108104, -0.406...","[[-0.15163076, -0.055272095, 0.20472606, -0.70...","[[-0.17984073, -0.056001678, 0.17570725, -0.67...","[[-0.23804379, -0.099479206, 0.2003271, -0.722...","[[-0.17445946, -0.1332686, 0.093484975, -0.444...","[[-0.1892129, -0.10801772, 0.18869942, -0.8334...","[[-0.08553649, -0.049229104, 0.11333832, -0.35...","[[-0.0636452, -0.04165172, 0.22885142, -0.6899..."
2,B7ZW30,"[[0.023537263, -0.1767277, 0.30394703, -0.5420...","[[-0.06831931, -0.227438, 0.22476129, -0.44335...","[[0.03304514, -0.15337117, 0.287734, -0.472741...","[[-0.07547428, -0.25788572, 0.23389618, -0.428...","[[0.06432823, -0.106907725, 0.34357786, -0.502...","[[-0.089166075, -0.24591288, 0.22476068, -0.40...","[[-0.06606978, -0.07202831, 0.28960124, -0.505...","[[0.007908295, -0.3082322, 0.3058616, -0.50636...","[[-0.020942302, -0.21019004, 0.3834232, -0.557...","[[-0.055117775, -0.3454723, 0.26945388, -0.420...","[[-0.015591178, -0.22253285, 0.42014718, -0.48...","[[-0.08414683, -0.37473118, 0.27095625, -0.418...","[[-0.06386681, -0.21567018, 0.33418947, -0.465...","[[-0.057079587, -0.23520704, 0.22801907, -0.44...","[[0.014532921, -0.11079074, 0.3148901, -0.4749...","[[-0.09113388, -0.23973343, 0.21591353, -0.392...","[[0.090177305, -0.049783166, 0.17756128, -0.38...","[[-0.10182371, -0.24535015, 0.16693404, -0.380...","[[-0.021974849, -0.159812, 0.22648951, -0.3901..."
3,Q5JPK0,"[[0.0028362111, -0.16249773, 0.15825017, -0.45...","[[0.021534912, -0.11866711, 0.23939873, -0.381...","[[0.012477595, -0.19464402, 0.26412705, -0.468...","[[-0.009137119, -0.16987179, 0.19275466, -0.31...","[[-0.03022467, -0.24249752, 0.14113125, -0.490...","[[-0.025221875, -0.18953829, 0.18315314, -0.30...","[[-0.023324607, -0.2254312, 0.17993398, -0.454...","[[0.025915094, -0.1519095, 0.2555932, -0.40836...","[[0.0039123427, -0.13047367, 0.3015617, -0.398...","[[0.044730686, -0.20594418, 0.24988335, -0.421...","[[-0.022029152, -0.061087385, 0.2538532, -0.55...","[[0.060305957, -0.24077226, 0.22941457, -0.407...","[[-0.013915579, -0.05498723, 0.25196362, -0.51...","[[0.018624738, -0.12156236, 0.2354317, -0.3773...","[[0.009288897, -0.19090924, 0.28117016, -0.469...","[[-0.005306852, -0.19716202, 0.21854284, -0.29...","[[0.02557912, -0.2024031, 0.14771663, -0.52680...","[[-0.021314206, -0.23254256, 0.15284391, -0.27...","[[-0.029223118, -0.2376772, 0.1621142, -0.4787..."
4,M1SZX7,"[[-0.20352636, -0.04862832, -0.13626692, -0.18...","[[-0.19660197, -0.04240868, -0.1287835, -0.189...","[[-0.20585948, -0.038870748, -0.11654676, -0.1...","[[-0.21291383, -0.015215989, -0.11722192, -0.2...","[[-0.21137887, -0.055664703, -0.029812263, -0....","[[-0.2299361, -0.033101507, -0.102918126, -0.2...","[[-0.18068947, -0.087003164, -0.033162005, -0....","[[-0.17452998, -0.059683707, -0.12703522, -0.2...","[[-0.22699967, -0.051937103, -0.07090848, -0.2...","[[-0.2010739, -0.05953595, -0.102910616, -0.19...","[[-0.30676037, -0.09311287, 0.1889178, -0.4105...","[[-0.2010739, -0.05953595, -0.102910616, -0.19...","[[-0.30676037, -0.09311287, 0.1889178, -0.4105...","[[-0.1996544, -0.038436014, -0.114058815, -0.1...","[[-0.21398461, -0.016283197, -0.12591267, -0.1...","[[-0.2263892, -0.008352012, -0.110154785, -0.1...","[[-0.2084112, -0.024233289, 0.12354627, -0.397...","[[-0.23759748, -0.015418116, -0.08631317, -0.1...","[[-0.15914695, 0.0013143114, 0.16184449, -0.39..."


In [67]:
# Length of each original sequence, keyed by UniProt id. Built once up front instead of
# re-filtering df_protein_all for every (row, column) pair. drop_duplicates keeps the first
# occurrence of an id, which matches the previous `.iloc[0]` lookup.
original_sequence_lengths = (
    df_protein_all
    .drop_duplicates(subset='uniprot_id')
    .set_index('uniprot_id')['sequence']
    .map(len)
)
embedding_columns = df_protein_all_embeddings.columns[1:]

# One dict per protein: column name -> whether the first dimension of the embedding stored in
# that column equals the length of the protein's original sequence.
length_check_records = df_protein_all_embeddings.apply(
    lambda row: {
        col: bool(row[col].shape[0] == original_sequence_lengths.loc[row['uniprot_id']])
        for col in embedding_columns
    },
    axis=1)
embedding_length_checks = pd.DataFrame.from_dict(list(length_check_records))

# Keep only the row positions for which every column passed the length check.
valid_embedding_indices = embedding_length_checks[embedding_length_checks.all(axis=1)].index
valid_embedding_indices

Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,  10,
       ...
       990, 991, 992, 993, 994, 995, 996, 997, 998, 999],
      dtype='int64', length=977)

In [70]:
# Restrict both the embedding table and the sequence table to the rows that passed the
# embedding-length check, renumbering the surviving rows from 0 in each.
df_protein_all_embeddings_2, df_protein_all_2 = (
    frame.loc[valid_embedding_indices].reset_index(drop=True)
    for frame in (df_protein_all_embeddings, df_protein_all)
)

In [71]:
def find_sequence_differences(seq1, seq2):
    """Return the 0-based positions at which two sequences differ.

    The sequences are compared element by element; the comparison stops at the end
    of the shorter one, so any trailing elements of the longer sequence are ignored.

    Args:
        seq1: First sequence (e.g. a string of residues).
        seq2: Second sequence, compared position-wise against ``seq1``.

    Returns:
        A list of ascending integer positions where the two sequences disagree;
        empty when no compared position differs.
    """
    return [
        position
        for position, (left, right) in enumerate(zip(seq1, seq2))
        if left != right
    ]

# Every second column starting at position 2 (the "... mutated" columns in the table above).
mutated_columns = df_protein_all_2.columns[2::2]


def _diff_positions_per_method(row):
    """Map each method name to the positions where its mutated sequence differs from the original.

    The method name is the column name with the trailing " mutated" cut off.
    """
    return {
        col[:col.find('mutated')-1]: find_sequence_differences(row['sequence'], row[col])
        for col in mutated_columns
    }


diff_position_records = df_protein_all_2.apply(_diff_positions_per_method, axis=1)
df_protein_all_diff_inds = pd.concat(
    [
        df_protein_all_2[['uniprot_id']],
        pd.DataFrame.from_dict(list(diff_position_records)),
    ],
    axis=1,
)
df_protein_all_diff_inds.head()

Unnamed: 0,uniprot_id,mutBPE blosum62 0.7 0.05 800,mutBPE blosum62 0.7 0.05 3200,mutBPE blosum62 0.7 0.05 12800,mutBPE pam70 0.7 0.05 800,mutBPE pam70 0.7 0.05 3200,mutBPE pam70 0.7 0.05 12800,mutBPE pre blosum62 0.7 0.05 800,mutBPE pre blosum62 0.7 0.05 3200,mutBPE pre blosum62 0.7 0.05 12800
0,A0T3B4,"[29, 30]","[3, 6, 7, 12, 13, 22, 29, 30, 34]","[3, 6, 7, 12, 13, 15, 16, 22, 26, 29, 30, 34, 40]",[],"[6, 13, 15, 28, 30, 33, 34]","[3, 6, 13, 15, 28, 30, 33, 34]","[12, 13, 28]","[3, 12, 13, 15, 16, 22, 28, 34]","[3, 6, 7, 12, 13, 15, 16, 22, 26, 28, 34, 40]"
1,A4D1M9,"[2, 27, 29, 46, 74, 92, 114, 116, 128, 130, 13...","[2, 16, 22, 27, 29, 31, 33, 42, 43, 44, 46, 58...","[2, 7, 8, 9, 12, 16, 19, 20, 22, 27, 29, 31, 3...","[1, 2, 28, 29, 46, 74, 89, 92, 114, 119, 128, ...","[1, 2, 7, 16, 22, 28, 29, 33, 40, 41, 45, 46, ...","[1, 2, 7, 16, 19, 20, 22, 28, 29, 31, 33, 40, ...","[2, 27, 29, 33, 46, 74, 92, 94, 114, 116, 128,...","[2, 16, 21, 23, 26, 27, 29, 32, 33, 42, 45, 46...","[2, 8, 9, 12, 16, 21, 23, 26, 27, 29, 32, 33, ..."
2,B7ZW30,"[3, 8, 31, 58, 76, 85, 89, 90, 93, 121, 124, 1...","[3, 8, 20, 21, 23, 24, 28, 31, 32, 33, 36, 37,...","[3, 8, 12, 18, 20, 21, 23, 24, 28, 31, 32, 33,...","[2, 6, 76, 88, 89, 93, 121, 126, 136, 154, 156...","[2, 6, 12, 13, 20, 22, 23, 28, 37, 56, 57, 73,...","[2, 6, 12, 13, 18, 19, 20, 22, 23, 28, 37, 46,...","[3, 8, 31, 58, 76, 85, 89, 90, 93, 121, 124, 1...","[3, 8, 17, 23, 24, 28, 31, 36, 37, 58, 62, 73,...","[3, 8, 12, 17, 23, 24, 28, 31, 33, 34, 36, 37,..."
3,Q5JPK0,"[2, 4, 6, 7, 47, 48, 71, 103, 112, 115, 116, 1...","[2, 4, 6, 7, 11, 14, 15, 21, 24, 31, 32, 38, 4...","[2, 4, 6, 7, 11, 14, 15, 21, 24, 31, 32, 33, 3...","[2, 4, 10, 23, 24, 47, 48, 71, 80, 82, 92, 96,...","[2, 4, 7, 10, 13, 15, 22, 23, 24, 31, 33, 36, ...","[2, 4, 7, 10, 13, 15, 22, 23, 24, 28, 29, 31, ...","[2, 4, 6, 7, 47, 48, 71, 92, 103, 112, 115, 11...","[2, 4, 6, 7, 11, 14, 15, 17, 18, 24, 31, 32, 4...","[2, 4, 6, 7, 11, 14, 15, 17, 18, 24, 31, 32, 4..."
4,M1SZX7,"[34, 35]","[9, 15, 20, 21, 31, 34, 35, 43, 48, 49, 62, 78]","[4, 9, 15, 20, 21, 31, 34, 35, 43, 48, 49, 62,...","[30, 32, 34, 67, 68, 69, 71, 78, 79]","[5, 9, 12, 13, 20, 30, 32, 34, 42, 43, 52, 62,...","[5, 9, 12, 13, 20, 30, 32, 34, 42, 43, 52, 62,...","[34, 35, 68, 69, 71]","[6, 9, 15, 20, 21, 27, 34, 35, 43, 50, 52, 62,...","[6, 9, 15, 20, 21, 27, 29, 32, 34, 35, 43, 50,..."


In [72]:
from numpy.linalg import norm
def cosine_similarity(A, B):
    """Row-wise cosine similarity between two equally shaped 2-D arrays.

    Args:
        A: Array whose rows are the first set of vectors.
        B: Array whose rows are the second set of vectors, paired with ``A`` row by row.

    Returns:
        1-D array with one similarity per row: the dot product of the paired rows
        divided by the product of their Euclidean norms.
    """
    row_dot_products = np.sum(A * B, axis=1)
    row_norm_products = norm(A, axis=1) * norm(B, axis=1)
    return row_dot_products / row_norm_products

def cosine_similarity_dataframe(row):
    """Mean cosine similarity at the mutated positions for one protein.

    For every method name in ``tokenizer_list`` and for both the 'mutated' and the
    'alternative' variant, the embedding rows of the original sequence at the
    differing positions are compared with the same rows of the variant embedding,
    and the resulting similarities are averaged.

    Args:
        row: One row of the embeddings table. It is read at 'uniprot_id', at
            'sequence' (embedding of the original sequence) and at
            '<method> mutated' / '<method> alternative' for each method.
            Assumes each embedding is a 2-D array whose first axis is indexed by
            sequence position -- TODO confirm against get_esm_embeddings.

    Returns:
        dict mapping '<method> <postfix>' to the mean cosine similarity over the
        differing positions, or NaN when the method changed no position for this
        protein.

    Raises:
        IndexError: if the protein's 'uniprot_id' is absent from
            ``df_protein_all_diff_inds``.
    """
    # The positions depend only on the protein, so fetch its row once rather than
    # re-filtering the whole table for every method.
    protein_diff_inds = df_protein_all_diff_inds[
        df_protein_all_diff_inds['uniprot_id'] == row['uniprot_id']
    ].iloc[0]
    original_embedding = row['sequence']

    result = {}
    for method_name in tokenizer_list.keys():
        inds = protein_diff_inds[method_name]
        for postfix in ['mutated', 'alternative']:
            column = f"{method_name} {postfix}"
            cos_sim = cosine_similarity(original_embedding[inds], row[column][inds])
            if cos_sim.size:
                result[column] = cos_sim.mean()
            else:
                # No differing positions: emit NaN directly instead of taking the mean of
                # an empty array, which produces the same NaN but raises RuntimeWarnings.
                result[column] = cos_sim.dtype.type(np.nan)

    return result

# One dict of per-column mean similarities for each protein.
similarity_records = list(
    df_protein_all_embeddings_2.apply(cosine_similarity_dataframe, axis=1)
)
# Prefix the ids, then replace missing similarities (NaN) with 1.
df_protein_all_similarity = pd.concat(
    [df_protein_all_2[['uniprot_id']], pd.DataFrame.from_dict(similarity_records)],
    axis=1,
).fillna(1)
df_protein_all_similarity.head()

  result[f"{method_name} {postfix}"] = cos_sim.mean()#, cos_sim.std()
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,uniprot_id,mutBPE blosum62 0.7 0.05 800 mutated,mutBPE blosum62 0.7 0.05 800 alternative,mutBPE blosum62 0.7 0.05 3200 mutated,mutBPE blosum62 0.7 0.05 3200 alternative,mutBPE blosum62 0.7 0.05 12800 mutated,mutBPE blosum62 0.7 0.05 12800 alternative,mutBPE pam70 0.7 0.05 800 mutated,mutBPE pam70 0.7 0.05 800 alternative,mutBPE pam70 0.7 0.05 3200 mutated,mutBPE pam70 0.7 0.05 3200 alternative,mutBPE pam70 0.7 0.05 12800 mutated,mutBPE pam70 0.7 0.05 12800 alternative,mutBPE pre blosum62 0.7 0.05 800 mutated,mutBPE pre blosum62 0.7 0.05 800 alternative,mutBPE pre blosum62 0.7 0.05 3200 mutated,mutBPE pre blosum62 0.7 0.05 3200 alternative,mutBPE pre blosum62 0.7 0.05 12800 mutated,mutBPE pre blosum62 0.7 0.05 12800 alternative
0,A0T3B4,0.87731,0.813062,0.879199,0.81448,0.875316,0.807653,1.0,1.0,0.876648,0.750814,0.881981,0.849261,0.867044,0.848321,0.876411,0.823784,0.872939,0.80161
1,A4D1M9,0.841761,0.843465,0.776838,0.823549,0.755951,0.722471,0.827841,0.848159,0.761106,0.818973,0.759137,0.803216,0.84105,0.844624,0.777058,0.831508,0.739238,0.779013
2,B7ZW30,0.8935,0.873256,0.889513,0.856054,0.882289,0.746935,0.896589,0.869496,0.888839,0.85151,0.88564,0.857801,0.8907,0.86878,0.890065,0.848266,0.877022,0.847638
3,Q5JPK0,0.884072,0.868907,0.875369,0.827675,0.86958,0.807125,0.878836,0.865202,0.865705,0.80653,0.860203,0.810712,0.882872,0.869878,0.869189,0.820184,0.859502,0.808403
4,M1SZX7,0.878072,0.831048,0.889457,0.864495,0.890862,0.865896,0.887736,0.865964,0.888852,0.798428,0.888852,0.798428,0.878512,0.843191,0.890661,0.788413,0.885297,0.818594


In [73]:
# Number of differing positions per protein (rows) and method (columns).
diff_position_counts = df_protein_all_diff_inds.iloc[:, 1:].map(len).to_numpy()
# Repeat every method column twice, side by side, so the array has one column for each
# '<method> mutated' / '<method> alternative' pair of the similarity table.
np_protein_all_diff_inds_len = np.repeat(diff_position_counts, 2, axis=1)
np_protein_all_diff_inds_len.shape

(977, 18)

In [74]:
# Unweighted mean similarity per column: every protein counts equally.
df_protein_all_similarity.iloc[:, 1:].mean(axis=0)

mutBPE blosum62 0.7 0.05 800 mutated              0.882792
mutBPE blosum62 0.7 0.05 800 alternative          0.862655
mutBPE blosum62 0.7 0.05 3200 mutated             0.865186
mutBPE blosum62 0.7 0.05 3200 alternative         0.822683
mutBPE blosum62 0.7 0.05 12800 mutated            0.851335
mutBPE blosum62 0.7 0.05 12800 alternative        0.783308
mutBPE pam70 0.7 0.05 800 mutated                 0.884055
mutBPE pam70 0.7 0.05 800 alternative             0.864682
mutBPE pam70 0.7 0.05 3200 mutated                0.868471
mutBPE pam70 0.7 0.05 3200 alternative            0.833043
mutBPE pam70 0.7 0.05 12800 mutated               0.858090
mutBPE pam70 0.7 0.05 12800 alternative           0.799226
mutBPE pre blosum62 0.7 0.05 800 mutated          0.881700
mutBPE pre blosum62 0.7 0.05 800 alternative      0.862433
mutBPE pre blosum62 0.7 0.05 3200 mutated         0.864750
mutBPE pre blosum62 0.7 0.05 3200 alternative     0.823529
mutBPE pre blosum62 0.7 0.05 12800 mutated        0.8504

In [75]:
# Mean similarity per column, weighting each protein by its number of differing positions.
# The weights are matched to the similarity columns purely by position.
similarity_values = df_protein_all_similarity.iloc[:, 1:]
total_diff_positions = np_protein_all_diff_inds_len.sum(axis=0)
(similarity_values * np_protein_all_diff_inds_len / total_diff_positions).sum(axis=0)

mutBPE blosum62 0.7 0.05 800 mutated              0.879323
mutBPE blosum62 0.7 0.05 800 alternative          0.859830
mutBPE blosum62 0.7 0.05 3200 mutated             0.863411
mutBPE blosum62 0.7 0.05 3200 alternative         0.813127
mutBPE blosum62 0.7 0.05 12800 mutated            0.847658
mutBPE blosum62 0.7 0.05 12800 alternative        0.763587
mutBPE pam70 0.7 0.05 800 mutated                 0.881125
mutBPE pam70 0.7 0.05 800 alternative             0.862639
mutBPE pam70 0.7 0.05 3200 mutated                0.867789
mutBPE pam70 0.7 0.05 3200 alternative            0.828491
mutBPE pam70 0.7 0.05 12800 mutated               0.857357
mutBPE pam70 0.7 0.05 12800 alternative           0.787973
mutBPE pre blosum62 0.7 0.05 800 mutated          0.878182
mutBPE pre blosum62 0.7 0.05 800 alternative      0.859286
mutBPE pre blosum62 0.7 0.05 3200 mutated         0.862642
mutBPE pre blosum62 0.7 0.05 3200 alternative     0.813535
mutBPE pre blosum62 0.7 0.05 12800 mutated        0.8466