<a href="https://colab.research.google.com/github/angelnavfer/Prediccion-de-mutantes-por-ESM-2/blob/main/Prediccion_de_mutantes_por_ESM_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fair-esm

import torch
import esm
import pandas as pd
from tqdm import tqdm

# Cargar el modelo ESM-2
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Determinar la sencuencia de LuxR
WT_SEQ = (
    "MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAIIYPHSMVKSDISILDNYPKKWRQYYDDANLIKYDPIVDYSNSNHSPINWNIFENNAVNKKSPNVIKEAKTSGLITGFSFPIHTANNGFGMLSFAHSEKDNYIDSLFLHACMNIPLIVPSLVDNYRKINIANNKSNNDLTKREKECLAWACEGKSSWDISKILGCSERTVTFHLTNAQMKLNTTNRCQSISKAILTGAIDCPYFKN"
)
mut_range_start, mut_range_end = 175, 240  # 0-based
AMINO_ACIDS = list("ACDEFGHIKLMNPQRSTVWY")

# Generar todos los mutantes simples
mutant_data = []
for pos in range(mut_range_start, mut_range_end + 1):
    wt_aa = WT_SEQ[pos]
    for aa in AMINO_ACIDS:
        label = f"{wt_aa}{pos+1}{aa}"
        seq = list(WT_SEQ)
        seq[pos] = aa
        mutant_data.append((label, "".join(seq)))

print(f"✅ Generated {len(mutant_data)} single mutants (with WT included).")

# Establecer la función de puntuaje
def score_sequence(seq):
    data = [("mutant", seq)]
    batch_labels, batch_strs, batch_tokens = batch_converter(data)
    batch_tokens = batch_tokens.to(device)
    with torch.no_grad():
        logits = model(batch_tokens)["logits"]
    log_probs = torch.log_softmax(logits, dim=-1)
    true_tokens = batch_tokens[:, 1:]
    log_probs_for_true = log_probs[:, :-1, :].gather(2, true_tokens.unsqueeze(-1)).squeeze(-1)
    return log_probs_for_true.sum(dim=1).item()

# Puntuar todos los mutantes
scored_mutants = []
print("🧠 Scoring all single mutants...")
for label, seq in tqdm(mutant_data, desc="Scoring"):
    score = score_sequence(seq)
    scored_mutants.append((label, seq, score))

# Ordenar y guardar los resultados
df = pd.DataFrame(scored_mutants, columns=["Mutant", "Sequence", "Score"])
df.sort_values(by="Score", ascending=False, inplace=True)
df.to_csv("single_mutants_scored.csv", index=False)

print("\n📁 Saved: 'single_mutants_scored.csv'")
print(df.head(10))


✅ Generated 1320 single mutants (with WT included).
🧠 Scoring all single mutants...


Scoring: 100%|██████████| 1320/1320 [2:38:41<00:00,  7.21s/it]


📁 Saved: 'single_mutants_scored.csv'
     Mutant                                           Sequence        Score
723   R212E  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1485.180054
449   K198L  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1485.360840
1149  S233L  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1486.327393
459   K198Y  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1486.977051
444   K198F  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1487.055054
267   E189I  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1487.228271
814   F216R  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1487.335205
816   F216T  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1487.664429
283   C190E  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1487.914062
287   C190I  MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAII... -1488.728516





In [None]:
# Cargar el archivo .CSV
df = pd.read_csv("single_mutants_scored.csv")

# Guardar el archivo FASTA
with open("single_mutants_scored.fasta", "w") as f:
    for _, row in df.iterrows():
        header = f">{row['Mutant']}|Score:{row['Score']:.2f}"
        sequence = row['Sequence']
        f.write(f"{header}\n{sequence}\n")

print("FASTA saved as 'single_mutants_scored.fasta'")

FASTA saved as 'single_mutants_scored.fasta'


In [None]:
import torch
import esm
import pandas as pd

# Volver a cargar el modelo ESM-2
model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Volver a determinar la secuencia de LuxR
WT_SEQ = (
    "MKNINADDTYRIINKIKACRSNNDINQCLSDMTKMVHCEYYLLAIIYPHSMVKSDISILDNYPKKWRQYYDDANLIKYDPIVDYSNSNHSPINWNIFENNAVNKKSPNVIKEAKTSGLITGFSFPIHTANNGFGMLSFAHSEKDNYIDSLFLHACMNIPLIVPSLVDNYRKINIANNKSNNDLTKREKECLAWACEGKSSWDISKILGCSERTVTFHLTNAQMKLNTTNRCQSISKAILTGAIDCPYFKN"
)

# Definir la función de puntuaje
def score_sequence(seq):
    data = [("sequence", seq)]
    _, _, batch_tokens = batch_converter(data)
    batch_tokens = batch_tokens.to(device)
    with torch.no_grad():
        logits = model(batch_tokens)["logits"]
    log_probs = torch.log_softmax(logits, dim=-1)
    true_tokens = batch_tokens[:, 1:]
    log_probs_for_true = log_probs[:, :-1, :].gather(2, true_tokens.unsqueeze(-1)).squeeze(-1)
    return log_probs_for_true.sum(dim=1).item()

# Puntuar a la variante WT
wt_score = score_sequence(WT_SEQ)
print(f"✅ WT Score: {wt_score:.2f}")

# Cargar las puntuaciones asociadas a los mutantes previamente guardadas en formato .CSV
df = pd.read_csv("single_mutants_scored.csv")

# Calcular las diferencias de puntuación entre los mutantes y la variante silvestre
df["DeltaScore"] = df["Score"] - wt_score

# 7. Ordenar y guardar un nuevo .CSV
df.sort_values(by="DeltaScore", ascending=False, inplace=True)
df.to_csv("single_mutants_scored_delta.csv", index=False)
print("📁 Updated CSV saved as 'single_mutants_scored_delta.csv'")

# 8. Guardar los resultados en formato .FASTA
with open("single_mutants_scored_delta.fasta", "w") as f:
    for _, row in df.iterrows():
        header = f">{row['Mutant']}|ΔScore:{row['DeltaScore']:.2f}"
        f.write(f"{header}\n{row['Sequence']}\n")
print("📄 FASTA file with delta scores saved as 'single_mutants_scored_delta.fasta'")

✅ WT Score: -1507.66
📁 Updated CSV saved as 'single_mutants_scored_delta.csv'
📄 FASTA file with delta scores saved as 'single_mutants_scored_delta.fasta'
