# Zero-shot mutant prediction for T7

In [6]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import pandas as pd
from Bio import SeqIO
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings("ignore")
import os
# Send HTTP and HTTPS traffic from this process through the same proxy.
# NOTE(review): "l0:7890" looks machine-specific — adjust or remove when
# running this notebook on another host.
os.environ.update(
    {
        "http_proxy": "http://l0:7890",
        "https_proxy": "http://l0:7890",
    }
)

def read_seq(seq_file):
    """Return the first sequence of a FASTA file as a plain string.

    Args:
        seq_file: Path or handle passed straight to ``SeqIO.parse``.

    Returns:
        The sequence of the first record, converted with ``str``. Any further
        records in the file are ignored.

    Raises:
        ValueError: If the file yields no FASTA records. Previously the
            function silently returned ``None`` in that case, which only
            failed later with an unrelated error.
    """
    first_record = next(iter(SeqIO.parse(seq_file, "fasta")), None)
    if first_record is None:
        raise ValueError(f"No FASTA records found in {seq_file!r}")
    return str(first_record.seq)

# Wild-type sequence: the first record of the FASTA file.
wild_type = "../example_data/expr/t7.fasta"
sequence = read_seq(wild_type)

# Enumerate every single-site substitution, written as
# "<wild-type residue><1-based position><mutant residue>". Substituting a
# residue with itself is skipped.
all_mutants = [
    f"{wt_residue}{position}{mt_residue}"
    for position, wt_residue in enumerate(sequence, start=1)
    for mt_residue in "ACDEFGHIKLMNPQRSTVWY"
    if mt_residue != wt_residue
]
df = pd.DataFrame({"mutant": all_mutants})

In [9]:
base_model_path = "AI4Protein/T7_"

# The device, tokenizer, vocabulary and tokenized wild-type sequence are the
# same for all five checkpoints, so they are prepared once instead of once per
# model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE(review): trust_remote_code=True runs Python code shipped with the Hub
# repository — only use it with repositories you trust.
tokenizer = AutoTokenizer.from_pretrained("AI4Protein/Prime_690M", trust_remote_code=True)
# The vocabulary is looked up twice per substitution below; fetch it once
# rather than calling get_vocab() inside the inner loop.
vocab = tokenizer.get_vocab()

tokenized = tokenizer(sequence, return_tensors="pt").to(device)
input_ids = tokenized["input_ids"]
attention_mask = tokenized["attention_mask"]

for i in range(0, 5):
    model_path = f"{base_model_path}{i}"
    model = AutoModelForMaskedLM.from_pretrained(model_path, trust_remote_code=True)
    model.eval()
    model = model.to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
        # Drop the first and last token positions so that row k corresponds to
        # residue k of `sequence`, then convert to log-probabilities.
        # NOTE(review): assumes the tokenizer adds exactly one special token at
        # each end of the sequence — confirm.
        logits = output.logits[0, 1:-1, :].log_softmax(dim=-1)
    # Index a CPU copy so the per-mutant loop does not read single elements
    # back from the accelerator one at a time.
    site_log_probs = logits.cpu()

    scores = []
    for mutant in tqdm(df["mutant"]):
        score = 0
        # A mutant may contain several substitutions separated by ":"; the
        # log-ratio of each substitution is summed.
        for sub_mutant in mutant.split(":"):
            wt, idx, mt = sub_mutant[0], int(sub_mutant[1:-1]) - 1, sub_mutant[-1]
            # log p(mutant residue) - log p(wild-type residue) at this site.
            score += (site_log_probs[idx, vocab[mt]] - site_log_probs[idx, vocab[wt]]).item()
        scores.append(score)
    df[f"model_{i}"] = scores
    print(f"Model {i} done")

config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

  0%|          | 0/16777 [00:00<?, ?it/s]

Model 0 done


config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

  0%|          | 0/16777 [00:00<?, ?it/s]

Model 1 done


config.json:   0%|          | 0.00/775 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

  0%|          | 0/16777 [00:00<?, ?it/s]

Model 2 done


config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

  0%|          | 0/16777 [00:00<?, ?it/s]

Model 3 done


config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.61G [00:00<?, ?B/s]

  0%|          | 0/16777 [00:00<?, ?it/s]

Model 4 done


In [10]:
# Min-max rescale each model's score column, then sum the rescaled columns
# into a single ensemble score and rank mutants by it (best first).
model_columns = [f"model_{model_idx}" for model_idx in range(0, 5)]
aggregate_score = 0
for column in model_columns:
    lowest = df[column].min()
    highest = df[column].max()
    # min-max
    df[column] = (df[column] - lowest) / (highest - lowest)
    aggregate_score += df[column]
df["aggregate"] = aggregate_score
df = df.sort_values("aggregate", ascending=False)

In [11]:
# Display the first 60 rows of df.
df.head(60)

Unnamed: 0,mutant,model_0,model_1,model_2,model_3,model_4,aggregate
14924,Q786L,1.0,1.0,1.0,1.0,1.0,5.0
8459,L446F,0.973196,0.973194,0.973197,0.973096,0.973101,4.865785
8163,S430P,0.962073,0.962007,0.962069,0.962048,0.96205,4.810248
9777,C515P,0.931719,0.931652,0.931712,0.931691,0.931691,4.658465
16425,P865L,0.914613,0.914611,0.914614,0.914612,0.914621,4.573071
15133,W797L,0.879612,0.879631,0.879609,0.879624,0.879632,4.398109
15039,R792M,0.874053,0.874075,0.874051,0.874008,0.874035,4.370222
7114,T375K,0.871116,0.871045,0.871116,0.871149,0.871143,4.355569
15932,C839N,0.869109,0.8691,0.869104,0.86903,0.869036,4.345379
2356,C125A,0.865731,0.865737,0.865731,0.865805,0.865818,4.328822
