# Scoring ProteinGym Data with Prime


In [7]:
from transformers import AutoTokenizer, AutoModel
import torch
import pandas as pd
from Bio import SeqIO
from tqdm.notebook import tqdm
from pathlib import Path
from scipy.stats import spearmanr

def read_seq(seq_file):
    """Return the sequence of the first record in a FASTA file as a string.

    Only the first record is consumed; any further records in the file are
    ignored. If the file yields no records at all, ``None`` is returned.
    """
    first_record = next(iter(SeqIO.parse(seq_file, "fasta")), None)
    if first_record is None:
        return None
    return str(first_record.seq)

In [2]:
# Checkpoint name handed to both `from_pretrained` calls below.
model_path = "AI4Protein/Prime_690M"

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Both loaders are called with trust_remote_code=True.
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True)

# Place the weights on the selected device and switch to evaluation mode.
model = model.to(device)
model.eval()



In [13]:
@torch.no_grad()
def score(fasta, mutant):
    """Score every mutant listed in a CSV against the wild-type sequence.

    The sequence read from ``fasta`` is passed through the model once. Each
    mutant is then scored as the sum, over its substitutions, of
    ``log p(mutant residue) - log p(wild-type residue)`` at the substituted
    position.

    Args:
        fasta: Path to the FASTA file holding the sequence to score against.
        mutant: Path to a CSV file with a ``mutant`` column. Each entry is one
            or more substitutions joined by ``:``, each written as
            ``<wt><1-based position><mt>`` (for example ``A12G:K30R``).

    Returns:
        The DataFrame read from ``mutant`` with an added ``predict_score``
        column holding one score per row.
    """
    df = pd.read_csv(mutant)
    sequence = read_seq(fasta)
    tokenized = tokenizer(sequence, return_tensors="pt")
    input_ids = tokenized.input_ids.to(device)
    attention_mask = tokenized.attention_mask.to(device)

    # Drop the first and last token positions (presumably the special tokens
    # added by the tokenizer -- confirm) so that row i corresponds to the
    # i-th residue. The result is moved to the CPU once so that the many
    # scalar reads below do not each trigger a device-to-host transfer.
    log_probs = (
        model(input_ids, attention_mask=attention_mask)
        .logits[0, 1:-1, :]
        .log_softmax(dim=-1)
        .cpu()
    )

    # The vocabulary does not change between mutants: fetch it once instead
    # of calling get_vocab() twice per substitution.
    vocab = tokenizer.get_vocab()

    scores = []
    for mutant_name in df["mutant"]:
        mutant_score = 0
        for sub_mutant in mutant_name.split(":"):
            # "A12G" -> wild type "A", zero-based index 11, mutant "G".
            wt, idx, mt = sub_mutant[0], int(sub_mutant[1:-1]) - 1, sub_mutant[-1]
            mutant_score += (log_probs[idx, vocab[mt]] - log_probs[idx, vocab[wt]]).item()
        scores.append(mutant_score)
    df["predict_score"] = scores
    return df

In [9]:
# All inputs and outputs live under one dataset directory, given relative to
# the current working directory.
_data_root = Path("../proteingym_v1.0_fasta")
sequence_folder = _data_root / "fasta"    # searched for *.fasta files
mutant_folder = _data_root / "mutant"     # one <stem>.csv is read per FASTA file
output_folder = _data_root / "scores"     # scored <stem>.csv files are written here

In [15]:
# Make sure the destination exists before writing: DataFrame.to_csv does not
# create missing parent directories and would otherwise fail on the first file.
output_folder.mkdir(parents=True, exist_ok=True)

# Score every FASTA file against the mutant CSV that shares its stem, save the
# scored table, and report the Spearman correlation between the "score" and
# "predict_score" columns.
for file in sequence_folder.glob("*.fasta"):
    stem = file.stem
    output_path = output_folder / f"{stem}.csv"
    df = score(file, mutant_folder / f"{stem}.csv")
    df.to_csv(output_path, index=False)
    rs = spearmanr(df["score"], df["predict_score"]).correlation
    print(f"Scoring {stem}, rs = {rs:.4f}, saved to {output_path}")

Scoring RL40A_YEAST_Roscoe_2013, rs = 0.6550, saved to ../proteingym_v1.0_fasta/scores/RL40A_YEAST_Roscoe_2013.csv
Scoring RL40A_YEAST_Mavor_2016, rs = 0.5691, saved to ../proteingym_v1.0_fasta/scores/RL40A_YEAST_Mavor_2016.csv
Scoring RCRO_LAMBD_Tsuboyama_2023_1ORC, rs = 0.6436, saved to ../proteingym_v1.0_fasta/scores/RCRO_LAMBD_Tsuboyama_2023_1ORC.csv
Scoring RD23A_HUMAN_Tsuboyama_2023_1IFY, rs = 0.5803, saved to ../proteingym_v1.0_fasta/scores/RD23A_HUMAN_Tsuboyama_2023_1IFY.csv
Scoring RDRP_I33A0_Li_2023, rs = 0.3237, saved to ../proteingym_v1.0_fasta/scores/RDRP_I33A0_Li_2023.csv
Scoring REV_HV1H2_Fernandes_2016, rs = 0.3174, saved to ../proteingym_v1.0_fasta/scores/REV_HV1H2_Fernandes_2016.csv
Scoring RFAH_ECOLI_Tsuboyama_2023_2LCL, rs = 0.3725, saved to ../proteingym_v1.0_fasta/scores/RFAH_ECOLI_Tsuboyama_2023_2LCL.csv
Scoring RL20_AQUAE_Tsuboyama_2023_1GYZ, rs = 0.7586, saved to ../proteingym_v1.0_fasta/scores/RL20_AQUAE_Tsuboyama_2023_1GYZ.csv
Scoring RASH_HUMAN_Bandaru_2017,