## Evaluation of fine-tuned model of ESM-2(esm2-t33-650M-UR50D) on the IMMUNECODE dataset

### Load the fine-tuned PEFT model

In [9]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from peft import *

model_name = 'facebook/esm2_t33_650M_UR50D'
adapter_path = '../output/exp1/mlm_finetune'
base_model = AutoModelForMaskedLM.from_pretrained(model_name, device_map='auto')
# model = PeftModel.from_pretrained(model, adapter_path, device_map='auto')
model = AutoPeftModel.from_pretrained(adapter_path, device_map='auto')
# model = model.merge_and_unload()
tokenizer = AutoTokenizer.from_pretrained(adapter_path)

### Load the IMMUNECODE dataset to evaluate the model

In [12]:
from tcredit.common import StrUtils
from tcredit.data import EpitopeTargetDataset, EpitopeTargetMaskedLMCollator, CN
import numpy as np



def eval_model(target_model=None, tokenizer=None, input_seqs=None, max_len=None):    
    inputs = tokenizer(input_seqs, 
                       padding='max_length', 
                       truncation=False, 
                       max_length=max_len,
                       return_overflowing_tokens=False,
                       return_tensors='pt')
    output = target_model(**inputs)
    token_ids = torch.argmax(output.logits, dim=-1)
    output_seqs = tokenizer.batch_decode(token_ids, skip_special_tokens=True)
    output_seqs = list(map(lambda seq: StrUtils.rm_nonwords(seq), output_seqs))
    
    scores = []
    for input_seq, output_seq in zip(input_seqs, output_seqs):
        score = StrUtils.similarity(input_seq, output_seq)
        scores.append(score)
        # print(f'input : {input_seq}\noutput: {output_seq}')
        # print('>>> similarity score: ', score)
    return np.mean(scores)
    
EpitopeTargetDataset.FN_DATA_CONFIG = '../config/data-test.json'
ds = EpitopeTargetDataset.from_key('immunecode')

epitope_seqs = ds.df[CN.epitope_seq].values[0:20]
target_seqs = ds.df[CN.cdr3b_seq].values[0:20]
input_seqs = [f'{e_seq}{t_seq}' for e_seq, t_seq in zip(epitope_seqs, target_seqs)]
max_len = ds.max_epitope_len + ds.max_target_len + 2
score = eval_model(target_model=base_model, tokenizer=tokenizer, input_seqs=input_seqs, max_len=max_len)
print(f'>>> Mean similarity score of base model: {score}')
score = eval_model(target_model=model, tokenizer=tokenizer, input_seqs=input_seqs, max_len=max_len)
print(f'>>> Mean similarity score of fine-tuned model: {score}')


>>> Mean similarity score of base model: 0.9613205128205129
>>> Mean similarity score of fine-tuned model: 0.6087185631227172


### Results and Discussion

- The mean similarity scores of the base model and the fine-tuned model is 0.96 and 0.61.
- The fine-tuned model is not as good as the base model. This is probably because the mutaion ratio of the TCR CDR3$\\beta$ is too high as 0.4 and no mutation is applied to the epitope sequence.