## Evaluation of the fine-tuned ESM-2 model (esm2_t33_650M_UR50D) on the IMMUNECODE dataset

### Load the base pretrained model

In [20]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from peft import *

# Hugging Face checkpoint name of the pretrained ESM-2 model used as the baseline.
model_name = 'facebook/esm2_t33_650M_UR50D'
# eval() switches the module to inference mode (e.g. disables dropout).
base_model = AutoModelForMaskedLM.from_pretrained(model_name, device_map='auto').eval()
# Load the tokenizer from the same checkpoint as the base model. The previous
# argument, `adapter_path`, is not defined at this point in a fresh kernel and
# raised NameError on Restart & Run All.
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Load the fine-tuned LoRA adapter model

In [24]:
def load_peft_model(adapter_path='../output/exp1/mlm_finetune'):
    """Load a fine-tuned PEFT adapter model and put it in evaluation mode.

    Args:
        adapter_path: Directory containing the saved adapter checkpoint.

    Returns:
        The loaded model after ``.eval()`` has been called on it.
    """
    # The adapter is left un-merged; `merge_and_unload()` can be called on the
    # returned model if a merged model is wanted.
    peft_model = AutoPeftModel.from_pretrained(adapter_path, device_map='auto')
    return peft_model.eval()

# Fine-tuned model from experiment 1, compared against the base model below.
model = load_peft_model(adapter_path='../output/exp1/mlm_finetune')

### Load the IMMUNECODE dataset to evaluate the model

In [25]:
from tcredit.common import StrUtils
from tcredit.data import EpitopeTargetDataset, EpitopeTargetMaskedLMCollator, CN
import numpy as np



def eval_model(target_model=None, tokenizer=None, input_seqs=None, max_len=None):
    """Score how well a masked-LM reconstructs unmasked input sequences.

    Each sequence is tokenized without masking and passed through
    ``target_model``. The highest-scoring token at every residue position is
    decoded back to a string and compared with the input via
    ``StrUtils.similarity``. Every input/output pair and its score is printed.

    Args:
        target_model: Model whose forward output exposes ``.logits``.
        tokenizer: Tokenizer used to encode the inputs and decode predictions.
        input_seqs: Input sequence strings to evaluate.
        max_len: Length each tokenized sequence is padded to.

    Returns:
        Mean of the per-sequence similarity scores.
    """
    inputs = tokenizer(input_seqs,
                       padding='max_length',
                       truncation=False,
                       max_length=max_len,
                       return_overflowing_tokens=False,
                       return_special_tokens_mask=True,
                       return_tensors='pt')
    # Positions flagged 0 hold real residues; special tokens and padding are
    # flagged 1. The mask is removed from `inputs` so it is not forwarded to the
    # model, and is used below so that predictions made at non-residue positions
    # cannot leak into the decoded sequence.
    residue_mask = inputs.pop('special_tokens_mask') == 0

    # Put the inputs on the model's device when it reports a concrete one
    # (models are loaded with device_map='auto', so this may be a GPU).
    device = getattr(target_model, 'device', None)
    if device is not None and device.type != 'meta':
        inputs = inputs.to(device)

    # Inference only: skip autograd bookkeeping to save memory.
    with torch.no_grad():
        output = target_model(**inputs)
    token_ids = torch.argmax(output.logits, dim=-1).cpu()

    output_seqs = [
        StrUtils.rm_nonwords(tokenizer.decode(ids[keep], skip_special_tokens=True))
        for ids, keep in zip(token_ids, residue_mask)
    ]

    scores = []
    for input_seq, output_seq in zip(input_seqs, output_seqs):
        score = StrUtils.similarity(input_seq, output_seq)
        scores.append(score)
        print(f'input : {input_seq}\noutput: {output_seq}')
        print('>>> similarity score: ', score)
    return np.mean(scores)
    
# Use the test-data configuration before building the dataset.
EpitopeTargetDataset.FN_DATA_CONFIG = '../config/data-test.json'
ds = EpitopeTargetDataset.from_key('immunecode')

# Evaluate on the first 20 rows only.
eval_slice = slice(0, 20)
epitope_seqs = ds.df[CN.epitope_seq].values[eval_slice]
target_seqs = ds.df[CN.cdr3b_seq].values[eval_slice]

# Each model input is the epitope immediately followed by the CDR3-beta sequence.
input_seqs = [f'{epitope}{cdr3b}' for epitope, cdr3b in zip(epitope_seqs, target_seqs)]

# Longest possible concatenation plus 2 extra positions
# (presumably for the tokenizer's start/end tokens; confirm).
max_len = 2 + ds.max_epitope_len + ds.max_target_len

# Both models are scored on exactly the same inputs and padding length.
eval_kwargs = dict(tokenizer=tokenizer, input_seqs=input_seqs, max_len=max_len)

score = eval_model(target_model=base_model, **eval_kwargs)
print(f'>>> Mean similarity score of base model: {score}')

score = eval_model(target_model=model, **eval_kwargs)
print(f'>>> Mean similarity score of fine-tuned model: {score}')


input : AYKTFPPTEPKCASSYSARSYNEQFF
output: MYKTFPPTEPKCASSYSARSYNEQFF
>>> similarity score:  0.9615384615384616
input : KTFPPTEPKCASSYSARSYNEQFF
output: MTFPPTEPKCASSYSARSYNEQFF
>>> similarity score:  0.9583333333333334
input : AFLLFLVLICASSSLADYRYEQYF
output: MFLLFLVLICASSSLADYRYEQYF
>>> similarity score:  0.9583333333333334
input : FLAFLLFLVCASSSLADYRYEQYF
output: MLAFLLFLVCASSSLADYRYEQYF
>>> similarity score:  0.9583333333333334
input : FYLCFLAFLCASSSLADYRYEQYF
output: MYLCFLAFLCASSSLADYRYEQYF
>>> similarity score:  0.9583333333333334
input : FYLCFLAFLLCASSSLADYRYEQYF
output: MYLCFLAFLLCASSSLADYRYEQYF
>>> similarity score:  0.96
input : IDFYLCFLAFCASSSLADYRYEQYF
output: MDFYLCFLAFCASSSLADYRYEQYF
>>> similarity score:  0.96
input : IELSLIDFYLCASSSLADYRYEQYF
output: MELSLIDFYLCASSSLADYRYEQYF
>>> similarity score:  0.96
input : LIDFYLCFLCASSSLADYRYEQYF
output: MIDFYLCFLCASSSLADYRYEQYF
>>> similarity score:  0.9583333333333334
input : LLFLVLIMLCASSSLADYRYEQYF
output: MLFLVLIMLCASSSLADYR

- The mean similarity scores of the base model and the fine-tuned model are 0.96 and 0.61, respectively.
- The fine-tuned model is not as good as the base model. This is probably because the mutation ratio of the TCR CDR3$\beta$ is too high (0.4) and no mutation is applied to the epitope sequence.

## Experiment 2 with different mutation properties
- Epitope: mut_ratio=0.15, mut_probs=[0.7, 0.3]
- TCR CDR3$\beta$: mut_ratio=0.2, mut_probs=[0.8, 0.2]

In [26]:
# Replace the experiment 1 model with the adapter fine-tuned in experiment 2.
model = load_peft_model(adapter_path='../output/exp2/mlm_finetune')

In [28]:
# Re-run the same comparison; `model` now refers to the experiment 2 adapter.
eval_kwargs = dict(tokenizer=tokenizer, input_seqs=input_seqs, max_len=max_len)

score = eval_model(target_model=base_model, **eval_kwargs)
print(f'>>> Mean similarity score of base model: {score}')

score = eval_model(target_model=model, **eval_kwargs)
print(f'>>> Mean similarity score of fine-tuned model: {score}')

input : AYKTFPPTEPKCASSYSARSYNEQFF
output: MYKTFPPTEPKCASSYSARSYNEQFF
>>> similarity score:  0.9615384615384616
input : KTFPPTEPKCASSYSARSYNEQFF
output: MTFPPTEPKCASSYSARSYNEQFF
>>> similarity score:  0.9583333333333334
input : AFLLFLVLICASSSLADYRYEQYF
output: MFLLFLVLICASSSLADYRYEQYF
>>> similarity score:  0.9583333333333334
input : FLAFLLFLVCASSSLADYRYEQYF
output: MLAFLLFLVCASSSLADYRYEQYF
>>> similarity score:  0.9583333333333334
input : FYLCFLAFLCASSSLADYRYEQYF
output: MYLCFLAFLCASSSLADYRYEQYF
>>> similarity score:  0.9583333333333334
input : FYLCFLAFLLCASSSLADYRYEQYF
output: MYLCFLAFLLCASSSLADYRYEQYF
>>> similarity score:  0.96
input : IDFYLCFLAFCASSSLADYRYEQYF
output: MDFYLCFLAFCASSSLADYRYEQYF
>>> similarity score:  0.96
input : IELSLIDFYLCASSSLADYRYEQYF
output: MELSLIDFYLCASSSLADYRYEQYF
>>> similarity score:  0.96
input : LIDFYLCFLCASSSLADYRYEQYF
output: MIDFYLCFLCASSSLADYRYEQYF
>>> similarity score:  0.9583333333333334
input : LLFLVLIMLCASSSLADYRYEQYF
output: MLFLVLIMLCASSSLADYR

- The mean similarity scores of the base model and the fine-tuned model are 0.96 and 0.94, respectively.
- Still, the fine-tuned model score is slightly lower than the base model score.
- Many mismatched AAs were located in the TCR CDR3$\beta$ sequence. In particular, R->S and S->G mutations were frequently observed.
- We will make the mutation properties of the TCR CDR3$\beta$ equal to those of the epitope sequence.