In [1]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    auc, roc_curve,
    matthews_corrcoef
)
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import numpy as np

def get_metrics(y_test, y_test_predictions, y_probs):
    accuracy = accuracy_score(y_test, y_test_predictions)
    precision = precision_score(y_test, y_test_predictions)
    recall = recall_score(y_test, y_test_predictions)
    f1score = f1_score(y_test, y_test_predictions)
    #auc = roc_auc_score(y_test, y_test_predictions)
    mcc = matthews_corrcoef(y_test, y_test_predictions)
    conf_matrix = confusion_matrix(y_test, y_test_predictions)
    #auc_val = roc_auc_score(y_test, y_probs)
    fpr, tpr, thresholds = roc_curve(y_test, y_probs, pos_label = 1)
    auc_val = auc(fpr, tpr)  

    return {"accuracy":accuracy, "precision":precision, "recall":recall, "f1score":f1score, "auc":auc_val, "mcc":mcc}

# 1 Predict with LoRA

This way of prediction using LoRA return always 1 or 0; however, classic ESM2 (BertRnn) model works well with this prediction methodology, returning the same resutls than the trainer (section 3 and 4). Some reasons can be:
* We don't have a config.json, but we included the config produce after training in models folder; however, it didn't works as well.
* As LoRA update only A and B matrices, Maybe we need to include and additional step before doing inferences.

In [17]:
from model_utils_bert import BertRnn, BertRnnDist
from transformers import Trainer, TrainingArguments, BertConfig
from transformers import  AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForCausalLM
from dataloader_bert import DataSetLoaderBERT
from utils import compute_metrics
import json
from transformers import TextClassificationPipeline, pipeline, BartForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import set_seed
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

set_seed(10)

Load model and tokenizer

In [12]:
model_name = "/M2/ArgosMHC_models/checkpoints/lora_t33_c3_2/checkpoint-150000/"  # mejor heckpoiunt
name_results = "predictions_esm2_lora_t33_c3" # 
pre_trained = "/M2/ArgosMHC_models/pre_trained_models/esm2_t33_650M_UR50D/"
dataset = "/M2/ArgosMHC_models/dataset/hlab/hlab_test_micro.csv"

model = BertRnn.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(pre_trained)

#In case you have added tokens, it’s recommended to use the PeftModel class rather than AutoModelForCausalLM. The former takes into account resizing the embedding matrix.

Some weights of the model checkpoint at /M2/ArgosMHC_models/checkpoints/lora_t33_c3_2/checkpoint-150000/ were not used when initializing BertRnn: ['base_model.model.bert.encoder.layer.7.intermediate.dense.bias', 'base_model.model.bert.encoder.layer.32.attention.self.query.base_layer.bias', 'base_model.model.rnn.bias_hh_l0', 'base_model.model.bert.encoder.layer.0.attention.self.value.base_layer.weight', 'base_model.model.bert.encoder.layer.7.attention.output.LayerNorm.bias', 'base_model.model.bert.encoder.layer.2.output.dense.bias', 'base_model.model.bert.encoder.layer.3.output.dense.weight', 'base_model.model.bert.encoder.layer.11.attention.self.key.lora_A.default.weight', 'base_model.model.bert.encoder.layer.9.attention.self.value.base_layer.bias', 'base_model.model.bert.encoder.layer.12.attention.self.query.base_layer.weight', 'base_model.model.bert.encoder.layer.20.attention.self.key.base_layer.bias', 'base_model.model.bert.encoder.layer.27.output.dense.weight', 'base_model.model.be

Load dataset

In [13]:

seq_length = 50 # for MHC-I
test_dataset = DataSetLoaderBERT(dataset, tokenizer_name=pre_trained, max_length=seq_length)
data_iter = DataLoader(test_dataset, batch_size=16, shuffle=False)

print( tokenizer("YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGYLFGRDL", padding='max_length', max_length=seq_length) )
print(test_dataset[0])
print(type(test_dataset[0]))

{'input_ids': [0, 19, 18, 5, 20, 19, 16, 9, 17, 20, 5, 21, 11, 13, 5, 17, 11, 4, 19, 12, 12, 19, 10, 13, 19, 11, 22, 7, 5, 10, 7, 19, 10, 6, 19, 4, 18, 6, 10, 13, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]}
{'input_ids': tensor([ 0, 19, 18,  5, 20, 19, 16,  9, 17, 20,  5, 21, 11, 13,  5, 17, 11,  4,
        19, 12, 12, 19, 10, 13, 19, 11, 22,  7,  5, 10,  7, 19, 10,  6, 19,  4,
        18,  6, 10, 13,  4,  2,  1,  1,  1,  1,  1,  1,  1,  1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0]), 'labels': tensor(1)}
<class 'dict'>


Prediction, it always returns zero

In [18]:
# data_iter, es un dataLoader, de la base de datos de test, y tiene un batch_size de 16
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

model.eval() # is a kind of switch for some specific layers/parts of the model that behave differently during training and inference (evaluating) time. For example, Dropouts Layers, BatchNorm Layers etc. You need to turn them off during model evaluation,

predictions = []
labels = []

with torch.no_grad(): # turn off gradients computation
    for i, batch in enumerate(data_iter): # por cada batch        
        labels.extend(batch['labels'].numpy())
        output = model(batch['input_ids'], batch['attention_mask']) # inference
        for row_sample in output.logits: # por cada muestra del batch
            logits = row_sample.numpy()
            probs = softmax(logits)
            predictions.append( [logits[0], logits[1], probs[0], probs[1]] )

#print(predictions)
df = pd.DataFrame(predictions, columns=["logit_class_0", "logit_class_1", "prob_class_0", "prob_class_1"])
df['prediction'] = df.apply(lambda row: ( 0 if row[0] > row[1] else 1 ), axis=1)
df['label'] = labels
print(df)

print(get_metrics(df['label'], df['prediction'], df['prob_class_1']))

    logit_class_0  logit_class_1  prob_class_0  prob_class_1  prediction  \
0        0.047956      -0.010227      0.514542      0.485458           0   
1        0.040884       0.008847      0.508009      0.491991           0   
2        0.037705       0.000298      0.509351      0.490649           0   
3        0.038920      -0.005073      0.510996      0.489004           0   
4        0.043267      -0.009668      0.513231      0.486769           0   
..            ...            ...           ...           ...         ...   
71       0.030311      -0.001709      0.508004      0.491996           0   
72       0.048248       0.005792      0.510612      0.489388           0   
73       0.039447       0.020678      0.504692      0.495308           0   
74       0.045509      -0.007176      0.513168      0.486832           0   
75       0.043361       0.008816      0.508635      0.491365           0   

    label  
0       1  
1       1  
2       1  
3       1  
4       1  
..    ...  
71 

  _warn_prf(average, modifier, msg_start, len(result))


# 2 Predict LoRA using Trainer

Lets evaluate if Trainer give the same results than prediction (section 1). We will evaluate that, because during training, the model got good results on evaluation dataset. 

After evaluation, the model performed well; thusm we are missing a step in section 1 (predict with LoRa)

In [1]:
from transformers import Trainer, TrainingArguments, BertConfig, AdamW
from model_utils_bert import BertLinear, BertRnn, BertRnnAtt, BertRnnSigmoid
from model_utils_tape import TapeLinear, TapeRnn, TapeRnnAtt, TapeRnnDist
from utils import compute_metrics
from transformers import EarlyStoppingCallback, IntervalStrategy

from tape import ProteinBertConfig
from torch.utils.data import DataLoader
from transformers import get_scheduler, TrainerCallback

from matplotlib import pyplot as plt
from matplotlib.lines import Line2D
import numpy as np
import os
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType


# data loaders
from dataloader_bert import DataSetLoaderBERT, DataSetLoaderBERT_old
from dataloader_tape import DataSetLoaderTAPE

import wandb
from transformers import set_seed
set_seed(10)
#set_seed(1)

path_checkpoints    = "/M2/ArgosMHC_models/checkpoints/lora_t33_c3_2/"  # path to store checkpoints
path_model          = "/M2/ArgosMHC_models/models/lora_t33_c3/"       # path to save the best model
model_name          = "/M2/ArgosMHC_models/pre_trained_models/esm2_t33_650M_UR50D/"   # path of the pre-trained model, for esm2 and protbert

path_train_csv = "/M2/ArgosMHC_models/dataset/hlab/hlab_train.csv"
path_val_csv = "/M2/ArgosMHC_models/dataset/hlab/hlab_val.csv"

max_length = 50 # for hlab dataset

trainset = DataSetLoaderBERT(path=path_train_csv, tokenizer_name=model_name, max_length=max_length)
valset = DataSetLoaderBERT(path=path_val_csv, tokenizer_name=model_name, max_length=max_length)    
config = BertConfig.from_pretrained(model_name, num_labels=2)

config.rnn = "lstm"
config.num_rnn_layer = 2
config.rnn_dropout = 0.1
config.rnn_hidden = 768
config.length = max_length
config.cnn_filters = 512
config.cnn_dropout = 0.1
                     
model_ = BertRnn.from_pretrained(model_name, config=config)

############ hyperparameters #################################################### Configuration 3
num_samples = len(trainset)
num_epochs = 6
batch_size = 16  
weight_decay = 0.01
lr =2e-5
betas = ((0.9, 0.98)) 
num_training_steps = int((num_epochs * num_samples)/batch_size) 
warmup_steps = int(num_training_steps*0.1)

# LoRA config ####################################################################
configLora = { "lora_alpha": 1, "lora_dropout": 0.4, "r": 1 }
peft_config = LoraConfig(
    task_type=TaskType.TOKEN_CLS, 
    inference_mode=False, 
    r=configLora["r"], 
    lora_alpha=configLora["lora_alpha"], 
    target_modules=["query", "key", "value"], # also maybe "dense_h_to_4h" and "dense_4h_to_h"
    lora_dropout=configLora["lora_dropout"], 
    bias="none" # or "all" or "lora_only" 
)

model_ = get_peft_model(model_, peft_config)
#model_ = accelerator.prepare(model_)
model_.print_trainable_parameters()

training_args = TrainingArguments(
        output_dir                  = path_checkpoints, 
        num_train_epochs            = num_epochs,   
        per_device_train_batch_size = batch_size,   
        per_device_eval_batch_size  = batch_size * 8,         
        logging_dir                 = path_checkpoints,        
        logging_strategy            = "steps", #epoch or steps
        eval_steps                  = 3000, # el primer experimento fue con 1000 steps
        save_steps                  = 3000,
        metric_for_best_model       = 'f1',
        load_best_model_at_end      = True,        
        evaluation_strategy         = "steps", #epoch or steps
        save_strategy               = "steps", #epoch or ste  
        logging_steps=3000  # how often to log to W&B
)

optimizer = AdamW(model_.parameters(), lr=lr, betas=betas, weight_decay=weight_decay, correct_bias=True)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_training_steps)

trainer = Trainer(        
        args            = training_args,   
        model           = model_, 
        train_dataset   = trainset,  
        eval_dataset    = valset, 
        compute_metrics = compute_metrics,  
        optimizers      = (optimizer, lr_scheduler),      
        callbacks       = [EarlyStoppingCallback(early_stopping_patience=5)] 
    )

trainer.train(resume_from_checkpoint = True)
print("finish training")
trainer.save_model(path_model)
trainer.model.config.save_pretrained(path_model)

  from .autonotebook import tqdm as notebook_tqdm
You are using a model of type esm to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of the model checkpoint at /M2/ArgosMHC_models/pre_trained_models/esm2_t33_650M_UR50D/ were not used when initializing BertRnn: ['esm.encoder.layer.14.LayerNorm.weight', 'esm.encoder.layer.16.attention.LayerNorm.bias', 'esm.encoder.layer.5.attention.self.key.weight', 'esm.encoder.layer.19.attention.LayerNorm.weight', 'esm.encoder.layer.7.attention.self.value.bias', 'esm.encoder.layer.32.intermediate.dense.bias', 'esm.encoder.layer.30.LayerNorm.weight', 'esm.encoder.layer.7.LayerNorm.bias', 'esm.encoder.layer.6.output.dense.weight', 'esm.encoder.layer.28.intermediate.dense.bias', 'esm.encoder.layer.14.intermediate.dense.bias', 'esm.encoder.layer.8.attention.self.query.bias', 'esm.encoder.layer.27.attention.self.key.weight', 'esm.encoder.layer.3.attention.self.value.weight', 'esm.

trainable params: 256,514 || all params: 679,378,692 || trainable%: 0.03775714531829915


Loading model from /M2/ArgosMHC_models/checkpoints/lora_t33_c3_2/checkpoint-201000.
***** Running training *****
  Num examples = 539019
  Num Epochs = 6
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 202134
  Number of trainable parameters = 256514
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 5
  Continuing training from global step 201000
  Will skip the first 5 epochs then the first 32555 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.
Skipping the first batches:   0%|          | 0/32555 [00:00<?, ?it/s]Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it

Skipping the first batches: 100%|██████████| 32555/32555 [01:09<00:00, 470.04it/s]


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from /M2/ArgosMHC_models/checkpoints/lora_t33_c3_2/checkpoint-195000/ (score: 0.7632602449117809).

100%|██████████| 202134/202134 [07:02<00:00, 478.07it/s]
Saving model checkpoint to /M2/ArgosMHC_models/models/lora_t33_c3/
Trainer.model is not a `PreTrainedModel`, only saving its state dict.


{'train_runtime': 427.4167, 'train_samples_per_second': 7566.653, 'train_steps_per_second': 472.92, 'train_loss': 0.002940129368949713, 'epoch': 6.0}
finish training


Configuration saved in /M2/ArgosMHC_models/models/lora_t33_c3/config.json


In [2]:
print(model_)

PeftModelForTokenClassification(
  (base_model): LoraModel(
    (model): BertRnn(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(33, 1280, padding_idx=1)
          (position_embeddings): Embedding(1026, 1280)
          (token_type_embeddings): Embedding(2, 1280)
          (LayerNorm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0): BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=1280, out_features=1280, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.4, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=1280, out_f

When we opends the model using Trainer, the models inference well. So, we are missing a step when oopen the model. Moreover, when we print the model, we noticed that it is a PeftModelForTokenClassification

In [4]:
import pandas as pd
pre_trained = "/M2/ArgosMHC_models/pre_trained_models/esm2_t6_8M_UR50D/"
dataset = "/M2/ArgosMHC_models/dataset/hlab/hlab_test_micro.csv"

seq_length = 50 # for MHC-I

model_trainer = Trainer(model = model_, compute_metrics = compute_metrics)
test_dataset = DataSetLoaderBERT(dataset, tokenizer_name=pre_trained, max_length=seq_length)
predictions, label_ids, metrics = model_trainer.predict(test_dataset)

df = pd.DataFrame(predictions)

df['prediction'] = df.apply(lambda row: ( 0 if row[0] > row[1] else 1 ), axis=1)
df['label'] = label_ids
print(df)
#df.to_csv(name_results + ".csv")

#print(predictions)
print(metrics)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading file vocab.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
***** Running Prediction *****
  Num examples = 76
  Batch size = 8
100%|██████████| 10/10 [00:00<00:00, 14.74it/s]

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0] [[-0.72885877  0.7541259 ]
 [ 0.68282026 -0.63367534]
 [-1.0395614   0.9736944 ]
 [-1.0817848   1.0852101 ]
 [-0.46247494  0.55944276]
 [-0.56534374  0.5921434 ]
 [-0.83514065  0.8107847 ]
 [-1.2630697   1.1586009 ]
 [-0.95792246  0.980892  ]
 [-1.0324655   0.9924274 ]
 [-0.76731664  0.7217846 ]
 [-1.6330781   1.514336  ]
 [-0.7957608   0.8229094 ]
 [-0.9124353   0.8701133 ]
 [-1.2138525   1.209451  ]
 [-1.0927274   1.0711197 ]
 [-1.0666986   1.0522016 ]
 [-0.45787728  0.42578974]
 [-0.8832142   0.76278913]
 [-0.37021875  0.34802938]
 [-1.2862854   1.2059811 ]
 [-0.8621023   0.8841515 ]
 [-0.7213802   0.6836574 ]
 [-0.4424507   0.43031347]
 [-0.7615446   0.83321893]
 [-1.278306    1.2741243 ]
 [-0.90015024  0.880399  ]
 [-0.39320016  0.33496857]
 [ 0.24058262 -0.17222066]
 [-1.2310044   1.2370495 ]
 [-0.3303253   0.29723084]
 [-1.11




# 3 Predict Classic

Este código, evalua el modelo despues del fine-tuning en el conjunto de testing.

In [98]:
from model_utils_bert import BertRnn, BertRnnDist
from transformers import Trainer, TrainingArguments, BertConfig
from transformers import  AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from dataloader_bert import DataSetLoaderBERT
from utils import compute_metrics
import json
from transformers import TextClassificationPipeline, pipeline, BartForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import set_seed
import pandas as pd
import numpy as np

set_seed(10)

Abrimos el modelo y el tokenizer. Tambien definimos la ruta del modelo, el tokenizer (modelo pre entrenado) y la base de datos de testing. Finalmente, definimos el sufijo de de los archivos a generar como salida.

In [99]:
model_name = "/M2/ArgosMHC_models/checkpoints/classic_t6_c3/checkpoint-102000/"  # mejor checkpoiunt
name_results = "predictions_esm2_t33_c5" # 
#pre_trained = "/M2/ArgosMHC_models/pre_trained_models/esm2_t33_650M_UR50D/"
pre_trained = "/M2/ArgosMHC_models/pre_trained_models/esm2_t6_8M_UR50D/"
dataset = "/M2/ArgosMHC_models/dataset/hlab/hlab_test_micro.csv"

model = BertRnn.from_pretrained(model_name, num_labels=2) # it fail for automodel for sequence classification
tokenizer = AutoTokenizer.from_pretrained(pre_trained)

loading configuration file /M2/ArgosMHC_models/checkpoints/classic_t6_c3/checkpoint-102000/config.json
Model config BertConfig {
  "_name_or_path": "../pre_trained_models/esm2_t6_8M_UR50D",
  "architectures": [
    "BertRnn"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "cnn_dropout": 0.1,
  "cnn_filters": 512,
  "emb_layer_norm_before": false,
  "esmfold_config": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 320,
  "initializer_range": 0.02,
  "intermediate_size": 1280,
  "is_folding_model": false,
  "layer_norm_eps": 1e-05,
  "length": 50,
  "mask_token_id": 32,
  "max_position_embeddings": 1026,
  "model_type": "bert",
  "num_attention_heads": 20,
  "num_hidden_layers": 6,
  "num_rnn_layer": 2,
  "pad_token_id": 1,
  "position_embedding_type": "rotary",
  "rnn": "lstm",
  "rnn_dropout": 0.1,
  "rnn_hidden": 768,
  "token_dropout": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,

Definimos el DataLoader

In [100]:
seq_length = 50 # for MHC-I
test_dataset = DataSetLoaderBERT(dataset, tokenizer_name=pre_trained, max_length=seq_length)
data_iter = DataLoader(test_dataset, batch_size=16, shuffle=False)

print( tokenizer("YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGYLFGRDL", padding='max_length', max_length=seq_length) )
print(test_dataset[0])
print(type(test_dataset[0]))

loading file vocab.txt
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json


{'input_ids': [0, 19, 18, 5, 20, 19, 16, 9, 17, 20, 5, 21, 11, 13, 5, 17, 11, 4, 19, 12, 12, 19, 10, 13, 19, 11, 22, 7, 5, 10, 7, 19, 10, 6, 19, 4, 18, 6, 10, 13, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]}
{'input_ids': tensor([ 0, 19, 18,  5, 20, 19, 16,  9, 17, 20,  5, 21, 11, 13,  5, 17, 11,  4,
        19, 12, 12, 19, 10, 13, 19, 11, 22,  7,  5, 10,  7, 19, 10,  6, 19,  4,
        18,  6, 10, 13,  4,  2,  1,  1,  1,  1,  1,  1,  1,  1]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
        0, 0]), 'labels': tensor(1)}
<class 'dict'>


Prediccion usando el dataloader y según el batch size

In [102]:
# data_iter, es un dataLoader, de la base de datos de test, y tiene un batch_size de 16
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)

model.eval() # is a kind of switch for some specific layers/parts of the model that behave differently during training and inference (evaluating) time. For example, Dropouts Layers, BatchNorm Layers etc. You need to turn them off during model evaluation,

predictions = []
labels = []

with torch.no_grad(): # turn off gradients computation
    for i, batch in enumerate(data_iter): # por cada batch        
        labels.extend(batch['labels'].numpy())
        output = model(batch['input_ids'], batch['attention_mask']) # inference
        for row_sample in output.logits: # por cada muestra del batch
            logits = row_sample.numpy()
            probs = softmax(logits)
            predictions.append( [logits[0], logits[1], probs[0], probs[1]] )

#print(predictions)
df = pd.DataFrame(predictions, columns=["logit_class_0", "logit_class_1", "prob_class_0", "prob_class_1"])
df['prediction'] = df.apply(lambda row: ( 0 if row[0] > row[1] else 1 ), axis=1)
df['label'] = labels
print(df)

print(get_metrics(df['label'], df['prediction'], df['prob_class_1']))

    logit_class_0  logit_class_1  prob_class_0  prob_class_1  prediction  \
0        1.484725      -2.111731      0.973311      0.026689           0   
1        0.366563      -0.806652      0.763726      0.236274           0   
2       -3.797246       3.086962      0.001023      0.998977           1   
3       -4.328579       3.746734      0.000311      0.999689           1   
4       -1.433432       0.912185      0.087415      0.912585           1   
..            ...            ...           ...           ...         ...   
71       1.846199      -2.293438      0.984321      0.015679           0   
72      -0.228940      -0.585798      0.588280      0.411720           0   
73       1.457081      -2.315464      0.977523      0.022477           0   
74       1.226935      -1.907617      0.958296      0.041704           0   
75       2.441628      -3.086117      0.996041      0.003959           0   

    label  
0       1  
1       1  
2       1  
3       1  
4       1  
..    ...  
71 

Ejemplo de como hacer la Prediccion usando una sola muestra

In [73]:
# algunas muestras
"YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGYRSDTPLIY" # label 1
"YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGYNSDLVQKY" # label 1
"YFAMYGEKVAHTHVDTLYLRYHYYTWAVWAYTWYLLAASEAPR"  # label 0
"YFAMYGEKVAHTHVDTLYLRYHYYTWAVWAYTWYQWSEKVTEE"  # label 0

# el tokenizer devuelve los input_ids y el attention_mask como dos listas
sample = tokenizer("YFAMYQENMAHTDANTLYIIYRDYTWVARVYRGYNSDLVQKY", padding='max_length', max_length=seq_length)

# convertimos en tensor, debe ser lista de listas
ids = torch.IntTensor([sample['input_ids']]) # tensor 2D
masks = torch.IntTensor([sample['attention_mask']]) # tensor 2D

model.eval()
with torch.no_grad(): # turn off gradients computation
    output = model(ids, masks )   

print("output", output)

output SequenceClassifierOutput(loss=None, logits=tensor([-4.3286,  3.7467]), hidden_states=None, attentions=None)


# 4 Predict classic with Trainer

Este ejemplo, tambien hace las predicciones y cálcula las métricas, pero usando el Trainner.

In [92]:
# Predictions for HLAB dataset for TAPE

# load model
from model_utils_bert import BertRnn, BertRnnDist
from transformers import Trainer, TrainingArguments, BertConfig
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_auc_score
from dataloader_bert import DataSetLoaderBERT
from transformers import BertConfig
from utils import compute_metrics
import json

model_name = "/M2/ArgosMHC_models/checkpoints/classic_t6_c3/checkpoint-102000/"  # mejor checkpoiunt
pre_trained = "/M2/ArgosMHC_models/pre_trained_models/esm2_t6_8M_UR50D/"
dataset = "/M2/ArgosMHC_models/dataset/hlab/hlab_test_micro.csv"

seq_length = 50 # for MHC-I
config = BertConfig.from_pretrained(model_name, num_labels=2 )

model = Trainer(model = BertRnn.from_pretrained(model_name, config=config), compute_metrics = compute_metrics)
test_dataset = DataSetLoaderBERT(dataset, tokenizer_name=pre_trained, max_length=seq_length)
predictions, label_ids, metrics = model.predict(test_dataset)

df = pd.DataFrame(predictions)

df['prediction'] = df.apply(lambda row: ( 0 if row[0] > row[1] else 1 ), axis=1)
df['label'] = label_ids
print(df)
#df.to_csv(name_results + ".csv")

#print(predictions)
print(metrics)

loading configuration file /M2/ArgosMHC_models/checkpoints/classic_t6_c3/checkpoint-102000/config.json
Model config BertConfig {
  "_name_or_path": "../pre_trained_models/esm2_t6_8M_UR50D",
  "architectures": [
    "BertRnn"
  ],
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "cnn_dropout": 0.1,
  "cnn_filters": 512,
  "emb_layer_norm_before": false,
  "esmfold_config": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 320,
  "initializer_range": 0.02,
  "intermediate_size": 1280,
  "is_folding_model": false,
  "layer_norm_eps": 1e-05,
  "length": 50,
  "mask_token_id": 32,
  "max_position_embeddings": 1026,
  "model_type": "bert",
  "num_attention_heads": 20,
  "num_hidden_layers": 6,
  "num_rnn_layer": 2,
  "pad_token_id": 1,
  "position_embedding_type": "rotary",
  "rnn": "lstm",
  "rnn_dropout": 0.1,
  "rnn_hidden": 768,
  "token_dropout": true,
  "torch_dtype": "float32",
  "transformers_version": "4.24.0",
  "type_vocab_size": 2,

           0         1  prediction  label
0   1.484571 -2.111625           0      1
1   0.366120 -0.806223           0      1
2  -3.797319  3.086969           1      1
3  -4.328716  3.746843           1      1
4  -1.433371  0.912052           1      1
..       ...       ...         ...    ...
71  1.846014 -2.293381           0      0
72 -0.229090 -0.585730           0      0
73  1.457083 -2.315473           0      0
74  1.226967 -1.907715           0      0
75  2.441543 -3.086087           0      0

[76 rows x 4 columns]
{'test_loss': 0.3903467059135437, 'test_auc': 0.8815789473684211, 'test_precision': 0.9393939393939394, 'test_recall': 0.8157894736842105, 'test_f1': 0.8732394366197183, 'test_sn': 0.9393939393939394, 'test_sp': 0.9473684210526315, 'test_accuracy': 0.881578947368421, 'test_mcc': 0.7698512161427534, 'test_runtime': 0.1327, 'test_samples_per_second': 572.778, 'test_steps_per_second': 75.366}



