# HLAB dataloader adapted

Este dataloader es una version actualizada de HLAB adaptado a la BD utilzada por BERTMHC

In [1]:
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset
import os
import pandas as pd
import numpy as np
import re

from transformers import Trainer, TrainingArguments, BertConfig
from bin.model_utils import BertForSequenceClassificationTAPE, BertForSequenceClassification
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_auc_score


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class My_Load_Dataset(Dataset):
    def __init__(self, path, tokenizer_name='../../models/prot_bert_bfd', max_length=51):                  
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)

        self.seqs, self.labels = self.load_dataset(path)        
        self.max_length = max_length

    # usadao por HLAB, nosotros ya tenemos las seudosecuencias
    """def HLA_trans(self):
        HLA_seq = pd.read_csv('source/MHC_pseudo.dat', sep='\t')
        seqs = {}
        for i in range(len(HLA_seq)):
            seqs[HLA_seq.HLA[i]] = HLA_seq.sequence[i]
        return seqs
    """
    def transform(self, HLA, peptide):
        data = HLA + peptide
        data = data + 'X' * (49 - len(data)) 
        return data

    def read_and_prepare(self,file):
        data = pd.read_csv(file)
        """ # de HLAB original
        seqs = self.HLA_trans()
        data['cost_cents'] = data.apply(
            lambda row: self.transform(
                HLA=seqs[row['HLA'][0:5]+row['HLA'][6:]],
                peptide=row['peptide']),
            axis=1)
        return np.vstack(data.cost_cents)"""
        data['cost_cents'] = data.apply(
            lambda row: self.transform(HLA=row['mhc'], peptide=row['peptide']), axis=1)
        return np.vstack(data.cost_cents)

    def get_label(self,file):
        data = pd.read_csv(file)
        label = []
        #label.append(data['Label'].values)
        label.append(data['masslabel'].values) # netMHCpan3.2 database
        return label

    def load_dataset(self,data_path):
        file = data_path
        df = pd.read_csv(file)
        y_label = self.get_label(file)[0]
        X_test = self.read_and_prepare(file)
        X_test = X_test.tolist()
        X_test = [' '.join(eachseq) for eachseq in X_test]
        X_test = [" ".join(eachseq) for eachseq in
                  X_test]  # ['Y D S E Y R N I F T N T D E S N L Y L S Y N Y Y T W A V D A Y T W Y H M M V I F R L M',.....,'Y D S E Y R N I F T N T D E S N L Y L S Y N Y Y T W A V D A Y T W Y N F L I K F L L I']

        return (X_test, y_label)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        seq = " ".join("".join(self.seqs[idx].split()))
        

        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])

        return sample


In [3]:
model_name = "../../models/prot_bert_bfd"
train_dataset = My_Load_Dataset(path="../../dataset/netMHCIIpan3.2/train_mini.csv", tokenizer_name=model_name, max_length=51)
val_dataset = My_Load_Dataset(path="../../dataset/netMHCIIpan3.2/eval_mini.csv", tokenizer_name=model_name, max_length=51)
test_dataset = My_Load_Dataset(path="../../dataset/netMHCIIpan3.2/test_mini.csv", tokenizer_name=model_name, max_length=51)


In [8]:
print(train_dataset[0])
print(train_dataset[1])
#print(train_dataset[0]['input_ids'].shape)

{'input_ids': tensor([ 2, 18,  9, 19, 19, 11,  6, 10,  7,  6,  6,  8, 14,  6, 11, 21,  9, 10,
        10, 19, 14, 20, 19, 14, 11, 14,  9,  6, 15, 20, 22,  8,  8, 19, 15, 15,
        11, 16,  5,  8,  6,  5, 15,  5, 15, 10, 20,  5,  7,  5,  3]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0]), 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1]), 'labels': tensor(1)}
{'input_ids': tensor([ 2, 18,  9, 19, 19, 11,  6, 10,  7,  6,  6,  8, 14,  6, 11, 21,  9,  5,
        10, 19,  9, 20, 20,  8,  5, 18, 12, 18, 17, 20, 22,  8,  8, 19, 15,  5,
        10,  9,  9, 12,  8, 16, 24, 14, 18,  8,  8, 21, 15, 10,  3]), 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

: 

In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    prediction=pred.predictions
    preds = prediction.argmax(-1)
    tn, fp, fn, tp = confusion_matrix(labels, preds).ravel()
    sn = tp / (tp + fn)
    sp = tn / (tn + fp)
    mcc = matthews_corrcoef(labels, preds)
    acc = accuracy_score(labels, preds)
    auc = roc_auc_score(labels, preds)
    return {
        'auc': auc,
        'sn': sn,
        'sp': sp,
        'accuracy': acc,
        'mcc': mcc
    }

config = BertConfig.from_pretrained("../../models/prot_bert_bfd", num_labels=2)
#config = BertConfig.from_pretrained("bert-base", num_labels=2)

training_args = TrainingArguments(
        output_dir="results/",  # output directory
        num_train_epochs=10,  # total number of training epochs
        per_device_train_batch_size = 32,  # batch size per device during training
        per_device_eval_batch_size = 32,  # batch size for evaluation
        warmup_steps = 1000,  # number of warmup steps for learning rate scheduler
        weight_decay = 0.01,  # strength of weight decay
        learning_rate = 5e-5,  # The initial learning rate for optimizer.
        logging_dir="results/",  # directory for storing logs './logs'
        logging_steps=1052,  # How often to print logs
        save_steps=1052,        
        eval_steps=1052,  # How often to eval
        gradient_accumulation_steps=16,  # total number of steps before back propagation       
    )

model = Trainer(        
        args=training_args,  # training arguments, defined above
        #model=BertForSequenceClassificationTAPE.from_pretrained('bert-base'),  # ProBERT
        model=BertForSequenceClassification.from_pretrained(model_name, config=config),  # ProBERT
        # model=ProteinBertSequenceClsRnn.from_pretrained(model_name, config=config),       # ProBERT+BiLSTM
        # model=ProteinBertSequenceClsRnnAtt.from_pretrained(model_name, config=config),    # ProBERT+BiLSTM+Attention
        # model=ProteinBertSequenceClsCnn.from_pretrained(model_name, config=config),       # ProBERT+CNN
        train_dataset=train_dataset,  # training dataset
        eval_dataset=val_dataset,  # evaluation dataset
        compute_metrics=compute_metrics,  # evaluation metrics
    )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [7]:
# model.train(resume_from_checkpoint="./checkpoint-25200")  #continue from checkpoint
model.train()
model.save_model('models/')
#predictions, label_ids, metrics = model.predict(test_dataset)
#print(metrics)

***** Running training *****
  Num examples = 107424
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 512
  Gradient Accumulation steps = 16
  Total optimization steps = 2090
  Number of trainable parameters = 91963394
  0%|          | 0/2090 [00:00<?, ?it/s]

TypeError: forward() got an unexpected keyword argument 'attention_mask'

# Model outputs

In [6]:
# evaluamos la salida de los modelos BERT

from transformers import AutoTokenizer, BertModel
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
print(inputs)
outputs = model(**inputs)

pooler_output = outputs.pooler_output
print(pooler_output.shape)

last_hidden_state = outputs.last_hidden_state
print(last_hidden_state.shape)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'input_ids': tensor([[  101,  7592,  1010,  2026,  3899,  2003, 10140,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([1, 768])
torch.Size([1, 8, 768])


In [19]:
# para el caso de ProtBert-bfd

from transformers import AutoTokenizer, BertModel
from transformers import BertTokenizer
from transformers import Trainer, TrainingArguments, BertConfig
from bin.model_utils import BertForSequenceClassificationTAPE, BertForSequenceClassification

model_path = '../../models/prot_bert_bfd'

tokenizer = BertTokenizer.from_pretrained(model_path, do_lower_case=False)
inputs = tokenizer("A B C D", truncation=True, padding='max_length', max_length=5)

print(inputs)

config = BertConfig.from_pretrained(model_path, num_labels=2)
model = BertForSequenceClassification.from_pretrained(model_path, config=config)

outputs = model(inputs)

pooler_output = outputs.pooler_output
print(pooler_output.shape)

last_hidden_state = outputs.last_hidden_state
print(last_hidden_state.shape)

{'input_ids': [2, 6, 27, 23, 3], 'token_type_ids': [0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1]}


Some weights of the model checkpoint at ../../models/prot_bert_bfd were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not

AttributeError: 

: 