# Data loader de HLAB

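Este es el data loader original de HLAB. Es peculiar porque la propia clase del dataset se encarga de obtener las pseudosecuencias de HLA (leídas de `source/MHC_pseudo.dat`) y de concatenarlas con el péptido antes de tokenizar.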

In [1]:
import torch
from transformers import BertTokenizer
from torch.utils.data import Dataset
import os
import pandas as pd
import numpy as np
import re

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

class Load_Dataset(Dataset):
    """Peptide/HLA dataset that yields BERT-tokenized samples.

    Each example is built by concatenating the HLA pseudo-sequence with the
    peptide, right-padding the result with 'X' up to ``PAD_LENGTH`` residues,
    and writing it as space-separated residues. Tokenization happens lazily in
    ``__getitem__``.
    """

    # Number of residues every HLA+peptide string is padded to with 'X'.
    # NOTE(review): the default max_length of 51 is presumably these 49
    # residues plus two special tokens added by the tokenizer -- confirm
    # before changing either value.
    PAD_LENGTH = 49

    # CSV file backing each supported split.
    SPLIT_FILES = {
        "train": "dataset/train_data.csv",
        "valid": "dataset/valid_data.csv",
        "test": "dataset/test_data.csv",
    }

    def __init__(self, split="train", tokenizer_name='../../models/prot_bert_bfd', max_length=51):
        """
        Args:
            split: which CSV to load; one of "train", "valid" or "test".
            tokenizer_name: path or identifier handed to
                ``BertTokenizer.from_pretrained``.
            max_length: token length every sample is padded/truncated to.

        Raises:
            ValueError: if ``split`` is not a known split name.
        """
        # Fail fast: previously an unknown split left seqs/labels undefined and
        # only surfaced later as an AttributeError.
        if split not in self.SPLIT_FILES:
            raise ValueError(
                "Unknown split %r; expected one of %s" % (split, sorted(self.SPLIT_FILES))
            )
        self.max_length = max_length
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=False)
        self.seqs, self.labels = self.load_dataset(self.SPLIT_FILES[split])

    def HLA_trans(self):
        """Return a dict mapping each HLA name to its pseudo-sequence.

        The table is read from the tab-separated file 'source/MHC_pseudo.dat',
        which must provide the columns 'HLA' and 'sequence'.
        """
        HLA_seq = pd.read_csv('source/MHC_pseudo.dat', sep='\t')
        return dict(zip(HLA_seq.HLA, HLA_seq.sequence))

    def transform(self, HLA, peptide):
        """Concatenate ``HLA`` and ``peptide`` and right-pad with 'X'.

        Strings already at or beyond ``PAD_LENGTH`` are returned unpadded.
        """
        return (HLA + peptide).ljust(self.PAD_LENGTH, 'X')

    def _combine_rows(self, data):
        """Build the padded HLA+peptide string for every row of ``data``.

        ``data`` must have 'HLA' and 'peptide' columns. Raises KeyError when an
        allele is missing from the pseudo-sequence table.
        """
        pseudo = self.HLA_trans()
        # The lookup key drops the character at index 5 of the allele name
        # (presumably the '*' in names such as 'HLA-A*02:01' -- verify against
        # the pseudo-sequence file).
        return [
            self.transform(HLA=pseudo[hla[0:5] + hla[6:]], peptide=peptide)
            for hla, peptide in zip(data['HLA'], data['peptide'])
        ]

    def read_and_prepare(self, file):
        """Read the CSV ``file`` and return the padded strings as an (n, 1) array."""
        combined = self._combine_rows(pd.read_csv(file))
        if not combined:
            # np.vstack rejects an empty sequence; keep the (n, 1) contract.
            return np.empty((0, 1), dtype=str)
        return np.vstack(combined)

    def get_label(self, file):
        """Read the CSV ``file`` and return its 'Label' values wrapped in a list."""
        data = pd.read_csv(file)
        return [data['Label'].values]

    def load_dataset(self, data_path):
        """Load one split from ``data_path``.

        Returns:
            A tuple ``(sequences, labels)`` where ``sequences`` is a list of
            space-separated residue strings (e.g. 'Y D S E ...') and ``labels``
            is the array of 'Label' values.
        """
        # Read the CSV once and derive both inputs and labels from it.
        data = pd.read_csv(data_path)
        y_label = data['Label'].values
        spaced = [" ".join(seq) for seq in self._combine_rows(data)]
        return (spaced, y_label)

    def __len__(self):
        """Number of examples in the split."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return a dict of tensors plus 'labels'."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Normalise case BEFORE replacing rare residues, otherwise lowercase
        # u/z/o/b/j would slip past the substitution and stay unmapped.
        residues = "".join(self.seqs[idx].split()).upper()
        residues = re.sub(r"[UZOBJ]", "X", residues)
        seq = " ".join(residues)

        seq_ids = self.tokenizer(seq, truncation=True, padding='max_length', max_length=self.max_length)

        sample = {key: torch.tensor(val) for key, val in seq_ids.items()}
        sample['labels'] = torch.tensor(self.labels[idx])

        return sample


In [4]:
# Local checkpoint directory shared by the tokenizer and the model.
model_name = "../../models/prot_bert_bfd"
# Token length every sample is padded/truncated to.
max_seq_length = 51

# Build the three splits with identical tokenizer settings, in train/valid/test order.
train_dataset, val_dataset, test_dataset = (
    Load_Dataset(split=split_name, tokenizer_name=model_name, max_length=max_seq_length)
    for split_name in ("train", "valid", "test")
)


In [6]:
# Sanity check: fetch the first training sample once and show the shape of its
# token ids. The previous bare `train_dataset[0]` expression was discarded
# (it was not the last line of the cell), so the sample was tokenized twice.
first_sample = train_dataset[0]
print(first_sample['input_ids'].shape)

torch.Size([51])


Train

In [7]:
"""
export MODEL_NAME='Rostlab/prot_bert_bfd'
#export MODEL_NAME=./checkpoint-25200
export OUTPUT_DIR=./results_lr5e-5_bert_gas16_bs16
export LOGGING_DIR=./logging_lr5e-5_bert_gas16_bs16
export LR=5e-5
python run_finetune.py \
    --model_name_or_path $MODEL_NAME \
    --output_dir $OUTPUT_DIR\
    --logging_dir $LOGGING_DIR \
    --max_seq_length 51 \
    --do_train \
    --do_eval \
    --per_device_train_batch_size=16   \
    --per_device_eval_batch_size=32   \
    --num_train_epochs 3 \
    --gradient_accumulation_steps 16 \
    --lr $LR\
    --weight_decay 0.01 \
    --warmup_steps 1000 \
    --eval_steps 1052 \
    --save_steps 1052 \
    --logging_steps 1052 \
    --fp16
"""

from transformers import Trainer, TrainingArguments, BertConfig
from bin.model_utils import BertForSequenceClassification
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, roc_auc_score

def compute_metrics(pred):
    '''
    Compute binary-classification metrics for one evaluation pass.

    :param pred: object exposing ``label_ids`` (true labels) and
        ``predictions`` (per-class scores). The scores are assumed to have
        shape (n_samples, 2), matching ``num_labels=2`` -- TODO confirm against
        the model's actual outputs.
    :return: dict with keys 'auc', 'sn' (sensitivity), 'sp' (specificity),
        'accuracy' and 'mcc'. A metric that is undefined for the given labels
        (for example when a class is absent) is reported as NaN.
    '''
    labels = pred.label_ids
    scores = pred.predictions
    preds = scores.argmax(-1)
    undefined = float('nan')

    # Pin the label order so the matrix is always 2x2; without it a batch that
    # contains a single class yields a 1x1 matrix and the unpacking fails.
    tn, fp, fn, tp = confusion_matrix(labels, preds, labels=[0, 1]).ravel()
    sn = tp / (tp + fn) if (tp + fn) > 0 else undefined
    sp = tn / (tn + fp) if (tn + fp) > 0 else undefined
    mcc = matthews_corrcoef(labels, preds)
    acc = accuracy_score(labels, preds)

    # ROC AUC needs a continuous score for the positive class. Feeding the
    # argmax labels collapses the curve to a single operating point. The
    # difference between the class-1 and class-0 scores ranks samples exactly
    # like the softmax probability of class 1.
    if len(np.unique(labels)) < 2:
        auc = undefined
    else:
        auc = roc_auc_score(labels, scores[:, 1] - scores[:, 0])

    return {
        'auc': auc,
        'sn': sn,
        'sp': sp,
        'accuracy': acc,
        'mcc': mcc
    }

# Model configuration for 2-label classification. It is loaded from
# `model_name` so that the config, tokenizer and weights always come from the
# same checkpoint instead of repeating the path literal.
config = BertConfig.from_pretrained(model_name, num_labels=2)
# NOTE(review): the script version also copied rnn, num_rnn_layer, rnn_dropout,
# rnn_hidden, length (= max_seq_length), cnn_filters and cnn_dropout from its
# command-line args onto `config`. Presumably only the RNN/CNN head variants
# read them -- confirm before switching to one of those heads.

# Training hyper-parameters.
# NOTE(review): the reference command quoted at the top of this cell uses
# 3 epochs and --fp16, while this configuration uses 10 epochs and leaves mixed
# precision off; the recorded run of this notebook stopped with a CUDA
# out-of-memory error on a 7.77 GiB GPU. Confirm which settings are intended.
training_args = TrainingArguments(
        output_dir="results/",  # output directory
        num_train_epochs=10,  # total number of training epochs
        per_device_train_batch_size = 16,  # batch size per device during training
        per_device_eval_batch_size = 32,  # batch size for evaluation
        warmup_steps = 1000,  # number of warmup steps for learning rate scheduler
        weight_decay = 0.01,  # strength of weight decay
        learning_rate = 5e-5,  # The initial learning rate for optimizer.
        logging_dir="results/",  # directory for storing logs
        logging_steps=1052,  # How often to print logs
        save_steps=1052,  # How often to write a checkpoint
        # eval_steps only takes effect with a step-based evaluation strategy.
        # With the default ("no") the eval_dataset and compute_metrics given to
        # the Trainer are never used during training.
        # (Newer transformers releases name this argument `eval_strategy`.)
        evaluation_strategy="steps",
        eval_steps=1052,  # How often to eval
        gradient_accumulation_steps=16,  # micro-batches accumulated per optimizer step
    )

# Classification network (ProBERT head) initialised from the pretrained checkpoint.
# Alternative heads mentioned for this project, all loaded the same way via
# `.from_pretrained(model_name, config=config)`:
#   ProteinBertSequenceClsRnn      (ProBERT+BiLSTM)
#   ProteinBertSequenceClsRnnAtt   (ProBERT+BiLSTM+Attention)
#   ProteinBertSequenceClsCnn      (ProBERT+CNN)
classifier = BertForSequenceClassification.from_pretrained(model_name, config=config)

# NOTE: despite its name, `model` holds the Trainer wrapper, not the network itself.
model = Trainer(
    model=classifier,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

Some weights of the model checkpoint at ../../models/prot_bert_bfd were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not

In [9]:
# `model` is the Trainer instance (not the network): run the fine-tuning loop
# and then write the resulting model to 'models/'.
# To continue from a saved checkpoint instead of starting fresh, use:
# model.train(resume_from_checkpoint="./checkpoint-25200")  #continue from checkpoint
model.train()
model.save_model('models/')
# Optional evaluation on the held-out test split (currently disabled):
#predictions, label_ids, metrics = model.predict(test_dataset)
#print(metrics)

***** Running training *****
  Num examples = 539019
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 256
  Gradient Accumulation steps = 16
  Total optimization steps = 21050
  Number of trainable parameters = 419933186


RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 7.77 GiB total capacity; 5.02 GiB already allocated; 21.31 MiB free; 5.13 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

: 