# Transformers

Ejemplo de cómo abrir un modelo preentrenado y entrenarlo una vez más en otra base de datos (no se modifica el modelo)

In [1]:
# dataset ##########################################################################################
from datasets import load_dataset

# Load the dataset registered under the name "yelp_review_full".
dataset = load_dataset("yelp_review_full")
# Inspect a single training example (index 100).
dataset["train"][100]
{'label': 0,
 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'}

from transformers import AutoTokenizer

# Load the tokenizer that belongs to the "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """
    Tokenize the ``"text"`` entry of ``examples`` with the module-level tokenizer.

    :param examples: mapping that provides the raw text under the key ``"text"``
    :return: the tokenizer's output, requested with ``"max_length"`` padding
        and truncation enabled
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Tokenize the dataset; batched=True hands tokenize_function batches of examples.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Shuffle the train and test splits with a fixed seed (42) and keep the first
# 1000 examples of each as small training / evaluation subsets.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))


# model ##########################################################################################
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
import evaluate

# Sequence-classification model initialised from the "bert-base-cased"
# checkpoint and configured for 5 labels.
model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
# Trainer configuration: output goes to "test_trainer".
training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") 
# evaluation_strategy is for monitoring metrics during evaluation

# Accuracy metric object; compute_metrics calls its compute() method.
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """
    Score a batch of predictions with the module-level ``metric``.

    :param eval_pred: pair ``(logits, labels)``
    :return: the result of ``metric.compute`` for the arg-max (over the last
        axis) of the logits against the labels
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)

# training #########################################################################################
# Bundle the model, its training arguments, the small train/eval subsets and
# the metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)

# The training call is left commented out in this cell.
#trainer.train()

  from .autonotebook import tqdm as notebook_tqdm
Found cached dataset yelp_review_full (/home/vicente/.cache/huggingface/datasets/yelp_review_full/yelp_review_full/1.0.0/e8e18e19d7be9e75642fc66b198abadb116f73599ec89a69ba5dd8d1e57ba0bf)
100%|██████████| 2/2 [00:00<00:00, 204.71it/s]
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequ

# Custom architecture

In [None]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
# Load a BERT tokenizer; the checkpoint name and the lower-casing flag are read
# from `args`, which is not defined in this cell.
tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
def get_tokenized_samples(samples, max_seq_length, tokenizer):
    """
    Convert word-labelled samples into fixed-length BERT input features.

    Each sample's text is split on single spaces and every word is tokenized
    with ``tokenizer``. A word's label is attached to its first sub-token; any
    further sub-tokens of the same word get the placeholder label ``"X"``.
    The token sequence is cut so that it fits together with ``[CLS]`` and
    ``[SEP]``, and all lists are padded up to ``max_seq_length``.

    Two names are read from the enclosing scope and must be defined there:
    ``label_map``, a callable that maps a label to an index or vector
    encoding, and ``label_list``, whose length determines the size of the
    all-zero label vectors used for padding positions.

    :param samples: iterable of objects with ``.text`` (a string) and
        ``.label`` (a sequence holding one label per space-separated word)
    :param max_seq_length: the maximal sequence length
    :param tokenizer: BERT tokenizer (must offer ``tokenize`` and
        ``convert_tokens_to_ids``)
    :return: list of ``(input_ids, input_mask, segment_ids, label_ids)`` tuples,
        one per sample
    :raises IndexError: if a sample has fewer labels than words
    """
    features = []
    for sample in samples:
        words = sample.text.split(' ')
        word_labels = sample.label
        tokens = []
        labels = []
        for i, word in enumerate(words):
            pieces = tokenizer.tokenize(word)  # tokenize word according to BERT
            tokens.extend(pieces)
            # Indexed on purpose (not zip): a missing label must fail loudly
            # instead of silently dropping the trailing words.
            label = word_labels[i]
            # Fit labels to the tokenized size of the word: the real label on
            # the first sub-token, "X" on every continuation sub-token.
            labels.extend(label if m == 0 else "X" for m in range(len(pieces)))

        # If we exceed the max sequence length, cut the sample so that two
        # positions stay free for [CLS] and [SEP].
        max_tokens = max_seq_length - 2
        if len(tokens) > max_tokens:
            tokens = tokens[:max_tokens]
            labels = labels[:max_tokens]

        # Wrap the sequence in the special tokens; everything is segment 0.
        ntokens = ["[CLS]"] + tokens + ["[SEP]"]
        segment_ids = [0] * len(ntokens)

        # NOTE(review): the special tokens are handed to label_map wrapped in a
        # list while ordinary labels are passed bare -- kept as in the
        # original; confirm this is what label_map expects.
        label_ids = [label_map(["[CLS]"])]
        label_ids.extend(label_map(label) for label in labels)
        label_ids.append(label_map(["[SEP]"]))

        # Convert tokens to IDs and mark every real token in the mask.
        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        input_mask = [1] * len(input_ids)

        # Pad with zeros to the maximal length (no-op when already full).
        pad = max_seq_length - len(input_ids)
        input_ids.extend([0] * pad)
        input_mask.extend([0] * pad)
        segment_ids.extend([0] * pad)
        # A fresh zero vector per padded position so the entries are not aliased.
        label_ids.extend([0] * (len(label_list) + 1) for _ in range(pad))

        # Fixed: the original appended the undefined name `label_id` here,
        # which raised NameError on the first sample.
        features.append((input_ids, input_mask, segment_ids, label_ids))
    return features

In [None]:
import torch
from pytorch_pretrained_bert.modeling import BertPreTrainedModel, BertModel

class MyBertBasedModel(BertPreTrainedModel):
    """
    BERT encoder followed by dropout and a linear classification layer.

    MyBertBasedModel inherits from BertPreTrainedModel which is an abstract class to handle weights initialization and
        a simple interface for downloading and loading pre-trained models.
    """

    def __init__(self, config, num_labels):
        """
        Build the encoder and the classification head.

        :param config: BERT configuration; ``hidden_dropout_prob`` and
            ``hidden_size`` are read from it
        :param num_labels: output size of the linear classifier
        """
        super(MyBertBasedModel, self).__init__(config)
        self.num_labels = num_labels
        self.bert = BertModel(config) # basic BERT model
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        self.classifier = torch.nn.Linear(config.hidden_size, num_labels)
        self.apply(self.init_bert_weights)


    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        """
        Run BERT plus the classification head.

        :param input_ids: token ids passed to the BERT encoder
        :param token_type_ids: optional segment ids passed to the encoder
        :param attention_mask: optional mask; positions equal to 1 are the
            ones that contribute to the loss
        :param labels: optional targets, reshaped to ``(-1, num_labels)``
        :return: the loss when ``labels`` is given, otherwise the logits
        """
        sequence_output, _ = self.bert(input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False)
        # now you can implement any architecture that receives bert sequence output
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is None:
            return logits

        # NOTE(review): MyLoss is not defined in this cell; it must be
        # provided elsewhere before the loss branch can run.
        loss_fct = MyLoss()
        flat_logits = logits.view(-1, self.num_labels)
        flat_labels = labels.view(-1, self.num_labels)
        # it is important to activate the loss only on un-padded inputs.
        # Fixed: attention_mask defaults to None, and the original called
        # .view() on it unconditionally; without a mask every position counts.
        if attention_mask is not None:
            active_loss = attention_mask.view(-1) == 1
            flat_logits = flat_logits[active_loss]
            flat_labels = flat_labels[active_loss]
        return loss_fct(flat_logits, flat_labels)

In [None]:
# Build the training features and load the pre-trained weights into the custom model.
# NOTE(review): train_samples, args, num_labels, n_epochs and optimizer are not
# defined in this cell; they must exist before it is run.
train_tokenized_samples = get_tokenized_samples(train_samples, args.max_seq_length, tokenizer)
model = MyBertBasedModel.from_pretrained(args.bert_model, num_labels=num_labels)
model.train()
# Fixed: the original `for range(n_epochs):` had no loop variable (SyntaxError).
for _ in range(n_epochs):
    for sample in train_tokenized_samples:
        input_ids, input_mask, segment_ids, label_ids = sample
        # NOTE(review): the features are plain Python lists; the model
        # presumably needs them converted to batched tensors first -- confirm.
        # Clear the gradients of the previous step; backward() accumulates
        # into .grad, so without this every update would mix in stale gradients.
        optimizer.zero_grad()
        loss = model(input_ids, segment_ids, input_mask, label_ids)
        loss.backward()
        optimizer.step()