In [1]:
from transformers import AutoModel, AutoTokenizer

# BERT Base
tokenizer = AutoTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')
model = AutoModel.from_pretrained('neuralmind/bert-base-portuguese-cased')

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
import pandas as pd

In [15]:
df = pd.read_csv("imdb-reviews-pt-br.csv")
df['labels'] = df['sentiment'].map({'pos': 1, 'neg': 0})
df = df[['text_pt', 'labels']]

In [16]:
df.head(2)

Unnamed: 0,text_pt,labels
0,"Mais uma vez, o Sr. Costner arrumou um filme p...",0
1,Este é um exemplo do motivo pelo qual a maiori...,0


In [17]:
from sklearn.model_selection import train_test_split

In [18]:
df_train, df_test = train_test_split(df, test_size=0.2)

In [19]:
df_train.shape

(39567, 2)

In [20]:
df_test.shape

(9892, 2)

In [21]:
from datasets import Dataset, DatasetDict

In [22]:
datasets = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "test": Dataset.from_pandas(df_test)
    })
datasets = datasets.remove_columns(["__index_level_0__"])

In [23]:
# dataset = Dataset.from_pandas(df)
# dataset_train = Dataset.from_pandas(df_train)
# dataset_test = Dataset.from_pandas(df_test)

In [24]:
datasets

DatasetDict({
    train: Dataset({
        features: ['text_pt', 'labels'],
        num_rows: 39567
    })
    test: Dataset({
        features: ['text_pt', 'labels'],
        num_rows: 9892
    })
})

In [25]:
datasets['train']["text_pt"][0]

'Só de dizer que você tem um filme sobre John Holmes é uma garantia para colocar algumas pessoas na frente da tela, mas o roteirista / diretor James Cox oferece muito mais. Um "Rashamon" do desprezível cenário de Hollywood, o filme divide os assassinatos de julho de 1981 no país das maravilhas através de uma variedade de ângulos e imagens de filmes, mas principalmente através do filtro de John Holmes cogou o cérebro de doninha. Em um filme cheio de vilões, Holmes é o mais vil, o mais patético ou ambos. Várias versões da história emergem e se fundem, enquanto Cox faz pulos e cartas de título em meio a efeitos e emoticons. O diálogo é rápido e naturalista e nunca soa falso. Enquanto o filme acontece dois anos depois de Holmes ter caído de pornografia e se tornar uma depravação alimentada por drogas verdadeiramente perversa, Kilmer exala implacavelmente uma sexualidade tão intensa que pode ser medida em centímetros. Essa sexualidade em suas bordas cria uma sensação de mau presságio que pa

In [26]:
#tokenizer?

In [27]:
tokenizer(datasets['train']["text_pt"][0])

{'input_ids': [101, 7178, 125, 4640, 179, 3983, 376, 222, 998, 498, 2230, 20239, 253, 230, 13649, 221, 7201, 1450, 1101, 229, 2375, 180, 7179, 117, 449, 146, 12919, 120, 2481, 3610, 385, 22312, 6158, 785, 325, 119, 1263, 107, 15505, 546, 181, 107, 171, 10968, 22305, 835, 5391, 125, 7331, 117, 146, 998, 11372, 259, 15986, 125, 1618, 125, 11730, 202, 806, 366, 14192, 10661, 1115, 125, 230, 5402, 125, 18855, 122, 4255, 125, 2465, 117, 449, 1953, 1115, 171, 450, 552, 125, 2230, 20239, 15839, 203, 146, 10209, 125, 171, 4029, 13808, 119, 335, 222, 998, 13140, 125, 16998, 117, 20239, 253, 146, 325, 4661, 117, 146, 325, 4286, 2906, 291, 2592, 119, 11533, 4350, 180, 1081, 6276, 210, 122, 176, 1142, 210, 117, 1139, 385, 22312, 659, 5995, 128, 122, 6536, 125, 1461, 173, 1423, 123, 3997, 122, 5088, 713, 255, 119, 231, 12356, 253, 5941, 122, 21132, 122, 2364, 331, 22278, 15740, 119, 2942, 146, 998, 6491, 682, 481, 700, 125, 20239, 370, 21020, 125, 18591, 3057, 122, 176, 2962, 230, 7642, 5630, 182, 

In [34]:
max_target_length=512
def preprocess_function(examples):
    model_inputs= tokenizer(examples['text_pt'], truncation=True, max_length=512)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["text_pt"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [35]:
tokenized_datasets = datasets.map(preprocess_function, batched=True)#Dataset.from_pandas(dataset_train)
# tokenized_dataset_test = dataset_test.map(preprocess_function, batched=True)#Dataset.from_pandas(dataset_test)

  0%|          | 0/40 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

In [36]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text_pt', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 39567
    })
    test: Dataset({
        features: ['text_pt', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9892
    })
})

In [37]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [38]:
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='neuralmind/bert-base-portuguese-cased', vocab_size=29794, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [39]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

In [40]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [41]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertModel.forward` and have been ignored: text_pt, labels. If text_pt, labels are not expected by `BertModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 39567
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4946


KeyError: 'loss'