In [1]:
import pickle
from transformers import AutoTokenizer
from transformers import DataCollatorForTokenClassification
from datasets import load_metric, GeneratorBasedBuilder, load_dataset, load_from_disk
import numpy as np

import json

from typing import List

# Opening file

In [2]:
def open_file(path:str) -> dict:
    file = open(path, 'rb')
    pickle_dict = pickle.load(file)
    file.close()

    return pickle_dict

In [3]:
TRAIN_PKL = r'C:\Users\beama\Desktop\Projects\nlp-transformers-studies\ml4nlp-covid19ner\train.pkl'
TEST_PKL = r'C:\Users\beama\Desktop\Projects\nlp-transformers-studies\ml4nlp-covid19ner\test.pkl'
VAL_PKL = r'C:\Users\beama\Desktop\Projects\nlp-transformers-studies\ml4nlp-covid19ner\val.pkl'

In [4]:
train_dict = open_file(TRAIN_PKL)
test_dict = open_file(TEST_PKL)
val_dict = open_file(VAL_PKL)

In [5]:
print("train keys:", train_dict.keys())
print("test keys:", test_dict.keys())
print("val keys:", val_dict.keys())

train keys: dict_keys(['id', 'word_seq', 'tag_seq'])
test keys: dict_keys(['id', 'word_seq'])
val keys: dict_keys(['id', 'word_seq', 'tag_seq'])


In [6]:
print("train len:", len(train_dict['id']))
print("test len:",  len(test_dict['id']))
print("val len:",  len(val_dict['id']))

train len: 23600
test len: 2950
val len: 2950


Divisao de treino/test/validacao: 80/10/10

In [7]:
# getting unique ner_tags
ner_tags = set()

for value_array in train_dict['tag_seq']:
    for word in value_array:
        ner_tags.add(word)

for value_array in val_dict['tag_seq']:
    for word in value_array:
        ner_tags.add(word)

In [8]:
ner_tags_id = dict()
ner_tags_values = dict()
for i, value in enumerate(ner_tags):
    ner_tags_id[i] = value
    ner_tags_values[value] = i

In [9]:
def dict_to_array(dicitionary: dict, ner_tags_values:dict=None) -> list:
    id_values = dicitionary['id']
    word_values = dicitionary['word_seq']

    array = []
    if 'tag_seq' in dicitionary.keys():
        tag_values = dicitionary['tag_seq']

        for id, word, tag in zip(id_values, word_values, tag_values):
            # ner_tags = [ner_tags_values[value] for value in tag]
            aux = {'id': id, 
                   'tokens': word,
                   'ner_tags': ner_tags}
            array.append(aux)
    else:
        for id, word in zip(id_values, word_values):
            aux = {'id': id, 
                   'tokens': word}
            array.append(aux)
    
    return array

In [10]:
def join_train_val(train_dict, val_dict) -> list:
    texts_array = []
    labels_array = []
    
    word_values = train_dict['word_seq']
    tag_values = train_dict['tag_seq']
    
    for word, tag in zip(word_values, tag_values):
        texts_array.append(word)
        labels_array.append(tag)

    word_values = val_dict['word_seq']
    tag_values = val_dict['tag_seq']
    
    for word, tag in zip(word_values, tag_values):
        texts_array.append(word)
        labels_array.append(tag)

    return texts_array, labels_array

In [28]:
texts_array, labels_array = join_train_val(train_dict, val_dict)

In [29]:
from sklearn.model_selection import train_test_split
train_texts, val_texts, train_labels, val_labels = train_test_split(texts_array, labels_array, test_size=.2)
test_texts, val_texts, test_labels, val_labels = train_test_split(val_texts, val_labels, test_size=.5)

In [30]:
len(train_texts[0])

128

In [31]:
def flatten_list(array, dictionary_tokens=None):
    new_array = []
    for text_array in array:
        for word in text_array:
            if dictionary_tokens:
                new_array.append(dictionary_tokens[word])
            else:
                new_array.append(word)
            
    return new_array

In [32]:
train_texts, train_labels = flatten_list(train_texts), flatten_list(train_labels, ner_tags_values)
test_texts, test_labels = flatten_list(test_texts), flatten_list(test_labels, ner_tags_values)
val_texts, val_labels = flatten_list(val_texts), flatten_list(val_labels, ner_tags_values)

In [33]:
from transformers import AutoTokenizer

In [34]:
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file https://huggingface.co/bert-base-cased/resolve/main/config.json from cache at C:\Users\beama/.cache\huggingface\transformers\a803e0468a8fe090683bdc453f4fac622804f49de86d7cecaee92365d4a0f829.a64a22196690e0e82ead56f388a3ef3a50de93335926ccfa20610217db589307
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.20.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file https://huggingface.co/bert-base-cased/resolve

In [35]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)

In [36]:
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

In [37]:
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [38]:
import torch

class COVIDDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = COVIDDataset(train_encodings, train_labels)
val_dataset = COVIDDataset(val_encodings, val_labels)
test_dataset = COVIDDataset(test_encodings)

In [39]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/distilbert-base-uncased/resolve/main/config.json from cache at C:\Users\beama/.cache\huggingface\transformers\23454919702d26495337f3da04d1655c7ee010d5ec9d77bdb9e399e00302c0a1.91b885ab15d631bf9cee9dc9d25ece0afd932f2f5130eba28f2055b2220c0333
Model config DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds

In [40]:
trainer.train()

***** Running training *****
  Num examples = 2718720
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1699200
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: token_type_ids. If token_type_ids are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.


IndexError: Target 54 is out of bounds.