# Model Training Pipeline
## Training Steps
1. Load Dataset
2. Data Tokenization
3. Model Parametrization
4. Model Training
5. Model Evaluation

In [54]:
import pandas as pd
import os
from nltk.tokenize import TreebankWordTokenizer
# Show the kernel's current working directory; the dataset path used below is relative to it.
working_directory = os.getcwd()
print("Working Directory:", working_directory)

Working Directory: c:\Users\arthu\Documents\GitHub\ner-using-bert


## Step One
# Load Dataset
We are using datasets annotated in [Doccano](https://github.com/doccano/doccano), which outputs a **.jsonl** file with a list of all the documents and their labels.

In [48]:
# Folder and file name of the annotated export, given relative to the working directory.
data_file_path = "BERT_Experiment/acerpi_dataset/train/annotated"
dataset_filename = "labeled_ufrgs_sentences.jsonl"

# lines=True reads the file as JSON Lines (one JSON object per line -> one DataFrame row).
# 'records' is the documented spelling of this orient value ('record' is not a listed option).
ufrgs_documents = pd.read_json(
    os.path.join(data_file_path, dataset_filename),
    orient='records',
    lines=True,
)

In [49]:
# Keep only the sentences that carry at least one entry in 'label'
# (i.e. drop sentences with no relevant label besides 'O').
has_labels = ufrgs_documents['label'].str.len().gt(0)
ufrgs_documents = ufrgs_documents.loc[has_labels]

In [53]:
# Columns carried forward: the sentence text, its labels, and the id that links
# each sentence back to its original document later on.
relevant_columns = ['text', 'label', 'sentence_id']
ufrgs_labeled_sentences = ufrgs_documents[relevant_columns]

## Step Two
# Data Tokenization
Let's split each sentence into tokens using NLTK's TreebankWordTokenizer.  

The first method **tokenize** will split our sentence and return a list of words which we'll organize in a 'tokenized_sentences' list.  

The second method, **span_tokenize**, returns the start and end character offsets of each token relative to the original sentence.  
This allows us to recreate the sentence later on. We'll organize them in a 'token_positions' list.

In [134]:
from transformers import AutoTokenizer

# Identifier of the pretrained checkpoint whose tokenizer is loaded below.
checkpoint = 'neuralmind/bert-large-portuguese-cased'

# The checkpoint name says "cased", so lower-casing is explicitly switched off.
tokenizer = AutoTokenizer.from_pretrained(
    checkpoint,
    do_lower_case=False,
)

In [163]:
# Tokenize one sample sentence (row label 3) into subword tokens.
# .loc makes the label-based lookup explicit instead of relying on Series[int] semantics.
t = tokenizer(ufrgs_labeled_sentences.loc[3, 'text'], is_split_into_words=False)
word_ids = t.word_ids()
subword_tokens = t.tokens()  # fetched once rather than on every loop iteration

# Group the subword tokens by the id of the word they belong to.
# Positions whose word id is None are skipped (presumably special tokens such as
# [CLS]/[SEP] -- see the BatchEncoding.word_ids documentation).
tokens_by_word_id = {}
for word, token in zip(word_ids, subword_tokens):
    if word is not None:
        tokens_by_word_id.setdefault(word, []).append(token)

    

In [161]:
# Display the raw text of the sample sentence (row label 3) for comparison with its subword tokens.
ufrgs_labeled_sentences['text'][3]

'CARLOS ALEXANDRE NETTO Reitor'

In [164]:
# Display the subword tokens grouped by word id.
tokens_by_word_id

{0: ['CA', '##R', '##LO', '##S'],
 1: ['AL', '##E', '##X', '##AN', '##DR', '##E'],
 2: ['N', '##ET', '##TO'],
 3: ['Rei', '##tor']}

In [162]:
# Display the subword tokens grouped by word id.
# NOTE(review): the saved output of this cell shows lowercased tokens, which does not match
# a tokenizer loaded with do_lower_case=False -- presumably left over from an earlier run
# with different settings; re-run or remove this duplicate cell to confirm.
tokens_by_word_id

{0: ['car', '##los'],
 1: ['ale', '##xa', '##nd', '##re'],
 2: ['ne', '##tto'],
 3: ['reitor']}

In [None]:
# Tokenize every labelled sentence with the Treebank tokenizer.
# NOTE(review): this cell previously iterated over `annotated_sentences`, a name that is never
# defined in this notebook; `ufrgs_labeled_sentences` is the frame built in Step One --
# confirm it is the intended input.
treebank_tokenizer = TreebankWordTokenizer()  # one instance reused for all sentences
sentences = ufrgs_labeled_sentences['text']

# tokenized_sentences[i]: list of word tokens of the i-th sentence.
tokenized_sentences = [treebank_tokenizer.tokenize(sentence) for sentence in sentences]
# token_positions[i]: list of (start, end) character spans, one per token of the i-th sentence.
token_positions = [list(treebank_tokenizer.span_tokenize(sentence)) for sentence in sentences]

# Sanity check: show the tokens and spans of the first sentence.
print(tokenized_sentences[0])
print(token_positions[0])