# Data preparation for RoBERTa model

References:  
    - https://huggingface.co/transformers/_modules/transformers/tokenization_roberta.html#RobertaTokenizer  
    - https://www.kaggle.com/debanga/huggingface-tokenizers-cheat-sheet  
    - https://github.com/billpku/NLP_In_Action  
    - https://androidkt.com/name-entity-recognition-with-bert-in-tensorflow/  
    - https://github.com/smart-patrol/sagemaker-bert

In [1]:
!pip install transformers

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import pandas as pd
import math
import numpy as np
import sagemaker
import os
import json

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer
from helpers import SentenceGetter




Neither PyTorch nor TensorFlow >= 2.0 have been found.Models won't be available and only tokenizers, configurationand file/data utilities can be used.


In [3]:
# SageMaker session, the execution role of this notebook and the session's default bucket
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

print('Bucket: {}'.format(bucket))
print('Role: {}'.format(role))

Bucket: sagemaker-eu-west-1-087816224558
Role: arn:aws:iam::087816224558:role/service-role/AmazonSageMaker-ExecutionRole-20200424T125478


## Load data from local directory

In [4]:
# Read the raw NER dataset and forward-fill missing values
# (presumably the sentence id is only present on the first word of a sentence - TODO confirm).
# DataFrame.ffill() replaces the deprecated fillna(method="ffill") spelling.
data = pd.read_csv("../../data/interim/ner_dataset.csv", encoding="latin1").ffill()

In [5]:
# sanity check: dimensions of the loaded frame and its first rows
n_rows, n_cols = data.shape
print((n_rows, n_cols))
display(data.head())

(1048575, 4)


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


## Prepare data for tokenizing

#### Transform data into sentences and labels using SentenceGetter from helpers

In [6]:
# Group the token-level rows into sentences with the project helper
getter = SentenceGetter(data)
grouped_sentences = getter.sentences

# every entry of a sentence is indexed positionally:
# element 0 is taken as the word, element 2 as its tag
sentences = [[entry[0] for entry in sentence] for sentence in grouped_sentences]
labels = [[entry[2] for entry in sentence] for sentence in grouped_sentences]

print("Example sentence with tags:")
print(" ".join(sentences[0]))
print(" ".join(labels[0]))

Example sentence with tags:
Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
O O O O O O B-geo O O O O O B-geo O O O O O B-gpe O O O O O


In [7]:
# All tags that occur in the data, plus the helper labels used during tokenization:
# 'X' for word pieces after the first one, '[CLS]' / '[SEP]' for the sequence boundary markers.
# Building a set directly guarantees uniqueness.
tags_vals = set(data["Tag"].values) | {'X', '[CLS]', '[SEP]'}

# hardcode tag2idx dictionary for easier reproducibility
tag2idx = {
     'B-art': 14,
     'B-eve': 16,
     'B-geo': 0,
     'B-gpe': 13,
     'B-nat': 12,
     'B-org': 10,
     'B-per': 4,
     'B-tim': 2,
     'I-art': 5,
     'I-eve': 7,
     'I-geo': 15,
     'I-gpe': 8,
     'I-nat': 11,
     'I-org': 3,
     'I-per': 6,
     'I-tim': 1,
     'X': 17,
     'O': 9,
     '[CLS]': 18,
     '[SEP]': 19
}

# The mapping is hardcoded, so verify it against the data:
# a tag without an index would otherwise only surface later, when labels are converted to ids.
missing_tags = tags_vals - set(tag2idx)
assert not missing_tags, f"tag2idx has no index for: {missing_tags}"
assert len(set(tag2idx.values())) == len(tag2idx), "tag2idx indices must be unique"

## Set maximum sequence length

In [8]:
# every token / tag sequence is padded or truncated to this length (boundary markers included)
MAX_LEN = 45

## Tokenize for RoBERTa

In [9]:
# Load the pretrained RoBERTa tokenizer through the Hugging Face library.
# NOTE(review): do_lower_case is an option of BERT-style tokenizers - confirm that
# RobertaTokenizer actually uses it; otherwise the argument can be dropped.
PRETRAINED_MODEL_NAME = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, do_lower_case=False)

For RoBERTa to work properly, we need to add CLS and SEP tokens at the beginning and at the end of each sentence, respectively.  
Each word also has to be tokenized with RobertaTokenizer (split into word pieces).

In [10]:
tokenized_texts = []
word_piece_labels = []

# NOTE(review): '[CLS]' / '[SEP]' are BERT-style marker strings. RoBERTa's own boundary tokens
# are exposed as tokenizer.cls_token / tokenizer.sep_token - confirm that these strings exist in
# the tokenizer vocabulary, otherwise they are converted to the unknown-token id. Changing them
# has to be coordinated with the padding and attention-mask steps, so they are kept as they are.
# NOTE(review): every word is tokenized in isolation (without its preceding space) - confirm this
# matches how the model expects word pieces to be produced.
for word_list, label in zip(sentences, labels):
    # each sequence starts with the [CLS] marker, labelled with the marker itself
    temp_token = ['[CLS]']
    temp_label = ['[CLS]']

    for word, lab in zip(word_list, label):
        token_list = tokenizer.tokenize(word)
        temp_token.extend(token_list)
        # only the first word piece keeps the tag of the word, the remaining pieces get 'X'
        if token_list:
            temp_label.append(lab)
            temp_label.extend(['X'] * (len(token_list) - 1))

    # ... and ends with the [SEP] marker
    temp_token.append('[SEP]')
    temp_label.append('[SEP]')

    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_label)

# temp_token / temp_label still hold the last processed sentence, which serves as the example
print('Tokenized example:')
print(" ".join(temp_token[:6]), "...", temp_token[-1])
# show the last *label* here (the token list was printed by mistake before)
print(" ".join(temp_label[:6]), "...", temp_label[-1])

Tokenized example:
[CLS] The United N ations is ... [SEP]
[CLS] O B-org I-org X O ... [SEP]


Next, convert the tokenized sequences into sequences of ids using RobertaTokenizer and pad (or truncate) them to MAX_LEN. Padded positions in the tag sequences get the "O" tag (no meaning), which here is number 9.

In [11]:
# Word pieces -> vocabulary ids, then pad / truncate at the end of each sequence to MAX_LEN.
# No `value` is passed, so pad_sequences pads the ids with its default (0).
# NOTE(review): confirm that 0 is the padding id the model expects (see tokenizer.pad_token_id),
# and that cutting off the closing marker of sentences longer than MAX_LEN is acceptable.
token_id_sequences = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]
input_ids = pad_sequences(token_id_sequences,
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Index tag2idx directly: an unmapped label raises a KeyError right here,
# instead of .get() silently putting None into the sequence.
tag_id_sequences = [[tag2idx[l] for l in lab] for lab in word_piece_labels]
tags = pad_sequences(tag_id_sequences,
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

print('Example tokenized text in form of RoBERTa ids:')
print(input_ids[0])
print("")
print('Tags for the example text:')
print(tags[0])

Example tokenized text in form of RoBERTa ids:
[    3 33383  1116 34084  6031  3629 11990  3916  3804 11672 23122   560
  4892 21959   627  5557   179 37590   463 15509   627  5632 24686   337
  1116 24270 19937  5090  7761  6025 12659     4     3     0     0     0
     0     0     0     0     0     0     0     0     0]

Tags for the example text:
[18  9  9  9 17 17  9  9 17  9  0  9  9 17  9  9  9  0  9  9  9  9 17 17
  9 13  9 17  9  9  9  9 19  9  9  9  9  9  9  9  9  9  9  9  9]


For the model to recognize padded values we need to add masks (1 for real tokens and 0 for padded values). RoBERTa is also suited for multi-sequence input thanks to segments, but in our case inputs are single sentences, so we need to simply add the same segment id to each sequence (zero).

In [12]:
# Build the masks from the real (truncated) sequence lengths instead of testing `id > 0`:
# id 0 is itself a valid entry of the RoBERTa vocabulary, so an id-based test cannot tell
# padding apart from a real token with that id. Sequences are padded and truncated at the end,
# therefore the first `real_len` positions are the real tokens.
padded_len = input_ids.shape[1]

attention_masks = []
for tokens in tokenized_texts:
    real_len = min(len(tokens), padded_len)
    attention_masks.append([1] * real_len + [0] * (padded_len - real_len))

# single-sentence inputs: every position belongs to segment 0
segment_ids = [[0] * len(input_id) for input_id in input_ids]

print(attention_masks[0])
print(segment_ids[0])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


## Train/test split

We split data into the same splits that we did in BiLSTM preparation.

In [13]:
# Split all four aligned collections with one call so the rows stay in sync;
# the fixed random_state keeps the split reproducible.
tr_inputs, val_inputs, tr_tags, val_tags, tr_masks, val_masks, tr_segs, val_segs = train_test_split(
    input_ids,
    tags,
    attention_masks,
    segment_ids,
    random_state=666,
    test_size=0.1,
)

# check shapes: every part of a split has the same number of rows (tags included),
# and the sequence length matches the configured MAX_LEN rather than a repeated literal
assert len(tr_inputs) == len(tr_tags) == len(tr_masks) == len(tr_segs)
assert len(val_inputs) == len(val_tags) == len(val_masks) == len(val_segs)
assert tr_inputs[0].shape[0] == MAX_LEN
assert val_inputs[0].shape[0] == MAX_LEN

## Save files locally

In [14]:
# directory the processed splits are written to; created on demand so a fresh checkout works
PROCESSED_DIR = '../../data/processed_roberta/'
os.makedirs(PROCESSED_DIR, exist_ok=True)


def _save_split(split_tags, split_inputs, split_masks, split_segs, file_name):
    """Write one split to PROCESSED_DIR as a CSV without header and index.

    The columns are laid out side by side in this order:
    tags, input ids, attention masks, segment ids.
    """
    split_frame = pd.concat([pd.DataFrame(split_tags),
                             pd.DataFrame(split_inputs),
                             pd.DataFrame(split_masks),
                             pd.DataFrame(split_segs)], axis=1)
    split_frame.to_csv(os.path.join(PROCESSED_DIR, file_name), header=False, index=False)


# the validation part of the split is stored under the "test" file name
_save_split(tr_tags, tr_inputs, tr_masks, tr_segs, 'train_roberta.csv')
_save_split(val_tags, val_inputs, val_masks, val_segs, 'test_roberta.csv')

## Upload data to S3

In [15]:
# local directory with the processed files; passed as `path` to the S3 upload
DATA_DIR = '../../data/processed_roberta'

# key prefix inside the bucket; passed as `key_prefix` to the S3 upload
PREFIX = 'named_entity_recognition/roberta_data'

In [16]:
# Upload the local DATA_DIR to the bucket under PREFIX.
# The returned location is kept: the file locations for training are derived from it.
input_data = sagemaker_session.upload_data(
    path=DATA_DIR,
    bucket=bucket,
    key_prefix=PREFIX,
)

In [17]:
# we need to save directories to those files on S3 for training purposes
# (the train entry has to point at the train file - it referenced the test file before)
data_directories = {'train_data_directory': os.path.join(input_data, 'train_roberta.csv'),
                    'test_data_directory': os.path.join(input_data, 'test_roberta.csv')
                   }

# the context manager closes the file even if json.dump raises
with open("../../src/utils/objects/data_directories_roberta.json", "w") as data_directories_file:
    json.dump(data_directories, data_directories_file)

In [None]:
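# Empty the bucket (optional). WARNING: this deletes every object in the bucket, not only the files uploaded by this notebook.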

# import boto3
# bucket_to_delete = boto3.resource('s3').Bucket(bucket)
# bucket_to_delete.objects.all().delete()