In [2]:
!pip install transformers

Collecting transformers
  Using cached https://files.pythonhosted.org/packages/12/b5/ac41e3e95205ebf53439e4dd087c58e9fd371fd8e3724f2b9b4cdb8282e5/transformers-2.10.0-py3-none-any.whl
Collecting sentencepiece
  Using cached https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl
Processing /home/ec2-user/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45/sacremoses-0.0.43-cp36-none-any.whl
Collecting dataclasses; python_version < "3.7"
  Using cached https://files.pythonhosted.org/packages/e1/d2/6f02df2616fd4016075f60157c7a0452b38d8f7938ae94343911e0fb0b09/dataclasses-0.7-py3-none-any.whl
Collecting tqdm>=4.27
  Using cached https://files.pythonhosted.org/packages/c9/40/058b12e8ba10e35f89c9b1fdfc2d4c7f8c05947df2d5eb3c7b258019fda0/tqdm-4.46.0-py2.py3-none-any.whl
Collecting regex!=2019.12.17
[?25l  Downloading https://files.pythonhosted.org/packages/60/7c/0d

In [3]:
import pandas as pd
import math
import numpy as np
import sagemaker
import os

from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

from transformers import RobertaTokenizer #, RobertaConfig, RobertaForTokenClassification, AdamW

Neither PyTorch nor TensorFlow >= 2.0 have been found.Models won't be available and only tokenizers, configurationand file/data utilities can be used.


In [None]:
# Open a SageMaker session, then fetch the execution role and the session's
# default bucket through the SageMaker SDK.
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()

# Echo both values, one per line.
print(f'Bucket: {bucket}', f'Role: {role}', sep='\n')

## Load data from local directory

In [None]:
# Read the token-level CSV (latin1-encoded) and forward-fill missing cells from
# the row above. Presumably "Sentence #" is only populated on the first token
# of each sentence -- TODO confirm against the file.
# `.ffill()` is the direct equivalent of `fillna(method="ffill")`, which newer
# pandas releases deprecate.
data = pd.read_csv("../data/interim/ner_dataset.csv", encoding="latin1").ffill()

## Prepare data for tokenizing

In [None]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of ``(word, pos, tag)`` tuples.

    ``data`` must provide the columns ``"Sentence #"``, ``"Word"``, ``"POS"``
    and ``"Tag"``; rows sharing a ``"Sentence #"`` value form one sentence.
    """

    @staticmethod
    def _to_triples(group):
        """Return the rows of one sentence group as a list of (word, pos, tag)."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def __init__(self, data):
        """Group ``data`` by ``"Sentence #"`` and materialise every sentence."""
        self.n_sent = 1      # number of the sentence get_next() returns next
        self.data = data
        self.empty = False
        # One list of triples per "Sentence #" value, indexed by that value.
        # NOTE(review): groupby orders groups by key, so string keys such as
        # "Sentence: 10" sort before "Sentence: 2" -- confirm this is acceptable.
        self.grouped = self.data.groupby("Sentence #").apply(self._to_triples)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the sentence keyed ``"Sentence: <n_sent>"`` and advance.

        Returns ``None`` (without advancing) when no such sentence exists.
        Only the missing-key case is handled; any other error propagates
        instead of being silently reported as "no more sentences".
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            return None
        self.n_sent += 1
        return s
        
# Build the sentence-level structure from the token-level frame
getter = SentenceGetter(data)

# Every sentence is a list of (word, pos, tag) triples; split it into three
# parallel per-sentence lists.
sentences = [[word for word, _pos, _tag in sent] for sent in getter.sentences]
poses = [[pos for _word, pos, _tag in sent] for sent in getter.sentences]
labels = [[tag for _word, _pos, tag in sent] for sent in getter.sentences]

# Show the first sentence in each of the three views
print(sentences[0])
print(poses[0])
print(labels[0])

# Tags that occur in the data, plus the synthetic labels used further down:
# 'X' for word-piece continuations and '[CLS]' / '[SEP]' for the two sequence
# boundary positions.
tags_vals = set(data["Tag"].values) | {'X', '[CLS]', '[SEP]'}

# Fixed label -> id mapping (hard-coded rather than derived from tags_vals)
tag2idx={'B-art': 14,
 'B-eve': 16,
 'B-geo': 0,
 'B-gpe': 13,
 'B-nat': 12,
 'B-org': 10,
 'B-per': 4,
 'B-tim': 2,
 'I-art': 5,
 'I-eve': 7,
 'I-geo': 15,
 'I-gpe': 8,
 'I-nat': 11,
 'I-org': 3,
 'I-per': 6,
 'I-tim': 1,
 'X':17,
 'O': 9,
 '[CLS]':18,
 '[SEP]':19}

# Fail fast if the data holds a tag the fixed mapping does not know: the label
# encoding uses tag2idx.get(), which would otherwise produce None ids.
unmapped_tags = tags_vals - set(tag2idx)
if unmapped_tags:
    raise ValueError(
        "Tags missing from tag2idx: {}".format(sorted(map(str, unmapped_tags)))
    )

# Reverse lookup: id -> label
tag2name = {idx: tag for tag, idx in tag2idx.items()}

## Set maximum sequence length

In [None]:
# Length that every id / label sequence is padded or truncated to
max_len = 45

## Tokenize for RoBERTa

In [None]:
# Load the pretrained 'roberta-base' tokenizer
tokenizer = RobertaTokenizer.from_pretrained(
    'roberta-base',
    do_lower_case=False,
)

In [None]:
tokenized_texts = []
word_piece_labels = []
i_inc = 0
for word_list,label in (zip(sentences,labels)):
    temp_lable = []
    temp_token = []
    
    # Add [CLS] at the front 
    temp_lable.append('[CLS]')
    temp_token.append('[CLS]')
    
    for word,lab in zip(word_list,label):
        token_list = tokenizer.tokenize(word)
        for m,token in enumerate(token_list):
            temp_token.append(token)
            if m==0:
                temp_lable.append(lab)
            else:
                temp_lable.append('X')  
                
    # Add [SEP] at the end
    temp_lable.append('[SEP]')
    temp_token.append('[SEP]')
    
    tokenized_texts.append(temp_token)
    word_piece_labels.append(temp_lable)
    
    if 5 > i_inc:
        print("No.%d,len:%d"%(i_inc,len(temp_token)))
        print("texts:%s"%(" ".join(temp_token)))
        print("No.%d,len:%d"%(i_inc,len(temp_lable)))
        print("lables:%s"%(" ".join(temp_lable)))
    i_inc +=1

In [None]:
# Make text token into id
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=max_len, dtype="long", truncating="post", padding="post")
print(input_ids[0])

# Make label into id, pad with "O" meaning others
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in word_piece_labels],
                     maxlen=max_len, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")
print(tags[0])

# For fine tune of predict, with token mask is 1,pad token is 0
attention_masks = [[int(i>0) for i in ii] for ii in input_ids]
attention_masks[0];

# Since only one sentence, all the segment set to 0
segment_ids = [[0] * len(input_id) for input_id in input_ids]
segment_ids[0];

## Train/test split

In [None]:
# Hold out 30% of the rows. All four parallel collections go through a single
# train_test_split call with a fixed random_state.
(tr_inputs, val_inputs,
 tr_tags, val_tags,
 tr_masks, val_masks,
 tr_segs, val_segs) = train_test_split(
    input_ids, tags, attention_masks, segment_ids,
    random_state=4, test_size=0.3,
)
print(len(tr_inputs), len(val_inputs), len(tr_segs), len(val_segs))
print(tr_inputs)
print(tr_tags)

## Save files locally

In [None]:
# Create the output directory first: to_csv does not create missing parent
# directories and would fail on a fresh checkout.
processed_dir = '../data/processed_roberta/'
os.makedirs(processed_dir, exist_ok=True)

# Each file is written without header or index; the column blocks are, in
# order: tags | input ids | attention masks | segment ids.
for file_name, parts in (
    ('train_roberta.csv', (tr_tags, tr_inputs, tr_masks, tr_segs)),
    ('test_roberta.csv', (val_tags, val_inputs, val_masks, val_segs)),
):
    pd.concat([pd.DataFrame(part) for part in parts], axis=1) \
        .to_csv(os.path.join(processed_dir, file_name), header=False, index=False)

## Upload data to S3

In [None]:
# Local directory whose contents are uploaded
DATA_DIR = '../data/processed_roberta'

# Key prefix used for the uploaded files in S3
PREFIX = 'named_entity_recognition/roberta_data'

In [None]:
# Upload the contents of DATA_DIR to the bucket under PREFIX and keep the
# value the SDK returns.
input_data = sagemaker_session.upload_data(
    path=DATA_DIR,
    bucket=bucket,
    key_prefix=PREFIX,
)

In [None]:
# Show the value returned by the upload call
print(input_data)