In [1]:
import os
import json
import torch
import numpy as np

from config import NerConfig
from model import BertNer
from data_loader import NerDataset

from tqdm import tqdm
from seqeval.metrics import classification_report
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup, BertTokenizer
from torch.optim import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Relative location of the NER data files; a single join call builds the whole path.
data_path = os.path.join('data', 'fyp', "ner_data")
data_path

'data\\fyp\\ner_data'

In [17]:
# Load the dev split: the file holds one JSON object per line.
# Blank lines (for example the empty string left by a trailing newline) are
# skipped, because json.loads("") raises a JSONDecodeError.
with open(os.path.join(data_path, "dev_out.txt"), "r", encoding="utf-8") as fp:
    dev_data = [json.loads(line) for line in fp if line.strip()]

In [27]:
from transformers import AutoTokenizer

# Checkpoint whose vocabulary is used to split the words into sub-tokens.
checkpoint_name = "google-bert/bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name)

In [28]:
# BIO tag inventory. The position in this list is the integer id of the tag,
# so the order must not change: "O" first, then a B-/I- pair per entity type.
bio_labels = ["O"]
bio_labels += ["B-EXP", "I-EXP"]
bio_labels += ["B-EDU", "I-EDU"]
bio_labels += ["B-REQUIRED-S", "I-REQUIRED-S"]
bio_labels += ["B-PREFER-S", "I-PREFER-S"]

In [39]:
# Two lookup tables derived from the tag order: id -> tag, and its inverse tag -> id.
id2label = dict(enumerate(bio_labels))
label2id = {tag: tag_id for tag_id, tag in id2label.items()}


In [32]:
def tokenize_and_align_labels(item, ignore_label_id=0, tok=None, label_map=None):
    """Tokenize a pre-split example and attach one label id per sub-token.

    Only the first sub-token of each word receives the word's label. Tokens
    that belong to no word (their ``word_ids()`` entry is ``None``) and the
    remaining sub-tokens of a word receive ``ignore_label_id``.

    Args:
        item: Mapping with ``'text'`` (a list of words) and ``'labels'``
            (label strings indexed by word position).
        ignore_label_id: Id given to tokens that carry no label of their own.
            The default ``0`` keeps the original behaviour (id 0 is ``"O"`` in
            this notebook's ``label2id``). Pass e.g. ``-100`` -- the default
            ``ignore_index`` of ``torch.nn.CrossEntropyLoss`` -- to exclude
            those positions from the loss; note that such ids cannot be
            looked up in ``id2label``.
        tok: Tokenizer callable; defaults to the notebook-level ``tokenizer``.
        label_map: Label string -> id mapping; defaults to the notebook-level
            ``label2id``.

    Returns:
        The tokenizer output with an extra ``'labels'`` entry: a list of ids,
        one per entry of ``word_ids()``.
    """
    if tok is None:
        tok = tokenizer
    if label_map is None:
        label_map = label2id

    tokenized_input = tok(
        item['text'],
        # No truncation here: the caller decides how to shorten long examples.
        truncation=False,
        # The texts are lists of words (with a label for each word).
        is_split_into_words=True,
    )

    label_ids = []
    previous_word_idx = None
    for word_idx in tokenized_input.word_ids():
        if word_idx is None or word_idx == previous_word_idx:
            # Token outside any word, or a continuation piece of the previous word.
            label_ids.append(ignore_label_id)
        else:
            # First piece of a new word: it carries the word's label.
            label_ids.append(label_map[item['labels'][word_idx]])
        previous_word_idx = word_idx

    tokenized_input['labels'] = label_ids
    return tokenized_input

In [34]:
# Run the alignment on the first dev example so the result can be inspected below.
tokenized_input = tokenize_and_align_labels(item=dev_data[0])

In [40]:
# Print one row per sub-token: token text, attention-mask bit, label name.
token_rows = zip(
    tokenized_input['input_ids'],
    tokenized_input['attention_mask'],
    tokenized_input['labels'],
)
for token_id, mask_bit, label_id in token_rows:
    print(tokenizer.convert_ids_to_tokens(token_id), mask_bit, id2label[label_id])

[CLS] 1 O
( 1 O
Intermediate 1 O
) 1 O
Full 1 O
- 1 O
time 1 O
, 1 O
Permanent 1 O
Re 1 O
##mote 1 O
, 1 O
Canada 1 O
Who 1 O
We 1 O
Are 1 O
At 1 O
##hen 1 O
##nian 1 O
is 1 O
a 1 O
technology 1 O
company 1 O
that 1 O
increases 1 O
trust 1 O
in 1 O
business 1 O
. 1 O
Our 1 O
products 1 O
help 1 O
legal 1 O
, 1 O
finance 1 O
, 1 O
and 1 O
tax 1 O
teams 1 O
be 1 O
transaction 1 O
and 1 O
audit 1 O
- 1 O
ready 1 O
by 1 O
organizing 1 O
business 1 O
entity 1 O
and 1 O
corporate 1 O
structure 1 O
information 1 O
. 1 O
Over 1 O
370 1 O
, 1 O
000 1 O
business 1 O
entities 1 O
in 1 O
almost 1 O
every 1 O
country 1 O
are 1 O
managed 1 O
on 1 O
At 1 O
##hen 1 O
##nian 1 O
to 1 O
auto 1 O
##mate 1 O
work 1 O
##flow 1 O
##s 1 O
for 1 O
ownership 1 O
, 1 O
company 1 O
secret 1 O
##aria 1 O
##l 1 O
, 1 O
governance 1 O
, 1 O
tax 1 O
, 1 O
and 1 O
compliance 1 O
. 1 O
Head 1 O
##qua 1 O
##rter 1 O
##ed 1 O
in 1 O
Calgary 1 O
with 1 O
offices 1 O
in 1 O
Toronto 1 O
and 1 O
Vancouver 1 O
and 1 O
remote

In [44]:
# Preview the centre of the example when it is longer than the token budget.
# 510 is presumably BERT's 512-position limit minus [CLS]/[SEP] -- TODO confirm.
MAX_WINDOW_TOKENS = 510
number_of_tokens = len(tokenized_input['input_ids'])
# Number of tokens that do not fit; 0 for short examples, which are shown in full.
excess = max(number_of_tokens - MAX_WINDOW_TOKENS, 0)
# Trim the smaller half from the front and the rest from the back. Trimming the
# same rounded-down amount from both ends left 511 tokens when `excess` was odd.
crop = excess // 2
window_end = number_of_tokens - (excess - crop)
count = 0
for i in range(crop, window_end):
    count += 1
    print(tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'][i]), tokenized_input['attention_mask'][i], id2label[tokenized_input['labels'][i]])



##board 1 O
customers 1 O
including 1 O
the 1 O
migration 1 O
of 1 O
their 1 O
data 1 O
to 1 O
the 1 O
At 1 O
##hen 1 O
##nian 1 O
platform 1 O
- 1 O
Work 1 O
closely 1 O
with 1 O
clients 1 O
data 1 O
using 1 O
S 1 O
##QL 1 O
- 1 O
Main 1 O
##tain 1 O
and 1 O
improve 1 O
current 1 O
data 1 O
migration 1 O
tools 1 O
- 1 O
Pre 1 O
##par 1 O
##e 1 O
detailed 1 O
reports 1 O
on 1 O
data 1 O
an 1 O
##oma 1 O
##lies 1 O
for 1 O
the 1 O
data 1 O
migration 1 O
team 1 O
- 1 O
As 1 O
##sist 1 O
##ing 1 O
On 1 O
##boarding 1 O
Team 1 O
with 1 O
data 1 O
migration 1 O
tasks 1 O
including 1 O
developing 1 O
migration 1 O
scripts 1 O
- 1 O
Work 1 O
with 1 O
the 1 O
Data 1 O
Mi 1 O
##gration 1 O
team 1 O
to 1 O
manually 1 O
verify 1 O
data 1 O
migration 1 O
testing 1 O
rounds 1 O
- 1 O
Val 1 O
##ida 1 O
##te 1 O
migrated 1 O
data 1 O
using 1 O
Mon 1 O
##go 1 O
##D 1 O
##B 1 O
- 1 O
Data 1 O
manipulation 1 O
and 1 O
transformation 1 O
, 1 O
database 1 O
management 1 O
, 1 O
and 1 O
problem 1 O
- 1 O
s