In [1]:
# Core data / tensor libraries used throughout the notebook.
import pandas as pd
import numpy as np
import torch
# Print the library version so the recorded outputs can be tied to an environment.
print(f"Torch Version: {torch.__version__}")

# NOTE(review): the "(Adapter)" label suggests this is the adapter-enabled fork that
# installs under the `transformers` name -- confirm against the environment.
import transformers
print(f"transformers (Adapter) Version: {transformers.__version__}")

Torch Version: 1.8.1
transformers (Adapter) Version: 2.0.1


In [2]:
from transformers import RobertaTokenizer
import numpy as np

# Load the "roberta-base" tokenizer a single time. Later cells read it through the
# module-level name `tokenizer`, so one import and one load are sufficient.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

def encode_batch(batch):
  """Tokenize the ``"text"`` entry of ``batch`` with the module-level tokenizer.

  The tokenizer is asked to truncate and to pad to ``max_length`` (80), and its
  return value is handed back unchanged.
  """
  texts = batch["text"]
  encoding_options = dict(max_length=80, truncation=True, padding="max_length")
  return tokenizer(texts, **encoding_options)

In [3]:
data_path = "./NER_multilabel_data_v3.csv"
df = pd.read_csv(data_path)

# A `newTag` cell can hold several tags separated by "|"; collect every distinct tag.
# Missing cells are skipped (they cannot be split), and the result is sorted because
# plain set iteration order for strings changes between kernel restarts, which would
# make any position-based use of `all_tags` irreproducible.
all_tags = sorted(
    {
        tag
        for tag_string in df.newTag.dropna().unique()
        for tag in tag_string.split("|")
    }
)

In [5]:
from ner_dataset import get_trainset_data_loader

# Build the tag list, dataset and loader from the same CSV. `all_tags` is rebound
# here to whatever the helper returns.
# NOTE(review): 128 is the batch size handed to the helper; how it is used is defined
# in ner_dataset -- confirm there.
all_tags, trainset, trainloader = get_trainset_data_loader(
    tokenizer,
    BATCH_SIZE=128,
    data_path=data_path,
)

labels: ['B-art' 'B-eve' 'B-geo' 'B-gpe' 'B-nat' 'B-org' 'B-per' 'B-tim'
 'CountryCode' 'CryptoCurrencyCode' 'CurrencyCode' 'Event' 'Float' 'I-art'
 'I-eve' 'I-geo' 'I-gpe' 'I-nat' 'I-org' 'I-per' 'I-tim' 'Integer'
 'Location' 'Month' 'O' 'Object' 'Party' 'Race' 'SpecialTerm'
 'TemporalUnit' 'Time' 'Timezone' 'US_States']


In [6]:

from transformers import RobertaConfig, RobertaModelWithHeads

# Configuration based on the "roberta-base" checkpoint, with the label count taken
# from `all_tags` and both label maps taken from `trainset`.
# NOTE(review): keep the keyword order as written -- in some transformers versions
# assigning `num_labels` after `id2label` can reset the label map; confirm before
# reordering.
config = RobertaConfig.from_pretrained(
    "roberta-base",
    num_labels=len(all_tags),
    label2id = trainset.label_map, 
    id2label = trainset.id2label
)
# Instantiate the heads-capable model from the same checkpoint using that config.
# Adapters and heads are attached in a later cell.
model = RobertaModelWithHeads.from_pretrained(
    "roberta-base",
    config=config,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModelWithHeads: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModelWithHeads were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.embeddings.position_ids']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and infere

In [7]:
# Load one adapter and one prediction head per tag. Both are read from directories
# named "<tag>_0731" under ./save_adapters and ./save_heads.
all_adapter_name = []
for tag in all_tags:
    adapter_name = f"{tag}_0731"
    # The value returned by load_adapter is collected; a later cell joins these values
    # to build the Parallel composition.
    name = model.load_adapter(f"./save_adapters/{adapter_name}")
    all_adapter_name.append(name)
    # NOTE(review): the recorded output shows "Overwriting existing head" for each tag,
    # so a head with the same name already exists when this runs -- presumably created
    # by load_adapter or by an earlier run of this cell; confirm.
    model.load_head(f"./save_heads/{adapter_name}")

Overwriting existing head 'I-per'
Overwriting existing head 'CryptoCurrencyCode'
Overwriting existing head 'I-art'
Overwriting existing head 'B-tim'
Overwriting existing head 'I-org'
Overwriting existing head 'B-eve'
Overwriting existing head 'Time'
Overwriting existing head 'I-nat'
Overwriting existing head 'Month'
Overwriting existing head 'B-geo'
Overwriting existing head 'B-gpe'
Overwriting existing head 'Event'
Overwriting existing head 'Float'
Overwriting existing head 'I-tim'
Overwriting existing head 'B-org'
Overwriting existing head 'Party'
Overwriting existing head 'Race'
Overwriting existing head 'Object'
Overwriting existing head 'I-eve'
Overwriting existing head 'Location'
Overwriting existing head 'B-per'
Overwriting existing head 'Timezone'
Overwriting existing head 'O'
Overwriting existing head 'US_States'
Overwriting existing head 'I-gpe'
Overwriting existing head 'TemporalUnit'
Overwriting existing head 'CurrencyCode'
Overwriting existing head 'B-art'
Overwriting exis

In [11]:
import re

In [14]:
# Adapter names are spliced into a single-quoted string literal that is handed to
# eval() in the following cell. Reject every character that could add syntax
# (";", "(", ")"), end the literal ("'"), or start an escape sequence ("\").
# "|" stays rejected as before; inside a character class it is a literal character,
# not an alternation, so it only needs to be listed once.
forbidden_chars = re.compile(r"[;|()'\\]")

# Check each name on its own (the "','" separators of the joined text would match
# the quote rule) and report every offending character only once.
result = sorted(
    {char for adapter in all_adapter_name for char in forbidden_chars.findall(adapter)}
)
if result:
    raise ValueError("Adapter Name must not contain \"" + '\", \"'.join(result) + '"')

In [15]:
from transformers.adapters.composition import Parallel

# Build the composition block directly from the adapter names. Unpacking the list
# passes each name as its own positional argument -- the same call the previous
# string-built expression produced -- without evaluating text assembled from names.
parallel = Parallel(*all_adapter_name)

model.set_active_adapters(parallel)

# Additionally load the combined adapter and its head from the "All_tag_2" directories.
adapter_name = "All_tag_2"
model.load_adapter(f"./save_adapters/{adapter_name}")
model.load_head(f"./save_heads/{adapter_name}")

In [16]:
# Device string for inference; `model_predict` accepts it through its `device` parameter.
device = "cpu"

In [17]:
def get_adapter_mapping(model):
    """Print ``model.active_head`` and return lookups between head names and positions.

    Returns:
        A pair ``(label_2_id_mapping, id_2_label_mapping)``: the first maps each entry
        of ``model.active_head`` to its position, the second maps positions back to
        entries.
    """
    print(model.active_head)
    heads = list(model.active_head)
    label_2_id_mapping = {head: position for position, head in enumerate(heads)}
    id_2_label_mapping = dict(enumerate(heads))
    return label_2_id_mapping, id_2_label_mapping

In [18]:
def model_predict(model, sentence, device = "cpu"):
    """Label every token of ``sentence`` with the highest-scoring prediction head.

    The sentence is encoded with the module-level ``tokenizer`` and run through
    ``model`` without gradient tracking. The first element of every head output is
    concatenated along the last axis and squashed with a sigmoid; the first and last
    positions (the start/stop tokens added by ``encode``) are dropped.

    Args:
        model: Callable model accepting ``input_ids``, ``token_type_ids`` and
            ``attention_mask``. It must provide ``to(device)`` and an ``active_head``
            sequence indexable by head position.
        sentence: Raw text to label.
        device: Device the model and the inputs are moved to.

    Returns:
        ``(out_sentence, out_labels, label_confidences, return_tags_order)`` where
        ``out_sentence`` is ``tokenizer.tokenize(sentence)``, ``out_labels`` is a NumPy
        string array with one ``active_head`` entry per token, ``label_confidences``
        is a list with one list of sigmoid scores per token, and
        ``return_tags_order`` maps head position to its ``active_head`` entry.
    """
    input_ids = torch.tensor([tokenizer.encode(sentence)])
    # Give both auxiliary inputs the full (1, seq_len) shape of `input_ids`. Sizing
    # them with len() of the batched tensor yields length 1 (the batch dimension),
    # which only works as long as the model happens to broadcast it.
    token_type_ids = torch.zeros_like(input_ids)
    attention_mask = torch.ones_like(input_ids)

    model = model.to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids.to(device),
                        token_type_ids=token_type_ids.to(device),
                        attention_mask=attention_mask.to(device))

    # Walk the head outputs in order, remembering which head sits at which position.
    return_tags_order = {}
    per_head_scores = []
    for i, _ in enumerate(outputs):
        return_tags_order[i] = model.active_head[i]
        per_head_scores.append(outputs[i][0])
    all_output = torch.sigmoid(torch.cat(per_head_scores, dim=2))

    # Move to host memory before converting; a tensor on another device cannot be
    # turned into a NumPy array directly.
    output_array = all_output.detach().cpu().numpy()
    output_array = output_array.reshape(output_array.shape[-2], output_array.shape[-1])

    # Drop the first and last positions since they are the start/stop tokens.
    token_scores = output_array[1:-1]
    label_confidences = [list(row) for row in token_scores]

    # NOTE(review): the winning column index is used directly as a head position,
    # which assumes every head contributes exactly one column -- confirm.
    max_value = token_scores.argmax(axis=1)
    # A plain comprehension also copes with a sentence that has no tokens.
    out_labels = np.array([model.active_head[idx] for idx in max_value], dtype=str)

    out_sentence = tokenizer.tokenize(sentence)

    return out_sentence, out_labels, label_confidences, return_tags_order

In [38]:
# Read the CSV at `data_path` again, rebinding `df` to the file contents.
df = pd.read_csv(data_path)

In [41]:
# Show at most the first 50 rows whose `newTag` string contains "Float".
df.loc[df["newTag"].map(lambda tag_string: "Float" in tag_string)].head(50)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Sentence #,Word,POS,Tag,newTag
582,582,419,,Ġ2,CD,O,Float
583,583,419,,.,CD,O,Float
584,584,419,,3,CD,O,Float
8405,8405,6081,,Ġ4,CD,O,Float
8406,8406,6081,,.,CD,O,Float
8407,8407,6081,,6,CD,O,Float
8422,8422,6096,,Ġ17,CD,O,Float
8423,8423,6096,,.,CD,O,Float
8424,8424,6096,,09,CD,O,Float
16475,16475,11903,,Ġ4,CD,O,Float


In [56]:
# Longer contract-style example. It is replaced immediately by the assignment below,
# so only the short sentence reaches the model.
sentence = "Dan will be deemed to have completed its delivery for 18.22 obligations before 2021-7-5 if in Niall's opinion, the Jeep Car satisfies the Acceptance Criteria, and Niall notifies Dan in writing that it is accepting the Jeep Car."
# Active input. NOTE(review): the literal "Ġ" typed here is split by the tokenizer
# into separate pieces (the token display further down shows 'ĠÄ', 'ł'), so it does
# not reproduce the "Ġ4" token stored in the CSV -- confirm this is the intended probe.
sentence = "I like to Ġ4"

In [57]:
# Pass the notebook-level `device` setting explicitly so that changing it in its own
# cell actually affects inference instead of silently falling back to the default.
sen, pred, logits, tags_order = model_predict(model, sentence, device=device)

In [58]:
# Build the head-name <-> position lookups; the call also prints the active head list.
label_2_id_mapping, id_2_label_mapping = get_adapter_mapping(model)

['I-per', 'CryptoCurrencyCode', 'I-art', 'B-tim', 'I-org', 'B-eve', 'Time', 'I-nat', 'Month', 'B-geo', 'B-gpe', 'Event', 'Float', 'I-tim', 'B-org', 'Party', 'Race', 'Object', 'I-eve', 'Location', 'B-per', 'Timezone', 'O', 'US_States', 'I-gpe', 'TemporalUnit', 'CurrencyCode', 'B-art', 'Integer', 'SpecialTerm', 'CountryCode', 'B-nat', 'I-geo']


In [59]:
# Display the tokens returned by model_predict as an array.
np.array(sen)

array(['I', 'Ġlike', 'Ġto', 'ĠÄ', 'ł', '4'], dtype='<U5')

In [60]:
# Display the predicted label for each token, in the same order as the tokens.
np.array(pred)

array(['O', 'O', 'O', 'O', 'O', 'Integer'], dtype='<U7')

In [61]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
def interact_word(i):
    """Print the prediction for token ``i`` and every tag's confidence, best first.

    Reads the module-level results of ``model_predict``: ``sen`` (tokens), ``pred``
    (predicted label per token), ``logits`` (one row of confidences per token) and
    ``tags_order`` (column position -> tag name).

    Args:
        i: Position of the token to inspect.
    """
    print(i)
    print(f"{sen[i].replace('Ġ', '')}: {pred[i]}")
    target = logits[i]

    # Rank (confidence, tag) pairs rather than keying a dict by the confidence value:
    # with a dict, tags that share the same score overwrite each other and vanish
    # from the listing. The sort is stable, so tied tags keep their column order.
    ranked = sorted(
        ((target[column], tags_order[column]) for column in range(len(target))),
        key=lambda pair: pair[0],
        reverse=True,
    )
    for rank, (confidence, tag) in enumerate(ranked):
        print(f"{str(rank).ljust(2)} {tag.ljust(6)} \t: {confidence:.5f}")

In [55]:

# Slider over the token positions of `sen`; each change calls interact_word for the
# selected position.
token_slider = widgets.IntSlider(value=0, min=0, max=len(sen) - 1, step=1)
interact(lambda x: interact_word(x), x=token_slider)

interactive(children=(IntSlider(value=0, description='x', max=3), Output()), _dom_classes=('widget-interact',)…

<function __main__.<lambda>(x)>