In [1]:
%%capture
pip install transformers

In [2]:
import pandas as pd
import torch 
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD, Adam

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Read CSV Data

In [4]:
df_train = pd.read_csv('datasets/entities-train.csv')
df_val = pd.read_csv('datasets/entities-test.csv')
df_train.head()

Unnamed: 0,Utterance,Labels
0,$ .,B-pr_curr O
1,$ 13 - 18k offer is still low for your fujifil...,B-pr_curr B-pr_val I-pr_val I-pr_val B-pr O O ...
2,"$ 5,150 totally .",B-pr_curr B-pr_val B-pr_size O
3,$ 9500 per unit is too much .,B-pr_curr B-pr_val B-pr_size I-pr_size O O O O
4,$ 950k - $ 1.3 m is okay .,B-pr_curr B-pr_val O B-pr_curr B-pr_val I-pr_v...


# Initialize Tokenizer

In [5]:
CACHE_DIR = '/media/tfsservices/DATA/NLP/cache/'
MODEL_NAME = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

# Create Dataset Class 

In [6]:
train_labels = [i.split() for i in df_train['Labels'].values.tolist()]

labels = set()
for lb in train_labels:
    [labels.add(i) for i in lb if i not in labels]

print(sorted(labels))

['B-aattr', 'B-act_acc', 'B-act_bought', 'B-act_buy', 'B-act_dec', 'B-act_have', 'B-act_int', 'B-act_pay', 'B-act_pot', 'B-act_sell', 'B-act_sold', 'B-attach', 'B-cat_ask', 'B-cat_bid', 'B-cat_exw', 'B-cat_max', 'B-cat_min', 'B-cat_oth', 'B-cat_rang', 'B-comp_compl', 'B-comp_sp', 'B-cond_asis', 'B-cond_exc', 'B-cond_good', 'B-cond_miss', 'B-cond_new', 'B-cond_poor', 'B-cond_ref', 'B-cond_vgood', 'B-cond_work', 'B-eq', 'B-eq_loc', 'B-eq_mm', 'B-eq_type', 'B-eq_vint', 'B-eq_wafs', 'B-fac', 'B-fac_prod', 'B-fac_stor', 'B-per_fir', 'B-per_sec', 'B-per_thr', 'B-pr', 'B-pr_curr', 'B-pr_size', 'B-pr_val', 'B-qattr', 'B-qty_any', 'B-qty_lot', 'B-qty_num', 'B-req', 'B-state_crat', 'B-state_demo', 'B-state_dinst', 'B-state_idle', 'B-state_inst', 'B-state_ndemo', 'B-tline', 'I-aattr', 'I-act_acc', 'I-act_bought', 'I-act_buy', 'I-act_dec', 'I-act_have', 'I-act_int', 'I-act_pay', 'I-act_pot', 'I-act_sell', 'I-act_sold', 'I-attach', 'I-cat_ask', 'I-cat_bid', 'I-cat_exw', 'I-cat_max', 'I-cat_min', 'I

In [7]:
labels_to_ids = {k: v for v, k in enumerate(sorted(labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(labels))}

In [8]:
print(labels_to_ids)

{'B-aattr': 0, 'B-act_acc': 1, 'B-act_bought': 2, 'B-act_buy': 3, 'B-act_dec': 4, 'B-act_have': 5, 'B-act_int': 6, 'B-act_pay': 7, 'B-act_pot': 8, 'B-act_sell': 9, 'B-act_sold': 10, 'B-attach': 11, 'B-cat_ask': 12, 'B-cat_bid': 13, 'B-cat_exw': 14, 'B-cat_max': 15, 'B-cat_min': 16, 'B-cat_oth': 17, 'B-cat_rang': 18, 'B-comp_compl': 19, 'B-comp_sp': 20, 'B-cond_asis': 21, 'B-cond_exc': 22, 'B-cond_good': 23, 'B-cond_miss': 24, 'B-cond_new': 25, 'B-cond_poor': 26, 'B-cond_ref': 27, 'B-cond_vgood': 28, 'B-cond_work': 29, 'B-eq': 30, 'B-eq_loc': 31, 'B-eq_mm': 32, 'B-eq_type': 33, 'B-eq_vint': 34, 'B-eq_wafs': 35, 'B-fac': 36, 'B-fac_prod': 37, 'B-fac_stor': 38, 'B-per_fir': 39, 'B-per_sec': 40, 'B-per_thr': 41, 'B-pr': 42, 'B-pr_curr': 43, 'B-pr_size': 44, 'B-pr_val': 45, 'B-qattr': 46, 'B-qty_any': 47, 'B-qty_lot': 48, 'B-qty_num': 49, 'B-req': 50, 'B-state_crat': 51, 'B-state_demo': 52, 'B-state_dinst': 53, 'B-state_idle': 54, 'B-state_inst': 55, 'B-state_ndemo': 56, 'B-tline': 57, 'I-a

In [12]:
# label_all_tokens = False

# def align_label(texts, labels):
#     tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

#     word_ids = tokenized_inputs.word_ids()

#     previous_word_idx = None
#     label_ids = []

#     for word_idx in word_ids:

#         if word_idx is None:
#             label_ids.append(-100)

#         elif word_idx != previous_word_idx:
#             try:
#                 label_ids.append(labels_to_ids[labels[word_idx]])
#             except:
#                 label_ids.append(-100)
#         else:
#             try:
#                 label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
#             except:
#                 label_ids.append(-100)
#         previous_word_idx = word_idx

#     return label_ids

def align_label(text, tags):

    words = text.split()
    label_ids = [-100]
    for word, tag in zip(words, tags):
        toks = tokenizer.tokenize(word, truncation=True)

        sub_label_ids = []
        if len(toks) > 1 and tag.startswith('B-'):
            sub_label_ids.append(labels_to_ids[tag])
            i_tag = tag.replace("B-", "I-")
            sub_label_ids.extend([labels_to_ids[i_tag]] * (len(toks)-1))
        else:
            sub_label_ids.extend([labels_to_ids[tag]] * (len(toks)))

        label_ids.extend(sub_label_ids)

    pad_len = 512 - len(label_ids)
    label_ids.extend([-100]*pad_len)

    return label_ids


class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):
        lb = [i.split() for i in df['Labels'].values.tolist()]
        txt = df['Utterance'].values.tolist()
        self.texts = [tokenizer(str(i), padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):
        return len(self.labels)

    def get_batch_data(self, idx):
        return self.texts[idx]

    def get_batch_labels(self, idx):
        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):
        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

In [13]:
train_dataset = DataSequence(df_train)

# Build Model

In [15]:
class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels), cache_dir=CACHE_DIR)

    def forward(self, input_ids, attention_mask, labels):

        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, labels=labels, return_dict=False)

        return output

# Model Training

In [16]:
def train_loop(model, df_train, df_val, epochs=5, batch_size=2, learninig_rate=5e-3):

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = SGD(model.parameters(), lr=learninig_rate)

    model.to(device)
    # if use_cuda:
    #     model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][train_label[i] != -100]
                label_clean = train_label[i][train_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_train += acc
                total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][val_label[i] != -100]
                label_clean = val_label[i][val_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_val += acc
                total_loss_val += loss.item()

        train_loss = total_loss_train / len(df_train)
        train_accuracy = total_acc_train / len(df_train)
        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {train_loss: .3f} | Accuracy: {train_accuracy: .3f} | Val_Loss: {val_loss: .3f} | Val_Accuracy: {val_accuracy: .3f}')



In [17]:
LEARNING_RATE = 5e-3
EPOCHS = 7
BATCH_SIZE = 2

model = BertModel()
train_loop(model, df_train, df_val, EPOCHS, BATCH_SIZE, LEARNING_RATE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

Epochs: 1 | Loss:  1.024 | Accuracy:  0.775 | Val_Loss:  0.775 | Accuracy:  0.812


100%|██████████| 3431/3431 [02:57<00:00, 19.29it/s]


Epochs: 2 | Loss:  0.492 | Accuracy:  0.872 | Val_Loss:  0.587 | Accuracy:  0.853


100%|██████████| 3431/3431 [02:57<00:00, 19.31it/s]


Epochs: 3 | Loss:  0.344 | Accuracy:  0.908 | Val_Loss:  0.511 | Accuracy:  0.866


100%|██████████| 3431/3431 [02:56<00:00, 19.45it/s]


Epochs: 4 | Loss:  0.258 | Accuracy:  0.928 | Val_Loss:  0.414 | Accuracy:  0.892


100%|██████████| 3431/3431 [02:56<00:00, 19.39it/s]


Epochs: 5 | Loss:  0.201 | Accuracy:  0.943 | Val_Loss:  0.456 | Accuracy:  0.889


100%|██████████| 3431/3431 [02:56<00:00, 19.44it/s]


Epochs: 6 | Loss:  0.158 | Accuracy:  0.956 | Val_Loss:  0.423 | Accuracy:  0.895


100%|██████████| 3431/3431 [02:56<00:00, 19.45it/s]


Epochs: 7 | Loss:  0.126 | Accuracy:  0.965 | Val_Loss:  0.482 | Accuracy:  0.887


# Evaluate Model

In [18]:
def evaluate(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0

    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][test_label[i] != -100]
                label_clean = test_label[i][test_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_test += acc

    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {val_accuracy: .3f}')


In [19]:
evaluate(model, df_val)

Test Accuracy:  0.887


# Predict One Sentence

In [20]:
from spacy import displacy
import typing as tp

In [21]:
def ner_render(tokens: tp.Sequence[str], ner_tags: tp.Sequence[str], title: tp.Optional[str] = None, **kwargs):
    pos = 0
    ents = []
    for word, tag in zip(tokens, ner_tags):
        if tag.startswith('B'):
            ents.append({
                "start": pos,
                "end": pos + len(word),
                "label": tag.split("-")[1]
            })
        elif tag.startswith('I'):
            ents[-1]["end"] = pos + len(word)
        pos += (len(word) + 1)
    displacy.render({
        "text": " ".join(tokens),
        "ents": ents,
        "title": title
    }, style="ent", manual=True, jupyter=True)

In [22]:
label_all_tokens = False
def align_word_ids(texts):
    
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            # label_ids.append(-100)
            label_ids.append(0)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

In [49]:
def evaluate_one_text(model, sentence):

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    with torch.no_grad():
        encoded = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
        
        mask = encoded['attention_mask'].to(device)
        input_id = encoded['input_ids'].to(device)

        logits = model(input_id, mask, None)

        label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

        logits_clean = logits[0][label_ids != 0]
        # predictions = logits_clean.argmax(dim=1).tolist()
        predictions = logits_clean.argmax(dim=1).tolist()

        # print(predictions)
        prediction_labels = [ids_to_labels[i] for i in predictions]

        tokens = tokenizer.convert_ids_to_tokens(input_id[0])
        tokens = tokens[1:1+len(prediction_labels)]
        
        ner_render(tokens, prediction_labels)

In [50]:
evaluate_one_text(model, "500 $ for this.")

In [43]:
onnxfile = "ner_classifier.onnx"
modified_onnxfile = "modified_ner_model.onnx"

In [48]:
path = './model-dict-ent.dat'
torch.save(model.state_dict(), path)

In [49]:
path = './model-dict-ent.dat'
model1 = BertModel()
model1.load_state_dict(torch.load(path))
model1.eval()
evaluate_one_text(model1, "what is the price for this amat/applied materials unit.")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [93]:
input_names = ['input_id', 'mask', 'label']
output_names = ['tags']

txt = "Do you have this equipment?"
encoded = tokenizer(txt, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
dummy_inputs = {'input_id': encoded['input_ids'], 'mask': encoded['attention_mask'], "label":None }

model1.to("cpu")

model1.eval()

torch.onnx.export(model1,                 
                  args=dummy_inputs,
                  input_names=input_names,
                  output_names=output_names,
                  f=onnxfile,
                  opset_version=10,          # the ONNX version to export the model to
                  do_constant_folding=True  # whether to execute constant folding for optimization
                #   dynamic_axes={'input_ids' : {0 : 'batch_size'},    # variable length axes
                #                 'probs' : {0 : 'batch_size'}}
                #   export_params=True,
                #   verbose=True
)

verbose: False, log level: Level.ERROR



In [25]:
evaluate_one_text(model, 'what is the price for this amat/applied materials unit.')

In [26]:
evaluate_one_text(model, 'I want to buy amat / applied materials unit.')

In [28]:
evaluate_one_text(model, "kla tencor?")

In [29]:
evaluate_one_text(model, "Do you have this equipment?")

In [30]:
evaluate_one_text(model, "What is the whether today?")

In [48]:
evaluate_one_text(model, "price?")

In [32]:
evaluate_one_text(model, "is it still deinstalled?")

In [33]:
evaluate_one_text(model, "we are going to remove this unit from production next week.")

In [34]:
evaluate_one_text(model, "we can offer $ 70,000 for the picosun ald in an effort to try to come to an agreement this week .")

In [35]:
evaluate_one_text(model, "$ 19k / ea is much beyond my budget even it is new .")

In [36]:
evaluate_one_text(model, "my customer budget for candella candela cs20r is 125k based on working condition mike .")

In [37]:
evaluate_one_text(model, "we have a dual turn with gantry loader already .")

In [51]:
evaluate_one_text(model, "but it is still being used not heavily but still being used w / back side alignment .")

In [52]:
evaluate_one_text(model, "We do not have approval to purchase anything yet")

In [43]:
evaluate_one_text(model, "who needs this equipment")

In [47]:
evaluate_one_text(model, "pls help me to find FEI FIB ,200,450.") ## not correct

In [54]:
evaluate_one_text(model, "april 16 .") ## not correct

IndexError: list index out of range

In [55]:
evaluate_one_text(model, "sorry we are buying not selling.")

In [56]:
evaluate_one_text(model, "5k £ for this.")

In [57]:
evaluate_one_text(model, "5.000 k £ for this.")

In [88]:
evaluate_one_text(model, "5,000,000 k £ for this.")

In [63]:
evaluate_one_text(model, "$ 5 000 000 for this.")

In [91]:
evaluate_one_text(model, "$ 10-5-000-000 k for this.")

In [86]:
evaluate_one_text(model, "5.000.000 k  for this.")

In [82]:
evaluate_one_text(model, "$ 5,000,000 k for this.")

In [85]:
evaluate_one_text(model, "$ 5,000 k for this.")

In [59]:
evaluate_one_text(model, "usd 5.200.00 k for this.")

In [60]:
evaluate_one_text(model, "we are going to remove it from our cleanroom.")

In [65]:
evaluate_one_text(model, "remote it.")

In [66]:
evaluate_one_text(model, "remote it from warehouse .")

In [67]:
evaluate_one_text(model, "remote it from production .")

In [74]:
evaluate_one_text(model, "remote it from clean room .")

In [75]:
evaluate_one_text(model, "price .")

In [78]:
evaluate_one_text(model, "what is the quote ?")

In [81]:
evaluate_one_text(model, "please see invoice below $20k cif shanghai.")
