In [1]:
%%capture
pip install transformers

In [2]:
import pandas as pd
import torch 
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.optim import SGD

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Read CSV Data

In [4]:
df = pd.read_csv('entities-gen.csv')
df.head()

Unnamed: 0,utterance,labels
0,$ .,O O
1,$ 13 - 18k offer is still low for your fujifil...,B-pr_curr B-pr_val I-pr_val I-pr_val O O O O O...
2,"$ 5,150 totally .",B-pr_curr B-pr_val B-pr_size O
3,$ 9500 per unit is too much .,B-pr_curr B-pr_val B-pr_size I-pr_size O O O O
4,$ 950k - $ 1.3 m is okay .,B-pr_curr B-pr_val O B-pr_curr B-pr_val I-pr_v...


# Initialize Tokenizer

In [5]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

# Create Dataset Class 

In [7]:
label_all_tokens = False

def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

class DataSequence(torch.utils.data.Dataset):

    def __init__(self, df):

        lb = [i.split() for i in df['labels'].values.tolist()]
        txt = df['utterance'].values.tolist()
        self.texts = [tokenizer(str(i), padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for i in txt]
        self.labels = [align_label(i,j) for i,j in zip(txt, lb)]

    def __len__(self):

        return len(self.labels)

    def get_batch_data(self, idx):

        return self.texts[idx]

    def get_batch_labels(self, idx):

        return torch.LongTensor(self.labels[idx])

    def __getitem__(self, idx):

        batch_data = self.get_batch_data(idx)
        batch_labels = self.get_batch_labels(idx)

        return batch_data, batch_labels

# Split Data and Define Unique Labels

In [8]:
labels = [i.split() for i in df['labels'].values.tolist()]
unique_labels = set()

for lb in labels:
        [unique_labels.add(i) for i in lb if i not in unique_labels]
labels_to_ids = {k: v for v, k in enumerate(unique_labels)}
ids_to_labels = {v: k for v, k in enumerate(unique_labels)}

df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42),
                            [int(.98 * len(df)), int(.99 * len(df))])

print(len(df_train),len(df_val), len(df_test))

6724 69 69


# Build Model

In [9]:
class BertModel(torch.nn.Module):

    def __init__(self):

        super(BertModel, self).__init__()

        self.bert = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(unique_labels))

    def forward(self, input_id, mask, label):

        output = self.bert(input_ids=input_id, attention_mask=mask, labels=label, return_dict=False)

        return output

# Model Training

In [10]:
def train_loop(model, df_train, df_val, epochs=5, batch_size=2, learninig_rate=5e-3):

    train_dataset = DataSequence(df_train)
    val_dataset = DataSequence(df_val)

    train_dataloader = DataLoader(train_dataset, num_workers=4, batch_size=batch_size, shuffle=True)
    val_dataloader = DataLoader(val_dataset, num_workers=4, batch_size=batch_size)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    optimizer = SGD(model.parameters(), lr=learninig_rate)

    if use_cuda:
        model = model.cuda()

    best_acc = 0
    best_loss = 1000

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        model.train()

        for train_data, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device)
            mask = train_data['attention_mask'].squeeze(1).to(device)
            input_id = train_data['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            loss, logits = model(input_id, mask, train_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][train_label[i] != -100]
                label_clean = train_label[i][train_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_train += acc
                total_loss_train += loss.item()

            loss.backward()
            optimizer.step()

        model.eval()

        total_acc_val = 0
        total_loss_val = 0

        for val_data, val_label in val_dataloader:

            val_label = val_label.to(device)
            mask = val_data['attention_mask'].squeeze(1).to(device)
            input_id = val_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, val_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][val_label[i] != -100]
                label_clean = val_label[i][val_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_val += acc
                total_loss_val += loss.item()

        val_accuracy = total_acc_val / len(df_val)
        val_loss = total_loss_val / len(df_val)

        print(
            f'Epochs: {epoch_num + 1} | Loss: {total_loss_train / len(df_train): .3f} | Accuracy: {total_acc_train / len(df_train): .3f} | Val_Loss: {total_loss_val / len(df_val): .3f} | Accuracy: {total_acc_val / len(df_val): .3f}')



In [11]:
LEARNING_RATE = 5e-3
EPOCHS = 10
BATCH_SIZE = 2


model = BertModel()
train_loop(model, df_train, df_val, EPOCHS, BATCH_SIZE, LEARNING_RATE)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Epochs: 1 | Loss:  2.207 | Accuracy:  0.584 | Val_Loss:  2.469 | Accuracy:  0.571


100%|██████████| 3362/3362 [02:52<00:00, 19.47it/s]


Epochs: 2 | Loss:  2.589 | Accuracy:  0.521 | Val_Loss:  2.432 | Accuracy:  0.571


100%|██████████| 3362/3362 [02:54<00:00, 19.27it/s]


Epochs: 3 | Loss:  2.573 | Accuracy:  0.521 | Val_Loss:  2.412 | Accuracy:  0.571


100%|██████████| 3362/3362 [02:54<00:00, 19.23it/s]


Epochs: 4 | Loss:  2.561 | Accuracy:  0.521 | Val_Loss:  2.426 | Accuracy:  0.571


100%|██████████| 3362/3362 [02:54<00:00, 19.22it/s]


Epochs: 5 | Loss:  2.555 | Accuracy:  0.521 | Val_Loss:  2.418 | Accuracy:  0.571


100%|██████████| 3362/3362 [02:54<00:00, 19.25it/s]


Epochs: 6 | Loss:  2.548 | Accuracy:  0.521 | Val_Loss:  2.420 | Accuracy:  0.571


100%|██████████| 3362/3362 [02:52<00:00, 19.47it/s]


Epochs: 7 | Loss:  2.544 | Accuracy:  0.521 | Val_Loss:  2.442 | Accuracy:  0.571


100%|██████████| 3362/3362 [02:54<00:00, 19.23it/s]


Epochs: 8 | Loss:  2.540 | Accuracy:  0.521 | Val_Loss:  2.415 | Accuracy:  0.571


100%|██████████| 3362/3362 [02:53<00:00, 19.42it/s]


Epochs: 9 | Loss:  2.541 | Accuracy:  0.521 | Val_Loss:  2.424 | Accuracy:  0.571


100%|██████████| 3362/3362 [02:53<00:00, 19.33it/s]


Epochs: 10 | Loss:  2.541 | Accuracy:  0.521 | Val_Loss:  2.417 | Accuracy:  0.571


# Evaluate Model

In [12]:
def evaluate(model, df_test):

    test_dataset = DataSequence(df_test)

    test_dataloader = DataLoader(test_dataset, num_workers=4, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    total_acc_test = 0.0

    for test_data, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_data['attention_mask'].squeeze(1).to(device)

            input_id = test_data['input_ids'].squeeze(1).to(device)

            loss, logits = model(input_id, mask, test_label)

            for i in range(logits.shape[0]):

                logits_clean = logits[i][test_label[i] != -100]
                label_clean = test_label[i][test_label[i] != -100]

                predictions = logits_clean.argmax(dim=1)
                acc = (predictions == label_clean).float().mean()
                total_acc_test += acc

    val_accuracy = total_acc_test / len(df_test)
    print(f'Test Accuracy: {val_accuracy: .3f}')


In [13]:
evaluate(model, df_test)

Test Accuracy:  0.529


# Predict One Sentence

In [14]:
from spacy import displacy
import typing as tp

In [15]:
def ner_render(tokens: tp.Sequence[str], ner_tags: tp.Sequence[str], title: tp.Optional[str] = None, **kwargs):
    pos = 0
    ents = []
    for word, tag in zip(tokens, ner_tags):
        if tag.startswith('B'):
            ents.append({
                "start": pos,
                "end": pos + len(word),
                "label": tag.split("-")[1]
            })
        elif tag.startswith('I'):
            ents[-1]["end"] = pos + len(word)
        pos += (len(word) + 1)
    displacy.render({
        "text": " ".join(tokens),
        "ents": ents,
        "title": title
    }, style="ent", manual=True, jupyter=True)

In [16]:
def align_word_ids(texts):
    
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=512, truncation=True)

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            # label_ids.append(-100)
            label_ids.append(0)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(1)
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(1 if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx

    return label_ids

In [17]:
def evaluate_one_text(model, sentence):

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:
        model = model.cuda()

    encoded = tokenizer(sentence, padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
    
    mask = encoded['attention_mask'].to(device)
    input_id = encoded['input_ids'].to(device)

    logits = model(input_id, mask, None)
    label_ids = torch.Tensor(align_word_ids(sentence)).unsqueeze(0).to(device)

    logits_clean = logits[0][label_ids != 0]
    predictions = logits_clean.argmax(dim=1).tolist()
    prediction_labels = [ids_to_labels[i] for i in predictions]

    tokens = tokenizer.convert_ids_to_tokens(input_id[0])
    tokens = tokens[1:1+len(prediction_labels)]
    
    ner_render(tokens, prediction_labels)

In [18]:
evaluate_one_text(model, 'What is the price for this amat/applied materials unit.')

In [19]:
evaluate_one_text(model, 'I want to buy amat / applied materials unit.')

In [81]:
evaluate_one_text(model, "kla tencor?")

In [82]:
evaluate_one_text(model, "Do you have this equipment?")

In [83]:
evaluate_one_text(model, "What is the whether today?")

In [84]:
evaluate_one_text(model, "price?")

In [85]:
evaluate_one_text(model, "is it still deinstalled?")

In [86]:
evaluate_one_text(model, "we are going to remove this unit from production next week.")

In [87]:
evaluate_one_text(model, "we can offer $ 70,000 for the picosun ald in an effort to try to come to an agreement this week .")

In [88]:
evaluate_one_text(model, "$ 19k / ea is much beyond my budget even it is new .")

In [89]:
evaluate_one_text(model, "my customer budget for candella candela cs20r is 125k based on working condition mike .")

In [90]:
evaluate_one_text(model, "we have a dual turn with gantry loader already .")

In [91]:
evaluate_one_text(model, "but it is still being used not heavily but still being used w / back side alignment .")

In [92]:
evaluate_one_text(model, "We do not have approval to purchase anything yet")

In [93]:
evaluate_one_text(model, "who needs this equipment")

In [94]:
evaluate_one_text(model, "pls help me to find FEI FIB ,200,450") ## not correct

In [104]:
evaluate_one_text(model, "April 16 .") ## not correct

In [105]:
evaluate_one_text(model, "Sorry we are buying not selling.")