# Imports

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 8.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 57.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.6 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.6.0 py

In [4]:
import torch
import pandas as pd
import re
import pandas as pd
import torch
import numpy as np
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam

In [8]:
# to change the dataset for training, change this.
# works best with uni_intents_balanced.csv

#DATASET = "dialogue flow management.csv"
DATASET = "uni_intents_balanced.csv"

# Train

In [9]:
#helper code

code_names = {
    -1 : "Unknown",
    0 : "Greeting",
    1 : "Name",
    2 : "Address",
    3 : "DeliveryTime",
    4 : "NumberOfPizzas",
    5 : "AddPizza",
    6 : "RemovePizza",
    7 : "EditPizza",
    8 : "AddTopping",
    9 : "RemoveTopping",
    10 : "AddSide",
    11 : "RemoveSide",
    12 : "AddDrink",
    13 : "RemoveDrink",
    14 : "EndConv",
    15 : "Unknown" # for BERT bug that doesn't allow -1
}

gpu = 0
device = torch.device(gpu if torch.cuda.is_available() else "cpu")
if torch.cuda.is_available():
    torch.cuda.set_device(gpu)


def change_negative(x):
  if x < 0:
    return int(14 + abs(x))
  else:
    return x


Data Loading

In [62]:
# read dataset
data = pd.read_csv(DATASET)
data["category"] = data["intent_code"].apply(lambda x: code_names[x])

# change unknown -1 to 15

data["category"] = data["intent_code"].apply(lambda x: code_names[x])


data["intent_code"] = data["intent_code"].apply(lambda x: change_negative(x))
data = data.sort_values(by="intent_code")

def alphanumericize(x): # and space
    text = re.sub(r'[^A-Za-z0-9 ]+', '', x)
    return text

# lower the text
data["input"] = data["input"].apply(lambda x: x.lower())
# Clean the text
data["input"] = data["input"].apply(lambda x: alphanumericize(x))

PREPROCESSING = f"{str.lower}, {alphanumericize} "

In [63]:
# hyperparameters

TEST_SPLIT = 0.1
VAL_SPLIT = 0.1


embed_size = 300 # how big is each word vector
max_features = 120000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 750 # max number of words in a question to use
LEARN_RATE = 0.001
BATCH_SIZE = 2 # how many samples to process at once
N_EPOCHS = 45 # how many times to iterate over all samples
n_splits = 5 # Number of K-fold Splits
SEED = 48
debug = 0


----
**CREDIT TO:**
*    **Author**: Ruben Winastwan
*    **Title**: Text Classification with BERT in PyTorch
*    **Date**: 30/04/2022
*    **Availability**: https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f
*    **LICENSE**: Apache 2.0 open source
----

In [64]:


class BertClassifier(nn.Module):
      def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()
        n_classes = len(labels)

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, n_classes)
        self.relu = nn.ReLU()
    

      def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer



In [65]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels =  {v: k for k, v in code_names.items()}

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['category']]
        self.texts = [tokenizer(text, padding='max_length', max_length = 512, truncation=True, return_tensors="pt") for text in df['input']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [66]:
##
from sklearn.model_selection import train_test_split
#data = data.sample(frac=1).reset_index(drop=True)  # probably not needed due to stratified sampling

# function used twice to make second split
train_X, tv_X, train_y, tv_y = train_test_split(data['input'], data['intent_code'],
                                                    stratify=data['intent_code'], 
                                                    test_size=TEST_SPLIT + VAL_SPLIT, random_state=SEED) # e.g. together 0.4
new_test_split = TEST_SPLIT/(TEST_SPLIT + VAL_SPLIT) 
test_X, valid_X, test_y, valid_y = train_test_split(tv_X, tv_y,
                                                    stratify=tv_y, 
                                                    test_size=new_test_split, random_state=SEED)

# this is for fixing a weird bug with the splits
if test_X.shape[0] == valid_X.shape[0] - 1:
    test_X = test_X.append(pd.Series("hey you all"))
    test_y = test_y.append(pd.Series(0))

# this is to improve intent classifier performance when used with NER
train_X = train_X.append(pd.Series("name")).append(pd.Series("postcode")).append(pd.Series("time"))
train_y = train_y.append(pd.Series(1)).append(pd.Series(2)).append(pd.Series(3))
train_X = train_X.append(pd.Series("add topping")).append(pd.Series("remove topping"))
train_y = train_y.append(pd.Series(8)).append(pd.Series(9))
train_X = train_X.append(pd.Series("add side")).append(pd.Series("remove side"))
train_y = train_y.append(pd.Series(10)).append(pd.Series(11))
train_X = train_X.append(pd.Series("add drink")).append(pd.Series("remove drink"))
train_y = train_y.append(pd.Series(12)).append(pd.Series(13))
###

train_data = pd.DataFrame(zip(train_X, train_y), columns=['input', 'intent_code'])
train_data["category"] = train_data["intent_code"].apply(lambda x: code_names[x])
val_data = pd.DataFrame(zip(valid_X, valid_y), columns=['input', 'intent_code'])
val_data["category"] = val_data["intent_code"].apply(lambda x: code_names[x])
test_data = pd.DataFrame(zip(test_X, test_y), columns=['input', 'intent_code'])
test_data["category"] = test_data["intent_code"].apply(lambda x: code_names[x])

train, val, test = Dataset(train_data), Dataset(val_data), Dataset(test_data)

# Create Data Loaders
train_loader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = torch.utils.data.DataLoader(val, batch_size=BATCH_SIZE, shuffle=False)
test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)

In [67]:
#
LEARN_RATE =  2e-5

model = BertClassifier()
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARN_RATE)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [68]:
#
N_EPOCHS = 8

train_loss = []
valid_loss = []

model = BertClassifier()
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARN_RATE)


for epoch_num in range(N_EPOCHS):

        total_acc_train = 0
        total_loss_train = 0

        for train_input, train_label in (train_loader):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
            
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():

            for val_input, val_label in valid_loader:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        train_loss.append(total_loss_train / len(train_data))
        valid_loss.append(total_loss_val / len(val_data))
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
        
total_acc_test = 0
total_loss_test = 0

with torch.no_grad():
    for test_input, test_label in test_loader:

        test_label = test_label.to(device)
        mask = test_input['attention_mask'].to(device)
        input_id = test_input['input_ids'].squeeze(1).to(device)

        output = model(input_id, mask)

        batch_loss = criterion(output, test_label)
        total_loss_test += batch_loss.item()
        
        acc = (output.argmax(dim=1) == test_label).sum().item()
        total_acc_test += acc

test_loss = total_loss_test / len(test_data)
test_accuracy = total_acc_test / len(test_data)

print(f"Test Loss: {test_loss:.2f}, Test Accuracy: {test_accuracy:.2f}") 


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epochs: 1 | Train Loss:  1.397 | Train Accuracy:  0.093 | Val Loss:  1.384 | Val Accuracy:  0.125
Epochs: 2 | Train Loss:  1.315 | Train Accuracy:  0.202 | Val Loss:  1.238 | Val Accuracy:  0.250
Epochs: 3 | Train Loss:  1.095 | Train Accuracy:  0.394 | Val Loss:  0.913 | Val Accuracy:  0.583
Epochs: 4 | Train Loss:  0.767 | Train Accuracy:  0.668 | Val Loss:  0.579 | Val Accuracy:  0.792
Epochs: 5 | Train Loss:  0.416 | Train Accuracy:  0.927 | Val Loss:  0.353 | Val Accuracy:  0.917
Epochs: 6 | Train Loss:  0.202 | Train Accuracy:  0.984 | Val Loss:  0.171 | Val Accuracy:  0.958
Epochs: 7 | Train Loss:  0.113 | Train Accuracy:  0.990 | Val Loss:  0.114 | Val Accuracy:  0.958
Epochs: 8 | Train Loss:  0.070 | Train Accuracy:  1.000 | Val Loss:  0.102 | Val Accuracy:  0.958
Test Loss: 0.29, Test Accuracy: 0.79


In [69]:
import pickle
pickle.dump(model, open('intent_classifier.pkl','wb'))

In [70]:
import shutil
shutil.copy('intent_classifier.pkl','drive/MyDrive/COM3029 coursework 2/Main/Components/intent_classifier.pkl')

'drive/MyDrive/COM3029 coursework 2/Main/Components/intent_classifier.pkl'

# Prediction

In [80]:
# this is the old code
def predict_intent(input):

    test_data = pd.DataFrame(zip([input], [0]), columns=['input', 'intent_code'])
    test_data["category"] = test_data["intent_code"].apply(lambda x: code_names[x])

    test = Dataset(test_data)

    test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)

    with torch.no_grad():

        for test_input, test_label in test_loader:

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)
    
    prediction = output.argmax(dim=1).item()

    if prediction == 15:
      prediction = -1

    return prediction

predict_intent("postcode")

2

In [82]:
import pickle
intent_classifier = pickle.load(open('drive/MyDrive/COM3029 coursework 2/Main/Components/intent_classifier.pkl', 'rb'))

In [89]:
def get_intent(input):
    test_data = pd.DataFrame(zip([input], [0]), columns=['input', 'intent_code'])
    test_data["category"] = test_data["intent_code"].apply(lambda x: code_names[x])
    test = Dataset(test_data)
    test_loader = torch.utils.data.DataLoader(test, batch_size=BATCH_SIZE, shuffle=False)
    with torch.no_grad():

        for test_input, test_label in test_loader:

            test_label = test_label.to("cuda:0")
            mask = test_input['attention_mask'].to("cuda:0")
            input_id = test_input['input_ids'].squeeze(1).to("cuda:0")
            output = intent_classifier(input_id, mask)
    
    prediction = output.argmax(dim=1).item()

    if prediction == 15:
      prediction = -1

    return prediction
print(f"postcode: {get_intent('postcode')}")
print(f"time: {get_intent('time')}")
print(f"add side: {get_intent('add side')}")


postcode: 2
time: 3
add side: 10
