In [None]:
!pip install datasets matplotlib numpy pandas seaborn scikit-learn torch tqdm transformers

In [None]:
import datasets
from datasets import load_metric
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
from sklearn.metrics import classification_report, confusion_matrix
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler

In [None]:
def label_to_int(intent: str) -> int:
    """Translate an intent name into its integer class id.

    The id is the position of the name in the ordered tuple below.
    Anything that does not equal one of the seven known names yields -1.
    """
    known_intents = (
        'AddToPlaylist',
        'BookRestaurant',
        'GetWeather',
        'PlayMusic',
        'RateBook',
        'SearchCreativeWork',
        'SearchScreeningEvent',
    )
    for class_id, name in enumerate(known_intents):
        if intent == name:
            return class_id
    # Unknown intent: signalled with a sentinel rather than an exception.
    return -1


def adjust_labels(entries: dict) -> dict:
    """Return a batch whose 'intent' strings are replaced by integer ids.

    Args:
        entries: a batch mapping with a 'text' entry and an iterable
            'intent' entry.

    Returns:
        A new dict holding the untouched 'text' entry and a list with
        ``label_to_int`` applied to every element of 'intent'.
    """
    texts = entries['text']
    encoded_intents = list(map(label_to_int, entries['intent']))
    return {'text': texts, 'intent': encoded_intents}

In [None]:
def tokenize_function(examples: dict) -> dict:
    """Run the notebook-level ``tokenizer`` over the 'text' entry of a batch.

    Truncation is enabled and padding is requested with the "max_length"
    strategy; the tokenizer's output is returned unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

In [None]:
# Build a DatasetDict from two CSV files: dataset.csv becomes the 'train' split
# and validation_dataset.csv becomes the 'validation' split.
dataset_dict: datasets.DatasetDict = datasets.load_dataset(
    'csv',
    data_files=dict(
        train='data/dataset.csv',
        validation='data/validation_dataset.csv',
    ),
)

In [None]:
# Replace the intent strings of both splits with their integer ids, batch by batch.
for split in ('train', 'validation'):
    dataset_dict[split] = dataset_dict[split].map(adjust_labels, batched=True)

In [None]:
# The tokenizer has to come from the same checkpoint as the model that is
# fine-tuned further down ("prajjwal1/bert-tiny"); otherwise the token ids do not
# index the vocabulary the pretrained embeddings were trained with.
# `model_max_length` is given explicitly so that `padding="max_length"` in
# `tokenize_function` has a defined target length even if the checkpoint ships no
# tokenizer config. 512 is assumed to match the model's position-embedding limit
# -- TODO confirm against the checkpoint's config.
tokenizer = AutoTokenizer.from_pretrained(
    "prajjwal1/bert-tiny", use_fast=True, model_max_length=512
)
tokenized_datasets: datasets.DatasetDict = dataset_dict.map(tokenize_function, batched=True)

In [None]:
# Drop the raw text, expose the integer intents under the column name "labels",
# and switch the dataset output format to "torch".
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["text"])
    .rename_column("intent", "labels")
)
tokenized_datasets.set_format(type="torch")

In [1]:
# Batches of 8 for both splits; only the training split is shuffled.
train_dataloader: DataLoader = DataLoader(
    tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
)
eval_dataloader: DataLoader = DataLoader(
    tokenized_datasets["validation"],
    batch_size=8,
)

NameError: name 'DataLoader' is not defined

In [None]:
# This model is BERT with a linear classification layer on top. Our custom model,
# by comparison, uses FastText followed by a hidden layer and a linear layer for
# classification.
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=7)

In [None]:
# AdamW over every model parameter with a learning rate of 5e-5.
optimizer = AdamW(
    model.parameters(),
    lr=5e-5,
)

In [None]:
# One scheduler step is taken per training batch, so the schedule length is
# (batches per epoch) x (number of epochs). No warm-up steps are used.
num_epochs: int = 3
num_training_steps: int = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [None]:
# Train on the GPU when one is available, otherwise on the CPU.
device: str = "cuda" if torch.cuda.is_available() else "cpu"
# Every batch below is moved to `device`, so the model has to live there as well;
# without this the forward pass fails with a device mismatch on CUDA machines.
model.to(device)
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains "labels", so the model output carries the loss.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        # Clear the gradients so they do not accumulate into the next batch.
        optimizer.zero_grad()
        progress_bar.update(1)

In [None]:
# Collect predictions and references over the validation split, then report
# accuracy. Accuracy is computed directly from the collected lists instead of via
# `datasets.load_metric`, which is deprecated and no longer available in current
# `datasets` releases.
model.eval()
preds, trues = [], []
for batch in tqdm(eval_dataloader, desc="evaluating"):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        logits = model(**batch).logits

    # The predicted class is the index of the largest logit along the last axis.
    preds.extend(torch.argmax(logits, dim=-1).cpu().tolist())
    trues.extend(batch["labels"].cpu().tolist())

# Guard against an empty validation split, where the mean would be undefined.
accuracy = float(np.mean(np.array(preds) == np.array(trues))) if trues else float("nan")
# Last expression of the cell: displayed in the same shape the metric object returned.
{"accuracy": accuracy}

In [None]:
# Class names listed in the order of the integer ids assigned by `label_to_int`.
names = ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook',
         'SearchCreativeWork', 'SearchScreeningEvent']
# `labels` is passed explicitly so the report always has one row per entry of
# `names`. Without it, a class that appears in neither the references nor the
# predictions makes the number of classes disagree with `target_names`.
print(classification_report(
    np.array(trues).flatten(),
    np.array(preds).flatten(),
    labels=list(range(len(names))),
    target_names=names,
))

In [None]:
# `labels` fixes the matrix to one row/column per entry of `names`, so the
# DataFrame below always gets a matrix of matching shape, even when a class is
# missing from both the references and the predictions.
cm = confusion_matrix(
    np.array(trues).flatten(),
    np.array(preds).flatten(),
    labels=list(range(len(names))),
)
df_cm = pd.DataFrame(cm, index=names, columns=names)
# Configure plot sizes.
sn.set(font_scale=1)
ax = sn.heatmap(df_cm, annot=True, annot_kws={"size": 8}, cmap='coolwarm', linewidth=0.5,
                fmt="")
# Rows follow the first argument (references), columns the second (predictions).
ax.set(xlabel="Predicted intent", ylabel="True intent")
plt.show()