<a href="https://colab.research.google.com/github/bimhud/pytorch-transformers/blob/master/notebook/Text_Classification_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


In [4]:
!pip install -U pytorch-transformers

Collecting pytorch-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |████████████████████████████████| 184kB 2.8MB/s 
Collecting sacremoses (from pytorch-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/df/24/0b86f494d3a5c7531f6d0c77d39fd8f9d42e651244505d3d737e31db9a4d/sacremoses-0.0.33.tar.gz (802kB)
[K     |████████████████████████████████| 808kB 43.7MB/s 
Collecting sentencepiece (from pytorch-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece-0.1.83-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 38.8MB/s 
Collecting regex (from pytorch-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/6f/a6/99eeb5904ab763db87af4bd71d9b1dfdd9792681240657a4c0

In [0]:
## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [6]:
## Check if Cuda is Available
print(torch.cuda.is_available())

True


In [0]:
## Install PyTorch-Transformer

In [7]:
!git clone https://github.com/snipsco/nlu-benchmark.git

Cloning into 'nlu-benchmark'...
remote: Enumerating objects: 11, done.[K
remote: Counting objects: 100% (11/11), done.[K
remote: Compressing objects: 100% (9/9), done.[K
remote: Total 389 (delta 2), reused 11 (delta 2), pack-reused 378[K
Receiving objects: 100% (389/389), 1.24 MiB | 10.55 MiB/s, done.
Resolving deltas: 100% (242/242), done.


In [0]:
## Importing Datasets

In [8]:
!ls  nlu-benchmark/2017-06-custom-intent-engines

AddToPlaylist	GetWeather  RateBook   SearchCreativeWork
BookRestaurant	PlayMusic   README.md  SearchScreeningEvent


In [0]:
dataset_path = "nlu-benchmark/2017-06-custom-intent-engines"

In [10]:
import os
dataset = pd.DataFrame(columns = ['utterance', 'label'])
for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
               'SearchScreeningEvent']:
    with open(dataset_path + os.sep + intent + os.sep +  "train_" + intent + ".json",
              encoding='cp1251') as data_file:
        data = json.load(data_file)
    print("Class: {}, # utterances: {}".format(intent,len(data[intent])))
    texts = []
    for i in range(len(data[intent])):
        text = ''
        for j in range(len(data[intent][i]['data'])):
            text += data[intent][i]['data'][j]['text']
        dataset = dataset.append({'utterance': text, 'label': intent}, ignore_index=True)

Class: AddToPlaylist, # utterances: 300
Class: BookRestaurant, # utterances: 300
Class: GetWeather, # utterances: 300
Class: PlayMusic, # utterances: 300
Class: RateBook, # utterances: 300
Class: SearchCreativeWork, # utterances: 300
Class: SearchScreeningEvent, # utterances: 300


In [11]:
#Get index from multiple labels
label_to_ix = {}
for label in dataset.label:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
label_to_ix

{'AddToPlaylist': 0,
 'BookRestaurant': 1,
 'GetWeather': 2,
 'PlayMusic': 3,
 'RateBook': 4,
 'SearchCreativeWork': 5,
 'SearchScreeningEvent': 6}

In [0]:
## Loading RoBERTa classes

In [13]:
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(list(label_to_ix.values()))
config

100%|██████████| 473/473 [00:00<00:00, 200010.67B/s]


{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 7,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [14]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

100%|██████████| 898823/898823 [00:00<00:00, 11059490.21B/s]
100%|██████████| 456318/456318 [00:00<00:00, 7037203.60B/s]


In [0]:
## Feature Preparation

In [0]:
def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [17]:
msg = "My dog is cute!"
prepare_features(msg)

(tensor([[    0,  1308,  2335,    16, 11962,   328,     2]]),
 [1, 1, 1, 1, 1, 1, 1])

In [0]:
## Dataset Loader Classes

In [0]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        utterance = self.data.utterance[index]
        label = self.data.label[index]
        X, _  = prepare_features(utterance)
        y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [0]:
train_size = 0.8
train_dataset=dataset.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [21]:
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (2100, 2)
TRAIN Dataset: (1680, 2)
TEST Dataset: (420, 2)


In [0]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [23]:
training_set.__getitem__(0)[0].shape

torch.Size([1, 8])

In [24]:
model(training_set.__getitem__(0)[0])

(tensor([[ 0.1118,  0.3384,  0.1381,  0.0139,  0.0683, -0.0736,  0.1751]],
        grad_fn=<AddmmBackward>),)

In [0]:
## Training Params

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [0]:
# Parameters
params = {'batch_size': 1,
          'shuffle': True,
          'num_workers': 1}

In [0]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [0]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [29]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
torch.max(output.data, 1)

torch.return_types.max(values=tensor([0.4816], device='cuda:0'), indices=tensor([6], device='cuda:0'))

In [30]:
max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
                correct += (predicted.cpu() == label.cpu()).sum()
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 2.258636236190796. Accuracy: 28.095238095238095%
Iteration: 100. Loss: 1.5868463516235352. Accuracy: 21.19047619047619%
Iteration: 200. Loss: 1.3071141242980957. Accuracy: 63.80952380952381%
Iteration: 300. Loss: 2.0553653240203857. Accuracy: 0.0%
Iteration: 400. Loss: 2.2822792530059814. Accuracy: 9.285714285714286%
Iteration: 500. Loss: 1.7432736158370972. Accuracy: 26.428571428571427%
Iteration: 600. Loss: 0.5579929351806641. Accuracy: 22.61904761904762%
Iteration: 700. Loss: 0.49189186096191406. Accuracy: 57.142857142857146%
Iteration: 800. Loss: 0.37926673889160156. Accuracy: 47.38095238095238%
Iteration: 900. Loss: 0.0850529670715332. Accuracy: 20.0%
Iteration: 1000. Loss: 0.4579188823699951. Accuracy: 56.19047619047619%
Iteration: 1100. Loss: 0.04705810546875. Accuracy: 69.28571428571429%
Iteration: 1200. Loss: 0.17160773277282715. Accuracy: 68.57142857142857%
Iteration: 1300. Loss: 0.5571842193603516. Accuracy: 82.38095238095238%
Iteration: 1400. 

In [0]:
!mkdir nlu-benchmark/2017-06-custom-intent-engines/RoBerta_Model

In [0]:
 torch.save(model.state_dict(), 'nlu-benchmark/2017-06-custom-intent-engines/RoBerta_Model/roberta_state_dict_'+ str(uuid4())+'.pth')

In [33]:
dataset.tail(5)

Unnamed: 0,utterance,label
2095,Is Across the Line playing at the closest movi...,SearchScreeningEvent
2096,Which animated movies are playing in the neigh...,SearchScreeningEvent
2097,Where is They Always Return at Dawn playing,SearchScreeningEvent
2098,What is the movie schedule in the neighborhood,SearchScreeningEvent
2099,Tell me when Howling II: Your Sister Is a Were...,SearchScreeningEvent


In [35]:
## Load model
!ls nlu-benchmark/2017-06-custom-intent-engines/RoBerta_Model

roberta_state_dict_4fd91890-1424-41fb-8a7a-059fc60bc379.pth


In [0]:
model_path = 'nlu-benchmark/2017-06-custom-intent-engines/RoBerta_Model/roberta_state_dict_4fd91890-1424-41fb-8a7a-059fc60bc379.pth'

In [39]:
%%time
model.load_state_dict(torch.load(model_path, map_location=device))

CPU times: user 92.8 ms, sys: 240 ms, total: 333 ms
Wall time: 339 ms


IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [0]:
def get_reply(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg.cuda()
  output = model(input_msg)[0]
  pred_score, pred_label = torch.max(output.data, 1)
  
  prediction=list(label_to_ix.keys())[pred_label]
  return prediction,pred_score.cpu().numpy()[0]

In [80]:
label_to_ix.keys()

dict_keys(['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'])

In [81]:
get_reply("play radiohead song")

('PlayMusic', 5.4527564)

In [96]:
get_reply("it is rainy in Sao Paulo")

('GetWeather', 4.8741484)

In [95]:
get_reply("sun shinnes all day")

('PlayMusic', 4.7038445)

In [94]:
get_reply("low humidity, high altitude")

('PlayMusic', 4.1896024)

In [93]:
get_reply("Book tacos for me tonight")

('BookRestaurant', 4.493275)

In [92]:
get_reply("Book a table for me tonight")

('BookRestaurant', 5.5600758)

In [97]:
get_reply("I want BBQ tonight under the rain")

('SearchCreativeWork', 3.6009989)