<a href="https://colab.research.google.com/github/bimhud/pytorch-transformers/blob/master/notebooks/GnS_Classification_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


In [37]:
!pip install -U pytorch-transformers

Collecting pytorch-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |████████████████████████████████| 184kB 6.7MB/s 
[?25hCollecting sacremoses (from pytorch-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/df/24/0b86f494d3a5c7531f6d0c77d39fd8f9d42e651244505d3d737e31db9a4d/sacremoses-0.0.33.tar.gz (802kB)
[K     |████████████████████████████████| 808kB 38.5MB/s 
Collecting regex (from pytorch-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/6f/a6/99eeb5904ab763db87af4bd71d9b1dfdd9792681240657a4c0a599c10a81/regex-2019.08.19.tar.gz (654kB)
[K     |████████████████████████████████| 655kB 32.4MB/s 
Collecting sentencepiece (from pytorch-transformers)
[?25l  Downloading https://files.pythonhosted.org/packages/14/3d/efb655a670b98f62ec32d66954e1109f403db4d937c50d779a75b9763a29/sentencepiece

In [0]:
## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [39]:
%%bash
# scikit-learn supplies train_test_split, which is imported further down to build the split.
pip install scikit-learn



In [40]:
## Check whether a CUDA device is visible to PyTorch (prints True or False)
print(torch.cuda.is_available())

True


In [0]:
## Install PyTorch-Transformer

Loading and generating datasets

In [84]:
import pandas as pd

# Read the cleaned CSV, using its first column as the DataFrame index.
dataset = pd.read_csv('nc020-a04_ibli_clean.csv',index_col=0)
# Display the first two rows as a quick look at the columns.
dataset.head(2)


Unnamed: 0,Class,Classification,EN
0,1,10001,combusting preparations [chemical additives to...
1,1,10002,adhesives for industrial purposes


In [85]:

# Number of non-null EN entries per Class.
dataset.groupby('Class')[['EN']].count()

Unnamed: 0_level_0,EN
Class,Unnamed: 1_level_1
1,752
2,132
3,272
4,110
5,525
6,506
7,613
8,300
9,832
10,298


In [0]:

from sklearn.model_selection import train_test_split

# Split the frame in half. stratify keeps the Class proportions the same in both
# halves, and the fixed random_state makes the split repeatable.
# (The bare groupby/count expression that used to open this cell was removed:
# its result was neither displayed nor stored.)
train_dataset, test_dataset = train_test_split(
    dataset, test_size=0.5, random_state=7, stratify=dataset['Class'])




In [87]:
# Give every distinct Class value a contiguous integer index, numbered in order
# of first appearance (dict.fromkeys keeps that order and drops repeats).
label_to_ix = {label: ix for ix, label in enumerate(dict.fromkeys(dataset.Class))}
label_to_ix

{1: 0,
 2: 1,
 3: 2,
 4: 3,
 5: 4,
 6: 5,
 7: 6,
 8: 7,
 9: 8,
 10: 9,
 11: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 16,
 18: 17,
 19: 18,
 20: 19,
 21: 20,
 22: 21,
 23: 22,
 24: 23,
 25: 24,
 26: 25,
 27: 26,
 28: 27,
 29: 28,
 30: 29,
 31: 30,
 32: 31,
 33: 32,
 34: 33,
 35: 34,
 36: 35,
 37: 36,
 38: 37,
 39: 38,
 40: 39,
 41: 40,
 42: 41,
 43: 42,
 44: 43,
 45: 44}

In [0]:
## Loading RoBERTa classes

In [88]:
# Start from the roberta-base configuration and set the number of labels to the
# number of entries in label_to_ix.
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(label_to_ix)
config

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 45,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [0]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
# Load the roberta-base checkpoint weights into the classifier. Calling the class
# constructor with only `config` builds the architecture without loading any
# checkpoint, so fine-tuning would start from freshly initialised weights.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', config=config)

In [0]:
## Feature Preparation

In [0]:
def prepare_features(seq_1, max_seq_length = 50, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    """Convert one text string into model input ids plus an attention mask.

    Uses the notebook-level ``tokenizer``.

    Args:
        seq_1: Text to tokenize.
        max_seq_length: Upper bound on the number of ids, special tokens included.
        zero_pad: When True, right-pad ids and mask up to ``max_seq_length``.
        include_CLS_token: Prepend ``tokenizer.cls_token``.
        include_SEP_token: Append ``tokenizer.sep_token``.

    Returns:
        ``(input_ids, input_mask)`` where ``input_ids`` is a tensor with a leading
        batch dimension of 1 and ``input_mask`` is a list holding 1 for every real
        token and 0 for every padding position.
    """
    ## Tokenize input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate, reserving room only for the special tokens that will be added
    num_special = int(include_CLS_token) + int(include_SEP_token)
    tokens_a = tokens_a[:max(max_seq_length - num_special, 0)]

    ## Assemble: [CLS] + tokens + [SEP]
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    tokens.extend(tokens_a)
    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input mask: 1 marks a real token
    input_mask = [1] * len(input_ids)

    ## Pad sequence length
    if zero_pad:
        # Pad with the tokenizer's own pad token. A literal 0 is the id the CLS
        # token maps to for this tokenizer, so padding with 0 would append extra
        # CLS tokens instead of padding.
        pad_id = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
        missing = max_seq_length - len(input_ids)
        if missing > 0:
            input_ids = input_ids + [pad_id] * missing
            input_mask = input_mask + [0] * missing
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [53]:
# Try the feature preparation on one sample description and display the result.
msg = "adhesives for industrial purposes"
prepare_features(msg)

(tensor([[   0, 2329, 5065, 3699,   13, 2683, 6216,    2]]),
 [1, 1, 1, 1, 1, 1, 1, 1])

In [0]:
## Dataset Loader Classes

In [0]:
# GnSDataset fetches rows with .loc[index, ...], so each split needs a fresh
# 0..n-1 index; the shuffled original index is dropped.
train_dataset = train_dataset.reset_index(drop=True)
test_dataset = test_dataset.reset_index(drop=True)

In [0]:
class GnSDataset(Dataset):
    """Dataset that serves (features, label index) pairs from a DataFrame.

    Relies on the notebook-level ``prepare_features`` function and
    ``label_to_ix`` mapping.
    """

    def __init__(self, dataframe, text_column='EN',label_column='Class'):
        """
          Store a re-indexed copy of the input dataframe.

          The copy gets a fresh 0..n-1 index, so the caller does not have to
          reset the index beforehand and the caller's frame is left untouched.

          @param dataframe     frame holding one sample per row
          @param text_column   name of the column holding the text
          @param label_column  name of the column holding the label
          @return (from __getitem__)
              X as feature
              Y as index of the label starting from 0

        """
        # Without this, .loc[index] below would raise KeyError or pick the wrong
        # row whenever the frame still carries the index of the pre-split data.
        self.data = dataframe.reset_index(drop=True)
        self.len = len(self.data)
        self.text_column = text_column
        self.label_column = label_column

    def __getitem__(self, index):
        """Return (X, y) for the row at position ``index``."""
        text = self.data.loc[index, self.text_column]
        label = self.data.loc[index, self.label_column]
        # prepare_features also returns the attention mask; only the ids are used.
        X, _ = prepare_features(text)
        y = label_to_ix[label]
        return X, y

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return self.len

In [116]:
# Report (rows, columns) for the full frame and for each split.
# `nice_df` is not defined anywhere in this notebook; the full frame is `dataset`.
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (10180, 3)
TRAIN Dataset: (5090, 3)
TEST Dataset: (5090, 3)


In [0]:
# Wrap each split in a GnSDataset (default columns: text from 'EN', label from 'Class').
training_set = GnSDataset(train_dataset)
testing_set = GnSDataset(test_dataset)

In [120]:
# Shape of the feature tensor of the second training sample.
training_set[1][0].shape

torch.Size([1, 9])

In [94]:
# Feed the first training sample's features through the model and show the raw output.
model(training_set[0][0])

(tensor([[-7.5383e-02,  4.2130e-01, -7.8537e-03, -1.8962e-01,  2.9304e-01,
           2.4149e-01, -4.8129e-02,  1.2000e-01, -1.0194e-01, -2.3640e-01,
          -3.7385e-02, -4.8046e-02, -3.0860e-01, -1.7311e-02, -2.0144e-04,
          -2.7983e-01,  5.1237e-02,  2.8065e-01, -4.0959e-01,  1.2102e-01,
          -4.2142e-01,  8.8456e-02, -1.5533e-01,  2.2567e-01, -1.2917e-01,
           2.6488e-01,  4.9281e-01,  7.6306e-03,  3.9414e-01, -2.3879e-01,
           3.4856e-02,  5.3802e-01, -1.3614e-01, -5.1286e-02, -1.6683e-01,
          -1.1440e-02, -3.1003e-01,  5.6523e-02, -2.4193e-01,  1.2755e-01,
           1.1385e-01,  2.2734e-01, -1.7085e-01, -2.9089e-02, -4.4490e-01]],
        grad_fn=<AddmmBackward>),)

In [0]:
## Training Params

In [0]:
# Use the GPU when one is present, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model to the selected device; an unconditional .cuda() raises on a
# CPU-only runtime and ignores the choice made on the line above.
model = model.to(device)

In [0]:
# Keyword arguments shared by both DataLoaders
params = dict(batch_size=1, shuffle=True, num_workers=1)

In [0]:
# Wrap both datasets in DataLoaders that share the settings in `params`.
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [0]:
# Cross-entropy loss on the model output, optimised with Adam at lr = 1e-05.
learning_rate = 1e-05
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [135]:
## Test Forward Pass
# Send the sample to the same device the model was moved to, rather than calling
# .cuda() unconditionally (which fails on a CPU-only runtime).
inp = training_set[0][0].to(device)
# This is only a check, so autograd bookkeeping is not needed.
with torch.no_grad():
    output = model(inp)[0]
# Highest score and its class index for the single sample.
torch.max(output, 1)

torch.return_types.max(values=tensor([1.1256], device='cuda:0'), indices=tensor([6], device='cuda:0'))

In [0]:
def _evaluate_accuracy(model, loader, device):
    """Return the percentage of samples from ``loader`` that ``model`` labels correctly.

    The model is switched to eval mode for the pass (so dropout is off) and is put
    back in train mode afterwards if that is how it arrived. Gradients are not
    tracked. Returns 0.0 when the loader yields no samples.
    """
    was_training = model.training
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for sent, label in loader:
            # The dataset already returns tensors with a leading dimension of 1,
            # so drop the extra dimension the DataLoader adds when batching.
            sent = sent.squeeze(0).to(device)
            label = label.to(device)
            logits = model(sent)[0]
            predicted = logits.argmax(dim=1)
            total += label.size(0)
            correct += (predicted == label).sum().item()
    if was_training:
        model.train()
    return 100.00 * correct / total if total else 0.0


max_epochs = 5
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        # Same squeeze as in evaluation: remove the DataLoader's batch dimension.
        sent = sent.squeeze(0).to(device)
        label = label.to(device)
        output = model(sent)[0]

        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()

        # Every 100 training steps, score the model on the whole test loader.
        if i % 100 == 0:
            accuracy = _evaluate_accuracy(model, testing_loader, device)
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

EPOCH -- 0
Iteration: 0. Loss: 2.8438620567321777. Accuracy: 7.013752455795678%


In [0]:
!mkdir nlu-benchmark/2017-06-custom-intent-engines/RoBerta_Model

In [0]:
 # Save the model weights under a file name made unique by a random uuid4 suffix.
 torch.save(model.state_dict(), 'nlu-benchmark/2017-06-custom-intent-engines/RoBerta_Model/roberta_state_dict_'+ str(uuid4())+'.pth')

In [0]:
# Display the last five rows of the frame.
# NOTE(review): the output saved with this cell lists utterance/label columns,
# unlike the Class/Classification/EN frame shown when the CSV was loaded —
# presumably left over from an earlier run; re-run to confirm.
dataset.tail(5)

Unnamed: 0,utterance,label
2095,Is Across the Line playing at the closest movi...,SearchScreeningEvent
2096,Which animated movies are playing in the neigh...,SearchScreeningEvent
2097,Where is They Always Return at Dawn playing,SearchScreeningEvent
2098,What is the movie schedule in the neighborhood,SearchScreeningEvent
2099,Tell me when Howling II: Your Sister Is a Were...,SearchScreeningEvent


In [0]:
## Load model
# List the saved checkpoint files in the model directory.
!ls nlu-benchmark/2017-06-custom-intent-engines/RoBerta_Model

roberta_state_dict_4fd91890-1424-41fb-8a7a-059fc60bc379.pth


In [0]:
import glob
import os

# The save cell appends a random uuid4 to the file name, so a hard-coded name
# cannot match the file a fresh run produces. Pick the most recently modified
# checkpoint in the model directory; fall back to the originally recorded file
# name when no checkpoint is found.
_checkpoints = glob.glob(
    'nlu-benchmark/2017-06-custom-intent-engines/RoBerta_Model/roberta_state_dict_*.pth')
if _checkpoints:
    model_path = max(_checkpoints, key=os.path.getmtime)
else:
    model_path = 'nlu-benchmark/2017-06-custom-intent-engines/RoBerta_Model/roberta_state_dict_4fd91890-1424-41fb-8a7a-059fc60bc379.pth'

In [0]:
%%time
# Restore the saved weights into the existing model; map_location places the
# loaded tensors on `device`.
model.load_state_dict(torch.load(model_path, map_location=device))

CPU times: user 92.8 ms, sys: 240 ms, total: 333 ms
Wall time: 339 ms


IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [0]:
def get_reply(msg):
    """Classify one message with the notebook-level model.

    Uses the notebook-level ``model``, ``prepare_features`` and ``label_to_ix``.

    Args:
        msg: Text to classify.

    Returns:
        ``(prediction, score)``: the key of ``label_to_ix`` whose index has the
        highest model output, and that output value as a NumPy scalar.
    """
    model.eval()
    input_msg, _ = prepare_features(msg)
    # Follow the model to whichever device it lives on, instead of assuming that
    # an available GPU means the model was moved there.
    input_msg = input_msg.to(next(model.parameters()).device)
    # Inference only: no gradients needed.
    with torch.no_grad():
        output = model(input_msg)[0]
    pred_score, pred_label = torch.max(output, 1)

    # Look the label up by its stored index rather than by its position among the
    # dict keys, so the result does not depend on key order.
    ix_to_label = {ix: label for label, ix in label_to_ix.items()}
    prediction = ix_to_label[pred_label.item()]
    return prediction, pred_score.cpu().numpy()[0]

In [0]:
label_to_ix.keys()

dict_keys(['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork', 'SearchScreeningEvent'])

In [0]:
get_reply("play radiohead song")

('PlayMusic', 5.4527564)

In [0]:
get_reply("it is rainy in Sao Paulo")

('GetWeather', 4.8741484)

In [0]:
get_reply("sun shinnes all day")

('PlayMusic', 4.7038445)

In [0]:
get_reply("low humidity, high altitude")

('PlayMusic', 4.1896024)

In [0]:
get_reply("Book tacos for me tonight")

('BookRestaurant', 4.493275)

In [0]:
get_reply("Book a table for me tonight")

('BookRestaurant', 5.5600758)

In [0]:
get_reply("I want BBQ tonight under the rain")

('SearchCreativeWork', 3.6009989)