<a href="https://colab.research.google.com/github/binliu0630/Deep_Learning/blob/master/Text_Classification_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import json, re
from tqdm import tqdm_notebook
from uuid import uuid4

## Torch Modules
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader


# ## Mount Drive into Colab
# from google.colab import drive
# drive.mount('/content/drive')

In [0]:
## Install PyTorch-Transformer
!pip install -U pytorch-transformers

Requirement already up-to-date: pytorch-transformers in /usr/local/lib/python3.6/dist-packages (1.1.0)


In [0]:
## PyTorch Transformer
from pytorch_transformers import RobertaModel, RobertaTokenizer
from pytorch_transformers import RobertaForSequenceClassification, RobertaConfig

In [0]:
## Check if Cuda is Available
print(torch.cuda.is_available())

True


# INPUT THE DATA

In [0]:
data_path = "https://s3.amazonaws.com/tomk/h2o-world/megan/AmazonReviews.csv"

review = pd.read_csv(data_path)

review.head(2)

Unnamed: 0,ProductId,UserId,Summary,Score,HelpfulnessDenominator,Id,ProfileName,HelpfulnessNumerator,Time,Text
0,B00141QYSQ,A1YS02UZZGRDCT,Do Not Buy,1,2,41471,Evan Eberhardt,2,1348358400,These are made in China (do not buy ANY pet fo...
1,B0089SPEO2,A3JOYNYL458QHP,Less lemon and less zing,3,0,28582,coleridge,0,1323907200,"Everything is ok, except it just isn't as good..."


In [0]:
# add label: positivie/1, negative/0
review['label'] = np.where(review['Score'] > 4,'Positive', 
                           np.where(review['Score'] == 4, 'Neutral', 'Negative'))

In [0]:
review['label'].value_counts()

Positive    63988
Negative    21791
Neutral     14221
Name: label, dtype: int64

In [0]:
# dataset = pd.DataFrame(columns = ['utterance', 'label'])
# for intent in ['AddToPlaylist', 'BookRestaurant', 'GetWeather', 'PlayMusic', 'RateBook', 'SearchCreativeWork',
#                'SearchScreeningEvent']:
#     with open(dataset_path + intent + "/train_" + intent + ".json",
#               encoding='cp1251') as data_file:
#         data = json.load(data_file)
#     print("Class: {}, # utterances: {}".format(intent,len(data[intent])))
#     texts = []
#     for i in range(len(data[intent])):
#         text = ''
#         for j in range(len(data[intent][i]['data'])):
#             text += data[intent][i]['data'][j]['text']
#         dataset = dataset.append({'utterance': text, 'label': intent}, ignore_index=True)

In [0]:
dataset = review.loc[:, ['Text', 'label']]
dataset.head()

Unnamed: 0,Text,label
0,These are made in China (do not buy ANY pet fo...,Negative
1,"Everything is ok, except it just isn't as good...",Negative
2,Best cat treat ever. There isn't anything comp...,Positive
3,My two Corgis were thoroughly spoiled by my la...,Positive
4,We used to have drive down to the specialty pe...,Positive


In [0]:
dataset['label'].value_counts()

Positive    63988
Negative    21791
Neutral     14221
Name: label, dtype: int64

In [0]:
label_to_ix = {}
for label in dataset.label:
    for word in label.split():
        if word not in label_to_ix:
            label_to_ix[word]=len(label_to_ix)
label_to_ix

{'Negative': 0, 'Neutral': 2, 'Positive': 1}

In [0]:
## Loading RoBERTa classes

In [0]:
config = RobertaConfig.from_pretrained('roberta-base')
config.num_labels = len(set(review['label']))
config

{
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 514,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 3,
  "output_attentions": false,
  "output_hidden_states": false,
  "torchscript": false,
  "type_vocab_size": 1,
  "vocab_size": 50265
}

In [0]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification(config)

In [0]:
## Feature Preparation

In [0]:
def prepare_features(seq_1, max_seq_length = 300, 
             zero_pad = False, include_CLS_token = True, include_SEP_token = True):
    ## Tokenzine Input
    tokens_a = tokenizer.tokenize(seq_1)

    ## Truncate
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0:(max_seq_length - 2)]
    ## Initialize Tokens
    tokens = []
    if include_CLS_token:
        tokens.append(tokenizer.cls_token)
    ## Add Tokens and separators
    for token in tokens_a:
        tokens.append(token)

    if include_SEP_token:
        tokens.append(tokenizer.sep_token)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    ## Input Mask 
    input_mask = [1] * len(input_ids)
    ## Zero-pad sequence lenght
    if zero_pad:
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
    return torch.tensor(input_ids).unsqueeze(0), input_mask

In [0]:
msg = "My dog is cute!"
prepare_features(msg)

(tensor([[    0,  2387,  2335,    16, 11962,   328,     2]]),
 [1, 1, 1, 1, 1, 1, 1])

In [0]:

## Dataset Loader Classes

In [0]:
class Intents(Dataset):
    def __init__(self, dataframe):
        self.len = len(dataframe)
        self.data = dataframe
        
    def __getitem__(self, index):
        text = self.data.Text[index]
        label = self.data.label[index]
        X, _  = prepare_features(text)
#         y = self.data.PositiveReview.values[index]
        y = label_to_ix[self.data.label[index]]
        return X, y
    
    def __len__(self):
        return self.len

In [0]:
train_size = 0.8
train_dataset=dataset.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=dataset.drop(train_dataset.index).reset_index(drop=True)

In [0]:
print("FULL Dataset: {}".format(dataset.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (100000, 2)
TRAIN Dataset: (80000, 2)
TEST Dataset: (20000, 2)


In [0]:
train_dataset['label'].value_counts()

Positive    51195
Negative    17397
Neutral     11408
Name: label, dtype: int64

In [0]:
training_set = Intents(train_dataset)
testing_set = Intents(test_dataset)

In [0]:
training_set.__getitem__(0)[0].shape

torch.Size([1, 30])

In [0]:
model(training_set.__getitem__(0)[0])

RuntimeError: ignored

In [0]:
## Training Params

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.cuda()

In [0]:
# Parameters
params = {'batch_size': 2,
          'shuffle': True,
          'num_workers': 1}

In [0]:
training_loader = DataLoader(training_set, **params)
testing_loader = DataLoader(testing_set, **params)

In [0]:
loss_function = nn.CrossEntropyLoss()
learning_rate = 1e-05
optimizer = optim.Adam(params =  model.parameters(), lr=learning_rate)

In [0]:
## Test Forward Pass
inp = training_set.__getitem__(0)[0].cuda()
output = model(inp)[0]
torch.max(output.data, 1)

A sequence with no special tokens has been passed to the RoBERTa model. This model requires special tokens in order to work. Please specify add_special_tokens=True in your encoding.


torch.return_types.max(values=tensor([0.2610, 0.2722, 0.3194, 0.3925, 0.4743, 0.4059, 0.2735, 0.2649, 0.2718,
        0.3710, 0.4542, 0.5980, 0.2627, 0.3007, 0.2009, 0.0277, 0.5106, 0.3303,
        0.3068, 0.3984, 0.4793, 0.3765, 0.2396, 0.3065, 0.5058, 0.5999, 0.2753,
        0.1836, 0.3323, 0.3559], device='cuda:0'), indices=tensor([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 2, 1, 0, 1, 0, 1, 1, 1,
        1, 0, 0, 0, 0, 0], device='cuda:0'))

In [0]:
for i, (sent, label) in enumerate(training_loader):
  print(i, (sent, label))
  if i == 2:
    break

RuntimeError: ignored

In [0]:
for i, batch in enumerate(training_loader):
  print(i, batch)
  if i ==2:
    break

RuntimeError: ignored

In [0]:
max_epochs = 3
model = model.train()
for epoch in tqdm_notebook(range(max_epochs)):
    print("EPOCH -- {}".format(epoch))
    for i, (sent, label) in enumerate(training_loader):
        optimizer.zero_grad()
        sent = sent.squeeze(0)
        if torch.cuda.is_available():
          sent = sent.cuda()
          label = label.cuda()
        output = model.forward(sent)[0]
        _, predicted = torch.max(output, 1)
        
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()
        
        if i%100 == 0:
            correct = 0
            total = 0
            for sent, label in testing_loader:
                sent = sent.squeeze(0)
                if torch.cuda.is_available():
                  sent = sent.cuda()
                  label = label.cuda()
                output = model.forward(sent)[0]
                _, predicted = torch.max(output.data, 1)
                total += label.size(0)
#                 print(f'p{predicted.cpu()}')
#                 print(label.cpu())
                correct += (predicted.cpu() == label.cpu()).sum()
            
            print(f'len of total: {total}')    
            print(f'correct: {correct}')  
            accuracy = 100.00 * correct.numpy() / total
            print('Iteration: {}. Loss: {}. Accuracy: {}%'.format(i, loss.item(), accuracy))

Exception ignored in: <bound method _DataLoaderIter.__del__ of <torch.utils.data.dataloader._DataLoaderIter object at 0x7f7dc05006d8>>
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 677, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.6/dist-packages/torch/utils/data/dataloader.py", line 659, in _shutdown_workers
    w.join()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 124, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 50, in wait
    return self.poll(os.WNOHANG if timeout == 0.0 else 0)
  File "/usr/lib/python3.6/multiprocessing/popen_fork.py", line 28, in poll
    pid, sts = os.waitpid(self.pid, flag)
KeyboardInterrupt: 


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

EPOCH -- 0



RuntimeError: ignored

In [0]:
 torch.save(model.state_dict(), 'drive/My Drive/Datasets/roberta_state_dict_'+ str(uuid4())+'.pth')

NameError: ignored

In [0]:
dataset.tail(5)

Unnamed: 0,Text,label
2495,"Intense, pure lemon flavor in these cookies wi...",Positive
2496,I love this product. Make homemade sport gels ...,Positive
2497,"Unique blueberry flavor, but this is a VERY li...",Negative
2498,"I like having Zico with dinner, it really quen...",Positive
2499,These bars in the peanut/chocolate or the choc...,Positive


In [0]:
## Load model

In [0]:
model_path = 'drive/My Drive/Datasets/roberta_state_dict_7a4ff0ec-b474-4622-8a2f-88c58f364272.pth'

In [0]:
model.load_state_dict(torch.load(model_path, map_location=device))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [0]:
def get_reply(msg):
  model.eval()
  input_msg, _ = prepare_features(msg)
  if torch.cuda.is_available():
    input_msg = input_msg.cuda()
  output = model(input_msg)[0]
  _, pred_label = torch.max(output.data, 1)
  prediction=list(label_to_ix.keys())[pred_label]
  return prediction

In [0]:
label_to_ix.keys()

dict_keys(['Negative', 'Positive'])

In [0]:
get_reply("play radiohead song")

'Positive'

In [0]:
get_reply("it is rainy in Sao Paulo")

'Positive'

In [0]:
get_reply("sun shinnes all day BAD")

'Positive'

In [0]:
get_reply("low humidity, high altitude")

'Positive'

In [0]:
get_reply("Book tacos for me tonight")

'Positive'

In [0]:
get_reply("Book a table for me tonight")

'Positive'

In [0]:
get_reply("I really don't like BBQ tonight")

'Positive'