# **BERT for Targeted Sentiment Analysis**

# 1. Load the data

In [1]:
!pip install pytorch_pretrained_bert

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K     |██▋                             | 10kB 26.4MB/s eta 0:00:01[K     |█████▎                          | 20kB 29.5MB/s eta 0:00:01[K     |████████                        | 30kB 34.4MB/s eta 0:00:01[K     |██████████▋                     | 40kB 38.0MB/s eta 0:00:01[K     |█████████████▎                  | 51kB 31.7MB/s eta 0:00:01[K     |███████████████▉                | 61kB 34.5MB/s eta 0:00:01[K     |██████████████████▌             | 71kB 28.8MB/s eta 0:00:01[K     |█████████████████████▏          | 81kB 30.1MB/s eta 0:00:01[K     |███████████████████████▉        | 92kB 31.8MB/s eta 0:00:01[K     |██████████████████████████▌     | 102kB 29.7MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112kB 29.7MB/s eta 0:00:01[K     |████████████

In [2]:
import numpy as np
import pandas as pd

import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torchtext
from torchtext.vocab import Vectors
from torchtext.datasets import SequenceTaggingDataset

from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam

Using TensorFlow backend.


In [3]:
from google.colab import drive
drive.mount("data")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at data


In [0]:
class NoReCfine(SequenceTaggingDataset):
    def __init__(self, path, fields, encoding="utf-8", separator="\t", **kwargs):
        super().__init__(path, fields)

    @classmethod
    def splits(cls, fields, train_data="data/My Drive/projects/deep_learning/in5550-exam/data/train.conll", dev_data="data/My Drive/projects/deep_learning/in5550-exam/data/dev.conll", test_data="data/My Drive/projects/deep_learning/in5550-exam/data/test.conll"):
        return NoReCfine(train_data, fields), NoReCfine(dev_data, fields), NoReCfine(test_data, fields)

In [0]:
TEXT = torchtext.data.Field(lower=False, include_lengths=True, batch_first=True, pad_token=None)
LABEL = torchtext.data.Field(batch_first=True, unk_token=None, pad_token=None)
FIELDS = [("text", TEXT), ("label", LABEL)]

train_data, valid_data, test_data = NoReCfine.splits(FIELDS)

In [6]:
print(f'Number of training examples: {len(train_data):,}')
print(f'Number of training examples: {len(valid_data):,}')
print(f'Number of testing examples:    {len(test_data)}')

text_length = [len(sentence) for sentence in list(train_data.text)]

print(f"\nNumber of sentences in train_data.text: {len(text_length)}")
print(f'Number of words in train_data: {sum(text_length):,}')

Number of training examples: 5,915
Number of training examples: 1,151
Number of testing examples:    895

Number of sentences in train_data.text: 5915
Number of words in train_data: 98,483


In [0]:
# Create the vocabulary for words embeddings
LABEL.build_vocab(train_data)

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [9]:
torch.cuda.get_device_name(0) 

'Tesla P4'

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=False)

100%|██████████| 995526/995526 [00:01<00:00, 900849.38B/s]


In [11]:
train_sentences = [' '.join(sentence.text) for sentence in train_data]
valid_sentences = [' '.join(sentence.text) for sentence in valid_data]

train_sentences[25]

'Ja , jeg snakker til deg , Deep Blue .'

In [12]:
max_seq_len = np.max([len(sentence.text) for sentence in train_data])
tags2vals = {i: lab for i, lab in enumerate(LABEL.vocab.itos)}
tag2idx = {lab: i for i, lab in enumerate(LABEL.vocab.itos)}

print(f"max: {max_seq_len}")
print(f"labels: {LABEL.vocab.itos}")
print(f"tag2idx: {tag2idx}")

max: 103
labels: ['O', 'I-targ-Positive', 'B-targ-Positive', 'I-targ-Negative', 'B-targ-Negative']
tag2idx: {'O': 0, 'I-targ-Positive': 1, 'B-targ-Positive': 2, 'I-targ-Negative': 3, 'B-targ-Negative': 4}


In [13]:
train_tokenized = [tokenizer.tokenize(sent) for sent in train_sentences]
train_labels = [sentence.label for sentence in train_data]

valid_tokenized = [tokenizer.tokenize(sent) for sent in valid_sentences]
valid_labels = [sentence.label for sentence in valid_data]

print(train_tokenized[2])
print(train_labels[2])

['Tin', '##ie', 'Tem', '##pah', 'sk', '##uff', '##er', '.']
['B-targ-Negative', 'I-targ-Negative', 'O', 'O']


In [0]:
X_train = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in train_tokenized],
                   maxlen=max_seq_len, dtype="long", truncating="post", padding="post")

X_valid = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in train_tokenized],
                   maxlen=max_seq_len, dtype="long", truncating="post", padding="post")

In [0]:
Y_train = pad_sequences([[tag2idx.get(l) for l in lab] for lab in train_labels],
                     maxlen=max_seq_len, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

Y_valid = pad_sequences([[tag2idx.get(l) for l in lab] for lab in train_labels],
                     maxlen=max_seq_len, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

In [16]:
X_train.shape # (sentences, maximum sequence length)

(5915, 103)

In [17]:
Y_train.shape

(5915, 103)

In [18]:
X_train

array([[ 13258,  10216,  85202, ...,      0,      0,      0],
       [ 11771,  12457,  14946, ...,      0,      0,      0],
       [ 48800,  10400,  53696, ...,      0,      0,      0],
       ...,
       [ 15651, 109275,  38424, ...,      0,      0,      0],
       [ 66717,  10217,  14547, ...,      0,      0,      0],
       [ 10666,  10163,  32472, ...,      0,      0,      0]])

In [19]:
Y_train

array([[0, 0, 4, ..., 0, 0, 0],
       [2, 1, 1, ..., 0, 0, 0],
       [4, 3, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [0]:
attention_masks_train = [[float(i>0) for i in ii] for ii in X_train]
attention_masks_valid = [[float(i>0) for i in ii] for ii in X_valid]

In [21]:
len(attention_masks_train) # list of lists of shape (sentences, labels )

5915

In [0]:
# X_train, X_valid, Y_train, Y_valid = train_test_split(X, Y, 
#                                                             random_state=20, test_size=0.1)
# Mask_train, Mask_valid, _, _ = train_test_split(attention_masks, X,
#                                              random_state=20, test_size=0.1)

In [0]:
X_train = torch.tensor(X_train)
X_valid = torch.tensor(X_valid)
Y_train = torch.tensor(Y_train)
Y_valid = torch.tensor(Y_valid)
Mask_train = torch.tensor(attention_masks_train)
Mask_valid = torch.tensor(attention_masks_valid)

In [0]:
batch_s = 32

data_train = TensorDataset(X_train, Mask_train, Y_train)
data_train_sampler = RandomSampler(data_train)
DL_train = DataLoader(data_train, sampler=data_train_sampler, batch_size=batch_s)

data_valid = TensorDataset(X_valid, Mask_valid, Y_valid)
data_valid_sampler = SequentialSampler(data_valid)
DL_valid = DataLoader(data_valid, sampler=data_valid_sampler, batch_size=batch_s)

In [25]:
model = BertForTokenClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(tag2idx))

100%|██████████| 662804195/662804195 [01:08<00:00, 9727419.95B/s] 


In [26]:
model.cuda()
print('model on cuda')

model on cuda


In [0]:
FULL_FINETUNING = True

if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [28]:
!pip install seqeval

Collecting seqeval
  Downloading https://files.pythonhosted.org/packages/34/91/068aca8d60ce56dd9ba4506850e876aba5e66a6f2f29aa223224b50df0de/seqeval-0.0.12.tar.gz
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-0.0.12-cp36-none-any.whl size=7424 sha256=d9b5456a4a409e4376709348e85e727bad16048b40baa2ee78ec624ad7654927
  Stored in directory: /root/.cache/pip/wheels/4f/32/0a/df3b340a82583566975377d65e724895b3fad101a3fb729f68
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-0.0.12


In [0]:
from seqeval.metrics import f1_score

def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [30]:
epochs = 5
max_grad_norm = 1.0

for epoch in range(epochs):
    # TRAIN loop
    print(f"Epoch: {epoch}")
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(DL_train):
        # add batch to gpu
#         batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids, b_input_mask, b_labels = b_input_ids.to(device), b_input_mask.to(device), b_labels.to(device)
        # forward pass
        loss = model(b_input_ids.long(), token_type_ids=None,
                     attention_mask=b_input_mask.long(), labels=b_labels.long())
        # backward pass
        loss.backward()
        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
        # update parameters
        optimizer.step()
        model.zero_grad()
        print(f"\rStep:{step+1:5d}/{len(DL_train)}", end='')
    # print train loss per epoch
    print("\nTrain loss: {}".format(tr_loss/nb_tr_steps))
    # VALIDATION on validation set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for step, batch in enumerate(DL_valid):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids.long(), token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
        print(f"\rStep:{step+1:5d}/{len(DL_valid)}", end='')
    eval_loss = eval_loss/nb_eval_steps
    print("\nValidation loss: {}".format(eval_loss))
    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
    pred_tags = [tags2vals[p_i] for p in predictions for p_i in p]
    valid_tags = [tags2vals[l_ii] for l in true_labels for l_i in l for l_ii in l_i]
    print("F1-Score: {}".format(f1_score(pred_tags, valid_tags)))
    print("--------------------------------------------------\n")

Epoch: 0
Step:    1/185

RuntimeError: ignored

In [0]:
model.eval()
predictions = []
true_labels = []
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in DL_valid:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_mask, b_labels = batch

    with torch.no_grad():
        tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                              attention_mask=b_input_mask, labels=b_labels)
        logits = model(b_input_ids, token_type_ids=None,
                       attention_mask=b_input_mask)
        
    logits = logits.detach().cpu().numpy()
    predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
    label_ids = b_labels.to('cpu').numpy()
    true_labels.append(label_ids)
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)

    eval_loss += tmp_eval_loss.mean().item()
    eval_accuracy += tmp_eval_accuracy

    nb_eval_examples += b_input_ids.size(0)
    nb_eval_steps += 1

pred_tags = [[tags2vals[p_i] for p_i in p] for p in predictions]
valid_tags = [[tags2vals[l_ii] for l_ii in l_i] for l in true_labels for l_i in l ]
print("Validation loss: {}".format(eval_loss/nb_eval_steps))
print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))
print("Validation F1-Score: {}".format(f1_score(pred_tags, valid_tags)))