In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn import preprocessing, model_selection

from transformers import BertModel, BertTokenizer

from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertModel, BertTokenizer
import transformers

In [2]:
path = 'data/restaurants_laptop_train_with_pos.csv'

df = pd.read_csv(path)

# replace all -1 to 2 since pytorch cannot handle negative
# so, 2 now means negative polarity
df.polarity = df.polarity.replace(-1,2)

encoder = preprocessing.LabelEncoder()
df.loc[:, "aspect_tag"] = encoder.fit_transform(df["aspect_tag"])

sentences = df.groupby("num")["text"].apply(list).values
aspect_tags = df.groupby("num")["aspect_tag"].apply(list).values
polarity_tags = df.groupby("num")["polarity"].apply(list).values

polarity_unique_values = df.polarity.unique()

print('num of aspect tags: {}'.format(len(encoder.classes_)))
print('num of polarity tags: {}'.format(len(polarity_unique_values)))



num of aspect tags: 2
num of polarity tags: 3


In [3]:
print(sentences[4])
print(aspect_tags[4])
print(polarity_tags[4])

['Drinks', 'got', 'screwed', 'up', ',', 'she', 'acted', 'put', 'upon', '.']
[0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
[2, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [4]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
    
def to_device(data, device):
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    elif isinstance(data, dict):
        for k, v in data.items():
            data[k] = v.to(device)
        return data
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [5]:

device = get_default_device()
print(device)

NUM_EPOCHS = 5
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 8
MODEL_PATH = "model.bin"
TEST_SIZE = 0.2

NUM_ASPECT_TAGS = len(encoder.classes_)
NUM_POLARITY_TAGS = len(polarity_unique_values)

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

cuda


In [6]:
# Get all the vocab in the dataset
vocab = set()
for s in sentences:
#     print(s)
    vocab.update(set(s))
print(len(vocab))

print(len(tokenizer))

# Increase the vocab in the tokenizer
tokenizer.add_tokens(list(vocab))

4304
28996


1534

In [7]:
class SentenceTagDataset(Dataset):
    def __init__(self, tokenizer, sentences, aspect_tags, polarity_tags, max_length=128):
        self.sentences = sentences
        self.aspect_tags = aspect_tags
        self.polarity_tags = polarity_tags
        self.max_length = max_length
        self.items_to_replace = set([101, 102])

        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentences)
        
    def __getitem__(self, idx):
        
        sentence = self.sentences[idx] # Get a sentence
        aspect_tags = self.aspect_tags[idx] # Get the corresponding aspect tags
        polarity_tags = self.polarity_tags[idx] # Get the corresponding polarity tags
        
        sentence_encoding = self.tokenizer.encode_plus(
            sentence,
            max_length = self.max_length,
            return_token_type_ids = True,
            pad_to_max_length = True,
            return_attention_mask = True,
            return_tensor = 'pt',            
        )
        aspect_tags_encoding = self.tokenizer.encode_plus(
            aspect_tags,
            max_length = self.max_length,
            add_special_tokens = True,
            return_token_type_ids = False,
            pad_to_max_length = True,
            return_attention_mask = False,
            return_tensor = 'pt',            
        )     
        polarity_tags_encoding = self.tokenizer.encode_plus(
            polarity_tags,
            max_length = self.max_length,
            add_special_tokens = True,
            return_token_type_ids = False,
            pad_to_max_length = True,
            return_attention_mask = False,
            return_tensor = 'pt',            
        )     
        
        # To debug if there is any [UNK]
#         input_ids = sentence_encoding['input_ids']
#         attention_mask = np.logical_not(sentence_encoding['attention_mask'])
#         input_ids = np.ma.compressed(np.ma.masked_where(attention_mask, input_ids))        
#         token_list = tokenizer.convert_ids_to_tokens(input_ids)
#         found = -1
#         try:
#             found = token_list.index("[UNK]")
#         except ValueError:
#             found = -1
#         if found >= 0:
#             print(sentence)
#             print(token_list)
        
        
        # We are not learning the two BERT special characters, so replace them to '1' 
        # i.e. Not Aspect Terms
        aspect_tags_encoding['input_ids'] = [1 if x in self.items_to_replace 
                                             else x for x in aspect_tags_encoding['input_ids']]        
        
        # We are not learning the two BERT special characters, so replace them to '0' 
        # i.e. Neutral polarity
        polarity_tags_encoding['input_ids'] = [0 if x in self.items_to_replace 
                                               else x for x in polarity_tags_encoding['input_ids']]        
        
        return {
            "input_ids": torch.tensor(sentence_encoding['input_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(sentence_encoding['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(sentence_encoding['token_type_ids'], dtype=torch.long),
            "aspect_tags": torch.tensor(aspect_tags_encoding['input_ids'], dtype=torch.long),
            "polarity_tags": torch.tensor(polarity_tags_encoding['input_ids'], dtype=torch.long),
        }

def test_dataset():


    train_dataset = SentenceTagDataset(tokenizer=tokenizer,
                                       sentences=train_sentences,
                                       aspect_tags=train_aspect_tags,
                                       polarity_tags=train_polarity_tags)

    train_data_loader = DeviceDataLoader(torch.utils.data.DataLoader(
        train_dataset, batch_size=32), device)    

    print(train_dataset[0])
    


    data = train_dataset[0]
    input_ids = data['input_ids']
    attention_mask = np.logical_not(data['attention_mask'])
    aspect_tags = data['aspect_tags']
    polarity_tags = data['polarity_tags']
    input_ids = np.ma.compressed(np.ma.masked_where(attention_mask, input_ids))
    aspect_tags = np.ma.compressed(np.ma.masked_where(attention_mask, aspect_tags))
    polarity_tags = np.ma.compressed(np.ma.masked_where(attention_mask, polarity_tags))

    print(len(input_ids))
    # print(len(aspect_tags))
    # print(input_ids)
    # print(aspect_tags)

    # items_to_replace = set([101, 102])
    # aspect_tags = [1 if x in items_to_replace else x for x in aspect_tags]
    # print(aspect_tags)


    print(train_dataset.tokenizer.convert_ids_to_tokens(input_ids))
    print(encoder.inverse_transform(aspect_tags))  
    print(polarity_tags)  
    
    for batch in train_data_loader:
        print(batch)
        input_ids_list = batch['input_ids']
        print(input_ids_list.shape)
        break    

In [8]:
# test_dataset()

In [9]:
def loss_fn(output, target, mask, num_labels):
    cel = nn.CrossEntropyLoss()
    masking = mask.view(-1) == 1
    pred = output.view(-1, num_labels)
    true = torch.where(masking, target.view(-1), 
                       torch.tensor(cel.ignore_index).type_as(target))
    loss = cel(pred, true)
    return loss

class AspectExtractionModel(nn.Module):
    def __init__(self, num_aspect_tags, num_polarity_tags, num_vocab):
        super(AspectExtractionModel, self).__init__()
        self.num_aspect_tags = num_aspect_tags
        self.num_polarity_tags = num_polarity_tags
        self.bert_model = transformers.BertModel.from_pretrained("bert-base-cased")        
        self.dropout1 = nn.Dropout(0.3)
        self.fc1 = nn.Linear(768, self.num_aspect_tags)
        self.dropout2 = nn.Dropout(0.3)
        self.fc2 = nn.Linear(768, self.num_polarity_tags)
        # if the number of vocab has been increased, then need to add the new vector 
        # at the end of the embedding matrix
        self.bert_model.resize_token_embeddings(num_vocab)
                
    def forward(self, input_ids, attention_mask, token_type_ids, aspect_tags, polarity_tags):
        out, _ = self.bert_model(input_ids, attention_mask = attention_mask, 
                                 token_type_ids = token_type_ids, return_dict=False)
        
        tag_out = self.dropout1(out)
        tag_out = self.fc1(tag_out)
        
        pol_out = self.dropout2(out)
        pol_out = self.fc2(pol_out)
        
        loss_tag = loss_fn(tag_out, aspect_tags, attention_mask, self.num_aspect_tags)
        loss_pol = loss_fn(pol_out, polarity_tags, attention_mask, self.num_polarity_tags)
        loss = (loss_tag + loss_pol) / 2
        
        s = nn.Softmax(dim=2)
        
        tag_out = s(tag_out)
        pol_out = s(pol_out)
        
        return tag_out, pol_out, loss

In [10]:
def cal_acc(pred_tags, true_tags, mask):
    
    batch = pred_tags.shape[0]
    acc = 0
    for i in range(batch):
        pred_array = pred_tags[i].cpu().detach().numpy()
        true_array = true_tags[i].cpu().detach().numpy()
        mask_array = mask[i].cpu().detach().numpy()

        
        # when comparing the accuracy, only compare the portion without the padding
        # use the mask to remove the padding
        #
        # in Bert, mask is created with 0 for padding, so need to flip it around, so 1 
        # is padding (then we will can use numpy compressed to remove them later)
        mask_array = np.logical_not(mask_array)

        # Now, only the portion that the true sentence is left for pred and true to
        # calculate the accuracy
        pred_unpadded = np.ma.compressed(np.ma.masked_where(mask_array, pred_array))
        true_unpadded = np.ma.compressed(np.ma.masked_where(mask_array, true_array))
        
#         print('i: {}'.format(i))
#         print('pred_array: {}'.format(pred_array))
#         print('true_array: {}'.format(true_array))
#         print('mask_array: {}'.format(mask_array))
#         print('pred_masked: {}'.format(pred_masked))
#         print('true_masked: {}'.format(true_masked))
#         print('='*20)

        acc += np.sum(pred_unpadded == true_unpadded) / len(pred_unpadded)
    return acc / batch


In [11]:
(train_sentences, test_sentences, 
 train_aspect_tags, test_aspect_tags) = model_selection.train_test_split(
    sentences, aspect_tags, random_state = 42, test_size = TEST_SIZE)

(_, _, 
 train_polarity_tags, test_polarity_tags) = model_selection.train_test_split(
    sentences, polarity_tags, random_state = 42, test_size = TEST_SIZE)

train_sentences.shape, test_sentences.shape, train_aspect_tags.shape, test_aspect_tags.shape, train_polarity_tags.shape, test_polarity_tags.shape

((1580,), (396,), (1580,), (396,), (1580,), (396,))

In [12]:
print(train_sentences[78])
print(train_aspect_tags[78])
print(train_polarity_tags[78])

['Of', 'course', 'this', 'atmosphere', 'is', 'lacking', ',', 'but', 'what', 'do', 'you', 'expect', 'from', 'a', '24', 'hour', 'bagel', 'place', 'anyways', '?']
[1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1]
[0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [13]:
train_dataset = SentenceTagDataset(tokenizer=tokenizer, sentences=train_sentences, 
                                   aspect_tags=train_aspect_tags,
                                   polarity_tags=train_polarity_tags)
train_data_loader = DeviceDataLoader(torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE), device)    

test_dataset = SentenceTagDataset(tokenizer=tokenizer, sentences=test_sentences, 
                                  aspect_tags=test_aspect_tags,
                                  polarity_tags=test_polarity_tags)
test_data_loader = DeviceDataLoader(torch.utils.data.DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE), device)   

model = to_device(AspectExtractionModel(num_aspect_tags = NUM_ASPECT_TAGS, 
                                        num_polarity_tags = NUM_POLARITY_TAGS,
                                        num_vocab = len(tokenizer)), device)

print(model)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AspectExtractionModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30530, 768)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

In [14]:
torch.cuda.empty_cache()

num_train_steps = int(len(train_sentences) / TRAIN_BATCH_SIZE * NUM_EPOCHS)
optimizer = AdamW(model.parameters(), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=num_train_steps)

best_loss = np.inf

for epoch in range(NUM_EPOCHS):
    train_losses = []
    train_acc = []
    test_loss = []
    test_acc = []

    model.train()
    for data in tqdm(train_data_loader, total=len(train_data_loader)):
        optimizer.zero_grad()
        pred_aspect_tags, pred_polarity_tags, loss = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        train_losses.append(loss.item())

        true_aspect_tags = data['aspect_tags']
        true_polarity_tags = data['polarity_tags']
        mask = data['attention_mask']
        aspect_acc = cal_acc(torch.argmax(pred_aspect_tags, dim=2), true_aspect_tags, mask)
        polarity_acc = cal_acc(torch.argmax(pred_polarity_tags, dim=2), true_polarity_tags, mask)
        avg_acc = (aspect_acc + polarity_acc) / 2
        train_acc.append(avg_acc)
        
    model.eval()
    for data in tqdm(test_data_loader, total=len(test_data_loader)):
        pred_aspect_tags, pred_polarity_tags, loss = model(**data)
        
        test_loss.append(loss.item())
        
        true_aspect_tags = data['aspect_tags']
        true_polarity_tags = data['polarity_tags']
        mask = data['attention_mask']
        aspect_acc = cal_acc(torch.argmax(pred_aspect_tags, dim=2), true_aspect_tags, mask)
        polarity_acc = cal_acc(torch.argmax(pred_polarity_tags, dim=2), true_polarity_tags, mask)
        avg_acc = (aspect_acc + polarity_acc) / 2
        test_acc.append(avg_acc)
        
    avg_train_loss = sum(train_losses) / len(train_losses)
    avg_train_acc = sum(train_acc) / len(train_acc)
    avg_test_loss = sum(test_loss) / len(test_loss)
    avg_test_acc = sum(test_acc) / len(test_acc)
        
    print("Train acc: {:.2f}%; Valid acc: {:.2f}%".format(avg_train_acc*100, avg_test_acc*100))
    print("Train Loss: {:.5f}; Valid Loss: {:.5f}".format(avg_train_loss, avg_test_loss))
    
    if avg_test_loss < best_loss:
        torch.save(model.state_dict(), MODEL_PATH)
        best_loss = avg_test_loss    
    

  0%|          | 0/50 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 50/50 [00:26<00:00,  1.92it/s]
100%|██████████| 50/50 [00:02<00:00, 18.93it/s]


Train acc: 89.71%; Valid acc: 94.25%
Train Loss: 0.27639; Valid Loss: 0.14560


100%|██████████| 50/50 [00:26<00:00,  1.90it/s]
100%|██████████| 50/50 [00:02<00:00, 18.55it/s]


Train acc: 94.87%; Valid acc: 95.08%
Train Loss: 0.13026; Valid Loss: 0.12790


100%|██████████| 50/50 [00:26<00:00,  1.86it/s]
100%|██████████| 50/50 [00:02<00:00, 18.27it/s]


Train acc: 96.03%; Valid acc: 95.45%
Train Loss: 0.10214; Valid Loss: 0.12635


100%|██████████| 50/50 [00:27<00:00,  1.84it/s]
100%|██████████| 50/50 [00:02<00:00, 17.31it/s]


Train acc: 96.67%; Valid acc: 95.37%
Train Loss: 0.08446; Valid Loss: 0.12842


100%|██████████| 50/50 [00:27<00:00,  1.82it/s]
100%|██████████| 50/50 [00:02<00:00, 17.91it/s]


Train acc: 97.04%; Valid acc: 95.65%
Train Loss: 0.07502; Valid Loss: 0.12465


In [15]:
def random_test(test_dataset, test_data_loader, model, num=5, model_path=None):
    if model_path is not None: # load the saved model
        print('Loading saved model from: {}'.format(model_path))
        model.load_state_dict(torch.load(model_path))
    model = to_device(model, device)
    
    with torch.no_grad():
        for i in range(num):
            
            data = next(iter(test_data_loader))
            
            pred_aspect_tags, pred_polarity_tags, _ = model(**data)
            
            
            input_ids = data['input_ids']
            pred_aspect_tags = torch.argmax(pred_aspect_tags, dim=2)
            pred_polarity_tags = torch.argmax(pred_polarity_tags, dim=2)
            true_aspect_tags = data['aspect_tags']
            true_polarity_tags = data['polarity_tags']
            mask = data['attention_mask']
            
            # Randomly pick a test data from this batch
            #
            idx = np.random.randint(0,pred_aspect_tags.shape[0],size=1)[0]

            ids_array = input_ids[idx].cpu().numpy()
            pred_aspect_array = pred_aspect_tags[idx].cpu().numpy()
            true_aspect_array = true_aspect_tags[idx].cpu().numpy()
            pred_polarity_array = pred_polarity_tags[idx].cpu().numpy()
            true_polarity_array = true_polarity_tags[idx].cpu().numpy()
            mask_array = mask[idx].cpu().numpy()

            # Remove the padding as we do not want to print them
            #
            mask_array = np.logical_not(mask_array)

            # Only print the unpadded portion
            ids_unpadded = np.ma.compressed(np.ma.masked_where(mask_array, ids_array))
            pred_aspect_unpadded = np.ma.compressed(np.ma.masked_where(mask_array, 
                                                                       pred_aspect_array))
            true_aspect_unpadded = np.ma.compressed(np.ma.masked_where(mask_array, 
                                                                       true_aspect_array))
            pred_polarity_unpadded = np.ma.compressed(np.ma.masked_where(mask_array, 
                                                                         pred_polarity_array))
            true_polarity_unpadded = np.ma.compressed(np.ma.masked_where(mask_array, 
                                                                         true_polarity_array))
            
            aspect_acc = np.sum(pred_aspect_unpadded == 
                         true_aspect_unpadded) / len(pred_aspect_unpadded)
            polarity_acc = np.sum(pred_polarity_unpadded == 
                                  true_polarity_unpadded) / len(pred_polarity_unpadded)
            
            # let's replace 2 back to -1 for presentation
            pred_polarity_unpadded = np.where(pred_polarity_unpadded == 2, -1, 
                                              pred_polarity_unpadded)
            true_polarity_unpadded = np.where(true_polarity_unpadded == 2, -1, 
                                              true_polarity_unpadded)

            print("Aspect Acc: {:.2f}%".format(aspect_acc*100))
            print("Polarity Acc: {:.2f}%".format(polarity_acc*100))
            print("Predicted Aspect:")
            print(encoder.inverse_transform(pred_aspect_unpadded))
            print("True Aspect:")
            print(encoder.inverse_transform(true_aspect_unpadded))
            print("Predicted Polarity:")
            print(pred_polarity_unpadded)
            print("True Polarity:")
            print(true_polarity_unpadded)
            print("Sentence:")
            print(test_dataset.tokenizer.convert_ids_to_tokens(ids_unpadded))            
            print()


In [16]:
random_test(test_dataset, test_data_loader, model, num=10, model_path=MODEL_PATH)


Loading saved model from: model.bin
Aspect Acc: 93.94%
Polarity Acc: 93.94%
Predicted Aspect:
['NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'AT'
 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT'
 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'AT' 'NAT' 'NAT']
True Aspect:
['NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT'
 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'NAT'
 'NAT' 'NAT' 'NAT' 'NAT' 'NAT' 'AT' 'AT' 'NAT' 'NAT']
Predicted Polarity:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
True Polarity:
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0]
Sentence:
['[CLS]', 'As', 'soon', 'as', 'I', 'wake', 'up', 'on', 'a', 'saturday', 'or', 'sunday', 'it', 'is', 'the', 'first', 'thing', 'on', 'my', 'mind', 'is', 'when', 'and', 'how', 'I', 'will', 'be', 'getting', 'to', 'fried', 'dumpling', '.', '[SEP]']

Aspect Acc: 100.00%
Polarity Acc: 100.00%
Predicted Aspect:
['NAT' 