In [2]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn import preprocessing, model_selection

from transformers import BertModel, BertTokenizer

from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertModel, BertTokenizer
import transformers

In [3]:
# Location of the training CSV. The code below reads its "num", "text" and
# "aspect_tag" columns.
path = 'data/restaurants_laptop_train_with_pos.csv'
df = pd.read_csv(path)

# Swap the aspect-tag labels for integer class ids. `encoder` stays around so
# predictions can be mapped back to the original labels with
# `encoder.inverse_transform`.
encoder = preprocessing.LabelEncoder()
df.loc[:, "aspect_tag"] = encoder.fit_transform(df["aspect_tag"])
print('num of aspect tags: {}'.format(len(encoder.classes_)))

# Collect the rows that share a "num" value into one list per group
# (presumably one sentence per "num" and one token per row - confirm against
# the CSV). Both arrays come from the same grouping, so entry i of `sentences`
# lines up with entry i of `aspect_tags`.
sentences, aspect_tags = (
    df.groupby("num")[column].apply(list).values
    for column in ("text", "aspect_tag")
)

num of aspect tags: 2


In [4]:
def get_default_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    return torch.device(device_name)
    
def to_device(data, device):
    """Move tensor(s) to the chosen device.

    Args:
        data: a tensor / module (anything exposing ``.to``), or an arbitrarily
            nested list, tuple or dict of such objects.
        device: target ``torch.device`` (or device string).

    Returns:
        The moved object. Lists and tuples come back as a ``list``; a dict
        comes back as a *new* dict with the same keys, so the caller's dict is
        left untouched.
    """
    if isinstance(data, (list, tuple)):
        return [to_device(x, device) for x in data]
    if isinstance(data, dict):
        # Recurse so nested containers work and every leaf is moved through
        # the same `.to(..., non_blocking=True)` path as a bare tensor.
        return {k: to_device(v, device) for k, v in data.items()}
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Thin wrapper that hands out the batches of `dl` after moving them to `device`."""

    def __init__(self, dl, device):
        self.dl, self.device = dl, device

    def __iter__(self):
        """Lazily yield each batch of the wrapped loader, transferred via `to_device`."""
        yield from (to_device(batch, self.device) for batch in self.dl)

    def __len__(self):
        """Number of batches reported by the wrapped loader."""
        return len(self.dl)

In [5]:

# --- Run configuration -------------------------------------------------------
NUM_EPOCHS = 5            # passes over the training loader in the training loop
TRAIN_BATCH_SIZE = 32     # batch size of the training DataLoader
TEST_BATCH_SIZE = 8       # batch size of the held-out DataLoader
TEST_SIZE = 0.2           # `test_size` handed to train_test_split
MODEL_PATH = "model.bin"  # where the best state_dict is written / read back

# Device every batch and the model are moved to.
device = get_default_device()
print(device)

# One output class per label seen by the LabelEncoder.
NUM_ASPECT_TAGS = len(encoder.classes_)

tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

cuda


In [6]:
class SentenceTagDataset(Dataset):
    """Dataset of (token list, per-token aspect tag list) pairs.

    Each item is a dict of four 1-D ``torch.long`` tensors of length
    ``max_length``: ``input_ids``, ``attention_mask``, ``token_type_ids`` and
    ``aspect_tags``.

    Args:
        tokenizer: object exposing ``encode_plus`` (a BERT tokenizer in this
            notebook).
        sentences: indexable collection; entry ``i`` is the list of tokens of
            sentence ``i``.
        aspect_tags: indexable collection; entry ``i`` is the list of integer
            tag ids of sentence ``i``, one per token.
        max_length: fixed length every sequence is padded / truncated to,
            including the two special tokens added by the tokenizer.
    """

    # Label given to the two special-token positions (first and last real
    # position). As in the original code this is '1', i.e. "not an aspect term".
    SPECIAL_TOKEN_TAG = 1
    # Filler label for padded positions. Those positions have attention_mask 0
    # and are excluded from the loss / accuracy downstream.
    PAD_TAG = 0

    def __init__(self, tokenizer, sentences, aspect_tags, max_length=128):
        self.sentences = sentences
        self.aspect_tags = aspect_tags
        self.max_length = max_length
        # No longer read by this class (labels are not routed through the
        # tokenizer any more); kept so existing code that inspects the
        # attribute keeps working.
        self.items_to_replace = set([101, 102])

        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.sentences)

    def _encode_tags(self, aspect_tags):
        """Lay the tag ids out like the encoded sentence.

        Result: special-token label, the tags (cut from the right so that the
        two special positions still fit), special-token label, then padding up
        to ``max_length``.
        """
        room = max(self.max_length - 2, 0)
        core = [int(tag) for tag in aspect_tags][:room]
        labels = [self.SPECIAL_TOKEN_TAG] + core + [self.SPECIAL_TOKEN_TAG]
        labels.extend([self.PAD_TAG] * (self.max_length - len(labels)))
        return labels

    def __getitem__(self, idx):
        sentence = self.sentences[idx]  # Get a sentence
        aspect_tags = self.aspect_tags[idx]  # Get the corresponding aspect tags

        # `padding='max_length'` replaces the deprecated `pad_to_max_length`
        # and `truncation=True` makes the (previously implicit) truncation
        # explicit. No `return_tensors` is requested on purpose: plain lists
        # are converted to 1-D tensors below so the DataLoader can stack them.
        # NOTE(review): the label layout assumes the tokenizer emits exactly
        # one id per list element plus one leading and one trailing special
        # token - confirm if the tokenizer is ever swapped.
        sentence_encoding = self.tokenizer.encode_plus(
            sentence,
            max_length=self.max_length,
            truncation=True,
            padding='max_length',
            return_token_type_ids=True,
            return_attention_mask=True,
        )

        return {
            "input_ids": torch.tensor(sentence_encoding['input_ids'], dtype=torch.long),
            "attention_mask": torch.tensor(sentence_encoding['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(sentence_encoding['token_type_ids'], dtype=torch.long),
            "aspect_tags": torch.tensor(self._encode_tags(aspect_tags), dtype=torch.long),
        }

def test_dataset():
    """Smoke-test `SentenceTagDataset` and `DeviceDataLoader` on the training split.

    Prints the first training item, its unpadded length, the tokens and
    decoded tags of its unpadded part, and finally the first batch together
    with the shape of its `input_ids`.

    Relies on the notebook globals `tokenizer`, `train_sentences`,
    `train_aspect_tags`, `encoder` and `device`.

    NOTE: a later cell assigns a dataset instance to the name `test_dataset`,
    which hides this function; call it before that assignment runs.
    """
    train_dataset = SentenceTagDataset(tokenizer=tokenizer,
                                       sentences=train_sentences,
                                       aspect_tags=train_aspect_tags)

    train_data_loader = DeviceDataLoader(torch.utils.data.DataLoader(
        train_dataset, batch_size=32), device)

    data = train_dataset[0]
    print(data)

    # attention_mask is 1 on real tokens and 0 on padding; keep the real part.
    keep = data['attention_mask'].numpy().astype(bool)
    input_ids = data['input_ids'].numpy()[keep]
    aspect_tags = data['aspect_tags'].numpy()[keep]

    print(len(input_ids))
    print(train_dataset.tokenizer.convert_ids_to_tokens(input_ids))
    print(encoder.inverse_transform(aspect_tags))

    # Only the first batch is needed to check batching and device transfer.
    for batch in train_data_loader:
        print(batch)
        input_ids_list = batch['input_ids']
        print(input_ids_list.shape)
        break

In [7]:
def loss_fn(output, target, mask, num_labels):
    """Cross-entropy over the positions whose mask value is 1.

    Args:
        output: scores, reshaped here to (-1, num_labels).
        target: integer labels, flattened here to 1-D.
        mask: flattened here to 1-D; positions equal to 1 take part in the loss.
        num_labels: size of the class dimension of `output`.

    Returns:
        The scalar loss tensor produced by `nn.CrossEntropyLoss`.
    """
    criterion = nn.CrossEntropyLoss()

    logits = output.view(-1, num_labels)
    flat_target = target.view(-1)
    is_active = mask.view(-1) == 1

    # Every other position receives the criterion's `ignore_index`, so it is
    # left out of the loss.
    ignored = torch.tensor(criterion.ignore_index).type_as(target)
    labels = torch.where(is_active, flat_target, ignored)

    return criterion(logits, labels)

class AspectExtractionModel(nn.Module):
    """Pretrained "bert-base-cased" encoder followed by dropout and a linear tag head.

    Args:
        num_aspect_tags: number of output classes of the linear layer.
    """

    def __init__(self, num_aspect_tags):
        super().__init__()
        self.num_aspect_tags = num_aspect_tags
        self.bert_model = transformers.BertModel.from_pretrained("bert-base-cased")
        self.dropout = nn.Dropout(0.3)
        # 768 input features, matching the width shown for this checkpoint's
        # layers in the printed model summary.
        self.fc = nn.Linear(768, self.num_aspect_tags)

    def forward(self, input_ids, attention_mask, token_type_ids, aspect_tags):
        """Run the encoder and the tag head.

        Returns:
            (probabilities, loss): the softmax over dim 2 of the tag scores,
            and the loss from `loss_fn`, which is computed on the raw scores
            with `attention_mask` selecting the positions that count.
        """
        # With return_dict=False the encoder returns a tuple; only its first
        # element is used (presumably the per-token hidden states - confirm).
        sequence_output, _ = self.bert_model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        tag_scores = self.fc(self.dropout(sequence_output))

        loss_tag = loss_fn(tag_scores, aspect_tags, attention_mask, self.num_aspect_tags)

        tag_probs = nn.Softmax(dim=2)(tag_scores)
        return tag_probs, loss_tag

In [8]:
def cal_acc(pred_tags, true_tags, mask):
    """Mean per-sample accuracy over the unpadded positions.

    Args:
        pred_tags: tensor of predicted tag ids, first dimension = batch.
        true_tags: tensor of reference tag ids, same shape as `pred_tags`.
        mask: tensor of the same shape; non-zero marks a real position and
            zero marks padding (the BERT attention-mask convention).

    Returns:
        The average, over the samples that contain at least one real
        position, of the fraction of real positions predicted correctly.
        Returns 0.0 when the batch is empty or every sample is fully padded,
        where a division by zero would otherwise occur.
    """
    batch = pred_tags.shape[0]
    if batch == 0:
        return 0.0

    pred_array = pred_tags.detach().cpu().numpy()
    true_array = true_tags.detach().cpu().numpy()
    keep = mask.detach().cpu().numpy().astype(bool)

    acc = 0.0
    counted = 0
    for i in range(batch):
        n_real = int(keep[i].sum())
        if n_real == 0:
            # Nothing to score in this sample; skipping it avoids a 0/0 that
            # would turn the whole batch accuracy into NaN.
            continue
        # Compare only the unpadded part of the sequence.
        hits = np.sum(pred_array[i][keep[i]] == true_array[i][keep[i]])
        acc += hits / n_real
        counted += 1

    if counted == 0:
        return 0.0
    return acc / counted


In [9]:
def random_test(test_dataset, test_data_loader, model, num=5, model_path=None):
    """Print predictions for `num` randomly picked held-out samples.

    Args:
        test_dataset: dataset whose `tokenizer` attribute is used to turn
            token ids back into tokens for display.
        test_data_loader: iterable of batches (dicts accepted by `model(**batch)`
            containing 'input_ids', 'aspect_tags' and 'attention_mask').
        model: the tagging model; it is moved to the notebook-level `device`.
        num: number of samples to print. One sample is drawn from each
            successive batch, restarting the loader when it runs out.
        model_path: optional path of a saved state_dict to load first.

    The model is switched to eval mode for the duration of the call, so
    dropout is off, and its previous train/eval mode is restored afterwards.
    """
    if model_path is not None:  # load the saved model
        print('Loading saved model from: {}'.format(model_path))
        # map_location lets weights saved from a GPU load on a CPU-only box.
        model.load_state_dict(torch.load(model_path, map_location=device))
    model = to_device(model, device)

    if len(test_data_loader) == 0:
        return

    was_training = model.training
    model.eval()
    try:
        batch_iter = iter(test_data_loader)
        with torch.no_grad():
            for _ in range(num):
                # Advance through the loader instead of re-creating it each
                # time, which would always return the very first batch.
                try:
                    data = next(batch_iter)
                except StopIteration:
                    batch_iter = iter(test_data_loader)
                    data = next(batch_iter)

                pred_probs, _ = model(**data)
                pred_tags = torch.argmax(pred_probs, dim=2)

                # Randomly pick one sample from this batch
                idx = np.random.randint(pred_tags.shape[0])

                # Drop the padding: only the real tokens are scored / printed.
                keep = data['attention_mask'][idx].cpu().numpy().astype(bool)
                ids_unpadded = data['input_ids'][idx].cpu().numpy()[keep]
                pred_unpadded = pred_tags[idx].cpu().numpy()[keep]
                true_unpadded = data['aspect_tags'][idx].cpu().numpy()[keep]

                if len(pred_unpadded) == 0:
                    # Fully masked sample: nothing to score or show.
                    continue

                acc = np.sum(pred_unpadded == true_unpadded) / len(pred_unpadded)

                print("Acc: {:.2f}%".format(acc*100))
                print("Predicted:")
                print(encoder.inverse_transform(pred_unpadded))
                print("True:")
                print(encoder.inverse_transform(true_unpadded))
                print("Sentence:")
                print(test_dataset.tokenizer.convert_ids_to_tokens(ids_unpadded))
                print()
    finally:
        model.train(was_training)

In [10]:
# Hold out TEST_SIZE of the sentences; the fixed random_state keeps the split
# identical from run to run.
(train_sentences, test_sentences,
 train_aspect_tags, test_aspect_tags) = model_selection.train_test_split(
    sentences, aspect_tags, random_state=42, test_size=TEST_SIZE)

train_dataset = SentenceTagDataset(tokenizer=tokenizer, sentences=train_sentences,
                                   aspect_tags=train_aspect_tags)
# shuffle=True: draw the training batches in a fresh order every epoch instead
# of replaying the exact same batches each time.
train_data_loader = DeviceDataLoader(torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True), device)

# NOTE: from here on the name `test_dataset` refers to this dataset instance
# and no longer to the `test_dataset()` helper function defined earlier.
test_dataset = SentenceTagDataset(tokenizer=tokenizer, sentences=test_sentences,
                                  aspect_tags=test_aspect_tags)
# The evaluation loader keeps its fixed order.
test_data_loader = DeviceDataLoader(torch.utils.data.DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE), device)

model = to_device(AspectExtractionModel(num_aspect_tags=NUM_ASPECT_TAGS), device)
print(model)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AspectExtractionModel(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [None]:
torch.cuda.empty_cache()

# One optimizer/scheduler step is taken per training batch, so the schedule
# length is exactly (batches per epoch) * (epochs). Deriving it from the
# sentence count with int() can undercount and drive the learning rate to
# zero before training ends.
num_train_steps = len(train_data_loader) * NUM_EPOCHS
optimizer = AdamW(model.parameters(), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=num_train_steps)

# Lowest held-out loss seen so far; decides when to checkpoint.
best_loss = np.inf

for epoch in range(NUM_EPOCHS):
    train_losses = []
    train_acc = []
    test_loss = []
    test_acc = []

    # ---- training pass ----
    model.train()
    for data in tqdm(train_data_loader, total=len(train_data_loader)):
        optimizer.zero_grad()
        pred_tags, loss = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()

        train_losses.append(loss.item())

        true_tags = data['aspect_tags']
        mask = data['attention_mask']
        train_acc.append(cal_acc(torch.argmax(pred_tags, dim=2), true_tags, mask))

    # ---- evaluation pass ----
    # eval() turns dropout off; no_grad() stops autograd from recording the
    # forward passes, which are never back-propagated here.
    model.eval()
    with torch.no_grad():
        for data in tqdm(test_data_loader, total=len(test_data_loader)):
            pred_tags, loss = model(**data)

            test_loss.append(loss.item())

            true_tags = data['aspect_tags']
            mask = data['attention_mask']
            test_acc.append(cal_acc(torch.argmax(pred_tags, dim=2), true_tags, mask))

    avg_train_loss = sum(train_losses) / len(train_losses)
    avg_train_acc = sum(train_acc) / len(train_acc)
    avg_test_loss = sum(test_loss) / len(test_loss)
    avg_test_acc = sum(test_acc) / len(test_acc)

    print("Train acc: {:.2f}%; Valid acc: {:.2f}%".format(avg_train_acc*100, avg_test_acc*100))
    print("Train Loss: {:.5f}; Valid Loss: {:.5f}".format(avg_train_loss, avg_test_loss))

    # Keep only the weights with the lowest held-out loss.
    if avg_test_loss < best_loss:
        torch.save(model.state_dict(), MODEL_PATH)
        best_loss = avg_test_loss
    

  0%|          | 0/50 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 50/50 [00:25<00:00,  1.93it/s]
100%|██████████| 50/50 [00:02<00:00, 19.24it/s]


Train acc: 90.48%; Valid acc: 96.07%
Train Loss: 0.22160; Valid Loss: 0.10508


100%|██████████| 50/50 [00:26<00:00,  1.90it/s]
100%|██████████| 50/50 [00:02<00:00, 18.78it/s]


Train acc: 96.73%; Valid acc: 96.61%
Train Loss: 0.08848; Valid Loss: 0.08943


100%|██████████| 50/50 [00:27<00:00,  1.85it/s]
100%|██████████| 50/50 [00:02<00:00, 18.43it/s]


Train acc: 97.88%; Valid acc: 96.95%
Train Loss: 0.05801; Valid Loss: 0.09264


100%|██████████| 50/50 [00:27<00:00,  1.84it/s]
100%|██████████| 50/50 [00:02<00:00, 18.46it/s]


Train acc: 98.58%; Valid acc: 96.86%
Train Loss: 0.04151; Valid Loss: 0.09407


 40%|████      | 20/50 [00:10<00:16,  1.81it/s]

In [None]:
# Reload the checkpoint saved at MODEL_PATH and print 10 sample predictions
# from the held-out loader.
random_test(
    test_dataset=test_dataset,
    test_data_loader=test_data_loader,
    model=model,
    num=10,
    model_path=MODEL_PATH,
)
