In [3]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn import preprocessing, model_selection
from sklearn.metrics import f1_score, confusion_matrix, classification_report

from transformers import BertModel, BertTokenizer

from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import BertModel, BertTokenizer
from transformers import AutoModel, AutoTokenizer
import transformers

import time
import matplotlib.pyplot as plt

from torchvision.transforms import ToTensor
import math

In [4]:
# path = '../data/restaurants_laptop_train_with_pos.csv'
path = '../data/restaurants_laptop_train_with_pos_cleaned.csv'

df = pd.read_csv(path)

# df = df[:1000]

# replace all -1 to 2 since pytorch cannot handle negative
# so, 2 now means negative polarity
df.polarity = df.polarity.replace(-1,2)

encoder = preprocessing.LabelEncoder()
df.loc[:, "aspect_tag"] = encoder.fit_transform(df["aspect_tag"])

sentences = df.groupby("num")["text"].apply(list).values
aspect_tags = df.groupby("num")["aspect_tag"].apply(list).values
polarity_tags = df.groupby("num")["polarity"].apply(list).values

polarity_unique_values = df.polarity.unique()

print('num of aspect tags: {}'.format(len(encoder.classes_)))
print('num of polarity tags: {}'.format(len(polarity_unique_values)))

np.where(encoder.classes_ == "AT")[0].item()

num of aspect tags: 2
num of polarity tags: 3


0

In [5]:
df

Unnamed: 0,num,text,pos,aspect_tag,polarity
0,s_1,I,PRON,1,0
1,s_1,charge,VERB,1,0
2,s_1,it,PRON,1,0
3,s_1,at,ADP,1,0
4,s_1,night,NOUN,1,0
...,...,...,...,...,...
55474,s_3432,and,CCONJ,1,0
55475,s_3432,rice,NOUN,0,0
55476,s_3432,and,CCONJ,1,0
55477,s_3432,glass,NOUN,0,0


In [6]:
# def get_a_set(idx, sentences, aspect_tags, polarity_tags):
#     return sentences[idx], aspect_tags[idx], polarity_tags[idx]

# sentence, aspect_tag, polarit_tag = get_a_set(0, sentences, aspect_tags, polarity_tags)

In [7]:
# sentence, aspect_tag, polarit_tag

In [8]:
def get_new_aspect_cluster(left, right, aspect_term, polarity, sentence):

    if len(polarity) == 0:
        polarity = 0
    else:
        polarity = int(sum(polarity)/len(polarity))
   
    left.extend(aspect_term)
    left.extend(right)
    return {
        "local_context":left,
        "global_context":sentence,
        "aspect_term":aspect_term,
        "polarity":polarity,
    }

def chop(sentence, aspect_tag, polarity_tag):
    ret_aspect_clusters = []
#     ret_aspect_clusters = {
#         "context":list(),
#         "aspect_term":list(),
#         "polarity":list(),
#     }
    left = []
    right = []
    aspect_term = []
    polarity = []
    doing_left = True
    doing_right = False
    doing_aspect = False
    for i in range(len(sentence)):
        # check if the current token is an aspect term
        if aspect_tag[i] == 0:
            if doing_left:
                doing_aspect = True
                doing_left = False
            elif doing_right:
                doing_right = False
                doing_aspect = True
                # Now, need to save the previous aspect term cluster
                ret_aspect_clusters.append(get_new_aspect_cluster(
                    left, right, aspect_term, polarity, sentence))
                left = right
                right = []
                aspect_term = []
                polarity = []
            aspect_term.append(sentence[i])
            polarity.append(polarity_tag[i])
        else:
            if doing_left:
                left.append(sentence[i])
            elif doing_right:
                right.append(sentence[i])
            else:
                doing_aspect = False
                doing_right = True
                right.append(sentence[i])
                
    ret_aspect_clusters.append(get_new_aspect_cluster(
        left, right, aspect_term, polarity, sentence))
    
    return ret_aspect_clusters

In [9]:
# chop(sentence, aspect_tag, polarit_tag)

In [10]:
all_aspect_clusters = []
for i in range(len(sentences)):
    all_aspect_clusters.extend(chop(sentences[i], aspect_tags[i], polarity_tags[i]))

In [11]:
len(all_aspect_clusters)

5733

In [12]:
all_aspect_clusters

[{'local_context': ['I',
   'charge',
   'it',
   'at',
   'night',
   'and',
   'skip',
   'taking',
   'the',
   'cord',
   'with',
   'me',
   'because',
   'of',
   'the',
   'good'],
  'global_context': ['I',
   'charge',
   'it',
   'at',
   'night',
   'and',
   'skip',
   'taking',
   'the',
   'cord',
   'with',
   'me',
   'because',
   'of',
   'the',
   'good',
   'battery',
   'life'],
  'aspect_term': ['cord'],
  'polarity': 0},
 {'local_context': ['with',
   'me',
   'because',
   'of',
   'the',
   'good',
   'battery',
   'life'],
  'global_context': ['I',
   'charge',
   'it',
   'at',
   'night',
   'and',
   'skip',
   'taking',
   'the',
   'cord',
   'with',
   'me',
   'because',
   'of',
   'the',
   'good',
   'battery',
   'life'],
  'aspect_term': ['battery', 'life'],
  'polarity': 1},
 {'local_context': ['However',
   'the',
   'multi',
   '-',
   'touch',
   'gestures',
   'and',
   'large'],
  'global_context': ['However',
   'the',
   'multi',
   '-',
   

In [13]:
def get_default_device():
    """Pick GPU if available, else CPU"""
    if torch.cuda.is_available():
        return torch.device('cuda')
    else:
        return torch.device('cpu')
    
def to_device(data, device):
    toTensor = ToTensor()
    """Move tensor(s) to chosen device"""
    if isinstance(data, (list,tuple)):
        return [to_device(x, device) for x in data]
    elif isinstance(data, dict):
        for k, v in data.items():
            data[k] = to_device(v, device)
        return data
    elif isinstance(data, str):
        return data
    return data.to(device, non_blocking=True)

class DeviceDataLoader():
    """Wrap a dataloader to move data to a device"""
    def __init__(self, dl, device):
        self.dl = dl
        self.device = device
        
    def __iter__(self):
        """Yield a batch of data after moving it to device"""
        for b in self.dl: 
            yield to_device(b, self.device)

    def __len__(self):
        """Number of batches"""
        return len(self.dl)

In [14]:

device = get_default_device()
print(device)

NUM_EPOCHS = 5
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8
MODEL_PATH = "model.aspect_clustering.chop.bin"
TEST_SIZE = 0.2

NUM_ASPECT_TAGS = len(encoder.classes_)
NUM_POLARITY_TAGS = len(polarity_unique_values)

# tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

cuda


In [15]:
class SentenceTagDataset(Dataset):
    def __init__(self, tokenizer, aspect_clusters, 
                 max_length=128):
        self.aspect_clusters = aspect_clusters
        self.max_length = max_length
        self.special_token = -100

        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.aspect_clusters)

    def __getitem__(self, idx):
        aspect_cluster = self.aspect_clusters[idx]  # Get a cluster
        l_context = aspect_cluster["local_context"]
        g_context = aspect_cluster["global_context"]
        aspect_term = aspect_cluster["aspect_term"]
        polarity = aspect_cluster["polarity"]

        l_context_encoding = self.tokenizer(
            l_context,
            is_split_into_words=True,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        g_context_encoding = self.tokenizer(
            g_context,
            is_split_into_words=True,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return {
            "l_context": l_context_encoding["input_ids"][0],
            "l_attention_mask": l_context_encoding["attention_mask"][0],
            "l_token_type_ids": l_context_encoding["token_type_ids"][0],
            "g_context": g_context_encoding["input_ids"][0],
            "g_attention_mask": g_context_encoding["attention_mask"][0],
            "g_token_type_ids": g_context_encoding["token_type_ids"][0],
#             "aspect_term": aspect_term,
            "polarity": torch.tensor(polarity, dtype=torch.long),
        }
    


In [16]:
def loss_fn(output, target, mask, num_labels):
    cel = nn.CrossEntropyLoss()
    masking = mask.view(-1) == 1
    pred = output.view(-1, num_labels)
    true = torch.where(masking, target.view(-1), 
                       torch.tensor(cel.ignore_index).type_as(target))
    loss = cel(pred, true)
    return loss

def scaled_dot_product(q, k, v, mask=None):
    d_k = q.size()[-1]
    attn_logits = torch.matmul(q, k.transpose(-2, -1))
    attn_logits = attn_logits / math.sqrt(d_k)
    if mask is not None:
        attn_logits = attn_logits.masked_fill(mask == 0, -9e15)
    attention = F.softmax(attn_logits, dim=-1)
    values = torch.matmul(attention, v)
    return values, attention

class PolarityExtractionModel(nn.Module):
    def __init__(self, num_polarity_tags, alpha=0.5):
        super(PolarityExtractionModel, self).__init__()
        self.num_polarity_tags = num_polarity_tags
        self.bert_model1 = transformers.BertModel.from_pretrained("bert-base-cased")        
        self.bert_model2 = transformers.BertModel.from_pretrained("bert-base-cased")        
        self.dropout = nn.Dropout(0.3)
        # self.bert.config.hidden_size is 768
        self.fc = nn.Linear(768, self.num_polarity_tags)
        
        
        self.qkv_proj = nn.Linear(12, 768)
        self.o_proj = nn.Linear(768, 768)
        
        self.pool = nn.AvgPool1d(1)

    def forward(self, l_context, l_attention_mask, l_token_type_ids, 
                g_context, g_attention_mask, g_token_type_ids):
        _, l_pool_out = self.bert_model1(l_context, attention_mask = l_attention_mask, 
                                 token_type_ids = l_token_type_ids, return_dict=False)
        _, g_pool_out = self.bert_model2(g_context, attention_mask = g_attention_mask, 
                                 token_type_ids = g_token_type_ids, return_dict=False)
        
        avg_pool = torch.div(torch.add(l_pool_out * (1 - alpha), g_pool_out * alpha), 2)
        
#         print(seq_out.shape)
        
#         batch_size, seq_length, embed_dim = seq_out.size()
        
#         print(batch_size, seq_length, embed_dim)
        
#         qkv = seq_out.reshape(batch_size, seq_length, 12, int(embed_dim/12))
#         qkv = qkv.permute(0, 2, 1, 3) # [Batch, Head, SeqLen, Dims]
#         q, k, v = qkv.chunk(3, dim=-1)
        
#         # Determine value outputs
#         values, attention = scaled_dot_product(q, k, v, mask=None)
#         values = values.permute(0, 2, 1, 3) # [Batch, SeqLen, Head, Dims]
#         values = values.reshape(batch_size, seq_length, embed_dim)

        out = self.dropout(avg_pool)
#         out = self.pool(seq_out)
#         out = self.dropout(out)
        out = self.fc(out)
        
#         loss_tag = loss_fn(tag_out, aspect_tags, attention_mask, self.num_aspect_tags)
#         loss_pol = loss_fn(pol_out, polarity_tags, attention_mask, self.num_polarity_tags)
#         loss = (loss_tag + loss_pol) / 2

        s = nn.Softmax(dim=1)
    
        out = s(out)
        
        return out

In [17]:
train_aspect_clusters, test_aspect_clusters = model_selection.train_test_split(
    all_aspect_clusters, random_state = 42, test_size = TEST_SIZE)
len(train_aspect_clusters), len(test_aspect_clusters)

(4586, 1147)

In [18]:
train_dataset = SentenceTagDataset(tokenizer=tokenizer, aspect_clusters=train_aspect_clusters)
# train_data_loader = DeviceDataLoader(torch.utils.data.DataLoader(
#     train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True), device)    
train_data_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True)

test_dataset =SentenceTagDataset(tokenizer=tokenizer, aspect_clusters=test_aspect_clusters)
# test_data_loader = DeviceDataLoader(torch.utils.data.DataLoader(
#     test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=True), device)   
test_data_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=True)

# model = to_device(PolarityExtractionModel(num_polarity_tags = NUM_POLARITY_TAGS), device)
# print(model)

In [19]:
def train_one_model(train_data_loader, test_data_loader, alpha=0.5):
    
    best_valid_acc = np.inf * -1
    best_valid_f1 = np.inf * -1
    
    model = to_device(PolarityExtractionModel(num_polarity_tags = NUM_POLARITY_TAGS, alpha=alpha), device)
    
    torch.cuda.empty_cache()

#     NUM_EPOCHS = 1

    num_train_steps = int(len(train_aspect_clusters) / TRAIN_BATCH_SIZE * NUM_EPOCHS)
    optimizer = AdamW(model.parameters(), lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps=0, 
                                                num_training_steps=num_train_steps)
    loss_fn = nn.CrossEntropyLoss()

    best_loss = np.inf

    history = {
        "train_loss": list(),
        "train_acc": list(),
        "train_f1": list(),
        "valid_loss": list(),
        "valid_acc": list(),
        "valid_f1": list(),
    }

    for epoch in range(NUM_EPOCHS):

        train_losses = []
        valid_losses = []

        model.train()
        final_pred_polarity_tags = []
        final_true_polarity_tags = []
        for data in tqdm(train_data_loader, total=len(train_data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)
            optimizer.zero_grad()
    #         print(data)
            l_context = data['l_context']
            l_attention_mask = data['l_attention_mask']
            l_token_type_ids = data['l_token_type_ids']
            g_context = data['g_context']
            g_attention_mask = data['g_attention_mask']
            g_token_type_ids = data['g_token_type_ids']
            true_polarity = data['polarity']

            optimizer.zero_grad()
            outputs = model(l_context, l_attention_mask, l_token_type_ids,
                            g_context, g_attention_mask, g_token_type_ids)

            _, pred_polarity = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, true_polarity)

            train_losses.append(loss.item())
            final_pred_polarity_tags.append(pred_polarity)
            final_true_polarity_tags.append(true_polarity)

            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = np.mean(train_losses)
        # flatten the list of tensors
        final_pred_polarity_tags = torch.cat(final_pred_polarity_tags, 0)
        final_true_polarity_tags = torch.cat(final_true_polarity_tags, 0)
        # calculate the accuracy
        avg_train_acc = torch.sum(final_pred_polarity_tags == 
                                  final_true_polarity_tags) / final_true_polarity_tags.numel()
        train_f1 = f1_score(final_true_polarity_tags.cpu().numpy(), 
                            final_pred_polarity_tags.cpu().numpy(), average='weighted')

        model.eval()
        final_pred_polarity_tags = []
        final_true_polarity_tags = []
        for data in tqdm(test_data_loader, total=len(test_data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)
            l_context = data['l_context']
            l_attention_mask = data['l_attention_mask']
            l_token_type_ids = data['l_token_type_ids']
            g_context = data['g_context']
            g_attention_mask = data['g_attention_mask']
            g_token_type_ids = data['g_token_type_ids']
            true_polarity = data['polarity']

            outputs = model(l_context, l_attention_mask, l_token_type_ids,
                            g_context, g_attention_mask, g_token_type_ids)
            _, pred_polarity = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, true_polarity)

            valid_losses.append(loss.item())
            final_pred_polarity_tags.append(pred_polarity)
            final_true_polarity_tags.append(true_polarity)

        avg_valid_loss = np.mean(valid_losses)
        # flatten the list of tensors
        final_pred_polarity_tags = torch.cat(final_pred_polarity_tags, 0)
        final_true_polarity_tags = torch.cat(final_true_polarity_tags, 0)
        # calculate the accuracy
        avg_valid_acc = torch.sum(final_pred_polarity_tags == 
                                  final_true_polarity_tags) / final_true_polarity_tags.numel()
        valid_f1 = f1_score(final_true_polarity_tags.cpu().numpy(), 
                            final_pred_polarity_tags.cpu().numpy(), average='weighted')

        print("Train Loss: {:.5f}; Valid Loss: {:.5f}".format(avg_train_loss, avg_valid_loss))
        print("Train acc: {:.2f}%; Valid acc: {:.2f}%".format(avg_train_acc*100, avg_valid_acc*100))
        print("Train f1: {:.2f}%; Valid f1: {:.2f}%".format(train_f1*100, valid_f1*100))

        if avg_valid_loss < best_loss:
            torch.save(model.state_dict(), MODEL_PATH)
            best_loss = avg_valid_loss    

        history['train_loss'].append(avg_train_loss)
        history['train_acc'].append(avg_train_acc.cpu().numpy())
        history['train_f1'].append(train_f1)
        history['valid_loss'].append(avg_valid_loss)
        history['valid_acc'].append(avg_valid_acc.cpu().numpy())
        history['valid_f1'].append(valid_f1)
        
        if avg_valid_acc.cpu().numpy().item() > best_valid_acc:
            best_valid_acc = round(avg_valid_acc.cpu().numpy().item(), 4)
        if valid_f1 > best_valid_f1:
            best_valid_f1 = round(valid_f1, 4)
            
    return best_valid_acc, best_valid_f1
    

In [20]:
%%time

alpha_list = [0.5, 0.6, 0.7]

history = {
    "valid_acc": list(),
    "valid_f1": list(),
}

for alpha in alpha_list:
    torch.cuda.empty_cache()
    valid_acc, valid_f1 = train_one_model(train_data_loader, test_data_loader, alpha=alpha)
    history["valid_acc"].append(valid_acc)
    history["valid_f1"].append(valid_f1)


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_rela

Train Loss: 0.86911; Valid Loss: 0.79692
Train acc: 67.88%; Valid acc: 75.41%
Train f1: 63.51%; Valid f1: 74.60%


100%|██████████| 574/574 [03:17<00:00,  2.90it/s]
100%|██████████| 144/144 [00:15<00:00,  9.11it/s]


Train Loss: 0.77162; Valid Loss: 0.77029
Train acc: 77.74%; Valid acc: 77.68%
Train f1: 75.96%; Valid f1: 76.52%


100%|██████████| 574/574 [03:19<00:00,  2.88it/s]
100%|██████████| 144/144 [00:15<00:00,  9.00it/s]


Train Loss: 0.71634; Valid Loss: 0.76964
Train acc: 83.38%; Valid acc: 76.90%
Train f1: 82.57%; Valid f1: 76.58%


100%|██████████| 574/574 [03:20<00:00,  2.87it/s]
100%|██████████| 144/144 [00:16<00:00,  8.63it/s]


Train Loss: 0.67246; Valid Loss: 0.76624
Train acc: 88.53%; Valid acc: 77.77%
Train f1: 88.17%; Valid f1: 78.01%


100%|██████████| 574/574 [03:24<00:00,  2.81it/s]
100%|██████████| 144/144 [00:18<00:00,  7.91it/s]


Train Loss: 0.64108; Valid Loss: 0.76343
Train acc: 91.54%; Valid acc: 78.29%
Train f1: 91.34%; Valid f1: 78.39%


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_rela

Train Loss: 0.86486; Valid Loss: 0.79526
Train acc: 68.58%; Valid acc: 75.15%
Train f1: 62.87%; Valid f1: 71.70%


100%|██████████| 574/574 [03:21<00:00,  2.85it/s]
100%|██████████| 144/144 [00:17<00:00,  8.32it/s]


Train Loss: 0.76830; Valid Loss: 0.77502
Train acc: 77.85%; Valid acc: 77.68%
Train f1: 76.05%; Valid f1: 75.37%


100%|██████████| 574/574 [03:26<00:00,  2.78it/s]
100%|██████████| 144/144 [00:16<00:00,  8.90it/s]


Train Loss: 0.71308; Valid Loss: 0.76298
Train acc: 83.54%; Valid acc: 77.86%
Train f1: 82.63%; Valid f1: 77.39%


100%|██████████| 574/574 [03:26<00:00,  2.78it/s]
100%|██████████| 144/144 [00:16<00:00,  8.83it/s]


Train Loss: 0.66844; Valid Loss: 0.76383
Train acc: 88.73%; Valid acc: 77.86%
Train f1: 88.30%; Valid f1: 78.21%


100%|██████████| 574/574 [03:22<00:00,  2.83it/s]
100%|██████████| 144/144 [00:16<00:00,  8.84it/s]


Train Loss: 0.64002; Valid Loss: 0.75448
Train acc: 91.91%; Valid acc: 79.16%
Train f1: 91.67%; Valid f1: 79.11%


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_rela

Train Loss: 0.87296; Valid Loss: 0.79237
Train acc: 67.90%; Valid acc: 76.02%
Train f1: 63.25%; Valid f1: 73.51%


100%|██████████| 574/574 [03:27<00:00,  2.77it/s]
100%|██████████| 144/144 [00:16<00:00,  8.76it/s]


Train Loss: 0.76818; Valid Loss: 0.77496
Train acc: 77.95%; Valid acc: 77.59%
Train f1: 76.67%; Valid f1: 77.07%


100%|██████████| 574/574 [03:28<00:00,  2.75it/s]
100%|██████████| 144/144 [00:16<00:00,  8.91it/s]


Train Loss: 0.72153; Valid Loss: 0.78062
Train acc: 82.99%; Valid acc: 76.81%
Train f1: 82.27%; Valid f1: 76.14%


100%|██████████| 574/574 [03:22<00:00,  2.84it/s]
100%|██████████| 144/144 [00:16<00:00,  8.87it/s]


Train Loss: 0.69333; Valid Loss: 0.77190
Train acc: 85.80%; Valid acc: 77.24%
Train f1: 85.20%; Valid f1: 76.93%


100%|██████████| 574/574 [03:23<00:00,  2.82it/s]
100%|██████████| 144/144 [00:16<00:00,  8.76it/s]


Train Loss: 0.67000; Valid Loss: 0.76696
Train acc: 88.01%; Valid acc: 77.59%
Train f1: 87.53%; Valid f1: 77.18%
Wall time: 55min 34s


In [21]:
history

{'valid_acc': [0.7829, 0.7916, 0.7759],
 'valid_f1': [0.7839432363807616, 0.7910996768580242, 0.7718448251479557]}

0.7833

In [None]:
%%time

torch.cuda.empty_cache()

model = to_device(PolarityExtractionModel(num_polarity_tags = NUM_POLARITY_TAGS, alpha=0.6), device)

num_train_steps = int(len(train_aspect_clusters) / TRAIN_BATCH_SIZE * NUM_EPOCHS)
optimizer = AdamW(model.parameters(), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=num_train_steps)
loss_fn = nn.CrossEntropyLoss()

best_loss = np.inf

history = {
    "train_loss": list(),
    "train_acc": list(),
    "train_f1": list(),
    "valid_loss": list(),
    "valid_acc": list(),
    "valid_f1": list(),
}

# NUM_EPOCHS = 1

for epoch in range(NUM_EPOCHS):
    
    train_losses = []
    valid_losses = []
    
    model.train()
    final_pred_polarity_tags = []
    final_true_polarity_tags = []
    for data in tqdm(train_data_loader, total=len(train_data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        optimizer.zero_grad()
#         print(data)
        l_context = data['l_context']
        l_attention_mask = data['l_attention_mask']
        l_token_type_ids = data['l_token_type_ids']
        g_context = data['g_context']
        g_attention_mask = data['g_attention_mask']
        g_token_type_ids = data['g_token_type_ids']
        true_polarity = data['polarity']

        optimizer.zero_grad()
        outputs = model(l_context, l_attention_mask, l_token_type_ids,
                        g_context, g_attention_mask, g_token_type_ids)
        
        _, pred_polarity = torch.max(outputs, dim=1)
        
        loss = loss_fn(outputs, true_polarity)
        
        train_losses.append(loss.item())
        final_pred_polarity_tags.append(pred_polarity)
        final_true_polarity_tags.append(true_polarity)
        
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
                
    avg_train_loss = np.mean(train_losses)
    # flatten the list of tensors
    final_pred_polarity_tags = torch.cat(final_pred_polarity_tags, 0)
    final_true_polarity_tags = torch.cat(final_true_polarity_tags, 0)
    # calculate the accuracy
    avg_train_acc = torch.sum(final_pred_polarity_tags == 
                              final_true_polarity_tags) / final_true_polarity_tags.numel()
    train_f1 = f1_score(final_true_polarity_tags.cpu().numpy(), 
                        final_pred_polarity_tags.cpu().numpy(), average='weighted')
        
    model.eval()
    final_pred_polarity_tags = []
    final_true_polarity_tags = []
    for data in tqdm(test_data_loader, total=len(test_data_loader)):
        for k, v in data.items():
            data[k] = v.to(device)
        l_context = data['l_context']
        l_attention_mask = data['l_attention_mask']
        l_token_type_ids = data['l_token_type_ids']
        g_context = data['g_context']
        g_attention_mask = data['g_attention_mask']
        g_token_type_ids = data['g_token_type_ids']
        true_polarity = data['polarity']
    
        outputs = model(l_context, l_attention_mask, l_token_type_ids,
                        g_context, g_attention_mask, g_token_type_ids)
        _, pred_polarity = torch.max(outputs, dim=1)
        
        loss = loss_fn(outputs, true_polarity)
        
        valid_losses.append(loss.item())
        final_pred_polarity_tags.append(pred_polarity)
        final_true_polarity_tags.append(true_polarity)
        
    avg_valid_loss = np.mean(valid_losses)
    # flatten the list of tensors
    final_pred_polarity_tags = torch.cat(final_pred_polarity_tags, 0)
    final_true_polarity_tags = torch.cat(final_true_polarity_tags, 0)
    # calculate the accuracy
    avg_valid_acc = torch.sum(final_pred_polarity_tags == 
                              final_true_polarity_tags) / final_true_polarity_tags.numel()
    valid_f1 = f1_score(final_true_polarity_tags.cpu().numpy(), 
                        final_pred_polarity_tags.cpu().numpy(), average='weighted')

    print("Train Loss: {:.5f}; Valid Loss: {:.5f}".format(avg_train_loss, avg_valid_loss))
    print("Train acc: {:.2f}%; Valid acc: {:.2f}%".format(avg_train_acc*100, avg_valid_acc*100))
    print("Train f1: {:.2f}%; Valid f1: {:.2f}%".format(train_f1*100, valid_f1*100))

    if avg_valid_loss < best_loss:
        torch.save(model.state_dict(), MODEL_PATH)
        best_loss = avg_valid_loss    
        
    history['train_loss'].append(avg_train_loss)
    history['train_acc'].append(avg_train_acc.cpu().numpy())
    history['train_f1'].append(train_f1)
    history['valid_loss'].append(avg_valid_loss)
    history['valid_acc'].append(avg_valid_acc.cpu().numpy())
    history['valid_f1'].append(valid_f1)


In [None]:
plt.plot(history['train_acc'], label='train accuracy')
plt.plot(history['valid_acc'], label='validation accuracy')
plt.title('Training history')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend()
plt.ylim([0.5, 1]);

In [None]:
def get_classification_report(test_data_loader, model, model_path=None):
    if model_path is not None: # load the saved model
        print('Loading saved model from: {}'.format(model_path))
        model.load_state_dict(torch.load(model_path))
    model = to_device(model, device)   
    
    model.eval()
    final_pred_polarity_tags = []
    final_true_polarity_tags = []
    with torch.no_grad():
        for data in tqdm(test_data_loader, total=len(test_data_loader)):
            for k, v in data.items():
                data[k] = v.to(device)
            l_context = data['l_context']
            l_attention_mask = data['l_attention_mask']
            l_token_type_ids = data['l_token_type_ids']
            g_context = data['g_context']
            g_attention_mask = data['g_attention_mask']
            g_token_type_ids = data['g_token_type_ids']
            true_polarity = data['polarity']
            
            outputs = model(l_context, l_attention_mask, l_token_type_ids,
                            g_context, g_attention_mask, g_token_type_ids)
            _, pred_polarity = torch.max(outputs, dim=1)
            
            final_pred_polarity_tags.append(pred_polarity)
            final_true_polarity_tags.append(true_polarity)
            
    # flatten the list of tensors
    final_pred_polarity_tags = torch.cat(final_pred_polarity_tags, 0).cpu().numpy()
    final_true_polarity_tags = torch.cat(final_true_polarity_tags, 0).cpu().numpy()
            
    print(classification_report(final_true_polarity_tags, final_pred_polarity_tags, 
                                target_names=["Neutral", "Positive", "Negative"]))
#     print(classification_report(final_true_polarity_tags, final_pred_polarity_tags))
    
get_classification_report(test_data_loader, model, model_path=MODEL_PATH)

In [27]:


def test_dataset(idx=0):


    train_dataset = SentenceTagDataset(tokenizer=tokenizer,
                                       aspect_clusters=train_aspect_clusters)
    toTensor = ToTensor()
#     dl = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=lambda x: x)

    train_data_loader = DeviceDataLoader(torch.utils.data.DataLoader(
        train_dataset, batch_size=32, shuffle=True), device)    

    data = next(iter(train_data_loader))
    data = train_dataset[idx]
    local_context = data['l_context'].cpu().numpy()
    global_context = data['g_context'].cpu().numpy()
    l_attention_mask = np.logical_not(data['l_attention_mask'].cpu().numpy())
    g_attention_mask = np.logical_not(data['g_attention_mask'].cpu().numpy())
#     aspect_term = data['aspect_term']
    polarity = data['polarity']
    
    print("*** Raw Data")
    print("*** local_context")
    print(local_context)
    print("*** global_context")
    print(global_context)
    print("*** polarity")
    print(polarity)
#     print("*** aspect_term")
#     print(aspect_term)
    print()
    
    local_context = np.ma.compressed(np.ma.masked_where(l_attention_mask, local_context))
    global_context = np.ma.compressed(np.ma.masked_where(g_attention_mask, global_context))
    
#     input_ids = input_ids[(input_ids!=101) & (input_ids!=102)]
        
    orig_local_context = np.array(train_dataset.tokenizer.convert_ids_to_tokens(local_context))
    orig_global_context = np.array(train_dataset.tokenizer.convert_ids_to_tokens(global_context))
    
    print(orig_local_context)
    print(orig_global_context)

test_dataset(1)

*** Raw Data
*** local_context
[  101  1109 14538  1110  1304  3613  1105  3903  1112   146   102     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]
*** global_context
[  101  1109 14538  1110  1304  3613  1105  3903  1112   146  6829  5127
 15004  1116  1105  4683   102     0     0     0     0     0     0     0
     0     0     0     0