In [1]:
from torch.utils.data import DataLoader, ConcatDataset
from transformers import BertTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence

import pandas as pd
import time
import numpy as np

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from model import bertATE, bertABSA
from dataset import datasetATM, datasetABSA

In [3]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)
pretrain_model_name = "bert-base-uncased"

cuda:0


In [4]:
lr = 2e-5

tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

modelATE = bertATE(pretrain_model_name).to(DEVICE)
optimizerATE = torch.optim.Adam(modelATE.parameters(), lr=lr)

modelABSA = bertABSA(pretrain_model_name).to(DEVICE)
optimizerABSA = torch.optim.Adam(modelABSA.parameters(), lr=lr)

In [5]:
def evl_time(t):
    min, sec= divmod(t, 60)
    hr, min = divmod(min, 60)
    return int(hr), int(min), int(sec)

def load_model(model, path):
    model.load_state_dict(torch.load(path), strict=False)
    return model
    
def save_model(model, name):
    torch.save(model.state_dict(), name)

# Aspect Term Extraction

In [6]:
laptops_train_ds = datasetATM(pd.read_csv("data/laptops_train.csv"), tokenizer)
laptops_test_ds = datasetATM(pd.read_csv("data/laptops_test.csv"), tokenizer)
restaurants_train_ds = datasetATM(pd.read_csv("data/restaurants_train.csv"), tokenizer)
restaurants_test_ds = datasetATM(pd.read_csv("data/restaurants_test.csv"), tokenizer)
twitter_train_ds = datasetATM(pd.read_csv("data/twitter_train.csv"), tokenizer)
twitter_test_ds = datasetATM(pd.read_csv("data/twitter_test.csv"), tokenizer)

In [7]:
w,x,y,z = laptops_train_ds.__getitem__(121)
print(w)
print(x)
print(x.size())
print(y)
print(y.size())
print(z)
print(z.size())

['the', 'battery', 'life', 'seems', 'to', 'be', 'very', 'good', ',', 'and', 'have', 'had', 'no', 'issues', 'with', 'it', '.']
tensor([1996, 6046, 2166, 3849, 2000, 2022, 2200, 2204, 1010, 1998, 2031, 2018,
        2053, 3314, 2007, 2009, 1012])
torch.Size([17])
tensor([0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
torch.Size([17])
tensor([-1,  2,  2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1])
torch.Size([17])


In [8]:
train_ds = ConcatDataset([laptops_train_ds, restaurants_train_ds, twitter_train_ds])
test_ds = ConcatDataset([laptops_test_ds, restaurants_test_ds, twitter_test_ds])

In [9]:
def create_mini_batch(samples):
    ids_tensors = [s[1] for s in samples]
    ids_tensors = pad_sequence(ids_tensors, batch_first=True)

    tags_tensors = [s[2] for s in samples]
    tags_tensors = pad_sequence(tags_tensors, batch_first=True)

    pols_tensors = [s[3] for s in samples]
    pols_tensors = pad_sequence(pols_tensors, batch_first=True)
    
    masks_tensors = torch.zeros(ids_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(ids_tensors != 0, 1)
    
    return ids_tensors, tags_tensors, pols_tensors, masks_tensors

In [10]:
train_loader = DataLoader(train_ds, batch_size=5, collate_fn=create_mini_batch, shuffle = True)
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch, shuffle = True)

In [11]:
for batch in train_loader:
    w,x,y,z = batch
    print(w)
    print(w.size())
    print(x)
    print(x.size())
    print(y)
    print(y.size())
    print(z)
    print(z.size())
    break

tensor([[ 2138,  2643,  1998,  3889,  5841,  2119,  5223,  2149,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [ 2074,  3427,  1996,  2034,  4302, 10693,  3185,  1010,  2009,  1000,
          1055,  1000,  2175,  2078,  6583,  2022,  1037,  2204,  2154,  1024,
          1011, 25269,  2497,  1011,     0,     0,     0,     0,     0],
        [19387,  2332, 11927,  4710,  1024,  1996, 18264, 11891,   999,   999,
           999,   999,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0],
        [ 2111,  2064,  2831,  2004,  2172,  4485,  2004,  1996,  2215,  2055,
          4172,  8854,  1004,  3814,  2232, 11782,  2021,  4365,  2027,  2437,
          2009,  2750,  1996,  2111,  3331,  1012,  1045, 12979,  2068],
        [ 1045,  2245,  4083,  1996,  6097,  9808,  2052,  2022,  2524,  101

In [12]:
def train_model_ATE(loader, epochs):
    all_data = len(loader)
    for epoch in range(epochs):
        finish_data = 0
        losses = []
        current_times = []
        correct_predictions = 0
        
        for data in loader:
            t0 = time.time()
            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            loss = modelATE(ids_tensors=ids_tensors, tags_tensors=tags_tensors, masks_tensors=masks_tensors)
            losses.append(loss.item())
            loss.backward()
            optimizerATE.step()
            optimizerATE.zero_grad()

            finish_data += 1
            current_times.append(round(time.time()-t0,3))
            current = np.mean(current_times)
            hr, min, sec = evl_time(current*(all_data-finish_data) + current*all_data*(epochs-epoch-1))
            print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hr, " min:", min," sec:", sec)         

        save_model(modelATE, 'bert_ATE.pkl')
        
def test_model_ATE(loader):
    pred = []
    trueth = []
    with torch.no_grad():
        for data in loader:

            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = modelATE(ids_tensors=ids_tensors, tags_tensors=None, masks_tensors=masks_tensors)

            _, predictions = torch.max(outputs, dim=2)

            pred += list([int(j) for i in predictions for j in i ])
            trueth += list([int(j) for i in tags_tensors for j in i ])

    return trueth, pred



In [13]:
# %time train_model_ATE(train_loader, 3)

In [14]:
modelATE = load_model(modelATE, 'bert_ATE.pkl')

  model.load_state_dict(torch.load(path), strict=False)


In [15]:
%time x, y = test_model_ATE(test_loader)
print(classification_report(x, y, target_names=[str(i) for i in range(3)]))

CPU times: total: 13.6 s
Wall time: 14.7 s
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    142707
           1       0.87      0.87      0.87      6486
           2       0.94      0.69      0.80      3837

    accuracy                           0.98    153030
   macro avg       0.93      0.85      0.89    153030
weighted avg       0.98      0.98      0.98    153030



# Aspect Based Sentiment Analysis

In [16]:
laptops_train_ds = datasetABSA(pd.read_csv("data/laptops_train.csv"), tokenizer)
laptops_test_ds = datasetABSA(pd.read_csv("data/laptops_test.csv"), tokenizer)
restaurants_train_ds = datasetABSA(pd.read_csv("data/restaurants_train.csv"), tokenizer)
restaurants_test_ds = datasetABSA(pd.read_csv("data/restaurants_test.csv"), tokenizer)
twitter_train_ds = datasetABSA(pd.read_csv("data/twitter_train.csv"), tokenizer)
twitter_test_ds = datasetABSA(pd.read_csv("data/twitter_test.csv"), tokenizer)

In [17]:
w,x,y,z = laptops_train_ds.__getitem__(121)
print(w)
print(len(w))
print(x)
print(len(x))
print(y)
print(len(y))
print(z)

['[cls]', 'the', 'battery', 'life', 'seems', 'to', 'be', 'very', 'good', ',', 'and', 'have', 'had', 'no', 'issues', 'with', 'it', '.', '[sep]', 'battery', 'life']
21
tensor([ 100, 1996, 6046, 2166, 3849, 2000, 2022, 2200, 2204, 1010, 1998, 2031,
        2018, 2053, 3314, 2007, 2009, 1012,  100, 6046, 2166])
21
tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
21
tensor(2)


In [18]:
def create_mini_batch2(samples):
    ids_tensors = [s[1] for s in samples]
    ids_tensors = pad_sequence(ids_tensors, batch_first=True)

    segments_tensors = [s[2] for s in samples]
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)

    label_ids = torch.stack([s[3] for s in samples])
    
    masks_tensors = torch.zeros(ids_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(ids_tensors != 0, 1)

    return ids_tensors, segments_tensors, masks_tensors, label_ids

In [19]:
train_ds = ConcatDataset([laptops_train_ds, restaurants_train_ds, twitter_train_ds])
test_ds = ConcatDataset([laptops_test_ds, restaurants_test_ds, twitter_test_ds])

train_loader = DataLoader(train_ds, batch_size=4, collate_fn=create_mini_batch2, shuffle = True)
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch2, shuffle = True)

In [20]:
for batch in train_loader:
    w,x,y,z = batch
    print(w)
    print(w.size())
    print(x)
    print(x.size())
    print(y)
    print(y.size())
    print(z)
    print(z.size())
    break

tensor([[  100,  2235,  3529,  2015,  2005,  2364,  4372, 13334,  1010,  1045,
          2018, 11840,  1011,  1048, 15185,  1011,  2347,  2102,  7622,  1011,
         25269,  2497,  1011,  6513,  2018,  7975,  1010,  2009,  2001,  2204,
          1012,   100,  7975,     0,     0,     0,     0,     0],
        [  100,  6289,  2232,  1010,  2652, 16115,  8827,  2361,  2085,  2008,
          1045,  1000,  2310,  1000,  2525,  7791,  2000,  1996,  9808,  2102,
          1037,  6474,  2335, 25828,  5683,  6881,   100,  8827,  2361,     0,
             0,     0,     0,     0,     0,     0,     0,     0],
        [  100,  1996,  3968,  2265,  2074,  2209,  1037, 12528,  1997,  6796,
         12170, 22669,  2000,  2191,  1037,  1000,  1000, 25591,  1000,  1000,
          2391,  1012,  1012,  1012,  1045,  3246,  2017,  1000,  2128,  1000,
          7098,  1052,  2475,  1012,   100,  6796, 12170, 22669],
        [  100,  1996, 24318, 28496,  2015,  2020,  2723,  2098,  2011,  1996,
         243

In [21]:
def train_model_ABSA(loader, epochs):
    all_data = len(loader)
    for epoch in range(epochs):
        finish_data = 0
        losses = []
        current_times = []
        correct_predictions = 0
        
        for data in loader:
            t0 = time.time()
            ids_tensors, segments_tensors, masks_tensors, label_ids = data
            ids_tensors = ids_tensors.to(DEVICE)
            segments_tensors = segments_tensors.to(DEVICE)
            label_ids = label_ids.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            loss = modelABSA(ids_tensors=ids_tensors, lable_tensors=label_ids, masks_tensors=masks_tensors, segments_tensors=segments_tensors)
            losses.append(loss.item())
            loss.backward()
            optimizerABSA.step()
            optimizerABSA.zero_grad()

            finish_data += 1
            current_times.append(round(time.time()-t0,3))
            current = np.mean(current_times)
            hr, min, sec = evl_time(current*(all_data-finish_data) + current*all_data*(epochs-epoch-1))
            print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hr, " min:", min," sec:", sec)         

        save_model(modelABSA, 'bert_ABSA.pkl')
        
def test_model_ABSA(loader):
    pred = []
    trueth = []
    with torch.no_grad():
        for data in loader:

            ids_tensors, segments_tensors, masks_tensors, label_ids = data
            ids_tensors = ids_tensors.to(DEVICE)
            segments_tensors = segments_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = modelABSA(ids_tensors, None, masks_tensors=masks_tensors, segments_tensors=segments_tensors)
            
            _, predictions = torch.max(outputs, dim=1)

            pred += list([int(i) for i in predictions])
            trueth += list([int(i) for i in label_ids])

    return trueth, pred



In [22]:
%time train_model_ABSA(train_loader, 6)

epoch: 0  batch: 1 / 3044  loss: 1.1571080684661865  hr: 0  min: 46  sec: 52
epoch: 0  batch: 2 / 3044  loss: 1.1395832300186157  hr: 0  min: 29  sec: 58
epoch: 0  batch: 3 / 3044  loss: 1.1923088630040486  hr: 0  min: 27  sec: 11
epoch: 0  batch: 4 / 3044  loss: 1.1574071943759918  hr: 0  min: 25  sec: 29
epoch: 0  batch: 5 / 3044  loss: 1.1631940126419067  hr: 0  min: 24  sec: 42
epoch: 0  batch: 6 / 3044  loss: 1.1434458096822102  hr: 0  min: 24  sec: 14
epoch: 0  batch: 7 / 3044  loss: 1.1737637860434396  hr: 0  min: 23  sec: 49
epoch: 0  batch: 8 / 3044  loss: 1.202772006392479  hr: 0  min: 23  sec: 41
epoch: 0  batch: 9 / 3044  loss: 1.198934170934889  hr: 0  min: 23  sec: 29
epoch: 0  batch: 10 / 3044  loss: 1.2016431450843812  hr: 0  min: 23  sec: 20
epoch: 0  batch: 11 / 3044  loss: 1.1878696571696887  hr: 0  min: 23  sec: 17
epoch: 0  batch: 12 / 3044  loss: 1.1755335529645283  hr: 0  min: 23  sec: 20
epoch: 0  batch: 13 / 3044  loss: 1.1657852576329157  hr: 0  min: 23  sec: 

In [23]:
# modelABSA = load_model(modelABSA, 'bert_ABSA.pkl')

  model.load_state_dict(torch.load(path), strict=False)


In [24]:
%time x, y = test_model_ABSA(test_loader)
print(classification_report(x, y, target_names=[str(i) for i in range(3)]))

CPU times: total: 10.9 s
Wall time: 11.6 s
              precision    recall  f1-score   support

           0       0.67      0.79      0.73       497
           1       0.78      0.43      0.56       710
           2       0.78      0.92      0.84      1239

    accuracy                           0.75      2446
   macro avg       0.74      0.72      0.71      2446
weighted avg       0.76      0.75      0.74      2446



# ATE + ABSA

In [27]:
def predict_model_ABSA(sentence, aspect, tokenizer):
    t1 = tokenizer.tokenize(sentence)
    t2 = tokenizer.tokenize(aspect)

    word_pieces = ['[cls]']
    word_pieces += t1
    word_pieces += ['[sep]']
    word_pieces += t2

    segment_tensor = [0] + [0]*len(t1) + [0] + [1]*len(t2)

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)
    segment_tensor = torch.tensor(segment_tensor).to(DEVICE)

    with torch.no_grad():
        outputs = modelABSA(input_tensor, None, None, segments_tensors=segment_tensor)
        _, predictions = torch.max(outputs, dim=1)
    
    return word_pieces, predictions, outputs

def predict_model_ATE(sentence, tokenizer):
    word_pieces = []
    tokens = tokenizer.tokenize(sentence)
    word_pieces += tokens

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)

    with torch.no_grad():
        outputs = modelATE(input_tensor, None, None)
        _, predictions = torch.max(outputs, dim=2)
    predictions = predictions[0].tolist()

    return word_pieces, predictions, outputs

def ATE_ABSA(text):
    terms = []
    word = ""
    x, y, z = predict_model_ATE(text, tokenizer)
    for i in range(len(y)):
        if y[i] == 1:
            if len(word) != 0:
                terms.append(word.replace(" ##",""))
            word = x[i]
        if y[i] == 2:
            word += (" " + x[i])
            
    
    if len(word) != 0:
            terms.append(word.replace(" ##",""))
            
    print("tokens:", x)
    print("ATE:", terms)
    
    if len(terms) != 0:
        for i in terms:
            _, c, p = predict_model_ABSA(text, i, tokenizer)
            print("term:", [i], "class:", [int(c)], "ABSA:", [float(p[0][0]), float(p[0][1]), float(p[0][2])])


In [28]:
text = "For the price you pay this product is very good. However, battery life is a little lack-luster coming from a MacBook Pro."
ATE_ABSA(text)

tokens: ['for', 'the', 'price', 'you', 'pay', 'this', 'product', 'is', 'very', 'good', '.', 'however', ',', 'battery', 'life', 'is', 'a', 'little', 'lack', '-', 'lust', '##er', 'coming', 'from', 'a', 'mac', '##book', 'pro', '.']
ATE: ['price', 'battery life']
term: ['price'] class: [2] ABSA: [-0.8602946996688843, -2.789419174194336, 4.508271217346191]
term: ['battery life'] class: [0] ABSA: [4.390956878662109, -2.653144598007202, -0.35245683789253235]


In [29]:
text = "I think Apple is better than Microsoft."
ATE_ABSA(text)

tokens: ['i', 'think', 'apple', 'is', 'better', 'than', 'microsoft', '.']
ATE: ['microsoft']
term: ['microsoft'] class: [0] ABSA: [1.8145989179611206, -0.841198742389679, 0.156061589717865]
