IMPORTS

In [None]:
!pip install transformers==3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==3
  Downloading transformers-3.0.0-py3-none-any.whl (754 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m754.6/754.6 KB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers==0.8.0-rc4
  Downloading tokenizers-0.8.0rc4-cp38-cp38-manylinux1_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m103.1 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m880.6/880.6 KB[0m [31m64.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (se

In [None]:
from transformers import BertModel
import torch

from torch.utils.data import DataLoader, ConcatDataset, Dataset
from transformers import BertTokenizer
import torch
from torch.nn.utils.rnn import pad_sequence
import pandas as pd
import time
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
class dataset_ATM(Dataset):
    """Token-level dataset built on top of a DataFrame.

    For every row the first three columns are read as string-serialised lists
    (for example ``"['battery', 'life']"`` and ``"[1, 2]"``): the words of one
    sentence, one integer tag per word and one integer polarity per word.

    Args:
        df: DataFrame whose first three columns hold tokens, tags and polarities.
        tokenizer: object exposing ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.
    """

    def __init__(self, df, tokenizer):
        self.df = df
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return ``(word_pieces, ids_tensor, tags_tensor, pols_tensor)`` for row ``idx``.

        Each word is split into word pieces by the tokenizer and the word's tag and
        polarity are repeated for every piece, so all three tensors have one entry
        per word piece.
        """
        tokens, tags, pols = self.df.iloc[idx, :3].values

        # NOTE(review): this is plain string surgery, not a real list parser -- every
        # apostrophe is removed from the words and a word containing ", " would be
        # split in two. Confirm the CSV never contains such words.
        tokens = tokens.replace("'", "").strip("][").split(', ')
        tags = tags.strip('][').split(', ')
        pols = pols.strip('][').split(', ')

        bert_tokens = []
        bert_tags = []
        bert_pols = []
        for i, word in enumerate(tokens):
            pieces = self.tokenizer.tokenize(word)
            bert_tokens.extend(pieces)
            # One label per word piece, copied from the word it came from.
            bert_tags.extend([int(tags[i])] * len(pieces))
            bert_pols.extend([int(pols[i])] * len(pieces))

        bert_ids = self.tokenizer.convert_tokens_to_ids(bert_tokens)

        # Explicit dtype: torch.tensor([]) would otherwise be float32, which cannot be
        # used as embedding indices or class targets when a row yields no word pieces.
        ids_tensor = torch.tensor(bert_ids, dtype=torch.long)
        tags_tensor = torch.tensor(bert_tags, dtype=torch.long)
        pols_tensor = torch.tensor(bert_pols, dtype=torch.long)

        return bert_tokens, ids_tensor, tags_tensor, pols_tensor

    def __len__(self):
        """Number of rows (sentences) in the underlying DataFrame."""
        return len(self.df)

In [None]:
class bert_ATE(torch.nn.Module):
    """BERT encoder followed by a per-token linear classifier.

    Args:
        pretrain_model: name or path handed to ``BertModel.from_pretrained``.
        num_labels: number of tag classes produced per token (default 3).
    """

    def __init__(self, pretrain_model, num_labels=3):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrain_model)
        self.linear = torch.nn.Linear(self.bert.config.hidden_size, num_labels)
        self.loss_fn = torch.nn.CrossEntropyLoss()

    def forward(self, ids_tensors, tags_tensors, masks_tensors):
        """Score every token, or compute the training loss.

        Args:
            ids_tensors: token ids passed to the encoder as ``input_ids``.
            tags_tensors: gold tag per token, or ``None`` for inference.
            masks_tensors: attention mask passed to the encoder (may be ``None``).

        Returns:
            The scalar cross-entropy loss when ``tags_tensors`` is given, otherwise
            the per-token logits produced by the linear layer.
        """
        # Take element 0 by index instead of tuple-unpacking: this works both when the
        # encoder returns a plain tuple and when it returns a ModelOutput object
        # (unpacking the latter would yield its field names, not tensors).
        bert_outputs = self.bert(input_ids=ids_tensors, attention_mask=masks_tensors)[0]
        linear_outputs = self.linear(bert_outputs)

        if tags_tensors is None:
            return linear_outputs

        # Flatten batch and sequence dimensions so every token is one classification
        # example. NOTE(review): the loss covers every position of tags_tensors; if the
        # caller pads tags with 0, padded positions are trained as class 0.
        num_labels = linear_outputs.size(-1)
        return self.loss_fn(linear_outputs.view(-1, num_labels), tags_tensors.view(-1))

In [None]:
# Run on the first CUDA device when one is visible, otherwise on the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The same checkpoint name is used for the tokenizer and for the encoder weights.
pretrain_model_name = "bert-base-uncased"
lr = 2e-5

tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

# Build the tagger, move it to the chosen device, then hand its parameters to Adam.
model_ATE = bert_ATE(pretrain_model_name)
model_ATE = model_ATE.to(DEVICE)
optimizer_ATE = torch.optim.Adam(params=model_ATE.parameters(), lr=lr)

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def evl_time(t):
    """Break a duration in seconds into whole ``(hours, minutes, seconds)``."""
    total_minutes, seconds = divmod(t, 60)
    hours, minutes = divmod(total_minutes, 60)
    return int(hours), int(minutes), int(seconds)

def load_model(model, path):
    """Load the state dict stored at ``path`` into ``model`` and return ``model``.

    Keys that are missing from, or unexpected in, the checkpoint are tolerated
    (``strict=False``).
    """
    # Deserialize onto the CPU first so a checkpoint written on a GPU machine can be
    # opened on a CPU-only one; load_state_dict then copies the values into the
    # model's existing parameters on whatever device they live on.
    state_dict = torch.load(path, map_location="cpu")
    model.load_state_dict(state_dict, strict=False)
    return model
    
def save_model(model, name):
    """Serialize ``model``'s state dict (not the module object) to the file ``name``."""
    weights = model.state_dict()
    torch.save(weights, name)

In [None]:
# One train/test dataset pair per CSV pair; every dataset shares the same tokenizer.
laptops_train_ds, laptops_test_ds = (
    dataset_ATM(pd.read_csv("data/laptops_train.csv"), tokenizer),
    dataset_ATM(pd.read_csv("data/laptops_test.csv"), tokenizer),
)
restaurants_train_ds, restaurants_test_ds = (
    dataset_ATM(pd.read_csv("data/restaurants_train.csv"), tokenizer),
    dataset_ATM(pd.read_csv("data/restaurants_test.csv"), tokenizer),
)
twitter_train_ds, twitter_test_ds = (
    dataset_ATM(pd.read_csv("data/twitter_train.csv"), tokenizer),
    dataset_ATM(pd.read_csv("data/twitter_test.csv"), tokenizer),
)

In [None]:
# Pool the laptops, restaurants and twitter datasets into one training set and one test set.
train_ds = ConcatDataset(
    datasets=[laptops_train_ds, restaurants_train_ds, twitter_train_ds]
)
test_ds = ConcatDataset(
    datasets=[laptops_test_ds, restaurants_test_ds, twitter_test_ds]
)

In [None]:
def create_mini_batch(samples):
    """Collate a list of dataset items into zero-padded batch tensors.

    Each sample is indexed as ``(word_pieces, ids, tags, pols)``; element 0 is not
    used. Returns ``(ids, tags, pols, masks)`` where the first three are padded with
    0 to the longest sequence in the batch and ``masks`` is 1 wherever the padded
    id is non-zero and 0 elsewhere.
    """
    def _pad_field(position):
        # Stack one field of every sample, batch dimension first.
        return pad_sequence([sample[position] for sample in samples], batch_first=True)

    ids_tensors = _pad_field(1)
    tags_tensors = _pad_field(2)
    pols_tensors = _pad_field(3)

    # Padding uses id 0, so any non-zero id marks a real token.
    masks_tensors = (ids_tensors != 0).long()

    return ids_tensors, tags_tensors, pols_tensors, masks_tensors

In [None]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(train_ds, batch_size=5, collate_fn=create_mini_batch, shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps the composition of each
# batch (and therefore the amount of padding create_mini_batch adds) the same on every run.
test_loader = DataLoader(test_ds, batch_size=50, collate_fn=create_mini_batch, shuffle=False)

In [None]:
def train_model_ATE(loader, epochs):
    """Fine-tune the global ``model_ATE`` with ``optimizer_ATE``.

    Args:
        loader: iterable with a length, yielding ``(ids, tags, pols, masks)`` batches;
            the polarity tensor is ignored here.
        epochs: number of passes over ``loader``.

    After every batch a progress line is printed with the running mean loss of the
    current epoch and an estimate of the remaining time. The model's state dict is
    written to ``'bert_ATE.pkl'`` at the end of every epoch.
    """
    all_data = len(loader)
    # Make sure dropout is active even if the model was switched to eval mode earlier.
    model_ATE.train()

    for epoch in range(epochs):
        losses = []
        batch_times = []

        for finish_data, data in enumerate(loader, start=1):
            t0 = time.time()
            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            # Clear gradients before the forward pass so nothing left over from an
            # interrupted earlier run leaks into this update.
            optimizer_ATE.zero_grad()
            loss = model_ATE(ids_tensors=ids_tensors, tags_tensors=tags_tensors, masks_tensors=masks_tensors)
            losses.append(loss.item())
            loss.backward()
            optimizer_ATE.step()

            batch_times.append(round(time.time() - t0, 3))
            mean_batch_time = np.mean(batch_times)
            # Remaining work = rest of this epoch + all batches of the epochs still to come.
            remaining = mean_batch_time * (all_data - finish_data) + mean_batch_time * all_data * (epochs - epoch - 1)
            hr, mins, sec = evl_time(remaining)
            print('epoch:', epoch, " batch:", finish_data, "/" , all_data, " loss:", np.mean(losses), " hr:", hr, " min:", mins," sec:", sec)

        save_model(model_ATE, 'bert_ATE.pkl')

In [None]:
def test_model_ATE(loader):
    pred = []
    trueth = []
    with torch.no_grad():
        for data in loader:

            ids_tensors, tags_tensors, _, masks_tensors = data
            ids_tensors = ids_tensors.to(DEVICE)
            tags_tensors = tags_tensors.to(DEVICE)
            masks_tensors = masks_tensors.to(DEVICE)

            outputs = model_ATE(ids_tensors=ids_tensors, tags_tensors=None, masks_tensors=masks_tensors)

            _, predictions = torch.max(outputs, dim=2)

            pred += list([int(j) for i in predictions for j in i ])
            trueth += list([int(j) for i in tags_tensors for j in i ])

    return trueth, pred

In [None]:
# Fine-tune for 3 epochs; %time reports how long the call took.
%time train_model_ATE(train_loader, 3)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
epoch: 0  batch: 2311 / 2436  loss: 0.06314276958254482  hr: 0  min: 5  sec: 32
epoch: 0  batch: 2312 / 2436  loss: 0.06312280048672726  hr: 0  min: 5  sec: 32
epoch: 0  batch: 2313 / 2436  loss: 0.06320399200220939  hr: 0  min: 5  sec: 32
epoch: 0  batch: 2314 / 2436  loss: 0.0631809657656774  hr: 0  min: 5  sec: 32
epoch: 0  batch: 2315 / 2436  loss: 0.06315523246718072  hr: 0  min: 5  sec: 32
epoch: 0  batch: 2316 / 2436  loss: 0.06314015064689806  hr: 0  min: 5  sec: 32
epoch: 0  batch: 2317 / 2436  loss: 0.06311633587712068  hr: 0  min: 5  sec: 31
epoch: 0  batch: 2318 / 2436  loss: 0.06309009847901477  hr: 0  min: 5  sec: 31
epoch: 0  batch: 2319 / 2436  loss: 0.06306743280624862  hr: 0  min: 5  sec: 31
epoch: 0  batch: 2320 / 2436  loss: 0.06304396225286524  hr: 0  min: 5  sec: 31
epoch: 0  batch: 2321 / 2436  loss: 0.06302985342969124  hr: 0  min: 5  sec: 31
epoch: 0  batch: 2322 / 2436  loss: 0.06300521622780204 

In [None]:
# Reload the weights from 'bert_ATE.pkl', the file train_model_ATE writes after each epoch.
model_ATE = load_model(model_ATE, 'bert_ATE.pkl')

In [None]:
# test_model_ATE returns (truth, pred), so x holds the gold tags and y the predicted tags;
# classification_report takes them in that order (y_true, y_pred).
%time x, y = test_model_ATE(test_loader)
print(classification_report(x, y, target_names=[str(i) for i in range(3)]))

CPU times: user 13.6 s, sys: 25.6 ms, total: 13.6 s
Wall time: 13.6 s
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    142075
           1       0.88      0.88      0.88      6486
           2       0.93      0.73      0.81      3837

    accuracy                           0.98    152398
   macro avg       0.93      0.87      0.90    152398
weighted avg       0.98      0.98      0.98    152398



In [None]:
def predict_model_ATE(sentence, tokenizer):
    """Tag every word piece of ``sentence`` with the global ``model_ATE``.

    Args:
        sentence: raw text to analyse.
        tokenizer: object exposing ``tokenize(str)`` and ``convert_tokens_to_ids(list)``.

    Returns:
        ``(word_pieces, predictions, outputs)``: the word pieces, the predicted class
        index for each piece, and the raw model output for the single-sentence batch.
    """
    word_pieces = tokenizer.tokenize(sentence)

    if not word_pieces:
        # Nothing to tag: an empty id list would turn into a float tensor that cannot
        # be used as embedding indices, so return empty results instead of crashing.
        empty_outputs = torch.empty((1, 0, model_ATE.linear.out_features), device=DEVICE)
        return word_pieces, [], empty_outputs

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids], dtype=torch.long).to(DEVICE)

    # Predict with dropout disabled, then restore the model's previous mode.
    was_training = model_ATE.training
    model_ATE.eval()
    try:
        with torch.no_grad():
            outputs = model_ATE(input_tensor, None, None)
    finally:
        model_ATE.train(was_training)

    # Highest-scoring class per word piece of the only sentence in the batch.
    predictions = outputs.argmax(dim=2)[0].tolist()

    return word_pieces, predictions, outputs


def _merge_aspect_pieces(pieces, tags):
    """Group tagged word pieces into aspect-term strings (tag 1 starts, tag 2 continues)."""
    terms = []
    current = []  # word pieces of the term currently being built

    def _close():
        if current:
            # Glue "##" continuation pieces back onto the preceding piece.
            terms.append(" ".join(current).replace(" ##", ""))
            current.clear()

    for piece, tag in zip(pieces, tags):
        if tag not in (1, 2):
            # Any non-aspect piece ends the open term, so a later tag 2 cannot be
            # attached to a term that finished several words earlier.
            _close()
        elif current and (tag == 2 or piece.startswith("##")):
            # Continuation: either an explicit tag 2, or a "##" piece of the word
            # that is already part of the open term (even if it is tagged 1).
            current.append(piece)
        else:
            # Tag 1 on a fresh word, or a tag 2 with no open term: start a new term.
            _close()
            current.append(piece)

    _close()
    return terms


def get_AT(text):
    """Print the word pieces of ``text`` and the aspect terms extracted from them.

    Predictions come from ``predict_model_ATE`` using the global ``tokenizer``.
    Nothing is returned; the two result lines are written to stdout.
    """
    pieces, tags, _ = predict_model_ATE(text, tokenizer)
    terms = _merge_aspect_pieces(pieces, tags)

    print("TOKENS:", pieces)
    print("ASPECTS:", terms)

In [None]:
# Example: print the word pieces and the extracted aspect terms for one product-review sentence.
get_AT("For the price you pay this product is very good. However, battery life is a little lack-luster coming from a MacBook Pro.")

TOKENS: ['for', 'the', 'price', 'you', 'pay', 'this', 'product', 'is', 'very', 'good', '.', 'however', ',', 'battery', 'life', 'is', 'a', 'little', 'lack', '-', 'lust', '##er', 'coming', 'from', 'a', 'mac', '##book', 'pro', '.']
ASPECTS: ['price', 'battery life']


In [None]:
# Example: a longer sentence containing an apostrophe and words that split into several word pieces.
get_AT("To be completely fair, the only redeeming factor was the food, which was above average, but couldn't make up for all the other deficiencies of Teodora.")

TOKENS: ['to', 'be', 'completely', 'fair', ',', 'the', 'only', 'red', '##eem', '##ing', 'factor', 'was', 'the', 'food', ',', 'which', 'was', 'above', 'average', ',', 'but', 'couldn', "'", 't', 'make', 'up', 'for', 'all', 'the', 'other', 'def', '##iciencies', 'of', 'te', '##od', '##ora', '.']
ASPECTS: ['food']
