# Import dependencies

In [1]:
import torch

from torch.utils.data import DataLoader, RandomSampler

from torch import cuda

In [2]:
torch.cuda.empty_cache()

In [3]:
import numpy as np

In [4]:
import spacy

In [5]:
import pandas as pd

In [6]:
from transformers import BertTokenizer

In [7]:
import BERT_BiLSTM
from ModelDataset import ModelDataset

In [8]:
from semeval_reader import SemevalReader

In [9]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [11]:
model = torch.load('bert_bilstm_model.pth')
model.to(device)

BERT_BiLSTM(
  (embedding): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T

# Loading data

In [12]:
semeval_reader = SemevalReader("semeval16_restaurants_train.xml")
reviews = semeval_reader.read_reviews()

In [13]:
sentences = []

for review in reviews:
    for sentence in review.sentences:
        sentences.append(sentence.text)

# Aspect extraction

In [14]:
nlp = spacy.load("en_core_web_lg")

### Sentence 0 analysis

In [15]:
print(f"Sentence: {sentences[0]}")
doc = nlp(sentences[0])


Sentence: judging from previous posts this used to be a good place, but not any longer.


In [16]:
spacy.displacy.render(doc, style='dep', jupyter=True)

In [17]:
for token in doc:
    print(f"\tToken: {token.text}")
    print(f"\t\tDependency: {token.dep_}")
    print(f"\t\tPOS: {token.pos_}")
    print(f"\t\tHead text: {token.head.text}")
    print(f"\t\tHead POS: {token.head.pos_}")
    print(f"\t\tChildren: {[child for child in token.children]}")

	Token: judging
		Dependency: ROOT
		POS: VERB
		Head text: judging
		Head POS: VERB
		Children: [from, ,, but, longer, .]
	Token: from
		Dependency: prep
		POS: ADP
		Head text: judging
		Head POS: VERB
		Children: [posts]
	Token: previous
		Dependency: amod
		POS: ADJ
		Head text: posts
		Head POS: NOUN
		Children: []
	Token: posts
		Dependency: pobj
		POS: NOUN
		Head text: from
		Head POS: ADP
		Children: [previous, used]
	Token: this
		Dependency: nsubj
		POS: DET
		Head text: used
		Head POS: VERB
		Children: []
	Token: used
		Dependency: relcl
		POS: VERB
		Head text: posts
		Head POS: NOUN
		Children: [this, be]
	Token: to
		Dependency: aux
		POS: PART
		Head text: be
		Head POS: VERB
		Children: []
	Token: be
		Dependency: xcomp
		POS: VERB
		Head text: used
		Head POS: VERB
		Children: [to, place]
	Token: a
		Dependency: det
		POS: DET
		Head text: place
		Head POS: NOUN
		Children: []
	Token: good
		Dependency: amod
		POS: ADJ
		Head text: place
		Head POS: NOUN
		Children: 

## Aspect extraction

In [18]:
ext_aspects = []

for sentence in sentences:
    important = nlp(sentence)
    descriptive_item = ''
    target = ''
    added_terms = ''
    for token in important:
        if token.dep_ == 'nsubj' and token.pos_ == 'NOUN':
            target = token.text
        if token.pos_ == 'ADJ':
            added_terms = ''
        for mini_token in token.children:
            if mini_token.pos_ != 'ADV':
                continue
            added_terms += mini_token.text + ' '
        descriptive_item = added_terms + token.text
        ext_aspects.append({'aspect': target, 'description': descriptive_item})

In [19]:
ext_aspects

[{'aspect': '', 'description': 'longer judging'},
 {'aspect': '', 'description': 'longer from'},
 {'aspect': '', 'description': 'previous'},
 {'aspect': '', 'description': 'posts'},
 {'aspect': '', 'description': 'this'},
 {'aspect': '', 'description': 'used'},
 {'aspect': '', 'description': 'to'},
 {'aspect': '', 'description': 'be'},
 {'aspect': '', 'description': 'a'},
 {'aspect': '', 'description': 'good'},
 {'aspect': '', 'description': 'place'},
 {'aspect': '', 'description': ','},
 {'aspect': '', 'description': 'but'},
 {'aspect': '', 'description': 'not'},
 {'aspect': '', 'description': 'any'},
 {'aspect': '', 'description': 'any longer'},
 {'aspect': '', 'description': 'any .'},
 {'aspect': '', 'description': 'we'},
 {'aspect': '', 'description': ','},
 {'aspect': '', 'description': 'there'},
 {'aspect': '', 'description': 'were'},
 {'aspect': '', 'description': 'four'},
 {'aspect': '', 'description': 'of'},
 {'aspect': '', 'description': 'us'},
 {'aspect': '', 'description': 

In [20]:
processed_sentences = [sentence.lower() for sentence in sentences]
aspect_feature_pairs = []
for sentence in nlp.pipe(processed_sentences):
    chunks = [(chunk.root.text) for chunk in sentence.noun_chunks if chunk.root.pos_ == 'NOUN']

    terms = ' '.join([token.lemma_ for token in sentence if (not token.is_stop and not token.is_punct and (token.pos_ == "ADJ" or token.pos_ == "VERB"))])

    aspect_feature_pairs.append({'aspect': chunks, 'description': terms})

# Sentiment prediction

In [21]:
def predict_sent(dataloader):
    results = []

    for _,data in enumerate(dataloader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)

        outputs = model(ids, mask, token_type_ids)

        result = np.argmax(outputs.cpu().detach().numpy(), axis=1)

        results.append(result)

    return results

In [22]:
df = pd.DataFrame(columns=['text','target_list'])

for aspect_feature_pair in aspect_feature_pairs:
    df = df.append({'text': aspect_feature_pair['description'], 'target_list': [0, 0, 0]}, ignore_index=True)

In [23]:
df.head()

Unnamed: 0,text,target_list
0,judge previous good,"[0, 0, 0]"
1,arrive act impose rude,"[0, 0, 0]"
2,bring complimentary ignore repeat throw,"[0, 0, 0]"
3,lousy sweet salty tiny,"[0, 0, 0]"
4,complain small,"[0, 0, 0]"


In [24]:
dataset = ModelDataset(df, tokenizer)

In [25]:
dataloader = DataLoader(
    dataset,
    sampler = RandomSampler(dataset),
    batch_size = 4
)

In [26]:
results = predict_sent(dataloader)