In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json

import torch
import torch.nn.functional as F
from transformers import BertTokenizer

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
# NOTE(review): `lemmatizer` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping the NLTK WordNet dependency.
lemmatizer = WordNetLemmatizer()

In [3]:
from model.model import bertATE, bertABSA
from transformers import pipeline

# Load Model

In [4]:
# Run on the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

DEVICE

device(type='cuda', index=0)

In [5]:
# Name of the pretrained BERT checkpoint; it is passed to the tokenizer and to
# both task-model constructors below.
pretrain_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

# Learning rate. In the visible cells it is only referenced by the disabled
# optimizer lines below.
lr = 2e-5
# Model used by predict_model_ATE to label aspect-term tokens; moved to DEVICE.
modelATE = bertATE(pretrain_model_name).to(DEVICE)
# NOTE(review): the disabled optimizer lines refer to `model_ATE` / `model_ABSA`,
# whereas the objects created in this cell are named `modelATE` / `modelABSA`.
# optimizerATE = torch.optim.Adam(model_ATE.parameters(), lr=lr)
# Model used by predict_model_ABSA to classify sentiment per aspect; moved to DEVICE.
modelABSA = bertABSA(pretrain_model_name).to(DEVICE)
# optimizerABSA = torch.optim.Adam(model_ABSA.parameters(), lr=lr)

In [6]:
# Restore the fine-tuned weights for both models.
# `map_location=DEVICE` remaps the stored tensors onto the device chosen above, so
# checkpoints that were saved from a GPU still load when DEVICE is the CPU.
# `strict=False` tolerates missing/unexpected keys; the report returned by the last
# call is displayed as the cell output and lists any keys that did not match.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted
# source, and consider passing weights_only=True if the installed torch supports it.
modelABSA.load_state_dict(torch.load("model/bert_ABSA.pkl", map_location=DEVICE), strict=False)
modelATE.load_state_dict(torch.load("model/bert_ATE.pkl", map_location=DEVICE), strict=False)

  modelABSA.load_state_dict(torch.load("model/bert_ABSA.pkl"), strict=False)
  modelATE.load_state_dict(torch.load("model/bert_ATE.pkl"), strict=False)


<All keys matched successfully>

# Load Dataset

In [7]:
# Read the review records. The encoding is pinned to UTF-8 so the file is decoded
# the same way regardless of the platform's default locale encoding.
with open("temp_phone-1.json", "r", encoding="utf-8") as file:
    data = json.load(file)

data

{'0': {'review_id': 0,
  'review_time': '2019-02-14',
  'review': 'Best phone money can buy.So far this phone is the bees knees.',
  'rating': 5,
  'helpfulVotes': 3.0,
  'review_processed': 'Best phone money can buy.So far this phone is the bees knees.'},
 '1': {'review_id': 1,
  'review_time': '2019-08-02',
  'review': 'Pixel 3 has frequent display defects.I like this phone model. I bought this phone as a gift to my wife. Unfortunately, the phone turned out to be defective. There is a pink tint on the screen. Amazon does not want to properly help customers who are not in the USA. This is very sad.',
  'rating': 3,
  'helpfulVotes': 20.0,
  'review_processed': 'There is a pink tint on the screen. Amazon does not want to properly help customers who are not in the U.S. This is very sad. I like this phone model, but it has frequent display defects. I bought it as a gift for my wife, but the phone is defective.'},
 '2': {'review_id': 2,
  'review_time': '2018-04-30',
  'review': 'the lowe

In [8]:
# Build one row per review; the top-level JSON keys become the index labels.
df = pd.DataFrame.from_dict(data, orient='index')
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 0 to 86
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   review_id         87 non-null     int64  
 1   review_time       87 non-null     object 
 2   review            87 non-null     object 
 3   rating            87 non-null     int64  
 4   helpfulVotes      87 non-null     float64
 5   review_processed  87 non-null     object 
dtypes: float64(1), int64(2), object(3)
memory usage: 4.8+ KB
None


Unnamed: 0,review_id,review_time,review,rating,helpfulVotes,review_processed
0,0,2019-02-14,Best phone money can buy.So far this phone is ...,5,3.0,Best phone money can buy.So far this phone is ...
1,1,2019-08-02,Pixel 3 has frequent display defects.I like th...,3,20.0,There is a pink tint on the screen. Amazon doe...
2,2,2018-04-30,the lower left corner of the phone doesnt work...,2,1.0,The lower left corner of the phone doesn't wor...
3,3,2017-01-16,"Great phone, be careful with the sellers.The p...",3,10.0,"The phone is wonderful, if you're considering ..."
4,4,2019-06-22,U had no issue with the seller they were great...,1,3.0,I was very disappointed in the way this phone ...


# Aspect Based Sentiment Analysis Process

In [9]:
def predict_model_ABSA(sentence, aspect, tokenizer):
    """Score the sentiment of `sentence` with respect to `aspect` using `modelABSA`.

    Args:
        sentence: Text containing the aspect.
        aspect: Aspect term to classify.
        tokenizer: Object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        A tuple `(word_pieces, predictions, outputs)`: the token sequence fed to
        the model, the index of the highest-scoring entry along dim 1 of the
        model output, and the raw model output itself.
    """
    sentence_tokens = tokenizer.tokenize(sentence)
    aspect_tokens = tokenizer.tokenize(aspect)

    # Input layout: [cls] <sentence tokens> [sep] <aspect tokens>
    # NOTE(review): the special tokens are spelled in lowercase here; the
    # bert-base-uncased vocabulary presumably lists them as '[CLS]' / '[SEP]'.
    # Confirm how the checkpoint was trained before changing the spelling.
    word_pieces = ['[cls]', *sentence_tokens, '[sep]', *aspect_tokens]

    # Segment 0 spans [cls] + sentence + [sep]; segment 1 spans the aspect tokens.
    segment_ids = [0] * (len(sentence_tokens) + 2) + [1] * len(aspect_tokens)

    token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([token_ids]).to(DEVICE)
    segment_tensor = torch.tensor(segment_ids).to(DEVICE)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = modelABSA(input_tensor, None, None, segments_tensors=segment_tensor)
        predictions = torch.max(outputs, dim=1).indices

    return word_pieces, predictions, outputs

def predict_model_ATE(sentence, tokenizer):
    """Label every token of `sentence` with `modelATE`.

    Args:
        sentence: Text to tag.
        tokenizer: Object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        A tuple `(word_pieces, predictions, outputs)`: the tokens of the
        sentence, a plain list holding, for the first batch element, the index
        of the highest-scoring entry along dim 2 for each position, and the raw
        model output.
    """
    # Copy into a fresh list; no special tokens are added for this model.
    word_pieces = list(tokenizer.tokenize(sentence))

    token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([token_ids]).to(DEVICE)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = modelATE(input_tensor, None, None)
        label_ids = torch.max(outputs, dim=2).indices

    # Only one sequence is in the batch, so unwrap it into a Python list.
    predictions = label_ids[0].tolist()

    return word_pieces, predictions, outputs

def ATE_ABSA(text):
    """Extract aspect terms from `text` and classify the sentiment towards each.

    The text is first tagged by `predict_model_ATE`; tokens labelled 1 open a
    new term and tokens labelled 2 extend the current one. Every reconstructed
    term is then passed, together with the full text, to `predict_model_ABSA`.

    Args:
        text: Review text to analyse.

    Returns:
        A list of dicts with the keys 'term' (stripped term string), 'class'
        ('negative' / 'neutral' / 'positive') and 'probability' (softmax over
        the first row of the ABSA logits, as a list of floats). Exact duplicate
        entries are emitted once. Blank or non-string input yields `[]`.
    """
    # Blank text produces no tokens and would otherwise reach the model as an
    # empty (1, 0) tensor; there is nothing to extract, so return early.
    if not isinstance(text, str) or not text.strip():
        return []

    mapper = {0: 'negative', 1: 'neutral', 2: 'positive'}

    tokens, labels, _ = predict_model_ATE(text, tokenizer)

    # Rebuild terms from the per-token labels.
    terms = []
    word = ""
    for i, label in enumerate(labels):
        if label == 1:
            # A new term starts: flush the one collected so far.
            if len(word) != 0:
                terms.append(word.replace(" ##", ""))
            word = tokens[i]
        if label == 2:
            word += (" " + tokens[i])

    # Flush the term that was still open when the tokens ran out.
    if len(word) != 0:
        # " ##" marks a WordPiece continuation; glue it back onto its stem.
        terms.append(word.replace(" ##", ""))

    storage = []
    # Matches terms made up solely of whitespace / non-word characters.
    pattern = r'^[\s\W]+$'
    for term in terms:
        if re.fullmatch(pattern, term):
            continue
        _, c, logits = predict_model_ABSA(text, term, tokenizer)
        prob = F.softmax(logits[0], dim=0)  # class probabilities for this term
        result = {
            'term': term.strip(),
            'class': mapper.get(int(c)),
            'probability': [float(p) for p in prob.cpu().numpy()]
        }
        # Make sure it's unique
        if result not in storage:
            storage.append(result)

    return storage

In [10]:
def adjust_aspect(x):
    """Merge WordPiece fragments ('#'-marked terms) back into the term before them.

    The list is walked from the last entry to the first. Entries whose term
    contains '#' are collected until the first preceding entry without '#'
    (the head) is reached; head and fragments are then fused into one entry.

    Args:
        x: List of dicts with the keys 'term', 'class' and 'probability'.

    Returns:
        A list holding the untouched entries (in reverse input order) followed
        by one merged entry per head/fragment group. A merged entry takes its
        'class' and 'probability' from the group member with the highest single
        probability. Fragments at the very start of `x`, with no head before
        them, are not included in the result. An empty `x` is returned as is.
    """
    if len(x) == 0:
        return x

    result = []
    groups = []    # each tuple is (last fragment, ..., first fragment, head)
    pending = []   # fragments seen so far that still wait for their head

    # Separate aspects
    for idx in range(len(x) - 1, -1, -1):
        current = x[idx]

        if "#" in current['term']:
            pending.append(current)
        elif idx < len(x) - 1 and "#" in x[idx + 1]['term']:
            # `current` is the head of the fragments collected just before it.
            pending.append(current)
            groups.append(tuple(pending))
            pending = []
        else:
            result.append(current)

    # Handling
    for group in groups:
        # Members are stored last-piece-first, so each cleaned piece is
        # prepended to restore the original reading order.
        term = ""
        for item in group:
            term = item['term'].replace("#", "").strip() + term

        # The most confident member decides class and probability
        # (the first one wins on ties).
        best_item = max(group, key=lambda item: max(item.get('probability')))

        result.append({
            'term': term,
            'class': best_item.get('class'),
            'probability': best_item.get('probability'),
        })

    return result

In [11]:
# Pick a single processed review (positional row 13) as a quick input for
# trying out the pipeline functions defined above.
sample = df['review_processed'].iloc[13]
sample

'Its awesome.Wonderful phone'

In [12]:
# Run term extraction followed by per-term sentiment classification on the sample.
test = ATE_ABSA(sample)

test

[{'term': 'phone',
  'class': 'positive',
  'probability': [8.548454206902534e-05,
   0.00011841457308037207,
   0.9997960925102234]}]

In [13]:
# Merge any '#'-marked WordPiece fragments in the sample result; the sample has
# none, so the displayed output equals `test`.
adjust_aspect(test)

[{'term': 'phone',
  'class': 'positive',
  'probability': [8.548454206902534e-05,
   0.00011841457308037207,
   0.9997960925102234]}]

In [14]:
def process(x):
    """Run ATE_ABSA on one review text and merge split terms with adjust_aspect."""
    return adjust_aspect(ATE_ABSA(x))

# Register `progress_apply` on pandas objects so the per-review pass shows a progress bar.
tqdm.pandas()
df['aspect_sentiment'] = df['review_processed'].progress_apply(process)

100%|██████████████████████████████████████████████████████████████████████████████████| 87/87 [00:03<00:00, 24.94it/s]


In [16]:
df.head()

Unnamed: 0,review_id,review_time,review,rating,helpfulVotes,review_processed,aspect_sentiment
0,0,2019-02-14,Best phone money can buy.So far this phone is ...,5,3.0,Best phone money can buy.So far this phone is ...,[]
1,1,2019-08-02,Pixel 3 has frequent display defects.I like th...,3,20.0,There is a pink tint on the screen. Amazon doe...,"[{'term': 'display', 'class': 'negative', 'pro..."
2,2,2018-04-30,the lower left corner of the phone doesnt work...,2,1.0,The lower left corner of the phone doesn't wor...,"[{'term': 'styluses', 'class': 'negative', 'pr..."
3,3,2017-01-16,"Great phone, be careful with the sellers.The p...",3,10.0,"The phone is wonderful, if you're considering ...","[{'term': 'lock button', 'class': 'negative', ..."
4,4,2019-06-22,U had no issue with the seller they were great...,1,3.0,I was very disappointed in the way this phone ...,"[{'term': 'seller', 'class': 'positive', 'prob..."


In [17]:
# df.to_excel('fail.xlsx', index=False)

In [22]:
# Flag the reviews for which no aspect term was extracted.
mask = df['aspect_sentiment'].apply(lambda x: len(x) == 0).values

# Keep only reviews with at least one aspect; .copy() gives an independent frame.
df = df[~ mask].copy()

# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 79 entries, 1 to 86
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   review_id         79 non-null     int64  
 1   review_time       79 non-null     object 
 2   review            79 non-null     object 
 3   rating            79 non-null     int64  
 4   helpfulVotes      79 non-null     float64
 5   review_processed  79 non-null     object 
 6   aspect_sentiment  79 non-null     object 
dtypes: float64(1), int64(2), object(4)
memory usage: 4.9+ KB
None


Unnamed: 0,review_id,review_time,review,rating,helpfulVotes,review_processed,aspect_sentiment
1,1,2019-08-02,Pixel 3 has frequent display defects.I like th...,3,20.0,There is a pink tint on the screen. Amazon doe...,"[{'term': 'display', 'class': 'negative', 'pro..."
2,2,2018-04-30,the lower left corner of the phone doesnt work...,2,1.0,The lower left corner of the phone doesn't wor...,"[{'term': 'styluses', 'class': 'negative', 'pr..."
3,3,2017-01-16,"Great phone, be careful with the sellers.The p...",3,10.0,"The phone is wonderful, if you're considering ...","[{'term': 'lock button', 'class': 'negative', ..."
4,4,2019-06-22,U had no issue with the seller they were great...,1,3.0,I was very disappointed in the way this phone ...,"[{'term': 'seller', 'class': 'positive', 'prob..."
5,6,2019-08-12,Good phone!.I personally think this phone is n...,5,1.0,I have used this phone for a while now and it'...,"[{'term': 'screen', 'class': 'negative', 'prob..."


In [44]:
# Get context

def get_context(sentence, term):
    """Return the sentences of `sentence` that mention `term`.

    A sentence is kept when it contains the whole term as a phrase, or,
    failing that, when it contains every space-separated word of the term
    somewhere. Matching is case-insensitive and anchored on word boundaries.

    Args:
        sentence: Text to split into sentences with `sent_tokenize`.
        term: Aspect term to look for; it is matched literally.

    Returns:
        List of matching sentences, in their original order.
    """
    # Build the patterns once instead of once per sentence.
    phrase_re = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
    # Each word is escaped as well, so regex metacharacters inside a term
    # (e.g. '.', '+', '(') are matched literally instead of raising re.error
    # or matching unrelated text.
    word_res = [
        re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
        for word in term.split(" ")
    ]

    storage = []
    for sent in sent_tokenize(sentence):
        if phrase_re.search(sent) or all(w.search(sent) for w in word_res):
            storage.append(sent)

    return storage

def get_terms(row):
    """Attach the supporting sentences to each aspect entry of one review row.

    Args:
        row: Mapping-like object exposing `.get`, with 'aspect_sentiment'
            (list of aspect dicts) and 'review_processed' (review text).

    Returns:
        A new list with a shallow copy of every aspect dict, each extended by
        a 'context' key holding the result of `get_context` for its term. An
        empty aspect list gives an empty list.
    """
    review_text = row.get('review_processed')

    enriched = []
    for aspect in row.get('aspect_sentiment'):
        # Copy first so the dict stored in the row is left untouched.
        entry = aspect.copy()
        entry['context'] = get_context(review_text, aspect.get('term'))
        enriched.append(entry)

    return enriched

In [45]:
# Replace each row's aspect list with a copy whose entries also carry the
# sentences that mention the term (see get_terms).
df['aspect_sentiment'] = [get_terms(row) for _, row in df.iterrows()]

RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 2
RULES 2
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 2
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1
RULES 2
RULES 1
RULES 1
RULES 1
RULES 1
RULES 1


In [46]:
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 79 entries, 1 to 86
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   review_id         79 non-null     int64  
 1   review_time       79 non-null     object 
 2   review            79 non-null     object 
 3   rating            79 non-null     int64  
 4   helpfulVotes      79 non-null     float64
 5   review_processed  79 non-null     object 
 6   aspect_sentiment  79 non-null     object 
dtypes: float64(1), int64(2), object(4)
memory usage: 7.0+ KB
None


Unnamed: 0,review_id,review_time,review,rating,helpfulVotes,review_processed,aspect_sentiment
1,1,2019-08-02,Pixel 3 has frequent display defects.I like th...,3,20.0,There is a pink tint on the screen. Amazon doe...,"[{'term': 'display', 'class': 'negative', 'pro..."
2,2,2018-04-30,the lower left corner of the phone doesnt work...,2,1.0,The lower left corner of the phone doesn't wor...,"[{'term': 'styluses', 'class': 'negative', 'pr..."
3,3,2017-01-16,"Great phone, be careful with the sellers.The p...",3,10.0,"The phone is wonderful, if you're considering ...","[{'term': 'lock button', 'class': 'negative', ..."
4,4,2019-06-22,U had no issue with the seller they were great...,1,3.0,I was very disappointed in the way this phone ...,"[{'term': 'seller', 'class': 'positive', 'prob..."
5,6,2019-08-12,Good phone!.I personally think this phone is n...,5,1.0,I have used this phone for a while now and it'...,"[{'term': 'screen', 'class': 'negative', 'prob..."


In [47]:
# Convert the frame to {index label: {column name: value}} for JSON export.
result = df.to_dict('index')

result

{'1': {'review_id': 1,
  'review_time': '2019-08-02',
  'review': 'Pixel 3 has frequent display defects.I like this phone model. I bought this phone as a gift to my wife. Unfortunately, the phone turned out to be defective. There is a pink tint on the screen. Amazon does not want to properly help customers who are not in the USA. This is very sad.',
  'rating': 3,
  'helpfulVotes': 20.0,
  'review_processed': 'There is a pink tint on the screen. Amazon does not want to properly help customers who are not in the U.S. This is very sad. I like this phone model, but it has frequent display defects. I bought it as a gift for my wife, but the phone is defective.',
  'aspect_sentiment': [{'term': 'display',
    'class': 'negative',
    'probability': [0.9978808760643005,
     0.0004631515475921333,
     0.0016559597570449114],
    'context': ['I like this phone model, but it has frequent display defects.']},
   {'term': 'screen',
    'class': 'negative',
    'probability': [0.9974362254142761

In [48]:
# Write the enriched reviews to disk as JSON, indented by 4 spaces for readability.
with open("meta-data_phone.json", "w") as file:
    json.dump(result, file, indent=4)