In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json

import torch
import torch.nn.functional as F
from transformers import BertTokenizer

from tqdm import tqdm

In [3]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [4]:
from model.model import bertATE, bertABSA
from transformers import pipeline

# Load Model

In [5]:
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

DEVICE

device(type='cuda', index=0)

In [6]:
pretrain_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

lr = 2e-5
modelATE = bertATE(pretrain_model_name).to(DEVICE)
# optimizerATE = torch.optim.Adam(model_ATE.parameters(), lr=lr)
modelABSA = bertABSA(pretrain_model_name).to(DEVICE)
# optimizerABSA = torch.optim.Adam(model_ABSA.parameters(), lr=lr)

In [7]:
modelABSA.load_state_dict(torch.load("model/bert_ABSA.pkl"), strict=False)
modelATE.load_state_dict(torch.load("model/bert_ATE.pkl"), strict=False)

  modelABSA.load_state_dict(torch.load("model/bert_ABSA.pkl"), strict=False)
  modelATE.load_state_dict(torch.load("model/bert_ATE.pkl"), strict=False)


<All keys matched successfully>

In [8]:
# model = pipeline('text-classification', model='model/sentiment-0', device=0)

# Load Dataset

In [11]:
with open("temp/temp-1.json", "r") as file:
    data = json.load(file)

data

{'0': {'review_id': 0,
  'review_time': '2024-11-28',
  'like': 0,
  'review': 'The cars design is like it came from a cartoon.',
  'review_processed': 'The cars design is like it came from a cartoon.'},
 '1': {'review_id': 2,
  'review_time': '2024-11-27',
  'like': 0,
  'review': 'Tankz u for being so honest.\nMuch love and respect from trinidad',
  'review_processed': 'Tankz u for being so honest.\nMuch love and respect from trinidad'},
 '2': {'review_id': 3,
  'review_time': '2024-11-26',
  'like': 0,
  'review': 'Amazing review your really good at this love watching you man I feel like you could review anything. Well done',
  'review_processed': 'Amazing review your really good at this love watching you man I feel like you could review anything. Well done'},
 '3': {'review_id': 5,
  'review_time': '2024-11-25',
  'like': 0,
  'review': 'Honestly, the Cybertruck might be the first car I’d ever put a sticker on if I owned one someday. Otherwise, I’d constantly worry about accidental

In [12]:
df = pd.DataFrame.from_dict(data, orient='index')
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1511 entries, 0 to 1510
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         1511 non-null   int64 
 1   review_time       1511 non-null   object
 2   like              1511 non-null   int64 
 3   review            1511 non-null   object
 4   review_processed  1511 non-null   object
dtypes: int64(2), object(3)
memory usage: 70.8+ KB
None


Unnamed: 0,review_id,review_time,like,review,review_processed
0,0,2024-11-28,0,The cars design is like it came from a cartoon.,The cars design is like it came from a cartoon.
1,2,2024-11-27,0,Tankz u for being so honest.\nMuch love and re...,Tankz u for being so honest.\nMuch love and re...
2,3,2024-11-26,0,Amazing review your really good at this love w...,Amazing review your really good at this love w...
3,5,2024-11-25,0,"Honestly, the Cybertruck might be the first ca...",The Cybertruck might be the first car I've eve...
4,6,2024-11-25,0,I will be buying one anytime soon add expensiv...,I will be buying one anytime soon. It's a very...


# Aspect Based Sentiment Analysis Process

In [13]:
def predict_model_ABSA(sentence, aspect, tokenizer):
    t1 = tokenizer.tokenize(sentence)
    t2 = tokenizer.tokenize(aspect)

    word_pieces = ['[cls]']
    word_pieces += t1
    word_pieces += ['[sep]']
    word_pieces += t2

    segment_tensor = [0] + [0]*len(t1) + [0] + [1]*len(t2)

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)
    segment_tensor = torch.tensor(segment_tensor).to(DEVICE)

    with torch.no_grad():
        outputs = modelABSA(input_tensor, None, None, segments_tensors=segment_tensor)
        _, predictions = torch.max(outputs, dim=1)
    
    return word_pieces, predictions, outputs

def predict_model_ATE(sentence, tokenizer):
    word_pieces = []
    tokens = tokenizer.tokenize(sentence)
    word_pieces += tokens

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)

    with torch.no_grad():
        outputs = modelATE(input_tensor, None, None)
        _, predictions = torch.max(outputs, dim=2)
    predictions = predictions[0].tolist()

    return word_pieces, predictions, outputs

def ATE_ABSA(text):
    mapper = { 0: 'negative', 1: 'neutral', 2: 'positive'}
    terms = []
    word = ""
    x, y, z = predict_model_ATE(text, tokenizer)
    for i in range(len(y)):
        if y[i] == 1:
            if len(word) != 0:
                terms.append(word.replace(" ##",""))
            word = x[i]
        if y[i] == 2:
            word += (" " + x[i])
            
    
    if len(word) != 0:
            terms.append(word.replace(" ##",""))
            
    storage = []
    pattern = r'^[\s\W]+$'
    if len(terms) != 0:
        # print("TERMS:", terms)
        for term in terms:
            if bool(re.fullmatch(pattern, term)) and len(term) < 2:
                continue
            _, c, logits = predict_model_ABSA(text, term, tokenizer)
            prob = F.softmax(logits[0], dim=0)  # Apply softmax along the appropriate dimension
            result = {
                'term': term.strip(),
                'class': mapper.get(int(c)),
                'probability': [float(p) for p in prob.cpu().numpy()]                 
            }
            # Make sure it's unique
            if result not in storage:
                storage.append(result)
    
    return storage

In [14]:
def adjust_aspect(x):
    if len(x) == 0:
        return x
    
    result = []
    separate = []
    temp = []
    idx = len(x) - 1
    pattern = r"#"

    # Separate aspects
    while idx >= 0 :
        next = x[idx - 1]
        current = x[idx]
            
        if re.search(pattern, current['term']): 
            temp.append(current)
        elif (idx < len(x) - 1 and re.search(pattern, x[idx + 1]['term'])):
            temp.append(current)
            separate.append(tuple(temp))
            temp = []
        else:
            result.append(current)

        idx -= 1

    # Handling
    for s in separate:
        term = ""
        scores = []
        for item in s:
            scores.append(max(item.get('probability')))
            term = re.sub(r"#*", "", item['term']).strip() + term
            
        best_index = scores.index(max(scores))
        best_item = s[best_index]
        class_ = best_item.get('class')
        prob = best_item.get('probability')

        result.append({'term': term, 'class': class_, 'probability': prob})
        
    return result

In [15]:
sample = df['review_processed'].iloc[13]
sample

"I can't believe there are people that don't have a clue what that truck is. The idea of a 90s mini-truck version of a Cyber Truck gives me a woody like I was back in high school again."

In [16]:
test = ATE_ABSA(sample)

test

[]

In [17]:
adjust_aspect(test)

[]

**Process Data**

In [31]:
def process(x):
    result = ATE_ABSA(x)
    result = adjust_aspect(result)
    return result

tqdm.pandas()
df['aspect_sentiment'] = df['review_processed'].progress_apply(lambda x: process(x))

100%|██████████████████████████████████████████████████████████████████████████████| 1511/1511 [00:37<00:00, 39.77it/s]


In [32]:
# tqdm.pandas()

# df['sentiment'] = df['review_processed'].progress_apply(lambda x: model(x)[0]['label'])

In [33]:
df.head()

Unnamed: 0,review_id,review_time,like,review,review_processed,aspect_sentiment
0,0,2024-11-28,0,The cars design is like it came from a cartoon.,The cars design is like it came from a cartoon.,"[{'term': 'cars', 'class': 'negative', 'probab..."
1,2,2024-11-27,0,Tankz u for being so honest.\nMuch love and re...,Tankz u for being so honest.\nMuch love and re...,"[{'term': 'tank', 'class': 'positive', 'probab..."
2,3,2024-11-26,0,Amazing review your really good at this love w...,Amazing review your really good at this love w...,[]
3,5,2024-11-25,0,"Honestly, the Cybertruck might be the first ca...",The Cybertruck might be the first car I've eve...,"[{'term': 'stick', 'class': 'negative', 'proba..."
4,6,2024-11-25,0,I will be buying one anytime soon add expensiv...,I will be buying one anytime soon. It's a very...,"[{'term': 'truck', 'class': 'negative', 'proba..."


In [34]:
# df.to_excel('fail.xlsx', index=False)

In [35]:
import re

# The sentence
sentence = ("The line was extremely big and was moving very slowly, plus they were out of apples and strawberries, "
            "but they gave me a substitute cookie. The staff were very apologetic for being slow, but it was the middle "
            "of the night and they were extremely under-staffed.")

# Check for both words using all()
result = all(re.search(rf'\b{word}\b', sentence, re.IGNORECASE) for word in ["apples", "strawberries"])
print(result)  # Output: False



True


In [36]:
def get_context(sentence, term):
    storage = []

    for sent in sent_tokenize(sentence):
        pattern = r'\b' + re.escape(term) + r'\b'
        if re.search(pattern, sent, re.IGNORECASE):
            storage.append(sent)
        elif all(re.search(rf'\b{word}\b', sent, re.IGNORECASE) for word in term.split(" ")):
            storage.append(sent)

    return storage

def get_terms(row):
    result = []
    items = row.get('aspect_sentiment')
    sentence = row.get('review_processed')
    
    if len(items) > 0:
        for item in items:
            temp = item.copy()
            term = item.get('term')
            context = get_context(sentence, term)
            temp['context'] = context
            result.append(temp)
        return result
    return []

In [37]:
df['aspect_sentiment'] = [get_terms(row) for _, row in df.iterrows()]

In [38]:
df.head()

Unnamed: 0,review_id,review_time,like,review,review_processed,aspect_sentiment
0,0,2024-11-28,0,The cars design is like it came from a cartoon.,The cars design is like it came from a cartoon.,"[{'term': 'cars', 'class': 'negative', 'probab..."
1,2,2024-11-27,0,Tankz u for being so honest.\nMuch love and re...,Tankz u for being so honest.\nMuch love and re...,"[{'term': 'tank', 'class': 'positive', 'probab..."
2,3,2024-11-26,0,Amazing review your really good at this love w...,Amazing review your really good at this love w...,[]
3,5,2024-11-25,0,"Honestly, the Cybertruck might be the first ca...",The Cybertruck might be the first car I've eve...,"[{'term': 'stick', 'class': 'negative', 'proba..."
4,6,2024-11-25,0,I will be buying one anytime soon add expensiv...,I will be buying one anytime soon. It's a very...,"[{'term': 'truck', 'class': 'negative', 'proba..."


In [39]:
result = df.to_dict('index')

In [40]:
result

{'0': {'review_id': 0,
  'review_time': '2024-11-28',
  'like': 0,
  'review': 'The cars design is like it came from a cartoon.',
  'review_processed': 'The cars design is like it came from a cartoon.',
  'aspect_sentiment': [{'term': 'cars',
    'class': 'negative',
    'probability': [0.9722225069999695,
     0.00603770324960351,
     0.021739840507507324],
    'context': ['The cars design is like it came from a cartoon.']}]},
 '1': {'review_id': 2,
  'review_time': '2024-11-27',
  'like': 0,
  'review': 'Tankz u for being so honest.\nMuch love and respect from trinidad',
  'review_processed': 'Tankz u for being so honest.\nMuch love and respect from trinidad',
  'aspect_sentiment': [{'term': 'tank',
    'class': 'positive',
    'probability': [0.00017829200078267604,
     0.00019761022122111171,
     0.9996241331100464],
    'context': []}]},
 '2': {'review_id': 3,
  'review_time': '2024-11-26',
  'like': 0,
  'review': 'Amazing review your really good at this love watching you man 

In [41]:
with open("temp/meta-data.json", "w") as file:
    json.dump(result, file, indent=4)

In [42]:
# df.to_excel('preprocessed-mcd.xlsx', index=False)
# df.to_csv('preprocessed-mcd.csv', index=False)

In [43]:
torch.cuda.reset_peak_memory_stats()  # Reset peak stats for debugging
torch.cuda.empty_cache()  # Clear cache
