In [74]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json

import torch
import torch.nn.functional as F
from transformers import BertTokenizer

from tqdm import tqdm

In [75]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.tree import Tree

# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

In [76]:
from model.model import bertATE, bertABSA
from transformers import pipeline

In [77]:
def pruning(list_: list) -> list:
  """
    Remove single-word terms that are already covered by a multi-word term.

    A term counts as multi-word when it contains at least one space. A
    single-word term is kept only if no multi-word term contains it between
    word boundaries (the comparison is case-sensitive).

    Returns the kept single-word terms followed by every multi-word term; each
    group keeps its original relative order.
  """
  singles, phrases = [], []
  for term in list_:
    bucket = phrases if " " in term else singles
    bucket.append(term)

  def covered_by_phrase(single):
    # \b anchors stop "car" from matching inside "cartoon".
    bounded = r'\b' + re.escape(single) + r'\b'
    for phrase in phrases:
      if re.search(bounded, phrase):
        return True
    return False

  kept = [single for single in singles if not covered_by_phrase(single)]
  return kept + phrases


In [78]:
# Define helper function
def is_abnormal_noun(text):
  """
    Tell whether `text` is too noisy to be used as a single-word aspect.

    Returns True when `text` is made up solely of digits and/or non-word
    characters, or when it is shorter than three characters; otherwise False.
  """
  only_digits_or_symbols = re.match(r'^[0-9\W]+$', text) is not None
  return only_digits_or_symbols or len(text) < 3



def extract_aspects(sentence):
    """
    Extract candidate aspect terms from a sentence with POS-tag rules.

    The sentence is tokenized and POS-tagged with NLTK, then the rules run in
    this order:
      1. An adjacent "noun noun" or "adjective noun" pair is kept as a two-word
         term, and a noun directly followed by is/are/seems/feels is kept.
      2. A noun directly preceded by of/in/with/about is kept.
      3. A noun directly preceded by not/no is kept.
      4. Only when rules 1-3 produced nothing, every noun is kept.
    Single-word candidates are dropped when `is_abnormal_noun` flags them; the
    two-word candidates of rule 1 are not filtered.

    Returns a list of unique terms in first-seen order, so the result is the
    same on every run (a plain set of strings has no stable order between
    interpreter sessions).
    """
    linking_verbs = ('is', 'are', 'seems', 'feels')
    prepositions = ('of', 'in', 'with', 'about')
    negations = ('not', 'no')

    # Tokenize, then POS-tag: a list of (word, tag) tuples.
    tagged = pos_tag(word_tokenize(sentence))
    aspects = []

    def keep_noun(word):
        # Single words pass through the noise filter before being recorded.
        if not is_abnormal_noun(word):
            aspects.append(word)

    # Every rule below inspects a token together with its direct neighbour.
    neighbours = list(zip(tagged, tagged[1:]))

    # Rule 1: noun phrases (Noun + Noun or Adjective + Noun) ...
    for (word, tag), (next_word, next_tag) in neighbours:
        if next_tag.startswith('NN') and tag.startswith(('NN', 'JJ')):
            aspects.append(f"{word} {next_word}")

        # ... and a noun that is the subject of a linking verb.
        if tag.startswith('NN') and next_word.lower() in linking_verbs:
            keep_noun(word)

    # Rule 2: nouns in prepositional phrases.
    for (prev_word, _), (word, tag) in neighbours:
        if tag.startswith('NN') and prev_word.lower() in prepositions:
            keep_noun(word)

    # Rule 3: nouns that directly follow a negation.
    for (word, _), (next_word, next_tag) in neighbours:
        if word.lower() in negations and next_tag.startswith('NN'):
            keep_noun(next_word)

    # Rule 4: fall back to every noun when nothing was found so far.
    if not aspects:
        for word, tag in tagged:
            if tag.startswith('NN'):
                keep_noun(word)

    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(aspects))

def extract_rules(sentence):
  """
    Rule-based aspect extraction for one sentence.

    Runs `extract_aspects` and hands its result to `pruning`, which removes
    single-word terms already contained in a multi-word term.
  """
  return pruning(extract_aspects(sentence))

# Load Model

In [79]:
# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda:0")
else:
    DEVICE = torch.device("cpu")

DEVICE

device(type='cuda', index=0)

In [80]:
# Pretrained checkpoint name shared by the tokenizer and the ABSA model.
pretrain_model_name = "bert-base-uncased"
# Learning rate; in the visible cells it is only referenced by the
# commented-out optimizer line at the bottom of this cell.
lr = 2e-5

tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

# Build the ABSA classifier, then move it onto the selected device.
modelABSA = bertABSA(pretrain_model_name)
modelABSA = modelABSA.to(DEVICE)
# optimizerABSA = torch.optim.Adam(model_ABSA.parameters(), lr=lr)

In [81]:
# The notebook only runs inference, so switch the model to eval mode
# (disables dropout and similar train-time behaviour).
modelABSA.eval()
# map_location keeps a GPU-saved checkpoint loadable on CPU-only machines;
# weights_only=True restricts unpickling to tensors/plain containers instead
# of arbitrary Python objects. The load report stays the cell's last
# expression so missing/unexpected keys remain visible.
modelABSA.load_state_dict(
    torch.load("model/bert_ABSA.pkl", map_location=DEVICE, weights_only=True),
    strict=False,
)

  modelABSA.load_state_dict(torch.load("model/bert_ABSA.pkl"), strict=False)


<All keys matched successfully>

In [82]:
# model = pipeline('text-classification', model='model/sentiment-0', device=0)

# Load Dataset

In [83]:
# Read the review records. JSON text is UTF-8 by specification, so the
# encoding is pinned instead of relying on the platform default.
with open("temp/temp-1.json", "r", encoding="utf-8") as file:
    data = json.load(file)

data

{'0': {'review_id': 0,
  'review_time': '2024-11-28',
  'like': 0,
  'review': 'The cars design is like it came from a cartoon.',
  'review_processed': 'The cars design is like it came from a cartoon.'},
 '1': {'review_id': 2,
  'review_time': '2024-11-27',
  'like': 0,
  'review': 'Tankz u for being so honest.\nMuch love and respect from trinidad',
  'review_processed': 'Tankz u for being so honest.\nMuch love and respect from trinidad'},
 '2': {'review_id': 3,
  'review_time': '2024-11-26',
  'like': 0,
  'review': 'Amazing review your really good at this love watching you man I feel like you could review anything. Well done',
  'review_processed': 'Amazing review your really good at this love watching you man I feel like you could review anything. Well done'},
 '3': {'review_id': 5,
  'review_time': '2024-11-25',
  'like': 0,
  'review': 'Honestly, the Cybertruck might be the first car I’d ever put a sticker on if I owned one someday. Otherwise, I’d constantly worry about accidental

In [84]:
# One row per review, indexed by the keys of the loaded JSON object.
df = pd.DataFrame.from_dict(data, orient='index')
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that only appends a stray "None" line).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1511 entries, 0 to 1510
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         1511 non-null   int64 
 1   review_time       1511 non-null   object
 2   like              1511 non-null   int64 
 3   review            1511 non-null   object
 4   review_processed  1511 non-null   object
dtypes: int64(2), object(3)
memory usage: 70.8+ KB
None


Unnamed: 0,review_id,review_time,like,review,review_processed
0,0,2024-11-28,0,The cars design is like it came from a cartoon.,The cars design is like it came from a cartoon.
1,2,2024-11-27,0,Tankz u for being so honest.\nMuch love and re...,Tankz u for being so honest.\nMuch love and re...
2,3,2024-11-26,0,Amazing review your really good at this love w...,Amazing review your really good at this love w...
3,5,2024-11-25,0,"Honestly, the Cybertruck might be the first ca...",The Cybertruck might be the first car I've eve...
4,6,2024-11-25,0,I will be buying one anytime soon add expensiv...,I will be buying one anytime soon. It's a very...


# Aspect-Based Sentiment Analysis Process

In [85]:
def predict_model_ABSA(sentence, aspect, tokenizer):
    """
    Classify the sentiment of `aspect` inside `sentence` with the global `modelABSA`.

    The model input is the word-piece sequence
    '[cls]' + sentence pieces + '[sep]' + aspect pieces. Segment ids are 0 for
    everything up to and including '[sep]' and 1 for the aspect pieces. The
    forward pass runs under `torch.no_grad()` on `DEVICE`.

    Returns a tuple (word_pieces, predictions, outputs): the token list fed to
    the model, the index of the largest value along dim 1 of the model output,
    and the raw model output.

    NOTE(review): the marker tokens are spelled in lower case here; confirm
    this matches how the model was trained before changing them.
    """
    sentence_pieces = tokenizer.tokenize(sentence)
    aspect_pieces = tokenizer.tokenize(aspect)

    word_pieces = ['[cls]', *sentence_pieces, '[sep]', *aspect_pieces]
    # +2 covers the '[cls]' and '[sep]' markers around the sentence pieces.
    segment_ids = [0] * (len(sentence_pieces) + 2) + [1] * len(aspect_pieces)

    token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([token_ids]).to(DEVICE)
    segment_tensor = torch.tensor(segment_ids).to(DEVICE)

    with torch.no_grad():
        outputs = modelABSA(input_tensor, None, None, segments_tensors=segment_tensor)
        # torch.max over a dim returns (values, indices); only the indices are needed.
        predictions = torch.max(outputs, dim=1)[1]

    return word_pieces, predictions, outputs

def ATE_ABSA(text):
    """
    Extract aspect terms from `text` and classify the sentiment of each one.

    Terms come from `extract_rules`. Every kept term is scored with
    `predict_model_ABSA`, and the first row of the model output is converted
    to probabilities with a softmax.

    Returns a list of unique dicts with the keys:
      'term'        - the aspect term with surrounding whitespace stripped
      'class'       - 'negative', 'neutral' or 'positive' (model class 0, 1, 2)
      'probability' - the softmax probabilities as Python floats, in class order
    The list is empty when no usable term is found.
    """
    mapper = {0: 'negative', 1: 'neutral', 2: 'positive'}
    # A term made only of whitespace / non-word characters carries no aspect.
    noise_pattern = r'^[\s\W]+$'

    storage = []
    for term in extract_rules(text):
        # Either condition alone marks the term as noise, so the two tests are
        # joined with `or`; with `and` only one-character symbols were skipped
        # and longer symbol-only terms reached the model.
        if len(term) < 2 or re.fullmatch(noise_pattern, term):
            continue

        _, c, logits = predict_model_ABSA(text, term, tokenizer)
        prob = F.softmax(logits[0], dim=0)  # class probabilities for the single input
        result = {
            'term': term.strip(),
            'class': mapper.get(int(c)),
            'probability': [float(p) for p in prob.cpu().numpy()]
        }
        # Make sure it's unique
        if result not in storage:
            storage.append(result)

    return storage

In [86]:
def adjust_aspect(x):
    """
    Merge word-piece fragments (terms containing '#') back into whole terms.

    `x` is a list of dicts with 'term', 'class' and 'probability' keys. The
    list is scanned from the last entry to the first. A run of consecutive
    entries whose term contains '#', together with the entry directly before
    the run, forms one group. A run at the very start of the list (no entry
    before it) forms a group on its own instead of being discarded.

    Each group becomes one entry: its term is the group's terms concatenated
    in their original order with every '#' removed, and its 'class' and
    'probability' are taken from the group member whose highest probability
    is the largest.

    Returns a new list holding the ungrouped entries in reverse input order
    followed by the merged entries. An empty input is returned as is.
    """
    if len(x) == 0:
        return x

    result = []
    groups = []
    pending = []  # fragments collected so far, last fragment first

    for idx in range(len(x) - 1, -1, -1):
        current = x[idx]

        if '#' in current['term']:
            pending.append(current)
        elif idx < len(x) - 1 and '#' in x[idx + 1]['term']:
            # `current` is the head of the fragments gathered behind it.
            pending.append(current)
            groups.append(tuple(pending))
            pending = []
        else:
            result.append(current)

    # Fragments that reach the start of the list have no head entry; keep them
    # as their own group so no input entry is lost.
    if pending:
        groups.append(tuple(pending))

    for group in groups:
        term = ""
        for item in group:
            # Members are stored last-to-first, so prepend to restore the order.
            term = item['term'].replace("#", "").strip() + term

        # The most confident member decides the class of the merged term.
        best_item = max(group, key=lambda item: max(item.get('probability')))
        result.append({
            'term': term,
            'class': best_item.get('class'),
            'probability': best_item.get('probability'),
        })

    return result

In [87]:
# Take one processed review (row position 16) as a quick manual check input.
sample = df['review_processed'].iloc[16]
sample

'I actually like how it looks Imagine my surprise today when I saw learned people found it ugly?'

In [88]:
# Run aspect extraction and sentiment classification on the sample review.
test = ATE_ABSA(sample)

test

[{'term': 'surprise today',
  'class': 'positive',
  'probability': [0.003896825248375535,
   0.017867902293801308,
   0.9782352447509766]},
 {'term': 'learned people',
  'class': 'negative',
  'probability': [0.6598256826400757,
   0.33099207282066345,
   0.009182200767099857]}]

In [89]:
adjust_aspect(test)

[{'term': 'learned people',
  'class': 'negative',
  'probability': [0.6598256826400757,
   0.33099207282066345,
   0.009182200767099857]},
 {'term': 'surprise today',
  'class': 'positive',
  'probability': [0.003896825248375535,
   0.017867902293801308,
   0.9782352447509766]}]

In [91]:
ATE_ABSA(sample)

[{'term': 'surprise today',
  'class': 'positive',
  'probability': [0.003896825248375535,
   0.017867902293801308,
   0.9782352447509766]},
 {'term': 'learned people',
  'class': 'negative',
  'probability': [0.6598256826400757,
   0.33099207282066345,
   0.009182200767099857]}]

**Process Data**

In [92]:
def process(x):
    """
    Run aspect-based sentiment analysis on one review text.

    Returns the list produced by `ATE_ABSA(x)` unchanged; the `adjust_aspect`
    post-processing step is currently disabled.
    """
    return ATE_ABSA(x)

# Register tqdm's progress_apply on pandas objects, then analyse every processed review.
tqdm.pandas()
df['aspect_sentiment'] = df['review_processed'].progress_apply(process)

100%|██████████████████████████████████████████████████████████████████████████████| 1511/1511 [02:06<00:00, 11.94it/s]


In [93]:
# tqdm.pandas()

# df['sentiment'] = df['review_processed'].progress_apply(lambda x: model(x)[0]['label'])

In [94]:
df.head()

Unnamed: 0,review_id,review_time,like,review,review_processed,aspect_sentiment
0,0,2024-11-28,0,The cars design is like it came from a cartoon.,The cars design is like it came from a cartoon.,"[{'term': 'cars design', 'class': 'negative', ..."
1,2,2024-11-27,0,Tankz u for being so honest.\nMuch love and re...,Tankz u for being so honest.\nMuch love and re...,"[{'term': 'Much love', 'class': 'positive', 'p..."
2,3,2024-11-26,0,Amazing review your really good at this love w...,Amazing review your really good at this love w...,"[{'term': 'Well', 'class': 'positive', 'probab..."
3,5,2024-11-25,0,"Honestly, the Cybertruck might be the first ca...",The Cybertruck might be the first car I've eve...,"[{'term': 'first car', 'class': 'negative', 'p..."
4,6,2024-11-25,0,I will be buying one anytime soon add expensiv...,I will be buying one anytime soon. It's a very...,"[{'term': 'stuff', 'class': 'positive', 'proba..."


In [95]:
# df.to_excel('fail.xlsx', index=False)

In [96]:
import re

# The sentence
sentence = ("The line was extremely big and was moving very slowly, plus they were out of apples and strawberries, "
            "but they gave me a substitute cookie. The staff were very apologetic for being slow, but it was the middle "
            "of the night and they were extremely under-staffed.")

# Check that every word occurs as a whole word (case-insensitive) using all();
# re.escape keeps each word literal inside the pattern.
result = all(
    re.search(r'\b' + re.escape(word) + r'\b', sentence, re.IGNORECASE)
    for word in ["apples", "strawberries"]
)
print(result)  # Output: True



True


In [108]:
def get_context(sentence, term):
    """
    Return the sentences of `sentence` that mention `term`.

    The text is split with NLTK's `sent_tokenize`. A sentence is kept when it
    contains the whole term as one phrase or, failing that, when it contains
    every space-separated word of the term somewhere. Matching is
    case-insensitive and anchored on word boundaries. The term and its words
    are always escaped, so regex metacharacters in a term are matched
    literally and can no longer raise a pattern error.

    Returns the matching sentences in their original order; an empty list
    when nothing matches or when `term` is not a string.
    """
    sentences = sent_tokenize(sentence)
    if not isinstance(term, str):
        # A non-string term cannot be searched for; report "no context".
        return []

    # The patterns do not depend on the sentence, so build them once.
    phrase_re = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
    word_res = [
        re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
        for word in term.split(" ")
    ]

    storage = []
    for sent in sentences:
        if phrase_re.search(sent) or all(word_re.search(sent) for word_re in word_res):
            storage.append(sent)

    return storage

def get_terms(row):
    """
    Add context sentences to every aspect of a review row and normalise its term.

    `row` must provide `.get` for 'aspect_sentiment' (a list of aspect dicts)
    and 'review_processed' (the review text). For each aspect a shallow copy
    is made; its 'context' is set to the sentences `get_context` returns for
    the original term, and its 'term' is replaced by a cleaned form: every
    character other than ASCII letters, digits and whitespace is removed, the
    text is lower-cased, passed through `lemmatizer.lemmatize`, and stripped.

    Returns the list of updated copies, or [] when the row has no aspects.

    NOTE(review): the lemmatizer receives the whole term in a single call,
    including multi-word terms - confirm this is the intended behaviour.
    """
    items = row.get('aspect_sentiment')
    sentence = row.get('review_processed')

    if len(items) == 0:
        return []

    result = []
    for item in items:
        enriched = item.copy()
        raw_term = item.get('term')

        # Context is looked up with the original term, before any cleaning.
        enriched['context'] = get_context(sentence, raw_term)

        cleaned = re.sub(r'[^A-Za-z0-9\s]', '', raw_term).lower()
        enriched['term'] = lemmatizer.lemmatize(cleaned).strip()

        result.append(enriched)

    return result

In [109]:
# Replace each row's aspect list with its context-enriched, normalised version.
df['aspect_sentiment'] = [get_terms(review_row) for _idx, review_row in df.iterrows()]

In [110]:
df.head()

Unnamed: 0,review_id,review_time,like,review,review_processed,aspect_sentiment
0,0,2024-11-28,0,The cars design is like it came from a cartoon.,The cars design is like it came from a cartoon.,"[{'term': 'cars design', 'class': 'negative', ..."
1,2,2024-11-27,0,Tankz u for being so honest.\nMuch love and re...,Tankz u for being so honest.\nMuch love and re...,"[{'term': 'much love', 'class': 'positive', 'p..."
2,3,2024-11-26,0,Amazing review your really good at this love w...,Amazing review your really good at this love w...,"[{'term': 'well', 'class': 'positive', 'probab..."
3,5,2024-11-25,0,"Honestly, the Cybertruck might be the first ca...",The Cybertruck might be the first car I've eve...,"[{'term': 'first car', 'class': 'negative', 'p..."
4,6,2024-11-25,0,I will be buying one anytime soon add expensiv...,I will be buying one anytime soon. It's a very...,"[{'term': 'stuff', 'class': 'positive', 'proba..."


In [111]:
# Turn the frame into a nested dict keyed by row index so it can be written out as JSON.
result = df.to_dict('index')

In [112]:
result

{'0': {'review_id': 0,
  'review_time': '2024-11-28',
  'like': 0,
  'review': 'The cars design is like it came from a cartoon.',
  'review_processed': 'The cars design is like it came from a cartoon.',
  'aspect_sentiment': [{'term': 'cars design',
    'class': 'negative',
    'probability': [0.9754676222801208,
     0.002085205866023898,
     0.02244718186557293],
    'context': ['The cars design is like it came from a cartoon.']}]},
 '1': {'review_id': 2,
  'review_time': '2024-11-27',
  'like': 0,
  'review': 'Tankz u for being so honest.\nMuch love and respect from trinidad',
  'review_processed': 'Tankz u for being so honest.\nMuch love and respect from trinidad',
  'aspect_sentiment': [{'term': 'much love',
    'class': 'positive',
    'probability': [0.0006950621027499437,
     0.0009537364239804447,
     0.9983512163162231],
    'context': ['Much love and respect from trinidad']},
   {'term': 'tankz u',
    'class': 'positive',
    'probability': [0.00012535897258203477,
     

In [113]:
# Write the per-review records (including the aspect results) to disk as indented JSON.
with open("temp/meta-data.json", "w") as file:
    json.dump(result, file, indent=4)

In [114]:
# df.to_excel('preprocessed-mcd.xlsx', index=False)
# df.to_csv('preprocessed-mcd.csv', index=False)

In [115]:
# Release cached GPU memory at the end of the run. The guard makes this cell a
# no-op on CPU-only machines, where touching the CUDA memory statistics raises.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()  # Reset peak stats for debugging
    torch.cuda.empty_cache()  # Clear cache
