In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json

import torch
import torch.nn.functional as F
from transformers import BertTokenizer

from tqdm import tqdm

In [11]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the WordNet lemmatizer once so it can be reused by later cells.
# NOTE(review): `lemmatizer` is not referenced by any cell visible in this
# notebook — confirm it is still needed, otherwise this cell can be removed.
lemmatizer = WordNetLemmatizer()

In [12]:
from model.model import bertATE, bertABSA
from transformers import pipeline

# Load Model

In [13]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
# Every tensor and both models in this notebook are moved to this device.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Show the selected device as the cell output.
DEVICE

device(type='cuda', index=0)

In [14]:
# Name of the pretrained BERT checkpoint; it is used for the tokenizer and is
# also passed to both project model constructors below.
pretrain_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

# NOTE(review): `lr` is only referenced by the commented-out optimizer lines
# below, so it has no effect in this notebook as written.
lr = 2e-5
# Model called by predict_model_ATE, moved to DEVICE.
modelATE = bertATE(pretrain_model_name).to(DEVICE)
# NOTE(review): the commented-out optimizers refer to `model_ATE` / `model_ABSA`,
# names that are not defined here (the models are `modelATE` / `modelABSA`).
# optimizerATE = torch.optim.Adam(model_ATE.parameters(), lr=lr)
# Model called by predict_model_ABSA, moved to DEVICE.
modelABSA = bertABSA(pretrain_model_name).to(DEVICE)
# optimizerABSA = torch.optim.Adam(model_ABSA.parameters(), lr=lr)

In [15]:
# This notebook only runs inference, so put both models in evaluation mode.
# NOTE(review): assumes bertATE / bertABSA are torch.nn.Module subclasses
# (they already expose .to() and .load_state_dict()) — confirm.
modelABSA.eval()
modelATE.eval()

# Restore the fine-tuned weights.
# map_location=DEVICE lets a checkpoint that was saved on a GPU be loaded on a
# CPU-only machine (DEVICE falls back to "cpu" when CUDA is unavailable).
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. strict=False means missing/unexpected keys are reported in
# the return value instead of raising — check the displayed key report.
modelABSA.load_state_dict(torch.load("model/bert_ABSA.pkl", map_location=DEVICE), strict=False)
modelATE.load_state_dict(torch.load("model/bert_ATE.pkl", map_location=DEVICE), strict=False)

  modelABSA.load_state_dict(torch.load("model/bert_ABSA.pkl"), strict=False)
  modelATE.load_state_dict(torch.load("model/bert_ATE.pkl"), strict=False)


<All keys matched successfully>

In [16]:
# model = pipeline('text-classification', model='model/sentiment-0', device=0)

# Load Dataset

In [17]:
# Load the reviews keyed by their string index ('0', '1', ...).
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("temp-1.json", "r", encoding="utf-8") as file:
    data = json.load(file)

data

{'0': {'reviewer_id': 1,
  'review_time': '2024-09-21',
  'rating': 1,
  'review': 'Why does it look like someone spit on my food?\nI had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.',
  'review_processed': "I had a normal transaction, but now I don't want to eat this. I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."},
 '1': {'reviewer_id': 2,
  'review_time': '2024-12-16',
  'rating': 4,
  'review': "It'd McDonalds. It is what it is as far as the food and atmosphere go. The staff here does make a difference. They are all friendly, accommodating and always smiling. Makes for a more pleasant experience than many other fast food places.",
  'review_processed': "It's what it is as far as the food and atmosphere go. The staff are friendly, accommodating, an

In [18]:
# Build one row per review; the outer JSON keys become the DataFrame index.
df = pd.DataFrame.from_dict(data, orient='index')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 5565 entries, 0 to 5564
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   reviewer_id       5565 non-null   int64 
 1   review_time       5565 non-null   object
 2   rating            5565 non-null   int64 
 3   review            5565 non-null   object
 4   review_processed  5565 non-null   object
dtypes: int64(2), object(3)
memory usage: 260.9+ KB
None


Unnamed: 0,reviewer_id,review_time,rating,review,review_processed
0,1,2024-09-21,1,Why does it look like someone spit on my food?...,"I had a normal transaction, but now I don't wa..."
1,2,2024-12-16,4,It'd McDonalds. It is what it is as far as the...,It's what it is as far as the food and atmosph...
2,3,2024-12-16,1,Made a mobile order got to the speaker and che...,I made a mobile order got to the speaker and c...
3,4,2024-11-21,5,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,My mc. Crispy chicken sandwich was customer s...
4,5,2024-10-21,1,"I repeat my order 3 times in the drive thru, a...",I repeat my order three times in the drive thr...


# Aspect Based Sentiment Analysis Process

In [19]:
def predict_model_ABSA(sentence, aspect, tokenizer):
    """Run `modelABSA` on a (sentence, aspect) pair.

    Args:
        sentence: Text containing the aspect.
        aspect: Aspect term whose sentiment should be classified.
        tokenizer: Object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        A tuple `(word_pieces, predictions, outputs)`: the token sequence fed
        to the model, the index of the maximum along dim 1 of the model
        output, and the raw model output.
    """
    sentence_tokens = tokenizer.tokenize(sentence)
    aspect_tokens = tokenizer.tokenize(aspect)

    # NOTE(review): the marker strings are lower-case. How the tokenizer maps
    # them to ids is not visible here; they are kept exactly as-is because the
    # loaded checkpoint may have been trained with this exact input format.
    word_pieces = ['[cls]', *sentence_tokens, '[sep]', *aspect_tokens]

    # Segment 0: leading marker + sentence + separator; segment 1: aspect tokens.
    segment_ids = [0] * (len(sentence_tokens) + 2) + [1] * len(aspect_tokens)

    token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([token_ids]).to(DEVICE)
    segment_tensor = torch.tensor(segment_ids).to(DEVICE)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = modelABSA(input_tensor, None, None, segments_tensors=segment_tensor)
        predictions = torch.max(outputs, dim=1)[1]

    return word_pieces, predictions, outputs

def predict_model_ATE(sentence, tokenizer):
    """Run `modelATE` on a sentence and return one predicted tag per token.

    Args:
        sentence: Text to tag.
        tokenizer: Object providing `tokenize` and `convert_tokens_to_ids`.

    Returns:
        A tuple `(word_pieces, predictions, outputs)`: the token list, a plain
        Python list with the index of the maximum along dim 2 of the model
        output for the first (only) batch element, and the raw model output.
    """
    # Copy into a fresh list so the returned tokens are independent of the
    # tokenizer's own return value.
    word_pieces = list(tokenizer.tokenize(sentence))

    token_ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([token_ids]).to(DEVICE)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = modelATE(input_tensor, None, None)
        tag_ids = torch.max(outputs, dim=2)[1]

    return word_pieces, tag_ids[0].tolist(), outputs

def ATE_ABSA(text):
    """Extract aspect terms from `text` and classify the sentiment of each one.

    Terms are assembled from the per-token tags returned by
    `predict_model_ATE`: tag 1 starts a new term, tag 2 extends the term that
    is currently open, and any other tag closes it (presumably a B/I/O scheme
    — confirm against the model's label set). Each term is then scored with
    `predict_model_ABSA`.

    Previously a tag other than 1/2 did not close the open term, so a later
    tag-2 token was glued onto an unrelated earlier term (for example
    "apples and strawberries" producing the single term "apples strawberries").

    Args:
        text: Review text to analyse.

    Returns:
        A list of unique dicts with keys 'term' (str), 'class'
        ('negative' / 'neutral' / 'positive') and 'probability' (list of
        floats: softmax over the first row of the model output).
    """
    mapper = {0: 'negative', 1: 'neutral', 2: 'positive'}
    tokens, tags, _ = predict_model_ATE(text, tokenizer)

    terms = []
    word = ""
    for token, tag in zip(tokens, tags):
        if tag == 1:
            # A new term starts: flush whatever term was open before it.
            if word:
                terms.append(word.replace(" ##", ""))
            word = token
        elif tag == 2:
            # Continuation piece. " ##" joins are collapsed when the term is
            # flushed, so word-piece suffixes are merged back into their word.
            word += " " + token
        else:
            # Outside any term: close the open term so later continuation
            # tokens are not attached to it.
            if word:
                terms.append(word.replace(" ##", ""))
            word = ""

    if word:
        terms.append(word.replace(" ##", ""))

    storage = []
    pattern = r'^[\s\W]+$'
    for raw_term in terms:
        # Strip first so a continuation-only term (which starts with a space)
        # is filtered by the same rule as any other term.
        term = raw_term.strip()
        if not term:
            continue
        # Skip single-character terms made only of punctuation/whitespace.
        if bool(re.fullmatch(pattern, term)) and len(term) < 2:
            continue
        _, c, logits = predict_model_ABSA(text, term, tokenizer)
        prob = F.softmax(logits[0], dim=0)  # probabilities over the classes
        result = {
            'term': term,
            'class': mapper.get(int(c)),
            'probability': [float(p) for p in prob.cpu().numpy()]
        }
        # Keep each (term, class, probability) entry only once.
        if result not in storage:
            storage.append(result)

    return storage

In [20]:
def adjust_aspect(x):
    """Merge aspect entries that are word-piece fragments into whole terms.

    An entry whose 'term' contains '#' is treated as a fragment. Fragments are
    concatenated (with every '#' removed) onto the nearest preceding
    non-fragment entry in `x`. The merged entry takes its 'class' and
    'probability' from the group member with the highest maximum probability
    (the first such member in reverse list order on ties).

    Fragments at the start of `x` that have no preceding entry used to be
    dropped silently; they are now kept as their own merged entry.

    Args:
        x: List of dicts with keys 'term', 'class' and 'probability'.

    Returns:
        A new list: untouched entries first, in reverse input order, followed
        by the merged entries. An empty input is returned as-is.
    """
    if len(x) == 0:
        return x

    result = []
    separate = []
    temp = []
    pattern = r"#"

    # Walk backwards so fragments are collected before the entry they belong to.
    for idx in range(len(x) - 1, -1, -1):
        current = x[idx]
        follows_fragment = idx < len(x) - 1 and re.search(pattern, x[idx + 1]['term'])

        if re.search(pattern, current['term']):
            temp.append(current)
        elif follows_fragment:
            # `current` is the head of the fragments collected so far.
            temp.append(current)
            separate.append(tuple(temp))
            temp = []
        else:
            result.append(current)

    # Fragments at the very start of the list have no head entry; keep them as
    # a group of their own instead of losing them.
    if temp:
        separate.append(tuple(temp))

    for group in separate:
        term = ""
        scores = []
        for item in group:
            scores.append(max(item.get('probability')))
            # Groups are stored last-piece-first, so prepend to restore order.
            term = re.sub(r"#*", "", item['term']).strip() + term

        best_item = group[scores.index(max(scores))]
        result.append({
            'term': term,
            'class': best_item.get('class'),
            'probability': best_item.get('probability'),
        })

    return result

In [21]:
# Pick one processed review (positional row 13) as a manual smoke-test input.
sample = df['review_processed'].iloc[13]
sample

'Ordered food wrong and left out part of the order also. One of the drink cups were not filled properly.'

In [22]:
# Run aspect extraction + sentiment classification on the sample review.
test = ATE_ABSA(sample)

# Display the resulting list of term/class/probability dicts.
test

[{'term': 'food',
  'class': 'negative',
  'probability': [0.9835325479507446,
   0.015864670276641846,
   0.000602721469476819]},
 {'term': 'drink cups',
  'class': 'negative',
  'probability': [0.9989131689071655,
   0.0007870964473113418,
   0.000299714069114998]}]

In [23]:
# Post-process the sample result; entries without '#' come back in reverse order.
adjust_aspect(test)

[{'term': 'drink cups',
  'class': 'negative',
  'probability': [0.9989131689071655,
   0.0007870964473113418,
   0.000299714069114998]},
 {'term': 'food',
  'class': 'negative',
  'probability': [0.9835325479507446,
   0.015864670276641846,
   0.000602721469476819]}]

**Process Data**

In [24]:
def process(x):
    """Return the adjusted aspect/sentiment entries for one review text.

    Runs `ATE_ABSA` on `x` and passes its result through `adjust_aspect`.
    """
    return adjust_aspect(ATE_ABSA(x))

# Register `progress_apply` on pandas objects so the run shows a progress bar.
tqdm.pandas()
df['aspect_sentiment'] = df['review_processed'].progress_apply(process)

100%|██████████████████████████████████████████████████████████████████████████████| 5565/5565 [03:59<00:00, 23.19it/s]


In [25]:
# tqdm.pandas()

# df['sentiment'] = df['review_processed'].progress_apply(lambda x: model(x)[0]['label'])

In [26]:
df.head()

Unnamed: 0,reviewer_id,review_time,rating,review,review_processed,aspect_sentiment
0,1,2024-09-21,1,Why does it look like someone spit on my food?...,"I had a normal transaction, but now I don't wa...","[{'term': 'food', 'class': 'negative', 'probab..."
1,2,2024-12-16,4,It'd McDonalds. It is what it is as far as the...,It's what it is as far as the food and atmosph...,"[{'term': 'staff', 'class': 'positive', 'proba..."
2,3,2024-12-16,1,Made a mobile order got to the speaker and che...,I made a mobile order got to the speaker and c...,"[{'term': 'speaker', 'class': 'neutral', 'prob..."
3,4,2024-11-21,5,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,My mc. Crispy chicken sandwich was customer s...,"[{'term': 'service', 'class': 'positive', 'pro..."
4,5,2024-10-21,1,"I repeat my order 3 times in the drive thru, a...",I repeat my order three times in the drive thr...,"[{'term': 'fries', 'class': 'positive', 'proba..."


In [27]:
# df.to_excel('fail.xlsx', index=False)

In [28]:
import re

# Example sentence used to try out the multi-word matching fallback
sentence = ("The line was extremely big and was moving very slowly, plus they were out of apples and strawberries, "
            "but they gave me a substitute cookie. The staff were very apologetic for being slow, but it was the middle "
            "of the night and they were extremely under-staffed.")

# True only when every listed word occurs in the sentence as a whole word
# (case-insensitive), regardless of order or adjacency.
result = all(re.search(rf'\b{word}\b', sentence, re.IGNORECASE) for word in ["apples", "strawberries"])
print(result)  # Output: True (both words occur in the sentence)



True


In [29]:
def get_context(sentence, term):
    """Return the sentences of `sentence` that mention `term`.

    A sentence matches when it contains `term` as a whole phrase, or, failing
    that, when it contains every whitespace-separated word of `term` as a
    whole word (in any order). Matching is case-insensitive.

    Args:
        sentence: Text that is split into sentences with `sent_tokenize`.
        term: Aspect term to look for.

    Returns:
        The matching sentences, in their original order. An empty or
        whitespace-only `term` yields an empty list.
    """
    words = term.split()
    if not words:
        # Nothing to search for; an empty pattern would match every sentence.
        return []

    # Build the patterns once rather than once per sentence. Every piece is
    # escaped so regex metacharacters in a term (e.g. "(", "*", ".") are
    # matched literally instead of raising re.error or matching too much.
    phrase_re = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
    word_res = [
        re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
        for word in words
    ]

    storage = []
    for sent in sent_tokenize(sentence):
        if phrase_re.search(sent) or all(p.search(sent) for p in word_res):
            storage.append(sent)

    return storage

def get_terms(row):
    """Attach context sentences to every aspect entry of one review row.

    Args:
        row: Mapping-like object with `.get`, holding the aspect entries under
            'aspect_sentiment' and the review text under 'review_processed'.

    Returns:
        A new list with a shallow copy of each aspect entry, extended with a
        'context' key holding the result of `get_context` for its term. The
        list is empty when the row has no aspect entries.
    """
    items = row.get('aspect_sentiment')
    sentence = row.get('review_processed')

    # Guard clause: nothing to enrich.
    if len(items) == 0:
        return []

    # Each entry is copied, so the dicts stored in the row are left untouched.
    return [
        {**item, 'context': get_context(sentence, item.get('term'))}
        for item in items
    ]

In [30]:
# Replace each row's aspect list with copies that also carry the 'context'
# sentences found by get_terms. This overwrites the 'aspect_sentiment' column.
df['aspect_sentiment'] = [get_terms(row) for _, row in df.iterrows()]

In [31]:
df.head()

Unnamed: 0,reviewer_id,review_time,rating,review,review_processed,aspect_sentiment
0,1,2024-09-21,1,Why does it look like someone spit on my food?...,"I had a normal transaction, but now I don't wa...","[{'term': 'food', 'class': 'negative', 'probab..."
1,2,2024-12-16,4,It'd McDonalds. It is what it is as far as the...,It's what it is as far as the food and atmosph...,"[{'term': 'staff', 'class': 'positive', 'proba..."
2,3,2024-12-16,1,Made a mobile order got to the speaker and che...,I made a mobile order got to the speaker and c...,"[{'term': 'speaker', 'class': 'neutral', 'prob..."
3,4,2024-11-21,5,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,My mc. Crispy chicken sandwich was customer s...,"[{'term': 'service', 'class': 'positive', 'pro..."
4,5,2024-10-21,1,"I repeat my order 3 times in the drive thru, a...",I repeat my order three times in the drive thr...,"[{'term': 'fries', 'class': 'positive', 'proba..."


In [32]:
# Convert the frame to {index label: {column: value}} for JSON export.
# NOTE(review): this rebinds `result`, which an earlier cell used for a
# boolean regex check.
result = df.to_dict('index')

In [33]:
result

{'0': {'reviewer_id': 1,
  'review_time': '2024-09-21',
  'rating': 1,
  'review': 'Why does it look like someone spit on my food?\nI had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.',
  'review_processed': "I had a normal transaction, but now I don't want to eat this. I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back.",
  'aspect_sentiment': [{'term': 'food',
    'class': 'negative',
    'probability': [0.9742005467414856,
     0.0233919695019722,
     0.002407489577308297],
    'context': ["I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."]},
   {'term': 'substance',
    'class': 'negative',
    'probability': [0.9821223020553589,
     0.0007803093758411705,
     0.0170973930507898

In [34]:
# Persist the enriched reviews. The encoding is given explicitly so the file
# is written the same way regardless of the platform's default encoding.
with open("meta-data.json", "w", encoding="utf-8") as file:
    json.dump(result, file, indent=4)

In [35]:
# df.to_excel('preprocessed-mcd.xlsx', index=False)
# df.to_csv('preprocessed-mcd.csv', index=False)

In [36]:
# Only touch the CUDA allocator when a GPU is present: DEVICE falls back to
# the CPU when CUDA is unavailable, and resetting CUDA statistics on such a
# machine would raise instead of being a no-op.
if torch.cuda.is_available():
    torch.cuda.reset_peak_memory_stats()  # Reset peak stats for debugging
    torch.cuda.empty_cache()  # Clear cache
