In [333]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json

import torch
import torch.nn.functional as F
from transformers import BertTokenizer

from tqdm import tqdm

In [334]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the lemmatizer
# NOTE(review): `lemmatizer` is not referenced by any other cell visible in this
# notebook — confirm it is still needed before keeping the NLTK dependency for it.
lemmatizer = WordNetLemmatizer()

In [335]:
from model.model import bertATE, bertABSA
from transformers import pipeline

# Load Model

In [336]:
# Use the first CUDA GPU when torch reports one is available, otherwise the CPU.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Bare expression so the notebook displays which device was selected.
DEVICE

device(type='cuda', index=0)

In [337]:
# Checkpoint name shared by the tokenizer and both project models below.
pretrain_model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

# NOTE(review): within the visible notebook `lr` is only used by the
# commented-out optimizer lines below, which also refer to `model_ATE` /
# `model_ABSA` — names that are not defined here (the variables are
# `modelATE` / `modelABSA`). Fix the names if these lines are ever re-enabled.
lr = 2e-5
# Model used by predict_model_ATE to tag aspect terms; moved to DEVICE.
modelATE = bertATE(pretrain_model_name).to(DEVICE)
# optimizerATE = torch.optim.Adam(model_ATE.parameters(), lr=lr)
# Model used by predict_model_ABSA to classify aspect sentiment; moved to DEVICE.
modelABSA = bertABSA(pretrain_model_name).to(DEVICE)
# optimizerABSA = torch.optim.Adam(model_ABSA.parameters(), lr=lr)

In [338]:
# Load the fine-tuned weights into both models.
# - map_location=DEVICE lets a checkpoint saved on a GPU load on a CPU-only
#   machine, matching the CPU fallback used when DEVICE is chosen.
# - weights_only=True restricts unpickling to tensors/containers, so a tampered
#   .pkl file cannot execute arbitrary code (and silences torch's FutureWarning).
# NOTE(review): strict=False hides missing/unexpected keys; check the displayed
# result says all keys matched, otherwise part of a model is randomly initialised.
modelABSA.load_state_dict(
    torch.load("model/bert_ABSA.pkl", map_location=DEVICE, weights_only=True),
    strict=False,
)
modelATE.load_state_dict(
    torch.load("model/bert_ATE.pkl", map_location=DEVICE, weights_only=True),
    strict=False,
)

  modelABSA.load_state_dict(torch.load("model/bert_ABSA.pkl"), strict=False)
  modelATE.load_state_dict(torch.load("model/bert_ATE.pkl"), strict=False)


<All keys matched successfully>

In [339]:
# Review-level sentiment classifier loaded from a local checkpoint directory.
# Uses DEVICE (instead of a hard-coded GPU index 0) so this cell also runs on
# machines without CUDA, consistent with the other models in this notebook.
model = pipeline('text-classification', model='model/sentiment-0', device=DEVICE)

# Load Dataset

In [340]:
# Read the reviews produced by the previous step. The encoding is pinned to
# UTF-8 so the result does not depend on the platform's default locale encoding.
with open("temp-1.json", "r", encoding="utf-8") as file:
    data = json.load(file)

data

{'0': {'reviewer_id': 1,
  'review_time': '2024-09-04',
  'rating': 1,
  'review_processed': "I had a normal transaction, everyone was calm and polite, but now I don't want to eat this. I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."},
 '1': {'reviewer_id': 2,
  'review_time': '2024-11-29',
  'rating': 4,
  'review_processed': "The staff at McDonald's are friendly, accommodating and always smiling. It is what it is as far as the food and atmosphere go; it's what it's always been. It makes for a more pleasant experience than many other fast food places in the area."},
 '2': {'reviewer_id': 3,
  'review_time': '2024-11-29',
  'rating': 1,
  'review_processed': 'I made a mobile order got to the speaker and checked it in, but the line was not moving, so I had to leave. I never got the refund in the app. I called them and they said I could only get my money back in person because it was stuck in the system.'},
 '3':

In [341]:
# One row per review; the JSON's top-level keys become the index labels.
df = pd.DataFrame.from_dict(data, orient='index')
# DataFrame.info() prints its summary itself and returns None, so it must not be
# wrapped in print() (that only adds a stray "None" line to the output).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   reviewer_id       100 non-null    int64 
 1   review_time       100 non-null    object
 2   rating            100 non-null    int64 
 3   review_processed  100 non-null    object
dtypes: int64(2), object(2)
memory usage: 3.9+ KB
None


Unnamed: 0,reviewer_id,review_time,rating,review_processed
0,1,2024-09-04,1,"I had a normal transaction, everyone was calm ..."
1,2,2024-11-29,4,"The staff at McDonald's are friendly, accommod..."
2,3,2024-11-29,1,I made a mobile order got to the speaker and c...
3,4,2024-11-04,5,"Crispy chicken sandwich was delicious, and cus..."
4,5,2024-10-04,1,I repeat my order three times in the drive thr...


# Define Inference Helpers

In [342]:
def predict_model_ABSA(sentence, aspect, tokenizer):
    """Classify the sentiment expressed towards `aspect` within `sentence`.

    The model input is ``[marker] + sentence pieces + [marker] + aspect pieces``
    with segment id 0 for everything up to and including the second marker and
    segment id 1 for the aspect pieces.

    Args:
        sentence: Review text.
        aspect: Aspect term to evaluate.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``
            and ``model_max_length``.

    Returns:
        Tuple ``(word_pieces, predictions, outputs)``: the token list fed to the
        model, the arg-max class index over dim 1 of the model output, and the
        raw model output.
    """
    # Keep the total length (two markers + sentence + aspect) within the
    # tokenizer's maximum so long reviews are truncated instead of overflowing
    # the model. The aspect is kept whole whenever possible; the sentence gives
    # way first.
    max_len = tokenizer.model_max_length
    t2 = tokenizer.tokenize(aspect)[:max(max_len - 2, 0)]
    t1 = tokenizer.tokenize(sentence)[:max(max_len - 2 - len(t2), 0)]

    # NOTE(review): these markers are lower-case, whereas BERT's special tokens
    # are spelled '[CLS]' / '[SEP]'; convert_tokens_to_ids will most likely map
    # them to the unknown-token id. Deliberately left unchanged because the
    # fine-tuned checkpoint may have been trained with this exact encoding —
    # confirm against the training code before "fixing" it.
    word_pieces = ['[cls]']
    word_pieces += t1
    word_pieces += ['[sep]']
    word_pieces += t2

    segment_tensor = [0] + [0]*len(t1) + [0] + [1]*len(t2)

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)
    segment_tensor = torch.tensor(segment_tensor).to(DEVICE)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = modelABSA(input_tensor, None, None, segments_tensors=segment_tensor)
        _, predictions = torch.max(outputs, dim=1)

    return word_pieces, predictions, outputs

def predict_model_ATE(sentence, tokenizer):
    """Tag every word piece of `sentence` with the aspect-term-extraction model.

    Args:
        sentence: Review text.
        tokenizer: Tokenizer providing ``tokenize``, ``convert_tokens_to_ids``
            and ``model_max_length``.

    Returns:
        Tuple ``(word_pieces, predictions, outputs)``: the word pieces fed to
        the model, one predicted tag id per word piece (a plain list), and the
        raw model output. For text that yields no word pieces the result is
        ``([], [], None)`` and the model is not called.
    """
    # Truncate to the tokenizer's maximum length so very long reviews do not
    # overflow the model.
    # NOTE(review): assumes bertATE has the usual BERT position limit — confirm.
    word_pieces = tokenizer.tokenize(sentence)[:tokenizer.model_max_length]

    # An empty id list would become a float tensor of shape (1, 0), which the
    # embedding lookup rejects; there is nothing to tag anyway.
    if not word_pieces:
        return [], [], None

    ids = tokenizer.convert_tokens_to_ids(word_pieces)
    input_tensor = torch.tensor([ids]).to(DEVICE)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = modelATE(input_tensor, None, None)
        _, predictions = torch.max(outputs, dim=2)
    # Drop the batch dimension (batch size is always 1 here).
    predictions = predictions[0].tolist()

    return word_pieces, predictions, outputs

def ATE_ABSA(text):
    """Extract aspect terms from `text` and classify the sentiment of each.

    Word pieces tagged 1 start a new term and pieces tagged 2 are appended to
    the term currently being built; any other tag is skipped without closing
    that term. Terms made up solely of whitespace/non-word characters are
    discarded.

    Returns:
        List of unique dicts with keys 'term', 'class' ('negative', 'neutral'
        or 'positive') and 'probability' (softmax scores as floats).
    """
    label_names = {0: 'negative', 1: 'neutral', 2: 'positive'}
    pieces, tags, _ = predict_model_ATE(text, tokenizer)

    # --- Step 1: stitch tagged word pieces into candidate terms -------------
    terms = []
    current = ""
    for piece, tag in zip(pieces, tags):
        if tag == 1:
            # A new term begins: flush the one collected so far.
            if current:
                terms.append(current.replace(" ##", ""))
            current = piece
        elif tag == 2:
            current += " " + piece
    if current:
        terms.append(current.replace(" ##", ""))

    # --- Step 2: score each candidate term ----------------------------------
    punctuation_only = re.compile(r'^[\s\W]+$')
    storage = []
    for term in terms:
        if punctuation_only.fullmatch(term):
            continue
        _, predicted, logits = predict_model_ABSA(text, term, tokenizer)
        probabilities = F.softmax(logits[0], dim=0)
        entry = {
            'term': term.strip(),
            'class': label_names.get(int(predicted)),
            'probability': [float(p) for p in probabilities.cpu().numpy()],
        }
        # Keep only the first occurrence of identical results.
        if entry not in storage:
            storage.append(entry)

    return storage

In [343]:
def adjust_aspect(x):
    """Merge '#'-marked word-piece fragments back into whole aspect terms.

    The list is scanned from last to first. Consecutive entries whose term
    contains '#' are grouped with the first '#'-free entry directly preceding
    them (the head of the word). Each group becomes one entry whose term is the
    concatenation of the pieces with '#' removed, and whose 'class' and
    'probability' are copied from the piece with the highest single
    probability. Entries not involved in any group are kept unchanged.

    Leading fragments with no head before them (the list starts with '#'
    pieces) are merged among themselves instead of being silently dropped.

    Args:
        x: List of dicts with keys 'term', 'class' and 'probability'.

    Returns:
        New list: untouched entries in reverse input order, followed by the
        merged entries. An empty input is returned as-is.
    """
    if len(x) == 0:
        return x

    result = []
    separate = []   # completed groups, each ordered last piece -> head
    temp = []       # group currently being collected

    # Walk backwards so fragments are seen before the head they belong to.
    for idx in range(len(x) - 1, -1, -1):
        current = x[idx]

        if '#' in current['term']:
            temp.append(current)
        elif idx < len(x) - 1 and '#' in x[idx + 1]['term']:
            # Head of the fragments collected so far: close the group.
            temp.append(current)
            separate.append(tuple(temp))
            temp = []
        else:
            result.append(current)

    # Fragments at the very start of the list never meet a head; keep them.
    if temp:
        separate.append(tuple(temp))

    for group in separate:
        term = ""
        scores = []
        for item in group:
            scores.append(max(item.get('probability')))
            # Pieces arrive in reverse order, so prepend each one.
            term = item['term'].replace("#", "").strip() + term

        best_item = group[scores.index(max(scores))]
        result.append({
            'term': term,
            'class': best_item.get('class'),
            'probability': best_item.get('probability'),
        })

    return result

In [344]:
# Pick a single review (positional row 13) as a worked example for the helpers.
sample = df['review_processed'].iloc[13]
sample

'I was disappointed that the restaurant did not get me my full order, meaning I did not receive my condiments for my big breakfast with hot cakes. No butter, no syrup, and no utensils. Oh, and the egg was half the size of the sausage.'

In [345]:
# Raw extractor output for the example review; terms may still be split into
# '##' word-piece fragments at this point.
test = ATE_ABSA(sample)

test

[{'term': 'con',
  'class': 'negative',
  'probability': [0.9474828839302063,
   0.04992890730500221,
   0.0025881922338157892]},
 {'term': '##diment',
  'class': 'negative',
  'probability': [0.89311683177948, 0.05806119740009308, 0.04882201552391052]},
 {'term': '##s',
  'class': 'negative',
  'probability': [0.7784608602523804,
   0.02422277256846428,
   0.19731631875038147]},
 {'term': 'breakfast with hot cakes',
  'class': 'neutral',
  'probability': [0.10337114334106445,
   0.8078848719596863,
   0.08874398469924927]},
 {'term': 'butter',
  'class': 'negative',
  'probability': [0.9982079267501831,
   0.0008565931930206716,
   0.0009354839567095041]},
 {'term': 'syrup',
  'class': 'negative',
  'probability': [0.9983137845993042,
   0.0005611283122561872,
   0.0011251039104536176]},
 {'term': 'ut',
  'class': 'negative',
  'probability': [0.9965523481369019,
   0.002759113209322095,
   0.0006885712500661612]},
 {'term': '##ens',
  'class': 'negative',
  'probability': [0.77451699

In [346]:
# Same example after merging the '##' fragments back into whole terms.
adjust_aspect(test)

[{'term': 'sausage',
  'class': 'negative',
  'probability': [0.9789081811904907,
   0.019984861835837364,
   0.0011069513857364655]},
 {'term': 'egg',
  'class': 'negative',
  'probability': [0.9930011034011841,
   0.002290455624461174,
   0.00470846937969327]},
 {'term': 'syrup',
  'class': 'negative',
  'probability': [0.9983137845993042,
   0.0005611283122561872,
   0.0011251039104536176]},
 {'term': 'butter',
  'class': 'negative',
  'probability': [0.9982079267501831,
   0.0008565931930206716,
   0.0009354839567095041]},
 {'term': 'breakfast with hot cakes',
  'class': 'neutral',
  'probability': [0.10337114334106445,
   0.8078848719596863,
   0.08874398469924927]},
 {'term': 'utensils',
  'class': 'negative',
  'probability': [0.9965523481369019,
   0.002759113209322095,
   0.0006885712500661612]},
 {'term': 'condiments',
  'class': 'negative',
  'probability': [0.9474828839302063,
   0.04992890730500221,
   0.0025881922338157892]}]

# Process Data

In [347]:
def process(x):
    """Return the merged aspect/sentiment entries for one review text."""
    return adjust_aspect(ATE_ABSA(x))

# Register `progress_apply` on pandas objects, then annotate every review.
tqdm.pandas()
df['aspect_sentiment'] = df['review_processed'].progress_apply(process)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:04<00:00, 22.10it/s]


In [348]:
tqdm.pandas()

# Overall sentiment label per review. truncation=True is forwarded to the
# pipeline's tokenizer so reviews longer than the model's maximum input length
# are cut instead of raising; shorter reviews are unaffected.
df['sentiment'] = df['review_processed'].progress_apply(
    lambda x: model(x, truncation=True)[0]['label']
)

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 202.12it/s]


In [349]:
# Preview the frame with the new 'aspect_sentiment' and 'sentiment' columns.
df.head()

Unnamed: 0,reviewer_id,review_time,rating,review_processed,aspect_sentiment,sentiment
0,1,2024-09-04,1,"I had a normal transaction, everyone was calm ...","[{'term': 'food', 'class': 'negative', 'probab...",negative
1,2,2024-11-29,4,"The staff at McDonald's are friendly, accommod...","[{'term': 'fast food', 'class': 'postive', 'pr...",positive
2,3,2024-11-29,1,I made a mobile order got to the speaker and c...,"[{'term': 'speaker', 'class': 'neutral', 'prob...",negative
3,4,2024-11-04,5,"Crispy chicken sandwich was delicious, and cus...","[{'term': 'sandwich', 'class': 'postive', 'pro...",positive
4,5,2024-10-04,1,I repeat my order three times in the drive thr...,"[{'term': 'fries', 'class': 'negative', 'proba...",negative


In [350]:
# df.to_excel('fail.xlsx', index=False)

In [351]:
import re

# Example review used to try out the whole-word matching used for context lookup.
sentence = ("The line was extremely big and was moving very slowly, plus they were out of apples and strawberries, "
            "but they gave me a substitute cookie. The staff were very apologetic for being slow, but it was the middle "
            "of the night and they were extremely under-staffed.")

# True only if every listed word occurs as a whole word (case-insensitive).
# NOTE: this cell rebinds the notebook-level names `sentence` and `result`.
result = all(re.search(rf'\b{word}\b', sentence, re.IGNORECASE) for word in ["apples", "strawberries"])
print(result)  # Prints True: both words occur in the sentence



True


In [352]:
def get_context(sentence, term):
    """Return the sentences of a review that mention an aspect term.

    A sentence is kept when it contains `term` as a whole phrase or, failing
    that, when it contains every individual word of `term` as a whole word.
    Matching is case-insensitive.

    Args:
        sentence: Full review text (may hold several sentences).
        term: Aspect term, possibly consisting of several words.

    Returns:
        List of matching sentences in their original order.
    """
    # Compile once per call; the patterns do not change between sentences.
    phrase = re.compile(r'\b' + re.escape(term) + r'\b', re.IGNORECASE)
    # Escape each word so terms containing regex metacharacters match
    # literally rather than raising or matching something unintended.
    words = [
        re.compile(r'\b' + re.escape(word) + r'\b', re.IGNORECASE)
        for word in term.split()
    ]

    storage = []
    for sent in sent_tokenize(sentence):
        # The word-by-word fallback is evaluated against the current sentence.
        # Checking the whole review here would make the test identical for every
        # sentence and return all of them as soon as the words occur anywhere.
        if phrase.search(sent) or all(word.search(sent) for word in words):
            storage.append(sent)

    return storage

def get_terms(row):
    """Attach the matching review sentences to each aspect entry of a row.

    Reads 'aspect_sentiment' and 'review_processed' from `row` via ``.get``
    and returns a new list of shallow copies of the aspect dicts, each with an
    added 'context' key. The input dicts are not modified.
    """
    review_text = row.get('review_processed')
    enriched = []
    for aspect in row.get('aspect_sentiment'):
        entry = aspect.copy()
        entry['context'] = get_context(review_text, aspect.get('term'))
        enriched.append(entry)
    return enriched

In [353]:
# Add the 'context' sentences to every aspect entry, row by row.
# This overwrites the existing 'aspect_sentiment' column in place.
df['aspect_sentiment'] = [get_terms(row) for _, row in df.iterrows()]

In [354]:
# Preview the frame after the context sentences were attached.
df.head()

Unnamed: 0,reviewer_id,review_time,rating,review_processed,aspect_sentiment,sentiment
0,1,2024-09-04,1,"I had a normal transaction, everyone was calm ...","[{'term': 'food', 'class': 'negative', 'probab...",negative
1,2,2024-11-29,4,"The staff at McDonald's are friendly, accommod...","[{'term': 'fast food', 'class': 'postive', 'pr...",positive
2,3,2024-11-29,1,I made a mobile order got to the speaker and c...,"[{'term': 'speaker', 'class': 'neutral', 'prob...",negative
3,4,2024-11-04,5,"Crispy chicken sandwich was delicious, and cus...","[{'term': 'sandwich', 'class': 'postive', 'pro...",positive
4,5,2024-10-04,1,I repeat my order three times in the drive thr...,"[{'term': 'fries', 'class': 'negative', 'proba...",negative


In [355]:
# Convert to {index label: {column: value}} for JSON serialisation.
# NOTE: rebinds `result`, which the regex demo cell above used for a boolean.
result = df.to_dict('index')

In [356]:
# df.to_excel('preprocessed-mcd.xlsx', index=False)
# df.to_csv('preprocessed-mcd.csv', index=False)

In [357]:
# Write the enriched records to temp-2.json, pretty-printed with a 4-space indent.
with open("temp-2.json", "w") as file:
    json.dump(result, file, indent=4)