In [1]:
# Utility: Preparation text

from contractions import CONTRACTION_MAP

##========== PREPARATION TEXT ===========##

# Contraction
def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.
    
    Paramters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary for mapping contractions.
    
    
    Returns:
    str: The expanded contraction sentence.
    """
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)
    
    def expanded_match(contraction):
        """
        Filter for expanding the matched contraction.
        
        Parameters:
        contraction (str): The input of contraction
        
        Returns:
        str: The expanded contraction.
        """
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) if contraction_mapping.get(match) else contraction_mapping.get(match.lower())
        
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    
    expanded_sentence = contractions_pattern.sub(expanded_match, sentence)
    return expanded_sentence


def remove_extra_spaces(sentence):
    # Use regex to replace multiple spaces with a single space
    return re.sub(r'\s+', ' ', sentence).strip()


def remove_non_ascii(text):
    """
    Remove all non-ASCII characters from the text.

    Parameters:
    text (str): The input text to clean.

    Returns:
    str: The cleaned text with only ASCII characters.
    
    """
    
    return ''.join([char for char in text if ord(char) < 128])

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json

import torch
import torch.nn.functional as F
from transformers import BertTokenizer

from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from model.model import bertATE, bertABSA
from transformers import pipeline

# Load Model

In [4]:
# DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# DEVICE

**Summarization**

In [5]:
modelBertSum = pipeline('summarization', model='model/summarization-0', device=0)
modelBertSum



<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x21a15945fd0>

**Question Answering Extraction**

In [6]:
# modelQA = pipeline('question-answering', model='model/question-ans', device=0)
# modelQA

**Aspect Based Sentiment Analysis**

In [7]:
# pretrain_model_name = "bert-base-uncased"
# tokenizer = BertTokenizer.from_pretrained(pretrain_model_name)

# lr = 2e-5
# modelATE = bertATE(pretrain_model_name).to(DEVICE)
# # optimizerATE = torch.optim.Adam(model_ATE.parameters(), lr=lr)
# modelABSA = bertABSA(pretrain_model_name).to(DEVICE)
# # optimizerABSA = torch.optim.Adam(model_ABSA.parameters(), lr=lr)

In [8]:
# modelABSA.load_state_dict(torch.load("model/bert_ABSA.pkl"), strict=False)
# modelATE.load_state_dict(torch.load("model/bert_ATE.pkl"), strict=False)

# Load Dataset

In [9]:
df = pd.read_csv('sample-mcd.csv', encoding='latin1')
df = df[['reviewer_id', 'review_time', 'review', 'rating']].copy()
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  100 non-null    int64 
 1   review_time  100 non-null    object
 2   review       100 non-null    object
 3   rating       100 non-null    object
dtypes: int64(1), object(3)
memory usage: 3.2+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,a month ago,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,5 stars
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [10]:
# Apply extraction

def process_text(x):
    # Prepare sentence
    texts = expand_contractions(x)
    texts = remove_extra_spaces(x)
    texts = remove_non_ascii(x)
    
    return texts

df['review_processed'] = df['review'].apply(process_text)
df.head()

Unnamed: 0,reviewer_id,review_time,review,rating,review_processed
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star,Why does it look like someone spit on my food?...
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars,It'd McDonalds. It is what it is as far as the...
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star,Made a mobile order got to the speaker and che...
3,4,a month ago,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,5 stars,My mc. Crispy chicken sandwich was customer s...
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star,"I repeat my order 3 times in the drive thru, a..."


In [11]:
sample = df['review_processed'].iloc[0]
actual = df['review'].iloc[0]

print("ACTUAL")
print(actual)
print("PROCESSED")
print(sample)

ACTUAL
Why does it look like someone spit on my food?
I had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.
PROCESSED
Why does it look like someone spit on my food?
I had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.


# Preprocessing

In [12]:
def is_only_number(text):
    
    pattern = r'^\d+$'
    
    if re.search(pattern, text, re.IGNORECASE):
        return True
    return False

In [13]:
def extract_date(x):
    # Get current date
    current_date = pd.to_datetime("today").date()

    offset, period, _ = x.split(" ")
    if is_only_number(offset):
        offset = int(offset)
    else:
        offset = 1

    if "year" in period:
        offset = pd.DateOffset(years=offset)
    elif "month" in period:
        offset = pd.DateOffset(months=offset)
    else:
        offset = pd.DateOffset(days=offset)

    return (current_date - offset).date().strftime('%Y-%m-%d')

In [14]:
def extract_rating(x):
    score = x.split(" ")[0]
    return int(score)

In [15]:
df['review_time'] = df['review_time'].apply(extract_date)

In [16]:
df['rating'] = df['rating'].apply(extract_rating)

In [17]:
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   reviewer_id       100 non-null    int64 
 1   review_time       100 non-null    object
 2   review            100 non-null    object
 3   rating            100 non-null    int64 
 4   review_processed  100 non-null    object
dtypes: int64(2), object(3)
memory usage: 4.0+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating,review_processed
0,1,2024-09-04,Why does it look like someone spit on my food?...,1,Why does it look like someone spit on my food?...
1,2,2024-11-29,It'd McDonalds. It is what it is as far as the...,4,It'd McDonalds. It is what it is as far as the...
2,3,2024-11-29,Made a mobile order got to the speaker and che...,1,Made a mobile order got to the speaker and che...
3,4,2024-11-04,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,5,My mc. Crispy chicken sandwich was customer s...
4,5,2024-10-04,"I repeat my order 3 times in the drive thru, a...",1,"I repeat my order 3 times in the drive thru, a..."


# Paraphrasing

In [18]:
# modelBertSum(f"correct: {sample} </s>", min_length=32, max_length=64)
modelBertSum(f"correct: {sample} </s>")

Your max_length is set to 142, but your input_length is only 69. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)


[{'summary_text': "I had a normal transaction, everyone was calm and polite, but now I don't want to eat this. I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."}]

In [19]:
tqdm.pandas()
df['review_processed'] = df['review_processed'].progress_apply(lambda x: modelBertSum(f"correct: {x}</s>")[0]['summary_text'])

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]Your max_length is set to 142, but your input_length is only 68. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)
  2%|█▋                                                                                | 2/100 [00:03<03:14,  1.98s/it]Your max_length is set to 142, but your input_length is only 55. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=27)
  3%|██▍                                                                               | 3/100 [00:07<04:28,  2.76s/it]Your max_length is set to 142, but your input_length is only 85. Since this is a summarization task, where outputs shorter than the input are typ

# Save Data

In [24]:
result = df[['reviewer_id', 'review_time', 'rating', 'review_processed']].to_dict('index')
result

{0: {'reviewer_id': 1,
  'review_time': '2024-09-04',
  'rating': 1,
  'review_processed': "I had a normal transaction, everyone was calm and polite, but now I don't want to eat this. I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."},
 1: {'reviewer_id': 2,
  'review_time': '2024-11-29',
  'rating': 4,
  'review_processed': "The staff at McDonald's are friendly, accommodating and always smiling. It is what it is as far as the food and atmosphere go; it's what it's always been. It makes for a more pleasant experience than many other fast food places in the area."},
 2: {'reviewer_id': 3,
  'review_time': '2024-11-29',
  'rating': 1,
  'review_processed': 'I made a mobile order got to the speaker and checked it in, but the line was not moving, so I had to leave. I never got the refund in the app. I called them and they said I could only get my money back in person because it was stuck in the system.'},
 3: {'revie

In [25]:
with open("temp-1.json", "w") as file:
    json.dump(result, file, indent=4)

In [22]:
# df.to_csv("preprocessed-mcd.csv", index=False)