In [54]:
# Utility: Preparation text

from contractions import CONTRACTION_MAP

##========== PREPARATION TEXT ===========##

# Contraction
def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.
    
    Paramters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary for mapping contractions.
    
    
    Returns:
    str: The expanded contraction sentence.
    """
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)
    
    def expanded_match(contraction):
        """
        Filter for expanding the matched contraction.
        
        Parameters:
        contraction (str): The input of contraction
        
        Returns:
        str: The expanded contraction.
        """
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) if contraction_mapping.get(match) else contraction_mapping.get(match.lower())
        
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    
    expanded_sentence = contractions_pattern.sub(expanded_match, sentence)
    return expanded_sentence


def remove_extra_spaces(sentence):
    # Use regex to replace multiple spaces with a single space
    return re.sub(r'\s+', ' ', sentence).strip()


def remove_non_ascii(text):
    """
    Remove all non-ASCII characters from the text.

    Parameters:
    text (str): The input text to clean.

    Returns:
    str: The cleaned text with only ASCII characters.
    
    """
    
    return ''.join([char for char in text if ord(char) < 128])

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json

from nltk.tokenize import word_tokenize

import torch
import torch.nn.functional as F
from transformers import BertTokenizer

from tqdm import tqdm

In [56]:
from transformers import pipeline

# Load Model

**Summarization**

In [57]:
modelBertSum = pipeline('summarization', model='model/summarization-0', device=0)
modelBertSum



<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x299dc322d10>

# Load Dataset

In [58]:
# df = pd.read_csv('sample-mcd.csv', encoding='latin1')
df = pd.read_csv('McDonald_s_Reviews.csv', encoding='latin1')
df = df[['reviewer_id', 'review_time', 'review', 'rating']].copy()
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  100 non-null    int64 
 1   review_time  100 non-null    object
 2   review       100 non-null    object
 3   rating       100 non-null    object
dtypes: int64(1), object(3)
memory usage: 3.2+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,a month ago,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,5 stars
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [59]:
# Apply extraction

def process_text(x):
    # Prepare sentence
    texts = expand_contractions(x)
    texts = remove_extra_spaces(x)
    texts = remove_non_ascii(x)
    
    return texts

df['review_processed'] = df['review'].apply(process_text)
df.head()

Unnamed: 0,reviewer_id,review_time,review,rating,review_processed
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star,Why does it look like someone spit on my food?...
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars,It'd McDonalds. It is what it is as far as the...
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star,Made a mobile order got to the speaker and che...
3,4,a month ago,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,5 stars,My mc. Crispy chicken sandwich was customer s...
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star,"I repeat my order 3 times in the drive thru, a..."


In [60]:
sample = df['review_processed'].iloc[0]
actual = df['review'].iloc[0]

print("ACTUAL")
print(actual)
print("PROCESSED")
print(sample)

ACTUAL
Why does it look like someone spit on my food?
I had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.
PROCESSED
Why does it look like someone spit on my food?
I had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.


# Preprocessing

In [61]:
def is_only_number(text):
    
    pattern = r'^\d+$'
    
    if re.search(pattern, text, re.IGNORECASE):
        return True
    return False

In [62]:
def extract_date(x):
    # Get current date
    current_date = pd.to_datetime("today").date()

    offset, period, _ = x.split(" ")
    if is_only_number(offset):
        offset = int(offset)
    else:
        offset = 1

    if "year" in period:
        offset = pd.DateOffset(years=offset)
    elif "month" in period:
        offset = pd.DateOffset(months=offset)
    else:
        offset = pd.DateOffset(days=offset)

    return (current_date - offset).date().strftime('%Y-%m-%d')

In [63]:
def extract_rating(x):
    score = x.split(" ")[0]
    return int(score)

In [64]:
df['review_time'] = df['review_time'].apply(extract_date)

In [65]:
df['rating'] = df['rating'].apply(extract_rating)

In [66]:
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   reviewer_id       100 non-null    int64 
 1   review_time       100 non-null    object
 2   review            100 non-null    object
 3   rating            100 non-null    int64 
 4   review_processed  100 non-null    object
dtypes: int64(2), object(3)
memory usage: 4.0+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating,review_processed
0,1,2024-09-20,Why does it look like someone spit on my food?...,1,Why does it look like someone spit on my food?...
1,2,2024-12-15,It'd McDonalds. It is what it is as far as the...,4,It'd McDonalds. It is what it is as far as the...
2,3,2024-12-15,Made a mobile order got to the speaker and che...,1,Made a mobile order got to the speaker and che...
3,4,2024-11-20,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,5,My mc. Crispy chicken sandwich was customer s...
4,5,2024-10-20,"I repeat my order 3 times in the drive thru, a...",1,"I repeat my order 3 times in the drive thru, a..."


# Paraphrasing

In [67]:
def process(x):
    tokens = word_tokenize(x)
    print("NUM TOKEN: ", len(tokens))
    if len(tokens) > 30:
        set_max = int(0.99 * len(tokens))
        set_min = int(0.25 * len(tokens))
    
        return modelBertSum(f"correct:{x} </s>", min_length=set_min, max_length=set_max)[0]['summary_text']
    return x

In [68]:
# Testing
# for i in range(0, 30):
#     sample = df['review_processed'].iloc[i]
#     print(f"{i}. ", sample)

#     print("SUMMARIZE: ")
#     print(process(sample))
#     print()

In [69]:
tqdm.pandas()
# df['review_processed'] = df['review_processed'].progress_apply(lambda x: modelBertSum(f"correct: {x}</s>")[0]['summary_text'])
df['review_processed'] = df['review_processed'].progress_apply(lambda x: process(x))

  0%|                                                                                          | 0/100 [00:00<?, ?it/s]

NUM TOKEN:  60


  2%|█▋                                                                                | 2/100 [00:02<02:22,  1.45s/it]

NUM TOKEN:  49


  3%|██▍                                                                               | 3/100 [00:04<02:19,  1.44s/it]

NUM TOKEN:  74


  4%|███▎                                                                              | 4/100 [00:07<03:21,  2.10s/it]

NUM TOKEN:  13
NUM TOKEN:  69


  6%|████▉                                                                             | 6/100 [00:10<02:55,  1.87s/it]

NUM TOKEN:  47


  7%|█████▋                                                                            | 7/100 [00:12<02:57,  1.91s/it]

NUM TOKEN:  193


  8%|██████▌                                                                           | 8/100 [00:15<03:12,  2.09s/it]

NUM TOKEN:  70


  9%|███████▍                                                                          | 9/100 [00:17<03:00,  1.98s/it]

NUM TOKEN:  38


 10%|████████                                                                         | 10/100 [00:18<02:35,  1.72s/it]

NUM TOKEN:  121


 11%|████████▉                                                                        | 11/100 [00:19<02:31,  1.70s/it]

NUM TOKEN:  2
NUM TOKEN:  75


 13%|██████████▌                                                                      | 13/100 [00:22<02:05,  1.44s/it]

NUM TOKEN:  85


 14%|███████████▎                                                                     | 14/100 [00:24<02:14,  1.56s/it]

NUM TOKEN:  67


 15%|████████████▏                                                                    | 15/100 [00:26<02:38,  1.87s/it]

NUM TOKEN:  37


 16%|████████████▉                                                                    | 16/100 [00:28<02:22,  1.70s/it]

NUM TOKEN:  29
NUM TOKEN:  58


 18%|██████████████▌                                                                  | 18/100 [00:30<01:59,  1.46s/it]

NUM TOKEN:  32


 19%|███████████████▍                                                                 | 19/100 [00:31<01:58,  1.46s/it]

NUM TOKEN:  81


 20%|████████████████▏                                                                | 20/100 [00:34<02:27,  1.84s/it]

NUM TOKEN:  43


 21%|█████████████████                                                                | 21/100 [00:36<02:25,  1.84s/it]

NUM TOKEN:  224


 22%|█████████████████▊                                                               | 22/100 [00:39<02:44,  2.12s/it]

NUM TOKEN:  284


 23%|██████████████████▋                                                              | 23/100 [00:43<03:22,  2.63s/it]

NUM TOKEN:  22
NUM TOKEN:  58


 25%|████████████████████▎                                                            | 25/100 [00:45<02:31,  2.02s/it]

NUM TOKEN:  6
NUM TOKEN:  31


 27%|█████████████████████▊                                                           | 27/100 [00:46<01:44,  1.43s/it]

NUM TOKEN:  12
NUM TOKEN:  1
NUM TOKEN:  23
NUM TOKEN:  32


 31%|█████████████████████████                                                        | 31/100 [00:48<01:00,  1.14it/s]

NUM TOKEN:  31


 32%|█████████████████████████▉                                                       | 32/100 [00:49<01:06,  1.02it/s]

NUM TOKEN:  98


 33%|██████████████████████████▋                                                      | 33/100 [00:53<01:41,  1.51s/it]

NUM TOKEN:  13
NUM TOKEN:  104


 35%|████████████████████████████▎                                                    | 35/100 [00:55<01:27,  1.35s/it]

NUM TOKEN:  28
NUM TOKEN:  45


 37%|█████████████████████████████▉                                                   | 37/100 [00:57<01:15,  1.20s/it]

NUM TOKEN:  7
NUM TOKEN:  11
NUM TOKEN:  8
NUM TOKEN:  76


 41%|█████████████████████████████████▏                                               | 41/100 [00:59<00:51,  1.16it/s]

NUM TOKEN:  25
NUM TOKEN:  126


 43%|██████████████████████████████████▊                                              | 43/100 [01:03<01:02,  1.10s/it]

NUM TOKEN:  45


 44%|███████████████████████████████████▋                                             | 44/100 [01:04<01:03,  1.14s/it]

NUM TOKEN:  26
NUM TOKEN:  19
NUM TOKEN:  101


 47%|██████████████████████████████████████                                           | 47/100 [01:06<00:52,  1.01it/s]

NUM TOKEN:  15
NUM TOKEN:  75


 49%|███████████████████████████████████████▋                                         | 49/100 [01:09<00:52,  1.02s/it]

NUM TOKEN:  62


 50%|████████████████████████████████████████▌                                        | 50/100 [01:11<01:01,  1.22s/it]

NUM TOKEN:  228


 51%|█████████████████████████████████████████▎                                       | 51/100 [01:14<01:20,  1.64s/it]

NUM TOKEN:  32


 52%|██████████████████████████████████████████                                       | 52/100 [01:16<01:15,  1.57s/it]

NUM TOKEN:  56


 53%|██████████████████████████████████████████▉                                      | 53/100 [01:17<01:11,  1.52s/it]

NUM TOKEN:  40


 54%|███████████████████████████████████████████▋                                     | 54/100 [01:19<01:10,  1.53s/it]

NUM TOKEN:  49


 55%|████████████████████████████████████████████▌                                    | 55/100 [01:20<01:03,  1.41s/it]

NUM TOKEN:  15
NUM TOKEN:  4
NUM TOKEN:  67


 58%|██████████████████████████████████████████████▉                                  | 58/100 [01:22<00:43,  1.02s/it]

NUM TOKEN:  95


 59%|███████████████████████████████████████████████▊                                 | 59/100 [01:24<00:51,  1.24s/it]

NUM TOKEN:  95


 60%|████████████████████████████████████████████████▌                                | 60/100 [01:27<01:04,  1.61s/it]

NUM TOKEN:  16
NUM TOKEN:  74


 62%|██████████████████████████████████████████████████▏                              | 62/100 [01:29<00:52,  1.38s/it]

NUM TOKEN:  54


 63%|███████████████████████████████████████████████████                              | 63/100 [01:31<00:57,  1.56s/it]

NUM TOKEN:  198


 64%|███████████████████████████████████████████████████▊                             | 64/100 [01:34<01:07,  1.86s/it]

NUM TOKEN:  43


 65%|████████████████████████████████████████████████████▋                            | 65/100 [01:35<01:02,  1.80s/it]

NUM TOKEN:  50


 66%|█████████████████████████████████████████████████████▍                           | 66/100 [01:38<01:04,  1.89s/it]

NUM TOKEN:  161


 67%|██████████████████████████████████████████████████████▎                          | 67/100 [01:40<01:05,  2.00s/it]

NUM TOKEN:  3
NUM TOKEN:  64


 69%|███████████████████████████████████████████████████████▉                         | 69/100 [01:41<00:44,  1.45s/it]

NUM TOKEN:  85


 70%|████████████████████████████████████████████████████████▋                        | 70/100 [01:43<00:45,  1.51s/it]

NUM TOKEN:  61


 71%|█████████████████████████████████████████████████████████▌                       | 71/100 [01:46<00:52,  1.82s/it]

NUM TOKEN:  58


 72%|██████████████████████████████████████████████████████████▎                      | 72/100 [01:48<00:55,  2.00s/it]

NUM TOKEN:  54


 73%|███████████████████████████████████████████████████████████▏                     | 73/100 [01:51<00:58,  2.17s/it]

NUM TOKEN:  92


 74%|███████████████████████████████████████████████████████████▉                     | 74/100 [01:54<01:00,  2.34s/it]

NUM TOKEN:  90


 75%|████████████████████████████████████████████████████████████▊                    | 75/100 [01:55<00:53,  2.15s/it]

NUM TOKEN:  59


 76%|█████████████████████████████████████████████████████████████▌                   | 76/100 [01:58<00:53,  2.23s/it]

NUM TOKEN:  97


 77%|██████████████████████████████████████████████████████████████▎                  | 77/100 [02:00<00:49,  2.15s/it]

NUM TOKEN:  40


 78%|███████████████████████████████████████████████████████████████▏                 | 78/100 [02:01<00:44,  2.04s/it]

NUM TOKEN:  21
NUM TOKEN:  22
NUM TOKEN:  22
NUM TOKEN:  112


 82%|██████████████████████████████████████████████████████████████████▍              | 82/100 [02:04<00:20,  1.16s/it]

NUM TOKEN:  49


 83%|███████████████████████████████████████████████████████████████████▏             | 83/100 [02:06<00:21,  1.28s/it]

NUM TOKEN:  60


 84%|████████████████████████████████████████████████████████████████████             | 84/100 [02:08<00:22,  1.38s/it]

NUM TOKEN:  62


 85%|████████████████████████████████████████████████████████████████████▊            | 85/100 [02:10<00:22,  1.52s/it]

NUM TOKEN:  26
NUM TOKEN:  89


 87%|██████████████████████████████████████████████████████████████████████▍          | 87/100 [02:13<00:20,  1.56s/it]

NUM TOKEN:  27
NUM TOKEN:  42


 89%|████████████████████████████████████████████████████████████████████████         | 89/100 [02:15<00:14,  1.32s/it]

NUM TOKEN:  84


 90%|████████████████████████████████████████████████████████████████████████▉        | 90/100 [02:17<00:16,  1.61s/it]

NUM TOKEN:  87


 91%|█████████████████████████████████████████████████████████████████████████▋       | 91/100 [02:20<00:16,  1.83s/it]

NUM TOKEN:  72


 92%|██████████████████████████████████████████████████████████████████████████▌      | 92/100 [02:23<00:17,  2.13s/it]

NUM TOKEN:  164


 93%|███████████████████████████████████████████████████████████████████████████▎     | 93/100 [02:25<00:15,  2.19s/it]

NUM TOKEN:  79


 94%|████████████████████████████████████████████████████████████████████████████▏    | 94/100 [02:27<00:12,  2.08s/it]

NUM TOKEN:  86


 95%|████████████████████████████████████████████████████████████████████████████▉    | 95/100 [02:30<00:10,  2.19s/it]

NUM TOKEN:  67


 96%|█████████████████████████████████████████████████████████████████████████████▊   | 96/100 [02:32<00:09,  2.26s/it]

NUM TOKEN:  177


 97%|██████████████████████████████████████████████████████████████████████████████▌  | 97/100 [02:34<00:06,  2.25s/it]

NUM TOKEN:  64


 98%|███████████████████████████████████████████████████████████████████████████████▍ | 98/100 [02:36<00:04,  2.13s/it]

NUM TOKEN:  30
NUM TOKEN:  12
NUM TOKEN:  49


100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [02:38<00:00,  1.59s/it]


# Save Data

In [73]:
result = df[['reviewer_id', 'review_time', 'rating', 'review', 'review_processed']].to_dict('index')
result

{0: {'reviewer_id': 1,
  'review_time': '2024-09-20',
  'rating': 1,
  'review': 'Why does it look like someone spit on my food?\nI had a normal transaction,  everyone was chill and polite, but now i dont want to eat this. Im trying not to think about what this milky white/clear substance is all over my food, i d*** sure am not coming back.',
  'review_processed': "I had a normal transaction, but now I don't want to eat this. I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."},
 1: {'reviewer_id': 2,
  'review_time': '2024-12-15',
  'rating': 4,
  'review': "It'd McDonalds. It is what it is as far as the food and atmosphere go. The staff here does make a difference. They are all friendly, accommodating and always smiling. Makes for a more pleasant experience than many other fast food places.",
  'review_processed': "It's what it is as far as the food and atmosphere go. The staff are friendly, accommodating, and al

In [74]:
with open("temp-1.json", "w") as file:
    json.dump(result, file, indent=4)

In [72]:
# df.to_csv("preprocessed-mcd.csv", index=False)