In [1]:
# Utility: Preparation text

from contractions import CONTRACTION_MAP

##========== PREPARATION TEXT ===========##

# Contraction
def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.
    
    Paramters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary for mapping contractions.
    
    
    Returns:
    str: The expanded contraction sentence.
    """
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)
    
    def expanded_match(contraction):
        """
        Filter for expanding the matched contraction.
        
        Parameters:
        contraction (str): The input of contraction
        
        Returns:
        str: The expanded contraction.
        """
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) if contraction_mapping.get(match) else contraction_mapping.get(match.lower())
        
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    
    expanded_sentence = contractions_pattern.sub(expanded_match, sentence)
    return expanded_sentence


def remove_extra_spaces(sentence):
    # Use regex to replace multiple spaces with a single space
    return re.sub(r'\s+', ' ', sentence).strip()


def remove_non_ascii(text):
    """
    Remove all non-ASCII characters from the text.

    Parameters:
    text (str): The input text to clean.

    Returns:
    str: The cleaned text with only ASCII characters.
    
    """
    
    return ''.join([char for char in text if ord(char) < 128])

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime

import re
import json

from nltk.tokenize import word_tokenize

import torch
import torch.nn.functional as F
from transformers import BertTokenizer

from tqdm import tqdm

In [3]:
from transformers import pipeline

# Load Model

**Summarization**

In [4]:
modelBertSum = pipeline('summarization', model='model/summarization-0', device=0)
modelBertSum



<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x20f6cef1e50>

# Load Dataset

In [8]:
with open('temp/representation.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

len(data)

1511

In [9]:
df = pd.DataFrame.from_dict(data)
df = df[['review_id', 'review_time', 'review', 'like']].copy()
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1511 entries, 0 to 1510
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review_id    1511 non-null   int64 
 1   review_time  1511 non-null   object
 2   review       1511 non-null   object
 3   like         1511 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 47.3+ KB
None


Unnamed: 0,review_id,review_time,review,like
0,0,2024-11-28T13:41:44Z,The cars design is like it came from a cartoon.,0
1,2,2024-11-27T01:56:33Z,Tankz u for being so honest.\nMuch love and re...,0
2,3,2024-11-26T18:27:21Z,Amazing review your really good at this love w...,0
3,5,2024-11-25T18:09:24Z,"Honestly, the Cybertruck might be the first ca...",0
4,6,2024-11-25T03:55:09Z,I will be buying one anytime soon add expensiv...,0


In [10]:
# # df = pd.read_csv('sample-mcd.csv', encoding='latin1')
# df = pd.read_csv('McDonald_s_Reviews.csv', encoding='latin1')
# df = df[['reviewer_id', 'review_time', 'review', 'rating']].copy()
# print(df.info())
# df.head()

In [11]:
# Apply extraction

def process_text(x):
    # Prepare sentence
    texts = expand_contractions(x)
    texts = remove_extra_spaces(x)
    texts = remove_non_ascii(x)
    
    return texts

df['review_processed'] = df['review'].apply(process_text)
df.head()

Unnamed: 0,review_id,review_time,review,like,review_processed
0,0,2024-11-28T13:41:44Z,The cars design is like it came from a cartoon.,0,The cars design is like it came from a cartoon.
1,2,2024-11-27T01:56:33Z,Tankz u for being so honest.\nMuch love and re...,0,Tankz u for being so honest.\nMuch love and re...
2,3,2024-11-26T18:27:21Z,Amazing review your really good at this love w...,0,Amazing review your really good at this love w...
3,5,2024-11-25T18:09:24Z,"Honestly, the Cybertruck might be the first ca...",0,"Honestly, the Cybertruck might be the first ca..."
4,6,2024-11-25T03:55:09Z,I will be buying one anytime soon add expensiv...,0,I will be buying one anytime soon add expensiv...


# Preprocessing

In [12]:
def is_only_number(text):
    
    pattern = r'^\d+$'
    
    if re.search(pattern, text, re.IGNORECASE):
        return True
    return False

In [13]:
# def extract_date(x):
#     # Get current date
#     current_date = pd.to_datetime("today").date()

#     offset, period, _ = x.split(" ")
#     if is_only_number(offset):
#         offset = int(offset)
#     else:
#         offset = 1

#     if "year" in period:
#         offset = pd.DateOffset(years=offset)
#     elif "month" in period:
#         offset = pd.DateOffset(months=offset)
#     else:
#         offset = pd.DateOffset(days=offset)

#     return (current_date - offset).date().strftime('%Y-%m-%d')

In [24]:
def extract_date(x):
    # Convert to datetime
    datetime_obj = datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ')

    return str(datetime_obj.date())

In [25]:
def extract_rating(x):
    score = x.split(" ")[0]
    return int(score)

In [26]:
df['review_time'] = df['review_time'].apply(extract_date)

In [27]:
# df['rating'] = df['rating'].apply(extract_rating)

In [28]:
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1511 entries, 0 to 1510
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review_id         1511 non-null   int64 
 1   review_time       1511 non-null   object
 2   review            1511 non-null   object
 3   like              1511 non-null   int64 
 4   review_processed  1511 non-null   object
dtypes: int64(2), object(3)
memory usage: 59.1+ KB
None


Unnamed: 0,review_id,review_time,review,like,review_processed
0,0,2024-11-28,The cars design is like it came from a cartoon.,0,The cars design is like it came from a cartoon.
1,2,2024-11-27,Tankz u for being so honest.\nMuch love and re...,0,Tankz u for being so honest.\nMuch love and re...
2,3,2024-11-26,Amazing review your really good at this love w...,0,Amazing review your really good at this love w...
3,5,2024-11-25,"Honestly, the Cybertruck might be the first ca...",0,"Honestly, the Cybertruck might be the first ca..."
4,6,2024-11-25,I will be buying one anytime soon add expensiv...,0,I will be buying one anytime soon add expensiv...


# Paraphrasing

In [29]:
def process(x):
    tokens = word_tokenize(x)
    print("NUM TOKEN: ", len(tokens))
    if len(tokens) > 30:
        set_max = int(0.99 * len(tokens))
        set_min = int(0.25 * len(tokens))
    
        return modelBertSum(f"correct:{x} </s>", min_length=set_min, max_length=set_max)[0]['summary_text']
    return x

In [30]:
# Testing
# for i in range(0, 30):
#     sample = df['review_processed'].iloc[i]
#     print(f"{i}. ", sample)

#     print("SUMMARIZE: ")
#     print(process(sample))
#     print()

In [None]:
tqdm.pandas()
# df['review_processed'] = df['review_processed'].progress_apply(lambda x: modelBertSum(f"correct: {x}</s>")[0]['summary_text'])
df['review_processed'] = df['review_processed'].progress_apply(lambda x: process(x))

  0%|                                                                                         | 0/1511 [00:00<?, ?it/s]

NUM TOKEN:  11
NUM TOKEN:  13
NUM TOKEN:  21
NUM TOKEN:  42


  0%|▎                                                                                | 5/1511 [00:04<20:45,  1.21it/s]

NUM TOKEN:  59


  0%|▎                                                                                | 6/1511 [00:05<22:33,  1.11it/s]

NUM TOKEN:  39


  0%|▍                                                                                | 7/1511 [00:06<27:02,  1.08s/it]

NUM TOKEN:  10
NUM TOKEN:  128


  1%|▍                                                                                | 9/1511 [00:10<34:44,  1.39s/it]

NUM TOKEN:  47


  1%|▌                                                                               | 10/1511 [00:11<33:54,  1.36s/it]

NUM TOKEN:  55


  1%|▌                                                                               | 11/1511 [00:14<39:05,  1.56s/it]

NUM TOKEN:  20
NUM TOKEN:  13
NUM TOKEN:  13
NUM TOKEN:  76


  1%|▊                                                                               | 15/1511 [00:16<24:13,  1.03it/s]

NUM TOKEN:  16
NUM TOKEN:  20
NUM TOKEN:  19
NUM TOKEN:  46


# Save Data

In [None]:
result = df[['review_id', 'review_time', 'like', 'review', 'review_processed']].to_dict('index')
result

In [54]:
with open("temp-1.json", "w") as file:
    json.dump(result, file, indent=4)

In [55]:
# df.to_csv("preprocessed-mcd.csv", index=False)

In [56]:
torch.cuda.reset_peak_memory_stats()  # Reset peak stats for debugging
torch.cuda.empty_cache()  # Clear cache
