In [12]:
# Utility: Preparation text

from contractions import CONTRACTION_MAP

##========== PREPARATION TEXT ===========##

# Contraction
def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.
    
    Paramters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary for mapping contractions.
    
    
    Returns:
    str: The expanded contraction sentence.
    """
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE|re.DOTALL)
    
    def expanded_match(contraction):
        """
        Filter for expanding the matched contraction.
        
        Parameters:
        contraction (str): The input of contraction
        
        Returns:
        str: The expanded contraction.
        """
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) if contraction_mapping.get(match) else contraction_mapping.get(match.lower())
        
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction
    
    expanded_sentence = contractions_pattern.sub(expanded_match, sentence)
    return expanded_sentence


def remove_extra_spaces(sentence):
    # Use regex to replace multiple spaces with a single space
    return re.sub(r'\s+', ' ', sentence).strip()


def remove_non_ascii(text):
    """
    Remove all non-ASCII characters from the text.

    Parameters:
    text (str): The input text to clean.

    Returns:
    str: The cleaned text with only ASCII characters.
    
    """
    
    return ''.join([char for char in text if ord(char) < 128])

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json
import os

import torch
import torch.nn.functional as F
from transformers import BertTokenizer

from tqdm import tqdm

In [14]:
from model.model import bertATE, bertABSA
from transformers import pipeline

# Load Model

In [55]:
modelBertSum = pipeline('summarization', model='model/summarization-0', device=0)
modelBertSum



<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x29bb8e76e90>

# Load Dataset

In [56]:
df = pd.read_csv("./sample-phone.csv")
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   asin          100 non-null    object 
 1   name          100 non-null    object 
 2   rating        100 non-null    int64  
 3   date          100 non-null    object 
 4   verified      100 non-null    bool   
 5   title         100 non-null    object 
 6   body          100 non-null    object 
 7   helpfulVotes  100 non-null    float64
dtypes: bool(1), float64(1), int64(1), object(5)
memory usage: 5.7+ KB
None


Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
0,B07K76LBLZ,PlaYboY BuNnY,5,"February 14, 2019",True,Best phone money can buy,So far this phone is the bees knees.,3.0
1,B07P8MQHSH,Gemini_13,3,"August 2, 2019",True,Pixel 3 has frequent display defects,I like this phone model. I bought this phone a...,20.0
2,B01NC2MEJP,Kashm30uttz,2,"April 30, 2018",True,the lower left corner of the phone doesnt work...,the lower left corner of the phone doesnt work...,1.0
3,B018OMP8ES,Techie241,3,"January 16, 2017",True,"Great phone, be careful with the sellers","The phone is wonderful, if you're considering ...",10.0
4,B07QC4R451,Michelle Wildhagen,1,"June 22, 2019",True,U had no issue with the seller they were great...,I was very disappointed in the way this phone ...,3.0


In [57]:
sample = df.iloc[0]['date']

print(sample)
pd.to_datetime(sample)

February 14, 2019


Timestamp('2019-02-14 00:00:00')

In [58]:
# Check is asin unique
df['asin'].nunique()

87

In [59]:
df['verified'].value_counts()

verified
True     87
False    13
Name: count, dtype: int64

In [60]:
# Prepare data
df = df[df['verified'] == True]
df['date'] = df['date'].apply(lambda x: str(pd.to_datetime(x).date()))
df['review'] = df['title'] + ". " + df['body']
df = df.reset_index().rename({'index': 'review_id', 'date': 'review_time'}, axis=1)
df = df[['review_id', 'review_time', 'review_time', 'rating', 'helpfulVotes']].copy()

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   review_id     87 non-null     int64  
 1   date          87 non-null     object 
 2   review        87 non-null     object 
 3   rating        87 non-null     int64  
 4   helpfulVotes  87 non-null     float64
dtypes: float64(1), int64(2), object(2)
memory usage: 3.5+ KB
None


Unnamed: 0,review_id,date,review,rating,helpfulVotes
0,0,2019-02-14,Best phone money can buy.So far this phone is ...,5,3.0
1,1,2019-08-02,Pixel 3 has frequent display defects.I like th...,3,20.0
2,2,2018-04-30,the lower left corner of the phone doesnt work...,2,1.0
3,3,2017-01-16,"Great phone, be careful with the sellers.The p...",3,10.0
4,4,2019-06-22,U had no issue with the seller they were great...,1,3.0


In [61]:
# Apply extraction

def process_text(x):
    # Prepare sentence
    texts = expand_contractions(x)
    texts = remove_extra_spaces(x)
    texts = remove_non_ascii(x)
    
    return texts

df['review_processed'] = df['review'].apply(process_text)
df.head()

Unnamed: 0,review_id,date,review,rating,helpfulVotes,review_processed
0,0,2019-02-14,Best phone money can buy.So far this phone is ...,5,3.0,Best phone money can buy.So far this phone is ...
1,1,2019-08-02,Pixel 3 has frequent display defects.I like th...,3,20.0,Pixel 3 has frequent display defects.I like th...
2,2,2018-04-30,the lower left corner of the phone doesnt work...,2,1.0,the lower left corner of the phone doesnt work...
3,3,2017-01-16,"Great phone, be careful with the sellers.The p...",3,10.0,"Great phone, be careful with the sellers.The p..."
4,4,2019-06-22,U had no issue with the seller they were great...,1,3.0,U had no issue with the seller they were great...


# Paraphrasing

In [62]:
sample = df['review_processed'].iloc[45]
actual = df['review'].iloc[45]

print("ACTUAL")
print(actual)
print("PROCESSED")
print(sample)

ACTUAL
wifi problems.This phone like other sony phones has serious wifi issues. Even non flagship products from htc, samsung, huwaie don't have problems with wifi. It will connect but will not have internet while other devices on my network have no issues.
PROCESSED
wifi problems.This phone like other sony phones has serious wifi issues. Even non flagship products from htc, samsung, huwaie don't have problems with wifi. It will connect but will not have internet while other devices on my network have no issues.


In [63]:
modelBertSum(f"correct:{sample} </s>")

Your max_length is set to 142, but your input_length is only 62. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=31)


[{'summary_text': "This phone like other sony phones has serious wifi issues. It will connect but will not have internet while other devices on my network have no issues. Even non flagship products from HTC, Samsung, and Sony don't have problems with wifi issues with these phones."}]

In [66]:
tqdm.pandas()
def process(x):
    if len(x) > 150:
        return  modelBertSum(f"correct:{x}</s>")[0]['summary_text']
    return x
df['review_processed'] = df['review_processed'].progress_apply(lambda x: process(x))

  0%|                                                                                           | 0/87 [00:00<?, ?it/s]Your max_length is set to 142, but your input_length is only 68. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=34)
  3%|██▊                                                                                | 3/87 [00:10<05:00,  3.58s/it]Your max_length is set to 142, but your input_length is only 51. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=25)
  5%|███▊                                                                               | 4/87 [00:21<08:19,  6.02s/it]Your max_length is set to 142, but your input_length is only 112. Since this is a summarization task, where outputs shorter than the input are ty

# Save Dataset

In [73]:
result = df.to_dict('index')
result

{0: {'review_id': 0,
  'review_time': '2019-02-14',
  'review': 'Best phone money can buy.So far this phone is the bees knees.',
  'rating': 5,
  'helpfulVotes': 3.0,
  'review_processed': 'Best phone money can buy.So far this phone is the bees knees.'},
 1: {'review_id': 1,
  'review_time': '2019-08-02',
  'review': 'Pixel 3 has frequent display defects.I like this phone model. I bought this phone as a gift to my wife. Unfortunately, the phone turned out to be defective. There is a pink tint on the screen. Amazon does not want to properly help customers who are not in the USA. This is very sad.',
  'rating': 3,
  'helpfulVotes': 20.0,
  'review_processed': 'There is a pink tint on the screen. Amazon does not want to properly help customers who are not in the U.S. This is very sad. I like this phone model, but it has frequent display defects. I bought it as a gift for my wife, but the phone is defective.'},
 2: {'review_id': 2,
  'review_time': '2018-04-30',
  'review': 'the lower left

In [74]:
with open("temp_phone-1.json", "w") as file:
    json.dump(result, file, indent=4)