In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy

import re
import os

In [2]:
# Load the spaCy pipeline named 'en_core_web_lg' once; the `process` cells
# below call `nlp` to parse each cleaned review.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Load the sample reviews and keep only the columns used below.
# .copy() makes the selection an independent frame, so writing columns into it
# later cannot trigger pandas' SettingWithCopyWarning.
df = pd.read_csv('./sample-mcd.csv', encoding='latin1')
df = df[['reviewer_id', 'review_time', 'review', 'rating']].copy()

# The first space-separated piece of the raw rating is parsed as an integer.
df['rating'] = df['rating'].apply(lambda x: int(x.split(" ")[0]))

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only adds a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  100 non-null    int64 
 1   review_time  100 non-null    object
 2   review       100 non-null    object
 3   rating       100 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 3.3+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating
0,1,3 months ago,Why does it look like someone spit on my food?...,1
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4
2,3,5 days ago,Made a mobile order got to the speaker and che...,1
3,4,a month ago,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,5
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1


In [4]:
# Opinion words of Bing Liu's lexicon; filled in by load_opinion_words()
bing_liu_opinion_words = set()


# Function to load opinion words from Bing Liu lexicon
def load_opinion_words(filepath):
    """
    Add the words listed in a lexicon file to ``bing_liu_opinion_words``.

    Parameters:
    filepath (str): Path to a file with one word per line. Text after a ';'
        is treated as a comment and ignored.

    Returns:
    None: The module-level set is updated in place, so every existing
        reference to it sees the new words.
    """
    words = pd.read_table(
        filepath,
        comment=';',
        header=None,
        # Read every entry as a plain string: without these two options pandas
        # converts entries such as "null" or "NA" into float NaN.
        dtype=str,
        keep_default_na=False,
    )[0].to_list()
    bing_liu_opinion_words.update(words)


# Load opinion words
# Both paths are resolved against the current working directory, so the
# notebook has to be run from the folder that contains `util/`.
current_dir = os.getcwd()
load_opinion_words(os.path.join(current_dir, 'util/opinion-lexicon-English/negative-words.txt'))
load_opinion_words(os.path.join(current_dir, 'util/opinion-lexicon-English/positive-words.txt'))

In [5]:
from contractions import CONTRACTION_MAP

##========== PREPARATION TEXT ===========##

# Contraction
def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.

    Parameters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary mapping each contraction to its
        expanded form.

    Returns:
    str: The sentence with every matched contraction expanded. The first
        character of each match is kept, so its original casing survives.
    """
    # An empty mapping would compile to a pattern that matches the empty string
    # at every position; there is nothing to expand in that case.
    if not contraction_mapping:
        return sentence

    # Longest keys first: a regex alternation takes the first alternative that
    # matches, so a short key must not shadow a longer key it is a prefix of.
    # Keys are escaped so they are always matched literally.
    ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
    contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
                                      flags=re.IGNORECASE|re.DOTALL)

    def expanded_match(contraction):
        """
        Build the replacement text for one regex match.

        Parameters:
        contraction (re.Match): The match object of a contraction.

        Returns:
        str: The expanded contraction, or the matched text unchanged when the
            mapping has no entry for it.
        """
        match = contraction.group(0)
        # Try the exact spelling first, then the lower-cased one.
        expanded_contraction = contraction_mapping.get(match) or contraction_mapping.get(match.lower())
        if expanded_contraction is None:
            # The case-insensitive pattern can match a spelling that is stored
            # under neither lookup key; leave such text untouched.
            return match

        # Keep the first character exactly as it was written in the sentence.
        return match[0] + expanded_contraction[1:]

    expanded_sentence = contractions_pattern.sub(expanded_match, sentence)
    return expanded_sentence


def remove_extra_spaces(sentence):
    """
    Collapse every run of whitespace into one space and trim both ends.

    Parameters:
    sentence (str): The input sentence to clean.

    Returns:
    str: The sentence with single spaces only and no leading/trailing space.
    """
    collapsed = re.sub(r'\s+', ' ', sentence)
    return collapsed.strip()


def remove_non_ascii(text):
    """
    Drop every character whose code point lies outside the ASCII range.

    Parameters:
    text (str): The input text to clean.

    Returns:
    str: The characters of `text` with a code point below 128, in their
        original order.
    """
    ascii_chars = filter(lambda char: ord(char) < 128, text)
    return ''.join(ascii_chars)

In [30]:
##=========== EXTRACT ASPECT ============##
# Pairwise string combinations of two lists
def cross_product_str(first, second):
    """
    Join every element of `first` with every element of `second`.

    parameters
    -----------
    first: list/string (a bare string counts as a one-element list)
    second: list/string (a bare string counts as a one-element list)

    return: list of string, each "<first item> <second item>", ordered by
            `first` and then by `second`
    """
    left = [first] if type(first) == str else first
    right = [second] if type(second) == str else second
    return [a + ' ' + b for a in left for b in right]

def cross_product_tuple(first, second):
    """
    Pair every element of `first` with every element of `second`.

    parameters
    -----------
    first: list/string (a bare string counts as a one-element list)
    second: list/string (a bare string counts as a one-element list)

    return: list of tuple, each (first item, second item), ordered by
            `first` and then by `second`
    """
    left = [first] if type(first) == str else first
    right = [second] if type(second) == str else second
    return [(a, b) for a in left for b in right]

# Negation marker of a token
def get_neglect(token):
    """
    Return 'not' when a direct child of `token` negates it, otherwise ''.

    A child counts as a negation when its dependency label is 'neg', or when
    it is a determiner ('det') whose text is "no" in any casing.
    """
    is_negated = any(
        child.dep_ == 'neg' or (child.dep_ == 'det' and child.text.lower() == 'no')
        for child in token.children
    )
    return 'not' if is_negated else ''
    
# First conjunct attached to a token
def get_token_conj(token):
    """
    Return the first child of `token` whose dependency label is 'conj',
    or None when it has no such child.
    """
    return next((child for child in token.children if child.dep_ == 'conj'), None)

# Get token coordinate conjugation
# def get_token_cc(token):
#     for t in token.children:
#         if t.dep_ == 'cc':
#             return t
#     return None

# Crawl all possible conjuncts
def extract_conj(token, neglect=False, lemma=False, all_token=False):
    """
    Walk the chain of conjuncts hanging off `token` (its 'conj' child, that
    child's 'conj' child, and so on) and collect one entry per conjunct.

    Parameters:
    token: Token-like object; get_token_conj() finds each next conjunct.
    neglect (bool): If True, prefix an entry with 'not ' when get_neglect()
        reports a negation on that conjunct.
    lemma (bool): If True, use the conjunct's `lemma_` instead of its `text`.
        This applies whether or not `neglect` is set.
    all_token (bool): If True, return the conjunct tokens themselves;
        `neglect` and `lemma` are then ignored.

    Returns:
    list: The conjunct tokens when `all_token` is True, otherwise strings, in
        chain order. `token` itself is never part of the result.
    """
    result = []
    current = get_token_conj(token)
    # Compare against None explicitly so the walk never depends on the
    # truthiness of a token object.
    while current is not None:
        if all_token:
            result.append(current)
        else:
            word = current.lemma_ if lemma else current.text
            if neglect:
                # strip() removes the leading space left by an empty negation
                word = (get_neglect(current) + ' ' + word).strip()
            result.append(word)
        current = get_token_conj(current)

    return result

# Get the text of a token together with all of its conjuncts
def get_text_conj(token):
    """
    Render `token` and every conjunct chained to it as one string.

    Parameters:
    token: Token-like object with `.text` and `.dep_`; get_token_conj() is
        used to follow the chain of conjuncts.

    Returns:
    str: The token texts in chain order. Consecutive texts are separated by
        ', ', except that a token whose dependency label is 'cc' is followed
        by a single space instead.
    """
    # Collect token objects (not strings): `.text` and `.dep_` are read below.
    tokens = [token]
    current = get_token_conj(token)
    while current is not None:
        tokens.append(current)
        current = get_token_conj(current)

    pieces = []
    last_index = len(tokens) - 1
    for i, t in enumerate(tokens):
        pieces.append(t.text)
        # No separator after the final token
        if i < last_index:
            pieces.append(' ' if t.dep_ == 'cc' else ', ')

    return ''.join(pieces)


# # Get the sentence point mapper
# def get_sentence_mapper():
#     sentence_point = {}
#     for i, s in enumerate(doc.sents):
#         sentence_point[i] = (s.start, s.end)
#     return sentence_point
    
# # Get location sentence
# sentence_mapper = get_sentence_mapper(doc)

def get_sentence_location(mapper, position):
    """
    Find which sentence a token position belongs to.

    Parameters:
    mapper (dict): Sentence key -> interval; index 0 of the interval is the
        inclusive start and index 1 the exclusive end.
    position (int): Token position to look up.

    Returns:
    The key of the first interval (in dict order) that contains `position`,
    or None when no interval does.
    """
    for sentence_id, bounds in mapper.items():
        if bounds[0] <= position < bounds[1]:
            return sentence_id
    return None

In [7]:
# Extract all raw aspects
def get_raw_aspects(doc):
    """
    Collect candidate aspect terms from a parsed document.

    Tokens shorter than three characters are skipped. For every other token
    the first matching rule produces one candidate:
      1. noun directly after an adjective: "<adjective> <noun>", or the noun
         alone when the adjective is in ``bing_liu_opinion_words``;
      2. noun directly after another noun: "<noun> <noun>";
      3. noun that is a direct object ('dobj');
      4. noun that is a subject ('nsubj');
      5. noun that is a conjunct ('conj') of another noun;
      6. any token that is an adverbial modifier ('advmod') of a noun:
         "<head noun> <token>".

    Parameters:
    doc: Parsed document, iterated and indexed by token position. Tokens need
        `.text`, `.pos_`, `.dep_`, `.head` and `.i`.

    Returns:
    list of tuple: Unique (text, start, end) candidates sorted by start and
        then by text. For rules 1-5 `end` is exclusive.
    """
    storage = []

    for idx, token in enumerate(doc):
        # Skip very short tokens (fewer than three characters)
        if len(token.text) < 3:
            continue

        is_noun = token.pos_ == 'NOUN'
        previous = doc[idx - 1] if idx != 0 else None

        # Rule 1: noun preceded by an adjective
        if is_noun and previous is not None and previous.pos_ == 'ADJ':
            adjective = previous.text
            # Also try the lower-cased form, so a capitalized adjective
            # (e.g. at the start of a sentence) still matches a lower-case
            # lexicon entry.
            is_opinion = (adjective in bing_liu_opinion_words
                          or adjective.lower() in bing_liu_opinion_words)
            if is_opinion:
                # The adjective is an opinion, not part of the aspect
                storage.append((token.text, idx, idx + 1))
            else:
                storage.append((adjective + ' ' + token.text, idx - 1, idx + 1))
            continue

        # Rule 2: noun preceded by another noun
        if is_noun and previous is not None and previous.pos_ == 'NOUN':
            storage.append((previous.text + ' ' + token.text, idx - 1, idx + 1))
            continue

        # Rules 3-5: noun that is a direct object, a subject, or a conjunct
        # of another noun
        if is_noun and (
            token.dep_ in ('dobj', 'nsubj')
            or (token.dep_ == 'conj' and token.head.pos_ == 'NOUN')
        ):
            storage.append((token.text, idx, idx + 1))
            continue

        # Rule 6: adverbial modifier whose head is a noun
        if token.dep_ == 'advmod' and token.head.pos_ == 'NOUN':
            # NOTE(review): the span starts at the head noun, so when the
            # adverb stands before its head, start is greater than end.
            storage.append((token.head.text + ' ' + token.text, token.head.i, idx + 1))

    # De-duplicate, then order by start position and text
    return sorted(set(storage), key=lambda x: (x[1], x[0]))

# Pruning raw aspects
def prunning_aspect(list_, doc):
    """
    Merge raw aspect candidates that overlap or touch, then group the merged
    texts by sentence.

    Two neighbouring candidates are merged when
      * the last token of the first is the first token of the second
        (end - 1 == next start): the duplicated first word of the second text
        is dropped before joining; or
      * the second starts right where the first ends (end == next start): the
        two texts are joined with a space.

    Parameters:
    list_ (list): (text, start, end) tuples ordered by start, as returned by
        get_raw_aspects(). The list is not modified.
    doc: Parsed document; only `doc.sents` (objects with `.start` and `.end`)
        is read.

    Returns:
    dict: Sentence index -> list of merged aspect texts. Every sentence of the
        document has a key, possibly with an empty list.
    """
    # Work on a copy so the caller's list keeps its original content
    items = list(list_)
    dropped = set()

    # Sentence index -> (start, end), plus one output bucket per sentence
    sentence_points = {}
    storage = {}
    for i, s in enumerate(doc.sents):
        sentence_points[i] = (s.start, s.end)
        storage[i] = []

    for idx in range(len(items) - 1):
        text, start, end = items[idx]
        next_text, next_start, next_end = items[idx + 1]

        if end - 1 == next_start:
            # Shared token: skip the first word of the next text. Joining the
            # parts as a list avoids a trailing space when nothing is left.
            merged_text = ' '.join([text] + next_text.split()[1:])
        elif end == next_start:
            # Adjacent spans: plain concatenation
            merged_text = text + ' ' + next_text
        else:
            continue

        # The merged candidate replaces the next item so it can keep growing
        # on the following iteration; the current item is dropped.
        items[idx + 1] = (merged_text, start, next_end)
        dropped.add(idx)

    for i, item in enumerate(items):
        if i in dropped:
            continue
        text, start, _ = item
        sentence_location = get_sentence_location(sentence_points, start)
        storage[sentence_location].append(text)
    return storage

In [27]:
# Generate mapper pronouns-antecedents (subject only)
def get_mapper_pron_ant(doc):
    """
    Map pronoun token indices to the token index of a subject antecedent.

    Candidates are collected in token order:
      * antecedents: NOUN / PROPN tokens with dependency 'nsubj';
      * pronouns: PRON tokens with dependency 'nsubj' or 'poss'.

    For each pronoun the search starts in the pronoun's own sentence and moves
    back one sentence at a time down to sentence 0. Within a sentence the
    antecedents are tried in token order and the first one located before the
    pronoun wins. Pronouns without such an antecedent are left out.

    Parameters:
    doc: Parsed document; iterated token by token (`.pos_`, `.dep_`, `.i`),
        and `doc.sents` is read for sentence boundaries (`.start`, `.end`).

    Returns:
    dict: Pronoun token index -> antecedent token index.
    """
    # Sentence index -> (start, end) token interval
    sentence_points = {number: (sent.start, sent.end) for number, sent in enumerate(doc.sents)}

    # Each candidate is stored as (token, token index, sentence index)
    antecedents = []
    pronouns = []
    for token in doc:
        is_subject = token.dep_ == 'nsubj'

        # Potential antecedent: a noun or proper noun acting as subject
        if is_subject and token.pos_ in ['NOUN', 'PROPN']:
            antecedents.append((token, token.i, get_sentence_location(sentence_points, token.i)))

        # Potential pronoun: a subject pronoun or a possessive pronoun
        if token.pos_ == 'PRON' and (is_subject or token.dep_ == 'poss'):
            pronouns.append((token, token.i, get_sentence_location(sentence_points, token.i)))

    mapper = {}
    for token_pron, index_pron, sent_pron in pronouns:
        pron_role_ok = 'subj' in token_pron.dep_ or 'poss' in token_pron.dep_
        sentence = sent_pron
        resolved = False

        # Look in the pronoun's sentence first, then in earlier sentences
        while sentence > -1 and not resolved:
            for token_ant, index_ant, sent_ant in antecedents:
                if sent_ant != sentence:
                    continue
                # The antecedent must be a subject standing left of the pronoun
                if 'subj' in token_ant.dep_ and pron_role_ok and index_ant < index_pron:
                    mapper[index_pron] = index_ant
                    resolved = True
                    break
            sentence -= 1

    return mapper

In [31]:
def _collect_subjects(doc, head, mapper_pron_ant):
    """Return the lemmas of the usable 'nsubj' children of `head`, each followed by its conjuncts."""
    subjects = []
    for child in head.children:
        if child.dep_ != 'nsubj':
            continue
        if child.pos_ == 'PRON' and child.i in mapper_pron_ant:
            # Replace a resolved pronoun by its antecedent token
            child = doc[mapper_pron_ant[child.i]]
        elif (child.pos_ == 'PRON') or (re.match(r'^[0-9\W]+$', child.text)) or (len(child.text) < 3):
            # Skip unresolved pronouns, subjects made only of digits/special
            # characters, and subjects shorter than three characters
            continue
        subjects.append(child.lemma_)
        subjects += extract_conj(child, lemma=True)
    return subjects


def get_raw_abilities(doc):
    """
    Extract (subject, ability) pairs from a parsed document, grouped by sentence.

    Three token kinds produce pairs:
      * VERB: each subject of the verb is paired with the verb's lemma;
      * AUX: each subject of the auxiliary is paired with every 'acomp' child
        (and its conjuncts), prefixed with 'not ' when negated;
      * NOUN: the noun (and its conjuncts) is paired with the lemma of every
        adjective to its left, prefixed with 'not ' when negated.

    Subject pronouns are replaced by their antecedent when
    get_mapper_pron_ant() resolves them and are skipped otherwise.

    Parameters:
    doc: Parsed document; iterated and indexed by token position, and
        `doc.sents` is read for sentence boundaries.

    Returns:
    dict: Sentence index -> list of (subject, ability) tuples. Every sentence
        of the document has a key, possibly with an empty list.
    """
    storage = {}

    # Get sentence mapper and prepare storage
    sentence_points = {}
    for i, s in enumerate(doc.sents):
        sentence_points[i] = (s.start, s.end)
        storage[i] = []

    # Get mapper pronoun and antecedents
    mapper_pron_ant = get_mapper_pron_ant(doc)

    for idx, token in enumerate(doc):
        subjects = []
        abilities = []

        # If the token is verb
        if token.pos_ == 'VERB':
            subjects = _collect_subjects(doc, token, mapper_pron_ant)

            if len(subjects) > 0:
                # Unique subjects; dict.fromkeys keeps first-seen order, so the
                # result does not depend on string hash ordering
                subjects = list(dict.fromkeys(subjects))

                result = cross_product_tuple(subjects, token.lemma_)
                sentence_location = get_sentence_location(sentence_points, idx)
                storage[sentence_location] += result

        # If the token is aux
        elif token.pos_ == 'AUX':
            # Check if the negation exists and depends on the aux token
            neg = get_neglect(token)

            # Subjects are selected exactly as in the VERB branch: any 'nsubj'
            # child qualifies, with no extra part-of-speech condition.
            subjects = _collect_subjects(doc, token, mapper_pron_ant)

            for t in token.children:
                if t.dep_ == 'acomp':
                    # If no negation hangs off the aux, check the complement itself
                    if not neg:
                        neg = get_neglect(t)
                    abilities.append(t.text)
                    if not neg:
                        # No negation so far: let every conjunct carry its own
                        abilities += extract_conj(t, neglect=True, lemma=True)
                    else:
                        # A negation on the aux/complement applies to every
                        # ability collected so far
                        abilities += extract_conj(t, lemma=True)
                        abilities = cross_product_str(neg, abilities)

            if len(subjects) > 0 and len(abilities) > 0:
                subjects = list(dict.fromkeys(subjects))

                result = cross_product_tuple(subjects, abilities)
                sentence_location = get_sentence_location(sentence_points, idx)
                storage[sentence_location] += result

        # If the token is noun
        elif token.pos_ == 'NOUN':
            # If only contains special characters or numbers, or length text less than 3
            if re.match(r'^[0-9\W]+$', token.text) or len(token.text) < 3:
                continue
            for t in token.lefts:
                if t.pos_ == 'ADJ':
                    neg = get_neglect(t)
                    if neg:
                        abilities.append(neg + ' ' + t.lemma_)
                    else:
                        abilities.append(t.lemma_)

            # Only nouns that carry at least one adjective produce pairs
            if len(abilities) > 0:
                subjects.append(token.lemma_)
                subjects += extract_conj(token, lemma=True)
                subjects = list(dict.fromkeys(subjects))

                result = cross_product_tuple(subjects, abilities)
                sentence_location = get_sentence_location(sentence_points, idx)
                storage[sentence_location] += result

    return storage

In [32]:
def process(x):
    """
    Clean a raw review and extract its (subject, ability) pairs.

    Parameters:
    x (str): The raw review text.

    Returns:
    The result of get_raw_abilities() for the parsed, cleaned text.
    """
    # Prepare sentence: each step works on the output of the previous one
    texts = remove_extra_spaces(x)
    texts = expand_contractions(texts)
    texts = remove_non_ascii(texts)

    # Get abilities
    doc = nlp(texts)
    result = get_raw_abilities(doc)

    return result

# Sample review used to try the pipeline on a single text.
# NOTE(review): some adjacent fragments are concatenated without a separating
# space ("McDonald's ." + "According ...", "... to keep." + "One of ..."), so
# the joined string contains ".According" and "keep.One".
text = "Just spent 10 minutes waiting at this McDonald's ." +\
        "According to Google they're open 24/7. Finally we pull up to the window to see if anyone was there," +\
        " sure enough one employee seated in the lobby and another at the window. She told me they were closed." +\
        " Whoever these two are they need to be replaced. Get it together McDonald's. You're a corporate power house and you have a reputation to keep." +\
        "One of the most unprofessional experiences I've ever had with fast food. 0/10 would not recomend this location."
# text = "We hit it off pretty good in the beginning everything was great but that didn" +\
#         "ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯" +\
#         "ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯" +\
#         "ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½ÃÂ¯ÃÂ¿ÃÂ½t last long. I mean the service is fast but with attitude an"
# Run the `process` defined in this cell on the sample review and display the result
process(text)

{0: [],
 1: [],
 2: [],
 3: [],
 4: [],
 5: [('house', 'corporate')],
 6: [('experiences', 'unprofessional'), ('food', 'fast')],
 7: []}

In [22]:
from spacy import displacy
def process(x):
    """
    Clean a raw review: collapse whitespace, expand contractions and drop
    non-ASCII characters, in that order.

    Parameters:
    x (str): The raw review text.

    Returns:
    str: The cleaned text.
    """
    # Each step works on the output of the previous one
    texts = remove_extra_spaces(x)
    texts = expand_contractions(texts)
    texts = remove_non_ascii(texts)

    return texts
# Draw the dependency parse ('dep') of the cleaned sample review
displacy.render(nlp(process(text)), 'dep')

In [None]:
def process(x):
    """
    Clean a raw review and extract its pruned aspects.

    Parameters:
    x (str): The raw review text.

    Returns:
    The result of prunning_aspect() applied to the raw aspects of the parsed,
    cleaned text.
    """
    # Prepare sentence: each step works on the output of the previous one
    texts = remove_extra_spaces(x)
    texts = expand_contractions(texts)
    texts = remove_non_ascii(texts)

    # Get aspect
    doc = nlp(texts)
    result = get_raw_aspects(doc)
    result = prunning_aspect(result, doc)

    return result

# Extract aspects for every review.
# NOTE: `process` is redefined in several cells; this line uses whichever
# definition was executed last, so run it right after the definition above.
df['aspect'] = df['review'].apply(process)

In [None]:
def process(x):
    """
    Clean a raw review and extract its (subject, ability) pairs.

    Parameters:
    x (str): The raw review text.

    Returns:
    The result of get_raw_abilities() for the parsed, cleaned text.
    """
    # Prepare sentence: each step works on the output of the previous one
    texts = remove_extra_spaces(x)
    texts = expand_contractions(texts)
    texts = remove_non_ascii(texts)

    # Get abilities
    doc = nlp(texts)
    result = get_raw_abilities(doc)

    return result

# Extract abilities for every review.
# NOTE: `process` is redefined in several cells; this line uses whichever
# definition was executed last, so run it right after the definition above.
df['abilities'] = df['review'].apply(process)

In [None]:
# Show each review next to its extracted aspects and abilities
df[['review', 'aspect', 'abilities']]

In [None]:
# Inspect the first row: the full review text and the aspects extracted from it
review, aspects = df[['review', 'aspect']].iloc[0]

print(review)
print(aspects)

In [None]:
# Inspect the row at position 50: the full review text and its extracted aspects
review, aspects = df[['review', 'aspect']].iloc[50]

print(review)
print(aspects)

In [None]:
# Write the current frame to an Excel file in the working directory (row index omitted)
df.to_excel("delete.xlsx", index=False)

In [None]:
import re

def filter_text(inputs):
    """
    Keep only the strings that are usable as words.

    A string is dropped when it consists solely of digits, consists solely of
    characters that are neither word characters nor whitespace, or is shorter
    than three characters.

    Parameters:
    inputs (iterable of str): The candidate strings.

    Returns:
    list of str: The strings that passed every check, in their original order.
    """
    def _is_kept(text):
        if re.match(r'^[0-9]+$', text):
            return False
        if re.match(r'^[^\w\s]+$', text):
            return False
        return len(text) >= 3

    return [text for text in inputs if _is_kept(text)]

# Example usage
texts_to_filter = [
    "12",        # invalid (length < 3, only numbers)
    "!!!",       # invalid (only special characters)
    "abc",       # valid
    "1234",      # invalid (only numbers)
    "!@#",       # invalid (only special characters)
    "ab1",       # valid
    "abc@",      # valid
]

result = filter_text(texts_to_filter)
print(result)  # Output: ['abc', 'ab1', 'abc@']
