In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy

import re
import os

In [2]:
# Load the spaCy pipeline named 'en_core_web_lg'; `nlp` is what later cells call to parse text.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Read the review sample and keep only the columns this notebook works with.
# `.copy()` makes the column subset an independent frame, so the assignment to
# `rating` below does not write into a selection of the full CSV frame.
df = pd.read_csv('./sample-mcd.csv', encoding='latin1')
df = df[['reviewer_id', 'review_time', 'review', 'rating']].copy()

# The rating is stored as text; keep the integer that precedes the first space.
df['rating'] = df['rating'].apply(lambda x: int(x.split(" ")[0]))

# DataFrame.info() prints its report itself and returns None, so it is called
# bare: wrapping it in print() only added a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  100 non-null    int64 
 1   review_time  100 non-null    object
 2   review       100 non-null    object
 3   rating       100 non-null    int64 
dtypes: int64(2), object(2)
memory usage: 3.3+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating
0,1,3 months ago,Why does it look like someone spit on my food?...,1
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4
2,3,5 days ago,Made a mobile order got to the speaker and che...,1
3,4,a month ago,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,5
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1


In [4]:
# Load Bing Liu's opinion word dictionary
bing_liu_opinion_words = set()  # Filled in place by load_opinion_words()


def load_opinion_words(filepath, encoding=None):
    """
    Add the words listed in a lexicon file to `bing_liu_opinion_words`.

    The file is read with pandas as a tab-separated table without a header row;
    only the first column is used. Text from a ';' to the end of a line is
    treated as a comment and blank lines are skipped.

    Parameters:
    filepath (str): Path to the lexicon text file.
    encoding (str or None): Text encoding handed to pandas. None keeps the
        pandas default.

    Returns:
    None: The module-level set `bing_liu_opinion_words` is updated in place.
    """
    words = pd.read_table(
        filepath,
        comment=';',
        header=None,
        dtype=str,
        # Keep literal entries such as "null" or "NA" as words; with the pandas
        # defaults they would be parsed as NaN and end up in the set as floats.
        keep_default_na=False,
        encoding=encoding,
    )[0]

    # Update in place so every existing reference to the set sees the new words.
    bing_liu_opinion_words.update(words)


# Read both polarity lists of the lexicon, resolved against the working directory
current_dir = os.getcwd()
for lexicon_file in ('util/opinion-lexicon-English/negative-words.txt',
                     'util/opinion-lexicon-English/positive-words.txt'):
    load_opinion_words(os.path.join(current_dir, lexicon_file))

In [5]:
# Contraction

from contractions import CONTRACTION_MAP

def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.

    Matching is case-insensitive and the first character of each matched
    contraction is kept as written, so the original capitalisation of the
    first letter survives the expansion.

    Parameters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary mapping each contraction to its
        expanded form. Keys are matched as literal text.

    Returns:
    str: The sentence with every known contraction expanded.
    """
    # An empty mapping would compile to a pattern that matches the empty string
    # at every position; there is nothing to expand, so return the input as is.
    if not contraction_mapping:
        return sentence

    # The pattern ignores case, so a match may differ in case from every key
    # (e.g. "i'd" against the key "I'd"). This lower-cased view lets such a
    # match still find its expansion.
    lowercase_mapping = {key.lower(): value for key, value in contraction_mapping.items()}

    # Longest keys first, so a longer contraction wins over a shorter key that
    # is its prefix; keys are escaped so they are matched literally.
    ordered_keys = sorted(contraction_mapping.keys(), key=len, reverse=True)
    contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
                                      flags=re.IGNORECASE|re.DOTALL)

    def expanded_match(contraction):
        """
        Build the replacement text for one matched contraction.

        Parameters:
        contraction (re.Match): The match object for a contraction.

        Returns:
        str: The expanded contraction, or the matched text unchanged when no
            expansion can be found for it.
        """
        match = contraction.group(0)
        expanded_contraction = contraction_mapping.get(match) or lowercase_mapping.get(match.lower())
        if not expanded_contraction:
            # No usable expansion: leave the text alone rather than crash.
            return match

        # Keep the first character exactly as it appeared in the sentence.
        return match[0] + expanded_contraction[1:]

    return contractions_pattern.sub(expanded_match, sentence)
    

In [58]:
# Take the second review of the frame ...
sample = df['review'].iloc[1]
# ... which is immediately replaced by this literal, so the cells below work on this fixed text.
sample = 'It would McDonalds. It is what it is as far as the food and atmosphere go. The staff here does make a difference. The staff and manager are all friendly, accommodating and always smiling. Makes for a more pleasant experience than many other not fast food places.'
# sample = 'The staff and manager are no friendly, accommodating and no smiling.'
# Run the text through expand_contractions() and show the result.
sample = expand_contractions(sample)
print(sample)

It would McDonalds. It is what it is as far as the food and atmosphere go. The staff here does make a difference. The staff and manager are all friendly, accommodating and always smiling. Makes for a more pleasant experience than many other not fast food places.


In [59]:
# Parse the sample text and list its sentences, numbered from 1
doc = nlp(sample)

for number, sentence in enumerate(doc.sents, start=1):
    print(f'{number}. {sentence}')

1. It would McDonalds.
2. It is what it is as far as the food and atmosphere go.
3. The staff here does make a difference.
4. The staff and manager are all friendly, accommodating and always smiling.
5. Makes for a more pleasant experience than many other not fast food places.


In [60]:
from spacy import displacy

# Render `doc` with displaCy's 'dep' (dependency parse) style.
displacy.render(doc, style='dep')

In [61]:
# Tabulate every token of `doc`: surface text, coarse POS, fine-grained tag and dependency label
temp = {
    'text': [tok.text for tok in doc],
    'pos': [tok.pos_ for tok in doc],
    'tag': [tok.tag_ for tok in doc],
    'dep': [tok.dep_ for tok in doc],
}

pd.DataFrame.from_dict(temp)

Unnamed: 0,text,pos,tag,dep
0,It,PRON,PRP,nsubj
1,would,AUX,MD,ROOT
2,McDonalds,PROPN,NNP,attr
3,.,PUNCT,.,punct
4,It,PRON,PRP,nsubj
5,is,AUX,VBZ,ROOT
6,what,PRON,WP,attr
7,it,PRON,PRP,nsubj
8,is,AUX,VBZ,ccomp
9,as,ADV,RB,advmod


In [67]:
# EXTRACT ASPECT
# Collect candidate aspect terms from `doc` as (text, start, end) tuples, where
# start/end are token indices into `doc` and `end` is exclusive.
storage = []

# adv = []
# adj_mod = []
adv_adj_mod = []  # only referenced by the disabled code further down
is_contain_subject_verb = False  # set in the loop; only read by the disabled code further down
# sent = list(doc.sents)[0]
for idx, token in enumerate(doc):
    # If the word is noun and preceded by an adjective
    if idx != 0 and (token.pos_ == 'NOUN' and doc[idx - 1].pos_ == 'ADJ'):
        adjective = doc[idx - 1].text
        # The adjective is looked up as written and lower-cased, so a capitalised
        # opinion word (e.g. at the start of a sentence) is still recognised.
        is_opinion = adjective in bing_liu_opinion_words or adjective.lower() in bing_liu_opinion_words
        if not is_opinion:
            # The adjective is not an opinion word: treat it as part of the
            # aspect and store "adjective noun".
            text = adjective + ' ' + token.text
            storage.append((text, idx - 1, idx + 1))
        else:
            # The adjective is an opinion word: keep the noun only
            text = token.text
            storage.append((text, idx, idx + 1))

    # If the word is noun and preceded by another noun
    if idx != 0 and (token.pos_ == 'NOUN' and doc[idx - 1].pos_ == 'NOUN'):
        text = doc[idx - 1].text + ' ' + token.text
        storage.append((text, idx - 1, idx + 1))

    # If the word is noun and direct object
    if token.pos_ == 'NOUN' and token.dep_ == 'dobj':
        text = token.text
        storage.append((text, idx, idx + 1))

    # If the word is noun and a subject of sentence
    if token.pos_ == 'NOUN' and token.dep_ == 'nsubj':
        text = token.text
        storage.append((text, idx, idx + 1))

    # If the word is noun and a conj of another noun
    if (token.pos_ == 'NOUN' and token.dep_ == 'conj') and (token.head.pos_ == 'NOUN'):
        text = token.text
        storage.append((text, idx, idx + 1))

    # If the sentence contains SUBJECT VERB, then makes it true
    if token.dep_ == 'nsubj' and token.head.pos_ == 'VERB':
        is_contain_subject_verb = True

    # If token is word that contain pre-modifier
    # if (token.dep_ == 'amod' and token.head.pos_ == 'NOUN'):
    #     if token.head.i != idx + 1:
    #         continue
    #     text = token.text + ' ' + token.head.text
    #     storage.append((text, idx, token.head.i + 1))

    # # If token is word that contain post-modifier
    # if (token.dep_ == 'pobj' and token.pos_ == 'NOUN'):
    #     if token.head.dep_ == 'prep' and token.head.head.pos_ == 'NOUN':
    #         text = token.head.head.text + ' ' + token.head.text + ' ' + token.text
    #         start = token.head.head.i
    #         storage.append((text, start, idx + 1))


    # If token is adverb modifier and its head is NOUN then store it.
    if (token.dep_ == 'advmod' and token.head.pos_ == 'NOUN'):
        text = token.head.text + ' ' + token.text
        storage.append((text, token.head.i, idx + 1))
        # adv_adj_mod.append((text, idx, idx + 1))

# If sentence contains SUBJECT VERB
# if is_contain_subject_verb:
#     # if adjective modifier and adverb is opinion word, append to storage
#     for e in adv_adj_mod:
#         storage.append(e)

# Several rules can store the same tuple; the set() drops exact duplicates.
storage = list(set(storage))
# Order by start index, then by text, so neighbouring candidates sit next to each other
storage = sorted(storage, key=lambda x: (x[1], x[0]))

storage

[('food', 13, 14),
 ('atmosphere', 15, 16),
 ('staff', 19, 20),
 ('staff here', 19, 21),
 ('difference', 24, 25),
 ('staff', 27, 28),
 ('manager', 29, 30),
 ('experience', 44, 45),
 ('food', 50, 51),
 ('food places', 50, 52)]

In [68]:
# PRUNING
# Walk the sorted candidates in `storage` and fold each one into its successor
# when the two touch. The merged tuple replaces the successor, so a chain of
# touching candidates keeps growing, and the absorbed entries are dropped at the end.
drop_idx = []
for idx, item in enumerate(storage):
    print(idx + 1, item)
    if idx != len(storage) - 1:
        next_item = storage[idx + 1]
        if item[-1] - 1 == next_item[1]:
            # The successor starts on this candidate's last token: append only
            # the successor's words after its first one.
            append_text = ' '.join(next_item[0].split()[1:])
            new_start = item[1]
            new_end = next_item[-1]
            # A one-word successor contributes nothing; skip the separator so the
            # merged text does not end with a trailing space.
            new_text = item[0] + ' ' + append_text if append_text else item[0]
            drop_idx.append(idx)
            storage[idx + 1] = (new_text, new_start, new_end)

        elif item[-1] == next_item[1]:
            # The successor starts right after this candidate: join both texts.
            new_start = item[1]
            new_end = next_item[-1]
            new_text = item[0] + ' ' + next_item[0]
            drop_idx.append(idx)
            storage[idx + 1] = (new_text, new_start, new_end)

storage = [storage[i] for i in range(len(storage)) if i not in drop_idx]
storage

1 ('food', 13, 14)
2 ('atmosphere', 15, 16)
3 ('staff', 19, 20)
4 ('staff here', 19, 21)
5 ('difference', 24, 25)
6 ('staff', 27, 28)
7 ('manager', 29, 30)
8 ('experience', 44, 45)
9 ('food', 50, 51)
10 ('food places', 50, 52)


[('food', 13, 14),
 ('atmosphere', 15, 16),
 ('staff here', 19, 21),
 ('difference', 24, 25),
 ('staff', 27, 28),
 ('manager', 29, 30),
 ('experience', 44, 45),
 ('food places', 50, 52)]

In [85]:
# GET THE ABILITY OF EACH CONTEXT

# Cross product two lists
def cross_product(first, second):
    """
    Join every item of `first` with every item of `second` using one space.

    Parameters:
    first (str or iterable of str): Left-hand words. A string (or str
        subclass) is treated as a single word, not iterated character by
        character.
    second (str or iterable of str): Right-hand words, handled the same way.

    Returns:
    list of str: "<first item> <second item>" for every combination, ordered
        by `first` and then by `second`. Empty when either side is empty.
    """
    if isinstance(first, str):
        first = [first]
    if isinstance(second, str):
        second = [second]
    return [i + ' ' + j for i in first for j in second]

# Get neglection
def get_neglect(token):
    """
    Return the text of the first negating child of `token`, or '' if there is none.

    A child counts as negating when its dependency label is 'neg', or when it
    is a determiner ('det') whose text is "no" in any letter case.
    """
    negators = (
        child.text
        for child in token.children
        if child.dep_ == 'neg' or (child.dep_ == 'det' and child.text.lower() == 'no')
    )
    return next(negators, '')
    
# Check children contain conjugation
def get_token_conj(token):
    """
    Return the first child of `token` whose dependency label is 'conj'.

    Returns None when no child carries that label.
    """
    conjuncts = (child for child in token.children if child.dep_ == 'conj')
    return next(conjuncts, None)

# Crawling all possibile conjugation
def extract_conj(token, neglect=False, lemma=False):
    """
    Collect the words coordinated with `token` by following its 'conj' chain.

    Starting from `token`, the first 'conj' child is taken, then that child's
    first 'conj' child, and so on until a token has none. `token` itself is
    not included in the result.

    Parameters:
    token: The token whose conjuncts are collected (uses `.children`).
    neglect (bool): When True, prefix each conjunct with the text returned by
        get_neglect() for it, if any.
    lemma (bool): When True, use each conjunct's `lemma_` instead of its
        `text`. This applies whether or not `neglect` is set.

    Returns:
    list of str: One entry per conjunct, in chain order.
    """
    result = []
    current = get_token_conj(token)
    # get_token_conj() signals the end of the chain with None.
    while current is not None:
        word = current.lemma_ if lemma else current.text
        if neglect:
            # strip() removes the leading space left when there is no negation.
            word = (get_neglect(current) + ' ' + word).strip()
        result.append(word)
        current = get_token_conj(current)

    return result

# For every token of `doc`, build "<subject> <ability>" pairs according to the
# token's part of speech and print them as one comma-separated line.
for idx, token in enumerate(doc):
    text = ''
    subjects = []
    abilities = []

    # VERB: pair each subject of the verb with the verb's lemma
    if token.pos_ == 'VERB':
        
        for t in token.children:
            # Only children attached as nominal subject ('nsubj') are used.
            if t.dep_ == 'nsubj':
                # text = text + t.text
                subjects.append(t.text)
                # Also take the words coordinated with that subject (its 'conj' chain).
                subjects += extract_conj(t)

        # One "<subject> <lemma>" entry per subject; stays empty when the verb has no subject
        text = ', '.join(cross_product(subjects, token.lemma_))


    # AUX: pair each subject with the adjectival complement(s) of the auxiliary
    if token.pos_ == 'AUX':
        
        # Negation attached directly to the auxiliary, '' when there is none
        neg = get_neglect(token)
        # Scan the auxiliary's children for subjects and complements
        for t in token.children:
            if t.dep_ == 'nsubj':
                # first_subject = (neg + ' ' + t.text).strip()
                subjects.append(t.text)     
                # Also take the words coordinated with that subject (its 'conj' chain).
                subjects += extract_conj(t)

            if t.dep_ == 'acomp':
                # No negation on the auxiliary: look for one on the complement itself
                if not neg:
                    neg = get_neglect(t)
                abilities.append(t.text)
                # Still no negation: collect the coordinated complements and let each
                # one carry its own negation, if it has any.
                # Otherwise collect them plain and prefix the shared negation to
                # every ability gathered so far.
                if not neg:
                    abilities += extract_conj(t, neglect=True, lemma=True)
                else:
                    abilities += extract_conj(t, lemma=True)
                    abilities = cross_product(neg, abilities)

        # Text is only produced when both a subject and a complement were found
        if len(subjects) > 0 and len(abilities) > 0 :
            text = ', '.join(cross_product(subjects, abilities))

    # NOUN: pair the noun with the adjectives to its left
    if token.pos_ == 'NOUN':
        for t in token.lefts:
            if t.pos_ == 'ADJ':
                # Keep a negation attached to the adjective in front of its lemma
                neg = get_neglect(t)
                if neg:
                    abilities.append(neg + ' ' + t.lemma_)
                else:
                    abilities.append(t.lemma_)

        # Only when an adjective was found: the noun and the words coordinated
        # with it (its 'conj' chain) all receive the same abilities.
        if len(abilities) > 0:
            subjects.append(token.text)
            subjects += extract_conj(token)
            text = ', '.join(cross_product(subjects, abilities))
        
            
    # Tokens that produced no pairs print nothing
    if len(text) > 0:
        print(text)

food go, atmosphere go
staff make
staff friendly, staff accommodate, staff smile, manager friendly, manager accommodate, manager smile
experience pleasant
places many, places other, places not fast


In [80]:
# Example
# Three alternative inputs: each assignment replaces the previous one, so only
# the last `ex` is parsed below.
ex = 'The staff is not smiling'
# NOTE(review): the pieces of this literal are concatenated without a separating
# space, so sentences run together ("go.The staff ...").
ex = 'It would McDonalds. It is what it is as far as the food and atmosphere go.' +\
     'The staff here does make a difference.' +\
     'The staff and manager are not all friendly, accommodating and always smiling.' +\
     'Makes for a more pleasant experience than many other fast food places.'
ex = 'The staff and manager are no friendly, accommodating and no smiling'

doc2 = nlp(ex)

# Render `doc2` with displaCy; 'dep' is passed positionally here.
displacy.render(doc2, 'dep')

# Print the negation text found by get_neglect() for every token that has one.
for token in doc2:
    temp = get_neglect(token)
    if temp:
        print(temp)

no
no
