In [3]:
from gensim.models import KeyedVectors
glove_input_file = './util/glove.twitter.27B.200d.txt'

# Load GloVe vectors directly into a KeyedVectors instance
model = KeyedVectors.load_word2vec_format(glove_input_file, binary=False, no_header=True)

In [4]:
# Import package
import pandas as pd
import numpy as np

from tqdm import tqdm

In [5]:
df = pd.read_csv("./McDonald_s_Reviews.csv", encoding='latin1')
df = df[['reviewer_id', 'review_time', 'review', 'rating']]
print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33396 entries, 0 to 33395
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  33396 non-null  int64 
 1   review_time  33396 non-null  object
 2   review       33396 non-null  object
 3   rating       33396 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.0+ MB
None


Unnamed: 0,reviewer_id,review_time,review,rating
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [6]:
df = df.iloc[:100]

# Normalization

In [7]:
import util.normalization as norm

In [8]:
# Tokenization into sentence

df_st = df[['reviewer_id', 'review']].copy()

# Sentence Tokenization
df_st['token_sentence'] = df_st['review'].apply(norm.sentence_tokenize)
df_st = df_st.explode('token_sentence')
df_st['token_sentence'] = df_st['token_sentence'].str.strip()

# Remove non-ASCII
df_st['token_sentence'] = df_st['token_sentence'].apply(norm.remove_non_ascii)

# Expand contractions
df_st['token_sentence'] = df_st['token_sentence'].apply(norm.expand_contractions)

# Remove characters
df_st['token_sentence'] = df_st['token_sentence'].apply(norm.remove_characters, args=(True,))

# Remove enter tab
df_st['token_clean'] = df_st['token_sentence'].apply(norm.remove_enter_tab)

# Lemmatization
df_st['token_clean'] = df_st['token_clean'].apply(norm.lemmatize_text)

In [9]:
print(df_st.info())
df_st.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 546 entries, 0 to 99
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewer_id     546 non-null    int64 
 1   review          546 non-null    object
 2   token_sentence  546 non-null    object
 3   token_clean     546 non-null    object
dtypes: int64(1), object(3)
memory usage: 21.3+ KB
None


Unnamed: 0,reviewer_id,review,token_sentence,token_clean
0,1,Why does it look like someone spit on my food?...,Why does it look like someone spit on my food,why do it look like someone spit on my food
0,1,Why does it look like someone spit on my food?...,I had a normal transaction,i have a normal transaction
0,1,Why does it look like someone spit on my food?...,everyone was chill and polite,everyone be chill and polite
0,1,Why does it look like someone spit on my food?...,but now i dont want to eat this.,but now i dont want to eat this .
0,1,Why does it look like someone spit on my food?...,Im trying not to think about what this milky w...,im try not to think about what this milky whit...


# Get Aspect

In [10]:
import util.utility as util

In [11]:
tqdm.pandas()
df_st['sub_aspect'] = df_st['token_clean'].progress_apply(util.get_aspect_rules)

100%|████████████████████████████████████████████████████████████████████████████████| 546/546 [00:44<00:00, 12.41it/s]


In [12]:
aspects = ['food', 'service', 'order' ,'place']

def fun(x):
    temp = set()
    for e in x:
        aspect = util.get_aspect(e, aspects=aspects, model=model)
        if aspect:
            temp.add(aspect)
    temp = list(temp)
    return temp

tqdm.pandas()
# df_st['sub_aspect'].apply(fun)
# df_st['aspect'] = df_st['sub_aspect'].progress_apply(lambda x: list({util.get_aspect(e, aspects=aspects, model=model) 
#                                                                      for e in x}))
df_st['aspect'] = df_st['sub_aspect'].progress_apply(fun)


100%|██████████████████████████████████████████████████████████████████████████████| 546/546 [00:00<00:00, 3282.37it/s]


In [13]:
df_st

Unnamed: 0,reviewer_id,review,token_sentence,token_clean,sub_aspect,aspect
0,1,Why does it look like someone spit on my food?...,Why does it look like someone spit on my food,why do it look like someone spit on my food,{someone spit},[food]
0,1,Why does it look like someone spit on my food?...,I had a normal transaction,i have a normal transaction,{normal transaction},[service]
0,1,Why does it look like someone spit on my food?...,everyone was chill and polite,everyone be chill and polite,{everyone},[place]
0,1,Why does it look like someone spit on my food?...,but now i dont want to eat this.,but now i dont want to eat this .,{},[]
0,1,Why does it look like someone spit on my food?...,Im trying not to think about what this milky w...,im try not to think about what this milky whit...,"{clear substance, im}","[food, place]"
...,...,...,...,...,...,...
98,99,Allways has the best Fries and Ice cream in th...,Allways has the best Fries and Ice cream in th...,allways have the best fry and ice cream in the...,"{allways, ice cream, best fry}","[food, place]"
99,100,Mcdonalds is great but they really need to hir...,Mcdonalds is great,mcdonalds be great,{mcdonalds},[food]
99,100,Mcdonalds is great but they really need to hir...,they really need to hire people who understand...,they really need to hire people who understand...,"{order, people, english order, spanish speaker}","[order, place, food]"
99,100,Mcdonalds is great but they really need to hir...,The people are really nice,the people be really nice,{people},[place]


# Sentiment Analysis

In [14]:
import util.model as models

In [15]:
sample = df_st['token_clean'].iloc[4]
print(sample)
models.pattern_lexicon_model(sample)

im try not to think about what this milky white or clear substance be all over my food


'negative'

In [16]:
tqdm.pandas()
df_st['pattern_prediction'] = df_st['token_clean'].progress_apply(models.pattern_lexicon_model)

100%|██████████████████████████████████████████████████████████████████████████████| 546/546 [00:00<00:00, 6733.68it/s]


In [17]:
tqdm.pandas()
df_st['roberta_prediction'] = df_st['token_sentence'].progress_apply(models.roberta_model)

100%|████████████████████████████████████████████████████████████████████████████████| 546/546 [09:05<00:00,  1.00it/s]


# Get Result

In [18]:
def get_result(aspect, model):
    
    # Initialization
    get_aspects = df_st['aspect'].values
    get_predictions = df_st[model].values
    
    temp = []

    for i in range(len(get_aspects)):
        if (not get_aspects[i]) or (aspect not in get_aspects[i]):
            temp.append(0)
        else:
            if get_predictions[i] == 'negative':
                temp.append(-99)
            else:
                temp.append(1)
    return np.array(temp)

In [19]:
get_aspects = df_st['aspect'].values
get_pattern = df_st['pattern_prediction'].values
get_roberta = df_st['roberta_prediction'].values

new_fields = set()
for aspect in aspects:
    if aspect in ['service', 'order']:
        aspect = 'service'
        df_st[aspect + f"_pattern"] = get_result('service', 'pattern_prediction') + get_result('order', 'pattern_prediction')
        df_st[aspect + f"_roberta"] = get_result('service', 'roberta_prediction') + get_result('order', 'roberta_prediction')
    else:
        df_st[aspect + f"_pattern"] = get_result(aspect, 'pattern_prediction')
        df_st[aspect + f"_roberta"] = get_result(aspect, 'roberta_prediction')
        
    new_fields.add(aspect + f"_pattern")
    new_fields.add(aspect + f"_roberta")

new_fields = list(new_fields)
print(df_st.info())
df_st

<class 'pandas.core.frame.DataFrame'>
Int64Index: 546 entries, 0 to 99
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   reviewer_id         546 non-null    int64 
 1   review              546 non-null    object
 2   token_sentence      546 non-null    object
 3   token_clean         546 non-null    object
 4   sub_aspect          546 non-null    object
 5   aspect              546 non-null    object
 6   pattern_prediction  546 non-null    object
 7   roberta_prediction  546 non-null    object
 8   food_pattern        546 non-null    int32 
 9   food_roberta        546 non-null    int32 
 10  service_pattern     546 non-null    int32 
 11  service_roberta     546 non-null    int32 
 12  place_pattern       546 non-null    int32 
 13  place_roberta       546 non-null    int32 
dtypes: int32(6), int64(1), object(7)
memory usage: 51.2+ KB
None


Unnamed: 0,reviewer_id,review,token_sentence,token_clean,sub_aspect,aspect,pattern_prediction,roberta_prediction,food_pattern,food_roberta,service_pattern,service_roberta,place_pattern,place_roberta
0,1,Why does it look like someone spit on my food?...,Why does it look like someone spit on my food,why do it look like someone spit on my food,{someone spit},[food],negative,negative,-99,-99,0,0,0,0
0,1,Why does it look like someone spit on my food?...,I had a normal transaction,i have a normal transaction,{normal transaction},[service],positive,positive,0,0,1,1,0,0
0,1,Why does it look like someone spit on my food?...,everyone was chill and polite,everyone be chill and polite,{everyone},[place],negative,positive,0,0,0,0,-99,1
0,1,Why does it look like someone spit on my food?...,but now i dont want to eat this.,but now i dont want to eat this .,{},[],negative,negative,0,0,0,0,0,0
0,1,Why does it look like someone spit on my food?...,Im trying not to think about what this milky w...,im try not to think about what this milky whit...,"{clear substance, im}","[food, place]",negative,positive,-99,1,0,0,-99,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,99,Allways has the best Fries and Ice cream in th...,Allways has the best Fries and Ice cream in th...,allways have the best fry and ice cream in the...,"{allways, ice cream, best fry}","[food, place]",positive,positive,1,1,0,0,1,1
99,100,Mcdonalds is great but they really need to hir...,Mcdonalds is great,mcdonalds be great,{mcdonalds},[food],positive,positive,1,1,0,0,0,0
99,100,Mcdonalds is great but they really need to hir...,they really need to hire people who understand...,they really need to hire people who understand...,"{order, people, english order, spanish speaker}","[order, place, food]",negative,positive,-99,1,-99,1,-99,1
99,100,Mcdonalds is great but they really need to hir...,The people are really nice,the people be really nice,{people},[place],positive,positive,0,0,0,0,1,1


In [20]:
grouped_df = df_st[ ['reviewer_id'] + new_fields ].groupby('reviewer_id').sum()

grouped_df = grouped_df.applymap(lambda x: 0 if x < 0 else 1).reset_index()

grouped_df

Unnamed: 0,reviewer_id,service_roberta,place_roberta,food_roberta,food_pattern,service_pattern,place_pattern
0,1,1,1,0,0,1,0
1,2,1,1,1,1,0,0
2,3,0,1,0,0,0,0
3,4,1,1,1,0,0,1
4,5,1,1,0,0,0,0
...,...,...,...,...,...,...,...
95,96,0,0,1,0,0,0
96,97,0,0,0,0,0,0
97,98,0,0,1,1,0,0
98,99,1,1,1,1,1,1


In [21]:
result = df.merge(grouped_df, on='reviewer_id', how='left')

print(result.info())
result.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   reviewer_id      100 non-null    int64 
 1   review_time      100 non-null    object
 2   review           100 non-null    object
 3   rating           100 non-null    object
 4   service_roberta  100 non-null    int64 
 5   place_roberta    100 non-null    int64 
 6   food_roberta     100 non-null    int64 
 7   food_pattern     100 non-null    int64 
 8   service_pattern  100 non-null    int64 
 9   place_pattern    100 non-null    int64 
dtypes: int64(7), object(3)
memory usage: 8.6+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating,service_roberta,place_roberta,food_roberta,food_pattern,service_pattern,place_pattern
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star,1,1,0,0,1,0
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars,1,1,1,1,0,0
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star,0,1,0,0,0,0
3,4,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars,1,1,1,0,0,1
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star,1,1,0,0,0,0


# Evaluation

In [22]:
df_val = pd.read_csv('validation.csv').dropna()

print(df_val.info())
df_val.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  100 non-null    int64 
 1   review_time  100 non-null    object
 2   review       100 non-null    object
 3   rating       100 non-null    object
 4   food         100 non-null    object
 5   service      100 non-null    object
 6   order        100 non-null    object
 7   place        100 non-null    object
dtypes: int64(1), object(7)
memory usage: 7.0+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating,food,service,order,place
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star,Negative,Neutral,Positive,Neutral
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars,Neutral,Positive,Neutral,Positive
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star,Neutral,Negative,Negative,Neutral
3,4,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars,Positive,Positive,Neutral,Neutral
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star,Negative,Negative,Negative,Neutral


In [23]:
def mapping_sentiment(x):
    if x == 'Negative':
        return 0
    else:
        return 1    

def join_aspect(x, y):
    if x==1 and y==1:
        return 1
    else:
        return 0
    
for aspect in ['food', 'service', 'order', 'place']:
    df_val[aspect] = df_val[aspect].apply(mapping_sentiment)

df_val['service'] = [join_aspect(x, y) for x, y in zip(df_val.service, df_val.order)]
df_val = df_val.drop('order', axis=1)
print(df_val.info())
df_val.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  100 non-null    int64 
 1   review_time  100 non-null    object
 2   review       100 non-null    object
 3   rating       100 non-null    object
 4   food         100 non-null    int64 
 5   service      100 non-null    int64 
 6   place        100 non-null    int64 
dtypes: int64(4), object(3)
memory usage: 6.2+ KB
None


Unnamed: 0,reviewer_id,review_time,review,rating,food,service,place
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star,0,1,1
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars,1,1,1
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star,1,0,1
3,4,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars,1,1,1
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star,0,0,1


In [24]:
from sklearn.metrics import classification_report, roc_auc_score

print("Prediction pattern")
print("==============================================================")
for aspect in ['food', 'service', 'place']:
    actual = df_val[aspect]
    pred = result[aspect + "_pattern"]
    print(f"ROC AUC Score: ", roc_auc_score(actual, pred))
    print(f"Result {aspect}")
    print(classification_report(actual, pred))

Prediction pattern
ROC AUC Score:  0.6729780271154746
Result food
              precision    recall  f1-score   support

           0       0.43      0.84      0.57        31
           1       0.88      0.51      0.64        69

    accuracy                           0.61       100
   macro avg       0.65      0.67      0.61       100
weighted avg       0.74      0.61      0.62       100

ROC AUC Score:  0.6743421052631579
Result service
              precision    recall  f1-score   support

           0       0.86      0.72      0.79        76
           1       0.42      0.62      0.50        24

    accuracy                           0.70       100
   macro avg       0.64      0.67      0.64       100
weighted avg       0.75      0.70      0.72       100

ROC AUC Score:  0.6103896103896104
Result place
              precision    recall  f1-score   support

           0       0.51      0.86      0.64        44
           1       0.77      0.36      0.49        56

    accuracy      

In [25]:
from sklearn.metrics import classification_report, roc_auc_score

print("Prediction roberta")
print("==============================================================")
for aspect in ['food', 'service', 'place']:
    actual = df_val[aspect]
    pred = result[aspect + "_roberta"]
    print(f"ROC AUC Score: ", roc_auc_score(actual, pred))
    print(f"Result {aspect}")
    print(classification_report(actual, pred))

Prediction roberta
ROC AUC Score:  0.7187938288920056
Result food
              precision    recall  f1-score   support

           0       0.52      0.74      0.61        31
           1       0.86      0.70      0.77        69

    accuracy                           0.71       100
   macro avg       0.69      0.72      0.69       100
weighted avg       0.75      0.71      0.72       100

ROC AUC Score:  0.7883771929824562
Result service
              precision    recall  f1-score   support

           0       0.98      0.62      0.76        76
           1       0.44      0.96      0.61        24

    accuracy                           0.70       100
   macro avg       0.71      0.79      0.68       100
weighted avg       0.85      0.70      0.72       100

ROC AUC Score:  0.7637987012987013
Result place
              precision    recall  f1-score   support

           0       0.70      0.80      0.74        44
           1       0.82      0.73      0.77        56

    accuracy      