In [648]:
import fasttext
model = fasttext.load_model('./util/cc.en.300.bin')

In [649]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
import re

from tqdm import tqdm

In [650]:
import util.normalization as norm
import util.model as models
import util.utility as util

# Import Data

In [651]:
# Opening JSON file
with open("./mcd_result/map_second_mcd.json") as json_file:
    map_second_keys = json.load(json_file)

with open("./mcd_result/map_category_mcd.json") as json_file:
    map_category = json.load(json_file)

with open("./mcd_result/list_first_cycle_mcd.json") as json_file:
    list_first_cycle = json.load(json_file)

In [652]:
# # Open residu

# df_residu = pd.read_csv("./mcd_result/residu_mcd.csv").fillna("")
# print(df_residu.info())
# df_residu['residu'] = df_residu['residu'].apply(lambda x: str(x).split(', '))

In [653]:
df_base = pd.read_csv("./mcd_result/base_mcd_v1.csv")

df_base['aspect'] = df_base['aspect'].apply(lambda x: x.split(", "))
df_base['token_sentence'] = df_base['token_sentence'].apply(lambda x: x.split(".\n"))
df_base['token_lemma'] = df_base['token_lemma'].apply(lambda x: x.split(".\n"))

df_base['residu'] = df_base['residu'].apply(lambda x: str(x).split(', '))

print(df_base.info())
df_base.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewer_id     100 non-null    int64 
 1   review          100 non-null    object
 2   summarize       100 non-null    object
 3   aspect          100 non-null    object
 4   token_sentence  100 non-null    object
 5   token_lemma     100 non-null    object
 6   rating          100 non-null    int64 
 7   residu          100 non-null    object
dtypes: int64(2), object(6)
memory usage: 6.4+ KB
None


Unnamed: 0,reviewer_id,review,summarize,aspect,token_sentence,token_lemma,rating,residu
0,1,Why does it look like someone spit on my food?...,Why does it look like someone spit on my food...,"[clear substance, someone spit, everyone, norm...",[Why does it look like someone spit on my food...,"[why do it look like someone spit on my food, ...",1,"[normal transaction, clear substance, everyone..."
1,2,It'd McDonalds. It is what it is as far as the...,The staff at McDonalds are always friendly an...,"[mcdonalds, atmosphere, staff, food]",[The staff at McDonalds are always friendly an...,[the staff at mcdonalds be always friendly and...,4,[atmosphere]
2,3,Made a mobile order got to the speaker and che...,Never got the refund in the app. Made a mobil...,"[line, mobile order, money, refund]","[Never got the refund in the app., Made a mobi...","[never get the refund in the app ., make a mob...",1,"[refund, mobile order, money]"
3,4,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,My mc. Crispy chicken sandwich was customer ...,"[service, crispy chicken, customer service, ch...",[Crispy chicken sandwich was customer service...,[crispy chicken sandwich be customer service b...,5,[crispy chicken]
4,5,"I repeat my order 3 times in the drive thru, a...","I repeat my order 3 times in the drive thru, ...","[order, large fry, large meal, close attention...","[I repeat my order 3 times in the drive thru, ...","[i repeat my order 3 time in the drive thru, i...",1,"[double filet, english, close attention]"


In [654]:
df_nodes = pd.read_csv("./mcd_result/nodes_mcd.csv")
print(df_nodes.info())
df_nodes.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   node_id       500 non-null    int64 
 1   reviewer_id   500 non-null    int64 
 2   first_cycle   500 non-null    object
 3   second_cycle  500 non-null    object
 4   category      500 non-null    object
dtypes: int64(2), object(3)
memory usage: 19.7+ KB
None


Unnamed: 0,node_id,reviewer_id,first_cycle,second_cycle,category
0,0,1,normal,situation,place and service
1,1,1,other,drink,food and drink
2,2,2,mcdonalds,place,place and service
3,3,2,other,place,place and service
4,4,2,food,food,food and drink


# Sentiment Analysis

## A. Sentiment Sentence

In [655]:
df_sentiment = df_base[['reviewer_id', 'token_lemma']].copy().explode('token_lemma')

tqdm.pandas()
df_sentiment['pattern_prediction'] = df_sentiment['token_lemma']\
                                    .progress_apply(models.pattern_lexicon_model)\
                                    .map({'positive': 1,'negative': -1})

df_sentiment = df_sentiment.groupby('reviewer_id')['pattern_prediction'].apply(list).reset_index()
print(df_sentiment.info())
df_sentiment

100%|██████████████████████████████████████████████████████████████████████████████| 297/297 [00:00<00:00, 1648.95it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   reviewer_id         100 non-null    int64 
 1   pattern_prediction  100 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.7+ KB
None





Unnamed: 0,reviewer_id,pattern_prediction
0,1,"[-1, 1, -1, -1]"
1,2,"[1, 1]"
2,3,"[-1, -1, -1, -1, -1]"
3,4,"[-1, 1]"
4,5,"[-1, 1, -1, -1]"
...,...,...
95,96,"[-1, -1, -1]"
96,97,[-1]
97,98,"[-1, -1, -1]"
98,99,"[1, 1]"


## B. Inverse Coding Process

In [656]:
# Inverse coding process

df_1 = df_base[['reviewer_id', 'aspect', 'token_lemma', 'residu']].copy()
df_2 = df_nodes[['reviewer_id', 'node_id', 'first_cycle', 'second_cycle', 'category']].copy()

df_temp = df_1.merge(df_2, on='reviewer_id', how='inner')
df_temp = df_temp.merge(df_sentiment, on='reviewer_id', how='left')
print(df_temp.info())
df_temp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   reviewer_id         500 non-null    int64 
 1   aspect              500 non-null    object
 2   token_lemma         500 non-null    object
 3   residu              500 non-null    object
 4   node_id             500 non-null    int64 
 5   first_cycle         500 non-null    object
 6   second_cycle        500 non-null    object
 7   category            500 non-null    object
 8   pattern_prediction  500 non-null    object
dtypes: int64(2), object(7)
memory usage: 35.3+ KB
None


Unnamed: 0,reviewer_id,aspect,token_lemma,residu,node_id,first_cycle,second_cycle,category,pattern_prediction
0,1,"[clear substance, someone spit, everyone, norm...","[why do it look like someone spit on my food, ...","[normal transaction, clear substance, everyone...",0,normal,situation,place and service,"[-1, 1, -1, -1]"
1,1,"[clear substance, someone spit, everyone, norm...","[why do it look like someone spit on my food, ...","[normal transaction, clear substance, everyone...",1,other,drink,food and drink,"[-1, 1, -1, -1]"
2,2,"[mcdonalds, atmosphere, staff, food]",[the staff at mcdonalds be always friendly and...,[atmosphere],2,mcdonalds,place,place and service,"[1, 1]"
3,2,"[mcdonalds, atmosphere, staff, food]",[the staff at mcdonalds be always friendly and...,[atmosphere],3,other,place,place and service,"[1, 1]"
4,2,"[mcdonalds, atmosphere, staff, food]",[the staff at mcdonalds be always friendly and...,[atmosphere],4,food,food,food and drink,"[1, 1]"


In [657]:
def inverse_score(x):
    # Define storage

    keywords = [] # Keywords for pattern search
    keys = [] # Key value for storing
    scoring =  {}
    
    
    # Extract all values
    first = x['first_cycle']
    second = x['second_cycle']
    residu = x['residu']
    aspect = x['aspect']
    tokens = x['token_lemma']
    scores = x['pattern_prediction']
    category = x['category']
    
    
    
    ################### INVERSING PROCESS ##########################################
    # Get the aspect of each code
    if first == 'other':
        for res in residu:
            nearest = util.get_nearest_word(res, [second], model, threshold=.35)
            if nearest:
                keywords.append(res)
                keys.append(second)
                break
    
    else:
        pattern = re.compile(rf"{first}")
        for element in aspect:
            if pattern.search(element):
                keywords.append(element)
                keys.append(first)
                

    # If keys and keywords empty, return None
    if (len(keys) == 0) and (len(keywords) == 0):
        return
    
    ################################# SCORING ##################################################
    # Get the index sentence
    for keyword in keywords:
        pattern = re.compile(rf"{keyword}")
        
        for idx, element in enumerate(tokens):
            if pattern.search(element):
                if keyword not in scoring:
                    scoring[keyword] = scores[idx]
                else:
                    scoring[keyword] = scoring[keyword] + scores[idx]

        if keyword not in scoring:
            # If it's not detected we have to split it and choose the token that contains all element.
            temp_split_result = {}
            for e in keyword.split(' '):
                pattern = re.compile(rf"{e}")
                
                for idx, element in enumerate(tokens):
                    if pattern.search(element):
                        if keyword not in scoring:
                            scoring[keyword] = scores[idx]
                        else:
                            scoring[keyword] = scoring[keyword] + scores[idx]

    # Scoring    
    total_scores = sum(list(scoring.values()))
    tags = '/'.join(keywords)
    
    if total_scores < 0:
        return (second, tags, 'negative')
    else:
        return (second, tags, 'positive')
#     return (second, tags, total_scores)

df_temp['key_score_aspect'] = [inverse_score(x[-1]) for x in df_temp.iterrows()]
# If positive => sentiment 1, if negative => sentiment 0, if empty => sentiment 1 
df_temp['sentiment'] = df_temp['key_score_aspect'].apply(lambda x: 1 if not x else 1 if x[-1] == 'positive' else 0)

# print(inverse_score(df_temp.iloc[495]))
# print(inverse_score(df_temp.iloc[497]))

df_temp

Unnamed: 0,reviewer_id,aspect,token_lemma,residu,node_id,first_cycle,second_cycle,category,pattern_prediction,key_score_aspect,sentiment
0,1,"[clear substance, someone spit, everyone, norm...","[why do it look like someone spit on my food, ...","[normal transaction, clear substance, everyone...",0,normal,situation,place and service,"[-1, 1, -1, -1]","(situation, normal transaction, positive)",1
1,1,"[clear substance, someone spit, everyone, norm...","[why do it look like someone spit on my food, ...","[normal transaction, clear substance, everyone...",1,other,drink,food and drink,"[-1, 1, -1, -1]","(drink, someone spit, negative)",0
2,2,"[mcdonalds, atmosphere, staff, food]",[the staff at mcdonalds be always friendly and...,[atmosphere],2,mcdonalds,place,place and service,"[1, 1]","(place, mcdonalds, positive)",1
3,2,"[mcdonalds, atmosphere, staff, food]",[the staff at mcdonalds be always friendly and...,[atmosphere],3,other,place,place and service,"[1, 1]","(place, atmosphere, positive)",1
4,2,"[mcdonalds, atmosphere, staff, food]",[the staff at mcdonalds be always friendly and...,[atmosphere],4,food,food,food and drink,"[1, 1]","(food, food, positive)",1
...,...,...,...,...,...,...,...,...,...,...,...
495,99,"[ice cream, best fry, allways]",[allways have the best fry and ice cream in th...,[allways],495,cream,food,food and drink,"[1, 1]","(food, ice cream, positive)",1
496,100,"[mcdonalds, people, order, english]","[mcdonalds be great, they really need to hire ...",[english],496,mcdonalds,place,place and service,"[1, -1, 1, -1]","(place, mcdonalds, positive)",1
497,100,"[mcdonalds, people, order, english]","[mcdonalds be great, they really need to hire ...",[english],497,people,situation,place and service,"[1, -1, 1, -1]","(situation, people, positive)",1
498,100,"[mcdonalds, people, order, english]","[mcdonalds be great, they really need to hire ...",[english],498,other,other,other,"[1, -1, 1, -1]",,1


# Sentiment Scoring

In [658]:
result = df_temp[['node_id', 'reviewer_id', 
                  'first_cycle', 'second_cycle', 
                  'category', 'sentiment']].copy()

result

Unnamed: 0,node_id,reviewer_id,first_cycle,second_cycle,category,sentiment
0,0,1,normal,situation,place and service,1
1,1,1,other,drink,food and drink,0
2,2,2,mcdonalds,place,place and service,1
3,3,2,other,place,place and service,1
4,4,2,food,food,food and drink,1
...,...,...,...,...,...,...
495,495,99,cream,food,food and drink,1
496,496,100,mcdonalds,place,place and service,1
497,497,100,people,situation,place and service,1
498,498,100,other,other,other,1


In [659]:
first_sentiment = result.groupby('first_cycle')\
                    .agg({'sentiment': lambda x: round(np.sum(x) / len(x), 3)})\
                    .reset_index()\
                    .rename(columns={'sentiment': 'first_cycle_sentiment_positive_rate'})

second_sentiment = result.groupby('second_cycle')\
                    .agg({'sentiment': lambda x: round(np.sum(x) / len(x), 3)})\
                    .reset_index()\
                    .rename(columns={'sentiment': 'second_cycle_sentiment_positive_rate'})

category_sentiment = result.groupby('category')\
                    .agg({'sentiment': lambda x: round(np.sum(x) / len(x), 3)})\
                    .reset_index()\
                    .rename(columns={'sentiment': 'category_sentiment_positive_rate'})


result = result.merge(first_sentiment, on='first_cycle', how='left')
result = result.merge(second_sentiment, on='second_cycle', how='left')
result = result.merge(category_sentiment, on='category', how='left')
result['overall_sentiment_positive_rate'] = round(np.sum(result['sentiment']) / result.shape[0], 3) * np.ones(result.shape[0])


result = result.drop('sentiment', axis=1)

result

Unnamed: 0,node_id,reviewer_id,first_cycle,second_cycle,category,sentiment,first_cycle_sentiment_positive_rate,second_cycle_sentiment_positive_rate,category_sentiment_positive_rate,overall_sentiment_positive_rate
0,0,1,normal,situation,place and service,1,1.000,0.348,0.385,0.424
1,1,1,other,drink,food and drink,0,0.583,0.389,0.336,0.424
2,2,2,mcdonalds,place,place and service,1,0.500,0.408,0.385,0.424
3,3,2,other,place,place and service,1,0.583,0.408,0.385,0.424
4,4,2,food,food,food and drink,1,0.500,0.319,0.336,0.424
...,...,...,...,...,...,...,...,...,...,...
495,495,99,cream,food,food and drink,1,0.500,0.319,0.336,0.424
496,496,100,mcdonalds,place,place and service,1,0.500,0.408,0.385,0.424
497,497,100,people,situation,place and service,1,0.400,0.348,0.385,0.424
498,498,100,other,other,other,1,0.583,0.653,0.653,0.424


# Add Review Value

In [660]:
df_rating = df_base[['reviewer_id', 'rating']]

result = result.merge(df_rating, on='reviewer_id', how='left')

first_rating = result.groupby('first_cycle')\
                .agg({'rating': 'mean'})\
                .apply(lambda x: round(x, 3))\
                .reset_index()\
                .rename(columns={'rating': 'first_cycle_rating'})
second_rating = result.groupby('second_cycle')\
                .agg({'rating': 'mean'})\
                .apply(lambda x: round(x, 3))\
                .reset_index()\
                .rename(columns={'rating': 'second_cycle_rating'})
category_rating = result.groupby('category')\
                .agg({'rating': 'mean'})\
                .apply(lambda x: round(x, 3))\
                .reset_index()\
                .rename(columns={'rating': 'category_rating'})

result = result.merge(first_rating, on='first_cycle', how='left')
result = result.merge(second_rating, on='second_cycle', how='left')
result = result.merge(overall_rating, on='category', how='left')
result['overall_rating'] = round(np.sum(result['rating']) / result.shape[0], 3) * np.ones(result.shape[0])

result = result.drop('rating', axis=1)

result

Unnamed: 0,node_id,reviewer_id,first_cycle,second_cycle,category,sentiment,first_cycle_sentiment_positive_rate,second_cycle_sentiment_positive_rate,category_sentiment_positive_rate,overall_sentiment_positive_rate,first_cycle_rating,secpnd_cycle_rating,overall_rating
0,0,1,normal,situation,place and service,1,1.000,0.348,0.385,0.424,1.667,1.565,1.940
1,1,1,other,drink,food and drink,0,0.583,0.389,0.336,0.424,2.139,1.833,2.007
2,2,2,mcdonalds,place,place and service,1,0.500,0.408,0.385,0.424,1.750,2.014,1.940
3,3,2,other,place,place and service,1,0.583,0.408,0.385,0.424,2.139,2.014,1.940
4,4,2,food,food,food and drink,1,0.500,0.319,0.336,0.424,2.700,2.062,2.007
...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,495,99,cream,food,food and drink,1,0.500,0.319,0.336,0.424,2.750,2.062,2.007
496,496,100,mcdonalds,place,place and service,1,0.500,0.408,0.385,0.424,1.750,2.014,1.940
497,497,100,people,situation,place and service,1,0.400,0.348,0.385,0.424,3.000,1.565,1.940
498,498,100,other,other,other,1,0.583,0.653,0.653,0.424,2.139,1.944,1.944


In [661]:
# Update base

df_temp['key_score_aspect'] = df_temp['key_score_aspect'].apply(lambda x: ", ".join(list(x)) if x else '')
df_int = df_temp[['reviewer_id', 'key_score_aspect']]\
            .groupby('reviewer_id')\
            .agg({'key_score_aspect': lambda x: "; ".join(x)})\
            .reset_index()
df_base = df_base.merge(df_int, on='reviewer_id', how='left')

df_base['aspect'] = df_base['aspect'].apply(lambda x: ", ".join(x))
df_base['token_sentence'] = df_base['token_sentence'].apply(lambda x: ", ".join(x))
df_base['token_lemma'] = df_base['token_lemma'].apply(lambda x: ", ".join(x))
df_base['residu'] = df_base['residu'].apply(lambda x: ", ".join(x))

df_base.head()

Unnamed: 0,reviewer_id,review,summarize,aspect,token_sentence,token_lemma,rating,residu,key_score_aspect
0,1,Why does it look like someone spit on my food?...,Why does it look like someone spit on my food...,"clear substance, someone spit, everyone, norma...","Why does it look like someone spit on my food,...","why do it look like someone spit on my food, i...",1,"normal transaction, clear substance, everyone,...","situation, normal transaction, positive; drink..."
1,2,It'd McDonalds. It is what it is as far as the...,The staff at McDonalds are always friendly an...,"mcdonalds, atmosphere, staff, food",The staff at McDonalds are always friendly and...,the staff at mcdonalds be always friendly and ...,4,atmosphere,"place, mcdonalds, positive; place, atmosphere,..."
2,3,Made a mobile order got to the speaker and che...,Never got the refund in the app. Made a mobil...,"line, mobile order, money, refund","Never got the refund in the app., Made a mobil...","never get the refund in the app ., make a mobi...",1,"refund, mobile order, money","situation, line, negative; service, mobile ord..."
3,4,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,My mc. Crispy chicken sandwich was customer ...,"service, crispy chicken, customer service, chi...",Crispy chicken sandwich was customer service ...,crispy chicken sandwich be customer service be...,5,crispy chicken,"service, service/customer service, positive; ;..."
4,5,"I repeat my order 3 times in the drive thru, a...","I repeat my order 3 times in the drive thru, ...","order, large fry, large meal, close attention,...","I repeat my order 3 times in the drive thru, i...","i repeat my order 3 time in the drive thru, it...",1,"double filet, english, close attention","place, close attention, negative; food, order,..."


In [662]:
df_int

Unnamed: 0,reviewer_id,key_score_aspect
0,1,"situation, normal transaction, positive; drink..."
1,2,"place, mcdonalds, positive; place, atmosphere,..."
2,3,"situation, line, negative; service, mobile ord..."
3,4,"service, service/customer service, positive; ;..."
4,5,"place, close attention, negative; food, order,..."
...,...,...
95,96,"service, time management, negative; service, s..."
96,97,"situation, mess, negative; ; food, order food,..."
97,98,"service, rude service, negative; food, order w..."
98,99,"drink, ice cream, positive; ; food, best fry, ..."


In [663]:
result.to_csv("./mcd_result/final_nodes.csv", index=False)
df_base.to_csv("./mcd_result/base_mcd_v2.csv", index=False)