In [21]:
import fasttext
# Load the fastText model from the local binary file. `model` is later handed
# to util.get_nearest_word inside inverse().
# NOTE(review): the file name suggests 300-dimensional English vectors - confirm.
model = fasttext.load_model('./util/cc.en.300.bin')

In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
import re

from tqdm import tqdm

In [23]:
import util.normalization as norm
import util.model as models
import util.utility as util

# Import Data

In [24]:
# Read the three JSON artefacts stored under ./mcd_result
def _load_json(path):
    """Open the file at `path`, parse it as JSON and return the decoded object."""
    with open(path) as json_file:
        return json.load(json_file)


map_second_keys = _load_json("./mcd_result/map_second_mcd.json")
map_category = _load_json("./mcd_result/map_category_mcd.json")
list_first_cycle = _load_json("./mcd_result/list_first_cycle_mcd.json")

In [25]:
# # Open residu

# df_residu = pd.read_csv("./mcd_result/residu_mcd.csv").fillna("")
# print(df_residu.info())
# df_residu['residu'] = df_residu['residu'].apply(lambda x: str(x).split(', '))

In [26]:
# Load the base review table. The list-valued columns are stored in the CSV as
# delimited strings, so they are split back into Python lists here.
df_base = pd.read_csv("./mcd_result/base_mcd.csv")

df_base['aspect'] = df_base['aspect'].apply(lambda x: x.split(", "))
df_base['token_sentence'] = df_base['token_sentence'].apply(lambda x: x.split(".\n"))
df_base['token_lemma'] = df_base['token_lemma'].apply(lambda x: x.split(".\n"))

# An empty `residu` cell is read back as NaN; str(NaN).split(', ') would yield
# the bogus entry ['nan'], so missing values become an empty list instead.
df_base['residu'] = df_base['residu'].apply(lambda x: [] if pd.isna(x) else str(x).split(', '))

# info() prints its report itself and returns None, so wrapping it in print()
# only adds a stray "None" line to the output.
df_base.info()
df_base.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewer_id     100 non-null    int64 
 1   review          100 non-null    object
 2   summarize       100 non-null    object
 3   aspect          100 non-null    object
 4   token_sentence  100 non-null    object
 5   token_lemma     100 non-null    object
 6   rating          100 non-null    int64 
 7   residu          100 non-null    object
dtypes: int64(2), object(6)
memory usage: 6.4+ KB
None


Unnamed: 0,reviewer_id,review,summarize,aspect,token_sentence,token_lemma,rating,residu
0,1,Why does it look like someone spit on my food?...,Why does it look like someone spit on my food...,"[clear substance, someone spit, everyone, norm...",[Why does it look like someone spit on my food...,"[why do it look like someone spit on my food, ...",1,"[normal transaction, clear substance, everyone..."
1,2,It'd McDonalds. It is what it is as far as the...,The staff at McDonalds are always friendly an...,"[mcdonalds, atmosphere, staff, food]",[The staff at McDonalds are always friendly an...,[the staff at mcdonalds be always friendly and...,4,[atmosphere]
2,3,Made a mobile order got to the speaker and che...,Never got the refund in the app. Made a mobil...,"[line, mobile order, money, refund]","[Never got the refund in the app., Made a mobi...","[never get the refund in the app ., make a mob...",1,"[refund, mobile order, money]"
3,4,My mc. Crispy chicken sandwich was ÃÂ¯ÃÂ¿ÃÂ...,My mc. Crispy chicken sandwich was customer ...,"[service, crispy chicken, customer service, ch...",[Crispy chicken sandwich was customer service...,[crispy chicken sandwich be customer service b...,5,[crispy chicken]
4,5,"I repeat my order 3 times in the drive thru, a...","I repeat my order 3 times in the drive thru, ...","[order, large fry, large meal, close attention...","[I repeat my order 3 times in the drive thru, ...","[i repeat my order 3 time in the drive thru, i...",1,"[double filet, english, close attention]"


In [27]:
# Load the node table (one row per node_id, with its reviewer and codes).
df_nodes = pd.read_csv("./mcd_result/nodes_mcd.csv")
# info() prints its report itself and returns None; print() would add "None".
df_nodes.info()
df_nodes.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   node_id       500 non-null    int64 
 1   reviewer_id   500 non-null    int64 
 2   first_cycle   500 non-null    object
 3   second_cycle  500 non-null    object
 4   category      500 non-null    object
dtypes: int64(2), object(3)
memory usage: 19.7+ KB
None


Unnamed: 0,node_id,reviewer_id,first_cycle,second_cycle,category
0,0,1,normal,situation,place and service
1,1,1,other,drink,food and drink
2,2,2,mcdonalds,place,place and service
3,3,2,other,place,place and service
4,4,2,food,food,food and drink


# Sentiment Analysis

## A. Sentiment Sentence

In [28]:
# One row per lemmatised sentence, keeping the reviewer it belongs to.
df_sentiment = df_base[['reviewer_id', 'token_lemma']].copy().explode('token_lemma')

# Register progress_apply on pandas objects.
tqdm.pandas()

# Classify every sentence and encode the label as +1 / -1. Any label that is
# not a key of the mapping becomes NaN.
df_sentiment['pattern_prediction'] = (
    df_sentiment['token_lemma']
    .progress_apply(models.pattern_lexicon_model)
    .map({'positive': 1, 'negative': -1})
)

# Collapse back to one row per reviewer holding the list of sentence scores.
df_sentiment = df_sentiment.groupby('reviewer_id')['pattern_prediction'].apply(list).reset_index()

# info() prints its report itself and returns None; print() would add "None".
df_sentiment.info()
df_sentiment

100%|██████████████████████████████████████████████████████████████████████████████| 297/297 [00:00<00:00, 1857.77it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   reviewer_id         100 non-null    int64 
 1   pattern_prediction  100 non-null    object
dtypes: int64(1), object(1)
memory usage: 1.7+ KB
None





Unnamed: 0,reviewer_id,pattern_prediction
0,1,"[-1, 1, -1, -1]"
1,2,"[1, 1]"
2,3,"[-1, -1, -1, -1, -1]"
3,4,"[-1, 1]"
4,5,"[-1, 1, -1, -1]"
...,...,...
95,96,"[-1, -1, -1]"
96,97,[-1]
97,98,"[-1, -1, -1]"
98,99,"[1, 1]"


## B. Inverse Coding Process

In [30]:
# Inverse coding process: give every node row its reviewer's aspects,
# lemmatised sentences and residu so the codes can be traced back to text.

df_1 = df_base[['reviewer_id', 'aspect', 'token_lemma', 'residu']].copy()
df_2 = df_nodes[['reviewer_id', 'node_id', 'first_cycle', 'second_cycle']].copy()

df_temp = df_1.merge(df_2, on='reviewer_id', how='inner')
# info() prints its report itself and returns None; print() would add "None".
df_temp.info()
df_temp.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   reviewer_id   500 non-null    int64 
 1   aspect        500 non-null    object
 2   token_lemma   500 non-null    object
 3   residu        500 non-null    object
 4   node_id       500 non-null    int64 
 5   first_cycle   500 non-null    object
 6   second_cycle  500 non-null    object
dtypes: int64(2), object(5)
memory usage: 27.5+ KB
None


Unnamed: 0,reviewer_id,aspect,token_lemma,residu,node_id,first_cycle,second_cycle
0,1,"[clear substance, someone spit, everyone, norm...","[why do it look like someone spit on my food, ...","[normal transaction, clear substance, everyone...",0,normal,situation
1,1,"[clear substance, someone spit, everyone, norm...","[why do it look like someone spit on my food, ...","[normal transaction, clear substance, everyone...",1,other,drink
2,2,"[mcdonalds, atmosphere, staff, food]",[the staff at mcdonalds be always friendly and...,[atmosphere],2,mcdonalds,place
3,2,"[mcdonalds, atmosphere, staff, food]",[the staff at mcdonalds be always friendly and...,[atmosphere],3,other,place
4,2,"[mcdonalds, atmosphere, staff, food]",[the staff at mcdonalds be always friendly and...,[atmosphere],4,food,food


In [None]:
# Count how many merged rows carry each first_cycle value.
df_temp['first_cycle'].value_counts()

In [42]:
import re

def inverse(x):
    """Recover the aspect phrases that a node's code stands for.

    Parameters
    ----------
    x : mapping or pandas.Series
        Must provide 'first_cycle' (str), 'second_cycle' (str),
        'residu' (iterable of str) and 'aspect' (iterable of str).

    Returns
    -------
    list of str
        For first_cycle == 'other': the residu phrases for which
        util.get_nearest_word returns a truthy result against the
        second-cycle code. Otherwise: the aspect phrases that contain the
        first-cycle code as a literal substring.
    """
    first = x['first_cycle']
    second = x['second_cycle']
    residu = x['residu']
    aspect = x['aspect']

    if first == 'other':
        # 'other' carries no keyword of its own, so fall back to the leftover
        # phrases and keep those that get_nearest_word accepts.
        # NOTE(review): presumably an embedding-similarity test at 0.35 -
        # confirm in util.utility.
        return [
            res for res in residu
            if util.get_nearest_word(res, [second], model, threshold=.35)
        ]

    # Escape the code so characters such as '.' or '+' are matched literally
    # instead of being interpreted as regex syntax.
    pattern = re.compile(re.escape(first))
    return [element for element in aspect if pattern.search(element)]

# Try inverse() on the first merged row.
inverse(df_temp.iloc[0])

['normal transaction']

In [None]:
def get_int_index(x):
    """Find which entries of a list of strings mention a given code.

    Parameters
    ----------
    x : sequence
        A pair ``(array, text)`` where ``array`` is an iterable of strings
        and ``text`` is the code to look for.

    Returns
    -------
    list of int
        ``[-99]`` when ``text`` is ``'other'`` (sentinel meaning "no
        position available"); otherwise the positions of the entries of
        ``array`` that contain ``text`` as a literal substring.
    """
    array = x[0]
    text = x[1]

    if text == 'other':
        return [-99]

    # Escape the code so characters such as '.' or '+' are matched literally
    # instead of being interpreted as regex syntax.
    pattern = re.compile(re.escape(text))
    return [idx for idx, element in enumerate(array) if pattern.search(element)]

def get_final_index(x):
    """Placeholder for deriving token_sentence indices from aspects.

    Not implemented yet: the argument is ignored and None is returned.
    """
    return None
    

# No visible earlier cell defines `temp`, so build it from the merged node
# frame. Working on a copy keeps df_temp unchanged when this cell is re-run.
temp = df_temp.copy()

# Pair each node's lemmatised sentences with its first_cycle code, then find
# which sentences mention that code.
temp['temp'] = list(zip(temp['token_lemma'].values, temp['first_cycle'].values))
temp['temp_idx'] = temp['temp'].apply(get_int_index)

# info() prints its report itself and returns None; print() would add "None".
temp.info()
temp.head()

In [None]:
# Attach each reviewer's list of sentence scores to every node row.
df_sentiment = temp.merge(df_sentiment, on='reviewer_id', how='left')
# info() prints its report itself and returns None; print() would add "None".
df_sentiment.info()
df_sentiment.head()

In [None]:
def scoring(x):
    """Add up the sentence scores selected by a list of positions.

    Parameters
    ----------
    x : sequence
        A pair ``(score_array, indices)``: the per-sentence scores and the
        positions to pick from them.

    Returns
    -------
    The sum of the picked scores. A position of -99 (the marker used for
    'other') is treated as neutral and contributes 0.
    """
    sentence_scores, positions = x[0], x[1]
    return sum(
        0 if pos == -99 else sentence_scores[pos]
        for pos in positions
    )

# Bundle each row's sentence scores with the positions to pick from them.
df_sentiment['temp'] = [
    (prediction, idx_list)
    for prediction, idx_list in zip(df_sentiment['pattern_prediction'], df_sentiment['temp_idx'])
]

# Sum the picked scores per node, then binarise: a negative total becomes 0,
# everything else becomes 1.
df_sentiment['sentiment_score'] = (
    df_sentiment['temp']
    .apply(scoring)
    .apply(lambda total: 1 if not total < 0 else 0)
)
df_sentiment = df_sentiment.loc[:, ['node_id', 'sentiment_score']]

df_sentiment

In [None]:
# Distribution of the binary sentiment_score values.
df_sentiment['sentiment_score'].value_counts()

In [None]:
# Keep only the nodes that received a sentiment score.
result = pd.merge(df_nodes, df_sentiment, on='node_id', how='inner')

result

In [None]:
def _sentiment_share_by(frame, key, out_column):
    """Average `sentiment_score` per value of `key`, returned as a flat frame.

    The result has the columns `key` and `out_column`.
    """
    summary = frame.groupby(key).agg({'sentiment_score': lambda x: np.sum(x) / len(x)})
    return summary.reset_index().rename(columns={'sentiment_score': out_column})


# One summary table per grouping level.
first_sentiment = _sentiment_share_by(result, 'first_cycle', 'first_sentiment_score')
second_sentiment = _sentiment_share_by(result, 'second_cycle', 'second_sentiment_score')
overall_sentiment = _sentiment_share_by(result, 'category', 'overall_sentiment_score')

# Attach the three summaries to every node and drop the per-node score.
result = (
    result
    .merge(first_sentiment, on='first_cycle', how='left')
    .merge(second_sentiment, on='second_cycle', how='left')
    .merge(overall_sentiment, on='category', how='left')
    .drop(columns='sentiment_score')
)

# Add Review Value

In [None]:
def _mean_rating_by(frame, key, out_column):
    """Mean `rating` per value of `key`, returned as a flat frame.

    The result has the columns `key` and `out_column`.
    """
    summary = frame.groupby(key).agg({'rating': 'mean'})
    return summary.reset_index().rename(columns={'rating': out_column})


# Bring each reviewer's rating onto the node rows.
df_rating = df_base[['reviewer_id', 'rating']]
result = result.merge(df_rating, on='reviewer_id', how='left')

# One summary table per grouping level.
first_rating = _mean_rating_by(result, 'first_cycle', 'first_rating')
second_rating = _mean_rating_by(result, 'second_cycle', 'second_rating')
overall_rating = _mean_rating_by(result, 'category', 'overall_rating')

# Attach the three summaries to every node and drop the per-node rating.
result = (
    result
    .merge(first_rating, on='first_cycle', how='left')
    .merge(second_rating, on='second_cycle', how='left')
    .merge(overall_rating, on='category', how='left')
    .drop(columns='rating')
)

result

In [None]:
# Persist the final node table; index=False leaves the row index out of the CSV.
result.to_csv("./mcd_result/final_nodes.csv", index=False)