In [666]:
import fasttext
# Load the fastText binary model shipped under ./util. The resulting `model`
# object is handed to util.get_nearest_word inside inverse_score below.
# NOTE(review): the file name suggests 300-dimensional English vectors - confirm.
model = fasttext.load_model('./util/cc.en.300.bin')

In [667]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
import re

from tqdm import tqdm

In [668]:
import util.normalization as norm
import util.model as models
import util.utility as util

# Import Data

In [669]:
# Load the three JSON artifacts stored under ./mcd_result.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the result
# does not depend on the platform's locale default used by open().
with open("./mcd_result/map_second_mcd.json", encoding="utf-8") as json_file:
    map_second_keys = json.load(json_file)

with open("./mcd_result/map_category_mcd.json", encoding="utf-8") as json_file:
    map_category = json.load(json_file)

with open("./mcd_result/list_first_cycle_mcd.json", encoding="utf-8") as json_file:
    list_first_cycle = json.load(json_file)

In [670]:
# # Open residu

# df_residu = pd.read_csv("./mcd_result/residu_mcd.csv").fillna("")
# print(df_residu.info())
# df_residu['residu'] = df_residu['residu'].apply(lambda x: str(x).split(', '))

In [671]:
# Load the base review table and turn its delimited text columns back into lists.
df_base = pd.read_csv("./mcd_result/base_mcd_v1.csv")

# 'aspect' is stored as a ", "-separated string.
df_base['aspect'] = df_base['aspect'].apply(lambda x: x.split(", "))
# The two token columns are stored with ".\n" between entries.
df_base['token_sentence'] = df_base['token_sentence'].apply(lambda x: x.split(".\n"))
df_base['token_lemma'] = df_base['token_lemma'].apply(lambda x: x.split(".\n"))

# A missing 'residu' is read as NaN. It used to be stringified into the bogus
# token list ['nan'], which inverse_score would then treat as a real word;
# it is now an empty list.
df_base['residu'] = df_base['residu'].apply(lambda x: [] if pd.isna(x) else str(x).split(', '))

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
df_base.info()
df_base.head()

FileNotFoundError: [Errno 2] No such file or directory: './mcd_result/base_mcd_v1.csv'

In [None]:
# Load the node table (one row per coded node; joined to df_base on reviewer_id below).
df_nodes = pd.read_csv("./mcd_result/nodes_mcd.csv")
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
df_nodes.info()
df_nodes.head()

# Sentiment Analysis

## A. Sentence-Level Sentiment

In [None]:
# Score every entry of 'token_lemma' on its own, then regroup the scores into
# one list per reviewer_id (same order as the exploded entries).
df_sentiment = df_base[['reviewer_id', 'token_lemma']].copy().explode('token_lemma')

tqdm.pandas()  # registers Series.progress_apply
# Series.map with a dict turns every label other than 'positive'/'negative' into NaN.
df_sentiment['pattern_prediction'] = (
    df_sentiment['token_lemma']
    .progress_apply(models.pattern_lexicon_model)
    .map({'positive': 1, 'negative': -1})
)

df_sentiment = df_sentiment.groupby('reviewer_id')['pattern_prediction'].apply(list).reset_index()
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
df_sentiment.info()
df_sentiment

## B. Inverse Coding Process

In [None]:
# Inverse coding process: build one working row per node that carries the
# review's text columns and its per-sentence sentiment scores.

df_1 = df_base[['reviewer_id', 'aspect', 'token_lemma', 'residu']].copy()
df_2 = df_nodes[['reviewer_id', 'node_id', 'first_cycle', 'second_cycle', 'category']].copy()

# Inner join: only reviewers present in both tables survive.
df_temp = df_1.merge(df_2, on='reviewer_id', how='inner')
# Left join: a reviewer without sentiment scores keeps its rows, with NaN in 'pattern_prediction'.
df_temp = df_temp.merge(df_sentiment, on='reviewer_id', how='left')
# DataFrame.info() prints its report itself and returns None, so it is not wrapped in print().
df_temp.info()
df_temp.head()

In [None]:
def inverse_score(x, threshold=0.35):
    """Trace one coded node back to the review text and label its sentiment.

    Relies on the notebook-level names ``util`` and ``model`` when
    ``x['first_cycle']`` is ``'other'``.

    Parameters
    ----------
    x : mapping (for example a DataFrame row)
        Must provide 'first_cycle', 'second_cycle', 'residu', 'aspect',
        'token_lemma' and 'pattern_prediction'. 'residu', 'aspect' and
        'token_lemma' are iterables of strings; 'pattern_prediction' holds one
        numeric score per entry of 'token_lemma'.
    threshold : float, default 0.35
        Forwarded to ``util.get_nearest_word`` for the 'other' branch.

    Returns
    -------
    tuple or None
        ``(second_cycle, tags, label)``: ``tags`` is the matched keywords
        joined with '/', ``label`` is 'negative' when the summed scores are
        below zero and 'positive' otherwise (also when nothing was scored).
        ``None`` when no keyword was found.
    """
    first = x['first_cycle']
    second = x['second_cycle']
    residu = x['residu']
    aspect = x['aspect']
    tokens = x['token_lemma']
    scores = x['pattern_prediction']

    # A left-merge miss leaves a bare NaN here instead of a list of scores.
    if not hasattr(scores, '__len__'):
        scores = []

    def polarity(idx):
        """Score of sentence ``idx``; a missing or NaN score counts as 0."""
        if idx >= len(scores):
            return 0
        value = scores[idx]
        return 0 if pd.isna(value) else value

    # ---------------- Inversing: find the text the code came from ----------------
    keywords = []
    if first == 'other':
        # Take the first residual word that get_nearest_word accepts for `second`.
        for res in residu:
            if util.get_nearest_word(res, [second], model, threshold=threshold):
                keywords.append(res)
                break
    else:
        # Literal substring test. The code label used to be compiled as a raw
        # regex, so labels containing characters such as "(" or "+" either
        # raised re.error or failed to match themselves.
        needle = str(first)
        keywords.extend(element for element in aspect if needle in element)

    if not keywords:
        return None

    # ---------------- Scoring: sum the scores of matching sentences --------------
    scoring = {}
    for keyword in keywords:
        hits = [idx for idx, sentence in enumerate(tokens) if keyword in sentence]

        if not hits and keyword not in scoring:
            # The whole phrase appears in no sentence: fall back to its single
            # words. A sentence is counted once per word it contains. Empty
            # pieces (from repeated spaces) are skipped because they would
            # match every sentence.
            pieces = [piece for piece in keyword.split(' ') if piece]
            hits = [
                idx
                for piece in pieces
                for idx, sentence in enumerate(tokens)
                if piece in sentence
            ]

        if hits:
            scoring[keyword] = scoring.get(keyword, 0) + sum(polarity(idx) for idx in hits)

    total_scores = sum(scoring.values())
    tags = '/'.join(keywords)

    label = 'negative' if total_scores < 0 else 'positive'
    return (second, tags, label)

# Run the inverse scoring once per row of df_temp.
df_temp['key_score_aspect'] = [inverse_score(row) for _, row in df_temp.iterrows()]

# Binary flag: 1 when there is no result or the label is 'positive', 0 otherwise.
df_temp['sentiment'] = df_temp['key_score_aspect'].apply(
    lambda outcome: int(not outcome or outcome[-1] == 'positive')
)

df_temp

# Sentiment Scoring

In [None]:
# Keep the node identifiers, the three coding levels and the sentiment flag.
result = df_temp.loc[
    :,
    ['node_id', 'reviewer_id', 'first_cycle', 'second_cycle', 'category', 'sentiment'],
].copy()

result

In [None]:
def _positive_rate(frame, by, out_name):
    """Per `by` value: sum of 'sentiment' divided by the group size, rounded to 3 decimals, in column `out_name`."""
    return (
        frame.groupby(by)
        .agg({'sentiment': lambda x: round(np.sum(x) / len(x), 3)})
        .reset_index()
        .rename(columns={'sentiment': out_name})
    )


# All three summaries are computed from `result` before anything is merged into it.
first_sentiment = _positive_rate(result, 'first_cycle', 'first_cycle_sentiment_positive_rate')
second_sentiment = _positive_rate(result, 'second_cycle', 'second_cycle_sentiment_positive_rate')
category_sentiment = _positive_rate(result, 'category', 'category_sentiment_positive_rate')

result = (
    result
    .merge(first_sentiment, on='first_cycle', how='left')
    .merge(second_sentiment, on='second_cycle', how='left')
    .merge(category_sentiment, on='category', how='left')
)
# Same rate over every row, repeated on each row.
result['overall_sentiment_positive_rate'] = (
    round(np.sum(result['sentiment']) / len(result), 3) * np.ones(len(result))
)

# The per-row flag is no longer needed once the rates are attached.
result = result.drop(columns='sentiment')

result

# Add Review Rating

In [None]:
# Attach each review's rating to its nodes, then add the mean rating per
# first-cycle code, second-cycle code and category, plus the overall mean.
df_rating = df_base[['reviewer_id', 'rating']]

result = result.merge(df_rating, on='reviewer_id', how='left')


def _mean_rating(frame, by, out_name):
    """Per `by` value: mean of 'rating' rounded to 3 decimals, in column `out_name`."""
    return (
        frame.groupby(by)
        .agg({'rating': 'mean'})
        .round(3)
        .reset_index()
        .rename(columns={'rating': out_name})
    )


first_rating = _mean_rating(result, 'first_cycle', 'first_cycle_rating')
second_rating = _mean_rating(result, 'second_cycle', 'second_cycle_rating')
category_rating = _mean_rating(result, 'category', 'category_rating')

result = result.merge(first_rating, on='first_cycle', how='left')
result = result.merge(second_rating, on='second_cycle', how='left')
# Fixed: this merge referenced the undefined name `overall_rating` (NameError);
# the per-category table is `category_rating`.
result = result.merge(category_rating, on='category', how='left')
# Series.mean() skips missing ratings, like the group means above. The previous
# sum / row-count form divided by rows that had no rating as well.
result['overall_rating'] = round(result['rating'].mean(), 3)

# The per-row rating is no longer needed once the means are attached.
result = result.drop('rating', axis=1)

result

In [None]:
# Update base: attach the per-review scoring summary to df_base and flatten the
# list columns back to text so the table can be written to CSV.

def _join_if_list(value, sep=", "):
    """Join a list into one string; return any other value unchanged."""
    return sep.join(value) if isinstance(value, list) else value


# Flatten each result tuple to "a, b, c"; a missing result becomes ''.
# Strings pass through untouched, so re-running this cell no longer re-joins an
# already flattened value character by character. map(str, ...) keeps a
# non-string member (e.g. a NaN code) from breaking the join.
df_temp['key_score_aspect'] = df_temp['key_score_aspect'].apply(
    lambda x: x if isinstance(x, str) else ", ".join(map(str, x)) if x else ''
)
df_int = (
    df_temp[['reviewer_id', 'key_score_aspect']]
    .groupby('reviewer_id')
    .agg({'key_score_aspect': lambda x: "; ".join(x)})
    .reset_index()
)
# Drop a 'key_score_aspect' column left by an earlier run of this cell so the
# merge does not create key_score_aspect_x / _y duplicates.
df_base = df_base.loc[:, df_base.columns != 'key_score_aspect'].merge(df_int, on='reviewer_id', how='left')

# NOTE(review): the token columns were split on ".\n" when loaded but are joined
# with ", " here, so the saved file uses a different delimiter - confirm intended.
df_base['aspect'] = df_base['aspect'].apply(_join_if_list)
df_base['token_sentence'] = df_base['token_sentence'].apply(_join_if_list)
df_base['token_lemma'] = df_base['token_lemma'].apply(_join_if_list)
df_base['residu'] = df_base['residu'].apply(_join_if_list)

df_base.head()

In [None]:
# Display the per-reviewer table built in the previous cell
# (one row per reviewer_id, results joined with "; ").
df_int

In [None]:
# Write the node-level summary and the updated base table to ./mcd_result.
# index=False keeps the DataFrame row index out of the files.
result.to_csv("./mcd_result/final_nodes.csv", index=False)
df_base.to_csv("./mcd_result/base_mcd_v2.csv", index=False)