In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm import tqdm

In [6]:
import util.normalization as norm
import util.model as models
import util.utility as util

FileNotFoundError: [WinError 2] The system cannot find the file specified

In [None]:
# Load the raw review export and keep only the columns used in this notebook.
df = pd.read_csv('./sample-mcd.csv', encoding='latin1')
df = df[['reviewer_id', 'review_time', 'review', 'rating']]

# The rating column holds text; its first space-separated token is parsed as
# the integer score.
df['rating'] = df['rating'].apply(lambda x: int(x.split(" ")[0]))

# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
df.info()
df.head()

# Normalization

In [None]:
# Independent working copy holding just the reviewer id and the raw review text.
df_sm = df.loc[:, ['reviewer_id', 'review']].copy()

# Cleaning pipeline, applied to every review in this order:
#   1. norm.remove_non_ascii
#   2. norm.expand_contractions
#   3. norm.remove_characters (called with True as its extra positional argument)
df_sm['review_clean'] = (
    df_sm['review']
    .apply(norm.remove_non_ascii)
    .apply(norm.expand_contractions)
    .apply(norm.remove_characters, args=(True,))
)

In [None]:
df_sm.head()

In [None]:
import torch
from transformers import pipeline

# Build the summarization pipeline. The device index used to be hard-coded to 0
# (the first CUDA device), which makes pipeline construction fail on machines
# without a usable GPU; fall back to the CPU (-1) in that case.
classifier = pipeline(
    "summarization",
    device=0 if torch.cuda.is_available() else -1,
)

In [None]:
def get_summarize(text, min_length=25, max_length=68):
    """Summarize ``text`` with the module-level ``classifier`` pipeline.

    Args:
        text: Input handed to the summarization pipeline.
        min_length: Forwarded to the pipeline as ``min_length``.
        max_length: Forwarded to the pipeline as ``max_length``.

    Returns:
        The ``'summary_text'`` entry of the first result the pipeline returns.
    """
    # truncation=True asks the pipeline's tokenizer to cut inputs that exceed
    # the model's maximum input size, instead of passing them through
    # untruncated and failing inside the model.
    result = classifier(
        text,
        min_length=min_length,
        max_length=max_length,
        truncation=True,
    )
    return result[0]['summary_text']

In [None]:
# Register tqdm's progress_apply on pandas objects, then summarize each cleaned
# review (one pipeline call per row).
tqdm.pandas()
df_sm['summarize'] = df_sm['review_clean'].progress_apply(get_summarize)

# info() prints its own report and returns None; print(...) around it only
# added a stray "None" line.
df_sm.info()
df_sm.head()

In [None]:
# Split every summary into sentences and give each sentence its own row.
df_st = (
    df_sm
    .assign(token_sentence=df_sm['summarize'].apply(norm.sentence_tokenize))
    .explode('token_sentence')
)

# Trim surrounding whitespace, then run the same character-removal helper
# (with True as its extra positional argument) on each sentence.
df_st['token_sentence'] = (
    df_st['token_sentence']
    .str.strip()
    .apply(norm.remove_characters, args=(True,))
)

# Lower-casing is intentionally left disabled:
# df_st['token_sentence'] = df_st['token_sentence'].apply(lambda x: x.lower())

# Lemmatized form of each cleaned sentence.
df_st['token_lemma'] = df_st['token_sentence'].apply(norm.lemmatize_text)

# Get Aspects and Apply Sentiment Analysis (Rule-Based)

In [None]:
# Extract aspects from each lemmatized sentence with util.get_aspect_rules,
# showing a progress bar while it runs.
tqdm.pandas()
df_st['aspect'] = df_st['token_lemma'].progress_apply(util.get_aspect_rules)

# info() prints its own report and returns None; print(...) around it only
# added a stray "None" line.
df_st.info()
df_st.head()

In [None]:
# Display the sentence-level frame, which now includes the 'aspect' column.
df_st

In [None]:
# Boolean array marking the sentence rows whose aspect collection is non-empty.
mask = (df_st['aspect'].apply(len) > 0).values

# Per reviewer: join the aspect-bearing sentences (and their lemmas) with '.\n'.
temp = (
    df_st[mask]
    .groupby(['reviewer_id'])
    .agg({'token_sentence': lambda x: '.\n'.join(x),
          'token_lemma': lambda x: '.\n'.join(x)})
    .reset_index()
)

# Per reviewer: union of the aspects found in any of their sentences.
df_prc = (
    df_st
    .groupby(['reviewer_id'])
    .agg({'aspect': lambda x: set().union(*x)})
    .reset_index()
)

# Reviewers with no aspect-bearing sentence have no row in `temp`; their joined
# text columns become empty strings.
df_prc = df_prc.merge(temp, on='reviewer_id', how='left').fillna("")

# Attach the per-reviewer aggregates back onto the review-level frame.
df_prc = df_sm.merge(df_prc, on='reviewer_id', how='left')

# info() prints its own report and returns None; print(...) around it only
# added a stray "None" line.
df_prc.info()
df_prc.head()

In [None]:
# Joined aspect-bearing sentences stored for the first row of df_prc.
df_prc['token_sentence'].iloc[0]

In [None]:
# Summary text stored for the first row of df_prc.
df_prc['summarize'].iloc[0]

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Flatten every row's aspect collection into one space-separated string.
# str.join builds the text in a single pass; the previous loop re-copied the
# growing string on each `text = text + ...` iteration.
sub_aspect = df_prc['aspect'].values
text = " ".join(" ".join(aspects) for aspects in sub_aspect).strip()

# Render the aspect frequencies as a word cloud on a white background.
wordcloud = WordCloud(background_color='white').generate(text)
plt.style.use('classic')
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# Get Topic

In [None]:
# def get_optimum_lda(dictionary, corpus, texts, limit,
#                     start=2, step=1, get_result=False,
#                     iterations=20, passes=1):
#     coherence_values = []
    
#     for n in range(start, limit, step):
#         lda = gensim.models.LdaMulticore(corpus=corpus,
#                                          num_topics=n,
#                                          id2word=dictionary,
#                                          iterations=iterations,
#                                          passes=passes)
        
#         # Create coherence
#         coherence_model = CoherenceModel(model=lda, 
#                                          texts=texts,
#                                          dictionary=dictionary, 
#                                          coherence='c_v')
#         coherence_values.append(coherence_model.get_coherence())
    
    
#     opt_num_topics = start + coherence_values.index(max(coherence_values))
    
#     lda_opt = gensim.models.LdaMulticore(corpus=corpus,
#                                          num_topics=opt_num_topics,
#                                          id2word=dictionary)
    
#     if get_result:
#         print(coherence_values)
    
#     return lda_opt



In [None]:
# import gensim
# from gensim.models import CoherenceModel


# texts = df_prc['summarize'].apply(util.preprocess_lda)
# # texts = df_prc['sub_aspect'].apply(lambda x: ', '.join(list(x))).apply(util.preprocess_lda)

# dictionary = gensim.corpora.Dictionary(texts)

# dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100_000)
# bow_corpus = [dictionary.doc2bow(doc) for doc in texts]

# lda_opt = get_optimum_lda(dictionary, bow_corpus,
#                           texts, 10, get_result=True,
#                           passes=75, iterations=150)

In [None]:
# # Print topic
# for idx, topic in lda_opt.print_topics(-1):
#     print('Topic: {} \nWords: {}'.format(idx, topic))

In [None]:
# def predict_topic(text):
#     text = util.preprocess_lda(text)
#     bow_vector = dictionary.doc2bow(text)
    
#     result = sorted(lda_opt[bow_vector], 
#                     key=lambda x: -1*x[1])[0][0]
    
#     return result
    
# # Extract keywords into a dictionary or list
# threshold = 0.05
# topics_dict = {}
# for topic_num, topic in lda_opt.show_topics(num_topics=10,
#                                                 num_words=10,
#                                                 formatted=False):
#     keywords = [word for word, w in topic if w > threshold]
#     topics_dict[topic_num] = keywords

In [None]:
# topics_dict

In [None]:
# tqdm.pandas()
# df_prc['topic'] = df_prc['summarize'].progress_apply(predict_topic)

In [None]:
# tqdm.pandas()
# df_prc['topic_keys'] = df_prc['topic'].progress_apply(lambda x: topics_dict[x])

In [None]:
# df_prc

# Save Result

In [None]:
# Final table: everything in df_prc except the intermediate cleaned text, with
# the integer rating joined back from the raw frame by reviewer_id.
# result = df_prc[['reviewer_id', 'review', 'summarize', 'aspect', 'topic_sentence', 'rating']].copy()
result = df_prc.drop('review_clean', axis=1).copy()
result = result.merge(df[['reviewer_id', 'rating']],
                      on='reviewer_id',
                      how='left')

# Flatten each aspect set into a comma-separated string. The terms are sorted
# because set iteration order for strings can differ between interpreter runs,
# which would make the exported text non-reproducible.
result['aspect'] = result['aspect'].apply(lambda x: ', '.join(sorted(x)))
# result['sub_aspect'] = result['sub_aspect'].apply(lambda x: list(x))
# result['topic_keys'] = result['topic_keys'].apply(lambda x: ', '.join(x))

# result.to_csv("./mcd_result/base_mcd.csv", index=False)