In [1]:
# from gensim.models import KeyedVectors
# glove_input_file = './util/glove.twitter.27B.200d.txt'

# # Load GloVe vectors directly into a KeyedVectors instance
# model = KeyedVectors.load_word2vec_format(glove_input_file, binary=False, no_header=True)

In [2]:
import fasttext
# Load the fastText model from the local binary file; `model` is later passed to
# util.get_nearest_word in the aspect-matching cells.
model = fasttext.load_model('./util/cc.en.300.bin')

In [3]:
# Import package
import pandas as pd
import numpy as np

from tqdm import tqdm

In [4]:
# Load the raw reviews and keep only the columns used further down.
# NOTE(review): the latin1 decoding shows up as mojibake in the preview rows;
# presumably norm.remove_non_ascii cleans this up later - confirm.
df = pd.read_csv("./McDonald_s_Reviews.csv", encoding='latin1')
df = df[['reviewer_id', 'review_time', 'review', 'rating']]
# DataFrame.info() writes its summary itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33396 entries, 0 to 33395
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   reviewer_id  33396 non-null  int64 
 1   review_time  33396 non-null  object
 2   review       33396 non-null  object
 3   rating       33396 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.0+ MB
None


Unnamed: 0,reviewer_id,review_time,review,rating
0,1,3 months ago,Why does it look like someone spit on my food?...,1 star
1,2,5 days ago,It'd McDonalds. It is what it is as far as the...,4 stars
2,3,5 days ago,Made a mobile order got to the speaker and che...,1 star
3,4,a month ago,My mc. Crispy chicken sandwich was ï¿½ï¿½ï¿½ï¿...,5 stars
4,5,2 months ago,"I repeat my order 3 times in the drive thru, a...",1 star


In [5]:
# Work on the first 100 reviews only.
df = df.head(100)

# Normalization

In [6]:
import util.normalization as norm

In [7]:
# Split every review into sentences: one row per sentence, keyed by reviewer_id.
df_st = (
    df[['reviewer_id', 'review']]
    .copy()
    .assign(token_sentence=lambda frame: frame['review'].apply(norm.sentence_tokenize))
    .explode('token_sentence')
)

# Sentence-level cleanup, in order: trim whitespace, remove non-ASCII,
# expand contractions, remove characters (extra positional flag True).
df_st['token_sentence'] = (
    df_st['token_sentence']
    .str.strip()
    .apply(norm.remove_non_ascii)
    .apply(norm.expand_contractions)
    .apply(norm.remove_characters, args=(True,))
)

# Model-ready text: remove enter/tab characters, then lemmatize.
df_st['token_clean'] = (
    df_st['token_sentence']
    .apply(norm.remove_enter_tab)
    .apply(norm.lemmatize_text)
)

In [8]:
# DataFrame.info() prints its own summary and returns None; the former print()
# wrapper only added a stray "None" line.
df_st.info()
df_st.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 546 entries, 0 to 99
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   reviewer_id     546 non-null    int64 
 1   review          546 non-null    object
 2   token_sentence  546 non-null    object
 3   token_clean     546 non-null    object
dtypes: int64(1), object(3)
memory usage: 21.3+ KB
None


Unnamed: 0,reviewer_id,review,token_sentence,token_clean
0,1,Why does it look like someone spit on my food?...,Why does it look like someone spit on my food,why do it look like someone spit on my food
0,1,Why does it look like someone spit on my food?...,I had a normal transaction,i have a normal transaction
0,1,Why does it look like someone spit on my food?...,everyone was chill and polite,everyone be chill and polite
0,1,Why does it look like someone spit on my food?...,but now i dont want to eat this.,but now i dont want to eat this .
0,1,Why does it look like someone spit on my food?...,Im trying not to think about what this milky w...,im try not to think about what this milky whit...


# Determine Aspect with LDA

In [None]:
import util.model as models
import util.utility as util
from util.utility import get_topics, predict_topic
from util.normalization import preprocess_lda

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# One cleaned sentence per exploded row.
texts = df_st['token_clean'].values

# Count vectorizer with an identity tokenizer and no lowercasing.
# NOTE(review): an identity tokenizer normally expects pre-tokenized documents,
# while token_clean holds plain strings (see the df_st preview). Unless
# get_opt_lda_model tokenizes first, every "token" would be a single
# character - TODO confirm.
vectorizer = CountVectorizer(
    tokenizer=lambda document: document,
    lowercase=False,
)

# The helper returns the selected LDA model together with a vectorizer, which
# replaces the one built above.
optimal_model, vectorizer = models.get_opt_lda_model(texts, vectorizer)

In [None]:
# Print the topics
def print_topics(model, vectorizer, num_words=10):
    """Print the highest-weighted vocabulary entries of every topic.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of weights
            per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        num_words: Number of top-weighted entries to show per topic.

    For each topic three lines are printed: a 1-based header, the top words,
    and the matching column indices wrapped in a one-element list.
    """
    topic_weights = model.components_
    vocabulary = vectorizer.get_feature_names_out()
    for topic_number, weights in enumerate(topic_weights, start=1):
        # Reverse slice of the ascending argsort: indices by descending weight.
        top_indices = weights.argsort()[:-num_words - 1:-1]
        print(f"Topic {topic_number}:")
        print([vocabulary[position] for position in top_indices])
        print([top_indices])

# Show the top words (default: 10 per topic) of the fitted LDA model.
print_topics(optimal_model, vectorizer)

In [None]:
# Build the per-topic word collection; it is indexed by topic category further
# down (dict_topics[cat]).
# NOTE(review): 0.25 is presumably a weight threshold inside get_topics - confirm
# against util.utility.
dict_topics = get_topics(optimal_model, vectorizer, 0.25)
print(dict_topics)

In [None]:
# Spot-check the topic predicted for one sentence.
# NOTE(review): this is the token_sentence form (original casing, not lemmatized),
# whereas the bulk prediction below runs on token_clean.
predict_topic("Why does it look like someone spit on my food", 
              optimal_model, vectorizer)

In [None]:
# Spot-check get_nearest_word: compare the sentence with the words of topic 1,
# using threshold 0.4 and get_all=True.
# NOTE(review): assumes key 1 exists in dict_topics.
util.get_nearest_word("Why does it look like someone spit on my food", 
                      list(dict_topics[1]), model,
                      threshold=0.4, get_all=True)

In [None]:
def fun(x, get_all=False, k=None):
    """Return topic words related to sentence ``x``.

    The sentence's topic category comes from ``predict_topic``; the words of
    that topic (``dict_topics[cat]``) are then matched against the sentence
    through ``util.get_nearest_word`` (threshold 0.0, get_all=True).

    Args:
        x: Sentence text.
        get_all: When True, return every word reported by ``get_nearest_word``.
        k: When ``get_all`` is False, number of leading words to keep; a falsy
            value keeps only the first word.

    Returns:
        A set of words; empty when no category is predicted or nothing matches.
    """
    cat = predict_topic(x, optimal_model, vectorizer)
    # NOTE(review): any falsy category (including 0) is treated as "no topic";
    # verify that predict_topic never uses 0 as a valid category.
    if not cat:
        return set()
    topic = dict_topics[cat]
    result = util.get_nearest_word(x, topic, model,
                                   threshold=0.0, get_all=True)
    # Nothing matched: the single-word branch used to raise IndexError on an
    # empty result (and .keys() would fail on None).
    if not result:
        return set()
    # Presumably ordered best match first - confirm in util.get_nearest_word.
    words = list(result.keys())
    if get_all:
        return set(words)
    return set(words[:k]) if k else {words[0]}

# topic_lda: up to three topic words per sentence (fun with get_all=False, k=3).
# cat_topic_lda: the predicted topic category itself.
# Note that predict_topic therefore runs twice per sentence (once inside fun).
df_st['topic_lda'] = df_st['token_clean'].apply(fun, args=(False, 3))
df_st['cat_topic_lda'] = df_st['token_clean'].apply(predict_topic, args=(optimal_model, vectorizer,))

In [None]:
# Inspect the sentence frame with the new topic_lda / cat_topic_lda columns.
df_st

In [None]:
# Number of sentences assigned to each predicted topic category.
df_st['cat_topic_lda'].value_counts()

# Get Aspect

In [None]:
# Register progress_apply on pandas objects, then extract sub-aspects from every
# cleaned sentence with util.get_aspect_rules (shown with a progress bar).
tqdm.pandas()
df_st['sub_aspect'] = df_st['token_clean'].progress_apply(util.get_aspect_rules)

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Gather every LDA topic word and every rule-based sub-aspect into one
# whitespace-separated string for the word cloud.
topic_lda = df_st['topic_lda'].values
sub_aspect = df_st['sub_aspect'].values

text = "".join(
    " " + " ".join(lda_words) + " " + " ".join(rule_words)
    for lda_words, rule_words in zip(topic_lda, sub_aspect)
).strip()

wordcloud = WordCloud(background_color='white').generate(text)

# Render the cloud without axes.
plt.style.use('classic')
plt.figure(figsize=(12, 10))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Top-level aspects that sub-aspects and topic words are mapped onto.
aspects = ["food", "service", "order"]

def fun1(x):
    """Map each element of ``x`` to its nearest top-level aspect.

    Every element is passed to ``util.get_nearest_word`` together with the
    global ``aspects`` and ``model``; falsy results are dropped.

    Returns:
        List of the distinct aspects found.
    """
    nearest = (util.get_nearest_word(element, aspects, model) for element in x)
    return list({match for match in nearest if match})

def fun2(x):
    """Find the nearest top-level aspect for all words of ``x`` taken together.

    The words are joined with single spaces into one string before the lookup
    through ``util.get_nearest_word`` (global ``aspects`` and ``model``).

    Returns:
        A one-element list holding the match, or an empty list when the lookup
        result is falsy.
    """
    joined_words = " ".join(x)
    nearest = util.get_nearest_word(joined_words, aspects, model)
    return [nearest] if nearest else []

tqdm.pandas()

# Per sentence: the aspects matched from each rule-based sub-aspect (fun1) plus
# the aspect matched from the joined LDA topic words (fun2), de-duplicated.
df_st['aspect'] = [
    list(set(fun1(sub_aspects) + fun2(topic_words)))
    for sub_aspects, topic_words in zip(df_st['sub_aspect'], df_st['topic_lda'])
]
df_st

# Sentiment Analysis

In [None]:
import util.model as models

In [None]:
# Sanity-check the pattern lexicon model on one cleaned sentence (positional row 4).
sample = df_st['token_clean'].iloc[4]
print(sample)
models.pattern_lexicon_model(sample)

In [None]:
# Label every cleaned (lemmatized) sentence with the pattern lexicon model.
# get_result later compares these labels against the string 'negative'.
tqdm.pandas()
df_st['pattern_prediction'] = df_st['token_clean'].progress_apply(models.pattern_lexicon_model)

In [None]:
# Label every sentence with the RoBERTa model. It runs on token_sentence, i.e. the
# text before remove_enter_tab and lemmatization, unlike the pattern model above.
tqdm.pandas()
df_st['roberta_prediction'] = df_st['token_sentence'].progress_apply(models.roberta_model)

# Get Result

In [None]:
def get_result(aspect, model):
    """Score every row of the global ``df_st`` for one aspect.

    Args:
        aspect: Aspect label looked up in each row's ``aspect`` list.
        model: Name of the ``df_st`` column holding the sentiment labels to use.

    Returns:
        numpy array with one entry per row: 0 when the row does not mention the
        aspect, -99 when it does and the label is 'negative', otherwise 1.
    """
    def score(row_aspects, label):
        if not row_aspects or aspect not in row_aspects:
            return 0
        # Large negative sentinel: a single negative sentence outweighs the
        # positives once scores are summed per review and thresholded at < 0.
        return -99 if label == 'negative' else 1

    return np.array([
        score(row_aspects, label)
        for row_aspects, label in zip(df_st['aspect'].values, df_st[model].values)
    ])

In [None]:
# Raw column arrays kept as notebook globals; get_result re-reads these columns
# itself, so nothing in this cell depends on them.
get_aspects = df_st['aspect'].values
get_pattern = df_st['pattern_prediction'].values
get_roberta = df_st['roberta_prediction'].values

# One score column per (aspect, sentiment model) pair. The names are collected in
# a list: the former set -> list conversion made the column order depend on
# string hashing, which differs between kernel sessions.
new_fields = []
for aspect in aspects:
    df_st[aspect + "_pattern"] = get_result(aspect, 'pattern_prediction')
    df_st[aspect + "_roberta"] = get_result(aspect, 'roberta_prediction')
    new_fields += [aspect + "_pattern", aspect + "_roberta"]
# Order-preserving de-duplication, matching the uniqueness the set provided.
new_fields = list(dict.fromkeys(new_fields))

# DataFrame.info() prints its own summary and returns None; no print() needed.
df_st.info()
df_st

In [None]:
# Sum the per-sentence scores of every review for each score column.
grouped_df = df_st[['reviewer_id'] + new_fields].groupby('reviewer_id').sum()

# Collapse each sum to a binary label: 0 when the sum is negative, otherwise 1.
# Vectorized replacement for applymap (deprecated in recent pandas) with the same
# results as `0 if x < 0 else 1`.
# NOTE(review): a review that never mentions an aspect sums to 0 and is therefore
# labelled 1 - confirm this is intended.
grouped_df = (~grouped_df.lt(0)).astype('int64').reset_index()

grouped_df

In [None]:
# Attach the per-review aspect labels to the review frame; the left join keeps
# every row of df.
result = df.merge(grouped_df, on='reviewer_id', how='left')

# DataFrame.info() prints its own summary and returns None; no print() needed.
result.info()
result.head()

# Evaluation

In [None]:
# Load the validation labels and drop every row containing a missing value.
df_val = pd.read_csv('validation.csv').dropna()

# DataFrame.info() prints its own summary and returns None; no print() needed.
df_val.info()
df_val.head()

In [None]:
def mapping_sentiment(x):
    """Encode a sentiment label as a binary target.

    Returns 0 for the exact string 'Negative' and 1 for any other value.
    """
    return 0 if x == 'Negative' else 1

def join_aspect(x, y):
    """Combine two binary labels: 1 only when both equal 1, otherwise 0."""
    both_positive = (x == 1) and (y == 1)
    return 1 if both_positive else 0
    
# Encode the validation labels: 'Negative' -> 0, anything else -> 1.
for aspect in ['food', 'service', 'order', 'place']:
    # Keep the cell re-runnable: once a column is numeric it is already encoded,
    # and mapping it again would turn every label into 1.
    if not pd.api.types.is_numeric_dtype(df_val[aspect]):
        df_val[aspect] = df_val[aspect].apply(mapping_sentiment)

# df_val['service'] = [join_aspect(x, y) for x, y in zip(df_val.service, df_val.order)]
# df_val = df_val.drop('order', axis=1)
# DataFrame.info() prints its own summary and returns None; no print() needed.
df_val.info()
df_val.head()

In [None]:
from sklearn.metrics import classification_report, roc_auc_score

print("Prediction pattern")
print("==============================================================")
for aspect in aspects:
    try:
        # NOTE(review): assumes the rows of df_val and result line up one-to-one
        # (df_val had dropna() applied) - TODO confirm, e.g. by joining on reviewer_id.
        actual = df_val[aspect]
        pred = result[aspect + "_pattern"]
        print("ROC AUC Score: ", roc_auc_score(actual, pred))
        print(f"Result {aspect}")
        print(classification_report(actual, pred))
    except Exception as err:
        # Stay best-effort, but report which aspect was skipped and why. The
        # former bare `except` hid every failure and also caught KeyboardInterrupt.
        print(f"Skipping {aspect}: {type(err).__name__}: {err}")

In [None]:
print("Prediction roberta")
print("==============================================================")
for aspect in aspects:
    try:
        # NOTE(review): assumes the rows of df_val and result line up one-to-one
        # (df_val had dropna() applied) - TODO confirm, e.g. by joining on reviewer_id.
        actual = df_val[aspect]
        pred = result[aspect + "_roberta"]
        print("ROC AUC Score: ", roc_auc_score(actual, pred))
        print(f"Result {aspect}")
        print(classification_report(actual, pred))
    except Exception as err:
        # Stay best-effort, but report which aspect was skipped and why. The
        # former bare `except` hid every failure and also caught KeyboardInterrupt.
        print(f"Skipping {aspect}: {type(err).__name__}: {err}")