In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json

from tqdm import tqdm

In [2]:
import nltk
from nltk.corpus import stopwords

import spacy
import gensim

# Load Model

In [3]:
# Load nlp model
nlp = spacy.load('en_core_web_sm')

# Load Dataset

In [4]:
with open("meta-data.json", "r") as file:
    data = json.load(file)

# data

In [5]:
df = pd.DataFrame.from_dict(data, 'index')

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   reviewer_id       100 non-null    int64 
 1   review_time       100 non-null    object
 2   rating            100 non-null    int64 
 3   review_processed  100 non-null    object
 4   aspect_sentiment  100 non-null    object
 5   sentiment         100 non-null    object
dtypes: int64(2), object(4)
memory usage: 5.5+ KB
None


Unnamed: 0,reviewer_id,review_time,rating,review_processed,aspect_sentiment,sentiment
0,1,2024-09-04,1,"I had a normal transaction, everyone was calm ...","[{'term': 'food', 'class': 'negative', 'probab...",negative
1,2,2024-11-29,4,"The staff at McDonald's are friendly, accommod...","[{'term': 'fast food', 'class': 'positive', 'p...",positive
2,3,2024-11-29,1,I made a mobile order got to the speaker and c...,"[{'term': 'speaker', 'class': 'neutral', 'prob...",negative
3,4,2024-11-04,5,"Crispy chicken sandwich was delicious, and cus...","[{'term': 'sandwich', 'class': 'positive', 'pr...",positive
4,5,2024-10-04,1,I repeat my order three times in the drive thr...,"[{'term': 'fries', 'class': 'negative', 'proba...",negative


# Modeling

In [6]:
import os

# Load Bing Liu's opinion word dictionary
bing_liu_opinion_words = set()  # Add the actual list of opinion words here

# Function to load opinion words from Bing Liu lexicon
def load_opinion_words(filepath):
    global bing_liu_opinion_words
    temp = pd.read_table(filepath, comment=';', header=None)[0].to_list()
    bing_liu_opinion_words = bing_liu_opinion_words.union(set(temp))


# Load opinion words
current_dir = os.getcwd()
load_opinion_words(os.path.join(current_dir, 'util/opinion-lexicon-English/negative-words.txt'))
load_opinion_words(os.path.join(current_dir, 'util/opinion-lexicon-English/positive-words.txt'))

In [7]:
corpus = df['review_processed'].values

In [8]:
# Define the list of stopwords
stop_words = set(stopwords.words('english'))

In [9]:
# Preprocessing text
def preprocessing(text):

    # Get token of words
    doc = nlp(text)
    result = []
    for token in doc:
        t = token.lemma_.lower()

        if re.match(r'^[0-9\W]+$', t) or len(t) < 3 or t in stop_words:
            continue
        # If the token is adjective, noun, propn, or verb
        if token.pos_ in ['NOUN', 'PROPN', 'VERB']:
            result.append(t)
        # elif token.pos_ in ['ADJ', 'VERB']:
        #     result.append(t)
        # If the token is ADJ but not sentiment opinion
        elif token.pos_ in ['ADJ'] and t not in bing_liu_opinion_words:
            result.append(t)
        else:
            continue
        # result.append(t)
    return result

# Create texts
texts = [preprocessing(document) for document in corpus]

# Create dictionary
dictionary = gensim.corpora.Dictionary(texts)


# Convert documents into Bag-of-words format
corpus_bow = [dictionary.doc2bow(text) for text in texts]

# Train the TF-IDF model
tfidf_model = gensim.models.TfidfModel(corpus_bow)

# Get corpus tfidf 
corpus_tfidf = tfidf_model[corpus_bow]

In [10]:
def topic_model_coherence_generator(corpus, texts, dictionary,
                                    start_topic_count=2, end_topic_count=10,
                                    step=1, cpus=1):
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count+1, step)):
        lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary,
                                           chunksize=1740, alpha='auto',
                                           eta='auto', random_state=42,
                                           iterations=500, num_topics=topic_nums,
                                           passes=20, eval_every=None)

        cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model,
                                                                     corpus=corpus,
                                                                     texts=texts,
                                                                     dictionary=dictionary,
                                                                     coherence='c_v')
        coherence_score = cv_coherence_model_lda.get_coherence()
        coherence_scores.append(coherence_score)
        models.append(lda_model)


    return models, coherence_scores

models, coherence_scores = topic_model_coherence_generator(corpus=corpus_tfidf,
                                                           texts=texts,
                                                           dictionary=dictionary)
opt_model = models[np.argmax(coherence_scores)]

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [01:01<00:00,  6.83s/it]


In [11]:
opt_model = models[2]

In [12]:
# Calculate overall mean coherence score
topics_coherences = opt_model.top_topics(corpus_tfidf, topn=20)

In [13]:
coherence_scores

[0.3436506829892641,
 0.36712392261298205,
 0.45411010897828186,
 0.48334003071613446,
 0.5066616716213542,
 0.4914349167897502,
 0.514811684545422,
 0.4861680718070238,
 0.47716146344109917]

# Result

In [14]:
# Visualize result: Topic with weights

topics_with_wts = [item[0] for item in topics_coherences]
print("LDA Topics with Weights")
print('='*50)
for idx, topic in enumerate(topics_with_wts):
  print(f'Topic {idx + 1}:')
  print([(term, round(wt, 3)) for wt, term in topic])
  print()

LDA Topics with Weights
Topic 1:
[('order', 0.012), ('take', 0.009), ('wait', 0.007), ('come', 0.006), ('minute', 0.006), ('service', 0.006), ('mcdonald', 0.006), ('food', 0.006), ('customer', 0.006), ('leave', 0.005), ('sauce', 0.005), ('employee', 0.005), ('get', 0.005), ('late', 0.005), ('menu', 0.005), ('night', 0.005), ('people', 0.005), ('exist', 0.005), ('sit', 0.004), ('hour', 0.004)]

Topic 2:
[('get', 0.007), ('location', 0.007), ('customer', 0.006), ('drink', 0.006), ('chicken', 0.006), ('service', 0.006), ('time', 0.006), ('food', 0.006), ('give', 0.005), ('price', 0.005), ('attitude', 0.005), ('order', 0.005), ('answer', 0.005), ('experience', 0.005), ('review', 0.005), ('miss', 0.005), ('large', 0.005), ('letter', 0.004), ('welcome', 0.004), ('staff', 0.004)]

Topic 3:
[('world', 0.007), ('cream', 0.006), ('manager', 0.006), ('ice', 0.006), ('thru', 0.006), ('drive', 0.005), ('taste', 0.005), ('say', 0.005), ('need', 0.005), ('student', 0.005), ('mcdonald', 0.004), ('fry'

In [15]:
topics = [[(term, round(wt, 3))
              for term, wt in opt_model.show_topic(n, topn=20)]
          for n in range(0, opt_model.num_topics)
          ]

topic_df = pd.DataFrame([', '.join([term for term, wt in topic]) for topic in topics],
                       columns=['Term per Topic'],
                       index=[str(t) for t in range(1, opt_model.num_topics+1)])
topic_df

Unnamed: 0,Term per Topic
1,"service, food, meal, line, call, eat, long, wa..."
2,"get, location, customer, drink, chicken, servi..."
3,"world, cream, manager, ice, thru, drive, taste..."
4,"order, take, wait, come, minute, service, mcdo..."


In [16]:
topic_json = topic_df.reset_index().rename({'index': 'topic', 'Term per Topic': 'term'}, axis=1).to_dict('records')

topic_json

[{'topic': '1',
  'term': 'service, food, meal, line, call, eat, long, wait, drive, fry, want, get, sure, smoothie, company, sign, boy, period, staff, hour'},
 {'topic': '2',
  'term': 'get, location, customer, drink, chicken, service, time, food, give, price, attitude, order, answer, experience, review, miss, large, letter, welcome, staff'},
 {'topic': '3',
  'term': 'world, cream, manager, ice, thru, drive, taste, say, need, student, mcdonald, fry, job, staff, chain, time, try, meal, think, work'},
 {'topic': '4',
  'term': 'order, take, wait, come, minute, service, mcdonald, food, customer, leave, sauce, employee, get, late, menu, night, people, exist, sit, hour'}]

In [17]:
# Interpreting result
tm_results = opt_model[corpus_tfidf]

# Corpus Topics
corpus_topics = [sorted(topics, key=lambda record: -record[1])[0]
                 for topics in tm_results]

corpus_topics[:5]

[(0, 0.9230946),
 (1, 0.9264464),
 (1, 0.94004244),
 (3, 0.9069516),
 (2, 0.9376731)]

In [18]:
count = 0
for topics in tm_results:
    count += 1
    temp = [t[-1] for t in topics]
    if count == 2:
        break
print(temp)
sum(temp)

[0.022532022, 0.9264454, 0.021526597, 0.029495927]


0.9999999701976776

In [19]:
corpus_topic_df = pd.DataFrame(index=df['reviewer_id'].values)
corpus_topic_df['topic'] = [item[0]+1 for item in corpus_topics]
corpus_topic_df['probability'] = [round(item[1], 4) for item in corpus_topics]
corpus_topic_df['terms'] = [topic_df.iloc[t[0]]['Term per Topic'] for t in corpus_topics]
# corpus_topic_df['reviewer_id'] = df['reviewer_id'].values

corpus_topic_df.head()

Unnamed: 0,topic,probability,terms
1,1,0.9231,"service, food, meal, line, call, eat, long, wa..."
2,2,0.9264,"get, location, customer, drink, chicken, servi..."
3,2,0.94,"get, location, customer, drink, chicken, servi..."
4,4,0.907,"order, take, wait, come, minute, service, mcdo..."
5,3,0.9377,"world, cream, manager, ice, thru, drive, taste..."


In [20]:
corpus_topic_json = corpus_topic_df.to_dict('index')

# corpus_topic_json

In [21]:
df['topic'] = df['reviewer_id'].apply(lambda x: corpus_topic_json[x])

df.head()

Unnamed: 0,reviewer_id,review_time,rating,review_processed,aspect_sentiment,sentiment,topic
0,1,2024-09-04,1,"I had a normal transaction, everyone was calm ...","[{'term': 'food', 'class': 'negative', 'probab...",negative,"{'topic': 1, 'probability': 0.9230999946594238..."
1,2,2024-11-29,4,"The staff at McDonald's are friendly, accommod...","[{'term': 'fast food', 'class': 'positive', 'p...",positive,"{'topic': 2, 'probability': 0.9264000058174133..."
2,3,2024-11-29,1,I made a mobile order got to the speaker and c...,"[{'term': 'speaker', 'class': 'neutral', 'prob...",negative,"{'topic': 2, 'probability': 0.9399999976158142..."
3,4,2024-11-04,5,"Crispy chicken sandwich was delicious, and cus...","[{'term': 'sandwich', 'class': 'positive', 'pr...",positive,"{'topic': 4, 'probability': 0.9070000052452087..."
4,5,2024-10-04,1,I repeat my order three times in the drive thr...,"[{'term': 'fries', 'class': 'negative', 'proba...",negative,"{'topic': 3, 'probability': 0.9376999735832214..."


In [22]:
result = df.to_dict('index')
# result

# Save Result

In [23]:
# with open("meta-topic.json", "w") as file:
#     json.dump(corpus_topic_json, file, indent=4)

In [24]:
# with open("map-topic.json",  "w") as file:
#     json.dump(topic_json, file, indent=4)