In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import json

from tqdm import tqdm

In [25]:
import nltk
from nltk.corpus import stopwords

import spacy
import gensim

# Load Model

In [26]:
# Load nlp model
nlp = spacy.load('en_core_web_sm')

# Load Dataset

In [27]:
with open("temp-2.json", "r") as file:
    data = json.load(file)

data

{'0': {'reviewer_id': 1,
  'review_time': '2024-09-04',
  'rating': 1,
  'review_processed': "I had a normal transaction, everyone was calm and polite, but now I don't want to eat this. I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back.",
  'aspect_sentiment': [{'term': 'food',
    'class': 'negative',
    'probability': [0.9815933108329773,
     0.015454968437552452,
     0.002951699076220393],
    'context': ["I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."]},
   {'term': 'substance',
    'class': 'negative',
    'probability': [0.5997273921966553,
     0.004296493716537952,
     0.39597612619400024],
    'context': ["I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."]}],
  'sentiment': 'negative'},
 '1': {'reviewer_id': 2,
  'review_time': '2024-11-29',
  'rating': 

In [28]:
df = pd.DataFrame.from_dict(data, 'index')

print(df.info())
df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   reviewer_id       100 non-null    int64 
 1   review_time       100 non-null    object
 2   rating            100 non-null    int64 
 3   review_processed  100 non-null    object
 4   aspect_sentiment  100 non-null    object
 5   sentiment         100 non-null    object
dtypes: int64(2), object(4)
memory usage: 5.5+ KB
None


Unnamed: 0,reviewer_id,review_time,rating,review_processed,aspect_sentiment,sentiment
0,1,2024-09-04,1,"I had a normal transaction, everyone was calm ...","[{'term': 'food', 'class': 'negative', 'probab...",negative
1,2,2024-11-29,4,"The staff at McDonald's are friendly, accommod...","[{'term': 'fast food', 'class': 'postive', 'pr...",positive
2,3,2024-11-29,1,I made a mobile order got to the speaker and c...,"[{'term': 'speaker', 'class': 'neutral', 'prob...",negative
3,4,2024-11-04,5,"Crispy chicken sandwich was delicious, and cus...","[{'term': 'sandwich', 'class': 'postive', 'pro...",positive
4,5,2024-10-04,1,I repeat my order three times in the drive thr...,"[{'term': 'fries', 'class': 'negative', 'proba...",negative


# Modeling

In [29]:
corpus = df['review_processed'].values

In [30]:
# Define the list of stopwords
stop_words = set(stopwords.words('english'))

In [31]:
# Preprocessing text
def preprocessing(text):

    # Get token of words
    doc = nlp(text)
    result = []
    for token in doc:
        t = token.lemma_.lower()

        if re.match(r'^[0-9\W]+$', t) or len(t) < 3 or t in stop_words:
            continue
        # If the token is adjective, noun, propn, or verb
        if token.pos_ in ['NOUN', 'PROPN']:
            result.append(t)
        elif token.pos_ in ['ADJ', 'VERB']:
            result.append(t)
        else:
            continue
    return result

# Create texts
texts = [preprocessing(document) for document in corpus]

# Create dictionary
dictionary = gensim.corpora.Dictionary(texts)


# Convert documents into Bag-of-words format
corpus_bow = [dictionary.doc2bow(text) for text in texts]

# Train the TF-IDF model
tfidf_model = gensim.models.TfidfModel(corpus_bow)

# Get corpus tfidf 
corpus_tfidf = tfidf_model[corpus_bow]

In [32]:
def topic_model_coherence_generator(corpus, texts, dictionary,
                                    start_topic_count=2, end_topic_count=10,
                                    step=1, cpus=1):
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count+1, step)):
        lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary,
                                           chunksize=1740, alpha='auto',
                                           eta='auto', random_state=42,
                                           iterations=500, num_topics=topic_nums,
                                           passes=20, eval_every=None)

        cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model,
                                                                     corpus=corpus,
                                                                     texts=texts,
                                                                     dictionary=dictionary,
                                                                     coherence='c_v')
        coherence_score = cv_coherence_model_lda.get_coherence()
        coherence_scores.append(coherence_score)
        models.append(lda_model)


    return models, coherence_scores

models, coherence_scores = topic_model_coherence_generator(corpus=corpus_tfidf,
                                                           texts=texts,
                                                           dictionary=dictionary)
opt_model = models[np.argmax(coherence_scores)]

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [01:00<00:00,  6.68s/it]


In [33]:
# opt_model = models[3]

In [34]:
# Calculate overall mean coherence score
topics_coherences = opt_model.top_topics(corpus_tfidf, topn=20)

In [35]:
coherence_scores

[0.3733855034629775,
 0.3974876417762688,
 0.4367221281119681,
 0.4745998024115434,
 0.5003586436412433,
 0.4970216453772654,
 0.4942239209087066,
 0.4672160213893543,
 0.4960869715331455]

# Result

In [36]:
# Visualize result: Topic with weights

topics_with_wts = [item[0] for item in topics_coherences]
print("LDA Topics with Weights")
print('='*50)
for idx, topic in enumerate(topics_with_wts):
  print(f'Topic {idx + 1}:')
  print([(term, round(wt, 3)) for wt, term in topic])
  print()

LDA Topics with Weights
Topic 1:
[('good', 0.01), ('amazing', 0.008), ('food', 0.008), ('ice', 0.008), ('order', 0.007), ('cream', 0.007), ('service', 0.007), ('wrong', 0.006), ('come', 0.005), ('quick', 0.005), ('correct', 0.005), ('get', 0.005), ('taste', 0.005), ('late', 0.005), ('fast', 0.005), ('mcdonald', 0.005), ('great', 0.004), ('door', 0.004), ('right', 0.004), ('line', 0.004)]

Topic 2:
[('food', 0.006), ('time', 0.006), ('letter', 0.006), ('order', 0.006), ('love', 0.006), ('well', 0.005), ('fry', 0.005), ('customer', 0.005), ('difference', 0.004), ('raw', 0.004), ('give', 0.004), ('get', 0.004), ('meal', 0.004), ('experience', 0.004), ('mess', 0.004), ('take', 0.004), ('extra', 0.004), ('call', 0.004), ('cold', 0.004), ('minute', 0.004)]

Topic 3:
[('rude', 0.006), ('nice', 0.005), ('hour', 0.005), ('people', 0.005), ('manager', 0.005), ('exist', 0.005), ('issue', 0.005), ('order', 0.005), ('enjoy', 0.005), ('try', 0.005), ('say', 0.005), ('coffee', 0.004), ('point', 0.004

In [37]:
topics = [[(term, round(wt, 3))
              for term, wt in opt_model.show_topic(n, topn=20)]
          for n in range(0, opt_model.num_topics)
          ]

topic_df = pd.DataFrame([', '.join([term for term, wt in topic]) for topic in topics],
                       columns=['Term per Topic'],
                       index=[str(t) for t in range(1, opt_model.num_topics+1)])
topic_df

Unnamed: 0,Term per Topic
1,"rude, nice, hour, people, manager, exist, issu..."
2,"drink, large, thru, menu, take, order, job, st..."
3,"staff, wait, make, manager, great, smoothie, m..."
4,"good, amazing, food, ice, order, cream, servic..."
5,"food, time, letter, order, love, well, fry, cu..."


In [38]:
topic_json = topic_df.reset_index().rename({'index': 'topic', 'Term per Topic': 'term'}, axis=1).to_dict('records')

topic_json

[{'topic': '1',
  'term': 'rude, nice, hour, people, manager, exist, issue, order, enjoy, try, say, coffee, point, mcdonald, location, place, work, spanish, day, know'},
 {'topic': '2',
  'term': 'drink, large, thru, menu, take, order, job, staff, drive, last, employee, student, line, make, sauce, sit, get, store, change, sandwich'},
 {'topic': '3',
  'term': 'staff, wait, make, manager, great, smoothie, miss, customer, correct, boy, meal, confuse, unhappy, clean, fresh, leave, upgrade, period, fresher, attitude'},
 {'topic': '4',
  'term': 'good, amazing, food, ice, order, cream, service, wrong, come, quick, correct, get, taste, late, fast, mcdonald, great, door, right, line'},
 {'topic': '5',
  'term': 'food, time, letter, order, love, well, fry, customer, difference, raw, give, get, meal, experience, mess, take, extra, call, cold, minute'}]

In [39]:
# Interpreting result
tm_results = opt_model[corpus_tfidf]

# Corpus Topics
corpus_topics = [sorted(topics, key=lambda record: -record[1])[0]
                 for topics in tm_results]

corpus_topics[:5]

[(3, 0.9312262),
 (4, 0.91646194),
 (2, 0.92595273),
 (3, 0.9244299),
 (1, 0.9345906)]

In [40]:
count = 0
for topics in tm_results:
    count += 1
    temp = [t[-1] for t in topics]
    if count == 2:
        break
print(temp)
sum(temp)

[0.020123467, 0.020095162, 0.016854065, 0.026465379, 0.91646194]


1.0000000167638063

In [41]:
corpus_topic_df = pd.DataFrame(index=df['reviewer_id'].values)
corpus_topic_df['topic'] = [item[0]+1 for item in corpus_topics]
corpus_topic_df['probability'] = [round(item[1], 4) for item in corpus_topics]
corpus_topic_df['terms'] = [topic_df.iloc[t[0]]['Term per Topic'] for t in corpus_topics]
# corpus_topic_df['reviewer_id'] = df['reviewer_id'].values

corpus_topic_df.head()

Unnamed: 0,topic,probability,terms
1,4,0.9312,"good, amazing, food, ice, order, cream, servic..."
2,5,0.9165,"food, time, letter, order, love, well, fry, cu..."
3,3,0.926,"staff, wait, make, manager, great, smoothie, m..."
4,4,0.9244,"good, amazing, food, ice, order, cream, servic..."
5,2,0.9346,"drink, large, thru, menu, take, order, job, st..."


In [42]:
corpus_topic_json = corpus_topic_df.to_dict('index')

corpus_topic_json

{1: {'topic': 4,
  'probability': 0.9312000274658203,
  'terms': 'good, amazing, food, ice, order, cream, service, wrong, come, quick, correct, get, taste, late, fast, mcdonald, great, door, right, line'},
 2: {'topic': 5,
  'probability': 0.9164999723434448,
  'terms': 'food, time, letter, order, love, well, fry, customer, difference, raw, give, get, meal, experience, mess, take, extra, call, cold, minute'},
 3: {'topic': 3,
  'probability': 0.9259999990463257,
  'terms': 'staff, wait, make, manager, great, smoothie, miss, customer, correct, boy, meal, confuse, unhappy, clean, fresh, leave, upgrade, period, fresher, attitude'},
 4: {'topic': 4,
  'probability': 0.9243999719619751,
  'terms': 'good, amazing, food, ice, order, cream, service, wrong, come, quick, correct, get, taste, late, fast, mcdonald, great, door, right, line'},
 5: {'topic': 2,
  'probability': 0.9345999956130981,
  'terms': 'drink, large, thru, menu, take, order, job, staff, drive, last, employee, student, line, ma

In [43]:
df['topic'] = df['reviewer_id'].apply(lambda x: corpus_topic_json[x])

df.head()

Unnamed: 0,reviewer_id,review_time,rating,review_processed,aspect_sentiment,sentiment,topic
0,1,2024-09-04,1,"I had a normal transaction, everyone was calm ...","[{'term': 'food', 'class': 'negative', 'probab...",negative,"{'topic': 4, 'probability': 0.9312000274658203..."
1,2,2024-11-29,4,"The staff at McDonald's are friendly, accommod...","[{'term': 'fast food', 'class': 'postive', 'pr...",positive,"{'topic': 5, 'probability': 0.9164999723434448..."
2,3,2024-11-29,1,I made a mobile order got to the speaker and c...,"[{'term': 'speaker', 'class': 'neutral', 'prob...",negative,"{'topic': 3, 'probability': 0.9259999990463257..."
3,4,2024-11-04,5,"Crispy chicken sandwich was delicious, and cus...","[{'term': 'sandwich', 'class': 'postive', 'pro...",positive,"{'topic': 4, 'probability': 0.9243999719619751..."
4,5,2024-10-04,1,I repeat my order three times in the drive thr...,"[{'term': 'fries', 'class': 'negative', 'proba...",negative,"{'topic': 2, 'probability': 0.9345999956130981..."


In [44]:
result = df.to_dict('index')
result

{'0': {'reviewer_id': 1,
  'review_time': '2024-09-04',
  'rating': 1,
  'review_processed': "I had a normal transaction, everyone was calm and polite, but now I don't want to eat this. I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back.",
  'aspect_sentiment': [{'term': 'food',
    'class': 'negative',
    'probability': [0.9815933108329773,
     0.015454968437552452,
     0.002951699076220393],
    'context': ["I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."]},
   {'term': 'substance',
    'class': 'negative',
    'probability': [0.5997273921966553,
     0.004296493716537952,
     0.39597612619400024],
    'context': ["I'm trying not to think about what this milky white/clear substance is all over my food, and I'm sure I'm not coming back."]}],
  'sentiment': 'negative',
  'topic': {'topic': 4,
   'probability': 0.9312000274658203,
   'ter

# Save Result

In [45]:
with open("meta-data.json", "w") as file:
    json.dump(result, file, indent=4)

In [46]:
with open("map-topic.json",  "w") as file:
    json.dump(topic_json, file, indent=4)