In [47]:
# Utility: Text preparation

from contractions import CONTRACTION_MAP

##========== TEXT PREPARATION ===========##

# Contraction
def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.

    Matching is case-insensitive, and the first character of the matched text
    is kept as written, so "Don't" becomes "Do not".

    Parameters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary mapping each contraction to its
        expanded form.

    Returns:
    str: The sentence with every known contraction expanded. The sentence is
        returned unchanged when the mapping is empty.
    """
    # An empty mapping would compile to the pattern '()', which matches the
    # empty string at every position; there is nothing to expand, so stop here.
    if not contraction_mapping:
        return sentence
    keys = [key for key in contraction_mapping if key]
    if not keys:
        return sentence

    # Regex alternation takes the first alternative that matches, so longer
    # contractions must come first; otherwise "he'd" would win over "he'd've".
    keys.sort(key=len, reverse=True)

    # Escape the keys so they are matched literally, not as regex syntax.
    contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in keys)),
                                      flags=re.IGNORECASE | re.DOTALL)

    # Fallback lookup for matches whose casing differs from every key
    # (for example the key "I'm" matched against the text "i'm").
    lowercase_mapping = {key.lower(): value for key, value in contraction_mapping.items()}

    def expanded_match(contraction):
        """
        Filter for expanding the matched contraction.

        Parameters:
        contraction (re.Match): The match object for one contraction.

        Returns:
        str: The expanded contraction, or the matched text unchanged when no
            expansion is found for it.
        """
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or contraction_mapping.get(match.lower())
                                or lowercase_mapping.get(match.lower()))
        if not expanded_contraction:
            # Leave the text alone rather than failing on a missing expansion.
            return match

        # Keep the casing of the first character as it appeared in the sentence.
        return match[0] + expanded_contraction[1:]

    return contractions_pattern.sub(expanded_match, sentence)


def remove_extra_spaces(sentence):
    """
    Collapse whitespace runs and trim both ends of a sentence.

    Parameters:
    sentence (str): The input sentence to clean.

    Returns:
    str: The sentence with every run of whitespace replaced by a single
        space and without leading or trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', sentence)
    return collapsed.strip()


def remove_non_ascii(text):
    """
    Drop every character whose code point is outside the ASCII range.

    Parameters:
    text (str): The input text to clean.

    Returns:
    str: The input text restricted to characters with code points 0-127,
        in their original order.
    """
    ascii_only = (symbol for symbol in text if ord(symbol) <= 127)
    return ''.join(ascii_only)

In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
import copy
import re

from tqdm import tqdm

from nltk.corpus import stopwords

import spacy
import gensim
from pattern.en import sentiment

In [49]:
# Allow up to 2000 characters per column value when pandas displays a DataFrame
pd.set_option('display.max_colwidth', 2000)

In [50]:
# Load the spaCy English pipeline; preprocessing() reads token lemmas and POS tags from it
nlp = spacy.load('en_core_web_sm')

In [51]:
# Load the review records from a JSON file into `data` (a dict keyed by string ids)
with open('data-2.json', 'r') as json_file:
    data = json.load(json_file)

data

{'0': {'id': 1,
  'review': 'The staff were incredibly helpful and patient, helping me find the perfect phone!',
  'date': '2024-11-01',
  'aspect': {'staff': {'ADJ': [[['helpful'], 'staff were helpful.', 'Neutral'],
     [['patient'], 'staff were patient.', 'Neutral']],
    'VERB': [],
    'OTHER': []}},
  'sentiment': 'Neutral'},
 '1': {'id': 2,
  'review': 'I had a great experience purchasing my phone here, the process was smooth and quick.',
  'date': '2024-11-01',
  'aspect': {'process': {'ADJ': [[['quick'], 'process was quick.', 'Positive'],
     [['smooth'], 'process was smooth.', 'Positive']],
    'VERB': [],
    'OTHER': []}},
  'sentiment': 'Positive'},
 '2': {'id': 3,
  'review': 'Their selection of phones is amazing, and the prices are very competitive!',
  'date': '2024-11-01',
  'aspect': {'selection': {'ADJ': [[['amazing'],
      'selection is amazing.',
      'Positive']],
    'VERB': [],
    'OTHER': []},
   'price': {'ADJ': [[['competitive'], 'price are competitive.',

In [52]:
def get_corpus(input_):
    """
    Build one text document per record from its aspect phrases.

    Parameters:
    input_ (dict): Records keyed by id. Each record has an 'aspect' dict that
        maps an aspect to a dict of label -> list of entries; the phrase is
        the second element of each entry.

    Returns:
    list: One string per record, made of every phrase followed by a single
        space (so a non-empty document ends with a trailing space and a
        record without phrases yields '').
    """
    documents = []
    for record in input_.values():
        phrases = [
            entry[1]
            for labelled in record['aspect'].values()
            for entries in labelled.values()
            for entry in entries
        ]
        documents.append(''.join(phrase + ' ' for phrase in phrases))

    return documents

get_corpus(data)

['staff were helpful. staff were patient. ',
 'process was quick. process was smooth. ',
 'selection is amazing. price are competitive. ',
 'I appreciate walked me. I appreciate walked through setting new device. ',
 'question answered. ',
 'I could not resist offer amazing deals on phones. I could not resist upgrading. ',
 'technician fixed faster. technician fixed phones issue. technician fixed phones issue. ',
 'experience really know stuff. ',
 'variety was impressive. ',
 '',
 'staff was knowledgeable. ',
 'price were reasonable. ',
 'staff really went mile. ',
 'service be excellent. ',
 'deal was friendly. staff was friendly. ',
 'buying be experience. fixing be experience. ',
 'I got good deal on old phone. ',
 'service are quick. service are reliable. ',
 'staff was helpful in setting phone. staff was helpful in setting phone. ',
 'I m satisfied. ',
 'staff was patient with questions. variety was patient with questions. ',
 'process was simple. ',
 'they helped choose within b

In [53]:
# One document per record, built from the aspect phrases in `data`
corpus = get_corpus(data)

# English stopwords as a set; preprocessing() tests each lemma for membership in it
stop_words = set(stopwords.words('english'))
    
# Preprocessing text
def preprocessing(text):
    """
    Turn a raw document into a list of lowercase content-word lemmas.

    The text is whitespace-normalised, its contractions are expanded and its
    non-ASCII characters are dropped before it is parsed with `nlp`.

    Parameters:
    text (str): The input document.

    Returns:
    list: Lowercased lemmas of the NOUN, PROPN, ADJ and VERB tokens that are
        at least 3 characters long, are not in `stop_words` and do not
        consist solely of digits / non-word characters.
    """
    kept_pos = ('NOUN', 'PROPN', 'ADJ', 'VERB')

    cleaned = remove_non_ascii(expand_contractions(remove_extra_spaces(text)))

    lemmas = []
    for token in nlp(cleaned):
        lemma = token.lemma_.lower()

        # Discard numeric/punctuation-only lemmas, very short lemmas and stopwords
        is_noise = re.match(r'^[0-9\W]+$', lemma) or len(lemma) < 3 or lemma in stop_words
        if not is_noise and token.pos_ in kept_pos:
            lemmas.append(lemma)
    return lemmas

# Tokenise every document into its list of kept lemmas
texts = [preprocessing(document) for document in corpus]

# Build the gensim dictionary from the tokenised documents
dictionary = gensim.corpora.Dictionary(texts)


# Convert documents into Bag-of-words format
corpus_bow = [dictionary.doc2bow(text) for text in texts]

# Train the TF-IDF model
tfidf_model = gensim.models.TfidfModel(corpus_bow)

# Apply the TF-IDF model to the bag-of-words corpus
# NOTE(review): this TF-IDF-weighted corpus (not the raw counts in corpus_bow)
# is what the LDA training call further down receives - confirm that is intended.
corpus_tfidf = tfidf_model[corpus_bow]

In [54]:
def topic_model_coherence_generator(corpus, texts, dictionary,
                                    start_topic_count=2, end_topic_count=10,
                                    step=1, cpus=1):
    """
    Train one LDA model per candidate topic count and score it with c_v coherence.

    Parameters:
    corpus: Vectorised documents, passed to both LdaModel and CoherenceModel.
    texts (list): Tokenised documents, passed to CoherenceModel.
    dictionary: gensim dictionary, used as id2word and by CoherenceModel.
    start_topic_count (int): First topic count to try.
    end_topic_count (int): Last topic count to try (inclusive).
    step (int): Increment between consecutive topic counts.
    cpus (int): Accepted but not used by this function.

    Returns:
    tuple: (models, coherence_scores) - two lists aligned by position, with
        one trained model and one coherence score per topic count tried.
    """
    candidate_counts = range(start_topic_count, end_topic_count + 1, step)

    models, coherence_scores = [], []
    for num_topics in tqdm(candidate_counts):
        lda = gensim.models.LdaModel(
            corpus=corpus,
            id2word=dictionary,
            num_topics=num_topics,
            chunksize=1740,
            passes=20,
            iterations=500,
            alpha='auto',
            eta='auto',
            eval_every=None,
            random_state=42,
        )

        scorer = gensim.models.CoherenceModel(
            model=lda,
            corpus=corpus,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        coherence_scores.append(scorer.get_coherence())
        models.append(lda)

    return models, coherence_scores

# Sweep the default topic-count range (2 to 10) on the TF-IDF corpus
models, coherence_scores = topic_model_coherence_generator(corpus=corpus_tfidf,
                                                           texts=texts,
                                                           dictionary=dictionary)
# Keep the model with the highest coherence score (np.argmax returns the first maximum on ties)
opt_model = models[np.argmax(coherence_scores)]

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:55<00:00,  6.18s/it]


In [55]:
# Get the top-20 weighted terms of each topic in the selected model.
# Each item's first element is the list of (weight, term) pairs used below;
# presumably the second element is that topic's coherence score - see gensim's top_topics docs.
topics_coherences = opt_model.top_topics(corpus_tfidf, topn=20)

In [56]:
coherence_scores

[0.6850983927558234,
 0.7225553258293611,
 0.7219723474786128,
 0.6984040893433792,
 0.6874730577326793,
 0.6994897914640068,
 0.6819452616365094,
 0.6789185895745116,
 0.6635181678605453]

# Interpretation Result

In [57]:
# Visualize result: topics with their term weights
# NOTE: the numbering printed here follows the order of `topics_coherences`; in the
# recorded output it differs from the topic numbers used in `topic_df`
# (the "Topic 1" printed here is topic 3 in that table).

topics_with_wts = [item[0] for item in topics_coherences]
print("LDA Topics with Weights")
print('=' * 50)
for idx, topic in enumerate(topics_with_wts):
    weighted_terms = [(term, round(wt, 3)) for wt, term in topic]
    print(f'Topic {idx + 1}:')
    print(weighted_terms)
    print()

LDA Topics with Weights
Topic 1:
[('phone', 0.043), ('fix', 0.022), ('work', 0.021), ('repair', 0.02), ('experience', 0.019), ('get', 0.017), ('great', 0.017), ('selection', 0.017), ('process', 0.016), ('break', 0.015), ('technician', 0.015), ('amazing', 0.015), ('issue', 0.015), ('helpful', 0.014), ('price', 0.013), ('staff', 0.013), ('time', 0.012), ('discount', 0.012), ('defective', 0.012), ('promise', 0.012)]

Topic 2:
[('staff', 0.037), ('service', 0.036), ('excellent', 0.019), ('quick', 0.019), ('take', 0.016), ('store', 0.016), ('good', 0.015), ('set', 0.015), ('make', 0.015), ('look', 0.014), ('help', 0.014), ('variety', 0.014), ('warranty', 0.013), ('give', 0.012), ('deal', 0.012), ('wait', 0.012), ('offer', 0.011), ('new', 0.011), ('appreciate', 0.011), ('friendly', 0.011)]

Topic 3:
[('store', 0.026), ('reliable', 0.023), ('feel', 0.021), ('question', 0.02), ('satisfied', 0.016), ('resolve', 0.015), ('plan', 0.015), ('upsold', 0.015), ('unprofessional', 0.015), ('apologetic'

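Explanation:

> Note: the list below describes four topics, but the model selected above has three topics, and the "Topic N" numbers printed above do not match the topic numbers in the table below. Verify these labels against the current output before relying on them.
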
- Topic 1: Service Quality and Conflict Resolution.
> This topic likely centers around the overall quality of service and how customer issues are handled, particularly in resolving conflicts, complaints, or misunderstandings.
- Topic 2: Product Experience and Sales Interactions.
> This topic emphasizes product experiences and interactions with sales staff, with a particular focus on phones.
- Topic 3: Service Effectiveness and Customer Satisfaction.
> This topic likely reflects customer evaluations of service effectiveness.
- Topic 4: In-Store Experience and Helpfulness.
> This topic highlights in-store experiences, focusing on staff helpfulness, product selection, pricing, and store organization.

In [58]:
# Top-20 (term, weight) pairs for every topic of the selected model
topics = [
    [(term, round(wt, 3)) for term, wt in opt_model.show_topic(topic_id, topn=20)]
    for topic_id in range(opt_model.num_topics)
]

# One row per topic: its terms joined into a single string, indexed '1'..'num_topics'
term_strings = [', '.join(term for term, _ in topic) for topic in topics]
topic_labels = [str(number) for number in range(1, opt_model.num_topics + 1)]
topic_df = pd.DataFrame(term_strings,
                        columns=['Term per Topic'],
                        index=topic_labels)
topic_df

Unnamed: 0,Term per Topic
1,"store, reliable, feel, question, satisfied, resolve, plan, upsold, unprofessional, apologetic, slow, outstanding, hand, pressured, overcharge, worth, reasonable, answer, inform, choose"
2,"staff, service, excellent, quick, take, store, good, set, make, look, help, variety, warranty, give, deal, wait, offer, new, appreciate, friendly"
3,"phone, fix, work, repair, experience, get, great, selection, process, break, technician, amazing, issue, helpful, price, staff, time, discount, defective, promise"


In [59]:
# Flatten the topic table into a list of {'topic': ..., 'term': ...} records
topic_json = (
    topic_df
    .reset_index()
    .rename(columns={'index': 'topic', 'Term per Topic': 'term'})
    .to_dict('records')
)

topic_json

[{'topic': '1',
  'term': 'store, reliable, feel, question, satisfied, resolve, plan, upsold, unprofessional, apologetic, slow, outstanding, hand, pressured, overcharge, worth, reasonable, answer, inform, choose'},
 {'topic': '2',
  'term': 'staff, service, excellent, quick, take, store, good, set, make, look, help, variety, warranty, give, deal, wait, offer, new, appreciate, friendly'},
 {'topic': '3',
  'term': 'phone, fix, work, repair, experience, get, great, selection, process, break, technician, amazing, issue, helpful, price, staff, time, discount, defective, promise'}]

In [60]:
# Apply the selected model to every document of the TF-IDF corpus
tm_results = opt_model[corpus_tfidf]

# Dominant topic per document: the (topic id, weight) pair with the largest weight
corpus_topics = [
    sorted(doc_topics, key=lambda pair: pair[1], reverse=True)[0]
    for doc_topics in tm_results
]

corpus_topics[:5]

[(2, 0.6150381),
 (2, 0.6102596),
 (2, 0.74554163),
 (1, 0.7650772),
 (0, 0.6253251)]

In [61]:
# One row per document: dominant topic (1-based), its weight, that topic's terms,
# and the document text. 'review' holds the `corpus` document, not the raw review.
corpus_topic_df = pd.DataFrame()
corpus_topic_df['topic_dominant'] = [topic_id + 1 for topic_id, _ in corpus_topics]
corpus_topic_df['topic_contribution'] = [round(weight, 4) for _, weight in corpus_topics]
corpus_topic_df['term'] = [
    topic_df.iloc[topic_id]['Term per Topic'] for topic_id, _ in corpus_topics
]
corpus_topic_df['review'] = corpus

corpus_topic_df.head()

Unnamed: 0,topic_dominant,topic_contribution,term,review
0,3,0.615,"phone, fix, work, repair, experience, get, great, selection, process, break, technician, amazing, issue, helpful, price, staff, time, discount, defective, promise",staff were helpful. staff were patient.
1,3,0.6103,"phone, fix, work, repair, experience, get, great, selection, process, break, technician, amazing, issue, helpful, price, staff, time, discount, defective, promise",process was quick. process was smooth.
2,3,0.7455,"phone, fix, work, repair, experience, get, great, selection, process, break, technician, amazing, issue, helpful, price, staff, time, discount, defective, promise",selection is amazing. price are competitive.
3,2,0.7651,"staff, service, excellent, quick, take, store, good, set, make, look, help, variety, warranty, give, deal, wait, offer, new, appreciate, friendly",I appreciate walked me. I appreciate walked through setting new device.
4,1,0.6253,"store, reliable, feel, question, satisfied, resolve, plan, upsold, unprofessional, apologetic, slow, outstanding, hand, pressured, overcharge, worth, reasonable, answer, inform, choose",question answered.


# Save result

In [62]:
# Per-document topic fields (every column except 'review'), keyed by row index
new_data = corpus_topic_df.drop('review', axis=1).to_dict('index')

def update_data(data_, new_data_):
    """
    Return a copy of `data_` with a 'topic' entry set on each listed record.

    Parameters:
    data_ (dict): Records keyed by string ids.
    new_data_ (dict): Topic information per record; str(key) of each entry
        must be an existing key of data_.

    Returns:
    dict: A deep copy of data_ in which record str(key) holds new_data_[key]
        under 'topic'. data_ itself is not modified.
    """
    merged = copy.deepcopy(data_)

    for key, topic_info in new_data_.items():
        merged[str(key)]['topic'] = topic_info

    return merged

save_data = update_data(data, new_data)

In [63]:
# Save the records, now carrying their 'topic' entry, to a JSON file
with open('data-3.json', 'w') as json_file:
    json.dump(save_data, json_file, indent=4)  # 'indent=4' makes the JSON pretty-printed


# Save the per-topic term lists to their own JSON file
with open('data-topic.json', 'w') as json_file:
    json.dump(topic_json, json_file, indent=4)

In [64]:
# Save model