In [2]:
# Utility: text preparation helpers

from contractions import CONTRACTION_MAP

##========== PREPARATION TEXT ===========##

# Contraction
def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    """
    Expand the contractions in a sentence. For example don't => do not.

    Matching is case-insensitive. The first character of every matched
    contraction is kept exactly as written in the sentence, so "Don't"
    becomes "Do not" while "don't" becomes "do not".

    Parameters:
    sentence (str): The input sentence to clean.
    contraction_mapping (dict): A dictionary for mapping contractions to
        their expanded form.

    Returns:
    str: The expanded contraction sentence.
    """
    # Empty keys would make the pattern match the empty string at every
    # position, so they are ignored; with nothing left there is nothing to do.
    keys = [key for key in (contraction_mapping or {}) if key]
    if not keys:
        return sentence

    # Case-insensitive view of the mapping, used when the matched text is not
    # spelled exactly like one of the keys (e.g. key "I'm", text "i'm").
    lowered_mapping = {key.lower(): value
                       for key, value in contraction_mapping.items()}

    # Regex alternation picks the first alternative that matches, not the
    # longest one, so longer keys go first ("he'd've" before "he'd").
    # Keys are escaped so they are always matched literally.
    keys.sort(key=len, reverse=True)
    contractions_pattern = re.compile('({})'.format('|'.join(re.escape(key) for key in keys)),
                                      flags=re.IGNORECASE)

    def expanded_match(contraction):
        """
        Build the replacement text for one matched contraction.

        Parameters:
        contraction (re.Match): The match object of a contraction.

        Returns:
        str: The expanded contraction, or the matched text unchanged when no
        expansion can be found for it.
        """
        match = contraction.group(0)
        expanded_contraction = (contraction_mapping.get(match)
                                or lowered_mapping.get(match.lower()))
        if expanded_contraction is None:
            # Leave the text untouched rather than failing on None[1:].
            return match

        # Keep the casing of the first character as typed in the sentence.
        return match[0] + expanded_contraction[1:]

    return contractions_pattern.sub(expanded_match, sentence)


def remove_extra_spaces(sentence):
    """
    Collapse every run of whitespace into a single space and trim both ends.

    Parameters:
    sentence (str): The input sentence to clean.

    Returns:
    str: The sentence with single spaces only and no leading/trailing space.
    """
    collapsed = re.sub(r'\s+', ' ', sentence)
    trimmed = collapsed.strip()
    return trimmed


def remove_non_ascii(text):
    """
    Drop every character whose code point is outside the ASCII range.

    Parameters:
    text (str): The input text to clean.

    Returns:
    str: The input with only characters of code point 0-127 kept, in order.
    """
    ascii_chars = []
    for ch in text:
        if ord(ch) <= 127:
            ascii_chars.append(ch)
    return ''.join(ascii_chars)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import spacy

import re
import os

import ast

from tqdm import tqdm

from nltk.corpus import stopwords
import gensim

In [4]:
# Load the spaCy pipeline 'en_core_web_lg'. `preprocessing` below runs it on
# every document to read each token's lemma and part-of-speech tag.
nlp = spacy.load('en_core_web_lg')

In [5]:
# Load the reviews together with their extracted 'ability' pairs.
df = pd.read_csv('example.csv')

# Convert the string columns to dictionaries. The CSV stores them as Python
# literals; ast.literal_eval parses literals only and never executes code.
df['ability'] = df['ability'].apply(ast.literal_eval)
df['ability_filtered'] = df['ability_filtered'].apply(ast.literal_eval)


# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() adds a stray "None" line to the output.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   review            99 non-null     object
 1   ability           99 non-null     object
 2   ability_filtered  99 non-null     object
dtypes: object(3)
memory usage: 2.4+ KB
None


Unnamed: 0,review,ability,ability_filtered
0,"The staff were incredibly helpful and patient,...","{0: [('staff', 'were patient'), ('staff', 'wer...","{0: [('staff', 'were patient'), ('staff', 'wer..."
1,I had a great experience purchasing my phone h...,"{0: [('process', 'was quick'), ('process', 'wa...","{0: [('process', 'was quick')]}"
2,"Their selection of phones is amazing, and the ...","{0: [('selection', 'is amazing'), ('price', 'a...","{0: [('selection', 'is amazing')]}"
3,I appreciate how the staff walked me through s...,"{0: [('I', 'appreciate walked me'), ('I', 'app...","{0: [('I', 'appreciate walked me')]}"
4,"Great customer service, I left with the phone ...","{0: [('question', 'answered'), ('I', 'left wit...","{0: [('question', 'answered'), ('I', 'left wit..."


In [6]:
def contraction(x):
    """
    Build one text document out of a dict of word-tuple lists.

    Every tuple found in the dict's values is joined with spaces into a short
    sentence; the sentences are then joined with '. ' and closed with a final
    '.'.

    Parameters:
    x (dict): Maps any key to a list of tuples of strings.

    Returns:
    str: The joined sentences, or '' when the dict holds no tuples at all.
    """
    sentences = []
    for pairs in x.values():
        for pair in pairs:
            sentences.append(' '.join(pair))

    if not sentences:
        return ''
    return '. '.join(sentences) + '.'
    
# Turn every row's 'ability' dict into one pseudo-document: a string of short
# sentences, one per tuple. Rows without any tuple become ''.
corpus = df['ability'].apply(contraction).values

# Display the resulting array of documents.
corpus

array(['staff were patient. staff were helpful.',
       'process was quick. process was smooth.',
       'selection is amazing. price are competitive.',
       'I appreciate walked me. I appreciate walked through setting new device.',
       'question answered. I left with phone.',
       'I could not resist offer amazing deals on phones. I could not resist upgrading.',
       'I expected. technician fixed faster. technician fixed issue.',
       'experience really know stuff.',
       'variety was impressive. I found perfect case.', '',
       'staff was knowledgeable.', 'price were reasonable.',
       'staff really went mile.',
       'service service be Excellent. they helped find.',
       'staff was friendly. deal was friendly.',
       'I love store. buying buying be experience. fixing buying be experience.',
       'I got good deal on old phone.',
       'service are reliable. service are quick.',
       'staff was helpful in setting phone.', 'I m satisfied.',
       'staff wa

In [7]:
# English stopwords from NLTK, kept as a set because `preprocessing` does a
# membership test against it for every token.
stop_words = set(stopwords.words('english'))
    
# Preprocessing text
def preprocessing(text):
    """
    Clean a document and reduce it to a list of content-word lemmas.

    The text is whitespace-normalised, has its contractions expanded and its
    non-ASCII characters removed, and is then run through the spaCy pipeline
    `nlp`. A token's lowercased lemma is kept only when it is not made up
    solely of digits/non-word characters, has at least three characters, is
    not in `stop_words`, and the token is tagged NOUN, PROPN, ADJ or VERB.

    Parameters:
    text (str): The input document.

    Returns:
    list: The kept lemmas, in document order.
    """
    cleaned = remove_non_ascii(expand_contractions(remove_extra_spaces(text)))

    # Parts of speech that carry the content of a review
    kept_pos = ('NOUN', 'PROPN', 'ADJ', 'VERB')

    lemmas = []
    for token in nlp(cleaned):
        lemma = token.lemma_.lower()

        # Numbers/punctuation, very short lemmas and stopwords are noise
        is_noise = re.match(r'^[0-9\W]+$', lemma) or len(lemma) < 3 or lemma in stop_words
        if not is_noise and token.pos_ in kept_pos:
            lemmas.append(lemma)
    return lemmas

# Tokenised documents: one list of lemmas per entry of `corpus`
texts = list(map(preprocessing, corpus))

# Vocabulary built from all tokenised documents
dictionary = gensim.corpora.Dictionary(texts)


# Bag-of-words representation of every document
corpus_bow = list(map(dictionary.doc2bow, texts))

# Fit TF-IDF on the bag-of-words corpus
tfidf_model = gensim.models.TfidfModel(corpus_bow)

# Bag-of-words corpus passed through the fitted TF-IDF model
corpus_tfidf = tfidf_model[corpus_bow]

In [8]:
def topic_model_coherence_generator(corpus, texts, dictionary,
                                    start_topic_count=2, end_topic_count=10,
                                    step=1, cpus=1):
    """
    Train one LDA model per candidate topic count and score each with c_v.

    Parameters:
    corpus: The vectorised corpus the LDA models are trained on.
    texts (list): The tokenised documents, passed to the coherence model.
    dictionary: The gensim dictionary used as `id2word`.
    start_topic_count (int): First number of topics to try.
    end_topic_count (int): Last number of topics to try (inclusive).
    step (int): Increment between two candidate topic counts.
    cpus (int): Accepted but not used by this function.

    Returns:
    tuple: (models, coherence_scores), two lists in the order the candidate
    topic counts were tried.
    """
    # Training settings shared by every candidate model
    lda_settings = dict(chunksize=1740, alpha='auto', eta='auto',
                        random_state=42, iterations=500, passes=20,
                        eval_every=None)

    scored_models = []
    for n_topics in tqdm(range(start_topic_count, end_topic_count + 1, step)):
        candidate = gensim.models.LdaModel(corpus=corpus, id2word=dictionary,
                                           num_topics=n_topics, **lda_settings)
        scorer = gensim.models.CoherenceModel(model=candidate, corpus=corpus,
                                              texts=texts, dictionary=dictionary,
                                              coherence='c_v')
        scored_models.append((candidate, scorer.get_coherence()))

    models = [model for model, _ in scored_models]
    coherence_scores = [score for _, score in scored_models]
    return models, coherence_scores

# Train and score one LDA model per topic count, using the generator's default
# range (2 to 10 topics, step 1) on the TF-IDF corpus.
models, coherence_scores = topic_model_coherence_generator(corpus=corpus_tfidf,
                                                           texts=texts,
                                                           dictionary=dictionary)
# Keep the model at the position of the highest coherence score.
opt_model = models[np.argmax(coherence_scores)]

100%|████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:30<00:00,  3.36s/it]


In [11]:
# Top 20 terms of every topic of the selected model. The printing cell uses
# each entry's first element as a list of (weight, term) pairs.
# NOTE(review): the second element is presumably a per-topic coherence score,
# not an overall mean; confirm against the gensim docs.
topics_coherences = opt_model.top_topics(corpus_tfidf, topn=20)

In [12]:
# c_v coherence of each candidate model, ordered by topic count (2 through 10
# with the generator's default arguments).
coherence_scores

[0.6780125508564886,
 0.6936101867402961,
 0.6954448369156887,
 0.6981522176775626,
 0.68509081595846,
 0.6897949182640505,
 0.6482217841172262,
 0.6335215408719335,
 0.6245686835264562]

In [10]:
# Print every topic of the selected model as a list of (term, weight) pairs

# Only the first element of each entry (the weighted terms) is needed here
topics_with_wts = [item[0] for item in topics_coherences]
print("LDA Topics with Weights", '='*50, sep='\n')
for idx, topic in enumerate(topics_with_wts):
    # Entries are (weight, term): flip them and round the weight for display.
    # The trailing '' yields the blank line that separates two topics.
    print(f'Topic {idx + 1}:', [(term, round(wt, 3)) for wt, term in topic], '', sep='\n')

LDA Topics with Weights
Topic 1:
[('help', 0.024), ('deal', 0.023), ('great', 0.022), ('time', 0.022), ('experience', 0.02), ('staff', 0.019), ('friendly', 0.018), ('selection', 0.017), ('find', 0.017), ('get', 0.017), ('service', 0.015), ('resist', 0.015), ('order', 0.014), ('lose', 0.014), ('explain', 0.014), ('rush', 0.014), ('want', 0.014), ('unprofessional', 0.014), ('program', 0.014), ('phone', 0.013)]

Topic 2:
[('staff', 0.025), ('store', 0.025), ('set', 0.024), ('phone', 0.023), ('resolve', 0.023), ('helpful', 0.02), ('patient', 0.02), ('question', 0.019), ('work', 0.018), ('rude', 0.018), ('service', 0.017), ('good', 0.016), ('leave', 0.016), ('upsold', 0.015), ('outstanding', 0.015), ('hand', 0.015), ('price', 0.015), ('selection', 0.015), ('buy', 0.015), ('unhelpful', 0.014)]

Topic 3:
[('fix', 0.028), ('reliable', 0.026), ('service', 0.024), ('refuse', 0.023), ('discount', 0.022), ('promotion', 0.016), ('quick', 0.016), ('knowledgeable', 0.016), ('refund', 0.016), ('issue'

Explanation:
- Topic 1: This topic revolves around positive interactions with staff, with a focus on helpfulness and friendliness.
- Topic 2: This topic highlights mixed customer service experiences, ranging from helpful and patient staff to rude or unhelpful interactions.
- Topic 3: The focus here is on the reliability of services provided and the handling of customer issues.
- Topic 4: This topic focuses on the in-store experience, including wait times and the store’s ability to meet customer needs.
- Topic 5: This topic touches on customer satisfaction with product quality and service outcomes.