In [1]:
#Imports

import pandas as pd
import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

import pyLDAvis
import pyLDAvis.sklearn
# Presumably switches pyLDAvis into notebook display mode so prepared
# visualizations render inline in cell output -- TODO confirm against the
# installed pyLDAvis version.
pyLDAvis.enable_notebook()

import warnings
# Installs a blanket "ignore" filter: every warning category is silenced for
# the rest of the session, including deprecation notices from the libraries above.
warnings.filterwarnings("ignore")

In [2]:
# Read the 'user_queries' sheet of the workbook into a DataFrame and display it.

utterances = pd.read_excel(
    'output_artifacts.xlsx',
    sheet_name='user_queries',
)
utterances

Unnamed: 0,text,topic L1,subtopic L2 (if necessary)
0,12.50.00,General,Out_of_scope
1,10 past 3 p.m. I have an appointment on the 22...,Appointment,Appointment
2,15:50 afternoon is good,General,Out_of_scope
3,1st Dec,General,Out_of_scope
4,20 mins after 10 PM,General,Out_of_scope
...,...,...,...
995,I was trying to make an international transfer...,General,Troubleshooting
996,I was trying to make an international transfer...,Application,Application
997,I was trying to top up,Card,Card_refill
998,I was trying to top up my card but it was reve...,General,Troubleshooting


In [3]:
# Tokenize each utterance, then pool the lowercased tokens into one flat list.

# One token list per utterance, in the same order as the `text` column.
tokenized_text = list(map(word_tokenize, utterances.text.values))

# Flatten across utterances; lowercase so later stop-word checks are case-insensitive.
tokens = [word.lower() for sent in tokenized_text for word in sent]

print(tokens[:40])

['12.50.00', '10', 'past', '3', 'p.m', '.', 'i', 'have', 'an', 'appointment', 'on', 'the', '22nd', 'may', '15:50', 'afternoon', 'is', 'good', '1st', 'dec', '20', 'mins', 'after', '10', 'pm', '25', 'to', '11', 'morning', 'then', ',', 'bye', '3', 'maybe', '?', '40', 'a', '$', '5600', 'transfer']


In [4]:
# Filter punctuation tokens and stop words out of the flat token list.

# NLTK's English stop-word list plus a few extra filler words.
stop_words = stopwords.words('english')
stop_words += ["n't", 'more', 'much', 'really']

# Tokens that are exactly one punctuation character are dropped.
punctuation_chars = set(string.punctuation)
without_punct = [tok for tok in tokens if tok not in punctuation_chars]

# `stop_words` stays a list; the set is only used for membership checks here.
stop_word_lookup = set(stop_words)
without_stopwords = [tok for tok in without_punct if tok not in stop_word_lookup]

In [5]:
# Reduce every remaining token to its WordNet lemma (default part of speech).
lemmatizer = WordNetLemmatizer()
lemma_text = list(map(lemmatizer.lemmatize, without_stopwords))

In [6]:
# Build the document-term matrices: raw counts (CountVectorizer) and TF-IDF
# weights (TfidfVectorizer configured with the same parameters).
#
# The vectorizers treat each element of their input as one document.
# `lemma_text` is a flat list of single words pooled from all utterances, so
# fitting on it would turn every word into its own one-term "document" and
# leave the topic model with no word co-occurrence to learn from. Instead,
# rebuild one cleaned, lemmatized string per utterance from `tokenized_text`,
# applying the same lowercase / punctuation / stop-word / lemma steps.

punctuation_marks = set(string.punctuation)
stop_word_set = set(stop_words)
lemma_docs = [
    " ".join(
        lemmatizer.lemmatize(token)
        for token in (raw.lower() for raw in sent)
        if token not in punctuation_marks and token not in stop_word_set
    )
    for sent in tokenized_text
]

# min_df / max_df now count utterances containing a term, not raw occurrences.
tf_vectorizer = CountVectorizer(strip_accents = 'unicode',
                                stop_words = 'english',
                                lowercase = True,
                                token_pattern = r'\b[a-zA-Z]{3,}\b',
                                max_df = 0.5,
                                min_df = 10)
dtm_tf = tf_vectorizer.fit_transform(lemma_docs)

# Reuse the exact same settings so both matrices share one vocabulary definition.
tfidf_vectorizer = TfidfVectorizer(**tf_vectorizer.get_params())
dtm_tfidf = tfidf_vectorizer.fit_transform(lemma_docs)

In [7]:
# Fit two 20-topic LDA models with the same seed: one on the raw-count matrix,
# one on the TF-IDF matrix. `fit` returns the fitted estimator itself.

lda_tf = LatentDirichletAllocation(n_components=20, random_state=0).fit(dtm_tf)
lda_tfidf = LatentDirichletAllocation(n_components=20, random_state=0).fit(dtm_tfidf)
print(lda_tfidf)

LatentDirichletAllocation(n_components=20, random_state=0)


In [9]:
# Render the interactive pyLDAvis view of the count-based LDA model.

# The `imp` module was removed in Python 3.12; `importlib.reload` is the
# supported replacement and keeps the name `reload` available to later cells.
from importlib import reload
pyLDAvis.sklearn.prepare(lda_tf, dtm_tf, tf_vectorizer)
