# Topic Modeling

In [1]:
pip install gensim

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from gensim import matutils, models

In [4]:
# Import the necessary modules for LDA with gensim
# Terminal / Anaconda Navigator: conda install -c conda-forge gensim
import scipy.sparse
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Topic Modeling - Attempt #1 (all text)

In [5]:
# Read in the data, before the CountVectorizer step
data=pd.read_csv('AirIndia_final1.csv')
data['Tweet_Text']

0       @hemendu @JM_Scindia @PMOIndia @airindiain @AA...
1       Heard a lot about @airindiain, reason to have ...
2       RT @Dastanagoi: Pathetic service of @makemytri...
3       @virsanghvi @airindiain @IndiGo6E @TataCompani...
4       RT @madhukishwar: Surprised that @AirIndia beh...
                              ...                        
2285    AirIndia Express has completed 17 years of ser...
2286    @GnaniGnaneshan @rangaba @flysrilankan @charit...
2287    @sayantan05cts Dear Mr. Chakraborty, we reques...
2288    @DENISH918 @airindiain I am on the same flight...
2289    @Nehaneh32313806 Dear Ma'am, we recommend you ...
Name: Tweet_Text, Length: 2290, dtype: object

In [6]:
lem_transcript = []
from nltk.stem.wordnet import WordNetLemmatizer 
import re
for i in range(0, data.shape[0]):
    #Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', str(data['Tweet_Text'][i]))
    
    #Convert to lowercase
    text = text.lower()
    ##Convert to list from string
    text = text.split()
    ##Lemmatizing
    lm = WordNetLemmatizer() 
       
    
    text = [lm.lemmatize(word) for word in text if (len(word) > 3 and len(word.strip('xx/')) > 2)] 
    text = " ".join(text)
    lem_transcript.append(text)

In [18]:
lem_transcript[10]

'sminujindal airindiain rntata scindia millionth time what update refund booking http iqzqmsigx'

In [19]:
# Create a document-term matrix
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# additional stop words since we are recreating the document-term matrix
add_stop_words = ["rntata","tatacompanies","tata","http","scindia","airindiain","airindia","air","india",'like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said', 'x','xx', 'xxx','xxxx','xxxxx']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix
cv = CountVectorizer(stop_words=stop_words,ngram_range=(1,1))
data_cv = cv.fit_transform(lem_transcript)
data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
data_dtm.index = data.index
data_dtm

Unnamed: 0,aaib,aaiblgairport,aaibpiairport,aaidelhi,aaiixsairport,aaijamairport,aaipunairport,aairednr,aairhqsr,aaistvairport,...,zueksakrv,zueznpctr,zump,zuqxsxb,zurich,zvcoxgw,zvxmclfyvw,zyadaahorahahai,zykzgvnbo,zywizs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Create the gensim corpus
corpus = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtm.transpose()))

# Create the vocabulary dictionary
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

In [11]:
#corpus[10]

In [12]:
#cv.vocabulary_.items()

In [21]:
# Let's start with 3 topics
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=20)
lda.print_topics()

2022-05-10 23:12:51,653 : INFO : using symmetric alpha at 0.3333333333333333
2022-05-10 23:12:51,658 : INFO : using symmetric eta at 0.3333333333333333
2022-05-10 23:12:51,677 : INFO : using serial LDA version on this node
2022-05-10 23:12:51,702 : INFO : running online (multi-pass) LDA training, 3 topics, 20 passes over the supplied corpus of 2290 documents, updating model once every 2000 documents, evaluating perplexity every 2290 documents, iterating 50x with a convergence threshold of 0.001000
2022-05-10 23:12:51,794 : INFO : PROGRESS: pass 0, at document #2000/2290
2022-05-10 23:12:54,828 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-10 23:12:54,828 : INFO : topic #0 (0.333): 0.017*"flight" + 0.009*"delhi" + 0.007*"ticket" + 0.006*"service" + 0.006*"airport" + 0.005*"customer" + 0.004*"hour" + 0.004*"mumbai" + 0.004*"airline" + 0.004*"daikin"
2022-05-10 23:12:54,844 : INFO : topic #1 (0.333): 0.016*"flight" + 0.008*"travel" + 0.007*"internatio

2022-05-10 23:13:02,923 : INFO : topic diff=0.324705, rho=0.440867
2022-05-10 23:13:02,988 : INFO : PROGRESS: pass 4, at document #2000/2290
2022-05-10 23:13:04,354 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-10 23:13:04,354 : INFO : topic #0 (0.333): 0.026*"flight" + 0.014*"service" + 0.012*"delhi" + 0.012*"airport" + 0.008*"ticket" + 0.007*"staff" + 0.006*"hour" + 0.006*"mumbai" + 0.006*"worst" + 0.006*"customer"
2022-05-10 23:13:04,354 : INFO : topic #1 (0.333): 0.012*"flight" + 0.009*"travel" + 0.007*"refund" + 0.007*"international" + 0.006*"airline" + 0.006*"dear" + 0.005*"money" + 0.005*"check" + 0.005*"booked" + 0.004*"visit"
2022-05-10 23:13:04,369 : INFO : topic #2 (0.333): 0.013*"refund" + 0.013*"customer" + 0.013*"flight" + 0.011*"ticket" + 0.010*"help" + 0.009*"day" + 0.009*"care" + 0.008*"booking" + 0.007*"travel" + 0.006*"aiesl"
2022-05-10 23:13:04,372 : INFO : topic diff=0.292147, rho=0.403403
2022-05-10 23:13:04,665 : INFO : -7.38

2022-05-10 23:13:13,369 : INFO : topic #2 (0.333): 0.016*"refund" + 0.014*"flight" + 0.013*"customer" + 0.013*"ticket" + 0.010*"help" + 0.009*"day" + 0.009*"care" + 0.008*"booking" + 0.007*"travel" + 0.007*"moca"
2022-05-10 23:13:13,371 : INFO : topic diff=0.204452, rho=0.313960
2022-05-10 23:13:13,619 : INFO : -7.318 per-word bound, 159.5 perplexity estimate based on a held-out corpus of 290 documents with 4270 words
2022-05-10 23:13:13,619 : INFO : PROGRESS: pass 8, at document #2290/2290
2022-05-10 23:13:13,776 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-10 23:13:13,785 : INFO : topic #0 (0.333): 0.027*"flight" + 0.016*"service" + 0.014*"airport" + 0.012*"delhi" + 0.008*"staff" + 0.008*"ticket" + 0.007*"hour" + 0.007*"customer" + 0.006*"worst" + 0.006*"baggage"
2022-05-10 23:13:13,788 : INFO : topic #1 (0.333): 0.010*"flight" + 0.009*"travel" + 0.007*"money" + 0.006*"airline" + 0.006*"dear" + 0.006*"international" + 0.005*"check" + 0.005*"canad

2022-05-10 23:13:20,759 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-10 23:13:20,769 : INFO : topic #0 (0.333): 0.027*"flight" + 0.016*"service" + 0.014*"airport" + 0.012*"delhi" + 0.009*"staff" + 0.008*"ticket" + 0.007*"customer" + 0.007*"hour" + 0.006*"passenger" + 0.006*"worst"
2022-05-10 23:13:20,775 : INFO : topic #1 (0.333): 0.010*"flight" + 0.009*"travel" + 0.007*"money" + 0.006*"dear" + 0.006*"airline" + 0.006*"international" + 0.005*"check" + 0.005*"canada" + 0.005*"year" + 0.004*"policy"
2022-05-10 23:13:20,782 : INFO : topic #2 (0.333): 0.018*"refund" + 0.015*"flight" + 0.015*"ticket" + 0.015*"customer" + 0.011*"help" + 0.010*"care" + 0.010*"day" + 0.009*"booking" + 0.008*"travel" + 0.007*"cancelled"
2022-05-10 23:13:20,787 : INFO : topic diff=0.162755, rho=0.265888
2022-05-10 23:13:20,869 : INFO : PROGRESS: pass 13, at document #2000/2290
2022-05-10 23:13:22,076 : INFO : merging changes from 2000 documents into a model of 2290 documents

2022-05-10 23:13:28,247 : INFO : topic #2 (0.333): 0.019*"refund" + 0.015*"ticket" + 0.015*"flight" + 0.014*"customer" + 0.011*"help" + 0.010*"care" + 0.010*"day" + 0.009*"booking" + 0.008*"travel" + 0.007*"cancelled"
2022-05-10 23:13:28,250 : INFO : topic diff=0.139789, rho=0.234759
2022-05-10 23:13:28,292 : INFO : PROGRESS: pass 17, at document #2000/2290
2022-05-10 23:13:29,323 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-10 23:13:29,339 : INFO : topic #0 (0.333): 0.027*"flight" + 0.015*"service" + 0.013*"airport" + 0.012*"delhi" + 0.008*"staff" + 0.007*"hour" + 0.007*"ticket" + 0.006*"passenger" + 0.006*"customer" + 0.006*"airline"
2022-05-10 23:13:29,342 : INFO : topic #1 (0.333): 0.009*"flight" + 0.009*"travel" + 0.006*"international" + 0.006*"dear" + 0.006*"money" + 0.006*"airline" + 0.005*"check" + 0.005*"visit" + 0.004*"year" + 0.004*"canada"
2022-05-10 23:13:29,346 : INFO : topic #2 (0.333): 0.018*"refund" + 0.015*"ticket" + 0.014*"fligh

[(0,
  '0.027*"flight" + 0.016*"service" + 0.014*"airport" + 0.012*"delhi" + 0.009*"staff" + 0.007*"ticket" + 0.007*"hour" + 0.007*"customer" + 0.007*"passenger" + 0.006*"airline"'),
 (1,
  '0.009*"flight" + 0.009*"travel" + 0.007*"money" + 0.006*"dear" + 0.006*"international" + 0.006*"airline" + 0.005*"check" + 0.005*"canada" + 0.005*"year" + 0.004*"visit"'),
 (2,
  '0.019*"refund" + 0.015*"ticket" + 0.015*"flight" + 0.014*"customer" + 0.011*"help" + 0.010*"care" + 0.010*"day" + 0.009*"booking" + 0.008*"travel" + 0.007*"cancelled"')]

In [22]:
# With 4 topics
lda = models.LdaModel(corpus=corpus, num_topics=4, id2word=id2word, passes=20)
lda.print_topics()

2022-05-10 23:20:24,787 : INFO : using symmetric alpha at 0.25
2022-05-10 23:20:24,793 : INFO : using symmetric eta at 0.25
2022-05-10 23:20:24,815 : INFO : using serial LDA version on this node
2022-05-10 23:20:24,849 : INFO : running online (multi-pass) LDA training, 4 topics, 20 passes over the supplied corpus of 2290 documents, updating model once every 2000 documents, evaluating perplexity every 2290 documents, iterating 50x with a convergence threshold of 0.001000
2022-05-10 23:20:24,922 : INFO : PROGRESS: pass 0, at document #2000/2290
2022-05-10 23:20:28,851 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-10 23:20:28,863 : INFO : topic #0 (0.250): 0.015*"flight" + 0.011*"refund" + 0.006*"year" + 0.005*"airline" + 0.005*"moca" + 0.005*"service" + 0.004*"customer" + 0.004*"money" + 0.004*"airport" + 0.003*"waiting"
2022-05-10 23:20:28,869 : INFO : topic #1 (0.250): 0.025*"flight" + 0.007*"delhi" + 0.006*"airport" + 0.006*"passenger" + 0.006*"ti

2022-05-10 23:20:35,351 : INFO : topic #2 (0.250): 0.018*"flight" + 0.014*"travel" + 0.012*"delhi" + 0.008*"booking" + 0.008*"ticket" + 0.006*"change" + 0.006*"baggage" + 0.005*"date" + 0.005*"customer" + 0.005*"experience"
2022-05-10 23:20:35,355 : INFO : topic #3 (0.250): 0.012*"service" + 0.010*"ticket" + 0.009*"refund" + 0.008*"dear" + 0.008*"travel" + 0.007*"airline" + 0.006*"request" + 0.006*"international" + 0.005*"make" + 0.005*"visit"
2022-05-10 23:20:35,358 : INFO : topic diff=0.304577, rho=0.440867
2022-05-10 23:20:35,584 : INFO : -7.465 per-word bound, 176.7 perplexity estimate based on a held-out corpus of 290 documents with 4270 words
2022-05-10 23:20:35,584 : INFO : PROGRESS: pass 3, at document #2290/2290
2022-05-10 23:20:35,729 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-10 23:20:35,746 : INFO : topic #0 (0.250): 0.020*"refund" + 0.016*"money" + 0.015*"aiesl" + 0.013*"flight" + 0.010*"year" + 0.009*"moca" + 0.006*"company" + 0.006

2022-05-10 23:20:40,063 : INFO : topic #1 (0.250): 0.033*"flight" + 0.017*"customer" + 0.012*"service" + 0.012*"care" + 0.010*"airport" + 0.010*"day" + 0.010*"luggage" + 0.009*"help" + 0.009*"team" + 0.008*"ticket"
2022-05-10 23:20:40,063 : INFO : topic #2 (0.250): 0.019*"flight" + 0.016*"travel" + 0.014*"delhi" + 0.010*"booking" + 0.010*"ticket" + 0.006*"change" + 0.006*"date" + 0.006*"baggage" + 0.006*"flipkart" + 0.005*"website"
2022-05-10 23:20:40,063 : INFO : topic #3 (0.250): 0.011*"refund" + 0.011*"ticket" + 0.011*"service" + 0.008*"dear" + 0.008*"travel" + 0.007*"airline" + 0.006*"request" + 0.005*"make" + 0.005*"form" + 0.005*"feedback"
2022-05-10 23:20:40,063 : INFO : topic diff=0.206374, rho=0.350392
2022-05-10 23:20:40,099 : INFO : PROGRESS: pass 7, at document #2000/2290
2022-05-10 23:20:41,490 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-10 23:20:41,505 : INFO : topic #0 (0.250): 0.018*"refund" + 0.013*"money" + 0.012*"aiesl" + 0.011

2022-05-10 23:20:50,543 : INFO : topic #1 (0.250): 0.033*"flight" + 0.016*"customer" + 0.012*"service" + 0.010*"airport" + 0.010*"care" + 0.008*"day" + 0.008*"help" + 0.008*"ticket" + 0.008*"luggage" + 0.008*"team"
2022-05-10 23:20:50,548 : INFO : topic #2 (0.250): 0.019*"flight" + 0.015*"travel" + 0.013*"delhi" + 0.009*"booking" + 0.009*"ticket" + 0.006*"change" + 0.005*"date" + 0.005*"flipkart" + 0.005*"website" + 0.004*"baggage"
2022-05-10 23:20:50,552 : INFO : topic #3 (0.250): 0.009*"service" + 0.009*"ticket" + 0.009*"refund" + 0.009*"dear" + 0.008*"travel" + 0.007*"airline" + 0.006*"request" + 0.006*"international" + 0.005*"visit" + 0.005*"make"
2022-05-10 23:20:50,557 : INFO : topic diff=0.159318, rho=0.286947
2022-05-10 23:20:51,052 : INFO : -7.312 per-word bound, 158.9 perplexity estimate based on a held-out corpus of 290 documents with 4270 words
2022-05-10 23:20:51,055 : INFO : PROGRESS: pass 10, at document #2290/2290
2022-05-10 23:20:51,742 : INFO : merging changes from 29

2022-05-10 23:21:01,577 : INFO : topic #1 (0.250): 0.033*"flight" + 0.018*"customer" + 0.014*"service" + 0.011*"care" + 0.010*"airport" + 0.010*"day" + 0.009*"help" + 0.009*"luggage" + 0.009*"ticket" + 0.008*"team"
2022-05-10 23:21:01,577 : INFO : topic #2 (0.250): 0.019*"flight" + 0.016*"travel" + 0.014*"delhi" + 0.010*"booking" + 0.010*"ticket" + 0.006*"date" + 0.006*"change" + 0.006*"website" + 0.005*"flipkart" + 0.004*"thing"
2022-05-10 23:21:01,577 : INFO : topic #3 (0.250): 0.010*"refund" + 0.010*"ticket" + 0.009*"service" + 0.009*"dear" + 0.008*"travel" + 0.007*"airline" + 0.006*"request" + 0.005*"form" + 0.005*"make" + 0.005*"international"
2022-05-10 23:21:01,577 : INFO : topic diff=0.136740, rho=0.256960
2022-05-10 23:21:01,620 : INFO : PROGRESS: pass 14, at document #2000/2290
2022-05-10 23:21:02,938 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-10 23:21:02,938 : INFO : topic #0 (0.250): 0.019*"refund" + 0.014*"money" + 0.012*"aiesl" + 0

2022-05-10 23:21:11,348 : INFO : topic #1 (0.250): 0.033*"flight" + 0.016*"customer" + 0.013*"service" + 0.010*"airport" + 0.010*"care" + 0.009*"day" + 0.008*"help" + 0.008*"ticket" + 0.008*"luggage" + 0.008*"team"
2022-05-10 23:21:11,350 : INFO : topic #2 (0.250): 0.019*"flight" + 0.015*"travel" + 0.014*"delhi" + 0.009*"booking" + 0.009*"ticket" + 0.006*"change" + 0.005*"date" + 0.005*"website" + 0.005*"flipkart" + 0.004*"hour"
2022-05-10 23:21:11,353 : INFO : topic #3 (0.250): 0.009*"dear" + 0.009*"ticket" + 0.009*"service" + 0.009*"refund" + 0.008*"travel" + 0.007*"request" + 0.006*"airline" + 0.006*"international" + 0.005*"visit" + 0.005*"make"
2022-05-10 23:21:11,357 : INFO : topic diff=0.119654, rho=0.228545
2022-05-10 23:21:11,661 : INFO : -7.277 per-word bound, 155.1 perplexity estimate based on a held-out corpus of 290 documents with 4270 words
2022-05-10 23:21:11,661 : INFO : PROGRESS: pass 17, at document #2290/2290
2022-05-10 23:21:11,828 : INFO : merging changes from 290 d

[(0,
  '0.021*"refund" + 0.016*"money" + 0.014*"aiesl" + 0.011*"flight" + 0.010*"year" + 0.009*"moca" + 0.006*"waiting" + 0.006*"ticket" + 0.006*"company" + 0.006*"month"'),
 (1,
  '0.033*"flight" + 0.018*"customer" + 0.014*"service" + 0.011*"care" + 0.011*"airport" + 0.010*"day" + 0.009*"help" + 0.009*"luggage" + 0.008*"team" + 0.008*"ticket"'),
 (2,
  '0.019*"flight" + 0.016*"travel" + 0.014*"delhi" + 0.010*"booking" + 0.010*"ticket" + 0.006*"change" + 0.006*"date" + 0.006*"website" + 0.005*"flipkart" + 0.004*"thing"'),
 (3,
  '0.010*"refund" + 0.010*"ticket" + 0.009*"dear" + 0.009*"service" + 0.008*"travel" + 0.007*"airline" + 0.006*"request" + 0.005*"form" + 0.005*"international" + 0.005*"make"')]

In [15]:
# With 8 topics
lda = models.LdaModel(corpus=corpus, num_topics=8, id2word=id2word, passes=20)
lda.print_topics()

2021-11-24 13:23:39,700 : INFO : using symmetric alpha at 0.125
2021-11-24 13:23:39,702 : INFO : using symmetric eta at 0.125
2021-11-24 13:23:39,705 : INFO : using serial LDA version on this node
2021-11-24 13:23:39,717 : INFO : running online (multi-pass) LDA training, 8 topics, 20 passes over the supplied corpus of 10002 documents, updating model once every 2000 documents, evaluating perplexity every 10002 documents, iterating 50x with a convergence threshold of 0.001000
2021-11-24 13:23:39,735 : INFO : PROGRESS: pass 0, at document #2000/10002
2021-11-24 13:23:40,848 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:23:40,856 : INFO : topic #5 (0.125): 0.028*"account" + 0.020*"loan" + 0.019*"payment" + 0.012*"told" + 0.011*"bank" + 0.010*"credit" + 0.008*"called" + 0.008*"information" + 0.008*"received" + 0.007*"card"
2021-11-24 13:23:40,857 : INFO : topic #2 (0.125): 0.038*"card" + 0.026*"credit" + 0.018*"account" + 0.014*"payment" + 0.011*

2021-11-24 13:23:44,620 : INFO : topic #1 (0.125): 0.029*"account" + 0.022*"credit" + 0.011*"day" + 0.010*"sure" + 0.009*"document" + 0.009*"information" + 0.009*"application" + 0.008*"item" + 0.008*"balance" + 0.008*"request"
2021-11-24 13:23:44,621 : INFO : topic #6 (0.125): 0.050*"debt" + 0.039*"credit" + 0.030*"report" + 0.029*"account" + 0.024*"lawyer" + 0.021*"bureau" + 0.019*"proof" + 0.017*"file" + 0.017*"collection" + 0.017*"received"
2021-11-24 13:23:44,622 : INFO : topic diff=0.344331, rho=0.408248
2021-11-24 13:23:44,644 : INFO : PROGRESS: pass 1, at document #2000/10002
2021-11-24 13:23:45,506 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:23:45,514 : INFO : topic #6 (0.125): 0.045*"debt" + 0.038*"credit" + 0.030*"report" + 0.027*"account" + 0.020*"lawyer" + 0.019*"bureau" + 0.017*"proof" + 0.016*"received" + 0.016*"collection" + 0.015*"file"
2021-11-24 13:23:45,515 : INFO : topic #7 (0.125): 0.056*"bank" + 0.056*"account" + 0.03

2021-11-24 13:23:48,699 : INFO : topic #2 (0.125): 0.078*"card" + 0.051*"credit" + 0.023*"charge" + 0.012*"account" + 0.009*"dispute" + 0.009*"purchase" + 0.008*"citi" + 0.008*"bank" + 0.008*"capital" + 0.008*"transaction"
2021-11-24 13:23:48,700 : INFO : topic diff=0.354761, rho=0.377937
2021-11-24 13:23:48,718 : INFO : PROGRESS: pass 2, at document #2000/10002
2021-11-24 13:23:49,518 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:23:49,525 : INFO : topic #4 (0.125): 0.058*"account" + 0.036*"payment" + 0.021*"credit" + 0.015*"late" + 0.013*"chase" + 0.011*"balance" + 0.010*"received" + 0.010*"closed" + 0.009*"customer" + 0.009*"paid"
2021-11-24 13:23:49,526 : INFO : topic #3 (0.125): 0.020*"letter" + 0.017*"received" + 0.012*"claim" + 0.012*"document" + 0.012*"sent" + 0.010*"email" + 0.009*"information" + 0.008*"provided" + 0.008*"number" + 0.007*"documentation"
2021-11-24 13:23:49,527 : INFO : topic #6 (0.125): 0.050*"debt" + 0.040*"account

2021-11-24 13:23:52,566 : INFO : topic diff=0.380069, rho=0.353531
2021-11-24 13:23:52,583 : INFO : PROGRESS: pass 3, at document #2000/10002
2021-11-24 13:23:53,283 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:23:53,290 : INFO : topic #4 (0.125): 0.049*"account" + 0.035*"payment" + 0.022*"credit" + 0.014*"late" + 0.013*"balance" + 0.011*"chase" + 0.010*"called" + 0.010*"paid" + 0.009*"received" + 0.009*"phone"
2021-11-24 13:23:53,291 : INFO : topic #5 (0.125): 0.055*"payment" + 0.043*"loan" + 0.016*"told" + 0.016*"mortgage" + 0.013*"home" + 0.011*"month" + 0.011*"year" + 0.009*"frequent" + 0.009*"company" + 0.007*"paid"
2021-11-24 13:23:53,291 : INFO : topic #1 (0.125): 0.033*"usaa" + 0.031*"credit" + 0.025*"account" + 0.020*"day" + 0.013*"federal" + 0.012*"fair" + 0.011*"document" + 0.011*"accordance" + 0.011*"sure" + 0.011*"item"
2021-11-24 13:23:53,292 : INFO : topic #2 (0.125): 0.074*"card" + 0.043*"credit" + 0.026*"charge" + 0.011*"ci

2021-11-24 13:23:56,145 : INFO : PROGRESS: pass 4, at document #2000/10002
2021-11-24 13:23:56,817 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:23:56,824 : INFO : topic #3 (0.125): 0.021*"letter" + 0.017*"received" + 0.013*"sent" + 0.012*"information" + 0.012*"number" + 0.011*"document" + 0.011*"email" + 0.010*"claim" + 0.009*"phone" + 0.009*"address"
2021-11-24 13:23:56,825 : INFO : topic #7 (0.125): 0.070*"account" + 0.065*"bank" + 0.031*"check" + 0.021*"money" + 0.021*"fund" + 0.014*"deposit" + 0.014*"america" + 0.013*"checking" + 0.011*"branch" + 0.011*"told"
2021-11-24 13:23:56,826 : INFO : topic #2 (0.125): 0.075*"card" + 0.042*"credit" + 0.027*"charge" + 0.011*"citi" + 0.010*"dispute" + 0.010*"purchase" + 0.009*"transaction" + 0.008*"fraud" + 0.008*"told" + 0.008*"merchant"
2021-11-24 13:23:56,826 : INFO : topic #6 (0.125): 0.054*"debt" + 0.046*"account" + 0.044*"lawyer" + 0.039*"credit" + 0.034*"report" + 0.021*"bureau" + 0.020*"rec

2021-11-24 13:24:00,269 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:00,277 : INFO : topic #0 (0.125): 0.073*"didn" + 0.057*"bank" + 0.054*"account" + 0.053*"credit" + 0.038*"called" + 0.038*"told" + 0.037*"item" + 0.036*"card" + 0.035*"refunded" + 0.019*"charged"
2021-11-24 13:24:00,278 : INFO : topic #6 (0.125): 0.055*"debt" + 0.047*"account" + 0.044*"lawyer" + 0.040*"credit" + 0.035*"report" + 0.021*"bureau" + 0.020*"received" + 0.019*"proof" + 0.019*"file" + 0.019*"collection"
2021-11-24 13:24:00,279 : INFO : topic #2 (0.125): 0.076*"card" + 0.042*"credit" + 0.028*"charge" + 0.012*"citi" + 0.011*"dispute" + 0.011*"purchase" + 0.009*"transaction" + 0.009*"fraud" + 0.008*"capital" + 0.008*"merchant"
2021-11-24 13:24:00,279 : INFO : topic #4 (0.125): 0.040*"account" + 0.034*"payment" + 0.022*"credit" + 0.015*"called" + 0.014*"balance" + 0.013*"late" + 0.011*"paid" + 0.011*"phone" + 0.010*"month" + 0.010*"told"
2021-11-24 13:24:00,280 : 

2021-11-24 13:24:03,696 : INFO : topic #1 (0.125): 0.036*"usaa" + 0.034*"credit" + 0.026*"account" + 0.021*"day" + 0.015*"federal" + 0.013*"fair" + 0.012*"item" + 0.011*"accordance" + 0.011*"sure" + 0.011*"document"
2021-11-24 13:24:03,697 : INFO : topic #4 (0.125): 0.038*"account" + 0.034*"payment" + 0.022*"credit" + 0.016*"called" + 0.014*"balance" + 0.013*"late" + 0.012*"paid" + 0.012*"phone" + 0.011*"told" + 0.011*"month"
2021-11-24 13:24:03,697 : INFO : topic #2 (0.125): 0.077*"card" + 0.043*"credit" + 0.028*"charge" + 0.012*"citi" + 0.011*"dispute" + 0.011*"purchase" + 0.009*"transaction" + 0.009*"fraud" + 0.009*"capital" + 0.008*"merchant"
2021-11-24 13:24:03,698 : INFO : topic #0 (0.125): 0.074*"didn" + 0.057*"bank" + 0.054*"account" + 0.054*"credit" + 0.037*"told" + 0.037*"called" + 0.037*"item" + 0.036*"card" + 0.035*"refunded" + 0.019*"charged"
2021-11-24 13:24:03,699 : INFO : topic #7 (0.125): 0.077*"account" + 0.068*"bank" + 0.031*"check" + 0.022*"money" + 0.021*"fund" + 0

2021-11-24 13:24:07,174 : INFO : topic #2 (0.125): 0.078*"card" + 0.043*"credit" + 0.028*"charge" + 0.012*"citi" + 0.011*"dispute" + 0.011*"purchase" + 0.009*"transaction" + 0.009*"fraud" + 0.009*"capital" + 0.009*"merchant"
2021-11-24 13:24:07,175 : INFO : topic #1 (0.125): 0.036*"usaa" + 0.035*"credit" + 0.026*"account" + 0.021*"day" + 0.015*"federal" + 0.013*"fair" + 0.012*"item" + 0.012*"accordance" + 0.011*"sure" + 0.011*"document"
2021-11-24 13:24:07,176 : INFO : topic #5 (0.125): 0.049*"loan" + 0.037*"payment" + 0.019*"mortgage" + 0.012*"year" + 0.011*"month" + 0.009*"company" + 0.009*"told" + 0.008*"home" + 0.008*"insurance" + 0.007*"paid"
2021-11-24 13:24:07,177 : INFO : topic #0 (0.125): 0.074*"didn" + 0.057*"bank" + 0.054*"account" + 0.054*"credit" + 0.037*"item" + 0.037*"told" + 0.037*"called" + 0.036*"card" + 0.035*"refunded" + 0.019*"charged"
2021-11-24 13:24:07,178 : INFO : topic diff=0.243408, rho=0.277339
2021-11-24 13:24:07,201 : INFO : PROGRESS: pass 7, at document #

2021-11-24 13:24:10,400 : INFO : topic #7 (0.125): 0.081*"account" + 0.069*"bank" + 0.031*"check" + 0.022*"money" + 0.021*"fund" + 0.014*"america" + 0.014*"deposit" + 0.013*"checking" + 0.011*"transaction" + 0.011*"told"
2021-11-24 13:24:10,400 : INFO : topic #3 (0.125): 0.023*"letter" + 0.018*"received" + 0.016*"information" + 0.015*"sent" + 0.013*"number" + 0.010*"address" + 0.010*"email" + 0.010*"document" + 0.009*"claim" + 0.009*"phone"
2021-11-24 13:24:10,401 : INFO : topic #5 (0.125): 0.050*"loan" + 0.036*"payment" + 0.020*"mortgage" + 0.012*"year" + 0.011*"month" + 0.009*"company" + 0.008*"home" + 0.008*"told" + 0.008*"insurance" + 0.007*"paid"
2021-11-24 13:24:10,402 : INFO : topic diff=0.230141, rho=0.267252
2021-11-24 13:24:10,425 : INFO : PROGRESS: pass 8, at document #4000/10002
2021-11-24 13:24:11,032 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:11,039 : INFO : topic #5 (0.125): 0.051*"loan" + 0.034*"payment" + 0.021*"mortga

2021-11-24 13:24:13,592 : INFO : topic #7 (0.125): 0.083*"account" + 0.070*"bank" + 0.031*"check" + 0.021*"money" + 0.021*"fund" + 0.013*"america" + 0.013*"deposit" + 0.013*"checking" + 0.011*"transaction" + 0.011*"branch"
2021-11-24 13:24:13,593 : INFO : topic #1 (0.125): 0.035*"credit" + 0.035*"usaa" + 0.026*"account" + 0.021*"day" + 0.015*"federal" + 0.013*"fair" + 0.013*"item" + 0.012*"accordance" + 0.012*"sure" + 0.011*"document"
2021-11-24 13:24:13,594 : INFO : topic diff=0.218456, rho=0.258190
2021-11-24 13:24:13,618 : INFO : PROGRESS: pass 9, at document #4000/10002
2021-11-24 13:24:14,230 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:14,238 : INFO : topic #7 (0.125): 0.083*"account" + 0.070*"bank" + 0.030*"check" + 0.022*"money" + 0.021*"fund" + 0.013*"deposit" + 0.013*"america" + 0.013*"checking" + 0.012*"transaction" + 0.011*"branch"
2021-11-24 13:24:14,239 : INFO : topic #4 (0.125): 0.037*"payment" + 0.032*"account" + 0.020*"c

2021-11-24 13:24:16,783 : INFO : topic #7 (0.125): 0.084*"account" + 0.070*"bank" + 0.031*"check" + 0.021*"money" + 0.021*"fund" + 0.013*"america" + 0.013*"deposit" + 0.013*"checking" + 0.011*"transaction" + 0.011*"branch"
2021-11-24 13:24:16,784 : INFO : topic diff=0.208306, rho=0.249992
2021-11-24 13:24:16,808 : INFO : PROGRESS: pass 10, at document #4000/10002
2021-11-24 13:24:17,402 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:17,410 : INFO : topic #3 (0.125): 0.023*"letter" + 0.018*"information" + 0.017*"received" + 0.016*"sent" + 0.013*"number" + 0.010*"email" + 0.010*"address" + 0.010*"document" + 0.009*"claim" + 0.009*"complaint"
2021-11-24 13:24:17,410 : INFO : topic #0 (0.125): 0.074*"didn" + 0.058*"bank" + 0.053*"account" + 0.053*"credit" + 0.037*"told" + 0.037*"item" + 0.037*"called" + 0.035*"card" + 0.034*"refunded" + 0.019*"charged"
2021-11-24 13:24:17,410 : INFO : topic #6 (0.125): 0.067*"debt" + 0.046*"account" + 0.042*"c

2021-11-24 13:24:19,886 : INFO : topic #6 (0.125): 0.061*"debt" + 0.049*"account" + 0.043*"lawyer" + 0.042*"credit" + 0.037*"report" + 0.033*"payment" + 0.022*"bureau" + 0.021*"collection" + 0.020*"received" + 0.020*"proof"
2021-11-24 13:24:19,887 : INFO : topic diff=0.199208, rho=0.242528
2021-11-24 13:24:19,911 : INFO : PROGRESS: pass 11, at document #4000/10002
2021-11-24 13:24:20,507 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:20,515 : INFO : topic #4 (0.125): 0.038*"payment" + 0.030*"account" + 0.019*"called" + 0.019*"credit" + 0.017*"told" + 0.013*"phone" + 0.012*"late" + 0.012*"balance" + 0.012*"paid" + 0.012*"month"
2021-11-24 13:24:20,516 : INFO : topic #2 (0.125): 0.081*"card" + 0.045*"credit" + 0.030*"charge" + 0.013*"purchase" + 0.011*"dispute" + 0.011*"citi" + 0.010*"fraud" + 0.010*"capital" + 0.009*"transaction" + 0.009*"account"
2021-11-24 13:24:20,517 : INFO : topic #3 (0.125): 0.023*"letter" + 0.019*"information" + 0.01

2021-11-24 13:24:23,102 : INFO : topic #5 (0.125): 0.051*"loan" + 0.033*"payment" + 0.021*"mortgage" + 0.012*"year" + 0.011*"month" + 0.009*"home" + 0.008*"company" + 0.008*"insurance" + 0.007*"rate" + 0.007*"paid"
2021-11-24 13:24:23,104 : INFO : topic diff=0.191131, rho=0.235696
2021-11-24 13:24:23,127 : INFO : PROGRESS: pass 12, at document #4000/10002
2021-11-24 13:24:23,723 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:23,730 : INFO : topic #1 (0.125): 0.036*"credit" + 0.032*"usaa" + 0.026*"account" + 0.022*"day" + 0.014*"federal" + 0.014*"fair" + 0.013*"item" + 0.012*"accordance" + 0.011*"sure" + 0.011*"document"
2021-11-24 13:24:23,731 : INFO : topic #4 (0.125): 0.038*"payment" + 0.029*"account" + 0.020*"called" + 0.018*"credit" + 0.017*"told" + 0.013*"phone" + 0.012*"late" + 0.012*"balance" + 0.012*"paid" + 0.012*"month"
2021-11-24 13:24:23,732 : INFO : topic #0 (0.125): 0.074*"didn" + 0.058*"bank" + 0.053*"account" + 0.053*"credi

2021-11-24 13:24:26,203 : INFO : topic #6 (0.125): 0.063*"debt" + 0.050*"account" + 0.043*"lawyer" + 0.043*"credit" + 0.038*"report" + 0.033*"payment" + 0.022*"bureau" + 0.022*"collection" + 0.020*"received" + 0.020*"proof"
2021-11-24 13:24:26,204 : INFO : topic diff=0.183924, rho=0.229410
2021-11-24 13:24:26,227 : INFO : PROGRESS: pass 13, at document #4000/10002
2021-11-24 13:24:26,811 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:26,818 : INFO : topic #4 (0.125): 0.038*"payment" + 0.028*"account" + 0.020*"called" + 0.018*"credit" + 0.018*"told" + 0.013*"phone" + 0.012*"late" + 0.012*"paid" + 0.012*"balance" + 0.012*"month"
2021-11-24 13:24:26,819 : INFO : topic #0 (0.125): 0.074*"didn" + 0.058*"bank" + 0.053*"account" + 0.053*"credit" + 0.037*"item" + 0.037*"told" + 0.037*"called" + 0.035*"card" + 0.034*"refunded" + 0.019*"charged"
2021-11-24 13:24:26,820 : INFO : topic #6 (0.125): 0.069*"debt" + 0.048*"account" + 0.044*"credit" + 0.03

2021-11-24 13:24:29,330 : INFO : topic diff=0.177557, rho=0.223601
2021-11-24 13:24:29,356 : INFO : PROGRESS: pass 14, at document #4000/10002
2021-11-24 13:24:29,932 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:29,939 : INFO : topic #2 (0.125): 0.081*"card" + 0.046*"credit" + 0.030*"charge" + 0.013*"purchase" + 0.011*"dispute" + 0.011*"citi" + 0.010*"capital" + 0.010*"fraud" + 0.009*"account" + 0.009*"transaction"
2021-11-24 13:24:29,940 : INFO : topic #3 (0.125): 0.024*"letter" + 0.020*"information" + 0.017*"received" + 0.016*"sent" + 0.012*"number" + 0.011*"address" + 0.010*"email" + 0.010*"document" + 0.009*"company" + 0.009*"claim"
2021-11-24 13:24:29,941 : INFO : topic #6 (0.125): 0.070*"debt" + 0.048*"account" + 0.044*"credit" + 0.039*"report" + 0.036*"lawyer" + 0.029*"payment" + 0.025*"collection" + 0.021*"bureau" + 0.019*"received" + 0.018*"proof"
2021-11-24 13:24:29,942 : INFO : topic #7 (0.125): 0.087*"account" + 0.071*"bank" 

2021-11-24 13:24:32,733 : INFO : PROGRESS: pass 15, at document #4000/10002
2021-11-24 13:24:33,447 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:33,455 : INFO : topic #1 (0.125): 0.036*"credit" + 0.032*"usaa" + 0.026*"account" + 0.022*"day" + 0.014*"fair" + 0.014*"federal" + 0.013*"item" + 0.012*"accordance" + 0.012*"sure" + 0.011*"document"
2021-11-24 13:24:33,456 : INFO : topic #2 (0.125): 0.082*"card" + 0.046*"credit" + 0.030*"charge" + 0.013*"purchase" + 0.011*"citi" + 0.011*"dispute" + 0.010*"capital" + 0.010*"fraud" + 0.010*"account" + 0.009*"transaction"
2021-11-24 13:24:33,457 : INFO : topic #7 (0.125): 0.087*"account" + 0.071*"bank" + 0.030*"check" + 0.021*"fund" + 0.021*"money" + 0.013*"america" + 0.013*"deposit" + 0.013*"checking" + 0.012*"transaction" + 0.011*"branch"
2021-11-24 13:24:33,457 : INFO : topic #4 (0.125): 0.037*"payment" + 0.027*"account" + 0.020*"called" + 0.018*"told" + 0.017*"credit" + 0.014*"phone" + 0.012*"l

2021-11-24 13:24:35,922 : INFO : PROGRESS: pass 16, at document #4000/10002
2021-11-24 13:24:36,489 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:36,496 : INFO : topic #0 (0.125): 0.074*"didn" + 0.058*"bank" + 0.053*"account" + 0.053*"credit" + 0.037*"item" + 0.037*"told" + 0.037*"called" + 0.035*"card" + 0.034*"refunded" + 0.019*"charged"
2021-11-24 13:24:36,497 : INFO : topic #3 (0.125): 0.024*"letter" + 0.020*"information" + 0.017*"received" + 0.016*"sent" + 0.012*"number" + 0.011*"address" + 0.010*"document" + 0.009*"company" + 0.009*"email" + 0.009*"claim"
2021-11-24 13:24:36,497 : INFO : topic #4 (0.125): 0.037*"payment" + 0.026*"account" + 0.021*"called" + 0.019*"told" + 0.017*"credit" + 0.014*"phone" + 0.012*"late" + 0.012*"paid" + 0.012*"month" + 0.011*"balance"
2021-11-24 13:24:36,498 : INFO : topic #1 (0.125): 0.036*"credit" + 0.032*"usaa" + 0.026*"account" + 0.022*"day" + 0.014*"fair" + 0.014*"federal" + 0.013*"item" + 0.012*"

2021-11-24 13:24:39,519 : INFO : merging changes from 2000 documents into a model of 10002 documents
2021-11-24 13:24:39,526 : INFO : topic #1 (0.125): 0.037*"credit" + 0.032*"usaa" + 0.026*"account" + 0.022*"day" + 0.014*"fair" + 0.014*"federal" + 0.013*"item" + 0.012*"accordance" + 0.012*"sure" + 0.011*"document"
2021-11-24 13:24:39,527 : INFO : topic #5 (0.125): 0.053*"loan" + 0.032*"payment" + 0.022*"mortgage" + 0.012*"year" + 0.011*"month" + 0.010*"home" + 0.008*"insurance" + 0.008*"company" + 0.007*"rate" + 0.007*"paid"
2021-11-24 13:24:39,527 : INFO : topic #0 (0.125): 0.074*"didn" + 0.058*"bank" + 0.053*"account" + 0.053*"credit" + 0.037*"item" + 0.037*"told" + 0.037*"called" + 0.035*"card" + 0.034*"refunded" + 0.019*"charged"
2021-11-24 13:24:39,528 : INFO : topic #3 (0.125): 0.024*"letter" + 0.021*"information" + 0.017*"received" + 0.016*"sent" + 0.012*"number" + 0.011*"address" + 0.010*"company" + 0.010*"document" + 0.009*"email" + 0.009*"provide"
2021-11-24 13:24:39,529 : I

2021-11-24 13:24:42,595 : INFO : topic #4 (0.125): 0.037*"payment" + 0.025*"account" + 0.021*"called" + 0.019*"told" + 0.016*"credit" + 0.014*"phone" + 0.012*"late" + 0.012*"paid" + 0.012*"month" + 0.011*"balance"
2021-11-24 13:24:42,595 : INFO : topic #6 (0.125): 0.072*"debt" + 0.050*"account" + 0.046*"credit" + 0.040*"report" + 0.037*"lawyer" + 0.030*"payment" + 0.026*"collection" + 0.021*"bureau" + 0.019*"received" + 0.018*"proof"
2021-11-24 13:24:42,596 : INFO : topic #2 (0.125): 0.082*"card" + 0.047*"credit" + 0.030*"charge" + 0.012*"purchase" + 0.011*"citi" + 0.011*"dispute" + 0.011*"capital" + 0.010*"account" + 0.010*"fraud" + 0.009*"transaction"
2021-11-24 13:24:42,597 : INFO : topic #7 (0.125): 0.088*"account" + 0.072*"bank" + 0.030*"check" + 0.021*"fund" + 0.021*"money" + 0.013*"checking" + 0.013*"america" + 0.013*"deposit" + 0.012*"transaction" + 0.011*"branch"
2021-11-24 13:24:42,597 : INFO : topic #5 (0.125): 0.053*"loan" + 0.032*"payment" + 0.022*"mortgage" + 0.012*"year"

2021-11-24 13:24:45,635 : INFO : topic #7 (0.125): 0.089*"account" + 0.072*"bank" + 0.030*"check" + 0.021*"fund" + 0.021*"money" + 0.013*"checking" + 0.013*"america" + 0.013*"deposit" + 0.012*"transaction" + 0.011*"branch"
2021-11-24 13:24:45,636 : INFO : topic #3 (0.125): 0.024*"letter" + 0.021*"information" + 0.016*"received" + 0.016*"sent" + 0.011*"number" + 0.011*"address" + 0.010*"company" + 0.009*"document" + 0.009*"provide" + 0.009*"claim"
2021-11-24 13:24:45,636 : INFO : topic #4 (0.125): 0.037*"payment" + 0.024*"account" + 0.021*"called" + 0.019*"told" + 0.016*"credit" + 0.014*"phone" + 0.012*"paid" + 0.012*"late" + 0.012*"month" + 0.011*"balance"
2021-11-24 13:24:45,637 : INFO : topic #1 (0.125): 0.037*"credit" + 0.032*"usaa" + 0.026*"account" + 0.022*"day" + 0.014*"fair" + 0.013*"federal" + 0.013*"item" + 0.012*"accordance" + 0.012*"sure" + 0.011*"document"
2021-11-24 13:24:45,638 : INFO : topic diff=0.103590, rho=0.199996
2021-11-24 13:24:45,665 : INFO : PROGRESS: pass 19, 

2021-11-24 13:24:47,563 : INFO : topic #7 (0.125): 0.088*"account" + 0.072*"bank" + 0.031*"check" + 0.021*"fund" + 0.020*"money" + 0.014*"checking" + 0.014*"america" + 0.013*"deposit" + 0.011*"chase" + 0.011*"transaction"


[(0,
  '0.075*"didn" + 0.057*"bank" + 0.055*"account" + 0.055*"credit" + 0.037*"item" + 0.037*"called" + 0.037*"told" + 0.036*"card" + 0.036*"refunded" + 0.019*"charged"'),
 (1,
  '0.036*"credit" + 0.028*"account" + 0.025*"usaa" + 0.019*"day" + 0.014*"federal" + 0.014*"sure" + 0.014*"fair" + 0.013*"demand" + 0.013*"item" + 0.012*"accordance"'),
 (2,
  '0.081*"card" + 0.047*"credit" + 0.030*"charge" + 0.011*"purchase" + 0.011*"capital" + 0.011*"account" + 0.010*"citi" + 0.010*"dispute" + 0.009*"fraud" + 0.009*"transaction"'),
 (3,
  '0.024*"letter" + 0.021*"information" + 0.016*"sent" + 0.016*"received" + 0.011*"number" + 0.011*"address" + 0.011*"company" + 0.010*"provide" + 0.009*"document" + 0.009*"claim"'),
 (4,
  '0.036*"payment" + 0.023*"account" + 0.021*"called" + 0.019*"told" + 0.016*"credit" + 0.014*"phone" + 0.013*"paid" + 0.012*"late" + 0.011*"month" + 0.011*"balance"'),
 (5,
  '0.052*"loan" + 0.033*"payment" + 0.025*"mortgage" + 0.012*"year" + 0.010*"month" + 0.010*"home" + 0

# Attempt 2 - Pull only the Nouns

In [7]:
# Let's create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

In [8]:
# Apply the nouns function to the transcripts to filter only on nouns
data_nouns = pd.DataFrame(data['Tweet_Text'].apply(nouns))
data_nouns

Unnamed: 0,Tweet_Text
0,hemendu @ JM_Scindia @ PMOIndia @ AAI_Official...
1,Heard lot @ airindiain reason ticket IN trip s...
2,RT @ Dastanagoi service makemytrip AirIndia tr...
3,virsanghvi @ airindiain @ IndiGo6E @ TataCompa...
4,RT @ madhukishwar AirIndia behaving Gharwapsi ...
...,...
2285,AirIndia Express years service LCC SouthEast A...
2286,@ GnaniGnaneshan @ rangaba @ flysrilankan @ ch...
2287,sayantan05cts Mr. Chakraborty https //t.co/Zaj...
2288,DENISH918 @ airindiain flight confusion timing...


In [9]:
lem_transcript_n = []
from nltk.stem.wordnet import WordNetLemmatizer 
import re
for i in range(0, data.shape[0]):
    #Remove punctuations
    text = re.sub('[^a-zA-Z]', ' ', str(data_nouns['Tweet_Text'][i]))
    
    #Convert to lowercase
    text = text.lower()
    ##Convert to list from string
    text = text.split()
    ##Lemmatizing
    lm = WordNetLemmatizer() 
    
    text = [lm.lemmatize(word) for word in text if (len(word) > 3 and len(word.strip('xx/')) > 2)] 
    text = " ".join(text)
    lem_transcript_n.append(text)

In [19]:
#lem_transcript_n[1:10]

In [18]:
# Create a new document-term matrix using only nouns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer

# Re-add the additional stop words since we are recreating the document-term matrix
add_stop_words = ["aircraftmechan","aiesl","rntata","tatacompanies","tata","http","scindia","airindiain","airindia","air","india",'like', 'im', 'know', 'just', 'dont', 'thats', 'right', 'people',
                  'youre', 'got', 'gonna', 'time', 'think', 'yeah', 'said', 'x','xx', 'xxx','xxxx','xxxxx']
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate a document-term matrix with only nouns
cvn = CountVectorizer(stop_words=stop_words)
data_cvn = cvn.fit_transform(lem_transcript_n)
data_dtmn = pd.DataFrame(data_cvn.toarray(), columns=cvn.get_feature_names())
data_dtmn.index = data_nouns.index
data_dtmn

Unnamed: 0,aaib,aaiblgairport,aaibpiairport,aaiixsairport,aaijamairport,aaipunairport,aairhqsr,aaistvairport,aaivjaairport,aajtak,...,zsneakerheadz,zueksakrv,zueznpctr,zump,zuqxsxb,zurich,zvcoxgw,zvxmclfyvw,zyadaahorahahai,zywizs
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2285,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2287,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2288,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# Create the gensim corpus
corpusn = matutils.Sparse2Corpus(scipy.sparse.csr_matrix(data_dtmn.transpose()))

# Create the vocabulary dictionary
id2wordn = dict((v, k) for k, v in cvn.vocabulary_.items())

In [24]:
# Let's start with 3 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=3, id2word=id2wordn, passes=30)
ldan.print_topics()

2022-05-11 00:15:21,825 : INFO : using symmetric alpha at 0.3333333333333333
2022-05-11 00:15:21,831 : INFO : using symmetric eta at 0.3333333333333333
2022-05-11 00:15:21,849 : INFO : using serial LDA version on this node
2022-05-11 00:15:21,870 : INFO : running online (multi-pass) LDA training, 3 topics, 30 passes over the supplied corpus of 2290 documents, updating model once every 2000 documents, evaluating perplexity every 2290 documents, iterating 50x with a convergence threshold of 0.001000
2022-05-11 00:15:21,957 : INFO : PROGRESS: pass 0, at document #2000/2290
2022-05-11 00:15:24,806 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-11 00:15:24,806 : INFO : topic #0 (0.333): 0.017*"flight" + 0.011*"service" + 0.008*"delhi" + 0.007*"day" + 0.006*"month" + 0.006*"hour" + 0.006*"airline" + 0.005*"mumbai" + 0.005*"refund" + 0.005*"customer"
2022-05-11 00:15:24,817 : INFO : topic #1 (0.333): 0.025*"flight" + 0.012*"delhi" + 0.010*"dear" + 0.008*"t

2022-05-11 00:15:31,213 : INFO : topic diff=0.302180, rho=0.440867
2022-05-11 00:15:31,252 : INFO : PROGRESS: pass 4, at document #2000/2290
2022-05-11 00:15:32,222 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-11 00:15:32,237 : INFO : topic #0 (0.333): 0.016*"customer" + 0.012*"care" + 0.010*"team" + 0.010*"service" + 0.009*"info" + 0.007*"mylauggage" + 0.007*"aircraft" + 0.007*"moca" + 0.007*"tracker" + 0.006*"dear"
2022-05-11 00:15:32,241 : INFO : topic #1 (0.333): 0.029*"flight" + 0.016*"delhi" + 0.014*"baggage" + 0.013*"service" + 0.012*"airport" + 0.012*"staff" + 0.011*"dear" + 0.009*"travel" + 0.008*"dgcaindia" + 0.006*"parent"
2022-05-11 00:15:32,244 : INFO : topic #2 (0.333): 0.041*"flight" + 0.028*"ticket" + 0.024*"refund" + 0.013*"customer" + 0.010*"airline" + 0.009*"travel" + 0.009*"service" + 0.008*"issue" + 0.008*"month" + 0.007*"website"
2022-05-11 00:15:32,248 : INFO : topic diff=0.276986, rho=0.403403
2022-05-11 00:15:32,485 : INFO

2022-05-11 00:15:37,649 : INFO : topic #1 (0.333): 0.031*"flight" + 0.017*"delhi" + 0.015*"baggage" + 0.014*"service" + 0.014*"staff" + 0.013*"airport" + 0.010*"dear" + 0.009*"dgcaindia" + 0.009*"travel" + 0.007*"parent"
2022-05-11 00:15:37,666 : INFO : topic #2 (0.333): 0.041*"flight" + 0.031*"ticket" + 0.024*"refund" + 0.014*"customer" + 0.010*"travel" + 0.009*"airline" + 0.009*"issue" + 0.009*"service" + 0.008*"month" + 0.008*"money"
2022-05-11 00:15:37,669 : INFO : topic diff=0.192349, rho=0.313960
2022-05-11 00:15:37,872 : INFO : -6.937 per-word bound, 122.5 perplexity estimate based on a held-out corpus of 290 documents with 2565 words
2022-05-11 00:15:37,872 : INFO : PROGRESS: pass 8, at document #2290/2290
2022-05-11 00:15:38,007 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-11 00:15:38,007 : INFO : topic #0 (0.333): 0.021*"customer" + 0.017*"care" + 0.013*"info" + 0.013*"team" + 0.010*"service" + 0.010*"mylauggage" + 0.009*"tracker" + 0.008

2022-05-11 00:15:42,042 : INFO : topic diff=0.154772, rho=0.265888
2022-05-11 00:15:42,205 : INFO : -6.913 per-word bound, 120.5 perplexity estimate based on a held-out corpus of 290 documents with 2565 words
2022-05-11 00:15:42,205 : INFO : PROGRESS: pass 12, at document #2290/2290
2022-05-11 00:15:42,325 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-11 00:15:42,325 : INFO : topic #0 (0.333): 0.021*"customer" + 0.017*"care" + 0.013*"info" + 0.013*"team" + 0.010*"service" + 0.010*"mylauggage" + 0.009*"tracker" + 0.008*"dear" + 0.008*"moca" + 0.007*"aircraft"
2022-05-11 00:15:42,331 : INFO : topic #1 (0.333): 0.032*"flight" + 0.018*"baggage" + 0.017*"delhi" + 0.016*"airport" + 0.016*"staff" + 0.015*"service" + 0.010*"dgcaindia" + 0.010*"dear" + 0.008*"parent" + 0.008*"travel"
2022-05-11 00:15:42,334 : INFO : topic #2 (0.333): 0.045*"flight" + 0.035*"ticket" + 0.027*"refund" + 0.015*"customer" + 0.011*"travel" + 0.009*"issue" + 0.009*"money" + 0.009*"

2022-05-11 00:15:46,593 : INFO : topic #1 (0.333): 0.032*"flight" + 0.018*"baggage" + 0.017*"delhi" + 0.017*"airport" + 0.016*"staff" + 0.015*"service" + 0.010*"dgcaindia" + 0.010*"dear" + 0.008*"parent" + 0.008*"travel"
2022-05-11 00:15:46,598 : INFO : topic #2 (0.333): 0.045*"flight" + 0.034*"ticket" + 0.027*"refund" + 0.015*"customer" + 0.011*"travel" + 0.010*"issue" + 0.010*"money" + 0.009*"service" + 0.009*"hour" + 0.009*"website"
2022-05-11 00:15:46,599 : INFO : topic diff=0.129744, rho=0.234759
2022-05-11 00:15:46,622 : INFO : PROGRESS: pass 17, at document #2000/2290
2022-05-11 00:15:47,241 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-11 00:15:47,241 : INFO : topic #0 (0.333): 0.018*"customer" + 0.014*"care" + 0.011*"team" + 0.011*"info" + 0.009*"service" + 0.008*"mylauggage" + 0.008*"dear" + 0.008*"moca" + 0.007*"tracker" + 0.007*"aircraft"
2022-05-11 00:15:47,249 : INFO : topic #1 (0.333): 0.032*"flight" + 0.017*"delhi" + 0.016*"airport"

2022-05-11 00:15:50,548 : INFO : topic diff=0.115441, rho=0.212502
2022-05-11 00:15:50,616 : INFO : PROGRESS: pass 21, at document #2000/2290
2022-05-11 00:15:51,243 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-11 00:15:51,243 : INFO : topic #0 (0.333): 0.018*"customer" + 0.014*"care" + 0.012*"team" + 0.011*"info" + 0.009*"service" + 0.008*"mylauggage" + 0.008*"dear" + 0.008*"moca" + 0.008*"tracker" + 0.007*"aircraft"
2022-05-11 00:15:51,261 : INFO : topic #1 (0.333): 0.032*"flight" + 0.018*"delhi" + 0.016*"airport" + 0.016*"baggage" + 0.015*"staff" + 0.014*"service" + 0.010*"dear" + 0.009*"dgcaindia" + 0.008*"travel" + 0.007*"parent"
2022-05-11 00:15:51,263 : INFO : topic #2 (0.333): 0.042*"flight" + 0.032*"ticket" + 0.025*"refund" + 0.014*"customer" + 0.010*"travel" + 0.010*"issue" + 0.009*"service" + 0.009*"hour" + 0.009*"airline" + 0.009*"money"
2022-05-11 00:15:51,265 : INFO : topic diff=0.113906, rho=0.207860
2022-05-11 00:15:51,423 : INFO :

2022-05-11 00:15:55,083 : INFO : topic #1 (0.333): 0.032*"flight" + 0.018*"delhi" + 0.017*"airport" + 0.016*"baggage" + 0.015*"staff" + 0.014*"service" + 0.010*"dear" + 0.009*"dgcaindia" + 0.008*"travel" + 0.007*"parent"
2022-05-11 00:15:55,093 : INFO : topic #2 (0.333): 0.042*"flight" + 0.032*"ticket" + 0.025*"refund" + 0.014*"customer" + 0.010*"travel" + 0.010*"issue" + 0.009*"service" + 0.009*"hour" + 0.009*"money" + 0.009*"airline"
2022-05-11 00:15:55,095 : INFO : topic diff=0.103754, rho=0.191935
2022-05-11 00:15:55,272 : INFO : -6.881 per-word bound, 117.8 perplexity estimate based on a held-out corpus of 290 documents with 2565 words
2022-05-11 00:15:55,272 : INFO : PROGRESS: pass 25, at document #2290/2290
2022-05-11 00:15:55,376 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-11 00:15:55,376 : INFO : topic #0 (0.333): 0.021*"customer" + 0.016*"care" + 0.013*"team" + 0.013*"info" + 0.009*"mylauggage" + 0.009*"tracker" + 0.009*"service" + 0.008

2022-05-11 00:15:58,850 : INFO : topic diff=0.095901, rho=0.179187
2022-05-11 00:15:59,022 : INFO : -6.873 per-word bound, 117.3 perplexity estimate based on a held-out corpus of 290 documents with 2565 words
2022-05-11 00:15:59,022 : INFO : PROGRESS: pass 29, at document #2290/2290
2022-05-11 00:15:59,127 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-11 00:15:59,127 : INFO : topic #0 (0.333): 0.021*"customer" + 0.015*"care" + 0.013*"team" + 0.013*"info" + 0.009*"mylauggage" + 0.009*"tracker" + 0.009*"service" + 0.008*"dear" + 0.008*"moca" + 0.008*"aircraft"
2022-05-11 00:15:59,145 : INFO : topic #1 (0.333): 0.032*"flight" + 0.018*"airport" + 0.018*"delhi" + 0.017*"baggage" + 0.016*"staff" + 0.015*"service" + 0.010*"dear" + 0.009*"dgcaindia" + 0.008*"parent" + 0.008*"travel"
2022-05-11 00:15:59,146 : INFO : topic #2 (0.333): 0.044*"flight" + 0.034*"ticket" + 0.026*"refund" + 0.015*"customer" + 0.011*"travel" + 0.010*"issue" + 0.010*"service" + 0.010

[(0,
  '0.021*"customer" + 0.015*"care" + 0.013*"team" + 0.013*"info" + 0.009*"mylauggage" + 0.009*"tracker" + 0.009*"service" + 0.008*"dear" + 0.008*"moca" + 0.008*"aircraft"'),
 (1,
  '0.032*"flight" + 0.018*"airport" + 0.018*"delhi" + 0.017*"baggage" + 0.016*"staff" + 0.015*"service" + 0.010*"dear" + 0.009*"dgcaindia" + 0.008*"parent" + 0.008*"travel"'),
 (2,
  '0.044*"flight" + 0.034*"ticket" + 0.026*"refund" + 0.015*"customer" + 0.011*"travel" + 0.010*"issue" + 0.010*"service" + 0.010*"money" + 0.009*"hour" + 0.009*"website"')]

In [44]:
# Let's start with 6 topics
ldan = models.LdaModel(corpus=corpusn, num_topics=6, id2word=id2wordn, passes=20)
ldan.print_topics()

2022-05-10 23:38:04,384 : INFO : using symmetric alpha at 0.16666666666666666
2022-05-10 23:38:04,391 : INFO : using symmetric eta at 0.16666666666666666
2022-05-10 23:38:04,410 : INFO : using serial LDA version on this node
2022-05-10 23:38:04,447 : INFO : running online (multi-pass) LDA training, 6 topics, 20 passes over the supplied corpus of 2290 documents, updating model once every 2000 documents, evaluating perplexity every 2290 documents, iterating 50x with a convergence threshold of 0.001000
2022-05-10 23:38:04,531 : INFO : PROGRESS: pass 0, at document #2000/2290
2022-05-10 23:38:07,275 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-10 23:38:07,327 : INFO : topic #4 (0.167): 0.030*"flight" + 0.018*"airport" + 0.014*"ticket" + 0.010*"moca" + 0.008*"delhi" + 0.006*"staff" + 0.006*"dear" + 0.005*"dgcaindia" + 0.005*"today" + 0.004*"help"
2022-05-10 23:38:07,333 : INFO : topic #2 (0.167): 0.026*"flight" + 0.020*"service" + 0.017*"refund" + 0.01

2022-05-10 23:38:12,447 : INFO : topic #4 (0.167): 0.034*"flight" + 0.027*"airport" + 0.025*"ticket" + 0.012*"parent" + 0.011*"staff" + 0.011*"moca" + 0.010*"delhi" + 0.010*"today" + 0.009*"dgcaindia" + 0.008*"date"
2022-05-10 23:38:12,456 : INFO : topic #1 (0.167): 0.029*"flight" + 0.025*"ticket" + 0.023*"money" + 0.016*"travel" + 0.016*"year" + 0.015*"baggage" + 0.014*"flipkart" + 0.013*"refund" + 0.011*"cleartrip" + 0.010*"charge"
2022-05-10 23:38:12,471 : INFO : topic #5 (0.167): 0.011*"passenger" + 0.010*"airline" + 0.010*"email" + 0.009*"force" + 0.009*"form" + 0.008*"fraud" + 0.008*"company" + 0.007*"ukraine" + 0.006*"expedia" + 0.006*"crore"
2022-05-10 23:38:12,477 : INFO : topic diff=0.279638, rho=0.491177
2022-05-10 23:38:12,519 : INFO : PROGRESS: pass 3, at document #2000/2290
2022-05-10 23:38:13,580 : INFO : merging changes from 2000 documents into a model of 2290 documents
2022-05-10 23:38:13,589 : INFO : topic #1 (0.167): 0.026*"flight" + 0.021*"ticket" + 0.017*"money" + 

2022-05-10 23:38:16,285 : INFO : topic #3 (0.167): 0.031*"flight" + 0.023*"delhi" + 0.020*"dear" + 0.019*"travel" + 0.008*"website" + 0.008*"staff" + 0.008*"morning" + 0.008*"center" + 0.007*"guideline" + 0.007*"parent"
2022-05-10 23:38:16,285 : INFO : topic #2 (0.167): 0.052*"flight" + 0.039*"service" + 0.030*"refund" + 0.026*"customer" + 0.019*"ticket" + 0.016*"airline" + 0.015*"issue" + 0.014*"hour" + 0.012*"airport" + 0.010*"passenger"
2022-05-10 23:38:16,285 : INFO : topic #0 (0.167): 0.035*"customer" + 0.028*"care" + 0.027*"aiesl" + 0.023*"team" + 0.022*"info" + 0.018*"mylauggage" + 0.018*"tracker" + 0.013*"dgcaindia" + 0.010*"cancellation" + 0.010*"employee"
2022-05-10 23:38:16,293 : INFO : topic #1 (0.167): 0.027*"flight" + 0.023*"money" + 0.022*"ticket" + 0.018*"year" + 0.015*"travel" + 0.015*"baggage" + 0.015*"flipkart" + 0.014*"refund" + 0.011*"cleartrip" + 0.010*"dear"
2022-05-10 23:38:16,294 : INFO : topic diff=0.172060, rho=0.374110
2022-05-10 23:38:16,306 : INFO : PROGRE

2022-05-10 23:38:19,291 : INFO : topic #4 (0.167): 0.036*"flight" + 0.026*"airport" + 0.024*"ticket" + 0.012*"parent" + 0.011*"staff" + 0.011*"today" + 0.011*"delhi" + 0.011*"moca" + 0.008*"dgcaindia" + 0.008*"date"
2022-05-10 23:38:19,294 : INFO : topic #0 (0.167): 0.033*"customer" + 0.027*"care" + 0.027*"aiesl" + 0.023*"team" + 0.022*"info" + 0.018*"mylauggage" + 0.017*"tracker" + 0.013*"dgcaindia" + 0.010*"cancellation" + 0.010*"employee"
2022-05-10 23:38:19,294 : INFO : topic #5 (0.167): 0.012*"passenger" + 0.010*"force" + 0.009*"email" + 0.008*"company" + 0.008*"airline" + 0.008*"form" + 0.007*"fraud" + 0.007*"ukraine" + 0.006*"expedia" + 0.006*"crore"
2022-05-10 23:38:19,299 : INFO : topic #2 (0.167): 0.052*"flight" + 0.039*"service" + 0.029*"refund" + 0.027*"customer" + 0.020*"ticket" + 0.016*"airline" + 0.015*"issue" + 0.014*"hour" + 0.012*"airport" + 0.010*"baggage"
2022-05-10 23:38:19,300 : INFO : topic #3 (0.167): 0.031*"flight" + 0.023*"delhi" + 0.021*"dear" + 0.019*"travel

2022-05-10 23:38:22,366 : INFO : -7.007 per-word bound, 128.6 perplexity estimate based on a held-out corpus of 290 documents with 2588 words
2022-05-10 23:38:22,369 : INFO : PROGRESS: pass 11, at document #2290/2290
2022-05-10 23:38:22,533 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-10 23:38:22,543 : INFO : topic #0 (0.167): 0.033*"customer" + 0.027*"care" + 0.027*"aiesl" + 0.023*"team" + 0.022*"info" + 0.018*"mylauggage" + 0.017*"tracker" + 0.013*"dgcaindia" + 0.010*"cancellation" + 0.010*"employee"
2022-05-10 23:38:22,543 : INFO : topic #1 (0.167): 0.026*"flight" + 0.024*"money" + 0.021*"ticket" + 0.019*"year" + 0.015*"refund" + 0.015*"travel" + 0.015*"flipkart" + 0.014*"baggage" + 0.010*"cleartrip" + 0.010*"dear"
2022-05-10 23:38:22,543 : INFO : topic #4 (0.167): 0.037*"flight" + 0.025*"airport" + 0.023*"ticket" + 0.012*"parent" + 0.011*"staff" + 0.011*"today" + 0.011*"delhi" + 0.011*"moca" + 0.008*"help" + 0.007*"dgcaindia"
2022-05-10 23:38:2

2022-05-10 23:38:25,752 : INFO : topic #0 (0.167): 0.027*"customer" + 0.023*"aiesl" + 0.022*"care" + 0.019*"team" + 0.018*"info" + 0.015*"mylauggage" + 0.014*"tracker" + 0.011*"dgcaindia" + 0.010*"employee" + 0.010*"aircraftmechan"
2022-05-10 23:38:25,756 : INFO : topic diff=0.099677, rho=0.248875
2022-05-10 23:38:25,951 : INFO : -6.990 per-word bound, 127.2 perplexity estimate based on a held-out corpus of 290 documents with 2588 words
2022-05-10 23:38:25,951 : INFO : PROGRESS: pass 14, at document #2290/2290
2022-05-10 23:38:26,051 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-10 23:38:26,065 : INFO : topic #4 (0.167): 0.037*"flight" + 0.025*"airport" + 0.023*"ticket" + 0.012*"parent" + 0.011*"staff" + 0.011*"today" + 0.011*"delhi" + 0.011*"moca" + 0.008*"help" + 0.007*"dgcaindia"
2022-05-10 23:38:26,065 : INFO : topic #2 (0.167): 0.052*"flight" + 0.039*"service" + 0.028*"refund" + 0.027*"customer" + 0.021*"ticket" + 0.017*"airline" + 0.015*"issue

2022-05-10 23:38:29,138 : INFO : topic #0 (0.167): 0.027*"customer" + 0.023*"aiesl" + 0.022*"care" + 0.019*"team" + 0.018*"info" + 0.015*"mylauggage" + 0.014*"tracker" + 0.011*"dgcaindia" + 0.010*"employee" + 0.010*"aircraftmechan"
2022-05-10 23:38:29,147 : INFO : topic #5 (0.167): 0.012*"passenger" + 0.009*"force" + 0.008*"airline" + 0.008*"company" + 0.008*"email" + 0.007*"form" + 0.007*"government" + 0.006*"fraud" + 0.006*"ukraine" + 0.005*"crore"
2022-05-10 23:38:29,148 : INFO : topic diff=0.089839, rho=0.228545
2022-05-10 23:38:29,335 : INFO : -6.979 per-word bound, 126.2 perplexity estimate based on a held-out corpus of 290 documents with 2588 words
2022-05-10 23:38:29,335 : INFO : PROGRESS: pass 17, at document #2290/2290
2022-05-10 23:38:29,429 : INFO : merging changes from 290 documents into a model of 2290 documents
2022-05-10 23:38:29,448 : INFO : topic #3 (0.167): 0.031*"flight" + 0.023*"delhi" + 0.021*"dear" + 0.019*"travel" + 0.008*"website" + 0.008*"staff" + 0.008*"morni

2022-05-10 23:38:31,991 : INFO : topic #1 (0.167): 0.025*"flight" + 0.023*"money" + 0.021*"ticket" + 0.019*"year" + 0.015*"refund" + 0.014*"flipkart" + 0.014*"travel" + 0.014*"baggage" + 0.010*"cleartrip" + 0.009*"dear"
2022-05-10 23:38:31,997 : INFO : topic #2 (0.167): 0.053*"flight" + 0.039*"service" + 0.028*"refund" + 0.027*"customer" + 0.021*"ticket" + 0.017*"airline" + 0.015*"issue" + 0.014*"hour" + 0.012*"airport" + 0.010*"baggage"
2022-05-10 23:38:32,001 : INFO : topic #3 (0.167): 0.030*"flight" + 0.023*"delhi" + 0.021*"dear" + 0.019*"travel" + 0.008*"website" + 0.008*"staff" + 0.008*"morning" + 0.008*"guideline" + 0.007*"april" + 0.007*"center"
2022-05-10 23:38:32,006 : INFO : topic #4 (0.167): 0.038*"flight" + 0.025*"airport" + 0.023*"ticket" + 0.012*"parent" + 0.011*"staff" + 0.011*"today" + 0.011*"delhi" + 0.010*"moca" + 0.008*"help" + 0.007*"date"
2022-05-10 23:38:32,008 : INFO : topic #5 (0.167): 0.013*"passenger" + 0.010*"force" + 0.009*"company" + 0.009*"email" + 0.008*"

[(0,
  '0.032*"customer" + 0.026*"aiesl" + 0.026*"care" + 0.022*"team" + 0.022*"info" + 0.017*"mylauggage" + 0.017*"tracker" + 0.012*"dgcaindia" + 0.010*"cancellation" + 0.010*"employee"'),
 (1,
  '0.025*"flight" + 0.023*"money" + 0.021*"ticket" + 0.019*"year" + 0.015*"refund" + 0.014*"flipkart" + 0.014*"travel" + 0.014*"baggage" + 0.010*"cleartrip" + 0.009*"dear"'),
 (2,
  '0.053*"flight" + 0.039*"service" + 0.028*"refund" + 0.027*"customer" + 0.021*"ticket" + 0.017*"airline" + 0.015*"issue" + 0.014*"hour" + 0.012*"airport" + 0.010*"baggage"'),
 (3,
  '0.030*"flight" + 0.023*"delhi" + 0.021*"dear" + 0.019*"travel" + 0.008*"website" + 0.008*"staff" + 0.008*"morning" + 0.008*"guideline" + 0.007*"april" + 0.007*"center"'),
 (4,
  '0.038*"flight" + 0.025*"airport" + 0.023*"ticket" + 0.012*"parent" + 0.011*"staff" + 0.011*"today" + 0.011*"delhi" + 0.010*"moca" + 0.008*"help" + 0.007*"date"'),
 (5,
  '0.013*"passenger" + 0.010*"force" + 0.009*"company" + 0.009*"email" + 0.008*"form" + 0.007

# Since 3 topics helps us bucket the problems, hence we will go with 3 topics with the noun filtering