Requirement libraries

In [None]:
!pip install gensim
!pip install PorterStemmer 
!pip install stop_words

In [16]:
from nltk.stem.porter import PorterStemmer
from stop_words import get_stop_words
import gensim
import string
from gensim import corpora, models
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import time

**Read your file**

In [3]:
def readFile(fileName):
    file = open(fileName, encoding='utf-8')
    lines = file.readlines()
    file.close()

    file_ = []
    for line in lines:
        file_.append(str(line).rstrip("\n"))

    return file_

Example file is readed:

In [4]:
doc_set = readFile("uci-news3C.txt")

**Pre-processing and Create corpus&dictionary for LDA**

In [13]:
sentence_tokens = []
model_name = 'EnglishNews'
def englishSentence(doc):
    p_stemmer = PorterStemmer() #stemming library for English
    en_stop = get_stop_words('english') #english stopwords library
    totalWords = []
    number = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
    for i in doc:
        map = i.maketrans('', '', string.punctuation) #Applying punctuation process
        out = i.translate(map)
        tokens = []
        not_verb = []
        for word in str(out).split(" "):
            word = str(word).lower().strip()
            tokens.append(word)        
        stemmed = [ p_stemmer.stem(i) for i in tokens ] #Stemming process for words
        stemmed_tokens = [ i for i in stemmed if not i in en_stop]  #Removing stopwords

        delete = []    
        dataControl = True
        for root in stemmed_tokens: #Deleting numbers and single letter words from word roots
            if len(root) <= 1:
                delete.append(root)
            else:
                for s in number:
                    if root[0].find(s) != -1:
                        delete.append(root)
                        dataControl = False
                        break
                if dataControl:
                    totalWords.append(root)
        for d in delete:
            stemmed_tokens.remove(d)
                    
        sentence_tokens.append(stemmed_tokens)   #Create sentence tokens for gensim.LDA
    
    print("Sentence's tokens is obtained!")
    totalWords = list(set(totalWords))
    dictionary = corpora.Dictionary(sentence_tokens)
    dictionary.save( str(model_name) + '.dict')
    corpus = [dictionary.doc2bow(text) for text in sentence_tokens]
    gensim.corpora.MmCorpus.serialize(str(model_name) + '.mm', corpus)
    print("Corpus&Dictionary is obtained!")
    print("Total unique word count:" + str(len(totalWords)))

    return corpus, dictionary

Creating corpus and dictionary for readed file:

In [14]:
corpus, dictionary = englishSentence(doc_set)

Sentence's tokens is obtained!
Corpus&Dictionary is obtained!
Total unique word count:2878


Classic LDA model is obtained and shown: 

In [15]:
classic_LDA = LdaModel(corpus, num_topics = 10, id2word = dictionary, iterations = 100, passes = 10, alpha = 'asymmetric')
classic_LDA.save(str(model_name) + '.model')

In [26]:
classic_LDA.show_topics(num_topics= 10, num_words=15)

[(0,
  '0.029*"risk" + 0.020*"detect" + 0.016*"appl" + 0.016*"true" + 0.015*"io" + 0.014*"friend" + 0.013*"final" + 0.012*"new" + 0.012*"lindsay" + 0.010*"updat" + 0.010*"microsoft" + 0.010*"lohan" + 0.008*"releas" + 0.007*"sxsw" + 0.007*"similar"'),
 (1,
  '0.061*"predict" + 0.022*"possibl" + 0.021*"link" + 0.020*"mental" + 0.015*"research" + 0.013*"babi" + 0.012*"facebook" + 0.011*"studi" + 0.011*"rise" + 0.011*"date" + 0.010*"read" + 0.009*"cut" + 0.009*"lena" + 0.009*"gomez" + 0.008*"dunham"'),
 (2,
  '0.042*"bachelor" + 0.036*"juan" + 0.035*"pablo" + 0.024*"care" + 0.024*"googl" + 0.023*"wearabl" + 0.019*"sign" + 0.018*"senat" + 0.017*"final" + 0.017*"android" + 0.015*"recal" + 0.015*"awar" + 0.013*"sdk" + 0.012*"galavi" + 0.012*"two"'),
 (3,
  '0.040*"boy" + 0.032*"give" + 0.029*"approv" + 0.026*"new" + 0.023*"show" + 0.021*"studi" + 0.017*"noah" + 0.016*"rais" + 0.016*"malefic" + 0.014*"miss" + 0.014*"hear" + 0.013*"trailer" + 0.013*"promis" + 0.013*"sex" + 0.012*"joli"'),
 (4,


**n-stage LDA method**

n-stage LDA method is created:
*   model_name: your classic LDA model name
*   topic_no: your selected topic number
*   word_num: The word count to be selected from the topics for the lda.show_topics() operation
*   stage_no: Number of n-stage LDA stages to run, n value (n value starts from 2)
*   sentenceTokens: Sentence tokens list for your classic LDA (this code:sentence_tokens in englishSentence function)








In [24]:
def n_stage_LDA(model_name, topic_no, stage_no, word_num, sentencesTokens):
	#n-stage processes
	for stage in range(2, stage_no+1):
		#Load your LDA model
		lda = gensim.models.LdaModel.load(str(model_name) + '.model')
		info_LDA = lda.show_topics(num_topics= topic_no, num_words=word_num)
		topics = str(info_LDA).split('\'), (')
		topics[0] = str(topics[0])[2:]

		words = []
		weights = []
		totalWords = []
		for topic in topics:
			i = str(topic).find(',')
			topicNo = topic[:i]
			words_weights = str(str(topic[i+3:]).strip()).split('+')
			total_weight = 0
			for wordInfo in words_weights: #Determining words and values for each topic
				wordInfo = str(wordInfo).strip()
				i = str(wordInfo).find('*')
				weightValue = wordInfo[:i]
				k = str(wordInfo).rfind('"')
				wordValue = wordInfo[i+2:k]
				if weightValue != "0.0001": #Evaluation of words with a weight above 0.0001 or your value
					total_weight += float(weightValue) 
					weights.append(weightValue) #Calculation total weight for each topic
					words.append(wordValue)
				else:
					break

			tWords = words #storage words list
			average_weight = total_weight/(len(weights)) #Calculation threshold weight value for each topic seperately
			wordCount = 0
			#Create a new word list based on average weight by looking at the weight of each word
			for weight in weights:
				if float(weight) >= average_weight:
					totalWords.append(tWords[wordCount])
					wordCount += 1
				else:
					break
			#Removing contents in words and weights lists for other topic
			words.clear()
			weights.clear()

		totalWords = list(set(totalWords)) #Obtaining unique total words 
		print("New word list is created!")

		#Updating previous sentence's tokens according to total words list and creating new list for this
		sentences = sentencesTokens
		iterative_texts = []
		for text in sentences:
			tokens = [i for i in text if i in totalWords]
			iterative_texts.append(tokens)

		print("New word vocabulary is created! New count: " + str(len(totalWords)))

		#Creating a new LDA model stage
		model_name = model_name + str(stage) + '-LDA'
		dictionary = corpora.Dictionary(iterative_texts)
		dictionary.save( model_name + '.dict')
		corpus = [dictionary.doc2bow(iter) for iter in iterative_texts]
		gensim.corpora.MmCorpus.serialize(model_name + '.mm', corpus)
		print("New corpus&dictionary is created!")
		
		start = time.time()
		ldamodel = LdaModel(corpus, num_topics = topic_no, id2word = dictionary, iterations = 100, passes = 10, alpha = 'asymmetric')
		ldamodel.save(str(model_name) + '.model')
		end = time.time()
		print(str(stage) + '-LDA model ise created! RunTime:' + str(end-start))

Your n-stage LDA model is created:

In [25]:
n_stage_LDA(model_name, 10, 2, 400, sentence_tokens)

New word list is created!
New word vocabulary is created! New count: 637
New corpus&dictionary is created!
2-LDA model ise created! RunTime:7.225432872772217


Your n-stage LDA model is shown:

In [27]:
n_lda = gensim.models.LdaModel.load(str(model_name) + '2-LDA.model')
n_lda.show_topics(num_topics= 10, num_words=15)

[(0,
  '0.029*"googl" + 0.023*"friend" + 0.022*"studi" + 0.022*"wearabl" + 0.019*"lindsay" + 0.018*"android" + 0.017*"sxsw" + 0.016*"approv" + 0.016*"senat" + 0.015*"lohan" + 0.014*"facebook" + 0.014*"call" + 0.012*"two" + 0.012*"sdk" + 0.012*"zac"'),
 (1,
  '0.064*"detect" + 0.051*"true" + 0.042*"show" + 0.031*"final" + 0.030*"link" + 0.020*"new" + 0.019*"fight" + 0.018*"go" + 0.018*"amazon" + 0.018*"malefic" + 0.018*"promis" + 0.017*"servic" + 0.017*"hbo" + 0.017*"season" + 0.017*"propos"'),
 (2,
  '0.126*"risk" + 0.059*"care" + 0.031*"similar" + 0.029*"noah" + 0.029*"tv" + 0.023*"support" + 0.022*"peopl" + 0.019*"beard" + 0.016*"miley" + 0.016*"lena" + 0.016*"jame" + 0.016*"dunham" + 0.015*"swift" + 0.015*"cyru" + 0.015*"finalist"'),
 (3,
  '0.107*"test" + 0.049*"may" + 0.040*"diseas" + 0.039*"blood" + 0.036*"boy" + 0.034*"new" + 0.033*"alzheim" + 0.033*"give" + 0.027*"year" + 0.026*"life" + 0.025*"mer" + 0.024*"possibl" + 0.024*"health" + 0.023*"heart" + 0.023*"sign"'),
 (4,
  '0.0