The dataset has been obtained from Kaggle. 
This is the url: https://www.kaggle.com/pariza/bbc-news-summary

In [17]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import spacy
import os
import gensim

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel
from gensim.corpora import Dictionary
import pyLDAvis.gensim

In [3]:
nlp = spacy.load('en_core_web_sm')

In [4]:
custom_stop_words = ['say', 'says', 'said', 'saying', '\'s', 'the', 'The', 'a', 'A', 'be', 'mr', 'Mr', 'but', 'in', 'people']
for stopword in custom_stop_words:
    lexeme = nlp.vocab[stopword]
    lexeme.is_stop = True

We are going to analyze the tech articles. Let us look at a sample news article

In [5]:
with open('BBC News Summary/News Articles/tech/001.txt', 'r') as news_file:
    newsArticle = news_file.read()
    print(newsArticle)

Ink helps drive democracy in Asia

The Kyrgyz Republic, a small, mountainous state of the former Soviet republic, is using invisible ink and ultraviolet readers in the country's elections as part of a drive to prevent multiple voting.

This new technology is causing both worries and guarded optimism among different sectors of the population. In an effort to live up to its reputation in the 1990s as "an island of democracy", the Kyrgyz President, Askar Akaev, pushed through the law requiring the use of ink during the upcoming Parliamentary and Presidential elections. The US government agreed to fund all expenses associated with this decision.

The Kyrgyz Republic is seen by many experts as backsliding from the high point it reached in the mid-1990s with a hastily pushed through referendum in 2003, reducing the legislative branch to one chamber with 75 deputies. The use of ink is only one part of a general effort to show commitment towards more open elections - the German Embassy, the So

In [6]:
base_directory = 'BBC News Summary/News Articles/tech/'
en_directory = os.fsencode(base_directory)
news_corpus = []

for file in os.listdir(en_directory):
    
    fileName = os.fsdecode(file)
    
    if fileName.endswith(".txt"):
        
        cur_article = []
        
        with open(base_directory + fileName, 'r') as news_file:
            
            newsArticle = news_file.read()
            newsArticle = newsArticle.replace('\n', ' ')
            doc = nlp(newsArticle)
            
            for token in doc:

                if token.lemma_ != '-PRON-' and not token.is_space and not token.is_stop and not token.is_punct and not token.like_num and not token.like_email and not token.like_url:
                    
                    # For topic modelling lemmatized version of the word is preferable for better results
                    cur_article.append(token.lemma_)
        
        news_corpus.append(cur_article)
       

In [7]:
print(len(news_corpus))
print(news_corpus[2])

401


In [8]:
bigram = gensim.models.Phrases(news_corpus)

In [9]:
news_corpus = [bigram[cur_article] for cur_article in news_corpus]

In [10]:
print(news_corpus[2])



In [11]:
dictionary = Dictionary(news_corpus)
corpus_for_modelling = [dictionary.doc2bow(cur_article) for cur_article in news_corpus]

In [12]:
print(corpus_for_modelling[2])

[(27, 1), (86, 1), (174, 1), (218, 2), (227, 2), (264, 2), (298, 1), (311, 1), (315, 6), (320, 1), (322, 4), (330, 1), (331, 1), (332, 3), (333, 3), (334, 1), (335, 1), (336, 2), (337, 1), (338, 1), (339, 1), (340, 1), (341, 1), (342, 2), (343, 1), (344, 1), (345, 1), (346, 1), (347, 1), (348, 1), (349, 1), (350, 1), (351, 1), (352, 1), (353, 1), (354, 1), (355, 1), (356, 2), (357, 1), (358, 1), (359, 1), (360, 1), (361, 1), (362, 9), (363, 1), (364, 1), (365, 1), (366, 1), (367, 1), (368, 1), (369, 1), (370, 1), (371, 1), (372, 5), (373, 1), (374, 2), (375, 2), (376, 1), (377, 1), (378, 1), (379, 1), (380, 1), (381, 1), (382, 1), (383, 1), (384, 1), (385, 1), (386, 1), (387, 1), (388, 1), (389, 1), (390, 1), (391, 1), (392, 3), (393, 1), (394, 1), (395, 1), (396, 1)]


In [13]:
hdpModel = HdpModel(corpus=corpus_for_modelling, id2word=dictionary)

  start_time = time.clock()


In [14]:
hdpModel.show_topics()

[(0,
  '0.005*technology + 0.004*service + 0.004*firm + 0.003*music + 0.003*file + 0.003*content + 0.003*user + 0.003*google + 0.003*consumer + 0.003*like + 0.003*way + 0.003*search + 0.002*want + 0.002*digital + 0.002*use + 0.002*p2p + 0.002*work + 0.002*new + 0.002*download + 0.002*yahoo'),
 (1,
  '0.006*technology + 0.005*mobile + 0.004*game + 0.004*tv + 0.004*new + 0.003*pc + 0.003*but + 0.003*device + 0.002*work + 0.002*digital + 0.002*music + 0.002*year + 0.002*computer + 0.002*power + 0.002*us + 0.002*mobile_phone + 0.002*gadget + 0.002*market + 0.002*time + 0.002*firm'),
 (2,
  '0.005*net + 0.003*user + 0.003*number + 0.003*new + 0.003*e_mail + 0.003*domain + 0.002*use + 0.002*phone + 0.002*technology + 0.002*firm + 0.002*computer + 0.002*message + 0.002*virus + 0.002*site + 0.002*network + 0.002*attack + 0.002*mobile + 0.002*broadband + 0.002*ibm + 0.002*year'),
 (3,
  '0.006*game + 0.004*attack + 0.003*software + 0.003*computer + 0.003*net + 0.002*us + 0.002*technology + 0.00

In [15]:
hdp_coherence_model = CoherenceModel(model=hdpModel, texts=news_corpus, dictionary=dictionary, coherence='c_v')
hdp_coherence = hdp_coherence_model.get_coherence()
print(hdp_coherence)

0.6228442032985549


In [18]:
list_num_of_topics = [5, 10, 15, 20, 25, 30]
for num_topics in list_num_of_topics:
    ldaModel = LdaModel(corpus=corpus_for_modelling, num_topics=num_topics, id2word=dictionary)
    coherenceModel = CoherenceModel(model=ldaModel, texts=news_corpus, dictionary=dictionary, coherence='c_v')
    print('Number of Topics', num_topics)
    print('Coherence Value', coherenceModel.get_coherence())

Number of Topics 5
Coherence Value 0.2603781802673089
Number of Topics 10
Coherence Value 0.26354557262746436
Number of Topics 15
Coherence Value 0.26429213181741545
Number of Topics 20
Coherence Value 0.2657787849155366
Number of Topics 25
Coherence Value 0.2809581096095441
Number of Topics 30
Coherence Value 0.27413275927589614


In [19]:
ldaModel_best = LdaModel(corpus=corpus_for_modelling, num_topics=25, id2word=dictionary)
ldaModel_best.show_topics(25)

[(0,
  '0.010*"mobile" + 0.007*"service" + 0.006*"tv" + 0.006*"microsoft" + 0.006*"software" + 0.005*"user" + 0.005*"but" + 0.005*"game" + 0.004*"music" + 0.004*"world"'),
 (1,
  '0.008*"game" + 0.007*"technology" + 0.005*"new" + 0.005*"us" + 0.005*"time" + 0.004*"work" + 0.004*"but" + 0.004*"player" + 0.004*"use" + 0.004*"user"'),
 (2,
  '0.007*"site" + 0.007*"net" + 0.006*"user" + 0.005*"service" + 0.005*"but" + 0.005*"broadband" + 0.004*"phone" + 0.004*"new" + 0.004*"world" + 0.004*"use"'),
 (3,
  '0.011*"game" + 0.009*"technology" + 0.005*"year" + 0.005*"digital" + 0.005*"launch" + 0.004*"mobile" + 0.004*"use" + 0.004*"new" + 0.004*"uk" + 0.004*"but"'),
 (4,
  '0.008*"service" + 0.007*"phone" + 0.006*"user" + 0.006*"game" + 0.005*"call" + 0.005*"mobile" + 0.005*"year" + 0.005*"technology" + 0.004*"software" + 0.004*"use"'),
 (5,
  '0.006*"computer" + 0.005*"software" + 0.004*"but" + 0.004*"game" + 0.004*"company" + 0.004*"law" + 0.004*"new" + 0.004*"technology" + 0.003*"online" + 0

In [20]:
pyLDAvis.enable_notebook()
pyLDAvis.gensim.prepare(ldaModel_best, corpus_for_modelling, dictionary)

In [21]:
for num_topics in list_num_of_topics:
    lsiModel = LsiModel(corpus=corpus_for_modelling, num_topics=num_topics, id2word=dictionary)
    coherenceModel = CoherenceModel(model=lsiModel, texts=news_corpus, dictionary=dictionary, coherence='c_v')
    print('Number of Topics', num_topics)
    print('Coherence Value', coherenceModel.get_coherence())

Number of Topics 5
Coherence Value 0.3939867325430938
Number of Topics 10
Coherence Value 0.3402395918571028
Number of Topics 15
Coherence Value 0.32590282537275905
Number of Topics 20
Coherence Value 0.31232476434911793
Number of Topics 25
Coherence Value 0.3159888564143579
Number of Topics 30
Coherence Value 0.3115464440574105


In [22]:
lsiModel = LsiModel(corpus=corpus_for_modelling, num_topics=5, id2word=dictionary)
lsiModel.show_topics()

[(0,
  '0.291*"game" + 0.201*"technology" + 0.148*"mobile" + 0.147*"new" + 0.139*"service" + 0.138*"play" + 0.137*"time" + 0.133*"but" + 0.125*"year" + 0.123*"like"'),
 (1,
  '0.675*"game" + 0.271*"play" + -0.175*"service" + -0.137*"user" + -0.132*"technology" + 0.125*"gaming" + 0.104*"time" + -0.104*"mobile" + -0.102*"net" + 0.101*"hour"'),
 (2,
  '-0.409*"mobile" + 0.231*"site" + -0.214*"tv" + 0.203*"software" + 0.169*"attack" + 0.168*"user" + -0.164*"phone" + -0.161*"technology" + -0.146*"service" + -0.143*"gadget"'),
 (3,
  '-0.349*"gadget" + 0.287*"service" + 0.250*"phone" + 0.226*"mobile" + -0.200*"apple" + -0.194*"technology" + -0.154*"pc" + -0.137*"computer" + -0.129*"sony" + -0.123*"device"'),
 (4,
  '0.296*"mobile" + 0.295*"gadget" + -0.267*"technology" + 0.246*"phone" + 0.191*"mobile_phone" + -0.184*"dvd" + -0.168*"high_definition" + 0.155*"apple" + 0.152*"list" + -0.148*"content"')]