# Identifying topics

This example shows how to identify the topic of a document.

Topic identification is a process of discovering topics that are present in the input document set. These topics can be multiple words that occurs uniquely in a given text.

In [3]:
!pip3 install gensim
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from gensim import corpora, models
import nltk
import feedparser

Collecting gensim
  Downloading gensim-3.8.3-cp37-cp37m-macosx_10_9_x86_64.whl (24.2 MB)
[K     |████████████████████████████████| 24.2 MB 7.0 MB/s eta 0:00:01
Collecting smart-open>=1.8.1
  Downloading smart_open-2.1.0.tar.gz (116 kB)
[K     |████████████████████████████████| 116 kB 35.9 MB/s eta 0:00:01
Collecting boto
  Downloading boto-2.49.0-py2.py3-none-any.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 36.5 MB/s eta 0:00:01
[?25hCollecting boto3
  Downloading boto3-1.14.33-py2.py3-none-any.whl (129 kB)
[K     |████████████████████████████████| 129 kB 12.6 MB/s eta 0:00:01
Collecting s3transfer<0.4.0,>=0.3.0
  Downloading s3transfer-0.3.3-py2.py3-none-any.whl (69 kB)
[K     |████████████████████████████████| 69 kB 10.6 MB/s eta 0:00:01
[?25hCollecting botocore<1.18.0,>=1.17.33
  Downloading botocore-1.17.33-py2.py3-none-any.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 49.6 MB/s eta 0:00:01
[?25hCollecting jmespath<1.0.0,>=0.7.1
  Downloadi

In [12]:
class IdentifyingTopic:
    def get_documents(self):
        url = 'https://www.straitstimes.com/news/business/rss.xml'
        feed = feedparser.parse(url)
        self.documents = []
        for entry in feed['entries'][:5]:
            text = entry['summary']
            self.documents.append(text)
            print('-- {}'.format(text))
        print('INFO: Fetching documents from {} completed'.format(url))
    
    def clean_documents(self):
        tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        en_stop = set(stopwords.words('english'))
        self.cleaned = []
        for doc in self.documents:
            lowercase_doc = doc.lower()
            words = tokenizer.tokenize(lowercase_doc)
            non_stopped_words = [i for i in words if not i in en_stop]
            self.cleaned.append(non_stopped_words)
        print('INFO: Cleaning {} documents completed'.format(len(self.documents)))
    
    def do_lda(self):
        dictionary = corpora.Dictionary(self.cleaned)
        corpus = [dictionary.doc2bow(cleandoc) for cleandoc in self.cleaned]
        lda_model = models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary)
        return lda_model.print_topics(num_topics=2, num_words=4)

    def run(self):
        self.get_documents()
        self.clean_documents()
        return self.do_lda()

In [13]:
topic = IdentifyingTopic()
topic.run()

-- August 03, 2020 1:29 PM<br /><br />PARIS (BLOOMBERG) - Societe Generale swung to a surprise 1.26 billion-euro (S$2.04 billion) loss fir the second quarter because of charges at its trading unit, extending a losing streak that's set to increase pressure on chief executive officer Frederic Oudea.
-- August 03, 2020 12:26 PM<br /><br />LONDON (REUTERS) - HSBC Holdings posted a higher-than-expected 65 per cent tumble in first-half pre-tax profit as the coronavirus pandemic and its impact on businesses forced the Asia-focused bank to boost its loan-loss provisions.
-- August 03, 2020 12:06 PM<br /><br />HONG KONG )AFP) - Asian markets mostly retreated on Monday Aug 3) with sentiment depressed by a spike in coronavirus infections that has forced fresh lockdowns and sparked worries about the impact on the world economy.
-- August 03, 2020 11:45 AM<br /><br />SINGAPORE (THE BUSINESS TIMES) - Shares of Keppel Corp sank over 4 per cent on Monday morning (Aug 3), amid a broad market decline on

[(0, '0.043*"br" + 0.026*"august" + 0.022*"coronavirus" + 0.018*"pm"'),
 (1, '0.049*"br" + 0.024*"august" + 0.022*"singapore" + 0.018*"cent"')]