<a href="https://colab.research.google.com/github/arutraj/.githubcl/blob/main/3_Interpreting_Patterns_from_Text_Topic_Modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!python --version

Python 3.10.12


In [1]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

cvectorizer = CountVectorizer()

In [3]:
corpus = ["i love cooking", "I have prepared a cake today", "he is going to a new place", "he will learn cooking there"]

In [4]:
cvz = cvectorizer.fit_transform(corpus)

In [5]:
cvz

<4x15 sparse matrix of type '<class 'numpy.int64'>'
	with 17 stored elements in Compressed Sparse Row format>

In [6]:
vocab = cvectorizer.get_feature_names_out()
vocab

array(['cake', 'cooking', 'going', 'have', 'he', 'is', 'learn', 'love',
       'new', 'place', 'prepared', 'there', 'to', 'today', 'will'],
      dtype=object)

In [13]:
lda_model = LatentDirichletAllocation(n_components = 3, max_iter = 20, random_state = 20)
X_topics = lda_model.fit_transform(cvz)
topic_words = lda_model.components_

In [15]:
X_topics

array([[0.77342627, 0.11172509, 0.11484864],
       [0.0672812 , 0.86579901, 0.06691979],
       [0.04835244, 0.04817357, 0.903474  ],
       [0.0614754 , 0.05619798, 0.88232662]])

In [16]:
topic_words

array([[0.33409872, 1.3520179 , 0.33426983, 0.33409872, 0.3344864 ,
        0.33426983, 0.33484162, 1.33184251, 0.33426983, 0.33426983,
        0.33409872, 0.33484162, 0.33426983, 0.33409872, 0.33484162],
       [1.33225166, 0.33510505, 0.33404224, 1.33225166, 0.33407486,
        0.33404224, 0.33419528, 0.33426477, 0.33404224, 0.33404224,
        1.33225166, 0.33419528, 0.33404224, 1.33225166, 0.33419528],
       [0.33364962, 1.31287705, 1.33168793, 0.33364962, 2.33143874,
        1.33168793, 1.33096309, 0.33389272, 1.33168793, 1.33168793,
        0.33364962, 1.33096309, 1.33168793, 0.33364962, 1.33096309]])

In [8]:
n_top_words = 4

for i, topic_dist in enumerate(topic_words):
    sorted_topic_dist = np.argsort(topic_dist)
    topic_words = np.array(vocab)[sorted_topic_dist]
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['cooking' 'love' 'will']
Topic 2 ['today' 'prepared' 'have']
Topic 3 ['he' 'to' 'place']


In [17]:
doc_topic = lda_model.transform(cvz)
for n in range(doc_topic.shape[0]):
    topic_doc = doc_topic[n].argmax()
    print ("Document", n+1, " -- Topic:" ,topic_doc)

Document 1  -- Topic: 0
Document 2  -- Topic: 1
Document 3  -- Topic: 2
Document 4  -- Topic: 2
