-- Topic Modelling
- Allows us to cluster documents together into topics.
- A large amount of text data is usually unlabelled.
- Cluster together types of documents.

-- Latent Dirichlet Allocation
- Enables topic modelling.

-- LDA represents documents as mixtures of topics, and each topic emits words with certain probabilities.
- For it to work, you need some intuition about how many topics you want to find (k topics).

In [1]:
import pandas as pd

In [2]:
# Load the articles from npr.csv (path is relative to the current working directory) into a DataFrame.
npr = pd.read_csv('npr.csv')

In [3]:
# Dataset of roughly 12,000 articles (11,992 rows).
npr

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."
...,...
11987,The number of law enforcement officers shot an...
11988,"Trump is busy these days with victory tours,..."
11989,It’s always interesting for the Goats and Soda...
11990,The election of Donald Trump was a surprise to...


In [4]:
npr['Article']

0        In the Washington of 2016, even when the polic...
1          Donald Trump has used Twitter  —   his prefe...
2          Donald Trump is unabashedly praising Russian...
3        Updated at 2:50 p. m. ET, Russian President Vl...
4        From photography, illustration and video, to d...
                               ...                        
11987    The number of law enforcement officers shot an...
11988      Trump is busy these days with victory tours,...
11989    It’s always interesting for the Goats and Soda...
11990    The election of Donald Trump was a surprise to...
11991    Voters in the English city of Sunderland did s...
Name: Article, Length: 11992, dtype: object

In [5]:
len(npr)

11992

In [6]:
# Little bit of pre-processing
from sklearn.feature_extraction.text import CountVectorizer

In [47]:
# Build the vectorizer and filter out uninformative words:
#   max_df=0.95          -> ignore words that appear in more than 95% of the documents
#   min_df=2             -> ignore words that appear in fewer than 2 documents
#   stop_words='english' -> drop scikit-learn's built-in English stop-word list
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english') # max_df=0.95: drop words found in more than 95% of documents
# For max_df/min_df a float is a proportion of documents; an integer is an absolute document count.

In [48]:
# This is unsupervised, so there is no train/test split: learn the vocabulary
# and count the terms of every article in one pass.
dtm = cv.fit_transform(npr['Article']) # dtm = document-term matrix (one row per article, one column per vocabulary word)

In [10]:
dtm

<11992x96277 sparse matrix of type '<class 'numpy.int64'>'
	with 3946179 stored elements in Compressed Sparse Row format>

In [12]:
# Perform the Latent Dirichlet Allocation
from sklearn.decomposition import LatentDirichletAllocation

In [50]:
# Topic model configured to return 7 general topics; the fixed random_state
# seeds the estimator so repeated fits give the same result.
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [51]:
# Fit to our dtm
LDA.fit(dtm) # Takes a while...

In [17]:
# Grab the vocabulary of words...
cv.get_feature_names_out() # Vocabulary the vectorizer kept after filtering; position i is the word for column i of dtm



array(['00', '000', '00000', ..., '脱贫', '반갑습니다', 'ﬁnd'], dtype=object)

In [25]:
cv.get_feature_names_out()[11111]

'boarder'

In [52]:
# Grab the topics.
LDA.components_


array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

In [29]:
type(LDA.components_)


numpy.ndarray

In [31]:
LDA.components_.shape

(7, 96277)

In [32]:
single_topic = LDA.components_[0]

In [34]:
single_topic.argsort() # Returns the index positions that would sort this array. Returns from least to greatest!

array([ 3695, 75086, 80119, ..., 38664, 86104, 95523])

In [35]:
import numpy as np

In [39]:
arr = np.array([10, 200, 1])

In [40]:
arr.argsort()

array([2, 0, 1])

In [41]:
# Top 10 words for this topic... We want the last 10 values of argsort().
# Note: despite the name, this holds vocabulary *indices* (ordered from lowest
# to highest weight), not the words themselves; they are looked up afterwards.
top_ten_words = single_topic.argsort()[-10:] # starting from index -10 go all the way to the end

In [42]:
top_ten_words

array([ 4713,  5930, 86010, 33446, 39755, 93069, 92838, 38664, 86104,
       95523])

In [43]:
# Print the word behind each of the top-10 indices.
# Fetch the vocabulary once: every call to get_feature_names_out() rebuilds the
# full feature-name array, so calling it inside the loop repeats that work per word.
# NOTE(review): the saved output below lists English stop words ('an', 'are', ...)
# although cv is built with stop_words='english' -- presumably captured before cv
# was redefined; re-run the notebook top to bottom to refresh it.
feature_names = cv.get_feature_names_out()
for index in top_ten_words:
    print(feature_names[index])

an
are
they
from
his
we
was
he
this
you


In [44]:
top_twenty_words = single_topic.argsort()[-20:]

In [45]:
# Print the word behind each of the top-20 indices.
# Fetch the vocabulary once: every call to get_feature_names_out() rebuilds the
# full feature-name array, so calling it inside the loop repeats that work per word.
feature_names = cv.get_feature_names_out()
for index in top_twenty_words:
    print(feature_names[index])

so
not
have
one
or
what
by
like
about
be
an
are
they
from
his
we
was
he
this
you


In [53]:
# Show the 15 highest-weighted words for every topic.
# Fetch the vocabulary once up front: calling get_feature_names_out() inside the
# comprehension would rebuild the whole feature-name array for every word printed.
feature_names = cv.get_feature_names_out()
for i, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 words for topic number {i}')
    # argsort() is ascending, so the last 15 indices are the top words (strongest last).
    top_word_indices = topic.argsort()[-15:]
    print([feature_names[index] for index in top_word_indices])
    print('\n')
    print('\n')

THE TOP 15 words for topic number 0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']




THE TOP 15 words for topic number 1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']




THE TOP 15 words for topic number 2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']




THE TOP 15 words for topic number 3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']




THE TOP 15 words for topic number 4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']




THE TOP 15 words for topic number 5
['years', 'going', 've', 'life', 'don', 'new', 'way

In [54]:
# Tie to the original articles...
topic_results = LDA.transform(dtm)

In [56]:
topic_results.shape # Articles by topics...

(11992, 7)

In [61]:
topic_results[0].round(2) # Probability of a document belonging to a topic!!!

array([0.02, 0.68, 0.  , 0.  , 0.3 , 0.  , 0.  ])

In [62]:
topic_results[0].argmax() # Index that was highest...

1

In [63]:
# Label every article with its most probable topic: argmax along axis=1 picks,
# for each row of topic_results, the column (topic index) holding the highest value.
npr['Topic'] = topic_results.argmax(axis=1)

In [64]:
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
...,...,...
11987,The number of law enforcement officers shot an...,1
11988,"Trump is busy these days with victory tours,...",4
11989,It’s always interesting for the Goats and Soda...,3
11990,The election of Donald Trump was a surprise to...,4
