# <center><b>Latent Dirichlet Allocation (LDA)</b></center>

Here, we specify the number of topics, and the model discovers those topics and predicts which topic each document most likely corresponds to.

In [0]:
import pandas as pd

In [5]:
# Load the NPR articles CSV into a DataFrame and preview the first rows.
# NOTE(review): the relative path looks like a mounted Google Drive folder — adjust it for your environment.
npr = pd.read_csv('drive/My Drive/Pytorch_DataSet/TextFiles/npr.csv')
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [6]:
# Number of rows (articles) in the dataset.
len(npr)

11992

# Data Preprocessing

## First, we will perform count vectorization.

https://towardsdatascience.com/hacking-scikit-learns-vectorizers-9ef26a7170af

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# max_df=0.95: ignore terms that appear in more than 95% of the documents
#              (too common to help tell topics apart).
# min_df=2: ignore terms that appear in fewer than 2 documents.
# stop_words='english': drop scikit-learn's built-in English stop words.
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

'\nmax_df signifies that the most frequent useless words like stop_words if they appear in 95% documents,then drop them.\nmin_df signifies that a word is to be taken into consideration if it appears in atleast 2 documents.\nstop_words are to not taken into consideration.\n'

In [8]:
# Learn the vocabulary from the 'Article' column and build the sparse
# document-term matrix of word counts (one row per article).
dtm = cv.fit_transform(npr['Article'])
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

## Now, we will perform LDA.

https://towardsdatascience.com/light-on-math-machine-learning-intuitive-guide-to-latent-dirichlet-allocation-437c81220158

In [0]:
from sklearn.decomposition import LatentDirichletAllocation
# n_components is the number of topics you want the model to find;
# random_state pins the seed so repeated runs are reproducible.
LDA = LatentDirichletAllocation(n_components = 7, random_state = 42)

In [10]:
# Fit the topic model on the document-term matrix.
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=7, n_jobs=None,
                          perp_tol=0.1, random_state=42, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

### Showing stored words

In [0]:
# Grab the vocabulary of words

In [13]:
# Vocabulary size and container type; fetch the word list once rather than twice.
feature_names = cv.get_feature_names()
len(feature_names), type(feature_names)

(54777, list)

The length of the feature-name list equals the vocabulary size: the number of unique words across all documents that the CountVectorizer kept (after the `max_df`, `min_df`, and stop-word filtering).

In [21]:
# Every time you run this cell you will get a different word.

import random

# Draw the index from the actual vocabulary size: random.randint(0, 54777)
# includes 54777 itself, which is one past the last valid index.
feature_names = cv.get_feature_names()
random_word_id = random.randrange(len(feature_names))

feature_names[random_word_id]

'tresses'

### Showing Top Words Per Topic

In [0]:
# Grab the topics

In [16]:
# Number of topics, container type, and (topics, vocabulary words) shape of the fitted components.
len(LDA.components_), type(LDA.components_), LDA.components_.shape

(7, numpy.ndarray, (7, 54777))

In [17]:
# Topic-word weight matrix: one row per topic, one column per vocabulary word.
LDA.components_

array([[8.64332806e+00, 2.38014333e+03, 1.42900522e-01, ...,
        1.43006821e-01, 1.42902042e-01, 1.42861626e-01],
       [2.76191749e+01, 5.36394437e+02, 1.42857148e-01, ...,
        1.42861973e-01, 1.42857147e-01, 1.42906875e-01],
       [7.22783888e+00, 8.24033986e+02, 1.42857148e-01, ...,
        6.14236247e+00, 2.14061364e+00, 1.42923753e-01],
       ...,
       [3.11488651e+00, 3.50409655e+02, 1.42857147e-01, ...,
        1.42859912e-01, 1.42857146e-01, 1.42866614e-01],
       [4.61486388e+01, 5.14408600e+01, 3.14281373e+00, ...,
        1.43107628e-01, 1.43902481e-01, 2.14271779e+00],
       [4.93991422e-01, 4.18841042e+02, 1.42857151e-01, ...,
        1.42857146e-01, 1.43760101e-01, 1.42866201e-01]])

In [0]:
# Word weights for the first topic (row 0 of the topic-word matrix).
single_topic = LDA.components_[0]

In [24]:
# Indices of the vocabulary words ordered from lowest to highest weight in this topic.
single_topic.argsort()

array([ 2475, 18302, 35285, ..., 22673, 42561, 42993])

In [25]:
# argsort() returns the indices that would sort the weights in ascending order
# (least -> greatest), so the last 10 indices belong to the 10 highest-weighted
# words, with the strongest word last.
single_topic.argsort()[-10:]

array([33390, 36310, 21228, 10425, 31464,  8149, 36283, 22673, 42561,
       42993])

In [0]:
# Keep the indices of the 10 highest-weighted words for this topic.
top_word_indices = single_topic.argsort()[-10:]

In [27]:
# Fetch the vocabulary list once instead of calling get_feature_names()
# on every loop iteration, then print the word behind each top index.
feature_names = cv.get_feature_names()
for index in top_word_indices:
    print(feature_names[index])

new
percent
government
company
million
care
people
health
said
says


These look like business or health-care articles, perhaps... Let's confirm by calling `.transform()` on our vectorized articles to attach a topic label number to each one. But first, let's view the top words for all 7 topics found.

In [28]:
# Print the 15 highest-weighted words of every topic (in ascending weight,
# so the strongest word comes last). The vocabulary list is fetched once
# up front rather than once per word lookup.
feature_names = cv.get_feature_names()
for index, topic in enumerate(LDA.components_):
    print(f'THE TOP 15 WORDS FOR TOPIC #{index}')
    print([feature_names[i] for i in topic.argsort()[-15:]])
    print('\n')

THE TOP 15 WORDS FOR TOPIC #0
['companies', 'money', 'year', 'federal', '000', 'new', 'percent', 'government', 'company', 'million', 'care', 'people', 'health', 'said', 'says']


THE TOP 15 WORDS FOR TOPIC #1
['military', 'house', 'security', 'russia', 'government', 'npr', 'reports', 'says', 'news', 'people', 'told', 'police', 'president', 'trump', 'said']


THE TOP 15 WORDS FOR TOPIC #2
['way', 'world', 'family', 'home', 'day', 'time', 'water', 'city', 'new', 'years', 'food', 'just', 'people', 'like', 'says']


THE TOP 15 WORDS FOR TOPIC #3
['time', 'new', 'don', 'years', 'medical', 'disease', 'patients', 'just', 'children', 'study', 'like', 'women', 'health', 'people', 'says']


THE TOP 15 WORDS FOR TOPIC #4
['voters', 'vote', 'election', 'party', 'new', 'obama', 'court', 'republican', 'campaign', 'people', 'state', 'president', 'clinton', 'said', 'trump']


THE TOP 15 WORDS FOR TOPIC #5
['years', 'going', 've', 'life', 'don', 'new', 'way', 'music', 'really', 'time', 'know', 'think',

### Attaching Discovered Topic Labels to Original Articles

In [29]:
# Re-display the document-term matrix summary (documents x vocabulary words).
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [30]:
# Display the articles DataFrame.
npr

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."
...,...
11987,The number of law enforcement officers shot an...
11988,"Trump is busy these days with victory tours,..."
11989,It’s always interesting for the Goats and Soda...
11990,The election of Donald Trump was a surprise to...


In [0]:
# Per-document topic distribution: one row per article, one column per topic.
topic_results = LDA.transform(dtm)

In [32]:
# Inspect the topic scores assigned to each article.
topic_results

array([[1.61040465e-02, 6.83341493e-01, 2.25376318e-04, ...,
        2.99652737e-01, 2.25479379e-04, 2.25497980e-04],
       [3.63424997e-02, 8.86130697e-01, 4.40751747e-04, ...,
        7.57636804e-02, 4.40866779e-04, 4.40835574e-04],
       [3.28569485e-04, 6.96344889e-01, 3.28302105e-04, ...,
        3.02012902e-01, 3.28724083e-04, 3.28352652e-04],
       ...,
       [1.44467964e-02, 1.60696622e-01, 1.73678310e-01, ...,
        2.24636569e-02, 3.98728349e-04, 3.98359730e-04],
       [4.33560738e-04, 3.53196803e-02, 4.33022554e-04, ...,
        9.62512640e-01, 4.33971991e-04, 4.33490254e-04],
       [3.98777533e-01, 2.54376049e-04, 3.59290659e-01, ...,
        2.40914375e-01, 2.54445555e-04, 2.54253739e-04]])

In [39]:
# (number of articles, number of topics)
topic_results.shape

(11992, 7)

In [41]:
# Topic scores for the first article, rounded to two decimals for readability.
print(topic_results[0].round(2))
# The index of the largest score is the most probable topic for this article.
print(topic_results[0].argmax())  # we want max probability topic

[0.   0.   0.   0.   0.02 0.3  0.68]
6


In [0]:
# Label every article with its highest-scoring topic (argmax across the topic columns of each row).
npr['Topic'] = topic_results.argmax(axis=1)

In [43]:
# Display the articles together with their assigned Topic label.
npr

Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",6
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",1
4,"From photography, illustration and video, to d...",2
...,...,...
11987,The number of law enforcement officers shot an...,1
11988,"Trump is busy these days with victory tours,...",4
11989,It’s always interesting for the Goats and Soda...,3
11990,The election of Donald Trump was a surprise to...,4
