# Topic modeling

LDA - Latent Dirichlet Allocation is based on probability dristribution (Dirichlet distribution)

In [1]:
import pandas as pd

In [2]:
npr = pd.read_csv('../../pythongyak/UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv', sep='\t')
npr.head()

Unnamed: 0,Article
0,"In the Washington of 2016, even when the polic..."
1,Donald Trump has used Twitter — his prefe...
2,Donald Trump is unabashedly praising Russian...
3,"Updated at 2:50 p. m. ET, Russian President Vl..."
4,"From photography, illustration and video, to d..."


In [3]:
# nr of articles
len(npr)

11992

In [4]:
# nr of tokens is frist article
len(npr['Article'][0])

7646

In [5]:
# 1 # preprocessing: vectorisation
from sklearn.feature_extraction.text import CountVectorizer

# ignore terms with too high doc frequency or too low (ratio 0-1 or an integer)
# remove stopwords
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words='english')

In [6]:
# document term matrix = vectorised article column
dtm = cv.fit_transform(npr['Article'])

In [7]:
dtm

<11992x54777 sparse matrix of type '<class 'numpy.int64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [10]:
# 2 # perform LDA
from sklearn.decomposition import LatentDirichletAllocation

# parameters: nr of components (~ topics), random state
LDA = LatentDirichletAllocation(n_components=7)

In [11]:
# fit lda to our dt-matrix
LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=None,
                          topic_word_prior=None, total_samples=1000000.0,
                          verbose=0)

In [12]:
# 3 # grab vocab of words

# nr of terms
len(cv.get_feature_names())

54777

In [13]:
type(cv.get_feature_names())

list

In [14]:
cv.get_feature_names()[30012]

'manufactured'

In [15]:
# get a random word 
import random

random_word_id = random.randint(0,54777)
cv.get_feature_names()[random_word_id]

'criminals'

In [16]:
# 4 # grab topics

# nr of topics
len(LDA.components_)

5

In [17]:
# LDA.components_ is a numpy array containing probabilities for each word
type(LDA.components_)

numpy.ndarray

In [18]:
LDA.components_

array([[3.64269474e+01, 1.98820984e+02, 3.19983164e+00, ...,
        2.10803449e-01, 2.19338974e+00, 2.19960082e+00],
       [1.66772895e+01, 1.19512717e+03, 2.00000384e-01, ...,
        6.18064519e+00, 2.03041787e-01, 2.00000076e-01],
       [1.33338035e+01, 2.57588025e+03, 2.00167134e-01, ...,
        2.08278892e-01, 2.03528961e-01, 2.00000101e-01],
       [2.71438829e+01, 6.26019195e+02, 2.00000394e-01, ...,
        2.00028497e-01, 2.00000181e-01, 2.00294315e-01],
       [1.41807662e+00, 8.66152407e+02, 2.00000445e-01, ...,
        2.00243968e-01, 2.00039332e-01, 2.00104689e-01]])

In [19]:
# 5 # grab highest probability words per topic

single_topic = LDA.components_[0]

# returns sorted array with elements' original index position (see example below)
# shows location of higher values in LDA.comp -- higher probabilities -- better candidates for the topic
single_topic.argsort()

# use the index positions (that are same as in cv.get_feature_names)

array([11845, 49523, 49219, ..., 42993, 26752, 28659])

In [20]:
# example
import numpy as np

arr =  np.array([10, 200,1])
arr.argsort()
# returns array([2, 0, 1]) ---> [1, 10, 200]

array([2, 0, 1])

In [21]:
# argsort returns index positions from least to greatest
# we are looking for 10/20 greatest values --> grab last 10/20 values
# that returns index position of ten most probable words
top_ten_words = single_topic.argsort()[-20:]

In [22]:
# get words at index
for index in top_ten_words:
    print(cv.get_feature_names()[index])

black
things
story
going
ve
don
music
world
years
life
really
way
think
know
new
time
people
says
just
like


In [24]:
# top 20 words of each topic
for index,topic in enumerate(LDA.components_):
    print(f"Top 15 words for topic #{index}")
    print([cv.get_feature_names()[index] for index in topic.argsort()[-20:]])
    print("\n")


Top 15 words for topic #0
['black', 'things', 'story', 'going', 've', 'don', 'music', 'world', 'years', 'life', 'really', 'way', 'think', 'know', 'new', 'time', 'people', 'says', 'just', 'like']


Top 15 words for topic #1
['use', 'make', 'research', 'percent', 'care', 'years', 'don', 'time', 'university', 'students', 'children', 'food', 'new', 'study', 'school', 'just', 'like', 'health', 'people', 'says']


Top 15 words for topic #2
['world', 'companies', 'north', 'million', 'country', 'like', 'china', 'city', 'just', 'water', 'year', '000', 'company', 'government', 'state', 'years', 'said', 'new', 'people', 'says']


Top 15 words for topic #3
['law', 'white', 'new', 'russia', 'department', 'state', 'security', 'reports', 'house', 'government', 'court', 'news', 'npr', 'told', 'says', 'people', 'police', 'president', 'trump', 'said']


Top 15 words for topic #4
['house', 'like', 'health', 'voters', 'party', 'election', 'vote', 'states', 'just', 'obama', 'new', 'republican', 'percent', 

In [25]:
# attach topic number to original articles
dtm # original dt-matrix
npr # original dataframe

topic_results = LDA.transform(dtm)

In [26]:
# probabilities of articles belonging to each topic
# (one 7-dim row for each of the 119992 article)
topic_results.shape

(11992, 5)

In [27]:
# probabilities of first article with actual percentages (rounded to 3 decimal)
# probably belong to topic 2 ~ political topic
topic_results[0].round(3)

array([0.   , 0.   , 0.   , 0.651, 0.348])

In [28]:
# we can confirm by taking a look at the actual article
npr['Article'][0]

'In the Washington of 2016, even when the policy can be bipartisan, the politics cannot. And in that sense, this year shows little sign of ending on Dec. 31. When President Obama moved to sanction Russia over its alleged interference in the U. S. election just concluded, some Republicans who had long called for similar or more severe measures could scarcely bring themselves to approve. House Speaker Paul Ryan called the Obama measures ”appropriate” but also ”overdue” and ”a prime example of this administration’s ineffective foreign policy that has left America weaker in the eyes of the world.” Other GOP leaders sounded much the same theme. ”[We have] been urging President Obama for years to take strong action to deter Russia’s worldwide aggression, including its   operations,” wrote Rep. Devin Nunes,  . chairman of the House Intelligence Committee. ”Now with just a few weeks left in office, the president has suddenly decided that some stronger measures are indeed warranted.” Appearing 

In [29]:
# returns index position (topic nr) of highest probabilty
topic_results[0].argmax()

3

In [30]:
# add topic number column to original dataframe
# (axis 1 --> 7 from topic_results.shape => (11992, 7))
# ()
npr['Topic'] = topic_results.argmax(axis=1)
npr


Unnamed: 0,Article,Topic
0,"In the Washington of 2016, even when the polic...",3
1,Donald Trump has used Twitter — his prefe...,3
2,Donald Trump is unabashedly praising Russian...,3
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",2
5,I did not want to join yoga class. I hated tho...,1
6,With a who has publicly supported the debunk...,1
7,"I was standing by the airport exit, debating w...",0
8,"If movies were trying to be more realistic, pe...",1
9,"Eighteen years ago, on New Year’s Eve, David F...",0


Non-negative Matrix Factorisation:

simultaneous dimensionality reduction and clustering

the input is a non-negative data matrix, A (here: dtm).
nr of basis vectors, k (number of topics)
initialise W and H as random matrices


In [65]:
# import pandas as pd
npr = pd.read_csv('../../pythongyak/UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv')

In [66]:
# 1 # construct vector space model for docs --> td-matrix
# 2 #  apply tf-idf term weight normalisation on the td-matrix
# 3 # normalise tf-idf vectors to unit length
# 4 # initialise factors using non-negative double singular value decomposition (NNDSVD) on the td-matrix
from sklearn.feature_extraction.text import TfidfVectorizer # combines the steps of CountVectorizer and TfidfTransformer

tfidf = TfidfVectorizer(max_df=0.95, min_df=2, stop_words='english')
dtm = tfidf.fit_transform(npr['Article']) # call it dtm, although it is more than that as we used tfidf
dtm

<11992x54777 sparse matrix of type '<class 'numpy.float64'>'
	with 3033388 stored elements in Compressed Sparse Row format>

In [67]:
# 5 # apply projected gradient nonneg matrix factorisation (NMF) on the td-matrix
# --> basis vectors, W (topics/clusters) -- prob. of words belonging to a topic (each word is a row)
# --> coefficient matrix: membership weights for documents relative to each cluster -- prop of docs belongign to a topic (each doc is a row)
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=7, random_state=42)
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=7, random_state=42, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [68]:
# get a specific token
tfidf.get_feature_names()[2330]

'alcove'

In [69]:
# show top 15 words per topic
for index,topic in enumerate(nmf_model.components_):
    print(f"Top 15 words for topic #{index}")
    print([tfidf.get_feature_names()[i] for i in topic.argsort()[-15:]])
    print("\n")

Top 15 words for topic #0
['new', 'research', 'like', 'patients', 'health', 'disease', 'percent', 'women', 'virus', 'study', 'water', 'food', 'people', 'zika', 'says']


Top 15 words for topic #1
['gop', 'pence', 'presidential', 'russia', 'administration', 'election', 'republican', 'obama', 'white', 'house', 'donald', 'campaign', 'said', 'president', 'trump']


Top 15 words for topic #2
['senate', 'house', 'people', 'act', 'law', 'tax', 'plan', 'republicans', 'affordable', 'obamacare', 'coverage', 'medicaid', 'insurance', 'care', 'health']


Top 15 words for topic #3
['officers', 'syria', 'security', 'department', 'law', 'isis', 'russia', 'government', 'state', 'attack', 'president', 'reports', 'court', 'said', 'police']


Top 15 words for topic #4
['primary', 'cruz', 'election', 'democrats', 'percent', 'party', 'delegates', 'vote', 'state', 'democratic', 'hillary', 'campaign', 'voters', 'sanders', 'clinton']


Top 15 words for topic #5
['love', 've', 'don', 'album', 'way', 'time', 'so

In [70]:
# attach topic number to dataframe
topic_results = nmf_model.transform(dtm)
npr['Topic number'] = topic_results.argmax(axis=1)
npr.head()

Unnamed: 0,Article,Topic number
0,"In the Washington of 2016, even when the polic...",1
1,Donald Trump has used Twitter — his prefe...,1
2,Donald Trump is unabashedly praising Russian...,1
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3
4,"From photography, illustration and video, to d...",6


In [71]:
# attach topic label to dataframe
my_topic_dict = {0:'Health',1:'Campaign',2:'Legislation', 3:'Foreign politics', 4:'Election', 5:'Music', 6:'Education'}
npr['Topic'] = npr['Topic number'].map(my_topic_dict)
npr.head()

Unnamed: 0,Article,Topic number,Topic
0,"In the Washington of 2016, even when the polic...",1,Campaign
1,Donald Trump has used Twitter — his prefe...,1,Campaign
2,Donald Trump is unabashedly praising Russian...,1,Campaign
3,"Updated at 2:50 p. m. ET, Russian President Vl...",3,Foreign politics
4,"From photography, illustration and video, to d...",6,Education
