# Topic modeling

LDA (Latent Dirichlet Allocation) is a topic model based on a probability distribution (the Dirichlet distribution).

In [None]:
import pandas as pd

In [None]:
# Load the NPR articles into a DataFrame and preview the first rows.
# NOTE(review): this read uses a tab separator, while the NMF section further
# down reads the same file with the default (comma) separator -- confirm which
# one matches the file.
npr_csv_path = '../../pythongyak/UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv'
npr = pd.read_csv(npr_csv_path, sep='\t')
npr.head()

In [None]:
# number of articles = number of rows in the dataframe
npr.shape[0]

In [None]:
# length of the first article in characters
# (len() of a string counts characters, not tokens)
len(npr['Article'][0])

In [None]:
# 1 # preprocessing: vectorisation (raw text -> token counts)
from sklearn.feature_extraction.text import CountVectorizer

# max_df=0.9           -> ignore terms found in more than 90% of the documents
# min_df=2             -> ignore terms found in fewer than 2 documents
# stop_words='english' -> remove common English stop words
vectorizer_options = {'max_df': 0.9, 'min_df': 2, 'stop_words': 'english'}
cv = CountVectorizer(**vectorizer_options)

In [None]:
# document-term matrix: learn the vocabulary from the article column
# and count the terms of every article in one step
articles = npr['Article']
dtm = cv.fit_transform(articles)

In [None]:
# display the document-term matrix produced by cv.fit_transform
# (presumably a sparse matrix whose repr reports shape and stored elements -- verify)
dtm

In [None]:
# 2 # perform LDA
from sklearn.decomposition import LatentDirichletAllocation

# parameters: nr of components (~ topics), random state
# random_state is fixed so that the fitted topics (and their numbering)
# are the same on every run of the notebook
LDA = LatentDirichletAllocation(n_components=7, random_state=42)

In [None]:
# fit the LDA model to the document-term matrix
# (the cell output is whatever fit() returns)
LDA.fit(dtm)

In [None]:
# 3 # grab vocab of words

# nr of terms in the fitted vocabulary
# (get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
#  get_feature_names_out() is its replacement)
len(cv.get_feature_names_out())

In [None]:
# container type of the vocabulary; get_feature_names_out() replaces the
# removed get_feature_names() and returns a numpy array instead of a list
type(cv.get_feature_names_out())

In [None]:
# look up a single term by its column index in the document-term matrix
# (get_feature_names_out() replaces the removed get_feature_names();
#  str() turns the numpy string into a plain Python string for display)
str(cv.get_feature_names_out()[30012])

In [None]:
# get a random word
import random

# vocabulary of the fitted vectoriser (replacement for the removed get_feature_names())
feature_names = cv.get_feature_names_out()

# randrange excludes its upper bound, so the id is always a valid index;
# the previous random.randint(0, 54777) was inclusive on both ends and
# depended on a hard-coded vocabulary size
random_word_id = random.randrange(len(feature_names))
str(feature_names[random_word_id])

In [None]:
# 4 # grab topics

# nr of topics = number of rows in the components array
LDA.components_.shape[0]

In [None]:
# LDA.components_ is a numpy array holding one weight per (topic, word) pair.
# NOTE(review): per the scikit-learn docs these are unnormalised topic-word
# weights (pseudo-counts), not probabilities; a higher value means the word is
# more strongly associated with the topic -- confirm against the docs.
type(LDA.components_)

In [None]:
# display the raw topic-word weight array
LDA.components_

In [None]:
# 5 # grab the highest-weighted words per topic

# word weights of the first topic (row 0 of the components array)
single_topic = LDA.components_[0]

# argsort returns the index positions that would sort the values in ascending
# order (see the small example in the next cell), so the LAST positions point
# at the highest weights -- the best candidate words for this topic
single_topic.argsort()

# these index positions are the same as the positions in the vectoriser's vocabulary

In [None]:
# worked example of argsort
import numpy as np

arr = np.array([10, 200, 1])
# index positions that sort the values in ascending order:
# array([2, 0, 1]) ---> values [1, 10, 200]
np.argsort(arr)

In [None]:
# argsort returns index positions from least to greatest
# we want the 10 greatest values --> grab the last 10 index positions
# (the slice used to be [-20:], which contradicted both the variable name
#  and the note that this returns the ten most probable words)
top_ten_words = single_topic.argsort()[-10:]

In [None]:
# get words at index
# fetch the vocabulary once instead of rebuilding it on every iteration
# (get_feature_names_out() replaces the removed get_feature_names())
feature_names = cv.get_feature_names_out()
for index in top_ten_words:
    print(feature_names[index])

In [None]:
# top 20 words of each topic
n_top_words = 20  # single source of truth for both the header text and the slice

# fetch the vocabulary once instead of once per printed word
# (get_feature_names_out() replaces the removed get_feature_names())
feature_names = cv.get_feature_names_out()

for topic_id, topic in enumerate(LDA.components_):
    # the header previously said "Top 15" while 20 words were printed
    print(f"Top {n_top_words} words for topic #{topic_id}")
    top_word_ids = topic.argsort()[-n_top_words:]
    # tolist() yields plain Python strings so the printed list stays readable
    print(feature_names[top_word_ids].tolist())
    print("\n")



In [None]:
# attach topic number to original articles
# inputs: dtm (the original document-term matrix) and npr (the original dataframe)
# (they used to be written as bare expression statements, which do nothing
#  when they are not the last line of a cell)

# topic scores of every article: one row per document, one column per topic
topic_results = LDA.transform(dtm)

In [None]:
# probabilities of articles belonging to each topic
# (one 7-dim row per article; expected shape (11992, 7) -- the same shape is
#  quoted in the cell that adds the 'Topic' column)
topic_results.shape

In [None]:
# topic probabilities of the first article (values between 0 and 1, rounded to 3 decimals)
# in the original run the largest value was at topic 2 (~ political topic);
# NOTE(review): the topic numbering depends on the fitted model and can differ between fits
topic_results[0].round(3)

In [None]:
# sanity check: read the text of the first article (row label 0)
npr.loc[0, 'Article']

In [None]:
# returns the index position (topic nr) of the highest probability
topic_results[0].argmax()

In [None]:
# add topic number column to original dataframe
# argmax(axis=1) runs along the topic axis (the 7 in topic_results.shape => (11992, 7)),
# i.e. it returns, for every article, the index of its highest-scoring topic
npr['Topic'] = topic_results.argmax(axis=1)
npr


Non-negative Matrix Factorisation:

simultaneous dimensionality reduction and clustering

The input is a non-negative data matrix, A (here: dtm).
Choose the number of basis vectors, k (the number of topics).
Initialise W and H as random (non-negative) matrices, then update them iteratively so that A ≈ WH.


In [None]:
# pandas was already imported as pd at the top of the notebook
# re-read the articles so that this section starts from a fresh dataframe
npr_csv_path = '../../pythongyak/UPDATED_NLP_COURSE/05-Topic-Modeling/npr.csv'
npr = pd.read_csv(npr_csv_path)

In [None]:
# Outline of the NMF recipe:
# 1 # construct a vector space model for the docs --> td-matrix
# 2 # apply tf-idf term weight normalisation on the td-matrix
# 3 # normalise the tf-idf vectors to unit length
# 4 # initialise the factors using non-negative double singular value decomposition (NNDSVD) on the td-matrix

# TfidfVectorizer combines the steps of CountVectorizer and TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_options = {'max_df': 0.95, 'min_df': 2, 'stop_words': 'english'}
tfidf = TfidfVectorizer(**tfidf_options)

# still called dtm, although the entries are tf-idf weights rather than raw counts
articles = npr['Article']
dtm = tfidf.fit_transform(articles)
dtm

In [None]:
# 5 # apply non-negative matrix factorisation (NMF) on the tf-idf matrix
# NOTE(review): the original note said "projected gradient"; the scikit-learn NMF
# docs list coordinate descent ('cd') and multiplicative update ('mu') solvers -- confirm.
# --> topic-term weights (nmf_model.components_): how strongly each word belongs to a topic;
#     the loop below iterates over it one topic at a time. These are non-negative
#     weights, presumably not probabilities -- verify.
# --> document-topic weights (nmf_model.transform(dtm)): membership weights of the
#     documents relative to each topic (each doc is a row)
from sklearn.decomposition import NMF

nmf_model = NMF(n_components=7, random_state=42)
nmf_model.fit(dtm)

In [None]:
# get a specific token by its column index
# (get_feature_names_out() replaces the removed get_feature_names();
#  str() turns the numpy string into a plain Python string for display)
str(tfidf.get_feature_names_out()[2330])

In [None]:
# show top 15 words per topic
n_top_words = 15  # used for both the header text and the slice

# fetch the vocabulary once instead of once per printed word
# (get_feature_names_out() replaces the removed get_feature_names())
feature_names = tfidf.get_feature_names_out()

for topic_id, topic in enumerate(nmf_model.components_):
    print(f"Top {n_top_words} words for topic #{topic_id}")
    # argsort is ascending, so the last positions hold the largest weights
    top_word_ids = topic.argsort()[-n_top_words:]
    # tolist() yields plain Python strings so the printed list stays readable
    print(feature_names[top_word_ids].tolist())
    print("\n")

In [None]:
# attach topic number to dataframe
# document-topic weights: one row per article
topic_results = nmf_model.transform(dtm)

# for every article keep the index of the topic with the largest weight
dominant_topic = topic_results.argmax(axis=1)
npr['Topic number'] = dominant_topic
npr.head()

In [None]:
# attach topic label to dataframe
# topic number -> human-readable label
# (presumably chosen by reading the top words printed for each topic -- verify after refitting)
my_topic_dict = {
    0: 'Health',
    1: 'Campaign',
    2: 'Legislation',
    3: 'Foreign politics',
    4: 'Election',
    5: 'Music',
    6: 'Education',
}
topic_labels = npr['Topic number'].map(my_topic_dict)
npr['Topic'] = topic_labels
npr.head()