In [1]:
# reference: https://www.kaggle.com/rajmehra03/topic-modelling-using-lda-and-lsa-in-sklearn
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# configure
# sets matplotlib to inline and displays graphs below the corresponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

# import nltk and fetch the NLTK resources this notebook reads
import nltk
nltk.download('punkt') # tokenizer models needed by word_tokenize / sent_tokenize
nltk.download('wordnet') # lexical database needed by WordNetLemmatizer
nltk.download('stopwords') # stop-word lists; without this, stopwords.words('english') raises LookupError on a fresh machine
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

# text data preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# English stop-words from NLTK, stored as a set so membership tests are cheap
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the headlines CSV and discard the publish date in one chained expression,
# so re-running the cell always rebuilds df from the file.
df = pd.read_csv('abcnews-date-text.csv').drop(columns=['publish_date'])
df.head()  # preview the first few rows

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [3]:
def clean_text(headline): # you can add more preprocessing functions here
    """Tokenise a headline, drop noise tokens, lemmatise the rest and re-join.

    A token survives only if it is not in the notebook-level ``stop_words``
    set and is longer than three characters. Surviving tokens are lemmatised
    with ``WordNetLemmatizer`` and joined with single spaces.

    Parameters
    ----------
    headline : str
        Raw headline text. A value that is not a string (for example the NaN
        that pandas produces for an empty CSV field) or that is blank yields
        an empty string instead of raising inside the tokenizer.

    Returns
    -------
    str
        The cleaned, space-separated headline; ``""`` when nothing survives.
    """
    # Guard first: word_tokenize only accepts text, and blank text has no tokens.
    if not isinstance(headline, str) or not headline.strip():
        return ""

    le = WordNetLemmatizer()
    word_tokens = word_tokenize(headline)
    tokens = [
        le.lemmatize(w)
        for w in word_tokens
        if w not in stop_words and len(w) > 3
    ]
    return " ".join(tokens)

# Clean every headline with clean_text and store the result in a new column.
cleaned_headlines = df['headline_text'].apply(clean_text)
df['headline_cleaned_text'] = cleaned_headlines
df.head()

Unnamed: 0,headline_text,headline_cleaned_text
0,aba decides against community broadcasting lic...,decides community broadcasting licence
1,act fire witnesses must be aware of defamation,fire witness must aware defamation
2,a g calls for infrastructure protection summit,call infrastructure protection summit
3,air nz staff in aust strike for pay rise,staff aust strike rise
4,air nz strike to affect australian travellers,strike affect australian traveller


In [4]:
# TF-IDF vectoriser: filters the NLTK stop words and keeps a 1000-term
# vocabulary (so the resulting matrix has 1000 columns).
vect = TfidfVectorizer(
    max_features=1000,
    stop_words=list(stop_words),
)
# max_features keeps the terms with the highest term frequency across the corpus.
# Note that a high term frequency does not make a word more informative or useful.

In [5]:
# Fit the vectoriser on the single cleaned-headline column and transform it
# into the TF-IDF document-term matrix.
headline_corpus = df['headline_cleaned_text']
vect_text = vect.fit_transform(headline_corpus)

In [6]:
# Show the matrix dimensions, then its stored (row, column) -> weight entries.
print(vect_text.shape, vect_text, sep="\n")

(1226258, 1000)
  (0, 507)	0.7830964759517771
  (0, 180)	0.6219002406752289
  (1, 575)	0.6350011874790689
  (1, 982)	0.634252507913514
  (1, 322)	0.44101842150367176
  (2, 850)	0.6547003749683041
  (2, 681)	0.6236747415657183
  (2, 124)	0.42707989387150563
  (3, 743)	0.4535400720072263
  (3, 842)	0.4901743730039168
  (3, 56)	0.5225164976545785
  (3, 826)	0.5301009307789318
  (4, 58)	0.6373967041659779
  (4, 842)	0.7705358145591605
  (5, 977)	1.0
  (6, 709)	1.0
  (7, 542)	0.5180441448099484
  (7, 345)	0.48078322560362474
  (7, 960)	0.535924910123952
  (7, 55)	0.46180325325287314
  (8, 452)	0.42931170443114947
  (8, 202)	0.34965626131197547
  (8, 775)	0.4483609681844147
  (8, 13)	0.5219542868074372
  (8, 56)	0.4690075948807512
  :	:
  (1226249, 831)	0.3778999040452349
  (1226250, 100)	0.5663193054263862
  (1226250, 941)	0.503939080598977
  (1226250, 548)	0.4622873009035714
  (1226250, 135)	0.4600198895370982
  (1226251, 344)	0.6080525617065659
  (1226251, 969)	0.7938967704948061
  (12262

In [7]:
# Grab the idf weights learned by the fitted vectoriser; the next cell pairs
# them with vect.get_feature_names_out() to label each weight with its term.
idf=vect.idf_

In [9]:
# Map every vocabulary term to its idf weight, then rank terms by ascending idf.
dd=dict(zip(vect.get_feature_names_out(), idf))
l=sorted(dd, key=dd.get)
print(l) # print out sorted output

# The list is long, so look only at the two extremes. The lowest idf belongs to
# the term found in the most headlines, the highest idf to the rarest term kept.
# Both terms are read from the ranking instead of being typed in by hand, so the
# cell stays correct when the data or max_features changes.
most_common, least_common = l[0], l[-1]
print(most_common, least_common)
print(dd[most_common])   # idf of the most common term
print(dd[least_common])  # idf of the least common term

police walk
4.440524277043323
7.922757250789557


In [10]:
# Now we finally reach the step for LDA. This step takes long.
# The original reference covers LSA too. Before LDA, LSA was the state of the art for document clustering.
# In recent years, few people use LSA and most people use LDA, so you can focus on LDA first.
from sklearn.decomposition import LatentDirichletAllocation
import time
# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by a
# system clock adjustment during the long fit.
start_time = time.perf_counter()

# the following is the key function for the basic LDA
lda_model=LatentDirichletAllocation(n_components=10,random_state=88,n_jobs=3) # n_jobs could be important now because LDA is slow
# parameters:
# n_components: the number of topics
# doc_topic_prior (float, default=None): prior of the document-topic distribution; None means 1 / n_components
# topic_word_prior (float, default=None): prior of the topic-word distribution; None means 1 / n_components
# learning_method ({'batch', 'online'}, default='batch'): if the data size is large, the online update will be much faster than the batch update.

# NOTE(review): vect_text holds TF-IDF weights. LDA is a model of word counts and
# scikit-learn's own examples feed it CountVectorizer output - confirm that
# TF-IDF input is intended here.

# the training step that takes a long time
lda_top=lda_model.fit_transform(vect_text)

print(lda_top.shape)  # (no_of_doc,no_of_topics)
print(lda_top)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

# also remember that this is only for one value of no. of topics and you may need to try several times to tune the number of topics.

(1226258, 10)
[[0.0415801  0.0415801  0.0415801  ... 0.0415801  0.0415801  0.0415801 ]
 [0.03689667 0.49289935 0.21189715 ... 0.03689967 0.03689847 0.03689695]
 [0.0369732  0.03696543 0.03697667 ... 0.03696401 0.03696574 0.38155244]
 ...
 [0.54332106 0.0334604  0.03345428 ... 0.03345415 0.03345461 0.18906484]
 [0.35235181 0.03341149 0.03341149 ... 0.03341252 0.03341202 0.22139288]
 [0.04151237 0.0415052  0.62647723 ... 0.04150434 0.04150474 0.04149814]]
--- 735.5526669025421 seconds ---


In [11]:
# Check whether the topic proportions of document 0 add up to 1.
# The total gets its own name so the built-in sum() is not overwritten for the
# rest of the notebook. Built-in sum() adds left to right starting from 0,
# exactly like a manual accumulation loop.
doc0_topic_total = sum(lda_top[0])
print(doc0_topic_total)
assert np.isclose(doc0_topic_total, 1.0), "topic proportions of a document should sum to 1"

1.0000000000000002


In [12]:
# Topic mixture of document 0, shown as one percentage per topic.
doc0_mixture = lda_top[0]
print("Document 0: ")
for topic_idx, share in enumerate(doc0_mixture):
    share_pct = share * 100
    print("Topic ", topic_idx, ": ", share_pct, "%")

Document 0: 
Topic  0 :  4.158009834670271 %
Topic  1 :  4.158009834687848 %
Topic  2 :  4.15800983466907 %
Topic  3 :  4.1580098346681185 %
Topic  4 :  4.158009834671056 %
Topic  5 :  4.1580099586966295 %
Topic  6 :  62.5779113639259 %
Topic  7 :  4.158009834672468 %
Topic  8 :  4.158009834669851 %
Topic  9 :  4.158009834668804 %


In [13]:
# Print the topic-word weight matrix and then its shape (no_of_topics, no_of_words).
# These weights are not normalised probabilities (entries can exceed 1); each
# row must be divided by its sum to read it as a distribution over words.
print(lda_model.components_, lda_model.components_.shape, sep="\n")

[[8.49925056e-01 8.98117340e-01 1.00006301e-01 ... 1.00005695e-01
  1.00003637e-01 1.00004235e-01]
 [1.00003536e-01 1.00001617e-01 1.00001913e-01 ... 1.00004652e-01
  1.00003116e-01 1.00004240e-01]
 [1.00002405e-01 1.00002166e-01 1.00003376e-01 ... 1.00005018e-01
  1.00003599e-01 1.00004349e-01]
 ...
 [1.00002182e-01 1.00002816e-01 1.00003197e-01 ... 1.00010387e-01
  1.00004223e-01 1.00002854e-01]
 [1.00002668e-01 1.00002094e-01 1.00003054e-01 ... 1.00004496e-01
  1.00002431e-01 1.00006169e-01]
 [1.00003285e-01 1.00002185e-01 1.00003830e-01 ... 1.00004737e-01
  8.51341721e+02 1.00004178e-01]]
(10, 1000)


In [17]:
# most important words (top 10) for each topic
vocab = vect.get_feature_names_out()
n_top_words = 10  # this is where you change the top 10 to topX

for topic_idx, topic_weights in enumerate(lda_model.components_):
    # rank every (word, weight) pair from the heaviest weight to the lightest
    ranked = sorted(zip(vocab, topic_weights), key=lambda pair: pair[1], reverse=True)
    print("Topic "+str(topic_idx)+": ")
    # each word is followed by one space, all on a single line
    print("".join(pair[0] + " " for pair in ranked[:n_top_words]), end="")
    print("\n")

# The reference also has a few more blocks of Word Cloud code.
# NOTE(review): the original author reported that the wordcloud package did not
# support Python 3.7 and above - verify against current wordcloud releases.

Topic 0: 

Topic 1: 
crash dy killed police north road flood still iraq driver 

Topic 2: 
water charged final show south canberra trial green australia drought 

Topic 3: 
court face charge report accused life defends jail indigenous find 

Topic 4: 
china talk rural fire work budget mine country national trump 

Topic 5: 
council open school market urged rate first want centre aussie 

Topic 6: 
interview world home coast gold community guilty back second shooting 

Topic 7: 
sydney brisbane move queensland storm future assault perth protest denies 

Topic 8: 
missing take change search industry family appeal found sale park 

Topic 9: 
election win lead jailed say law year australian land action 



In [18]:
# now we output goodness-of-fit measures
from pprint import pprint

# Log likelihood (approximate): higher is better.
# Perplexity: lower is better. Perplexity = exp(-1. * log-likelihood per word)
# Both are timed the same way, so one loop replaces two copies of the stanza.
# perf_counter is a monotonic clock, which suits measuring elapsed time.
for label, metric in (
    ("Log Likelihood: ", lda_model.score),
    ("Perplexity: ", lda_model.perplexity),
):
    start_time = time.perf_counter()
    print(label, metric(vect_text))
    print("--- %s seconds ---" % (time.perf_counter() - start_time))

# See model parameters
pprint(lda_model.get_params())

Log Likelihood:  -14374029.07542995
--- 88.814706325531 seconds ---
Perplexity:  1912.1684872552237
--- 86.81644320487976 seconds ---
{'batch_size': 128,
 'doc_topic_prior': None,
 'evaluate_every': -1,
 'learning_decay': 0.7,
 'learning_method': 'batch',
 'learning_offset': 10.0,
 'max_doc_update_iter': 100,
 'max_iter': 10,
 'mean_change_tol': 0.001,
 'n_components': 10,
 'n_jobs': 3,
 'perp_tol': 0.1,
 'random_state': 88,
 'topic_word_prior': None,
 'total_samples': 1000000.0,
 'verbose': 0}
