In [1]:
# Reference: https://www.machinelearningplus.com/nlp/topic-modeling-python-sklearn-examples/#9buildldamodelwithsklearn
# Compared with the first version of this code, the main difference is the grid search and the performance check.
# The first version has no grid search and no parameter tuning.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
# Configure plotting.
# Set matplotlib to inline mode so graphs are displayed below the corresponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#import nltk
import nltk
# Fetch every NLTK resource this notebook relies on; a resource that is already
# installed is reported as up-to-date and not downloaded again.
nltk.download('punkt')      # tokenizer models used by word_tokenize / sent_tokenize
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer
nltk.download('stopwords')  # stop-word lists read by stopwords.words('english')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize

#preprocessing
from nltk.corpus import stopwords  #stopwords
from nltk import word_tokenize,sent_tokenize # tokenizing
from nltk.stem import PorterStemmer,LancasterStemmer  # using the Porter Stemmer and Lancaster Stemmer and others
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer  # lammatizer from WordNet

# for named entity recognition (NER)
from nltk import ne_chunk

# vectorizers for creating the document-term-matrix (DTM)
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# English stop words, stored as a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Read the headlines and discard the publish date column in one step.
df = pd.read_csv('abcnews-date-text2.csv').drop(columns=['publish_date'])
df.head()

Unnamed: 0,headline_text
0,aba decides against community broadcasting lic...
1,act fire witnesses must be aware of defamation
2,a g calls for infrastructure protection summit
3,air nz staff in aust strike for pay rise
4,air nz strike to affect australian travellers


In [3]:
def clean_text(headline):
    """Tokenize a headline, filter its tokens, and return the lemmas as one string.

    A token is discarded when it is in the module-level ``stop_words`` set or
    when it has three characters or fewer. The remaining tokens are lemmatized
    with ``WordNetLemmatizer`` and joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    kept = []
    for token in word_tokenize(headline):
        # Skip stop words and very short tokens before lemmatizing.
        if token in stop_words or len(token) <= 3:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Clean every headline and store the result in a new column next to the original text.
df['headline_cleaned_text'] = df['headline_text'].map(clean_text)
df.head()

Unnamed: 0,headline_text,headline_cleaned_text
0,aba decides against community broadcasting lic...,decides community broadcasting licence
1,act fire witnesses must be aware of defamation,fire witness must aware defamation
2,a g calls for infrastructure protection summit,call infrastructure protection summit
3,air nz staff in aust strike for pay rise,staff aust strike rise
4,air nz strike to affect australian travellers,strike affect australian traveller


In [4]:
# Build the TF-IDF vectorizer: filter English stop words and cap the vocabulary at 1000 features.
vect = TfidfVectorizer(
    stop_words=list(stop_words),
    max_features=1000,
)
# The 1000 features are selected by term frequency across the corpus.
# Note that a high term frequency does not by itself make a word more informative or useful.

In [5]:
# Fit the vectorizer on the cleaned headlines (a single text column) and
# transform them into the sparse TF-IDF document-term matrix.
vect_text=vect.fit_transform(df['headline_cleaned_text'])

In [6]:
# Matrix dimensions: (number of documents, number of features).
print(vect_text.shape)
# Printing the sparse matrix lists each stored entry as (row, column) followed by its weight.
print(vect_text)

(10000, 1000)
  (0, 185)	1.0
  (1, 588)	0.6031800134912962
  (1, 986)	0.6588637628432997
  (1, 336)	0.44952465264611235
  (2, 861)	0.8106307910603304
  (2, 126)	0.5855576150857429
  (3, 752)	0.4889238857866728
  (3, 853)	0.4863379469158838
  (3, 56)	0.45864351880691284
  (3, 835)	0.5604239091556644
  (4, 58)	0.698363641497009
  (4, 853)	0.7157431272705573
  (5, 480)	0.7779036147633102
  (5, 980)	0.6283836138364648
  (6, 722)	1.0
  (7, 560)	0.48375607567921375
  (7, 357)	0.4778450080088669
  (7, 963)	0.5468122860381636
  (7, 55)	0.4885084762848749
  (8, 464)	0.3179975937713466
  (8, 207)	0.38292752999874563
  (8, 789)	0.47175282114458056
  (8, 10)	0.5750962103107868
  (8, 56)	0.44604670355579795
  (9, 57)	1.0
  :	:
  (9993, 384)	0.6973065832576476
  (9994, 122)	0.6388400214294557
  (9994, 454)	0.555951858264886
  (9994, 80)	0.5317903330372156
  (9995, 725)	0.5555639202730687
  (9995, 20)	0.5451654166859932
  (9995, 388)	0.3392883696219227
  (9995, 964)	0.38768844765112065
  (9995, 646)	

In [7]:
# IDF weights learned by the fitted vectorizer.
idf=vect.idf_

In [8]:
# Map each vocabulary term to its IDF weight, then order the terms from lowest to highest IDF.
dd = dict(zip(vect.get_feature_names_out(), idf))
l = sorted(dd, key=dd.get)
# First and last term of that ordering: the lowest-IDF and the highest-IDF term.
print(l[0], l[-1])
# A lower IDF means the term occurs in more headlines, so 'police' is more common than 'walk'.
print(dd['police'])
print(dd['walk'])

iraq williams
4.451538593099235
7.571383037361257


In [9]:
# Check sparsity: what percentage of the cells are non-zero?
# Convert the sparse matrix to a dense one so its cells can be compared directly.
data_dense = vect_text.todense()

# "Sparsicity" here is the percentage of non-zero cells.
nonzero_cells = (data_dense > 0).sum()
print("Sparsicity: ", (nonzero_cells/data_dense.size)*100, "%")

Sparsicity:  0.29568 %


In [10]:
###############################################
# This is where the difference is: Grid Search #
###############################################
# This step takes a long time.
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
import time
start_time = time.time()

# Parameter grid explored by the search.
# learning_decay is not important for LDA; it is included only to illustrate how a
# two-dimensional parameter grid is coded. To save time only 15 and 30 topics are
# checked; in practice you may need many more, e.g. 10 to 30 in increments of 5.
search_params = dict(n_components=[15, 30], learning_decay=[.5, .9])

# Base estimator handed to the grid search.
lda = LatentDirichletAllocation(
    n_components=20,           # number of topics; the grid search substitutes its own values
    learning_method='online',
    batch_size=128,            # n docs in each learning iter
    max_iter=10,               # max learning iterations
    evaluate_every=-1,         # compute perplexity every n iters; -1 means never
    random_state=100,          # fixed random state
    n_jobs=-1,                 # use all available CPUs
)

# Set up the grid search; this is the key step of the grid-search specification.
model = GridSearchCV(estimator=lda, param_grid=search_params)

# Run the grid search on the TF-IDF matrix.
model.fit(vect_text)

print("--- %s seconds ---" % (time.time() - start_time))

--- 393.62593364715576 seconds ---


In [11]:
# Best model found by the grid search
best_lda_model = model.best_estimator_

# Parameter combination of the best model
print("Best Model's Params: ", model.best_params_)

# Best score recorded by the grid search (reported here as a log-likelihood score)
print("Best Log Likelihood Score: ", model.best_score_)

# Perplexity of the best model evaluated on the full TF-IDF matrix
print("Model Perplexity: ", best_lda_model.perplexity(vect_text))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 15}
Best Log Likelihood Score:  -32025.260201018093
Model Perplexity:  4136.335817524772


In [12]:
# Create Document-Topic Matrix that is much easier to browse in excel.
# Rows are documents and columns are topics (see the index/column names below).
lda_output = best_lda_model.transform(vect_text)

# column names
topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

# index names
docnames = ["Doc" + str(i) for i in range(len(df))]

# Make the pandas dataframe; weights are rounded to 2 decimals for display only
df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

# Get dominant topic for each document from the UNROUNDED weights: after rounding,
# two close weights can become equal and argmax would then return the lower topic
# index regardless of which weight was actually larger.
dominant_topic = np.argmax(lda_output, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

# Styling
def color_green(val):
    """Return a CSS color rule: green when ``val`` exceeds 0.1, black otherwise."""
    shade = 'black'
    if val > .1:
        shade = 'green'
    return 'color: {col}'.format(col=shade)

def make_bold(val):
    """Return a CSS font-weight rule: 700 when ``val`` exceeds 0.1, 400 otherwise."""
    heaviness = 400
    if val > .1:
        heaviness = 700
    return 'font-weight: {weight}'.format(weight=heaviness)

# Style the first 15 documents: cells above 0.1 are shown in green and bold.
df_document_topics = (
    df_document_topic
    .head(15)
    .style
    .applymap(color_green)
    .applymap(make_bold)
)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,Topic10,Topic11,Topic12,Topic13,Topic14,dominant_topic
Doc0,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.53,0.03,0.03,0.03,0.03,0.03,9
Doc1,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.49,0.02,0.02,0.02,0.02,0.02,0.19,0.02,7
Doc2,0.03,0.27,0.03,0.37,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,3
Doc3,0.02,0.02,0.02,0.02,0.02,0.02,0.18,0.19,0.02,0.02,0.02,0.02,0.36,0.02,0.02,12
Doc4,0.03,0.03,0.03,0.03,0.03,0.03,0.61,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,6
Doc5,0.03,0.03,0.03,0.03,0.03,0.61,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,5
Doc6,0.53,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0
Doc7,0.02,0.02,0.37,0.02,0.02,0.34,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,2
Doc8,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.02,0.71,0.02,0.02,12
Doc9,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.03,0.53,0.03,0.03,0.03,0.03,0.03,0.03,8


In [13]:
# Count how many documents have each topic as their dominant topic.
# This is only an overview of the topic distribution, to help judge whether it makes sense.
topic_counts = df_document_topic['dominant_topic'].value_counts()
df_topic_distribution = topic_counts.reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,0,1060
1,2,874
2,5,798
3,13,759
4,3,739
5,4,710
6,1,640
7,8,589
8,9,584
9,7,568


In [15]:
# Topic-keyword matrix: one row per topic, one column per vocabulary term.
df_topic_keywords = pd.DataFrame(
    best_lda_model.components_,
    index=topicnames,
    columns=vect.get_feature_names_out(),
)

# Show the first rows.
df_topic_keywords.head()

Unnamed: 0,abattoir,aboriginal,abuse,accc,access,accident,accused,accuses,across,action,...,wood,work,worker,world,would,wounded,year,youth,zimbabwe,zone
Topic0,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,...,0.066667,0.066667,0.066667,60.89506,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
Topic1,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,17.226375,...,0.066667,0.066667,0.066667,0.066667,0.066667,9.960889,0.066667,0.066667,0.066667,0.066667
Topic2,0.066667,7.951966,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,...,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
Topic3,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,...,8.037254,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667
Topic4,0.066667,0.066667,0.066667,0.066667,0.066667,23.809532,0.066667,0.066667,0.066667,0.066667,...,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667,0.066667


In [17]:
# Show the top-n keywords for each topic.
# This is an important step: it lets a human check the quality of the output and interpret the topics.
# Bear in mind that topic modeling does not tell you the meaning of each topic.
def show_topics(vectorizer=vect, lda_model=model, n_words=20):
    """Return the highest-weighted keywords of every topic.

    Parameters
    ----------
    vectorizer : fitted vectorizer providing ``get_feature_names_out()``.
    lda_model : fitted model providing ``components_``, or a fitted search
        object providing ``best_estimator_`` (which is then used instead).
    n_words : int
        Number of keywords to keep per topic.

    Returns
    -------
    list
        One array of keywords per topic, ordered by descending topic weight.
    """
    # Read the vocabulary from the vectorizer that was passed in rather than
    # from the global `vect`, so the parameter is actually honoured.
    keywords = np.array(vectorizer.get_feature_names_out())
    # The default `model` is the grid-search wrapper, which has no `components_`;
    # unwrap it to its best estimator. Plain estimators pass through unchanged.
    lda_model = getattr(lda_model, "best_estimator_", lda_model)
    topic_keywords = []
    for topic_weights in lda_model.components_:
        # Negate the weights so argsort yields indices in descending weight order.
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

topic_keywords = show_topics(vectorizer=vect, lda_model=best_lda_model, n_words=15)

# Topic-keywords table: one row per topic, one column per keyword rank.
# Note: this rebinds df_topic_keywords, replacing the topic-keyword weight matrix.
df_topic_keywords = pd.DataFrame(topic_keywords)
n_topics, n_top_words = df_topic_keywords.shape
df_topic_keywords.columns = ['Word '+str(i) for i in range(n_top_words)]
df_topic_keywords.index = ['Topic '+str(i) for i in range(n_topics)]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,world,first,clash,british,hold,fear,tour,port,record,cross,talk,china,campaign,decision,meeting
Topic 1,call,force,death,killed,fighting,wont,fund,toll,site,funding,push,victim,poll,civilian,loss
Topic 2,troop,report,hope,begin,home,concern,farmer,drought,service,coast,seek,rain,melbourne,despite,thriller
Topic 3,plan,open,charged,boost,dead,chemical,raid,union,drug,industry,student,chief,cost,power,water
Topic 4,anti,take,crash,green,military,injured,head,airport,rally,accident,team,kill,turn,brisbane,possible
Topic 5,claim,three,case,centre,change,start,help,school,win,nats,tell,leave,king,reject,match
Topic 6,lead,australian,missing,blast,final,still,move,strike,opposition,family,south,continues,state,land,road
Topic 7,hospital,water,inquiry,study,bomb,title,rise,virus,threat,advance,expected,second,waterfall,give,iraq
Topic 8,sars,urged,face,australia,court,charge,korea,probe,worker,murder,find,mine,bushfires,injury,assault
Topic 9,saddam,protest,found,coalition,support,denies,election,dy,community,season,train,player,bomber,without,party
