In [19]:
import pandas as pd
import numpy as np


In [20]:
df = pd.read_excel(r'sample_articles.xlsx')

In [21]:
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

In [22]:
#import nltk
#nltk.download('all')

In [23]:
df.head()

Unnamed: 0,topic,title,article,date,link
0,BUSINESS,"MAHB faces double whammy of interstate ban, Tu...",KUALA LUMPUR: Affin Hwang Capital expects Mala...,"May 4, 2021 @ 10:40am",https://www.nst.com.my/business/2021/05/687692...
1,BUSINESS,RM opens at 4.09 against USD,KUALA LUMPUR: The ringgit opened easier agains...,"May 3, 2021 @ 10:09am",https://www.nst.com.my/business/2021/05/687398...
2,COLUMNISTS,Biden needs fresh thinking and more than one term,FORGET about the first 100 days. US President ...,"May 3, 2021 @ 12:10am",https://www.nst.com.my/opinion/columnists/2021...
3,SUNDAY VIBES,MONEY THOUGHTS: Ikigai — The uncommon key to c...,HAVE you noticed how life's getting tougher on...,"May 2, 2021 @ 8:30am",https://www.nst.com.my/lifestyle/sunday-vibes/...
4,BUSINESS,"Labuan FSA appoints new chairman, director gen...",KUALA LUMPUR: Labuan Financial Services Author...,"Apr 30, 2021 @ 4:15pm",https://www.nst.com.my/business/2021/04/686731...


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    107 non-null    object
 1   title    107 non-null    object
 2   article  107 non-null    object
 3   date     107 non-null    object
 4   link     107 non-null    object
dtypes: object(5)
memory usage: 4.3+ KB


In [25]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [26]:
df['article_clean']=df['article'].apply(clean)

In [27]:
dictionary = corpora.Dictionary(df['article_clean'])
print(dictionary.num_nnz)

20580


In [28]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['article_clean'] ]
print(len(doc_term_matrix))

107


In [29]:
lda = gensim.models.ldamodel.LdaModel

In [30]:
num_topics= 14
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

Wall time: 5.75 s


In [31]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.012*"per" + 0.012*"cent" + 0.011*"said" + 0.009*"malaysia" + 0.007*"also" + 0.006*"year" + 0.006*"country" + 0.006*"billion" + 0.006*"government" + 0.006*"digital"'),
 (1,
  '0.007*"back" + 0.005*"education" + 0.005*"—" + 0.005*"million" + 0.004*"year" + 0.004*"last" + 0.004*"rajan" + 0.004*"i" + 0.004*"day" + 0.004*"u"'),
 (2,
  '0.017*"government" + 0.011*"country" + 0.010*"economic" + 0.008*"covid19" + 0.006*"people" + 0.006*"policy" + 0.006*"also" + 0.006*"bank" + 0.005*"investor" + 0.005*"system"'),
 (3,
  '0.017*"rating" + 0.012*"agency" + 0.008*"index" + 0.007*"–" + 0.007*"sovereign" + 0.006*"fund" + 0.006*"art" + 0.006*"asset" + 0.006*"financial" + 0.005*"market"'),
 (4,
  '0.016*"said" + 0.011*"new" + 0.010*"vaccine" + 0.007*"well" + 0.006*"country" + 0.006*"sabah" + 0.005*"investment" + 0.005*"year" + 0.005*"goh" + 0.004*"travel"'),
 (5,
  '0.009*"china" + 0.009*"american" + 0.008*"datuk" + 0.007*"said" + 0.007*"jakim" + 0.007*"halal" + 0.007*"biden" + 0.007*"americ

In [32]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

In [33]:
lda_corpus = ldamodel[doc_term_matrix]

In [34]:
[doc for doc in lda_corpus]

[[(0, 0.00031753315),
  (1, 0.0003175323),
  (2, 0.0003175326),
  (3, 0.00031753202),
  (4, 0.0003175323),
  (5, 0.000317532),
  (6, 0.9958721),
  (7, 0.00031753207),
  (8, 0.000317532),
  (9, 0.0003175325),
  (10, 0.00031753213),
  (11, 0.0003175322),
  (12, 0.00031753295),
  (13, 0.0003175328)],
 [(0, 0.0005068594),
  (1, 0.00050685735),
  (2, 0.000506859),
  (3, 0.0005068574),
  (4, 0.0005068574),
  (5, 0.0005068569),
  (6, 0.0005068581),
  (7, 0.039790634),
  (8, 0.0005068576),
  (9, 0.22443978),
  (10, 0.000506857),
  (11, 0.0005068569),
  (12, 0.73019415),
  (13, 0.0005068588)],
 [(0, 0.18283363),
  (1, 0.24654749),
  (2, 0.2070478),
  (3, 7.7106255e-05),
  (4, 7.710626e-05),
  (5, 0.36280003),
  (6, 7.710631e-05),
  (7, 7.7106364e-05),
  (8, 7.710615e-05),
  (9, 7.710625e-05),
  (10, 7.710626e-05),
  (11, 7.710621e-05),
  (12, 7.710648e-05),
  (13, 7.71065e-05)],
 [(0, 0.00012872883),
  (1, 0.0001287286),
  (2, 0.00012872867),
  (3, 0.00012872853),
  (4, 0.0001287285),
  (5, 0.0

In [35]:
scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/len(scores)
print(threshold)

0.07142857079236957


In [36]:
cluster1 = [j for i,j in zip(lda_corpus,df.index) if i[0][1] > threshold]
cluster2 = [j for i,j in zip(lda_corpus,df.index) if i[1][1] > threshold]
cluster3 = [j for i,j in zip(lda_corpus,df.index) if i[2][1] > threshold]
cluster4 = [j for i,j in zip(lda_corpus,df.index) if i[3][1] > threshold]
cluster5 = [j for i,j in zip(lda_corpus,df.index) if i[4][1] > threshold]

In [37]:
df.iloc[cluster1]

Unnamed: 0,topic,title,article,date,link,article_clean
2,COLUMNISTS,Biden needs fresh thinking and more than one term,FORGET about the first 100 days. US President ...,"May 3, 2021 @ 12:10am",https://www.nst.com.my/opinion/columnists/2021...,"[forget, first, 100, day, u, president, joe, b..."
7,BUSINESS,Strong sukuk interest shows Malaysia's ability...,KUALA LUMPUR: The strong interest in its inter...,"Apr 27, 2021 @ 12:17pm",https://www.nst.com.my/business/2021/04/685792...,"[kuala, lumpur, strong, interest, internationa..."
19,BUSINESS,Luno Malaysia manages over RM1bil DAUM,"KUALA LUMPUR: Global cryptocurrency company, L...","Apr 14, 2021 @ 11:51am",https://www.nst.com.my/business/2021/04/682203...,"[kuala, lumpur, global, cryptocurrency, compan..."
24,BUSINESS,Sapura Energy seals RM10.3bil debt refinancing,KUALA LUMPUR: Sapura Energy Bhd has secured a ...,"Mar 30, 2021 @ 3:38pm",https://www.nst.com.my/business/2021/03/678169...,"[kuala, lumpur, sapura, energy, bhd, secured, ..."
33,BUSINESS,"IHH, MAHB shares fall after Turkey's lira take...",KUALA LUMPUR: Khazanah Nasional Bhd's listed f...,"Mar 23, 2021 @ 9:57am",https://www.nst.com.my/business/2021/03/676147...,"[kuala, lumpur, khazanah, nasional, bhds, list..."
37,BUSINESS,BIMB Investment targets RM2.5bil AUM by end 2021,"KUALA LUMPUR: BIMB Investment Management Bhd, ...","Mar 22, 2021 @ 12:50pm",https://www.nst.com.my/business/2021/03/675964...,"[kuala, lumpur, bimb, investment, management, ..."
45,NATION,PM's 'Setahun Malaysia Prihatin' keynote address,KUALA LUMPUR: Prime Minister Tan Sri Muhyiddin...,"Mar 1, 2021 @ 5:54pm",https://www.nst.com.my/news/nation/2021/03/670...,"[kuala, lumpur, prime, minister, tan, sri, muh..."
48,BUSINESS,IHH's FY20 net profit falls 47.6pct to RM288.8...,KUALA LUMPUR: IHH Healthcare Bhd's net profit ...,"Feb 26, 2021 @ 8:22pm",https://www.nst.com.my/business/2021/02/669371...,"[kuala, lumpur, ihh, healthcare, bhds, net, pr..."
52,LETTERS,Encourage local food production to lower cost ...,LETTER: We propose that in the interest of red...,"Feb 25, 2021 @ 10:00am",https://www.nst.com.my/opinion/letters/2021/02...,"[letter, propose, interest, reducing, cost, li..."
54,BUSINESS,Ageson's Q2 net profit increases 39pct to RM12...,KUALA LUMPUR: Ageson Bhd posted a net profit o...,"Feb 24, 2021 @ 6:31pm",https://www.nst.com.my/business/2021/02/668668...,"[kuala, lumpur, ageson, bhd, posted, net, prof..."


In [38]:
df.iloc[cluster5]

Unnamed: 0,topic,title,article,date,link,article_clean
41,WORLD,How vaccines became ammunition in global diplo...,PARIS: Covid-19 vaccines are not just coveted ...,"Mar 5, 2021 @ 11:38am",https://www.nst.com.my/world/world/2021/03/671...,"[paris, covid19, vaccine, coveted, protection,..."
55,CRIME & COURTS,RM2.8 million lost to investment fraud syndica...,KUALA LUMPUR: Police arrested nine men on susp...,"Feb 6, 2021 @ 9:01am",https://www.nst.com.my/news/crime-courts/2021/...,"[kuala, lumpur, police, arrested, nine, men, s..."
79,NATION,"Sabah can be a vibrant economic powerhouse, sa...",KOTA KINABALU: Sabah needs to position itself ...,"Jan 2, 2021 @ 7:57pm",https://www.nst.com.my/news/nation/2021/01/653...,"[kota, kinabalu, sabah, need, position, early,..."
88,BUSINESS,"Malaysia's economy to grow 6.0pct in 2021, oil...",KUALA LUMPUR: Malaysia's economy is expected t...,"Nov 30, 2020 @ 5:01pm",https://www.nst.com.my/business/2020/11/645562...,"[kuala, lumpur, malaysia, economy, expected, r..."
