In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_excel(r'sample_articles.xlsx')

In [3]:
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
import pyLDAvis.gensim_models
from nltk.corpus import stopwords
import string
from nltk.stem.wordnet import WordNetLemmatizer

import warnings
warnings.simplefilter('ignore')
from itertools import chain

In [4]:
#import nltk
#nltk.download('all')

In [5]:
df.head()

Unnamed: 0,topic,title,article,date,link
0,BUSINESS,"MAHB faces double whammy of interstate ban, Tu...",KUALA LUMPUR: Affin Hwang Capital expects Mala...,"May 4, 2021 @ 10:40am",https://www.nst.com.my/business/2021/05/687692...
1,BUSINESS,RM opens at 4.09 against USD,KUALA LUMPUR: The ringgit opened easier agains...,"May 3, 2021 @ 10:09am",https://www.nst.com.my/business/2021/05/687398...
2,COLUMNISTS,Biden needs fresh thinking and more than one term,FORGET about the first 100 days. US President ...,"May 3, 2021 @ 12:10am",https://www.nst.com.my/opinion/columnists/2021...
3,SUNDAY VIBES,MONEY THOUGHTS: Ikigai — The uncommon key to c...,HAVE you noticed how life's getting tougher on...,"May 2, 2021 @ 8:30am",https://www.nst.com.my/lifestyle/sunday-vibes/...
4,BUSINESS,"Labuan FSA appoints new chairman, director gen...",KUALA LUMPUR: Labuan Financial Services Author...,"Apr 30, 2021 @ 4:15pm",https://www.nst.com.my/business/2021/04/686731...


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 107 entries, 0 to 106
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   topic    107 non-null    object
 1   title    107 non-null    object
 2   article  107 non-null    object
 3   date     107 non-null    object
 4   link     107 non-null    object
dtypes: object(5)
memory usage: 4.3+ KB


In [7]:
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(text):
    stop_free = ' '.join([word for word in text.lower().split() if word not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = ' '.join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized.split()

In [8]:
df['article_clean']=df['article'].apply(clean)

In [9]:
dictionary = corpora.Dictionary(df['article_clean'])
print(dictionary.num_nnz)

20580


In [10]:
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df['article_clean'] ]
print(len(doc_term_matrix))

107


In [11]:
lda = gensim.models.ldamodel.LdaModel

In [12]:
num_topics= 14
%time ldamodel = lda(doc_term_matrix,num_topics=num_topics,id2word=dictionary,passes=50,minimum_probability=0)

Wall time: 6.04 s


In [13]:
ldamodel.print_topics(num_topics=num_topics)

[(0,
  '0.015*"food" + 0.010*"u" + 0.007*"said" + 0.007*"market" + 0.005*"wednesday" + 0.005*"currency" + 0.005*"energy" + 0.004*"dollar" + 0.004*"jakim" + 0.004*"halal"'),
 (1,
  '0.014*"per" + 0.014*"cent" + 0.008*"billion" + 0.008*"year" + 0.005*"million" + 0.005*"2020" + 0.005*"budget" + 0.005*"also" + 0.005*"still" + 0.005*"economy"'),
 (2,
  '0.011*"said" + 0.010*"investor" + 0.008*"investment" + 0.006*"the" + 0.006*"stock" + 0.006*"year" + 0.005*"bitcoin" + 0.005*"dollar" + 0.004*"price" + 0.004*"company"'),
 (3,
  '0.009*"business" + 0.008*"would" + 0.007*"sector" + 0.006*"labuan" + 0.005*"term" + 0.005*"wang" + 0.004*"service" + 0.004*"fsa" + 0.004*"short" + 0.004*"bigger"'),
 (4,
  '0.017*"said" + 0.009*"bhd" + 0.007*"vaccine" + 0.006*"would" + 0.006*"country" + 0.005*"also" + 0.005*"bank" + 0.005*"sapura" + 0.004*"phone" + 0.004*"energy"'),
 (5,
  '0.007*"case" + 0.007*"guterres" + 0.006*"u" + 0.005*"yemen" + 0.005*"worst" + 0.005*"would" + 0.005*"could" + 0.005*"said" + 0.0

In [14]:
lda_display = pyLDAvis.gensim_models.prepare(ldamodel, doc_term_matrix, dictionary, sort_topics=False, mds='mmds')
pyLDAvis.display(lda_display)

In [15]:
lda_corpus = ldamodel[doc_term_matrix]

In [16]:
[doc for doc in lda_corpus]

[[(0, 0.00031752972),
  (1, 0.00031753047),
  (2, 0.00031752946),
  (3, 0.0003175293),
  (4, 0.99587214),
  (5, 0.00031752937),
  (6, 0.00031753044),
  (7, 0.00031752887),
  (8, 0.00031752948),
  (9, 0.00031752934),
  (10, 0.00031752998),
  (11, 0.00031753074),
  (12, 0.00031752884),
  (13, 0.0003175294)],
 [(0, 0.29161647),
  (1, 0.00050676614),
  (2, 0.0005067655),
  (3, 0.0005067652),
  (4, 0.00050676597),
  (5, 0.0005067655),
  (6, 0.00050676573),
  (7, 0.0005067651),
  (8, 0.00050676573),
  (9, 0.0005067657),
  (10, 0.0005067654),
  (11, 0.7023023),
  (12, 0.0005067648),
  (13, 0.0005067654)],
 [(0, 7.707002e-05),
  (1, 7.707016e-05),
  (2, 7.7070035e-05),
  (3, 7.706991e-05),
  (4, 7.707004e-05),
  (5, 7.706993e-05),
  (6, 7.7070144e-05),
  (7, 7.706986e-05),
  (8, 7.7070144e-05),
  (9, 0.99899805),
  (10, 7.706995e-05),
  (11, 7.7070275e-05),
  (12, 7.70699e-05),
  (13, 7.707009e-05)],
 [(0, 0.29054785),
  (1, 0.00012876841),
  (2, 0.00012876833),
  (3, 0.00012876822),
  (4, 0.0

In [17]:
scores = list(chain(*[[score for topic_id,score in topic] \
                      for topic in [doc for doc in lda_corpus]]))

threshold = sum(scores)/len(scores)
print(threshold)

0.07142857105592146


In [18]:
cluster1 = [j for i,j in zip(lda_corpus,df.index) if i[0][1] > threshold]
cluster2 = [j for i,j in zip(lda_corpus,df.index) if i[1][1] > threshold]
cluster3 = [j for i,j in zip(lda_corpus,df.index) if i[2][1] > threshold]
cluster4 = [j for i,j in zip(lda_corpus,df.index) if i[3][1] > threshold]
cluster5 = [j for i,j in zip(lda_corpus,df.index) if i[4][1] > threshold]

In [19]:
df.iloc[cluster1]

Unnamed: 0,topic,title,article,date,link,article_clean
1,BUSINESS,RM opens at 4.09 against USD,KUALA LUMPUR: The ringgit opened easier agains...,"May 3, 2021 @ 10:09am",https://www.nst.com.my/business/2021/05/687398...,"[kuala, lumpur, ringgit, opened, easier, u, do..."
3,SUNDAY VIBES,MONEY THOUGHTS: Ikigai — The uncommon key to c...,HAVE you noticed how life's getting tougher on...,"May 2, 2021 @ 8:30am",https://www.nst.com.my/lifestyle/sunday-vibes/...,"[noticed, life, getting, tougher, every, front..."
5,BUSINESS,RM opens at 4.09 against USD,KUALA LUMPUR: The ringgit opened firmer agains...,"Apr 30, 2021 @ 10:43am",https://www.nst.com.my/business/2021/04/686620...,"[kuala, lumpur, ringgit, opened, firmer, u, do..."
14,NATION,Call to reduce food wastage during Ramadan,GEORGE TOWN: The Consumers Association of Pena...,"Apr 22, 2021 @ 5:28pm",https://www.nst.com.my/news/nation/2021/04/684...,"[george, town, consumer, association, penang, ..."
16,BUSINESS,RM opens at 4.12 versus USD,KUALA LUMPUR: The ringgit slipped but managed ...,"Apr 16, 2021 @ 10:32am",https://www.nst.com.my/business/2021/04/682768...,"[kuala, lumpur, ringgit, slipped, managed, hol..."
17,BUSINESS,Strong oil prices fuel RM higher by 50bps vers...,KUALA LUMPUR: The ringgit climbed 50 basis poi...,"Apr 15, 2021 @ 10:46am",https://www.nst.com.my/business/2021/04/682464...,"[kuala, lumpur, ringgit, climbed, 50, basis, p..."
35,BUSINESS,Former Petronas' top executive succeeds Sapura...,KUALA LUMPUR: Sapura Energy Bhd has today anno...,"Mar 22, 2021 @ 7:23pm",https://www.nst.com.my/business/2021/03/676078...,"[kuala, lumpur, sapura, energy, bhd, today, an..."
50,BUSINESS,Ringgit opens lower against US dollar,KUALA LUMPUR: The ringgit extended its downtre...,"Feb 26, 2021 @ 9:41am",https://www.nst.com.my/business/2021/02/669153...,"[kuala, lumpur, ringgit, extended, downtrend, ..."
51,BUSINESS,Ringgit maintains 4.04 level against US dollar...,KUALA LUMPUR: After closing marginally higher ...,"Feb 25, 2021 @ 10:09am",https://www.nst.com.my/business/2021/02/668784...,"[kuala, lumpur, closing, marginally, higher, w..."
58,BUSINESS,RM marginally higher against USD on improving ...,KUALA LUMPUR: The ringgit opened marginally hi...,"Feb 4, 2021 @ 10:27am",https://www.nst.com.my/business/2021/02/662940...,"[kuala, lumpur, ringgit, opened, marginally, h..."


In [20]:
df.iloc[cluster5]

Unnamed: 0,topic,title,article,date,link,article_clean
0,BUSINESS,"MAHB faces double whammy of interstate ban, Tu...",KUALA LUMPUR: Affin Hwang Capital expects Mala...,"May 4, 2021 @ 10:40am",https://www.nst.com.my/business/2021/05/687692...,"[kuala, lumpur, affin, hwang, capital, expects..."
38,BUSINESS,RM opens 120 bps lower amid falling oil prices...,KUALA LUMPUR: The ringgit reversed Thursday's ...,"Mar 19, 2021 @ 10:28am",https://www.nst.com.my/business/2021/03/675153...,"[kuala, lumpur, ringgit, reversed, thursday, g..."
48,BUSINESS,IHH's FY20 net profit falls 47.6pct to RM288.8...,KUALA LUMPUR: IHH Healthcare Bhd's net profit ...,"Feb 26, 2021 @ 8:22pm",https://www.nst.com.my/business/2021/02/669371...,"[kuala, lumpur, ihh, healthcare, bhds, net, pr..."
61,BUSINESS,Ringgit eases against US dollar at close,KUALA LUMPUR: The ringgit closed the first tra...,"Feb 2, 2021 @ 8:16pm",https://www.nst.com.my/business/2021/02/662543...,"[kuala, lumpur, ringgit, closed, first, tradin..."
68,BUSINESS,Ringgit strengthens against all major currencies,KUALA LUMPUR: The ringgit reversed recent loss...,"Jan 26, 2021 @ 10:10am",https://www.nst.com.my/business/2021/01/660574...,"[kuala, lumpur, ringgit, reversed, recent, los..."
71,BUSINESS,Ringgit opens lower against US dollar,KUALA LUMPUR: The ringgit retreated against th...,"Jan 22, 2021 @ 9:59am",https://www.nst.com.my/business/2021/01/659609...,"[kuala, lumpur, ringgit, retreated, u, dollar,..."
76,BUSINESS,"Bitcoin continues rally, breaches US$34,000","BITCOIN, the world's largest cryptocurrency, t...","Jan 4, 2021 @ 12:34pm",https://www.nst.com.my/business/2021/01/654365...,"[bitcoin, world, largest, cryptocurrency, topp..."
103,BUSINESS,RM opens higher against USD,KUALA LUMPUR: The ringgit continued to open hi...,"Nov 17, 2020 @ 10:25am",https://www.nst.com.my/business/2020/11/641722...,"[kuala, lumpur, ringgit, continued, open, high..."
