<h1>Topic Modelling (gensim LDA)</h1>
<p>Topic modelling is an unsupervised learning approach that clusters documents in order to discover the topics they share, based on their contents.</p>

In [1]:
import pandas as pd
import gensim
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)

from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim import corpora, models

In [2]:
# Load the cleaned posts produced by the preprocessing step.
df = pd.read_csv('../result_clean.csv')

# Preview only the columns that matter for topic modelling.
df.loc[:, ['platform', 'date', 'body', 'clean_body', 'label']]

Unnamed: 0,platform,date,body,clean_body,label
0,Twitter,30-12-2021 18:38:04,adidas Yeezy Boost 350 V2 “Dazzling Blue” 👟\n\...,yeezy boost dazzling blue coming yankeekicks ig,adidas
1,Twitter,30-12-2021 17:43:37,Am over the worst of Omicron. Alhumdullilah. I...,worst omicron alhumdullilah cupcake breakfast ...,adidas
2,Twitter,30-12-2021 06:28:19,🎁 I'm blessing 1 follower with life changing N...,blessing follower life changing nfts hour orig...,adidas
3,Twitter,29-12-2021 22:31:57,South Park and adidas have a new “Professor Ch...,south park professor chaos nmd releasing,adidas
4,Twitter,29-12-2021 19:29:20,"Please patronise me, I sell Adidas shoes for 1...",patronise sell shoe fake walk fast notice,adidas
...,...,...,...,...,...
14083,HardwareZone,23-08-2018 13:56:42,All are legit.\nSpoiler,legit spoiler,adidas
14084,HardwareZone,23-08-2018 14:02:18,Tuakong said:\nThose who say END selling fakes...,tuakong selling fake sour grape sarpork,adidas
14085,HardwareZone,23-08-2018 14:03:42,Nakedtoes said:\nMight as well dun said..\n\ny...,nakedtoes dun share s,adidas
14086,HardwareZone,23-08-2018 14:34:13,Prinsloo said:\nAll are legit.\nSpoiler\n\n\n\...,prinsloo legit spoiler wah seh powerlah prinsl...,adidas


In [3]:
# Tokenise each cleaned post on whitespace to get one token list per document.
# Rows with a missing clean_body come back from the CSV as NaN, and str(NaN)
# is the literal "nan", which would otherwise enter the vocabulary as a
# spurious token. Map those rows to an empty document instead; the row order
# (and therefore alignment with df) is preserved.
li = [str(body).split() for body in df['clean_body'].fillna('').tolist()]

# Map every unique token to an integer id for the bag-of-words encoding.
id2word = Dictionary(li)
id2word

<gensim.corpora.dictionary.Dictionary at 0x1da4da08a00>

In [4]:
# Encode every tokenised document as a bag of words: (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, li))

# Show the encoding of the first document only.
corpus[:1]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]

In [5]:
# Human-readable view of the first document: swap each token id for its word.
[
    [(id2word[token_id], count) for token_id, count in bow]
    for bow in corpus[:1]
]

[[('blue', 1),
  ('boost', 1),
  ('coming', 1),
  ('dazzling', 1),
  ('ig', 1),
  ('yankeekicks', 1),
  ('yeezy', 1)]]

In [6]:
# Train a 10-topic LDA model on the bag-of-words corpus. The fixed
# random_state keeps the run repeatable; alpha='auto' lets gensim learn the
# document-topic prior from the data.
lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    random_state=0,
    chunksize=100,
    alpha='auto',
    per_word_topics=True,
)

# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]

# Print each topic id followed by its weighted top terms, with the spaces
# stripped out of gensim's "w*term + w*term" formatting.
for topic_id, terms in lda_model.print_topics():
    print(f"{topic_id}\n{terms.replace(' ','')}")

0
0.080*"brand"+0.077*"product"+0.036*"support"+0.034*"year"+0.020*"xinjiang"+0.016*"guy"+0.015*"share"+0.014*"god"+0.013*"click"+0.012*"social"
1
0.063*"balance"+0.037*"website"+0.026*"color"+0.024*"play"+0.023*"wait"+0.022*"matter"+0.021*"respect"+0.021*"hey"+0.020*"basketball"+0.016*"cut"
2
0.091*"buy"+0.050*"time"+0.027*"price"+0.024*"lol"+0.024*"love"+0.019*"market"+0.019*"shop"+0.015*"walking"+0.014*"lose"+0.012*"place"
3
0.060*"china"+0.050*"pair"+0.032*"boycott"+0.031*"sport"+0.026*"cheap"+0.019*"puma"+0.018*"lot"+0.017*"country"+0.017*"pay"+0.017*"told"
4
0.027*"good"+0.024*"people"+0.024*"la"+0.019*"store"+0.016*"order"+0.012*"size"+0.011*"para"+0.011*"buying"+0.009*"el"+0.009*"cotton"
5
0.039*"running"+0.034*"customer"+0.031*"fake"+0.019*"sell"+0.016*"hope"+0.014*"american"+0.013*"stand"+0.013*"family"+0.013*"gift"+0.012*"feel"
6
0.058*"bought"+0.038*"online"+0.027*"issue"+0.023*"nice"+0.023*"era"+0.021*"woman"+0.019*"shirt"+0.018*"check"+0.017*"una"+0.016*"long"
7
0.025*"bo

In [7]:
# Switch pyLDAvis to inline notebook rendering before building the figure.
pyLDAvis.enable_notebook()

# Build the interactive topic visualisation from the model, corpus and vocabulary.
vis = pyLDAvis.gensim_models.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)
vis