<h1>Topic Modelling (gensim LDA)</h1>
<p>Topic Modeling is an unsupervised learning approach to clustering documents, to discover topics based on their contents.</p>

In [1]:
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim_models
import gensim.corpora as corpora
import warnings
warnings.filterwarnings("ignore",category=FutureWarning)
warnings.filterwarnings("ignore",category=DeprecationWarning)

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

  from imp import reload
  from distutils.version import LooseVersion
  from scipy.linalg.special_matrices import triu


In [2]:
df = pd.read_csv('../result_clean.csv')
df[['platform','date','body','clean_body','label']]

Unnamed: 0,platform,date,body,clean_body,label
0,Twitter,30-12-2021 18:38:04,adidas Yeezy Boost 350 V2 “Dazzling Blue” 👟\n\...,yeezy boost dazzling blue coming yankeekicks ig,adidas
1,Twitter,30-12-2021 17:43:37,Am over the worst of Omicron. Alhumdullilah. I...,worst omicron alhumdullilah cupcake breakfast ...,adidas
2,Twitter,30-12-2021 06:28:19,🎁 I'm blessing 1 follower with life changing N...,blessing follower life changing nfts hour orig...,adidas
3,Twitter,29-12-2021 22:31:57,South Park and adidas have a new “Professor Ch...,south park professor chaos nmd releasing,adidas
4,Twitter,29-12-2021 19:29:20,"Please patronise me, I sell Adidas shoes for 1...",patronise sell shoe fake walk fast notice,adidas
...,...,...,...,...,...
18130,HardwareZone,20-08-2008 01:52:51,jus came back from hk.\n\n\nAdidas Factory out...,jus hk factory outlet location block hong kon...,adidas
18131,HardwareZone,24-08-2008 15:03:34,Me and hubby like Citygate leh. Polo Ralph Pol...,hubby citygate leh polo ralph polo tee pretty ...,adidas and nike
18132,HardwareZone,11-10-2008 04:19:53,"helpp!\n\nOkay, I am from Toronto & I'm going ...",helpp toronto hong kong december long wanna sh...,nike
18133,HardwareZone,30-10-2008 17:28:01,hihi... sorry all just wanna ask.. so the post...,hihi wanna post guide monkey post addidas shi...,nike


In [3]:
li = [str(body).split() for body in df['clean_body'].tolist()]
id2word = Dictionary(li)
id2word

<gensim.corpora.dictionary.Dictionary at 0x24d187aadd0>

In [4]:
corpus = [id2word.doc2bow(text) for text in li]
corpus[:1]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)]]

In [5]:
[[(id2word[i], freq) for i, freq in doc] for doc in corpus[:1]]

[[('blue', 1),
  ('boost', 1),
  ('coming', 1),
  ('dazzling', 1),
  ('ig', 1),
  ('yankeekicks', 1),
  ('yeezy', 1)]]

In [6]:
lda_model = LdaModel(corpus=corpus,
                   id2word=id2word,
                   num_topics=10, 
                   random_state=0,
                   chunksize=100,
                   alpha='auto',
                   per_word_topics=True)

doc_lda = lda_model[corpus]
for i in lda_model.print_topics():
    print(f"{i[0]}\n{i[1].replace(' ','')}")

0
0.052*"pair"+0.043*"ub"+0.043*"year"+0.019*"model"+0.014*"football"+0.013*"soccer"+0.013*"care"+0.011*"god"+0.011*"share"+0.011*"hope"
1
0.033*"brand"+0.033*"good"+0.020*"la"+0.018*"air"+0.017*"order"+0.017*"sneaker"+0.016*"size"+0.012*"lot"+0.012*"sale"+0.012*"nmd"
2
0.038*"price"+0.038*"company"+0.028*"money"+0.022*"month"+0.021*"puma"+0.020*"sport"+0.019*"boycott"+0.015*"shop"+0.015*"chinese"+0.013*"big"
3
0.149*"shoe"+0.039*"day"+0.028*"customer"+0.016*"service"+0.015*"item"+0.014*"told"+0.014*"nb"+0.013*"sell"+0.012*"comfortable"+0.012*"gagt"
4
0.094*"buy"+0.065*"product"+0.064*"people"+0.024*"guy"+0.021*"life"+0.020*"buying"+0.018*"number"+0.018*"man"+0.015*"monday"+0.014*"market"
5
0.033*"boost"+0.027*"white"+0.020*"le"+0.015*"dun"+0.011*"uma"+0.011*"tnis"+0.010*"wait"+0.010*"yeezy"+0.009*"vous"+0.009*"est"
6
0.034*"lol"+0.034*"feel"+0.033*"click"+0.027*"boot"+0.022*"wearing"+0.015*"marten"+0.014*"hey"+0.014*"ralph"+0.013*"son"+0.013*"human"
7
0.037*"balance"+0.033*"love"+0.03

In [7]:
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis