<a href="https://colab.research.google.com/github/alvinsbkt/pendeteksian-topik-covid19-lda/blob/main/Pendeteksian_Topik_Publikasi_Ilmiah_COVID19_LDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Connect to Kaggle and Download Data

Dilakukan supaya tidak perlu mendownload data yang memiliki ukuran besar. Notebook (Google Colab) dihubungkan dengan Kaggle menggunakan Kaggle API

In [None]:
#only needed when the kaggle package or fuse-zip is not installed yet
!pip install kaggle #install the kaggle package
!apt-get install -y fuse-zip #install fuse-zip, used to access the zipped dataset

In [None]:
# Put the Kaggle API credentials into the environment.
# They are never written into the notebook: values already exported in the
# environment are reused, anything missing is asked for interactively (use your own API key).
import os
from getpass import getpass

for _credential_name in ('KAGGLE_USERNAME', 'KAGGLE_KEY'):
    if not os.environ.get(_credential_name):
        os.environ[_credential_name] = getpass('{}: '.format(_credential_name))

In [None]:
!kaggle datasets download -d allen-institute-for-ai/CORD-19-research-challenge #fetch the dataset from Kaggle

In [None]:
input_dir = "/tmp/kaggle-data"
!mkdir {input_dir}
!fuse-zip /content/CORD-19-research-challenge.zip {input_dir} #unzipping file dataset yang sudah diambil dari kaggle

In [None]:
!ls {input_dir} #inspect the contents of the extracted dataset folder

# Data Preprocessing

Pada metode pendeteksian topik, secara garis besar yang akan dilakukan adalah melakukan pra pengolahan terhadap data, membuat dictionary untuk data, pembobotan dengan TF-IDF, dan pemodelan dengan LDA. LDA yang digunakan adalah dengan memanfaatkan module gensim (bukan sklearn)

In [None]:
#only needed when the gensim package is not installed yet
!pip install gensim #install gensim

In [None]:
#import package yang diperlukan untuk keberlansungan pendeteksian topik
import pandas as pd
import multiprocessing
import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
import numpy as np
import tqdm

# Plotting tools
!pip install pyLDAvis #diinstall apabila belum memiliki pyLDavis
import pyLDAvis.gensim
import pickle 
import pyLDAvis
%matplotlib inline

Di bawah ini adalah tahapan dalam melakukan prapengolahan data, detail lebih lengkap tersedia dalam sidenote pada setiap cell

In [None]:
# Load the CORD-19 metadata table from the dataset directory defined earlier (input_dir).
# low_memory=False lets pandas infer each column's dtype from the whole file, which
# avoids the mixed-type DtypeWarning emitted by the default chunked inference.
df = pd.read_csv(os.path.join(input_dir, 'metadata.csv'), low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,Microsoft Academic Paper ID,WHO #Covidence,arxiv_id,has_pdf_parse,has_pmc_xml_parse,full_text_file,url
0,zjufx4fo,b2897e1277f56641193a6db73825f707eed3e4c9,PMC,Sequence requirements for RNA strand transfer ...,10.1093/emboj/20.24.7220,PMC125340,11742998.0,unk,Nidovirus subgenomic mRNAs contain a leader se...,2001-12-17,"Pasternak, Alexander O.; van den Born, Erwin; ...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125340?pdf=re...
1,ymceytj3,e3d0d482ebd9a8ba81c254cc433f314142e72174,PMC,"Crystal structure of murine sCEACAM1a[1,4]: a ...",10.1093/emboj/21.9.2076,PMC125375,11980704.0,unk,CEACAM1 is a member of the carcinoembryonic an...,2002-05-01,"Tan, Kemin; Zelus, Bruce D.; Meijers, Rob; Liu...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc125375?pdf=re...
2,wzj2glte,00b1d99e70f779eb4ede50059db469c65e8c1469,PMC,Synthesis of a novel hepatitis C virus protein...,10.1093/emboj/20.14.3840,PMC125543,11447125.0,no-cc,Hepatitis C virus (HCV) is an important human ...,2001-07-16,"Xu, Zhenming; Choi, Jinah; Yen, T.S.Benedict; ...",EMBO J,,,,True,True,custom_license,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1...
3,2sfqsfm1,cf584e00f637cbd8f1bb35f3f09f5ed07b71aeb0,PMC,Structure of coronavirus main proteinase revea...,10.1093/emboj/cdf327,PMC126080,12093723.0,unk,The key enzyme in coronavirus polyprotein proc...,2002-07-01,"Anand, Kanchan; Palm, Gottfried J.; Mesters, J...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc126080?pdf=re...
4,i0zym7iq,dde02f11923815e6a16a31dd6298c46b109c5dfa,PMC,Discontinuous and non-discontinuous subgenomic...,10.1093/emboj/cdf635,PMC136939,12456663.0,unk,"Arteri-, corona-, toro- and roniviruses are ev...",2002-12-01,"van Vliet, A.L.W.; Smits, S.L.; Rottier, P.J.M...",The EMBO Journal,,,,True,True,custom_license,http://europepmc.org/articles/pmc136939?pdf=re...


In [None]:
# Build a new dataset holding only what is processed further: the abstract text
# and the paper hash, exposed as 'id'.
# Selecting and renaming returns a fresh frame, so nothing is assigned into a slice
# of the original (the cause of pandas' SettingWithCopyWarning).
data_text = df[['abstract', 'sha']].rename(columns={'sha': 'id'})
df = data_text

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Preprocessing: tokenise the text and remove stop words
def preprocess(text):
    """Return the tokens of ``text`` that pass the stop-word and length filters.

    ``text`` is tokenised with ``gensim.utils.simple_preprocess``. A token is
    kept when it has more than three characters and is not a member of
    ``gensim.parsing.preprocessing.STOPWORDS``. Token order is preserved.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    return [
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    ]
            
# Tokenise every abstract; the column is cast to str before preprocess is applied
abstract_text = df['abstract'].astype(str)
processed_docs = abstract_text.map(preprocess)
processed_docs.iloc[:10]

0    [nidovirus, subgenomic, mrnas, contain, leader...
1    [ceacam, member, antigen, family, isoforms, mu...
2    [hepatitis, virus, important, human, pathogen,...
3    [enzyme, coronavirus, polyprotein, processing,...
4    [arteri, corona, toro, roniviruses, evolutiona...
5    [background, rhinovirus, common, cause, upper,...
6    [recent, analyses, human, pathogens, revealed,...
7    [ribosomal, frameshifting, signals, mobile, ge...
8    [army, death, john, bunyan, memorable, phrase,...
9    [intracellular, replication, bacterial, pathog...
Name: abstract, dtype: object

In [None]:
# Build the dictionary of text features, then prune it with the thresholds below
dictionary = corpora.Dictionary(processed_docs)
dictionary.filter_extremes(
    no_below=100,
    no_above=0.7,
    keep_n=50000,
)

In [None]:
# Turn every tokenised document into a bag-of-words vector, then apply TF-IDF weighting
bow_corpus = list(map(dictionary.doc2bow, processed_docs))

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [None]:
#check the number of CPUs so the LDA models can be trained in parallel
multiprocessing.cpu_count()

2

# LDA Hyperparameter Tuning

Setelah data siap untuk di-train, berikutnya dilakukan training model yang sekaligus menjadi hyperparameter tuning. Karena tuning untuk semua kombinasi hyperparameter akan memakan waktu yang sangat lama, dipilih terlebih dahulu kandidat untuk setiap hyperparameter. Caranya adalah dengan melakukan tuning satu per satu sementara hyperparameter yang lain dibuat konstan. Urutan tuning yang dilakukan adalah banyak topik, kemudian alpha, dan terakhir beta.

In [None]:
# Tune the number of topics first, starting with a coarse grid of large values.
# The worker count is derived from the available cores instead of a fixed 10
# (the machine reported only 2 CPUs), and random_state is fixed to the same seed
# the later grid search uses so that a re-run is as repeatable as LdaMulticore allows.
_n_workers = max(1, multiprocessing.cpu_count() - 1)
(lda_model_10, lda_model_25, lda_model_50,
 lda_model_75, lda_model_100) = [
    gensim.models.LdaMulticore(corpus_tfidf, num_topics=topic_count, id2word=dictionary,
                               passes=2, workers=_n_workers, random_state=100)
    for topic_count in (10, 25, 50, 75, 100)
]

  diff = np.log(self.expElogbeta)


In [None]:
# Print the topics obtained by each model
lda_model_array = [lda_model_10, lda_model_25, lda_model_50, lda_model_75, lda_model_100]
num = [10, 25, 50, 75, 100]
rule = "-------------------------------------------------------------------------------"
for topic_count, model in zip(num, lda_model_array):
    print(rule)
    print(rule)
    print('Hasil LDA Model dengan {} topik'.format(topic_count))
    print("__________________________")
    for topic_id, terms in model.print_topics(-1):
        print('Topic{} - Word: {}'.format(topic_id, terms))

-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Hasil LDA Model dengan 10 topik
__________________________
Topic0 - Word: 0.004*"health" + 0.003*"public" + 0.002*"virus" + 0.002*"cells" + 0.002*"sars" + 0.002*"covid" + 0.002*"infection" + 0.002*"disease" + 0.002*"viral" + 0.002*"protein"
Topic1 - Word: 0.007*"unknown" + 0.005*"protein" + 0.005*"cells" + 0.004*"viral" + 0.004*"host" + 0.004*"proteins" + 0.004*"virus" + 0.004*"cell" + 0.003*"viruses" + 0.003*"expression"
Topic2 - Word: 0.004*"sars" + 0.004*"cells" + 0.003*"patients" + 0.003*"virus" + 0.003*"infection" + 0.003*"calves" + 0.003*"respiratory" + 0.003*"viral" + 0.002*"protein" + 0.002*"disease"
Topic3 - Word: 0.004*"viruses" + 0.004*"virus" + 0.003*"viral" + 0.003*"sequence" + 0.003*"sequences" + 0.003*"genome" + 0.003*"gene" + 0.003*"species" + 0.003*"human" + 0.002*"sars"
Topic4 - Word: 0.005*"virus" + 0.004*"pro

In [None]:
# Report the coherence and log-perplexity of every LDA model trained above.
# NOTE: log_perplexity is gensim's per-word likelihood bound (hence the negative
# values), not the perplexity itself.
# NOTE(review): the models were trained on corpus_tfidf but are scored on bow_corpus - confirm this is intended.
rule = "-------------------------------------------------------------------------------"
for topic_count, model in zip(num, lda_model_array):
    print(rule)
    print(rule)
    print('Performa LDA Model dengan {} topik'.format(topic_count))
    print("__________________________")
    print('Perpelexity {0} topik: {1}'.format(topic_count, model.log_perplexity(bow_corpus)))
    coherence_model_lda = CoherenceModel(model=model, texts=processed_docs,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # The colon now precedes the value; the old format printed "... topik <score>: "
    print('Coherence Score {0} topik: {1}'.format(topic_count, coherence_lda))

-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan 10 topik
__________________________
Perpelexity 10 topik: -7.541608975275597
Coherence Score 10 topik 0.49343689514169375: 
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan 25 topik
__________________________
Perpelexity 25 topik: -7.5068978404139255
Coherence Score 25 topik 0.48412213590700076: 
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan 50 topik
__________________________
Perpelexity 50 topik: -7.618957566977694
Coherence Score 50 topik 0.48854182528563095: 
---------------------------------------------------------------------

Pengambilan keputusan dilakukan berdasarkan nilai koherens dan perpleksitas yang diperoleh. Proses training model, print topik, lalu print koherens dan perpleksitas akan terus diulang selama proses tuning.

In [None]:
# Candidate values that the alpha and beta tuning steps iterate over later
_numeric_candidates = np.arange(0.01, 1, 0.3)
# Alpha parameter: the numeric grid plus gensim's two named priors
alpha = [*_numeric_candidates, 'symmetric', 'asymmetric']
# Beta parameter: the numeric grid plus the named symmetric prior
beta = [*_numeric_candidates, 'symmetric']

In [None]:
# Second round of tuning the number of topics, this time with smaller values.
# The worker count follows the available cores instead of a fixed 10, and random_state
# is fixed (same seed as the later grid search) so a re-run is as repeatable as
# LdaMulticore allows.
_n_workers = max(1, multiprocessing.cpu_count() - 1)
(lda_model_2, lda_model_4, lda_model_6,
 lda_model_8, lda_model_16) = [
    gensim.models.LdaMulticore(corpus_tfidf, num_topics=topic_count, id2word=dictionary,
                               passes=2, workers=_n_workers, random_state=100)
    for topic_count in (2, 4, 6, 8, 16)
]

In [None]:
# Report log-perplexity and coherence for the 2/4/6/8/16-topic models trained above.
# NOTE: log_perplexity is gensim's per-word likelihood bound (hence the negative
# values), not the perplexity itself.
from gensim.models import CoherenceModel
num_1 = [2, 4, 6, 8, 16]
lda_model_array_1 = [lda_model_2, lda_model_4, lda_model_6, lda_model_8, lda_model_16]
rule = "-------------------------------------------------------------------------------"
for topic_count, model in zip(num_1, lda_model_array_1):
    print(rule)
    print(rule)
    print('Performa LDA Model dengan {} topik'.format(topic_count))
    print("__________________________")
    print('Perpelexity {0} topik: {1}'.format(topic_count, model.log_perplexity(bow_corpus)))
    coherence_model_lda = CoherenceModel(model=model, texts=processed_docs,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    # The colon now precedes the value; the old format printed "... topik <score>: "
    print('Coherence Score {0} topik: {1}'.format(topic_count, coherence_lda))

-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan 2 topik
__________________________
Perpelexity 2 topik: -7.651705376344558
Coherence Score 2 topik 0.4263114001951781: 
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan 4 topik
__________________________
Perpelexity 4 topik: -7.531503002620636
Coherence Score 4 topik 0.5162192880978432: 
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan 6 topik
__________________________
Perpelexity 6 topik: -7.529272915908492
Coherence Score 6 topik 0.5056502565369726: 
-------------------------------------------------------------------------------
--

In [None]:
# Tune alpha next: train one 8-topic model per alpha candidate defined earlier.
# The worker count follows the available cores instead of a fixed 10, and random_state
# is fixed so that the models differ by alpha only, not by random initialisation.
_n_workers = max(1, multiprocessing.cpu_count() - 1)
(lda_alpha_1, lda_alpha_2, lda_alpha_3,
 lda_alpha_4, lda_alpha_5, lda_alpha_6) = [
    gensim.models.LdaMulticore(corpus_tfidf, num_topics=8, id2word=dictionary, passes=2,
                               workers=_n_workers, random_state=100, alpha=alpha_value)
    for alpha_value in alpha[:6]
]

In [None]:
# Print the topics extracted with each alpha value
lda_model_array_2 = [lda_alpha_1, lda_alpha_2, lda_alpha_3, lda_alpha_4, lda_alpha_5, lda_alpha_6]
rule = "-------------------------------------------------------------------------------"
for alpha_value, model in zip(alpha, lda_model_array_2):
    print(rule)
    print(rule)
    print('Hasil LDA Model dengan nilai alpha {}'.format(str(alpha_value)))
    print("__________________________")
    for topic_id, terms in model.print_topics(-1):
        print('Topic{} - Word: {}'.format(topic_id, terms))

-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Hasil LDA Model dengan nilai alpha 0.01
__________________________
Topic0 - Word: 0.005*"mers" + 0.004*"virus" + 0.003*"sars" + 0.003*"infection" + 0.003*"vaccine" + 0.003*"influenza" + 0.003*"respiratory" + 0.003*"human" + 0.003*"viral" + 0.003*"health"
Topic1 - Word: 0.007*"cells" + 0.005*"cell" + 0.005*"protein" + 0.004*"expression" + 0.004*"virus" + 0.004*"infection" + 0.004*"viral" + 0.004*"mice" + 0.003*"proteins" + 0.003*"immune"
Topic2 - Word: 0.007*"unknown" + 0.005*"samples" + 0.004*"detection" + 0.004*"virus" + 0.004*"assay" + 0.004*"viruses" + 0.004*"respiratory" + 0.004*"influenza" + 0.003*"sars" + 0.003*"viral"
Topic3 - Word: 0.005*"covid" + 0.005*"health" + 0.005*"patients" + 0.004*"sars" + 0.003*"disease" + 0.003*"cases" + 0.003*"care" + 0.003*"public" + 0.003*"risk" + 0.003*"china"
Topic4 - Word: 0.003*"virus" +

In [None]:
# Print the log-perplexity and coherence of the LDA models trained with each alpha value
rule = "-------------------------------------------------------------------------------"
for alpha_value, model in zip(alpha, lda_model_array_2):
    alpha_label = str(alpha_value)
    print(rule)
    print(rule)
    print('Performa LDA Model dengan nilai alpha: {}'.format(alpha_label))
    print("__________________________")
    print('Perpelexity denngan alpha {0}: {1}'.format(alpha_label, model.log_perplexity(bow_corpus)))
    coherence_model_lda = CoherenceModel(model=model, texts=processed_docs,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score dengan alpha {0}: {1} '.format(alpha_label, coherence_lda))

-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan nilai alpha: 0.01
__________________________
Perpelexity denngan alpha 0.01: -7.491861861042008
Coherence Score dengan alpha 0.01: 0.5087731109428992 
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan nilai alpha: 0.31
__________________________
Perpelexity denngan alpha 0.31: -7.527092076292739
Coherence Score dengan alpha 0.31: 0.5365736266430465 
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan nilai alpha: 0.61
__________________________
Perpelexity denngan alpha 0.61: -7.814474159645957
Coherence Score dengan alpha 0.61: 0.3769438

In [None]:
# Finally tune beta (gensim's eta): train one 8-topic model per beta candidate defined earlier.
# The worker count follows the available cores instead of a fixed 10, and random_state
# is fixed so that the models differ by eta only, not by random initialisation.
_n_workers = max(1, multiprocessing.cpu_count() - 1)
(lda_eta_1, lda_eta_2, lda_eta_3,
 lda_eta_4, lda_eta_5) = [
    gensim.models.LdaMulticore(corpus_tfidf, num_topics=8, id2word=dictionary, passes=2,
                               workers=_n_workers, random_state=100, eta=eta_value)
    for eta_value in beta[:5]
]

In [None]:
# Print the topics produced with each beta value
lda_model_array_3 = [lda_eta_1, lda_eta_2, lda_eta_3, lda_eta_4, lda_eta_5]
rule = "-------------------------------------------------------------------------------"
for beta_value, model in zip(beta, lda_model_array_3):
    print(rule)
    print(rule)
    print('Hasil LDA Model dengan nilai beta {}'.format(str(beta_value)))
    print("__________________________")
    for topic_id, terms in model.print_topics(-1):
        print('Topic{} - Word: {}'.format(topic_id, terms))

-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Hasil LDA Model dengan nilai beta 0.01
__________________________
Topic0 - Word: 0.007*"cells" + 0.005*"unknown" + 0.005*"cell" + 0.005*"virus" + 0.004*"protein" + 0.004*"viral" + 0.004*"infection" + 0.004*"host" + 0.004*"proteins" + 0.003*"expression"
Topic1 - Word: 0.005*"virus" + 0.004*"mers" + 0.004*"vaccine" + 0.004*"cells" + 0.004*"protein" + 0.004*"antibodies" + 0.003*"cell" + 0.003*"sars" + 0.003*"antibody" + 0.003*"viral"
Topic2 - Word: 0.007*"patients" + 0.006*"sars" + 0.005*"covid" + 0.004*"respiratory" + 0.003*"infection" + 0.003*"clinical" + 0.003*"cases" + 0.003*"disease" + 0.003*"severe" + 0.003*"coronavirus"
Topic3 - Word: 0.004*"influenza" + 0.004*"sars" + 0.003*"respiratory" + 0.003*"virus" + 0.003*"patients" + 0.003*"viruses" + 0.003*"viral" + 0.003*"compounds" + 0.002*"health" + 0.002*"activity"
Topic4 - Word

In [None]:
# Print the coherence and log-perplexity of the LDA models trained with each beta value
rule = "-------------------------------------------------------------------------------"
for beta_value, model in zip(beta, lda_model_array_3):
    beta_label = str(beta_value)
    print(rule)
    print(rule)
    print('Performa LDA Model dengan nilai beta: {}'.format(beta_label))
    print("__________________________")
    print('Perpelexity dengan beta {0}: {1}'.format(beta_label, model.log_perplexity(bow_corpus)))
    coherence_model_lda = CoherenceModel(model=model, texts=processed_docs,
                                         dictionary=dictionary, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('Coherence Score dengan beta {0}: {1} '.format(beta_label, coherence_lda))

-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan nilai beta: 0.01
__________________________
Perpelexity dengan beta 0.01: -7.519747862984498
Coherence Score dengan beta 0.01: 0.5342067453917485 
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan nilai beta: 0.31
__________________________
Perpelexity dengan beta 0.31: -7.503286625510146
Coherence Score dengan beta 0.31: 0.5213652951602838 
-------------------------------------------------------------------------------
-------------------------------------------------------------------------------
Performa LDA Model dengan nilai beta: 0.61
__________________________
Perpelexity dengan beta 0.61: -7.515945102923734
Coherence Score dengan beta 0.61: 0.49298416773063375 


Setelah tuning untuk ketiga hyperparameter telah selesai dilakukan, selanjutnya dilakukan iterasi dengan kandidat yang sudah diperkecil sesuai dengan hasil yang diperoleh pada bagian-bagian sebelumnya

In [None]:
# Narrowed candidates for the combined search:
#   number of topics: 4-8
#   alpha values: 0.31, symmetric
#   beta values: 0.31, 0.91
topics_range_new = list(range(4, 9))
alpha_new = [0.31, "symmetric"]
beta_new = [0.31, 0.91]

# Result store: one list entry per evaluated combination
model_results = dict(Topics=[], Alpha=[], Beta=[], Coherence=[])

# Train an LDA model for one hyperparameter combination and return its coherence score
def compute_coherence_values(corpus, dictionary, k, a, b):
    """Train an LdaMulticore model and return its c_v coherence.

    Parameters:
        corpus: corpus passed to ``LdaMulticore`` as ``corpus``.
        dictionary: passed as ``id2word`` to the model and as ``dictionary``
            to ``CoherenceModel``.
        k: number of topics.
        a: value for the model's ``alpha``.
        b: value for the model's ``eta``.

    Returns:
        The value of ``CoherenceModel.get_coherence()`` for the trained model.

    The coherence is computed against the notebook-level ``processed_docs``;
    the trained model itself is not returned.
    """
    training_options = dict(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=100,
        chunksize=100,
        passes=10,
        alpha=a,
        eta=b,
    )
    trained_lda = gensim.models.LdaMulticore(**training_options)
    scorer = CoherenceModel(model=trained_lda, texts=processed_docs,
                            dictionary=dictionary, coherence='c_v')
    return scorer.get_coherence()

In [None]:
# Evaluate every combination of the narrowed hyperparameter candidates.
# This search is slow (about 3 hours when the author ran it).
param_grid = [(k, a, b)
              for k in topics_range_new   # number of topics
              for a in alpha_new          # alpha values
              for b in beta_new]          # beta values

# Start from empty result lists so that re-running this cell does not append duplicate rows
for _result_column in model_results.values():
    _result_column.clear()

# total= lets the progress bar show how much of the search remains
with tqdm.tqdm(total=len(param_grid)) as pbar:
    for k, a, b in param_grid:
        # coherence score for the current combination
        cv = compute_coherence_values(corpus_tfidf, dictionary=dictionary,
                                      k=k, a=a, b=b)
        # save the result for this combination
        model_results['Topics'].append(k)
        model_results['Alpha'].append(a)
        model_results['Beta'].append(b)
        model_results['Coherence'].append(cv)

        # Checkpoint after every combination so an interruption does not lose hours of work
        pd.DataFrame(model_results).to_csv('lda_tuning_results.csv', index=False)
        pbar.update(1)




0it [00:00, ?it/s][A[A[A


1it [08:01, 481.44s/it][A[A[A


2it [16:21, 486.94s/it][A[A[A


3it [24:32, 488.37s/it][A[A[A


4it [33:02, 494.67s/it][A[A[A


5it [41:18, 495.12s/it][A[A[A


6it [49:54, 501.39s/it][A[A[A


7it [58:27, 504.93s/it][A[A[A


8it [1:07:08, 509.67s/it][A[A[A


9it [1:15:45, 512.05s/it][A[A[A


10it [1:24:59, 524.63s/it][A[A[A


11it [1:34:02, 530.04s/it][A[A[A


12it [1:43:09, 535.08s/it][A[A[A


13it [1:52:14, 538.19s/it][A[A[A


14it [2:01:59, 552.07s/it][A[A[A


15it [2:11:13, 552.69s/it][A[A[A


16it [2:20:33, 555.00s/it][A[A[A


17it [2:29:11, 543.88s/it][A[A[A


18it [2:38:11, 542.69s/it][A[A[A


19it [2:46:54, 536.83s/it][A[A[A


20it [2:55:49, 527.46s/it]


In [None]:
model_results #show the evaluated combinations and the coherence score obtained for each

{'Alpha': [0.31,
  0.31,
  'symmetric',
  'symmetric',
  0.31,
  0.31,
  'symmetric',
  'symmetric',
  0.31,
  0.31,
  'symmetric',
  'symmetric',
  0.31,
  0.31,
  'symmetric',
  'symmetric',
  0.31,
  0.31,
  'symmetric',
  'symmetric'],
 'Beta': [0.31,
  0.91,
  0.31,
  0.91,
  0.31,
  0.91,
  0.31,
  0.91,
  0.31,
  0.91,
  0.31,
  0.91,
  0.31,
  0.91,
  0.31,
  0.91,
  0.31,
  0.91,
  0.31,
  0.91],
 'Coherence': [0.5710771182339991,
  0.5648370928584929,
  0.5634203760903125,
  0.5604986639609915,
  0.5928831274648146,
  0.5845853696009883,
  0.5773381440186631,
  0.5355393557455885,
  0.5874281790894099,
  0.5864279509384566,
  0.5723629957612412,
  0.5583094039184263,
  0.6119747959583924,
  0.6009969890751607,
  0.5633661289302572,
  0.5892150004026504,
  0.6143501056518786,
  0.627713719808238,
  0.5971921637071687,
  0.5801979321984009],
 'Topics': [4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8]}

In [None]:
# Highest coherence score obtained in the grid search
coherence_scores = model_results['Coherence']
max(coherence_scores)

0.627713719808238

# Final Model + Visualization

Sudah diperoleh kombinasi yang menghasilkan nilai koherens maksimal, sekarang akan diambil kombinasi tersebut kemudian ditrain model lda dan dilakukan visualisasi

In [None]:
# Train the final model with the combination that scored the highest coherence in
# the grid search (8 topics, alpha=0.31, eta=0.91).
# The call mirrors compute_coherence_values exactly: workers is left at its default
# instead of being forced to 10. With workers=10 the retrained model did not
# reproduce the coherence selected by the search, presumably because the worker
# count changes how LdaMulticore batches its updates.
lda_model_tuned = gensim.models.LdaMulticore(corpus=corpus_tfidf,
                                             id2word=dictionary,
                                             num_topics=8,
                                             random_state=100,
                                             chunksize=100,
                                             passes=10,
                                             alpha=0.31,
                                             eta=0.91)

In [None]:
# Print the topics of the tuned model
print("model tuned")
for topic_id, terms in lda_model_tuned.print_topics(-1):
    print('Topic{} - Word: {}'.format(topic_id, terms))

model tuned
Topic0 - Word: 0.007*"samples" + 0.007*"assay" + 0.007*"calves" + 0.007*"strains" + 0.007*"cats" + 0.006*"pedv" + 0.006*"virus" + 0.005*"feline" + 0.005*"detection" + 0.005*"diarrhea"
Topic1 - Word: 0.012*"protein" + 0.008*"sars" + 0.008*"proteins" + 0.006*"binding" + 0.005*"sequence" + 0.005*"fusion" + 0.005*"virus" + 0.004*"domain" + 0.004*"membrane" + 0.004*"structure"
Topic2 - Word: 0.006*"water" + 0.005*"temperature" + 0.004*"concentration" + 0.004*"dans" + 0.004*"airborne" + 0.003*"sont" + 0.003*"method" + 0.003*"abstract" + 0.003*"contamination" + 0.003*"ventilation"
Topic3 - Word: 0.016*"patients" + 0.010*"respiratory" + 0.008*"children" + 0.006*"pneumonia" + 0.006*"clinical" + 0.006*"infections" + 0.005*"influenza" + 0.005*"acute" + 0.005*"cases" + 0.004*"symptoms"
Topic4 - Word: 0.013*"unknown" + 0.009*"antiviral" + 0.008*"activity" + 0.007*"cells" + 0.007*"replication" + 0.006*"cell" + 0.006*"viral" + 0.005*"host" + 0.005*"expression" + 0.005*"virus"
Topic5 - Wor

In [None]:
# Print the log-perplexity and coherence of the tuned model
tuned_log_perplexity = lda_model_tuned.log_perplexity(bow_corpus)
print('Perpelexity: ', tuned_log_perplexity)

from gensim.models import CoherenceModel
coherence_model_lda = CoherenceModel(
    model=lda_model_tuned,
    texts=processed_docs,
    dictionary=dictionary,
    coherence='c_v',
)
coherence_lda = coherence_model_lda.get_coherence()
print('Coherence Score: ', coherence_lda)

Perpelexity:  -7.377439388666868
Coherence Score:  0.5964958612848226


In [None]:
# Topic visualisation
pyLDAvis.enable_notebook()
LDAvis_prepared = pyLDAvis.gensim.prepare(
    topic_model=lda_model_tuned,
    corpus=bow_corpus,
    dictionary=dictionary,
)
LDAvis_prepared