In [1]:
import os
import re
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string, strip_punctuation, strip_numeric
from gensim import corpora
from gensim.models.coherencemodel import CoherenceModel
from gensim.models import LdaModel
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [2]:
def prepare_corpus(dataset):
  """Tokenize the documents and build the gensim dictionary and bag-of-words corpus.

  Args:
    dataset: iterable of document strings; each one is split on whitespace.

  Returns:
    A tuple ``(data, dictionary, doc_term_matrix)`` where ``data`` is the list of
    filtered token lists, ``dictionary`` is the ``corpora.Dictionary`` built from
    them, and ``doc_term_matrix`` holds one ``doc2bow`` result per document.
  """
  # Words treated as too common to keep
  common_words = ['covid', 'corona', 'pandemi']

  # Keep a token only if it is not a common word and is longer than three characters
  data = [
    [token for token in document.split()
     if token not in common_words and len(token) > 3]
    for document in dataset
  ]

  # Term dictionary of the corpus: every unique term is assigned an index
  dictionary = corpora.Dictionary(data)

  # Document-term matrix: each token list converted to bag-of-words with the dictionary
  doc_term_matrix = [dictionary.doc2bow(tokens) for tokens in data]

  return data, dictionary, doc_term_matrix

def compute_coherence_values(dictionary, doc_term_matrix, doc_clean, stop, start=2, step=3):
  """Train one LDA model per candidate topic count and return their c_v coherence scores.

  Args:
    dictionary: id-to-word mapping passed to ``LdaModel`` and ``CoherenceModel``.
    doc_term_matrix: bag-of-words corpus the models are trained on.
    doc_clean: tokenized texts passed to ``CoherenceModel`` as ``texts``.
    stop: exclusive upper bound of the candidate topic counts.
    start: first candidate topic count.
    step: increment between candidate topic counts.

  Returns:
    A list of coherence values, one per value of ``range(start, stop, step)``,
    in that order.
  """
  coherence_values = []
  for num_topics in range(start, stop, step):
    # Train an LDA model for this candidate; it is only needed for scoring,
    # so it is not kept once its coherence has been recorded.
    model = LdaModel(doc_term_matrix, num_topics=num_topics, id2word=dictionary)
    coherencemodel = CoherenceModel(model=model, texts=doc_clean, dictionary=dictionary, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())
  return coherence_values

def best_topic(dataset, start, stop, step):
  """Pick the number of topics with the highest coherence score.

  Args:
    dataset: iterable of document strings, forwarded to ``prepare_corpus``.
    start: first candidate topic count.
    stop: exclusive upper bound of the candidate topic counts.
    step: increment between candidate topic counts.

  Returns:
    A tuple ``(dictionary, doc_term_matrix, number_of_topics)``.

  Raises:
    ValueError: if ``range(start, stop, step)`` yields no candidates.
  """
  # Prepare corpus
  data, dictionary, doc_term_matrix = prepare_corpus(dataset)

  # Compute coherence values, one per candidate in range(start, stop, step)
  coherence_values = compute_coherence_values(dictionary, doc_term_matrix, data, stop, start, step)

  if not coherence_values:
    raise ValueError(
      'no candidate topic counts in range(%r, %r, %r)' % (start, stop, step)
    )

  # Map the position of the best score back to its topic count:
  # the i-th candidate of range(start, stop, step) is start + i * step.
  best_index = coherence_values.index(max(coherence_values))
  number_of_topics = start + best_index * step

  return dictionary, doc_term_matrix, number_of_topics

def create_gensim_lda_model(dataset):
  """Train the final LDA model and extract the top words of every topic.

  The number of topics is chosen by ``best_topic`` among the candidates 2..11.

  Args:
    dataset: iterable of document strings.

  Returns:
    A tuple ``(dictionary, doc_term_matrix, model, topics, number_of_topics)``
    where ``topics`` is a list of word lists, one per topic, in topic-id order.
  """
  start, stop, step = 2, 12, 1
  dictionary, doc_term_matrix, number_of_topics = best_topic(dataset, start, stop, step)

  # Generate the LDA model
  model = LdaModel(doc_term_matrix, num_topics=number_of_topics, id2word=dictionary, update_every=1, chunksize=100, random_state=42, passes=10, alpha='auto')

  # Filters that reduce a formatted topic string (weights, quotes, operators) to bare words
  filters = [lambda x: x.lower(), strip_punctuation, strip_numeric]

  # Request every topic explicitly: show_topics defaults to num_topics=10, which
  # would return only a subset (not in id order) when the model has more topics.
  lda_topics = model.show_topics(num_topics=number_of_topics, num_words=5)

  # topic[1] is the formatted word/weight string of the topic
  topics = [preprocess_string(topic[1], filters) for topic in lda_topics]

  return dictionary, doc_term_matrix, model, topics, number_of_topics

In [8]:
# Load the dataset CSV from GitHub; the first CSV column becomes the index
dataset = pd.read_csv(
  'https://raw.githubusercontent.com/ahmaulana/topic-modelling/main/dataset.csv',
  index_col=0,
)
def remove_punct(text):
    """Replace each character that is not an ASCII letter or underscore with a space.

    The argument is converted with ``str()`` first, so non-string values are accepted.
    Digits and whitespace are replaced as well, not only punctuation.
    """
    return re.sub(r'[^a-zA-Z_]', ' ', str(text))

# Replace characters other than ASCII letters and underscores in the text column
dataset.prepro = dataset.prepro.apply(remove_punct)

# Build the LDA model; the helper also returns the dictionary, the corpus,
# the word list of each topic and the number of topics it selected
(dictionary,
 corpus,
 model,
 topics,
 number_of_topics) = create_gensim_lda_model(dataset.prepro)

# Table pairing each topic id with its word list
topic_lists = pd.DataFrame(
  list(zip(range(number_of_topics), topics)),
  columns=['Topic', 'Words'],
)
topic_lists

Unnamed: 0,Topic,Words
0,0,"[cegah, prokes, disiplin, tular, vaksinasi]"
1,1,"[nyata, metode, swab, protes, kontribusi]"
2,2,"[indonesia, pulih, perintah, masyarakat, ppkm]"
3,3,"[stop, hoaks, tingkat, susu, turun]"
4,4,"[optimis, vaksinasi, aman, halal, tekan]"
5,5,"[sigit, listyo, prabowo, jenderal, vaksinasi]"
6,6,"[jaga, anak, hindar, muka, tatap]"
7,7,"[vaksin, ajar, lawan, hati, tindak]"
8,8,"[masker, dampak, salur, giat, tangan]"


In [4]:
# Save Model
path = 'model'
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# whenever the folder was already there from a previous run
os.makedirs(path, exist_ok=True)
model.save(os.path.join(path, 'topic.model'))