# **Import Library**

In [1]:
pip install --upgrade git+https://github.com/ariaghora/mpstemmer.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting git+https://github.com/ariaghora/mpstemmer.git
  Cloning https://github.com/ariaghora/mpstemmer.git to /tmp/pip-req-build-vutgjydu
  Running command git clone -q https://github.com/ariaghora/mpstemmer.git /tmp/pip-req-build-vutgjydu
Building wheels for collected packages: mpstemmer
  Building wheel for mpstemmer (setup.py) ... [?25l[?25hdone
  Created wheel for mpstemmer: filename=mpstemmer-0.1.0-py3-none-any.whl size=99820 sha256=ff0a40a48720082e4e6de065ea498ad61e29f5d598586e9c193bf524ddee52e5
  Stored in directory: /tmp/pip-ephem-wheel-cache-v3ihgmoi/wheels/5c/f4/b7/9a03c2b80553c1ef45ee7971522137e4cd51db0ac5752f8d8a
Successfully built mpstemmer
Installing collected packages: mpstemmer
Successfully installed mpstemmer-0.1.0


In [None]:
!pip install Levenshtein

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import gensim

import warnings
warnings.filterwarnings('ignore')

# **Import Dataset**

In [None]:
# Shell escape: fetch the file with this id via the gdown command-line tool.
# NOTE(review): presumably this is the .xlsx workbook read in the next cell — confirm.
!gdown --id 1Es0qZKNrhvT_O3xPJC6riqdz8fQazfSg

In [None]:
# Lift the row-display limit so displayed frames are never truncated.
pd.set_option('display.max_rows', None)

# Read the workbook and keep only the five columns of interest, in this order.
selected_columns = [
    'Judul',
    'Label_Akhir',
    'Judul_Casefold',
    'Judul_Relevant',
    'Judul_Clean_Unlisted',
]
df = pd.read_excel('/content/Kel5_Clickbait_Fix.xlsx')[selected_columns]
df.head()

In [None]:
## ==== IMPORTANT ====
# Discard rows whose label is 999, then discard rows with any missing value.
df = df.loc[df['Label_Akhir'].ne(999)].dropna()
# df = df.sample(n=5000)

df.head()

# **Text Preprocessing**

## Tokenization

In [None]:
import nltk

# word_tokenize relies on the Punkt tokenizer data. Recent NLTK releases look
# up the 'punkt_tab' resource instead of the pickled 'punkt' package, so fetch
# both to keep tokenization working regardless of the installed NLTK version.
nltk.download('punkt_tab')
nltk.download('punkt')

In [None]:
from nltk.tokenize import word_tokenize

def word_tokenize_wrapper(text):
  """Tokenize ``text`` by delegating to NLTK's ``word_tokenize`` and return the result."""
  tokens = word_tokenize(text)
  return tokens

# Tokenize every entry of 'Judul_Relevant' and store the result as a new column.
tokenized_titles = df['Judul_Relevant'].apply(word_tokenize_wrapper)
df['Judul_Tokenized'] = tokenized_titles

df.head()

## Stemming

In [None]:
from mpstemmer import MPStemmer

# One stemmer instance, created once and shared by every call to stemming().
stemmer = MPStemmer()

def stemming(words):
  """Return a new list holding ``stemmer.stem(word)`` for each item of ``words``."""
  return list(map(stemmer.stem, words))

# Stem every token list in 'Judul_Tokenized' and store the result as a new column.
stemmed_titles = df['Judul_Tokenized'].apply(stemming)
df['Judul_Stemmed'] = stemmed_titles
df.head()

## Stopwords Removal

In [None]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

In [None]:
# NLTK's Indonesian stopword list, held as a set for membership tests.
list_stopwords = set(stopwords.words('indonesian'))
print(list_stopwords)

In [None]:
def remove_stopwords(words):
  """Return the items of ``words`` that are absent from ``list_stopwords``, in order."""
  kept = []
  for word in words:
    if word in list_stopwords:
      continue
    kept.append(word)
  return kept

# Filter stopwords out of every stemmed token list and store the result as a new column.
cleaned_titles = df['Judul_Stemmed'].apply(remove_stopwords)
df['Judul_Clean'] = cleaned_titles

df.head()

# **Topic Modelling**

## Create Corpus Dictionary

In [None]:
# Build the gensim dictionary from the cleaned token lists.
dictionary = gensim.corpora.Dictionary(df['Judul_Clean'])

# Preview only the first 11 (key, value) entries of the dictionary.
for position, (key, value) in enumerate(dictionary.iteritems()):
    if position > 10:
        break
    print(key, value)

## TF-IDF Vectorization

In [None]:
from gensim import corpora, models
from pprint import pprint

# Convert every cleaned token list to its bag-of-words representation.
bow_corpus = [dictionary.doc2bow(tokens) for tokens in df['Judul_Clean']]

# Fit a TF-IDF model on the bag-of-words corpus, then apply it to that corpus.
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# Show just the first transformed document as a sample.
first_doc = next(iter(corpus_tfidf), None)
if first_doc is not None:
    pprint(first_doc)

## Coherence Values

In [None]:
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array
#function to compute coherence values
def compute_coherence_values(dictionary, corpus, texts, limit, start, step, random_state=None):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Parameters
    ----------
    dictionary : passed to ``LdaModel`` as ``id2word`` and to ``CoherenceModel``.
    corpus : corpus each ``LdaModel`` is trained on.
    texts : tokenized documents handed to ``CoherenceModel``.
    limit, start, step : candidate topic counts are ``range(start, limit, step)``,
        so ``limit`` itself is excluded.
    random_state : optional seed forwarded to every ``LdaModel``. The default
        ``None`` keeps the original unseeded behaviour; pass an int to make the
        trained models (and therefore the coherence curve) reproducible.

    Returns
    -------
    (model_list, coherence_values) : two lists aligned with the candidate topic
        counts — the trained models and their coherence scores.
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics,
                         iterations=100, random_state=random_state)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

# Candidate topic counts: range(start, limit, step) -> 1..10.
start=1
limit=11
step=1
model_list, coherence_values = compute_coherence_values(dictionary, corpus=corpus_tfidf,
                                                        texts=df['Judul_Clean'], start=start, limit=limit, step=step)
#show graphs
import matplotlib.pyplot as plt
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# The labels must be a sequence: ("coherence_values") is just a string, which
# matplotlib iterates character by character and labels the line "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

In [None]:
# Report each candidate topic count next to its coherence score (6 decimals).
for topic_count, score in zip(x, coherence_values):
    rounded_score = round(score, 6)
    print("Num Topics =", topic_count, " has Coherence Value of", rounded_score)

In [None]:
# num_topics is meant to follow whichever topic count scored the highest coherence value.
model = LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=7)

# Print every topic together with its textual word/weight summary.
topic_summaries = model.print_topics(-1)
for topic_id, summary in topic_summaries:
    print('Topic: {} Word: {}'.format(topic_id, summary))

## Apply LDA Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on 'Judul_Clean_Unlisted' and build the document-term matrix.
documents = df['Judul_Clean_Unlisted']
vect = TfidfVectorizer()
vect_text = vect.fit_transform(documents)

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
# Seven-component LDA, online learning, fixed seed, a single pass (max_iter=1).
lda_model = LatentDirichletAllocation(
    n_components=7,
    learning_method='online',
    random_state=42,
    max_iter=1,
)

# Fit on the TF-IDF matrix and keep the per-document topic distribution.
lda_top = lda_model.fit_transform(vect_text)

In [None]:
# Topic mixture of the row at index 1 of lda_top, printed as percentages.
print("Document 1: ")
for topic_idx, weight in enumerate(lda_top[1]):
  percentage = weight*100
  print("Topic ",topic_idx,": ",percentage,"%")

In [None]:
# First step towards listing the highest-weighted words per topic:
# take row 0 of the fitted model's components_ for inspection.
single_topic = lda_model.components_[0]

In [None]:
# Indices that sort single_topic in ascending order, so the largest weights come last.
single_topic.argsort()

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vect, 'get_feature_names_out'):
    vocab = vect.get_feature_names_out()
else:
    vocab = vect.get_feature_names()

# For every topic (one row of components_), print its 20 highest-weighted terms.
for i, comp in enumerate(lda_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:20]
    print('\n')
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0], end=" ")

In [None]:
# Per-document topic distribution from the already-fitted model.
# NOTE(review): likely duplicates lda_top from the fit_transform call — confirm before removing either.
topic_results=lda_model.transform(vect_text)

In [None]:
# Label for each topic index produced by the 7-component LDA model.
topic_map = {0:'Sports & Entertainment',
             1:'Politics & Government',
             2:'Politics & Economics',
             3:'Entertainment & Pop Culture',
             4:'Politics & Social Issues',
             5:'Entertainment & Celebrity News',
             6:'Environmental Issues & Natural Disasters',}

# Dominant topic per row = column index of the largest value in topic_results,
# then translated to its label.
dominant_topic = topic_results.argmax(axis=1)
df['Topic'] = pd.Series(dominant_topic, index=df.index).map(topic_map)

df['Topic'].value_counts()

In [None]:
# Preview the first rows of the frame at this stage of the pipeline.
df.head()

In [None]:
# Keep only the label and the assigned topic.
final_columns = ['Label_Akhir', 'Topic']
df = df.loc[:, final_columns]
df.head()