In [42]:
import pandas as pd
import numpy as np
import plotly.express as px
# Route pandas' own `.plot` accessor through Plotly.
# NOTE(review): no `.plot` call is visible in this notebook (figures are built
# with `px` directly), so this setting may be unused here — confirm.
pd.options.plotting.backend = "plotly"
from utils import *
from operator import itemgetter

import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora

import pyLDAvis
import pyLDAvis.gensim_models

In [2]:
# Read data
#
# Load the findings workbook, keep only the columns used further down, and
# rename them to snake_case. The mapping's key order defines the column order.

column_names = {
    "LHP_Tahun": "tahun_lhp",
    "NomorLHP": "nomor_lhp",
    "LHP_Tanggal": "tanggal_lhp",
    "NamaEntitas": "nama_entitas",
    "Jenis_Pemeriksaan": "jenis_pemeriksaan",
    "LHP_Opini": "opini_lhp",
    "JudulTemuan": "judul_temuan",
    "isSignifikan": "is_signifikan",
}

df = (
    pd.read_excel("data/TemuanBPK.xlsx")
    .loc[:, list(column_names)]
    .rename(columns=column_names)
)

df.head()

Unnamed: 0,tahun_lhp,nomor_lhp,tanggal_lhp,nama_entitas,jenis_pemeriksaan,opini_lhp,judul_temuan,is_signifikan
0,2021,04/LHP/XIX.PLU/05/2021,2021-05-19,Pemkab Poso,LK,WTP,Kelemahan Pengendalian dan Penatausahaan Aset ...,0
1,2021,04/LHP/XIX.PLU/05/2021,2021-05-19,Pemkab Poso,LK,WTP,Belanja Perjalanan Dinas Tidak Sesuai Ketentua...,0
2,2021,04/LHP/XIX.PLU/05/2021,2021-05-19,Pemkab Poso,LK,WTP,Kelemahan Pengendalian Belanja Modal pada Dina...,0
3,2021,04/LHP/XIX.PLU/05/2021,2021-05-19,Pemkab Poso,LK,WTP,Kekurangan Volume Pekerjaan dalam Tiga Paket B...,0
4,2021,04/LHP/XIX.PLU/05/2021,2021-05-19,Pemkab Poso,LK,WTP,Penggunaan Langsung atas Penerimaan Pendapatan...,1


In [3]:
# Drop null values
#
# Remove every row that has a missing value in any of the selected columns.
# NOTE(review): the recorded `info()` output lists `is_signifikan` as object
# dtype although the preview shows 0/1 values — consider casting; confirm.

df = df.dropna(axis="index", how="any")

df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 104545 entries, 0 to 104621
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   tahun_lhp          104545 non-null  int64         
 1   nomor_lhp          104545 non-null  object        
 2   tanggal_lhp        104545 non-null  datetime64[ns]
 3   nama_entitas       104545 non-null  object        
 4   jenis_pemeriksaan  104545 non-null  object        
 5   opini_lhp          104545 non-null  object        
 6   judul_temuan       104545 non-null  object        
 7   is_signifikan      104545 non-null  object        
dtypes: datetime64[ns](1), int64(1), object(6)
memory usage: 7.2+ MB


In [4]:
# Audit type per year
#
# Count rows for every (report year, audit type) pair and draw them as bars
# coloured by audit type.

df2 = (
    df.groupby(["tahun_lhp", "jenis_pemeriksaan"])
    .size()
    .reset_index(name="value")
)

fig = px.bar(
    data_frame=df2,
    x="tahun_lhp",
    y="value",
    color="jenis_pemeriksaan",
    title="Jenis Pemeriksaan per Tahun",
)
fig.show()

In [5]:
# LHP opinion per year
#
# Count rows for every (report year, opinion) pair and draw them as grouped
# bars, one colour per opinion value.

df3 = (
    df.groupby(["tahun_lhp", "opini_lhp"])
    .size()
    .reset_index(name="value")
)

fig = px.bar(
    data_frame=df3,
    x="tahun_lhp",
    y="value",
    color="opini_lhp",
    title="Opini LHP per Tahun",
    barmode="group",
    height=600,
)
fig.show()

In [6]:
# Preprocess the finding titles
#
# Force the title column to `str`, then run every title through
# `preprocess_text` (not defined in this notebook; presumably provided by the
# `from utils import *` star import — confirm) and store the result in `clean`.

titles = df["judul_temuan"].astype(str)
df["judul_temuan"] = titles
df["clean"] = titles.apply(preprocess_text)

In [7]:
# Spot-check five random rows to compare `judul_temuan` with its `clean` form.
# NOTE(review): no `random_state` is passed, so the rows shown differ on every run.
df.sample(5)

Unnamed: 0,tahun_lhp,nomor_lhp,tanggal_lhp,nama_entitas,jenis_pemeriksaan,opini_lhp,judul_temuan,is_signifikan,clean
44398,2020,25/LHP/XVIII.BDG/06/2020,2020-07-01,Pemkab Pangandaran,LK,WTP,Penatausahaan Aset Tetap Pemerintah Kabupaten ...,0,penatausahaan aset pemerintah kabupaten pangan...
87732,2021,25/LHP/XIX.PNK/05/2021,2021-05-05,Pemkab Kubu Raya,LK,WTP,Kelebihan Pembayaran atas Kekurangan Volume Du...,0,kelebihan pembayaran kekurangan volume paket p...
45781,2017,92/HP/XIV/11/2018,2018-11-26,UO Kementerian Pertahanan,PDTT,BLANK,Pengadaan Pesawat Terbang Tanpa Awak (PTTA) Pe...,0,pengadaan pesawat terbang awak ptta perbatasan...
102789,2019,88/LHP/XVIII.SBY/12/2019,2019-12-11,Pemkab Kediri,Kinerja,BLANK,Pemerintah Daerah dan Satuan Pendidikan di Kab...,0,pemerintah daerah satuan pendidikan kabupaten ...
68977,2021,16/LHP/XVIII.JMB/5/2021,2021-05-07,Pemkab Kerinci,LK,WTP,Pengenaan Pajak Pertambahan Nilai dan/atau Paj...,0,pengenaan pajak pertambahan nilai pajak pengha...


In [8]:
# Select data with non-empty tokens
#
# `clean` holds a whitespace-separated string (it is later tokenised with
# `.split()`), so `len(x)` would count characters, not words, and a string
# made only of whitespace would pass the filter. Count the whitespace-separated
# tokens instead; rows whose `clean` value is missing compare as False and are
# dropped as well. No temporary column is added to the frame.

has_tokens = df["clean"].str.split().str.len() > 0
df = df[has_tokens]

#### Gensim

In [9]:
# Convert sentences to words

def sent_to_words(sentences):
    """Lazily yield one token list per sentence.

    Each item of ``sentences`` is converted to ``str`` and tokenised with
    gensim's ``simple_preprocess`` using ``deacc=True``.
    """
    yield from (simple_preprocess(str(text), deacc=True) for text in sentences)

# Cleaned titles as a plain Python list, then one token list per title.
data = df["clean"].tolist()
data_words = [tokens for tokens in sent_to_words(data)]

In [10]:
# Build bigram models
# `bigram` is trained on the tokenised titles; `bigram_mod` wraps it in a
# `Phraser`, which is what `make_bigrams` / `make_trigrams` apply to documents.
bigram = gensim.models.Phrases(data_words, min_count=2, threshold=10)
bigram_mod = gensim.models.phrases.Phraser(bigram)

# Build trigram models
# Trained on the bigram-transformed corpus (`bigram[data_words]`).
# NOTE(review): `make_trigrams` is never called in the visible notebook, so the
# trigram model appears to be unused — confirm before keeping this cost.
trigram = gensim.models.Phrases(bigram[data_words], threshold=10)
trigram_mod = gensim.models.phrases.Phraser(trigram)

In [11]:
def make_bigrams(texts):
    """Return a new list with the notebook-level ``bigram_mod`` applied to each document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        merged_docs.append(bigram_mod[tokens])
    return merged_docs

def make_trigrams(texts):
    """Return a new list with ``bigram_mod`` and then ``trigram_mod`` applied to each document in ``texts``."""
    merged_docs = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged_docs.append(trigram_mod[with_bigrams])
    return merged_docs

# Form Bigrams
# Apply the bigram model to every tokenised title; the result is the token
# stream that the dictionary, the corpus and the coherence models are built from.
data_words_bigrams = make_bigrams(data_words)

In [12]:
# Form corpus and dictionary

# tokenised (bigram-merged) documents; also passed as `texts` to the coherence models
texts = data_words_bigrams

# dictionary built from those documents
id2word = corpora.Dictionary(texts)

# bag-of-words corpus: one `doc2bow` result per document
# (no TF-IDF weighting is applied anywhere in this cell)
corpus = list(map(id2word.doc2bow, texts))

In [13]:
# Base LDA Model
#
# Baseline before hyperparameter tuning: 10 topics, fixed `random_state=1`,
# `alpha`/`eta` left at the library defaults.

lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=10,
                                       random_state=1, chunksize=100, passes=10, per_word_topics=True)

In [14]:
# Base Model Coherence Score
#
# c_v coherence of the baseline model, evaluated on the same tokenised
# documents (`texts`) and dictionary that were used to build the corpus.

coherence_model = gensim.models.CoherenceModel(model=lda_model, texts=texts, dictionary=id2word, coherence="c_v")
coherence_lda = coherence_model.get_coherence()

# Printed so the baseline score is kept in the cell output for comparison
# with the tuned models.
print(f"Coherence score: {coherence_lda}")

Coherence score: 0.35544646569597677


In [28]:
# Hyperparameter tuning

def compute_coherence(corpus, dictionary, k, a, b, coherence_texts=None):
    '''
    Train an LDA model for one hyperparameter setting and return its c_v coherence.

    corpus          - bag-of-words corpus the model is trained on
    dictionary      - gensim dictionary, used for training AND for the coherence model
    k               - number of topics
    a               - document-topic density (passed as ``alpha``)
    b               - word-topic density (passed as ``eta``)
    coherence_texts - optional tokenised documents for the coherence model;
                      when omitted, the notebook-level ``texts`` is used

    Returns the coherence value reported by ``CoherenceModel.get_coherence()``.
    '''
    # Fall back to the notebook-level documents so existing calls keep working.
    if coherence_texts is None:
        coherence_texts = texts

    lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=dictionary, num_topics=k,
                                           random_state=1, chunksize=100, passes=10,
                                           alpha=a, eta=b)

    # Use the dictionary that was passed in; the previous version silently used
    # the global `id2word`, which ignored the `dictionary` argument.
    coherence_model = gensim.models.CoherenceModel(model=lda_model, texts=coherence_texts,
                                                   dictionary=dictionary, coherence="c_v")

    return coherence_model.get_coherence()


grid = {"validation": {}}

# topics: candidate topic counts, from min_k up to (but excluding) max_k
min_k, max_k = 5, 11
k_list = range(min_k, max_k, 1)

# alpha: numeric values from np.arange plus gensim's two named priors
alpha_list = [*np.arange(0.1, 1, 0.3), "symmetric", "asymmetric"]

# beta: only the symmetric prior is searched
beta_list = ["symmetric"]

# validation sets: the full corpus is the only set evaluated
num_docs = len(corpus)
corpus_sets = [corpus]
corpus_title = ["100% Corpus"]

# one list per output column; key order defines the CSV column order
model_results = {
    column: []
    for column in ("validation", "topics", "alpha", "beta", "coherence")
}

iteration = 1

for i, corpus_set in enumerate(corpus_sets):
    for k in k_list:
        for a in alpha_list:
            for b in beta_list:
                cv = compute_coherence(corpus=corpus_set, dictionary=id2word, k=k, a=a, b=b)

                # save results: `row` follows the same order as the keys of model_results
                row = (corpus_title[i], k, a, b, cv)
                for column, value in zip(model_results, row):
                    model_results[column].append(value)

                print(f"Iter {iteration}")
                iteration += 1

# persist the whole grid once the search has finished
pd.DataFrame(model_results).to_csv("lda_results.csv", index=False)

Iter 1
Iter 2
Iter 3
Iter 4
Iter 5
Iter 6
Iter 7
Iter 8
Iter 9
Iter 10
Iter 11
Iter 12
Iter 13
Iter 14
Iter 15
Iter 16
Iter 17
Iter 18
Iter 19
Iter 20
Iter 21
Iter 22
Iter 23
Iter 24
Iter 25
Iter 26
Iter 27
Iter 28
Iter 29
Iter 30


#### Best model  

In [30]:
# Reload the saved grid and show every row that reaches the maximum coherence.
results = pd.read_csv("lda_results.csv")
best_coherence = results["coherence"].max()
results.loc[results["coherence"] == best_coherence]

Unnamed: 0,validation,topics,alpha,beta,coherence
12,100% Corpus,7,0.7000000000000001,symmetric,0.423209


In [46]:
# Best model
#
# Retrain with the best-scoring setting from the grid search. The values below
# are typed in by hand from the recorded best row (topics=7,
# alpha=0.7000000000000001, beta=symmetric); they are not read from `results`,
# so they must be updated manually if the grid search is re-run.

k = 7
a = 0.7
b = "symmetric"

best_lda_model = gensim.models.LdaMulticore(corpus=corpus, id2word=id2word, num_topics=k,
                                           random_state=1, chunksize=100, passes=10,
                                           alpha=a, eta=b)

In [49]:
# c_v coherence of the retrained best model. This rebinds `coherence_model`,
# which previously held the baseline model's coherence object.
# NOTE(review): the recorded value (0.4290) differs from the grid-search row
# for the same setting (0.4232); the cause is not visible here — confirm
# whether training is fully deterministic with this configuration.
coherence_model = gensim.models.CoherenceModel(model=best_lda_model, texts=texts, dictionary=id2word, coherence="c_v")
coherence_model.get_coherence()

0.42901275540203787

In [47]:
# Visualise model

# Enable pyLDAvis' notebook integration before preparing the view.
pyLDAvis.enable_notebook()

# Prepare the visualisation from the tuned model, the bag-of-words corpus and
# the dictionary they were built with.
LDAvis = pyLDAvis.gensim_models.prepare(best_lda_model, corpus, id2word)

# Bare last expression so the notebook displays the prepared object.
LDAvis


In a future version of pandas all arguments of DataFrame.drop except for the argument 'labels' will be keyword-only.

  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  other = LooseVersion(other)
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  if LooseVersion(np.__version__) < '1.13':
  other = LooseVersion(other)
  other = LooseVersion(other)
 

In [48]:
# Annotate each sample of data with a topic

def get_topic(text, dictionary, lda_model, tokenize=None):
    """Return the id of the most probable topic for one cleaned document.

    Parameters
    ----------
    text : str
        Cleaned document text.
    dictionary :
        Object exposing ``doc2bow(tokens)`` (the gensim dictionary the model
        was trained with).
    lda_model :
        Object exposing ``get_document_topics(bow)`` that returns
        ``(topic_id, probability)`` pairs.
    tokenize : callable, optional
        Function mapping ``text`` to a list of tokens. When omitted, the text
        is tokenised exactly like the training data in this notebook:
        ``simple_preprocess(..., deacc=True)`` followed by the notebook-level
        ``bigram_mod``. The previous ``text.split()`` never produced the merged
        bigram tokens the dictionary was built from, so topics were inferred
        from a different representation than the one the model was trained on.

    Returns
    -------
    The ``topic_id`` of the pair with the highest probability (the first one
    wins on ties).
    """
    if tokenize is None:
        # Mirror the training pipeline: same tokeniser, then the bigram merge.
        tokens = bigram_mod[simple_preprocess(str(text), deacc=True)]
    else:
        tokens = tokenize(text)

    bow = dictionary.doc2bow(tokens)
    topic = lda_model.get_document_topics(bow)

    return max(topic, key=itemgetter(1))[0]

# Label every row with its dominant topic; the dictionary and the tuned model
# are forwarded to `get_topic` as extra positional arguments.
df["topic"] = df["clean"].apply(get_topic, args=(id2word, best_lda_model))

In [60]:
# Number of rows per topic
#
# Count the rows assigned to each topic and show the shares as a pie chart.

df4 = (
    df.groupby(["topic"])
    .size()
    .reset_index(name="value")
)

fig = px.pie(
    data_frame=df4,
    values="value",
    names="topic",
    title="Banyak Topik",
)
fig.show()

In [67]:
# Topics per year
#
# Count rows per (report year, topic) and draw one line per topic.

df5 = df.groupby(["tahun_lhp", "topic"]).size().reset_index(name="value")

# Every other figure in this notebook carries a title; this one was missing it,
# so it could not be identified when skimming the outputs.
fig = px.line(df5, x="tahun_lhp", y="value", color="topic", markers=True,
              title="Topik per Tahun")
# Treat years as categories so only the years present in the data get a tick.
fig.update_xaxes(type="category")
fig.show()

In [69]:
# Audit type and topic
#
# Count rows per (audit type, topic) and draw grouped bars, one colour per topic.

df6 = df.groupby(["jenis_pemeriksaan", "topic"]).size().reset_index(name="value")

# `topic` holds integer topic ids. Plotly Express maps a numeric colour column
# to a continuous colour scale inside a single trace, so `barmode="group"`
# cannot place the topics side by side. Casting to str makes each topic a
# discrete colour with its own trace and legend entry.
df6["topic"] = df6["topic"].astype(str)

fig = px.bar(df6, x="jenis_pemeriksaan", y="value", color="topic", title="Topik per Jenis Pemeriksaan", barmode="group", height=600)
fig.show()

In [61]:
# LHP opinion and topic
#
# Count rows per (opinion, topic) and draw bars coloured by topic.

df7 = df.groupby(["opini_lhp", "topic"]).size().reset_index(name="value")

# `topic` holds integer topic ids. Plotly Express maps a numeric colour column
# to a continuous colour scale with a colour bar instead of one legend entry
# per topic. Casting to str makes the topics discrete categories.
df7["topic"] = df7["topic"].astype(str)

fig = px.bar(df7, x="opini_lhp", y="value", color="topic", title="Topik per Opini LHP", height=600)
fig.show()