# Implementasi Latent Dirichlet Allocation (LDA)

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# files stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Read File Abstrak

In [None]:
import pandas as pd

# Path of the dataset CSV on the mounted Google Drive.
dataset_path = '/content/drive/MyDrive/ppw/tugas/dataset_pta_infor.csv'

# Load the CSV into a DataFrame and show its (rows, columns) shape.
dataset = pd.read_csv(dataset_path)
dataset.shape

(800, 6)

## Proporsi Topik dalam Dokumen

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Text data: the 'abstrak_cleaned' column of the loaded dataset.
data = dataset['abstrak_cleaned']

# Wrap the text column in its own single-column DataFrame.
dataset_lda = pd.DataFrame(data)

# Replace missing abstracts with empty strings so the vectorizer only sees str values.
dataset_lda['abstrak_cleaned'] = dataset_lda['abstrak_cleaned'].fillna('')

# Convert the texts into a document-term count matrix.
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(dataset_lda['abstrak_cleaned'])

# LDA hyperparameters.
k = 3  # number of topics (n_components)
alpha = 0.1  # passed as doc_topic_prior
beta = 0.2  # passed as topic_word_prior

lda = LatentDirichletAllocation(n_components=k, doc_topic_prior=alpha, topic_word_prior=beta, random_state=42)

# Fit the model and obtain the topic distribution of every document
# (one row per document, one column per topic).
doc_topic_distribution = lda.fit_transform(count_matrix)

# Build the result table: the abstract text followed by one column per topic.
# The 'Abstrak' column is filled in BEFORE the table is written to disk;
# previously the CSV was saved first, so the exported file contained an
# empty 'Abstrak' column.
topic_names = [f"Topik {i+1}" for i in range(k)]
df = pd.DataFrame(doc_topic_distribution, columns=topic_names)
df.insert(0, 'Abstrak', dataset_lda['abstrak_cleaned'].values)

# Save the complete table as a CSV file.
output_csv_file = "topik_in_document.csv"
df.to_csv(output_csv_file, index=False)

# Display the table as the cell output.
df

Unnamed: 0,Abstrak,Topik 1,Topik 2,Topik 3
0,sistem informasi akademik siakad sistem inform...,0.001215,0.997570,0.001215
1,berjalannya koneksi jaringan komput lancar gan...,0.998101,0.000950,0.000950
2,web server perangkat lunak server berfungsi me...,0.000915,0.998170,0.000915
3,penjadwalan kuliah perguruan komplek permasala...,0.001486,0.001486,0.997028
4,seir perkembangan teknolog didunia muncul tekn...,0.001201,0.001201,0.997599
...,...,...,...,...
795,sistem informasi akademik siakad sistem inform...,0.001215,0.997570,0.001215
796,berjalannya koneksi jaringan komput lancar gan...,0.998101,0.000950,0.000950
797,web server perangkat lunak server berfungsi me...,0.000915,0.998170,0.000915
798,penjadwalan kuliah perguruan komplek permasala...,0.001486,0.001486,0.997028


## Proporsi Kata dalam Topik

In [None]:
# Normalize each topic's row of word weights so that it sums to 1,
# turning lda.components_ into per-topic word proportions.
row_totals = lda.components_.sum(axis=1, keepdims=True)
topic_word_distribution = lda.components_ / row_totals

# Table with one row per vocabulary term and one column per topic.
topic_labels = [f"Topik {i+1}" for i in range(k)]
vocabulary = vectorizer.get_feature_names_out()
word_topic_df = pd.DataFrame(
    topic_word_distribution.T,
    index=vocabulary,
    columns=topic_labels,
)

# Save the table as a CSV file (the index, i.e. the terms, is written too).
output_csv_file = "kata_in_topik.csv"
word_topic_df.to_csv(output_csv_file)

# Print a heading, then display the table as the cell output.
print("\nProporsi Kata pada Setiap Topik:")
word_topic_df



Proporsi Kata pada Setiap Topik:


Unnamed: 0,Topik 1,Topik 2,Topik 3
administr,0.056991,0.000007,0.000008
akademik,0.000012,0.015689,0.000008
aks,0.019005,0.000007,0.000008
algoritma,0.000012,0.000007,0.019968
animasi,0.000012,0.000007,0.013315
...,...,...,...
upaya,0.000012,0.000007,0.006662
variabel,0.000012,0.000007,0.006662
view,0.000012,0.005234,0.000008
virtual,0.000012,0.000007,0.013315
