# Tugas 2 PPW : Pre Processing Data PTA Trunojoyo

In [None]:
import numpy as np
import pandas as pd

## Import Data

In [None]:
# Other dataset files from the same repository (disabled):
# df = pd.read_csv('https://raw.githubusercontent.com/Zey21/dataset/main/DataPTAInformatika.csv')
# df = pd.read_csv('https://raw.githubusercontent.com/Zey21/dataset/main/DataPTAInformatikaMini.csv')
# Load the labelled dataset; the file is read with ';' as the field delimiter.
df = pd.read_csv('https://raw.githubusercontent.com/Zey21/dataset/main/DataPTAInformatikaLabel.csv',delimiter=';')
# Keep only the columns whose name does not start with 'Unnamed'.
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df.head()

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi,Label
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",Gerak pekerja ada pada game yang memiliki genr...,Jurusan Teknik Informatika,RPL
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.","Perkembangan game yang semakin pesat, memberik...",Jurusan Teknik Informatika,RPL
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",Sistem pengenalan wajah adalah suatu sistem un...,Jurusan Teknik Informatika,Kecerdasan Komputasional
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",Teknologi mobile game beroperating system open...,Jurusan Teknik Informatika,RPL
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",Kantor Badan Kepegawaian kota Bangkalan adalah...,Jurusan Teknik Informatika,RPL


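`df.columns.str.contains('^Unnamed')` menandai kolom yang namanya diawali `Unnamed` (kolom tanpa label hasil pembacaan CSV). Dengan operator `~`, kolom-kolom tersebut dibuang sehingga hanya kolom yang berlabel yang dipertahankan.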

## Pre Processing Data

### Cek Data yang Kosong

In [None]:
# Count the missing values in each column.
df.isnull().sum()

Judul             6
Nama Penulis      0
Pembimbing I      0
Pembimbing II    12
Abstrak          29
Prodi             5
Label             7
dtype: int64

### Menghapus Data yang Kosong

In [None]:
# Drop every row that has at least one missing value, then renumber the rows
# 0..n-1. Without the reset, df keeps gaps in its index while the matrices
# built from it later are numbered 0..n-1, so row labels would not line up.
# Reassigning (instead of inplace=True) avoids mutating the frame in place.
df = df.dropna().reset_index(drop=True)

### Cleaning Data

#### Membuat Fungsi Cleaning Data
- Tag HTML
- LowerCase Data
- Spasi pada teks
- Tanda baca dan karakter spesial
- Nomor
- Komponen Lainnya

In [None]:
import re, string

# Text Cleaning
def cleaning(text):
    """Normalize one raw abstract into lowercase, space-separated words.

    Steps, in order: remove HTML tags and character entities, lowercase,
    replace ASCII punctuation with spaces, drop the stray character 'â' and
    any remaining non-word symbols, replace digits with spaces, and finally
    collapse runs of whitespace.

    Args:
        text: Raw cell value. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string without leading or trailing whitespace. Missing
        values (``None``, NaN, or text that is exactly ``'nan'``) give ``''``.
    """
    # Missing cells arrive as None/NaN; str() would turn them into real words.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Remove HTML tags and character entities (&amp;, &#123;, &#x1f;).
    text = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', str(text))

    text = text.lower()

    # ASCII punctuation becomes a space so that "a,b" stays two words.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Drop the stray 'â' character and any symbol that is neither a word
    # character nor whitespace (e.g. typographic quotes and dashes).
    text = text.replace('â', '')
    text = re.sub(r'[^\w\s]', '', text)

    # Digits become spaces; then collapse whitespace (including newlines).
    text = re.sub(r'\d', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Only a value that is exactly 'nan' counts as missing. Removing the
    # substring everywhere would damage words such as "penanganan".
    if text == 'nan':
        return ''

    return text

#### Implementasi Fungsi Pada Data Frame Abstrak

In [None]:
# Replace every raw abstract with its cleaned form, then preview the frame.
df['Abstrak'] = df['Abstrak'].apply(cleaning)
df.head()

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi,Label
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",gerak pekerja ada pada game yang memiliki genr...,Jurusan Teknik Informatika,RPL
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.",perkembangan game yang semakin pesat memberika...,Jurusan Teknik Informatika,RPL
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",sistem pengenalan wajah adalah suatu sistem un...,Jurusan Teknik Informatika,Kecerdasan Komputasional
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",teknologi mobile game beroperating system open...,Jurusan Teknik Informatika,RPL
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",kantor badan kepegawaian kota bangkalan adalah...,Jurusan Teknik Informatika,RPL


### Tokenisasi Data
Memisahkan sebuah Dokumen menjadi susunan per kata / term

#### Import Library NLTK

In [None]:
import nltk
from nltk.tokenize import word_tokenize
# Download NLTK's 'popular' resource collection.
# NOTE(review): presumably fetched for the tokenizer data used by word_tokenize;
# confirm which packages are actually required, since the collection is large.
nltk.download('popular')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cmudict.zip.
[nltk_data]    | Downloading package gazetteers to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gazetteers.zip.
[nltk_data]    | Downloading package genesis to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/genesis.zip.
[nltk_data]    | Downloading package gutenberg to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/gutenberg.zip.
[nltk_data]    | Downloading package inaugural to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/inaugural.zip.
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/movie_reviews.zip.
[nltk_data]    | Downloading package names to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/names.zip.
[nltk_data]    | Downloading package shakespeare to /root/nltk_data...
[nlt

True

#### Implementasi Library pada Data

In [None]:
# Split every cleaned abstract into a list of word tokens and preview both columns.
df['abstrak_tokens'] = df['Abstrak'].apply(word_tokenize)
df[["Abstrak", "abstrak_tokens"]].head()

Unnamed: 0,Abstrak,abstrak_tokens
0,gerak pekerja ada pada game yang memiliki genr...,"[gerak, pekerja, ada, pada, game, yang, memili..."
1,perkembangan game yang semakin pesat memberika...,"[perkembangan, game, yang, semakin, pesat, mem..."
2,sistem pengenalan wajah adalah suatu sistem un...,"[sistem, pengenalan, wajah, adalah, suatu, sis..."
3,teknologi mobile game beroperating system open...,"[teknologi, mobile, game, beroperating, system..."
4,kantor badan kepegawaian kota bangkalan adalah...,"[kantor, badan, kepegawaian, kota, bangkalan, ..."


`lambda x` adalah fungsi tanpa nama (anonim). Melalui `apply`, fungsi ini dijalankan pada setiap baris data sesuai dengan operasi yang telah ditentukan.

### Stopword Data
Menghapus kata-kata umum (stopword) dari isi dokumen berdasarkan kamus stopword bahasa Indonesia

#### Import Library NLTK

In [None]:
# Download the NLTK 'stopwords' corpus (a no-op when it is already up to date).
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Implementasi Library pada Data

In [None]:
from nltk.corpus import stopwords
from itertools import chain

# Indonesian stop words, held in a set for fast membership checks.
stop_words = set(stopwords.words('indonesian'))

# Keep only the tokens that are not stop words.
df['abstrak_tokens'] = df['abstrak_tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stop_words]
)

In [None]:
# Rebuild the abstract text from the filtered tokens, separated by single spaces.
df['Abstrak'] = df['abstrak_tokens'].apply(' '.join)

In [None]:
df[["Abstrak", "abstrak_tokens"]].head()

Unnamed: 0,Abstrak,abstrak_tokens
0,gerak pekerja game memiliki genre rts real tim...,"[gerak, pekerja, game, memiliki, genre, rts, r..."
1,perkembangan game pesat alternative peminatnya...,"[perkembangan, game, pesat, alternative, pemin..."
2,sistem pengenalan wajah sistem mengenali ident...,"[sistem, pengenalan, wajah, sistem, mengenali,..."
3,teknologi mobile game beroperating system open...,"[teknologi, mobile, game, beroperating, system..."
4,kantor badan kepegawaian kota bangkalan instan...,"[kantor, badan, kepegawaian, kota, bangkalan, ..."


### Stemming Data
Mengubah kata menjadi bentuk dasar

#### Import Library Sastrawi

In [None]:
!pip install sastrawi

Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


#### Implementasi Library pada Data

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tqdm.auto import tqdm
# Enable the tqdm integration for pandas (provides progress_apply).
tqdm.pandas()

# Create the Sastrawi stemmer through its factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [None]:
# Stem each document as a single string (one stemmer call per abstract) and
# split the result back into tokens. split() without an argument drops empty
# strings, so a document left empty by the earlier steps becomes [] rather
# than [''] (a bogus empty token that would reach the later models).
df['abstrak_tokens'] = df['abstrak_tokens'].progress_apply(lambda x: stemmer.stem(' '.join(x)).split())

  0%|          | 0/818 [00:00<?, ?it/s]

In [None]:
# Rebuild the abstract text from the stemmed tokens, separated by single spaces.
df['Abstrak'] = df['abstrak_tokens'].apply(' '.join)

In [None]:
df

Unnamed: 0,Judul,Nama Penulis,Pembimbing I,Pembimbing II,Abstrak,Prodi,Label,abstrak_tokens
0,Gerak Pekerja Pada Game Real Time Strategy Men...,Adi Chandra Laksono,"Kurniawan Eka P, S.Kom., Msc","Arik Kurniawati, S.Kom., M.T.",gerak kerja game milik genre rts real time str...,Jurusan Teknik Informatika,RPL,"[gerak, kerja, game, milik, genre, rts, real, ..."
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,NURRACHMAT,"Arik Kurniawati, S.Kom., M.T.","Kurniawan Eka Permana, S.Kom., MSc.",kembang game pesat alternative minat bentuk ga...,Jurusan Teknik Informatika,RPL,"[kembang, game, pesat, alternative, minat, ben..."
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,Muhammad Choirur Rozi,"Dr. Arif Muntasa, S.Si.,M.T","Fitri Damayanti, S.Kom.,M.kom",sistem kenal wajah sistem nali identitas wajah...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[sistem, kenal, wajah, sistem, nali, identitas..."
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,M Khoiril Anwar,"Cucun Very Angkoso, S.T., M.T.","Arik Kurniawati S. Kom., M.T.",teknologi mobile game beroperating system open...,Jurusan Teknik Informatika,RPL,"[teknologi, mobile, game, beroperating, system..."
4,Perancangan Sistem Informasi Badan Kepegawaian...,MALIKUL HAMZAH,"Moch. Kautsar Sophan, S.Kom., M.MT.","Yeni Kustiyaningsih, S.Kom., M.Kom.",kantor badan pegawai kota bangkal instansi per...,Jurusan Teknik Informatika,RPL,"[kantor, badan, pegawai, kota, bangkal, instan..."
...,...,...,...,...,...,...,...,...
848,PENERAPAN ALGORITMA LONG-SHORT TERM MEMORY UNT...,Rachmad Agung Pambudi,"Eka Mala Sari Rochman, S.Kom., M.Kom","Sri Herawati, S.Kom., M.Kom",investasi saham milik resiko rugi dikarenakanp...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[investasi, saham, milik, resiko, rugi, dikare..."
849,SISTEM PENCARIAN TEKS AL-QURAN TERJEMAHAN BERB...,Nadila Hidayanti,"Achmad Jauhari, S.T., M.Kom","Ika Oktavia Suzanti, S.Kom., M.Cs",information retrieval ir ambil informasi simpa...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[information, retrieval, ir, ambil, informasi,..."
850,KLASIFIKASI KOMPLEKSITAS VISUAL CITRA SAMPAH M...,Afni Sakinah,"Dr. Indah Agustien Siradjuddin, S.Kom., M.Kom.","Moch. Kautsar Sophan, S.Kom., M.MT.",klasifikasi citra proses kelompok piksel citra...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[klasifikasi, citra, proses, kelompok, piksel,..."
851,IDENTIFIKASI BINER ATRIBUT PEJALAN KAKI MENGGU...,Friska Fatmawatiningrum,"Dr. Indah Agustien Siradjuddin, S.Kom., M.Kom.","Prof. Dr. Arief Muntasa, S.Si., M.MT.",identifikasi atribut pejal kaki salah teliti k...,Jurusan Teknik Informatika,Kecerdasan Komputasional,"[identifikasi, atribut, pejal, kaki, salah, te..."


In [None]:
# Save the pre-processed frame to CSV without the index column.
df.to_csv('DataSteaming.csv', index=False)

In [None]:
# df = pd.read_csv('https://raw.githubusercontent.com/wahyuarilsaputra/dataset/main/DataSteaming.csv')
# df.head()

## Ekstraksi Fitur

In [None]:
df['Abstrak']

0      gerak kerja game milik genre rts real time str...
1      kembang game pesat alternative minat bentuk ga...
2      sistem kenal wajah sistem nali identitas wajah...
3      teknologi mobile game beroperating system open...
4      kantor badan pegawai kota bangkal instansi per...
                             ...                        
848    investasi saham milik resiko rugi dikarenakanp...
849    information retrieval ir ambil informasi simpa...
850    klasifikasi citra proses kelompok piksel citra...
851    identifikasi atribut pejal kaki salah teliti k...
852    topik deteksi objek tarik perhati kembang tekn...
Name: Abstrak, Length: 818, dtype: object

### Term Frekuensi

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()

# Replace missing abstracts with an empty string. The result is assigned back
# because fillna(inplace=True) on a selected column is a chained in-place
# call: recent pandas warns about it, and under Copy-on-Write it leaves df
# unchanged.
df['Abstrak'] = df['Abstrak'].fillna('')

# Document-term matrix of raw term counts (one row per abstract).
X_count = count_vectorizer.fit_transform(df['Abstrak'])

# Wrap the counts in a DataFrame whose columns are the vocabulary terms.
terms_count = count_vectorizer.get_feature_names_out()
df_countvect = pd.DataFrame(data = X_count.toarray(),columns = terms_count)
df_countvect

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
814,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
816,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Total number of occurrences of each term over all documents.
token_counts = df_countvect.sum(axis="index")

# Keep only the terms whose total is not zero.
non_zero_token_counts = token_counts.loc[token_counts.ne(0)]

print("Token Counts yang Tidak Sama dengan 0:")
print(non_zero_token_counts)

Token Counts yang Tidak Sama dengan 0:
aalysis    1
abad       1
abadi      2
abai       1
abdi       3
          ..
zone       3
zoning     4
zoom       3
zucara     1
zungu      1
Length: 6366, dtype: int64


In [None]:
# Save the term-count matrix to CSV without the index column.
df_countvect.to_csv('Data_CountVectorize.csv', index=False)

### One Hot Encoding

In [None]:
# Binary term presence: 1 where a term occurs in the document, 0 otherwise.
# The vectorized comparison replaces DataFrame.applymap, which is deprecated
# in recent pandas and calls a Python lambda once per cell.
df_binary = (df_countvect > 0).astype('int64')
df_binary

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
814,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
815,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
816,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Save the binary term matrix to CSV without the index column.
df_binary.to_csv('Data_OneHotEncoder.csv', index=False)

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a TF-IDF vectorizer on the pre-processed abstracts and transform them
# into a document-term matrix of TF-IDF weights.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(df['Abstrak'].tolist())

# Wrap the weights in a DataFrame whose columns are the vocabulary terms.
terms = vectorizer.get_feature_names_out()
df_tfidfvect = pd.DataFrame(data = X_tfidf.toarray(),columns = terms)
df_tfidfvect

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the TF-IDF matrix to CSV without the index column.
df_tfidfvect.to_csv('Data_TF-IDF.csv', index=False)

### Log Frekuensi

In [None]:
# Log-scaled term frequency: log(1 + count) for every cell. log1p(0) is 0, so
# absent terms stay 0 without a special case, and the vectorized call replaces
# the deprecated, per-cell DataFrame.applymap.
df_log = np.log1p(df_countvect)
df_log

Unnamed: 0,aalysis,abad,abadi,abai,abdi,abdullah,ability,abjad,absah,absensi,...,zara,zat,zcz,zf,zona,zone,zoning,zoom,zucara,zungu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
815,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
816,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Save the log-frequency matrix to CSV without the index column.
df_log.to_csv('Data_LogFrekuensi.csv', index=False)

## Skip Gram Data

In [None]:
import gensim
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
import pandas as pd

# One token list per document; this is the training corpus for Word2Vec.
sentences = df['abstrak_tokens'].tolist()

In [None]:
sentences

[['gerak',
  'kerja',
  'game',
  'milik',
  'genre',
  'rts',
  'real',
  'time',
  'strategy',
  'gerak',
  'kerja',
  'milik',
  'gerak',
  'butuh',
  'dekat',
  'konsep',
  'ai',
  'desain',
  'perilaku',
  'kerja',
  'perilaku',
  'karakter',
  'tambah',
  'ai',
  'artifical',
  'intelegent',
  'perilaku',
  'hidup',
  'realistis',
  'teliti',
  'ai',
  'finite',
  'state',
  'machine',
  'finite',
  'state',
  'machine',
  'tentu',
  'gerak',
  'kerja',
  'parameter',
  'parameter',
  'dasar',
  'gerak',
  'simulasi',
  'game',
  'rts',
  'game',
  'engine',
  'hasil',
  'oleh',
  'teliti',
  'terap',
  'metode',
  'finite',
  'state',
  'machine',
  'tentu',
  'gerak',
  'kerja',
  'dasar',
  'parameter',
  'harta',
  'prajurit',
  'kondisi',
  'bangu',
  'stockpile',
  'resources',
  'bawa',
  'kunci',
  'game',
  'real',
  'time',
  'strategy',
  'gerak',
  'kerja',
  'finite',
  'state',
  'machine'],
 ['kembang',
  'game',
  'pesat',
  'alternative',
  'minat',
  'bentuk',
 

In [None]:
# Train Word2Vec on the token lists: skip-gram (sg=1), 100-dimensional vectors,
# a context window of 1 and a single training epoch.
# NOTE(review): results may differ between runs — confirm whether
# reproducibility is needed here.
model = Word2Vec(sentences, vector_size=100, window=1, sg=1, epochs=1)

In [None]:
# Look up the words most similar to 'hasil' in the trained embedding.
word = 'hasil'
similar_words = model.wv.most_similar(word)

# Disabled: print every similar word with its similarity score.
# print(f"Kata yang mirip dengan '{word}':")
# for w, sim in similar_words:
#     print(f"{w}: {sim:.4f}")

In [None]:
import gensim
from gensim.models import Word2Vec
import pandas as pd

sentences = df['abstrak_tokens'].tolist()

# Retrain the skip-gram model so this cell can run on its own.
model = Word2Vec(sentences, vector_size=100, window=1, sg=1, epochs=1)

word = "gerak"
context_words = []

if word in model.wv:
    # Query by key rather than by raw vector: a vector query ranks the word
    # itself first (similarity 1.0), wasting one of the three result slots,
    # whereas a key query leaves the query word out of the results.
    similar_words = model.wv.most_similar(word, topn=3)
    context_words = [w for w, _ in similar_words]

print(f"Kata-kata dalam konteks window=1 untuk '{word}':")
for w in context_words:
    print(w)


Kata-kata dalam konteks window=1 untuk 'gerak':
gerak
bas
main


In [None]:
import gensim
from gensim.models import Word2Vec

# Retrain the skip-gram model, then compare two words by cosine similarity.
model = Word2Vec(sentences, vector_size=100, window=1, sg=1, epochs=1)

word1 = "gerak"
# NOTE(review): "bs" is reported as missing from the model in the recorded
# output; possibly a typo for another token — confirm the intended word.
word2 = "bs"

# Only compute the similarity when both words are in the model's vocabulary.
if word1 in model.wv and word2 in model.wv:
    vector1 = model.wv[word1]
    vector2 = model.wv[word2]
    # cosine_similarities returns one value per vector in the list; take the only one.
    similarity = model.wv.cosine_similarities(vector1, [vector2])[0]

    print(f"Kesamaan kosakata antara '{word1}' dan '{word2}': {similarity:.4f}")
else:
    print("Salah satu atau kedua kata tidak ada dalam model.")


Salah satu atau kedua kata tidak ada dalam model.


# Tugas 3 LDA Modeling

## LDA Modeling

In [None]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD

## Modeling Data
- k=3
- alpha=0.1
- beta=0.2

In [None]:
# Fit a 3-topic LDA model on the term-count matrix, with document-topic prior
# 0.1 and topic-word prior 0.2; random_state=42 fixes the random seed.
lda_model = LatentDirichletAllocation(n_components=3, doc_topic_prior=0.1, topic_word_prior=0.2, random_state=42)
lda_model.fit(df_countvect)

## proporsi topik pada dokumen

In [None]:
# Topic mixture of every document: one row per document, one column per topic.
doc_topic_proportions = lda_model.transform(df_countvect)

# Print the proportions document by document, numbering both from 1.
n_documents = len(df['Abstrak'])
for doc_idx in range(n_documents):
    print(f"Dokumen {doc_idx + 1}:")
    for topic_number, topic_prob in enumerate(doc_topic_proportions[doc_idx], start=1):
        print(f"Topik {topic_number}: {topic_prob:.4f}")
    print()


Dokumen 1:
Topik 1: 0.9975
Topik 2: 0.0013
Topik 3: 0.0013

Dokumen 2:
Topik 1: 0.9978
Topik 2: 0.0011
Topik 3: 0.0011

Dokumen 3:
Topik 1: 0.0009
Topik 2: 0.0009
Topik 3: 0.9983

Dokumen 4:
Topik 1: 0.5392
Topik 2: 0.3484
Topik 3: 0.1124

Dokumen 5:
Topik 1: 0.6607
Topik 2: 0.0009
Topik 3: 0.3384

Dokumen 6:
Topik 1: 0.7362
Topik 2: 0.0009
Topik 3: 0.2629

Dokumen 7:
Topik 1: 0.9982
Topik 2: 0.0009
Topik 3: 0.0009

Dokumen 8:
Topik 1: 0.3559
Topik 2: 0.1195
Topik 3: 0.5246

Dokumen 9:
Topik 1: 0.1202
Topik 2: 0.7713
Topik 3: 0.1085

Dokumen 10:
Topik 1: 0.3282
Topik 2: 0.5134
Topik 3: 0.1584

Dokumen 11:
Topik 1: 0.0923
Topik 2: 0.7871
Topik 3: 0.1207

Dokumen 12:
Topik 1: 0.0008
Topik 2: 0.0008
Topik 3: 0.9984

Dokumen 13:
Topik 1: 0.0009
Topik 2: 0.0009
Topik 3: 0.9982

Dokumen 14:
Topik 1: 0.0008
Topik 2: 0.0526
Topik 3: 0.9467

Dokumen 15:
Topik 1: 0.9984
Topik 2: 0.0008
Topik 3: 0.0008

Dokumen 16:
Topik 1: 0.0009
Topik 2: 0.9982
Topik 3: 0.0009

Dokumen 17:
Topik 1: 0.0012
Topik

In [None]:
# One row per topic, one weight per vocabulary term.
topic_word_distributions = lda_model.components_

# The model was fitted on df_countvect, so the term for each column index must
# come from that matrix. Taking the names from the separately fitted TF-IDF
# vectorizer is only correct as long as both vocabularies happen to match.
feature_names = df_countvect.columns.to_numpy()
for topic_idx, topic in enumerate(topic_word_distributions):
    top_words_idx = topic.argsort()[::-1][:10]  # indices of the 10 highest-weighted terms
    top_words = [feature_names[i] for i in top_words_idx]
    print(f"Topik {topic_idx+1}:")
    print(", ".join(top_words))
    print()


Topik 1:
sistem, ajar, hasil, aplikasi, nilai, metode, informasi, data, game, proses

Topik 2:
hasil, data, metode, nilai, teliti, proses, sistem, tingkat, akurasi, uji

Topik 3:
citra, metode, hasil, sistem, sakit, teliti, data, akurasi, proses, uji



In [None]:
lda_model

In [None]:
import gensim
from gensim import corpora
from gensim.models import LdaModel
import pandas as pd
import matplotlib.pyplot as plt

# Ubah teks ke dalam format yang cocok untuk Gensim
# Tokenized documents in the form gensim expects (one list of tokens each).
documents = df['abstrak_tokens']

# Dictionary mapping every unique token to an integer id.
dictionary = corpora.Dictionary(documents)

# Bag-of-words corpus: each document as a list of (token_id, count) pairs.
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Train a 5-topic LDA model. random_state is fixed so the topics are
# reproducible between runs, matching the seeded scikit-learn model above.
lda_model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=30, random_state=42)

In [None]:
# Build a table of topic proportions: one row per document, one column per topic.
column_names = [f"Topik {i + 1}" for i in range(lda_model.num_topics)]

# Collect one {column name: probability} record per document and create the
# frame once. Growing a DataFrame with pd.concat inside the loop copies all
# previous rows on every iteration (quadratic time).
topic_rows = []
for doc in corpus:
    topic_distribution = lda_model.get_document_topics(doc, minimum_probability=0)
    topic_rows.append({f"Topik {topic_id + 1}": prob for topic_id, prob in topic_distribution})

# Columns are matched by name, so a topic missing from a record becomes NaN
# instead of shifting the remaining values under the wrong label.
document_topic_df = pd.DataFrame(topic_rows, columns=column_names)

# Show the table of topic proportions per document.
print("Tabel Proporsi Topik dalam Dokumen:")
document_topic_df

Tabel Proporsi Topik dalam Dokumen:


Unnamed: 0,Topik 1,Topik 2,Topik 3,Topik 4,Topik 5
0,0.989808,0.002584,0.002533,0.002529,0.002546
1,0.339681,0.002291,0.549408,0.002253,0.106366
2,0.001738,0.001727,0.001739,0.993056,0.001741
3,0.989627,0.002583,0.002589,0.002589,0.002612
4,0.001873,0.001865,0.001859,0.001860,0.992542
...,...,...,...,...,...
813,0.001741,0.091434,0.001727,0.001757,0.903341
814,0.918613,0.002366,0.002398,0.074244,0.002378
815,0.001241,0.001243,0.001259,0.995011,0.001245
816,0.001583,0.001587,0.001581,0.993665,0.001584


In [None]:
# Table of the 10 highest-ranked words of every topic: one column per topic,
# one row per rank. show_topic yields (word, weight) pairs; only the word is kept.
topic_word_df = pd.DataFrame({
    f"Topik {topic_id + 1}": [word for word, _ in lda_model.show_topic(topic_id, topn=10)]
    for topic_id in range(lda_model.num_topics)
})

# Show the table of top words per topic.
print("\nTabel Proporsi Kata dalam Topik:")
topic_word_df


Tabel Proporsi Kata dalam Topik:


Unnamed: 0,Topik 1,Topik 2,Topik 3,Topik 4,Topik 5
0,ajar,sistem,data,citra,sistem
1,game,hasil,sistem,metode,informasi
2,hasil,metode,nilai,hasil,aplikasi
3,main,putus,metode,teliti,data
4,sistem,nilai,hasil,fitur,hasil
5,nilai,kerja,siswa,proses,metode
6,metode,kriteria,uji,akurasi,proses
7,aplikasi,usaha,mahasiswa,klasifikasi,tingkat
8,teliti,teliti,proses,data,butuh
9,media,dukung,informasi,kenal,kembang


In [None]:
# Number of rows in the topic-word table.
len_df = topic_word_df.shape[0]

In [None]:
df["Judul"][:10]

0    Gerak Pekerja Pada Game Real Time Strategy Men...
1    RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...
2    EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...
3    IMPLEMENTASI  ALGORITMA PRIM  DAN DEPTH FIRST ...
4    Perancangan Sistem Informasi Badan Kepegawaian...
5    PEMANFAATAN TOGAF ADM UNTUK PERANCANGAN SISTEM...
6    APLIKASI METODE FUZZY ANALYTIC NETWORK PROCESS...
7    SISTEM PENDUKUNG KEPUTUSAN REKOMENDASI MENU DI...
8    RANCANG BANGUN APLIKASI PEMILIHAN TEKNIK REKAY...
9    DETEKSI COREPOINT SIDIK JARI MENGGUNAKAN METOD...
Name: Judul, dtype: object

In [None]:
# First 10 document titles as a NumPy array.
# NOTE(review): 10 presumably mirrors the 10 rows of topic_word_df, but those
# rows are word ranks rather than documents — confirm this pairing is intended.
np_judul = np.array(df["Judul"][:10])

In [None]:
# The same titles as a plain Python list.
list_judul = np_judul.tolist()

In [None]:
# The same titles as a single-column DataFrame.
pd_judul = pd.DataFrame(np_judul)

In [None]:
# Name the only column of the titles frame.
pd_judul.columns = ["Judul"]

In [None]:
# Topic-word table as a 2-D NumPy array (rows = word rank, columns = topic).
np_topic = np.array(topic_word_df)

In [None]:
# The same table as a list of rows (each row is a list of words).
list_topic = np_topic.tolist()

In [None]:
list_topic

[['ajar', 'sistem', 'data', 'citra', 'sistem'],
 ['game', 'hasil', 'sistem', 'metode', 'informasi'],
 ['hasil', 'metode', 'nilai', 'hasil', 'aplikasi'],
 ['main', 'putus', 'metode', 'teliti', 'data'],
 ['sistem', 'nilai', 'hasil', 'fitur', 'hasil'],
 ['nilai', 'kerja', 'siswa', 'proses', 'metode'],
 ['metode', 'kriteria', 'uji', 'akurasi', 'proses'],
 ['aplikasi', 'usaha', 'mahasiswa', 'klasifikasi', 'tingkat'],
 ['teliti', 'teliti', 'proses', 'data', 'butuh'],
 ['media', 'dukung', 'informasi', 'kenal', 'kembang']]

In [None]:
list_judul

['Gerak Pekerja Pada Game Real Time Strategy Menggunakan Finite State Machine',
 'RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MENGGUNAKAN METODE FUZZY LOGIC',
 'EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEAR DISCRIMINANT ANALYSIS  UNTUK PENGENALAN WAJAH',
 'IMPLEMENTASI  ALGORITMA PRIM  DAN DEPTH FIRST SEARCH PADA PEMBUATAN MAZE GAME BERBASIS ANDROID OS MOBILE',
 'Perancangan Sistem Informasi Badan Kepegawaian Daerah ( BKD ) Bangkalan Sebagai Sub sistem dari E-Government Bangkalan Menggunakan TOGAF ADM ',
 'PEMANFAATAN TOGAF ADM UNTUK PERANCANGAN SISTEM INFORMASI DINAS PERINDUSTRIAN & PERDAGANGAN SEBAGAI SUB SISTEM ARSITEKTUR E-GOVERNMENT KABUPATEN BANGKALAN',
 'APLIKASI METODE FUZZY ANALYTIC NETWORK PROCESS (FANP) UNTUK MENDUKUNG KEPUTUSAN PROSES PROMOSI JABATAN DI PT. SURYA MADISTRINDO PAMEKASAN',
 'SISTEM PENDUKUNG KEPUTUSAN REKOMENDASI MENU DIET BAGI PASIEN RAWAT INAP MENGGUNAKAN METODE HARRIS BENEDICT DAN EUCLIDEAN (Studi Kasus : RSUD Dr. H. Moh. Anwar Sumenep)',
 'RANCANG BANGUN 

In [None]:
print(len(list_judul))

10


In [None]:
# Put the i-th title in front of the i-th row of topic words. The rows are
# rebuilt from np_topic on every run instead of inserting into list_topic in
# place, so re-running this cell does not add the titles a second time.
list_topic = []
for i, topic_words in enumerate(np_topic.tolist()):
    print(topic_words)
    list_topic.append([list_judul[i], *topic_words])

['ajar', 'sistem', 'data', 'citra', 'sistem']
['game', 'hasil', 'sistem', 'metode', 'informasi']
['hasil', 'metode', 'nilai', 'hasil', 'aplikasi']
['main', 'putus', 'metode', 'teliti', 'data']
['sistem', 'nilai', 'hasil', 'fitur', 'hasil']
['nilai', 'kerja', 'siswa', 'proses', 'metode']
['metode', 'kriteria', 'uji', 'akurasi', 'proses']
['aplikasi', 'usaha', 'mahasiswa', 'klasifikasi', 'tingkat']
['teliti', 'teliti', 'proses', 'data', 'butuh']
['media', 'dukung', 'informasi', 'kenal', 'kembang']


In [None]:
list_topic

[['Gerak Pekerja Pada Game Real Time Strategy Menggunakan Finite State Machine',
  'ajar',
  'sistem',
  'data',
  'citra',
  'sistem'],
 ['RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MENGGUNAKAN METODE FUZZY LOGIC',
  'game',
  'hasil',
  'sistem',
  'metode',
  'informasi'],
 ['EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEAR DISCRIMINANT ANALYSIS  UNTUK PENGENALAN WAJAH',
  'hasil',
  'metode',
  'nilai',
  'hasil',
  'aplikasi'],
 ['IMPLEMENTASI  ALGORITMA PRIM  DAN DEPTH FIRST SEARCH PADA PEMBUATAN MAZE GAME BERBASIS ANDROID OS MOBILE',
  'main',
  'putus',
  'metode',
  'teliti',
  'data'],
 ['Perancangan Sistem Informasi Badan Kepegawaian Daerah ( BKD ) Bangkalan Sebagai Sub sistem dari E-Government Bangkalan Menggunakan TOGAF ADM ',
  'sistem',
  'nilai',
  'hasil',
  'fitur',
  'hasil'],
 ['PEMANFAATAN TOGAF ADM UNTUK PERANCANGAN SISTEM INFORMASI DINAS PERINDUSTRIAN & PERDAGANGAN SEBAGAI SUB SISTEM ARSITEKTUR E-GOVERNMENT KABUPATEN BANGKALAN',
  'nilai',
  'kerja',
  'siswa',
  

In [None]:
# Rows of [title, word, word, ...] as a 2-D NumPy array.
np_new_topic_word = np.array(list_topic)

In [None]:
np_new_topic_word

array([['Gerak Pekerja Pada Game Real Time Strategy Menggunakan Finite State Machine',
        'ajar', 'sistem', 'data', 'citra', 'sistem'],
       ['RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MENGGUNAKAN METODE FUZZY LOGIC',
        'game', 'hasil', 'sistem', 'metode', 'informasi'],
       ['EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEAR DISCRIMINANT ANALYSIS  UNTUK PENGENALAN WAJAH',
        'hasil', 'metode', 'nilai', 'hasil', 'aplikasi'],
       ['IMPLEMENTASI  ALGORITMA PRIM  DAN DEPTH FIRST SEARCH PADA PEMBUATAN MAZE GAME BERBASIS ANDROID OS MOBILE',
        'main', 'putus', 'metode', 'teliti', 'data'],
       ['Perancangan Sistem Informasi Badan Kepegawaian Daerah ( BKD ) Bangkalan Sebagai Sub sistem dari E-Government Bangkalan Menggunakan TOGAF ADM ',
        'sistem', 'nilai', 'hasil', 'fitur', 'hasil'],
       ['PEMANFAATAN TOGAF ADM UNTUK PERANCANGAN SISTEM INFORMASI DINAS PERINDUSTRIAN & PERDAGANGAN SEBAGAI SUB SISTEM ARSITEKTUR E-GOVERNMENT KABUPATEN BANGKALAN',
        '

In [None]:
# Wrap the combined array in a DataFrame (columns are numbered by default).
new_topic_word = pd.DataFrame(np_new_topic_word)

In [None]:
new_topic_word

Unnamed: 0,0,1,2,3,4,5
0,Gerak Pekerja Pada Game Real Time Strategy Men...,ajar,sistem,data,citra,sistem
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,game,hasil,sistem,metode,informasi
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,hasil,metode,nilai,hasil,aplikasi
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,main,putus,metode,teliti,data
4,Perancangan Sistem Informasi Badan Kepegawaian...,sistem,nilai,hasil,fitur,hasil
5,PEMANFAATAN TOGAF ADM UNTUK PERANCANGAN SISTEM...,nilai,kerja,siswa,proses,metode
6,APLIKASI METODE FUZZY ANALYTIC NETWORK PROCESS...,metode,kriteria,uji,akurasi,proses
7,SISTEM PENDUKUNG KEPUTUSAN REKOMENDASI MENU DI...,aplikasi,usaha,mahasiswa,klasifikasi,tingkat
8,RANCANG BANGUN APLIKASI PEMILIHAN TEKNIK REKAY...,teliti,teliti,proses,data,butuh
9,DETEKSI COREPOINT SIDIK JARI MENGGUNAKAN METOD...,media,dukung,informasi,kenal,kembang


In [None]:
# Label the columns: the title first, then one column per topic.
new_topic_word.columns = ["Judul", "Topik 1", "Topik 2", "Topik 3", "Topik 4", "Topik 5"]

In [None]:
new_topic_word

Unnamed: 0,Judul,Topik 1,Topik 2,Topik 3,Topik 4,Topik 5
0,Gerak Pekerja Pada Game Real Time Strategy Men...,ajar,sistem,data,citra,sistem
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,game,hasil,sistem,metode,informasi
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,hasil,metode,nilai,hasil,aplikasi
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,main,putus,metode,teliti,data
4,Perancangan Sistem Informasi Badan Kepegawaian...,sistem,nilai,hasil,fitur,hasil
5,PEMANFAATAN TOGAF ADM UNTUK PERANCANGAN SISTEM...,nilai,kerja,siswa,proses,metode
6,APLIKASI METODE FUZZY ANALYTIC NETWORK PROCESS...,metode,kriteria,uji,akurasi,proses
7,SISTEM PENDUKUNG KEPUTUSAN REKOMENDASI MENU DI...,aplikasi,usaha,mahasiswa,klasifikasi,tingkat
8,RANCANG BANGUN APLIKASI PEMILIHAN TEKNIK REKAY...,teliti,teliti,proses,data,butuh
9,DETEKSI COREPOINT SIDIK JARI MENGGUNAKAN METOD...,media,dukung,informasi,kenal,kembang


In [None]:
# Display the table indexed by title. set_index returns a new frame and the
# result is not assigned, so new_topic_word itself keeps its integer index.
new_topic_word.set_index('Judul')

Unnamed: 0_level_0,Topik 1,Topik 2,Topik 3,Topik 4,Topik 5
Judul,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Gerak Pekerja Pada Game Real Time Strategy Menggunakan Finite State Machine,ajar,sistem,data,citra,sistem
RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MENGGUNAKAN METODE FUZZY LOGIC,game,hasil,sistem,metode,informasi
EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEAR DISCRIMINANT ANALYSIS UNTUK PENGENALAN WAJAH,hasil,metode,nilai,hasil,aplikasi
IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST SEARCH PADA PEMBUATAN MAZE GAME BERBASIS ANDROID OS MOBILE,main,putus,metode,teliti,data
Perancangan Sistem Informasi Badan Kepegawaian Daerah ( BKD ) Bangkalan Sebagai Sub sistem dari E-Government Bangkalan Menggunakan TOGAF ADM,sistem,nilai,hasil,fitur,hasil
PEMANFAATAN TOGAF ADM UNTUK PERANCANGAN SISTEM INFORMASI DINAS PERINDUSTRIAN & PERDAGANGAN SEBAGAI SUB SISTEM ARSITEKTUR E-GOVERNMENT KABUPATEN BANGKALAN,nilai,kerja,siswa,proses,metode
APLIKASI METODE FUZZY ANALYTIC NETWORK PROCESS (FANP) UNTUK MENDUKUNG KEPUTUSAN PROSES PROMOSI JABATAN DI PT. SURYA MADISTRINDO PAMEKASAN,metode,kriteria,uji,akurasi,proses
SISTEM PENDUKUNG KEPUTUSAN REKOMENDASI MENU DIET BAGI PASIEN RAWAT INAP MENGGUNAKAN METODE HARRIS BENEDICT DAN EUCLIDEAN (Studi Kasus : RSUD Dr. H. Moh. Anwar Sumenep),aplikasi,usaha,mahasiswa,klasifikasi,tingkat
RANCANG BANGUN APLIKASI PEMILIHAN TEKNIK REKAYASA KEBUTUHAN MENGGUNAKAN METODE SELF ORGANIZING MAP BERBASIS EUCLIDEAN DISTANCE DAN CANBERRA DISTANCE MATRIX,teliti,teliti,proses,data,butuh
DETEKSI COREPOINT SIDIK JARI MENGGUNAKAN METODE GEOMETRY OF REGION TECHNIQUE (GR),media,dukung,informasi,kenal,kembang


0    Gerak Pekerja Pada Game Real Time Strategy Men...
1    RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...
2    EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...
3    IMPLEMENTASI  ALGORITMA PRIM  DAN DEPTH FIRST ...
4    Perancangan Sistem Informasi Badan Kepegawaian...
Name: Judul, dtype: object

In [None]:
# Alias only (no copy): pd_docs and document_topic_df refer to the same object.
# document_topic_df is defined in an earlier cell.
pd_docs = document_topic_df

In [None]:
# Title column ("Judul") of the source frame.
pd_judul = df["Judul"]

In [None]:
# Titles as a NumPy array (the pandas index is dropped).
np_judul = np.array(pd_judul)

In [None]:
# Values of pd_docs as a plain NumPy array; row and column labels are dropped.
# Presumably one row per document and one column per topic -- confirm against
# the cell that builds document_topic_df.
np_docs = np.array(pd_docs)

In [None]:
# Titles as a plain Python list, to be paired with the rows of list_docs by position.
list_judul = np_judul.tolist()

In [None]:
# Nested Python list (one inner list per row of np_docs).
list_docs = np_docs.tolist()

In [None]:
# Prepend each title to its row of values, giving rows of
# [title, value_1, ..., value_k].
#
# The rows are rebuilt from np_docs instead of calling
# list_docs[i].insert(0, ...) on the existing lists: the in-place insert is
# not idempotent, so running the cell twice would prepend the title twice and
# leave every row one element too long.
# Titles are matched to rows by position; list_judul[i] raises IndexError if
# there are fewer titles than rows, same as the original loop.
list_docs = [
  [list_judul[i], *row]
  for i, row in enumerate(np_docs.tolist())
]

In [None]:
# dtype=object keeps each cell as the Python object it already is (str title,
# float values). Without it NumPy has to pick one common dtype for the mixed
# str/float rows, which is a string dtype, so every number would be stored as
# text.
np_new_docs = np.array(list_docs, dtype=object)

In [None]:
# Build the frame directly from the list of rows so pandas infers a dtype per
# column (str titles, float64 values). A 2-D NumPy array carries a single
# dtype for all columns, which the frame would otherwise inherit.
new_docs = pd.DataFrame(list_docs)

In [None]:
# Name the columns: the title first, then one column per topic
# (relabels the existing frame in place).
new_docs.columns = ["Judul","Topik 1","Topik 2","Topik 3","Topik 4","Topik 5"]

In [None]:
# Display the title + per-topic value table (pandas truncates the middle rows).
new_docs

Unnamed: 0,Judul,Topik 1,Topik 2,Topik 3,Topik 4,Topik 5
0,Gerak Pekerja Pada Game Real Time Strategy Men...,0.9898079633712769,0.002583565190434456,0.002533490303903818,0.0025292898062616587,0.002545666880905628
1,RANCANG BANGUN GAME PERAWATAN SAPI KARAPAN MEN...,0.3396812379360199,0.0022914381697773933,0.5494083166122437,0.002253410406410694,0.10636556148529053
2,EKSTRAKSI FITUR BERBASIS TWO DIMENSIONAL LINEA...,0.0017377132317051291,0.0017265917267650366,0.0017386884428560734,0.9930562973022461,0.0017406947445124388
3,IMPLEMENTASI ALGORITMA PRIM DAN DEPTH FIRST ...,0.9896274209022522,0.002583353314548731,0.0025887414813041687,0.0025886453222483397,0.0026117891538888216
4,Perancangan Sistem Informasi Badan Kepegawaian...,0.0018731908639892936,0.0018649994162842631,0.001859072595834732,0.0018604929791763425,0.9925422072410583
...,...,...,...,...,...,...
813,PENERAPAN ALGORITMA LONG-SHORT TERM MEMORY UNT...,0.001741106272675097,0.09143387526273727,0.0017272846307605505,0.0017566599417477846,0.9033410549163818
814,SISTEM PENCARIAN TEKS AL-QURAN TERJEMAHAN BERB...,0.918613076210022,0.002366314409300685,0.002397844335064292,0.07424444705247879,0.0023782572243362665
815,KLASIFIKASI KOMPLEKSITAS VISUAL CITRA SAMPAH M...,0.0012412165524438024,0.001243165577761829,0.0012590637197718024,0.9950113296508789,0.0012452802620828152
816,IDENTIFIKASI BINER ATRIBUT PEJALAN KAKI MENGGU...,0.0015832815552130342,0.0015867046313360333,0.0015810849145054817,0.9936653971672058,0.0015835328958928585


In [None]:
# NOTE: plain assignment creates an alias, not a copy -- cp_topic and
# topic_word_df are the same object. topic_word_df is defined in an earlier cell.
cp_topic = topic_word_df

In [None]:
# Display the topic-word table (one column per topic).
cp_topic

Unnamed: 0,Topik 1,Topik 2,Topik 3,Topik 4,Topik 5
0,ajar,sistem,data,citra,sistem
1,game,hasil,sistem,metode,informasi
2,hasil,metode,nilai,hasil,aplikasi
3,main,putus,metode,teliti,data
4,sistem,nilai,hasil,fitur,hasil
5,nilai,kerja,siswa,proses,metode
6,metode,kriteria,uji,akurasi,proses
7,aplikasi,usaha,mahasiswa,klasifikasi,tingkat
8,teliti,teliti,proses,data,butuh
9,media,dukung,informasi,kenal,kembang


In [None]:
# First 10 rows of np_docs as a DataFrame with default integer labels.
cp_proporsi = pd.DataFrame(np_docs).head(10)

In [None]:
# Display the 10-row slice.
cp_proporsi

Unnamed: 0,0,1,2,3,4
0,0.989808,0.002584,0.002533,0.002529,0.002546
1,0.339681,0.002291,0.549408,0.002253,0.106366
2,0.001738,0.001727,0.001739,0.993056,0.001741
3,0.989627,0.002583,0.002589,0.002589,0.002612
4,0.001873,0.001865,0.001859,0.00186,0.992542
5,0.001804,0.001826,0.001802,0.001825,0.992742
6,0.001762,0.992928,0.001775,0.001762,0.001773
7,0.002489,0.002501,0.549567,0.002522,0.442921
8,0.001609,0.001621,0.914165,0.080992,0.001613
9,0.397289,0.001736,0.001754,0.597442,0.001778


In [None]:
# Values of the 10-row slice as a NumPy array.
np_proporsi = np.array(cp_proporsi)

In [None]:
# Same values as a nested Python list (one inner list per row).
list_proporsi = np_proporsi.tolist()

In [None]:
# Topic-word table as a NumPy array (column labels are dropped).
np_topic = np.array(cp_topic)

In [None]:
# Topic words as a nested Python list (one inner list per table row).
list_topic = np_topic.tolist()

In [None]:
# Inspect the nested list of values.
list_proporsi

[[0.9898079633712769,
  0.002583565190434456,
  0.002533490303903818,
  0.0025292898062616587,
  0.002545666880905628],
 [0.3396812379360199,
  0.0022914381697773933,
  0.5494083166122437,
  0.002253410406410694,
  0.10636556148529053],
 [0.0017377132317051291,
  0.0017265917267650366,
  0.0017386884428560734,
  0.9930562973022461,
  0.0017406947445124388],
 [0.9896274209022522,
  0.002583353314548731,
  0.0025887414813041687,
  0.0025886453222483397,
  0.0026117891538888216],
 [0.0018731908639892936,
  0.0018649994162842631,
  0.001859072595834732,
  0.0018604929791763425,
  0.9925422072410583],
 [0.0018039126880466938,
  0.0018264437094330788,
  0.001802054583095014,
  0.001825135201215744,
  0.9927424192428589],
 [0.0017621769802644849,
  0.9929278492927551,
  0.001775026903487742,
  0.001762057770974934,
  0.0017728576203808188],
 [0.0024891714565455914,
  0.0025009841192513704,
  0.5495671033859253,
  0.0025216119829565287,
  0.44292107224464417],
 [0.0016091654542833567,
  0.0016