## Crowling Data pta.trunojoyo

In [None]:
# import library yang di butuhkan
import requests
from bs4 import BeautifulSoup
import csv

In [None]:
# ambil data dari url
fakultas = 4
page = 1
url = 'https://pta.trunojoyo.ac.id/c_search/byfac/{}/{}'

# Membuka file CSV untuk menulis hasil scraping
with open('hasil_scraping.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['Judul', 'Penulis', 'Dosen Pembimbing I', 'Dosen Pembimbing II', 'Abstrak']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Menulis header ke dalam file CSV
    writer.writeheader()

    while page <= 40:
        url = url.format(fakultas, page)
        req = requests.get(url)
        soup = BeautifulSoup(req.text, 'html.parser')
        items = soup.find_all('li', attrs={'data-id': 'id-1'})

        if not items:
            break

        for it in items:
            data = {}
            title = it.find('a', class_='title').text
            data['Judul'] = title
            div_elements = it.find_all('div', style='padding:2px 2px 2px 2px;')
            for div in div_elements:
                span = div.find('span')
                if span:
                    span_text = span.get_text()
                    key, value = span_text.split(':', 1)
                    data[key.strip()] = value.strip()

            # Mengambil link abstrak dari elemen dengan kelas 'gray button'
            abstrak_button = it.find('a', class_='gray button')
            if abstrak_button:
                abstrak_link = abstrak_button['href']
                abstrak_req = requests.get(abstrak_link)
                abstrak_soup = BeautifulSoup(abstrak_req.text, 'html.parser')
                abstrak = abstrak_soup.find('p', align='justify')
                if abstrak:
                    abstrak_text = abstrak.get_text(strip=True)
                    data['Abstrak'] = abstrak_text
                else:
                    data['Abstrak'] = "Abstrak tidak ditemukan"

            # Menulis data ke dalam file CSV
            writer.writerow(data)
            print("Data berhasil ditambahkan:", data)

        page += 1

print("Scraping selesai")


Data berhasil ditambahkan: {'Judul': 'PERANCANGAN DAN IMPLEMENTASI SISTEM DATABASE \r\nTERDISTRIBUSI MENGGUNAKAN ORACLE STUDI KASUS \r\nSIAKAD UNIVERSITAS TRUNOJOYO', 'Penulis': 'A.Ubaidillah S.Kom', 'Dosen Pembimbing I': 'Budi Setyono M.T', 'Dosen Pembimbing II': 'Hermawan S.T', 'Abstrak': 'Sistem  informasi  akademik  (SIAKAD) merupakan  sistem  informasi  yang  berfungsi  menangani pengelolaan  dan  penyajian  data-data  akademik,  yang  oleh pihak  fakultas  SIAKAD  dianggap  sangat  penting  dalam memberikan  pelayanan  mahasiswa  yang  membutuhkan informasi akademik. Di Universitas Trunojoyo telah tersedia SIAKAD,  namun  masih  menggunakan  database  terpusat. Sistem seperti ini memberikan kelebihan yaitu perawatannya mudah  selain  itu  juga  membutuhkan  sedikit  biaya,  namun sistem  tersebut  juga  berpotensi  mengahadapi  kendala-kendala  yaitu  dalam  proses  transaksi  data  karena  padatnya jaringan yang menuju database SIAKAD, kelambatan dalam pemrosesan  respon  query 

## Normalisasi Text


In [None]:
import pandas as pd

dataset = pd.read_csv('hasil_scraping.csv')
dataset.shape

(200, 5)

In [None]:
dataset.head()

Unnamed: 0,Judul,Penulis,Dosen Pembimbing I,Dosen Pembimbing II,Abstrak
0,PERANCANGAN DAN IMPLEMENTASI SISTEM DATABASE \...,A.Ubaidillah S.Kom,Budi Setyono M.T,Hermawan S.T,Sistem informasi akademik (SIAKAD) merupaka...
1,APLIKASI KONTROL DAN MONITORING JARINGAN KOMPU...,"M. Basith Ardianto,","Drs. Budi Soesilo, MT","Koko Joni, ST",Berjalannya koneksi jaringan komputer dengan l...
2,RANCANG BANGUN APLIKASI PROXY SERVER UNTUK\r\n...,"Akhmad Suyandi, S.Kom","Drs. Budi Soesilo, M.T","Hermawan, ST, MT",Web server adalah sebuah perangkat lunak serve...
3,SISTEM PENDUKUNG KEPUTUSAN OPTIMASI PENJADWALA...,Heri Supriyanto,"Mulaab, S.Si., M.Kom","Firli Irhamni, ST., M.Kom",Penjadwalan kuliah di Perguruan Tinggi me...
4,SISTEM AUGMENTED REALITY ANIMASI BENDA BERGERA...,Septian Rahman Hakim,"Arik Kurniawati, S.Kom., M.T.","Haryanto, S.T., M.T.",Seiring perkembangan teknologi yang ada diduni...


In [None]:
import nltk
import string
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')

# Import stopwords dalam bahasa Indonesia
from nltk.corpus import stopwords
stop_words = set(stopwords.words('indonesian'))

# Function untuk menghapus tanda baca dan stopwords
def preprocess_text(teks):
    # Punctuation process (Hapus tanda baca)
    teks_clean = teks.translate(str.maketrans('', '', string.punctuation))

    # Tokenisasi
    tokens = nltk.word_tokenize(teks_clean)

    # stopwords
    teks_cleaned = [word for word in tokens if word.lower() not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in teks_cleaned]

    # Gabungkan kembali tokens menjadi teks
    teks_final = ' '.join(stemmed_tokens)

    return teks_final

# Hapus tanda baca dan stopwords dari kolom 'abstrak'
dataset['abstrak_cleaned'] = dataset['Abstrak'].apply(preprocess_text)
ab = dataset[:15]['abstrak_cleaned']

# Ekstraksi fitur dan membentuk VSM dalam term frequency
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(ab)
# print(count_matrix)
count_array = count_matrix.toarray()
df_term = pd.DataFrame(data=count_array,columns = vectorizer.vocabulary_.keys())
df_term

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,sistem,informasi,akademik,siakad,berfungsi,menangani,pengelolaan,penyajian,datadata,fakulta,...,dimana,dikolaborasikan,menggerakkan,langkah,playernya,interaksi,obyek,gerak,bola,dipukul
0,0,3,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
1,6,0,2,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,4,1,0,1,0,...,0,0,0,1,0,0,0,1,0,11
3,0,0,0,3,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,0,0,0,2,1,0,3,0,7,...,0,0,0,0,0,1,0,0,2,0
5,0,3,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
6,6,0,2,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,0,4,1,0,1,0,...,0,0,0,1,0,0,0,1,0,11
8,0,0,0,3,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
9,0,0,0,0,2,1,0,3,0,7,...,0,0,0,0,0,1,0,0,2,0


In [None]:
import numpy as np

# menghitung jumlah kemunculan kata untuk setiap kata dalam VSM dalam term frequency
word_counts = np.sum(count_matrix.toarray(), axis=0)

# membuat pasangan kata dan jumlah kemunculannya dalam bentuk kamus (dictionary)
word_count_dict = dict(zip(vectorizer.get_feature_names_out(), word_counts))

# Tampilkan kamus kata dan jumlah kemunculannya
print("\nJumlah Kemunculan Kata:")
for word, count in word_count_dict.items():
    print(f"{word}: {count}")


Jumlah Kemunculan Kata:
administr: 18
akademik: 9
aks: 6
algoritma: 9
animasi: 6
aplikasi: 18
aplikasiaplikasi: 3
ar: 9
aslinya: 3
augment: 21
avail: 3
bahasa: 3
bandwidth: 3
banyaknya: 3
basisdata: 6
bata: 3
benda: 3
bentuk: 3
berbasi: 6
berbentuk: 6
berdasarkan: 3
berfungsi: 6
bergerak: 6
berisi: 3
berjalan: 3
berjalannya: 6
berkembang: 3
berpotensi: 6
bertanggung: 3
bertujuan: 3
biaya: 3
bidang: 6
bola: 3
browser: 12
cepat: 3
client: 12
contoh: 3
dasar: 3
data: 12
databas: 12
datadata: 3
dekripsi: 3
diaks: 3
dianggap: 3
dibentuk: 6
dibuktikan: 3
dibutuhkan: 3
dicoba: 3
didalam: 3
didapatkan: 3
didunia: 3
diharapkan: 6
dijadikan: 3
diken: 3
dikirim: 6
dikolaborasikan: 3
dimana: 3
dimiliki: 3
dipertimbangkan: 3
dipukul: 3
disebabkan: 3
display: 3
distribusi: 3
ditampilkan: 6
diterapkan: 3
dituntut: 3
dokumen: 33
dunia: 6
engin: 3
enkripsi: 6
explor: 3
fakulta: 6
firefox: 3
fit: 3
flartoolkit: 6
flash: 3
gangguan: 18
genetika: 9
gerak: 3
halamanhalaman: 3
harapan: 3
hasil: 6
hasilnya:

In [None]:
def nltk_frequency_vectorize(corpus):

    # The NLTK frequency vectorize method
    from collections import defaultdict

    def vectorize(doc):
        features = defaultdict(int)

        for token in tokenize(doc):
            features[token] += 1

        return features

    return map(vectorize, corpus)
vectnltk=nltk_frequency_vectorize(dataset['abstrak_cleaned'])
type(vectnltk)

map

In [None]:
# Frequency
def sklearn_frequency_vectorize(corpus):
    # The Scikit-Learn frequency vectorize method
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(corpus)
vectsklen=sklearn_frequency_vectorize(dataset['abstrak_cleaned'])
print(vectsklen)

  (0, 214)	7
  (0, 87)	3
  (0, 1)	3
  (0, 213)	4
  (0, 21)	1
  (0, 133)	1
  (0, 175)	1
  (0, 180)	1
  (0, 40)	1
  (0, 71)	2
  (0, 43)	1
  (0, 165)	1
  (0, 121)	1
  (0, 130)	2
  (0, 235)	1
  (0, 232)	1
  (0, 227)	1
  (0, 39)	4
  (0, 225)	2
  (0, 96)	1
  (0, 184)	1
  (0, 153)	1
  (0, 30)	1
  (0, 27)	1
  (0, 137)	1
  :	:
  (199, 190)	1
  (199, 140)	1
  (199, 118)	2
  (199, 170)	2
  (199, 74)	2
  (199, 204)	1
  (199, 205)	1
  (199, 193)	1
  (199, 75)	1
  (199, 20)	1
  (199, 4)	2
  (199, 16)	1
  (199, 22)	2
  (199, 187)	1
  (199, 191)	1
  (199, 56)	1
  (199, 55)	1
  (199, 142)	1
  (199, 114)	1
  (199, 194)	1
  (199, 88)	1
  (199, 158)	2
  (199, 78)	1
  (199, 32)	1
  (199, 59)	1


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

#coun_vect = CountVectorizer()
coun_vect = CountVectorizer()
count_matrix = coun_vect.fit_transform(dataset['abstrak_cleaned'])
count_array = count_matrix.toarray()
df = pd.DataFrame(data=count_array,columns = coun_vect.vocabulary_.keys())
df

Unnamed: 0,sistem,informasi,akademik,siakad,berfungsi,menangani,pengelolaan,penyajian,datadata,fakulta,...,dimana,dikolaborasikan,menggerakkan,langkah,playernya,interaksi,obyek,gerak,bola,dipukul
0,0,3,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
1,6,0,2,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,4,1,0,1,0,...,0,0,0,1,0,0,0,1,0,11
3,0,0,0,3,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,0,0,0,2,1,0,3,0,7,...,0,0,0,0,0,1,0,0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,3,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
196,6,0,2,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
197,0,0,0,0,0,4,1,0,1,0,...,0,0,0,1,0,0,0,1,0,11
198,0,0,0,3,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [None]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import pandas as pd

tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words= 'english')
tfidf_wm = tfidfvectorizer.fit_transform(dataset['abstrak_cleaned'])

tfidf_tokens = tfidfvectorizer.get_feature_names_out()
df_tfidfvect = pd.DataFrame(data = tfidf_wm.toarray(),columns = tfidf_tokens)
print("TF-IDF\n")
df_tfidfvect

TF-IDF



Unnamed: 0,administr,akademik,aks,algoritma,animasi,aplikasi,aplikasiaplikasi,ar,aslinya,augment,...,transaksi,trunojoyo,tuga,tujuan,universita,upaya,variabel,view,virtual,web
0,0.000000,0.260462,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.086821,0.086821,0.000000,0.000000,0.086821,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.300444,0.000000,0.100148,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.036909,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.105137,0.045153,0.000000,0.045153,0.000000,...,0.000000,0.000000,0.000000,0.045153,0.000000,0.000000,0.000000,0.045153,0.000000,0.496684
3,0.000000,0.000000,0.000000,0.252088,0.000000,0.048914,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.061937,0.000000,0.000000,0.000000,0.084029,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.145724,0.042414,0.000000,0.218586,0.000000,0.510033,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.072862,0.000000,0.000000,0.145724,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.000000,0.260462,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.086821,0.086821,0.000000,0.000000,0.086821,0.000000,0.000000,0.000000,0.000000,0.000000
196,0.300444,0.000000,0.100148,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.036909,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
197,0.000000,0.000000,0.000000,0.000000,0.000000,0.105137,0.045153,0.000000,0.045153,0.000000,...,0.000000,0.000000,0.000000,0.045153,0.000000,0.000000,0.000000,0.045153,0.000000,0.496684
198,0.000000,0.000000,0.000000,0.252088,0.000000,0.048914,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.061937,0.000000,0.000000,0.000000,0.084029,0.000000,0.000000,0.000000


In [None]:
# Logaritmic Frequency
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(dataset['abstrak_cleaned'])
log_count_matrix = np.log1p(count_matrix.toarray())
df_log_vsm = pd.DataFrame(data=log_count_matrix, columns=vectorizer.get_feature_names_out())
df_log_vsm

Unnamed: 0,administr,akademik,aks,algoritma,animasi,aplikasi,aplikasiaplikasi,ar,aslinya,augment,...,transaksi,trunojoyo,tuga,tujuan,universita,upaya,variabel,view,virtual,web
0,0.00000,1.386294,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.693147,0.693147,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000
1,1.94591,0.000000,1.098612,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,0.00000,0.000000,0.000000,0.000000,0.000000,1.609438,0.693147,0.000000,0.693147,0.000000,...,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.693147,0.000000,2.484907
3,0.00000,0.000000,0.000000,1.386294,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000
4,0.00000,0.000000,0.000000,0.000000,1.098612,0.693147,0.000000,1.386294,0.000000,2.079442,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,1.098612,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.00000,1.386294,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.693147,0.693147,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000
196,1.94591,0.000000,1.098612,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
197,0.00000,0.000000,0.000000,0.000000,0.000000,1.609438,0.693147,0.000000,0.693147,0.000000,...,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.693147,0.000000,2.484907
198,0.00000,0.000000,0.000000,1.386294,0.000000,0.693147,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000,0.693147,0.000000,0.000000,0.000000


In [None]:
# Binary Frequency
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(dataset['abstrak_cleaned'])
feature_names = vectorizer.get_feature_names_out()
df_vsm_binary = pd.DataFrame(data=X.toarray(), columns=feature_names)
df_vsm_binary

Unnamed: 0,administr,akademik,aks,algoritma,animasi,aplikasi,aplikasiaplikasi,ar,aslinya,augment,...,transaksi,trunojoyo,tuga,tujuan,universita,upaya,variabel,view,virtual,web
0,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,1,1,0,1,0,...,0,0,0,1,0,0,0,1,0,1
3,0,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
4,0,0,0,0,1,1,0,1,0,1,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0,1,0,0,0,0,0,0,0,0,...,1,1,0,0,1,0,0,0,0,0
196,1,0,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
197,0,0,0,0,0,1,1,0,1,0,...,0,0,0,1,0,0,0,1,0,1
198,0,0,0,1,0,1,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [None]:
!pip install transformers



## LDA

In [None]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.datasets import make_multilabel_classification
# This produces a feature matrix of token counts, similar to what
# CountVectorizer would produce on text.
X, _ = make_multilabel_classification(random_state=0)
lda = LatentDirichletAllocation(n_components=5, random_state=0)
lda.fit(X)
LatentDirichletAllocation(...)
# get topics for some given samples:
lda.transform(X[-2:])

array([[0.00360392, 0.25499205, 0.0036211 , 0.64236448, 0.09541846],
       [0.15297572, 0.00362644, 0.44412786, 0.39568399, 0.003586  ]])