In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import nltk
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\frans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Membaca data dari file CSV
# Try reading the file with different encodings
encodings_to_try = ['utf-8', 'latin1', 'ISO-8859-1']

for encoding in encodings_to_try:
    try:
        df = pd.read_csv('Daftar_Skripsi.csv', encoding=encoding)
        print(f"File successfully read using encoding: {encoding}")
        # Further processing or analysis on the dataframe
        break  # Stop the loop if the file is read successfully
    except UnicodeDecodeError:
        print(f"Could not read the file with encoding: {encoding}")

# If none of the encodings work, consider further inspection or conversion of the file

Could not read the file with encoding: utf-8
File successfully read using encoding: latin1


Preprocessing

In [3]:
# Fungsi untuk membersihkan simbol dan angka dari teks
def remove_symbols_and_numbers(text):
    # Menggunakan ekspresi reguler untuk menghilangkan simbol dan angka
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text
    
# Terapkan remove_symbols_and_numbers pada kolom 'JudulSkripsi' dan membuat kolom baru judul_cleaned
df['judul_cleaned'] = df['JudulSkripsi'].apply(remove_symbols_and_numbers)

Tokenisasi dan Case Folding

In [4]:
# Tokenisasi dan case folding pada kolom judul
df['judul_tokenized'] = df['judul_cleaned'].apply(lambda x: word_tokenize(str(x)))
df['judul_lower'] = df['judul_tokenized'].apply(lambda x: [word.lower() for word in x])


Filtering

In [5]:
# stopword from file
list_stopwords = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

In [6]:
# Function to remove stopwords
def stopwords_removal(words, stopwords):
    return [word for word in words if word.lower() not in stopwords]

# Read stopwords from the file stopwords.txt
with open('stopwords.txt', 'r') as file:
    stopwords = file.read().splitlines()

# Assuming 'data' contains the DataFrame and 'judul_lower' column is the list of words
# Apply stopwords_removal to each list in 'judul_lower' column
df['judul_no_stopwords'] = df['judul_lower'].apply(lambda x: stopwords_removal(x, stopwords))


In [7]:
pip install Sastrawi

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\frans\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [8]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Function to perform stemming
def apply_stemming(words):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)  # Join stemmed words back into a sentence

# Apply stemming to each list in 'judul_lower' column
df['judul_stemmed'] = df['judul_no_stopwords'].apply(apply_stemming)

Kumpulan kata dasar = df['judul_stemmed']

In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def calculate_wtf(df):
    # Filter out empty documents
    df = df[df["judul_stemmed"].apply(lambda x: len(x) > 0)]

    if df.empty:
        print("No non-empty documents to calculate WTF.")
        return None

    # Step 1: Count Term Frequency (TF)
    tf_vectorizer = TfidfVectorizer()
    
    try:
        tf_matrix = tf_vectorizer.fit_transform(df["judul_stemmed"].apply(lambda x: ' '.join(x)))
    except ValueError:
        print("Error: All documents may contain only stop words.")
        return None

    # Step 2: Normalize TF
    normalized_tf_matrix = tf_matrix / tf_matrix.sum(axis=1)

    # Step 3: Additional Weights (if needed)

    # Convert to DataFrame for easier manipulation
    wtf_df = pd.DataFrame(normalized_tf_matrix.toarray(), columns=tf_vectorizer.get_feature_names_out())

    return wtf_df

# Call the function with your DataFrame
wtf_result = calculate_wtf(df)

# Display the result if it's not None
if wtf_result is not None:
    print(wtf_result)


Error: All documents may contain only stop words.


In [9]:
# Assuming 'data' contains the DataFrame and 'judul_stemmed' column has the stemmed text

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the stemmed text data using TF-IDF
tfidf_matrix = tfidf_vectorizer.fit_transform(df['judul_stemmed'])

# Convert TF-IDF matrix to DataFrame for further analysis
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names(), index=df.index)

In [14]:
    # Membuat DataFrame untuk menyimpan WTF
    wtf_df = pd.DataFrame(index=df.index, columns=tfidf_vectorizer.get_feature_names())

    # Mengisi DataFrame WTF dengan nilai WTF
    for i in range(len(df)):
        doc_tfidf = tfidf_matrix[i].toarray().flatten()
        doc_words = df["judul_stemmed"][i]

        for j, word in enumerate(doc_words):
            wtf_df.at[i, word] = doc_tfidf[j]

    # Menggantikan nilai NaN dengan 0
    wtf_df = wtf_df.fillna(0)

    # Menyimpan DataFrame WTF ke dalam file CSV
    wtf_df.to_csv("wtf.csv", index=True)

In [13]:
import json
# Menyimpan DataFrame ke dalam tabel MySQL
for _, row in tfidf_df.iterrows():
    words_json = json.dumps(row.to_dict())  # Convert the row to a dictionary before JSON serialization
    print(words_json)


{"absensi": 0.0, "active": 0.0, "additive": 0.0, "adventure": 0.0, "ahp": 0.0, "airlangga": 0.0, "ajar": 0.0, "aju": 0.0, "akses": 0.0, "al": 0.0, "alat": 0.2576131987963104, "alazhar": 0.0, "alfabet": 0.0, "algoritma": 0.0, "alna": 0.0, "aman": 0.0, "anak": 0.0, "analytical": 0.0, "and": 0.0, "andi": 0.0, "android": 0.0, "aoheng": 0.0, "aplikasi": 0.0, "aplkasi": 0.0, "arduino": 0.2770220435285546, "area": 0.0, "astinet": 0.0, "athfal": 0.0, "atmega": 0.0, "attribute": 0.0, "augmented": 0.0, "awas": 0.0, "ayam": 0.0, "badan": 0.0, "bahan": 0.0, "baik": 0.0, "baku": 0.0, "bakung": 0.0, "balai": 0.0, "balikpapan": 0.0, "banding": 0.0, "bandwidth": 0.0, "bangun": 0.0, "bank": 0.0, "barang": 0.0, "barcode": 0.0, "bas": 0.10937458837494803, "basic": 0.0, "bayes": 0.0, "beasiswa": 0.0, "beli": 0.0, "berat": 0.0, "berita": 0.0, "berlian": 0.0, "bhakti": 0.0, "biologi": 0.0, "biro": 0.0, "bni": 0.0, "bocor": 0.3043772737677731, "bontang": 0.0, "borgol": 0.0, "borneo": 0.0, "bri": 0.0, "budaya

In [12]:
words_json

<function json.dumps(obj, *, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, default=None, sort_keys=False, **kw)>

In [15]:
# Menggunakan TfidfVectorizer untuk menghitung TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['judul_stemmed'])

# Mengonversi hasil ke dalam DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names())
print("TF-IDF:")
print(tfidf_df)

# Menghitung WTF (Weighted Term Frequency)
wtf_df = df.copy()
wtf_df['wtf'] = wtf_df['judul_stemmed'].apply(lambda x: len(x.split()))  # Menghitung jumlah kata
print("\nWTF:")
print(wtf_df[['wtf']])

TF-IDF:
    absensi    active  additive  adventure  ahp  airlangga  ajar  aju  akses  \
0       0.0  0.000000       0.0        0.0  0.0        0.0   0.0  0.0    0.0   
1       0.0  0.000000       0.0        0.0  0.0        0.0   0.0  0.0    0.0   
2       0.0  0.000000       0.0        0.0  0.0        0.0   0.0  0.0    0.0   
3       0.0  0.000000       0.0        0.0  0.0        0.0   0.0  0.0    0.0   
4       0.0  0.360152       0.0        0.0  0.0        0.0   0.0  0.0    0.0   
..      ...       ...       ...        ...  ...        ...   ...  ...    ...   
61      0.0  0.000000       0.0        0.0  0.0        0.0   0.0  0.0    0.0   
62      0.0  0.000000       0.0        0.0  0.0        0.0   0.0  0.0    0.0   
63      0.0  0.000000       0.0        0.0  0.0        0.0   0.0  0.0    0.0   
64      0.0  0.000000       0.0        0.0  0.0        0.0   0.0  0.0    0.0   
65      0.0  0.000000       0.0        0.0  0.0        0.0   0.0  0.0    0.0   

     al  ...    visual  vpn  vu

In [21]:
# Term Frequency (TF) calculation
tf_df = df["judul_stemmed"].apply(lambda x: pd.Series(x).value_counts()).fillna(0)

# Weighted Term Frequency (WTF) calculation
wtf_df = tf_df * tfidf_df

# Merge TF-IDF, TF, and WTF DataFrames
result_df = pd.concat([tfidf_df, tf_df, wtf_df], axis=1)

In [22]:
result_df

Unnamed: 0,absensi,active,additive,adventure,ahp,airlangga,ajar,aju,akses,al,...,visual,vpn,vuforia,web,website,weighted,weighting,winbox,wisata,wp
0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
4,0.0,0.360152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
62,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
63,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
64,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,


In [13]:
from sklearn.cluster import KMeans

# Initialize KMeans clustering
num_clusters = 5  # Ubah jumlah kluster sesuai kebutuhan
kmeans = KMeans(n_clusters=num_clusters, random_state=42)

# Fit KMeans to the TF-IDF matrix
kmeans.fit(tfidf_matrix)

# Predict cluster labels for the data points
df['cluster_label'] = kmeans.labels_

# Optional: Print the cluster centers
print("Cluster Centers:")
print(kmeans.cluster_centers_)  # Centroids of clusters

# Optional: Analyze the cluster assignments
cluster_counts = df['cluster_label'].value_counts()
print("\nCluster Counts:")
print(cluster_counts)  # Number of data points in each cluster

Cluster Centers:
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.02473705 0.        ]
 [0.         0.         0.02122812 ... 0.         0.         0.02538005]
 [0.0164457  0.01333895 0.00978795 ... 0.01279429 0.         0.        ]]

Cluster Counts:
4    27
2    14
3    12
1     8
0     5
Name: cluster_label, dtype: int64


In [14]:
df

Unnamed: 0,id,JudulSkripsi,NamaPeneliti,Tahun,ProgramStudi,judul_cleaned,judul_tokenized,judul_lower,judul_no_stopwords,judul_stemmed,cluster_label
0,1,Alat Pendeteksi Kebocoran Gas Lpg Berbasis Ard...,Adesta Tito Suwandhita,2018,Teknik Informatika,Alat Pendeteksi Kebocoran Gas Lpg Berbasis Ard...,"[Alat, Pendeteksi, Kebocoran, Gas, Lpg, Berbas...","[alat, pendeteksi, kebocoran, gas, lpg, berbas...","[alat, pendeteksi, kebocoran, gas, lpg, berbas...",alat deteksi bocor gas lpg bas arduino uno sms...,0
1,2,Sistem Pendukung Keputusan Penilaian Kinerja P...,Muhammad Faizal Rusbian,2018,Sistem Informasi,Sistem Pendukung Keputusan Penilaian Kinerja P...,"[Sistem, Pendukung, Keputusan, Penilaian, Kine...","[sistem, pendukung, keputusan, penilaian, kine...","[sistem, pendukung, keputusan, penilaian, kine...",sistem dukung putus nilai kerja pegawai honore...,3
2,3,Membangun Pengenalan Hewan Augmented Reality B...,Zulfikar Fitri Istyanto,2018,Teknik Informatika,Membangun Pengenalan Hewan Augmented Reality B...,"[Membangun, Pengenalan, Hewan, Augmented, Real...","[membangun, pengenalan, hewan, augmented, real...","[membangun, pengenalan, hewan, augmented, real...",bangun kenal hewan augmented reality bas andro...,1
3,4,Alat Penyiram Tanaman Otomatis Menggunakan Ard...,Aris Prasetyo,2018,Teknik Informatika,Alat Penyiram Tanaman Otomatis Menggunakan Ard...,"[Alat, Penyiram, Tanaman, Otomatis, Menggunaka...","[alat, penyiram, tanaman, otomatis, menggunaka...","[alat, penyiram, tanaman, otomatis, arduino, u...",alat siram tanam otomatis arduino uno kendali sms,0
4,5,Desain Dan Implementasi E-Learning Pada Lkp Ac...,Riski apriliansyah,2020,Sistem Informasi,Desain Dan Implementasi ELearning Pada Lkp Act...,"[Desain, Dan, Implementasi, ELearning, Pada, L...","[desain, dan, implementasi, elearning, pada, l...","[desain, implementasi, elearning, lkp, active,...",desain implementasi elearning lkp active engli...,4
...,...,...,...,...,...,...,...,...,...,...,...
61,62,Aplikasi Pengolahan Data Sparepart Pada Pt Kal...,Muhammad Salman,2017,Manajemen Informatika,Aplikasi Pengolahan Data Sparepart Pada Pt Kal...,"[Aplikasi, Pengolahan, Data, Sparepart, Pada, ...","[aplikasi, pengolahan, data, sparepart, pada, ...","[aplikasi, pengolahan, data, sparepart, pt, ka...",aplikasi olah data sparepart pt kaltim prima u...,4
62,63,Aplikasi Monitoring Bukti Potong Pph Pasal 15 ...,Wiwik Jayanti,2017,Manajemen Informatika,Aplikasi Monitoring Bukti Potong Pph Pasal Pa...,"[Aplikasi, Monitoring, Bukti, Potong, Pph, Pas...","[aplikasi, monitoring, bukti, potong, pph, pas...","[aplikasi, monitoring, bukti, potong, pph, pas...",aplikasi monitoring bukti potong pph pasal pt ...,4
63,64,Aplikasi Kebugaran Tubuh Berbasis Multimedia,Muhammad Yasin,2017,Manajemen Informatika,Aplikasi Kebugaran Tubuh Berbasis Multimedia,"[Aplikasi, Kebugaran, Tubuh, Berbasis, Multime...","[aplikasi, kebugaran, tubuh, berbasis, multime...","[aplikasi, kebugaran, tubuh, berbasis, multime...",aplikasi bugar tubuh bas multimedia,4
64,65,Sistem Informasi Data Kepegawaian Pada Kantor ...,Arya Sanjaya Lantang,2017,Manajemen Informatika,Sistem Informasi Data Kepegawaian Pada Kantor ...,"[Sistem, Informasi, Data, Kepegawaian, Pada, K...","[sistem, informasi, data, kepegawaian, pada, k...","[sistem, informasi, data, kepegawaian, kantor,...",sistem informasi data pegawai kantor desa loa ...,4


In [15]:
# Assuming df is your DataFrame
df['judul_tokenized'] = df['judul_tokenized'].apply(lambda x: ', '.join(map(str, x)))
df['judul_lower'] = df['judul_lower'].apply(lambda x: ', '.join(map(str, x)))
df['judul_no_stopwords'] = df['judul_no_stopwords'].apply(lambda x: ', '.join(map(str, x)))

In [16]:
df.to_csv("df3.csv")

In [17]:
# Misalkan 'df' adalah DataFrame yang Anda miliki
df['cluster_label'] = df['cluster_label'] + 1

# Print DataFrame setelah menambahkan 1 pada setiap nilai di kolom 'cluster_label'
print(df[['JudulSkripsi', 'cluster_label']])


                                         JudulSkripsi  cluster_label
0   Alat Pendeteksi Kebocoran Gas Lpg Berbasis Ard...              1
1   Sistem Pendukung Keputusan Penilaian Kinerja P...              4
2   Membangun Pengenalan Hewan Augmented Reality B...              2
3   Alat Penyiram Tanaman Otomatis Menggunakan Ard...              1
4   Desain Dan Implementasi E-Learning Pada Lkp Ac...              5
..                                                ...            ...
61  Aplikasi Pengolahan Data Sparepart Pada Pt Kal...              5
62  Aplikasi Monitoring Bukti Potong Pph Pasal 15 ...              5
63       Aplikasi Kebugaran Tubuh Berbasis Multimedia              5
64  Sistem Informasi Data Kepegawaian Pada Kantor ...              5
65  Aplikasi Pengolahan Data Service Handphone Pad...              5

[66 rows x 2 columns]


In [18]:
# Misalkan 'df' adalah DataFrame yang Anda miliki
sorted_df = df[['JudulSkripsi', 'cluster_label']].sort_values('cluster_label')

# Print DataFrame yang sudah diurutkan berdasarkan kolom 'cluster_label'
print(sorted_df)

                                         JudulSkripsi  cluster_label
0   Alat Pendeteksi Kebocoran Gas Lpg Berbasis Ard...              1
43  Penerapan Spk Decission Tree Menggunakan Algor...              1
37  Sistem Pendeteksi Peringatan Dini Keamanan Rum...              1
23  Game Edukasi Drag And Drop Budaya Nusantara Un...              1
3   Alat Penyiram Tanaman Otomatis Menggunakan Ard...              1
..                                                ...            ...
34  Sistem Informasi Pengelolaan Media Pembelajara...              5
40  Aplikasi Pemilihan Handphone Pada Parindo Cell...              5
41  Implementasi Absensi Siswa Berbasis Sms Gatewa...              5
45  Pengembangan Sistem Pengolahan Data Gaji Karya...              5
65  Aplikasi Pengolahan Data Service Handphone Pad...              5

[66 rows x 2 columns]


In [19]:
print(df.dtypes)


id                     int64
JudulSkripsi          object
NamaPeneliti          object
Tahun                  int64
ProgramStudi          object
judul_cleaned         object
judul_tokenized       object
judul_lower           object
judul_no_stopwords    object
judul_stemmed         object
cluster_label          int32
dtype: object
