In [2]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
import nltk
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\frans\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Membaca data dari file CSV
# Try reading the file with different encodings
encodings_to_try = ['utf-8', 'latin1', 'ISO-8859-1']

for encoding in encodings_to_try:
    try:
        df = pd.read_csv('Daftar_Skripsi.csv', encoding=encoding)
        print(f"File successfully read using encoding: {encoding}")
        # Further processing or analysis on the dataframe
        break  # Stop the loop if the file is read successfully
    except UnicodeDecodeError:
        print(f"Could not read the file with encoding: {encoding}")

# If none of the encodings work, consider further inspection or conversion of the file

Could not read the file with encoding: utf-8
File successfully read using encoding: latin1


Preprocessing

In [4]:
# Fungsi untuk membersihkan simbol dan angka dari teks
def remove_symbols_and_numbers(text):
    # Menggunakan ekspresi reguler untuk menghilangkan simbol dan angka
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text
    
# Terapkan remove_symbols_and_numbers pada kolom 'JudulSkripsi' dan membuat kolom baru judul_cleaned
df['judul_cleaned'] = df['JudulSkripsi'].apply(remove_symbols_and_numbers)

Tokenisasi dan Case Folding

In [5]:
# Tokenisasi dan case folding pada kolom judul
df['judul_tokenized'] = df['judul_cleaned'].apply(lambda x: word_tokenize(str(x)))
df['judul_lower'] = df['judul_tokenized'].apply(lambda x: [word.lower() for word in x])


Filtering

In [6]:
# stopword from file
list_stopwords = pd.read_csv("stopwords.txt", names= ["stopwords"], header = None)

In [7]:
# Function to remove stopwords
def stopwords_removal(words, stopwords):
    return [word for word in words if word.lower() not in stopwords]

# Read stopwords from the file stopwords.txt
with open('stopwords.txt', 'r') as file:
    stopwords = file.read().splitlines()

# Assuming 'data' contains the DataFrame and 'judul_lower' column is the list of words
# Apply stopwords_removal to each list in 'judul_lower' column
df['judul_no_stopwords'] = df['judul_lower'].apply(lambda x: stopwords_removal(x, stopwords))


In [8]:
pip install Sastrawi

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\frans\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [9]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Function to perform stemming
def apply_stemming(words):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()

    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)  # Join stemmed words back into a sentence

# Apply stemming to each list in 'judul_lower' column
df['judul_stemmed'] = df['judul_no_stopwords'].apply(apply_stemming)

Kumpulan kata dasar = df['judul_stemmed']

In [None]:
# Assuming 'df' contains the DataFrame and 'judul_stemmed' column has the stemmed text

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the stemmed text data using TF-IDF
tfidf_matrix = tfidf_vectorizer.fit_transform(df['judul_stemmed'])

# Convert TF-IDF matrix to DataFrame for further analysis
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names(), index=df.index)

In [None]:
from sklearn.cluster import KMeans
    # Melakukan clustering menggunakan KMeans
num_clusters = 5
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)
df["cluster_label"] = kmeans.labels_

# Mendapatkan informasi tentang cluster
cluster_centers = kmeans.cluster_centers_
cluster_counts = df["cluster_label"].value_counts()

In [None]:
from sklearn.metrics import pairwise_distances

# Get the indices of documents in each cluster
cluster_indices = [df.index[df['cluster_label'] == cluster_num].tolist() for cluster_num in range(num_clusters)]

# Function to calculate average distance within a cluster for a given document index
def average_distance_within_cluster(doc_index, cluster_indices):
    cluster_num = df.loc[doc_index, 'cluster_label']
    cluster_docs = cluster_indices[cluster_num]

    # Calculate pairwise distances between the given document and all other documents in the same cluster
    distances = pairwise_distances(tfidf_matrix[doc_index], tfidf_matrix[cluster_docs], metric='cosine')[0]

    # Calculate average distance
    average_distance = sum(distances) / len(distances)

    return average_distance

# Calculate and store average distances for each document in the DataFrame
df['rata_rata_jarak_antar_dokumen_dalam_satu_kluster'] = df.index.map(lambda x: average_distance_within_cluster(x, cluster_indices))

# Optional: Display the DataFrame with the new column
print(df[['judul_stemmed', 'cluster_label', 'rata_rata_jarak_antar_dokumen_dalam_satu_kluster']])


                                        judul_stemmed  cluster_label  \
0   alat deteksi bocor gas lpg bas arduino uno sms...              0   
1   sistem dukung putus nilai kerja pegawai honore...              3   
2   bangun kenal hewan augmented reality bas andro...              1   
3   alat siram tanam otomatis arduino uno kendali sms              0   
4   desain implementasi elearning lkp active engli...              4   
..                                                ...            ...   
61  aplikasi olah data sparepart pt kaltim prima u...              4   
62  aplikasi monitoring bukti potong pph pasal pt ...              4   
63                aplikasi bugar tubuh bas multimedia              4   
64  sistem informasi data pegawai kantor desa loa ...              4   
65  aplikasi olah data service handphone orange ph...              4   

    average_distance_within_cluster  
0                          0.694108  
1                          0.816303  
2                    

In [None]:
from sklearn.metrics import pairwise_distances

# Function to calculate average distance from a document to all documents in other clusters
def average_distance_to_other_clusters(doc_index, cluster_indices):
    cluster_num = df.loc[doc_index, 'cluster_label']
    cluster_docs = cluster_indices[cluster_num]

    # Calculate pairwise distances between the given document and all documents in other clusters
    distances = []
    for other_cluster_num, other_cluster_docs in enumerate(cluster_indices):
        if other_cluster_num != cluster_num:
            distances.extend(pairwise_distances(tfidf_matrix[doc_index], tfidf_matrix[other_cluster_docs], metric='cosine')[0])

    # Calculate average distance to other clusters
    average_distance_to_other_clusters = sum(distances) / len(distances) if len(distances) > 0 else 0

    return average_distance_to_other_clusters

# Calculate and store average distances to other clusters for each document in the DataFrame
df['rata_rata_jarak_antar_dokumen_dengan_kluster_lain'] = df.index.map(lambda x: average_distance_to_other_clusters(x, cluster_indices))

# Optional: Display the DataFrame with the new column
print(df[['judul_stemmed', 'cluster_label', 'rata_rata_jarak_antar_dokumen_dalam_satu_kluster', 'rata_rata_jarak_antar_dokumen_dengan_kluster_lain']])


NameError: name 'df' is not defined