In [110]:
from pprint import pprint

import pandas as pd

df = pd.read_csv('teknik-informatika-raw.csv')
abstracts = df['abstract'].tolist()
title = df['title']

In [111]:
import nltk

nltk.download('stopwords', download_dir='.venv/nltk_data')

[nltk_data] Downloading package stopwords to .venv/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [112]:
from nltk.corpus import stopwords
import string
import re
import json

def lower_case(text):
    return text.lower()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

def remove_numbers(text):
    return re.sub(r'\d+', '', text)

def remove_whitespace(text):
    return text.strip()

with open('stopwords-id.json') as f:
    stopwords_id = json.load(f)

stopwords_indonesian = stopwords.words('indonesian')
stopwords_indonesian.extend(stopwords_id)

def remove_stopwords(text):
    return ' '.join([word for word in text.split() if word not in stopwords_indonesian])

In [113]:
def preprocess(text):
    text = lower_case(text)
    text = remove_punctuation(text)
    text = remove_numbers(text)
    text = remove_whitespace(text)
    text = remove_stopwords(text)
    return text

abstracts = [preprocess(abstract) for abstract in abstracts]

In [114]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(
    max_features=100,
    max_df=0.8,
    min_df=0.2,
    ngram_range=(1, 3)
)

In [115]:
tfidf_matrix = vectorizer.fit_transform(abstracts)

In [116]:
features_names = vectorizer.get_feature_names_out()
dense = tfidf_matrix.todense()
denselist = dense.tolist()

all_keywords = []

for description in denselist:
    x = 0
    keywords = []
    for word in description:
        if word > 0:
            keywords.append(features_names[x])
        x += 1
    all_keywords.append(keywords)

In [117]:
from pprint import pprint

pprint(abstracts[0])
pprint(all_keywords[0])

('segmentasi pelanggan berdasarkan rfm menargetkan bagianbagian pelanggannya '
 'meminimalkan biaya perpesanan meningkatkan profitabilitas metode rfm '
 'pengelompokan pelanggan pelanggan dinilai profitabilitasnya perusahaan '
 'transaksi segmentasi pelanggan teknik data mining clustering keputusan '
 'bisnis efektif strategi bisnis segmennya penelitian teknik hierarchical '
 'clustering pengelompokan data metode agglomerative hierarchical clustering '
 'ahc memproses hasil analisis rfm dimana metode menghasilkan pengelompokan '
 'pelanggan dimana pelanggan masuk kelompok data penelitian data transaksi '
 'retail data data jupyter notebook tools implementasi metode bahasa python '
 'mengolah data mempresentasikan data luaran penelitian segmentasi pelanggan '
 'ditampilkan visualisasi grafik menggambarkan perilaku bisnis pelanggan '
 'segmentasi pelanggan dihasilkan kelompok dimana kelompok kelompok '
 'berdasarkan data transaksi')
['bahasa', 'berdasarkan', 'data', 'hasil', 'metode', 'p