In [14]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [15]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns

# Library Machine Learning (Scikit-Learn)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Library Deep Learning (TensorFlow/Keras)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Input, GlobalAveragePooling1D, Dropout, LayerNormalization, MultiHeadAttention

In [16]:
df = pd.read_csv('/content/dataset_labelling_manual.csv')
print(f"Total Data: {len(df)}")
print(df.head())

Total Data: 315
                   tokoh                                               text  \
0   Purbaya Yudhi Sadewa  sehat2 bapak presiden ku dan pak menkeu selalu...   
1  Sri Mulyani Indrawati  Di luar negri lumrah pejabat mundur, di NKRI i...   
2  Sri Mulyani Indrawati  Pajak rakyat terus ditekan, pajak pengusaha ti...   
3   Purbaya Yudhi Sadewa  Pihak2 yang selama ini merasa tdk diawasi dlm ...   
4   Purbaya Yudhi Sadewa                                 Pemimpin itu laki2   

     label  
0  positif  
1  negatif  
2  negatif  
3   netral  
4   netral  


In [29]:
# --- KONFIGURASI UMUM ---
MAX_FEATURES = 5000
TEST_SIZE = 0.2
RANDOM_STATE = 42
FILE_MANUAL = 'hasil_labelling_manual.csv'
FILE_FULL = 'dataset_tugas_purbaya_vs_srimulyani.csv'

# --- INISIALISASI STEMMER (WAJIB ADA) ---
# Diperlukan agar perintah stemmer.stem() di bawah berfungsi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
print("Menginisialisasi Sastrawi Stemmer...")
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# --- KODE BARU: STOPWORDS & CLEANING ---
# Import Stopword Factory dari Sastrawi
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# 1. SIAPKAN DAFTAR STOPWORDS
factory_sw = StopWordRemoverFactory()
stopwords_indo = factory_sw.get_stop_words()

# Tambahan stopwords manual (kata yang sering muncul tapi tidak penting di WordCloud)
more_stopwords = [
    'yg', 'dg', 'rt', 'dgn', 'ny', 'd', 'klo', 'kalo', 'amp', 'biar',
    'bikin', 'bilang', 'gak', 'ga', 'krn', 'nya', 'nih', 'sih', 'si',
    'tau', 'tdk', 'tuh', 'utk', 'ya', 'jd', 'sdh', 'aja', 'n',
    't', 'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt', 'bukan', 'ini',
    'itu', 'ada', 'dan', 'dari', 'dia', 'ke', 'kita', 'mau', 'pada',
    'saya', 'kami', 'anda', 'mereka', 'semua', 'sudah', 'tapi', 'atau',
    'banyak', 'beberapa', 'biasa', 'bila', 'boleh', 'buat', 'bukan',
    'cukup', 'cuma', 'dapat', 'dari', 'depan', 'diri', 'dulu', 'enggak',
    'entah', 'hal', 'hampir', 'hanya', 'harus', 'hendak', 'ia', 'ingin',
    'ini', 'itu', 'jadi', 'jika', 'juga', 'justru', 'kalau',
    'kami', 'kamu', 'kan', 'kapan', 'karena', 'kata', 'ke', 'kembali',
    'kenapa', 'kepada', 'ketika', 'kita', 'lagi', 'lalu', 'lain', 'lama',
    'lewat', 'mana', 'mari', 'masih', 'mau', 'mengapa', 'mereka', 'maka',
    'menurut', 'mungkin', 'nanti', 'namun', 'nyaris', 'oleh', 'pada',
    'padahal', 'paling', 'para', 'pasti', 'per', 'pernah', 'pula', 'pun',
    'saat', 'saja', 'sambil', 'sampai', 'sana', 'sangat', 'saya', 'sebab',
    'bagai', 'seperti', 'sering', 'siapa', 'silakan', 'sini', 'suatu',
    'sudah', 'supaya', 'tadi', 'tanpa', 'tapi', 'telah', 'tentang',
    'tentu', 'tepat', 'terhadap', 'tetapi', 'tiap', 'toh', 'tunjuk',
    'turut', 'untuk', 'waduh', 'wah', 'wahai', 'wong', 'yaitu', 'yakni', 'yang'
]

# 2. DAFTAR NAMA TOKOH (DIHAPUS AGAR TIDAK BIAS)
stopwords_tokoh = [
    'sri', 'mulyani', 'indrawati', 'bu sri', 'menkeu', 'menteri keuangan',
    'purbaya', 'yudhi', 'sadewa', 'pak purbaya', 'purbaya sadewa',
    'menteri', 'pak', 'bu', 'bapak', 'ibu', 'presiden', 'jokowi', 'prabowo'
]

# Gabungkan semua stopwords menjadi satu SET (agar pencarian cepat)
# Set otomatis menghapus duplikat
final_stopwords = set(stopwords_indo + more_stopwords + stopwords_tokoh)

# 3. KAMUS SLANG
slang_dict = {
    'yg': 'yang', 'gk': 'tidak', 'ga': 'tidak', 'gak': 'tidak', 'enggak': 'tidak',
    'bgt': 'banget', 'dlm': 'dalam', 'sy': 'saya', 'aku': 'saya', 'gw': 'saya',
    'gue': 'saya', 'lu': 'kamu', 'lo': 'kamu', 'anda': 'kamu', 'kalo': 'kalau',
    'kl': 'kalau', 'klo': 'kalau', 'dr': 'dari', 'dgn': 'dengan', 'krn': 'karena',
    'jd': 'jadi', 'jdi': 'jadi', 'sdh': 'sudah', 'udh': 'sudah', 'blm': 'belum',
    'tp': 'tapi', 'tpi': 'tapi', 'tak': 'tidak', 'tdk': 'tidak', 'jgn': 'jangan',
    'utk': 'untuk', 'aja': 'saja', 'aj': 'saja', 'lbh': 'lebih', 'sbg': 'sebagai',
    'bs': 'bisa', 'bisa': 'dapat', 'pd': 'pada', 'kmrn': 'kemarin', 'skrg': 'sekarang',
    'trs': 'terus', 'bkn': 'bukan', 'ok': 'oke', 'thx': 'terima kasih',
    'makasih': 'terima kasih', 'tks': 'terima kasih', 'gan': 'juragan',
    'kak': 'kakak', 'min': 'admin', 'tau': 'tahu', 'mw': 'mau', 'mo': 'mau',
    'lbh': 'lebih', 'memang': 'memang', 'emang': 'memang', 'kayak': 'seperti',
    'kyk': 'seperti', 'kek': 'seperti', 'bpk': 'Bapak', 'tdk': 'tidak'
}

def clean_text(text):
    """
    Pipeline: Lowercase -> Regex -> Slang -> Stopwords Removal -> Stemming
    """
    text = str(text).lower()

    # A. Regex Cleaning
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'#\w+', '', text)
    text = re.sub(r'[^a-z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    words = text.split()
    final_words = []

    for word in words:
        # B. Normalisasi Slang
        word = slang_dict.get(word, word)

        # C. STOPWORD REMOVAL
        # Jika kata ada di daftar stopwords, lewati/hapus
        if word in final_stopwords:
            continue

        # D. Hapus kata < 3 huruf (Double filter)
        if len(word) < 3:
            continue

        final_words.append(word)

    text = " ".join(final_words)

    # E. Stemming (Sastrawi)
    if len(text) > 0:
        text = stemmer.stem(text)

    return text



Menginisialisasi Sastrawi Stemmer...


PERCOBAAN 1: Bag of Words (BoW) + Naive Bayes

In [30]:
vectorizer_bow = CountVectorizer(max_features=5000) # BoW Fitur
X_train_bow = vectorizer_bow.fit_transform(X_train_text)
X_test_bow = vectorizer_bow.transform(X_test_text)

nb_model = MultinomialNB()
nb_model.fit(X_train_bow, y_train)
y_pred_nb = nb_model.predict(X_test_bow)

acc_nb = accuracy_score(y_test, y_pred_nb)
results.append({"Model": "Naive Bayes", "Feature": "BoW", "Accuracy": acc_nb})
print(f"Akurasi: {acc_nb:.4f}")

Akurasi: 0.6190


PERCOBAAN 2: TF-IDF + Logistic Regression

In [31]:
vectorizer_tfidf = TfidfVectorizer(max_features=5000) # TF-IDF Fitur
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train_text)
X_test_tfidf = vectorizer_tfidf.transform(X_test_text)

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

acc_lr = accuracy_score(y_test, y_pred_lr)
results.append({"Model": "Logistic Regression", "Feature": "TF-IDF", "Accuracy": acc_lr})
print(f"Akurasi: {acc_lr:.4f}")

Akurasi: 0.6032


PERCOBAAN 3: TF-IDF + SVM

In [32]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

acc_svm = accuracy_score(y_test, y_pred_svm)
results.append({"Model": "SVM", "Feature": "TF-IDF", "Accuracy": acc_svm})
print(f"Akurasi: {acc_svm:.4f}")

Akurasi: 0.6032


PERSIAPAN DEEP LEARNING (Word Embeddings)

In [33]:

print("Sedang membuat kolom 'clean_text'...")

df['clean_text'] = df['text'].apply(clean_text)

df = df.dropna(subset=['clean_text'])
df = df[df['clean_text'].str.strip().astype(bool)]

print("✓ Kolom 'clean_text' berhasil dibuat!")
print(df[['text', 'clean_text']].head()) # Cek hasilnya

Sedang membuat kolom 'clean_text'...
✓ Kolom 'clean_text' berhasil dibuat!
                                                text  \
0  sehat2 bapak presiden ku dan pak menkeu selalu...   
1  Di luar negri lumrah pejabat mundur, di NKRI i...   
2  Pajak rakyat terus ditekan, pajak pengusaha ti...   
3  Pihak2 yang selama ini merasa tdk diawasi dlm ...   
4                                 Pemimpin itu laki2   

                                          clean_text  
0          sehat2 selalu untung lindung tuhan aamiin  
1  luar negri lumrah jabat mundur nkri inginya pe...  
2    pajak rakyat terus tekan pajak usaha transfaran  
3  pihak2 lama rasa awas guna anggar sekarang mul...  
4                                       pimpin laki2  


In [34]:
# Tokenisasi untuk LSTM & Transformer
MAX_WORDS = 5000
MAX_LEN = 100
EMBEDDING_DIM = 50

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(df['clean_text'])
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

# Padding agar panjang kalimat sama
X_train_pad = pad_sequences(X_train_seq, maxlen=MAX_LEN)
X_test_pad = pad_sequences(X_test_seq, maxlen=MAX_LEN)

PERCOBAAN 4: Embeddings + LSTM

In [35]:
model_lstm = Sequential()
model_lstm.add(Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN))
model_lstm.add(SpatialDropout1D(0.2))
model_lstm.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_lstm.add(Dense(num_classes, activation='softmax'))

model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.fit(X_train_pad, y_train, epochs=10, batch_size=32, verbose=0) # Training diam-diam

y_pred_lstm = np.argmax(model_lstm.predict(X_test_pad), axis=1)
acc_lstm = accuracy_score(y_test, y_pred_lstm)
results.append({"Model": "LSTM", "Feature": "Embeddings", "Accuracy": acc_lstm})
print(f"Akurasi: {acc_lstm:.4f}")



[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 282ms/step
Akurasi: 0.4603


PERCOBAAN 5: Embeddings + Transformer

In [36]:
inputs = Input(shape=(MAX_LEN,))
emb_layer = Embedding(MAX_WORDS, EMBEDDING_DIM)(inputs)

# Transformer Block Sederhana
att = MultiHeadAttention(num_heads=2, key_dim=EMBEDDING_DIM)(emb_layer, emb_layer)
att = Dropout(0.1)(att)
norm = LayerNormalization(epsilon=1e-6)(att + emb_layer)
pool = GlobalAveragePooling1D()(norm)
dropout = Dropout(0.1)(pool)
outputs = Dense(num_classes, activation="softmax")(dropout)

model_trans = Model(inputs=inputs, outputs=outputs)
model_trans.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model_trans.fit(X_train_pad, y_train, epochs=10, batch_size=32, verbose=0)

y_pred_trans = np.argmax(model_trans.predict(X_test_pad), axis=1)
acc_trans = accuracy_score(y_test, y_pred_trans)
results.append({"Model": "Transformer", "Feature": "Embeddings", "Accuracy": acc_trans})
print(f"Akurasi: {acc_trans:.4f}")

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 117ms/step
Akurasi: 0.3492


In [37]:
print("\n=== REKAP HASIL ===")
df_results = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)
print(df_results)

print("\nDetail untuk Model Terbaik (Classification Report):")
# Kita ambil contoh SVM sebagai baseline report
print(classification_report(y_test, y_pred_svm, target_names=le.classes_))


=== REKAP HASIL ===
                  Model     Feature  Accuracy
0           Naive Bayes         BoW  0.619048
1           Naive Bayes         BoW  0.619048
6           Naive Bayes         BoW  0.619048
11          Naive Bayes         BoW  0.619048
3                   SVM      TF-IDF  0.603175
4                  LSTM  Embeddings  0.603175
7   Logistic Regression      TF-IDF  0.603175
2   Logistic Regression      TF-IDF  0.603175
13                  SVM      TF-IDF  0.603175
8                   SVM      TF-IDF  0.603175
12  Logistic Regression      TF-IDF  0.603175
9                  LSTM  Embeddings  0.476190
5           Transformer  Embeddings  0.460317
14                 LSTM  Embeddings  0.460317
10          Transformer  Embeddings  0.396825
15          Transformer  Embeddings  0.349206

Detail untuk Model Terbaik (Classification Report):
              precision    recall  f1-score   support

     negatif       0.50      0.45      0.47        20
      netral       0.52      0.64  