#Klasifikasi Data


Eksperimen 4

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from torch.utils.data import Dataset, DataLoader
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import pandas as pd

In [25]:
df = pd.read_csv('data_bersih_filter_manual.csv')

#Filter Data

In [26]:
# 2. Filter Data Labeled
# Kita membuang baris yang kolom 'label_manual'-nya kosong (NaN)
df_labeled = df.dropna(subset=['label_manual']).copy()

# Opsional: Mengubah format label dari float (1.0) menjadi integer (1) agar lebih rapi
df_labeled['label_manual'] = df_labeled['label_manual'].astype(int)

print(f"Total data setelah filter (hanya yang berlabel): {len(df_labeled)}")
print("Sebaran label:\n", df_labeled['label_manual'].value_counts())

Total data setelah filter (hanya yang berlabel): 267
Sebaran label:
 label_manual
 1    103
-1    101
 0     63
Name: count, dtype: int64


#Stemming

In [27]:
# 3. Proses Stemming dengan Sastrawi
# Membuat stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Fungsi untuk stemming satu kalimat
def stemming_text(text):
    if pd.isna(text): # Cek jika text kosong
        return ""
    return stemmer.stem(text)

# Terapkan ke kolom 'text_clean'
# Kita buat kolom baru 'text_stemmed' agar data asli tidak hilang
df_labeled['text_stemmed'] = df_labeled['text_clean'].apply(stemming_text)

In [28]:
# 4. Lihat Hasil
print("\nContoh hasil stemming:")
print(df_labeled[['text_clean', 'text_stemmed']].head())


Contoh hasil stemming:
                                          text_clean  \
1  bubar aja makan bergizi gratis itu bagus di ka...   
2  wajib diapresiasi presiden prabowo konsisten p...   
3                                     astaghfirullah   
6  saya aman makan bergizi gratis gratis wajah te...   
7  sdn randuagung utara dapat membagi banyak yang...   

                                        text_stemmed  
1  bubar aja makan gizi gratis itu bagus di kasih...  
2  wajib apresiasi presiden prabowo konsisten pri...  
3                                     astaghfirullah  
6  saya aman makan gizi gratis gratis wajah senyu...  
7   sdn randuagung utara dapat bagi banyak yang enak  


In [29]:
# 1. Mapping Label (-1 -> 0, 0 -> 1, 1 -> 2)
label_map = {-1: 0, 0: 1, 1: 2}
df_labeled['label_final'] = df_labeled['label_manual'].map(label_map)

In [30]:
# 2. Tokenisasi (Memecah kalimat menjadi list kata)
# Kita gunakan kolom 'text_stemmed' yang sudah Anda buat
texts = df_labeled['text_stemmed'].astype(str).tolist()
labels = df_labeled['label_final'].tolist()

In [31]:
# Split kalimat menjadi kata-kata (token)
tokenized_texts = [text.split() for text in texts]

#Word2Vec

In [32]:
# 3. Latih Word2Vec
# Kita latih dengan window=5 (konteks 5 kata) dan vector_size=100
print("Melatih Word2Vec...")
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)
print("Word2Vec selesai dilatih.")

Melatih Word2Vec...
Word2Vec selesai dilatih.


In [33]:
# 4. Buat Vocabulary & Embedding Matrix
# Ini adalah kamus untuk mengubah kata -> indeks angka
word2idx = {word: i+1 for i, word in enumerate(w2v_model.wv.index_to_key)} # 0 disediakan untuk padding
vocab_size = len(word2idx) + 1
embedding_dim = 100

#LSTM

In [34]:
# Buat matriks bobot untuk layer Embedding LSTM nanti
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in word2idx.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

embedding_tensor = torch.FloatTensor(embedding_matrix)
print(f"Ukuran Vocabulary: {vocab_size}")

Ukuran Vocabulary: 1346


In [36]:
# 6. PADDING & SPLIT DATA
# ==========================================

# Tentukan panjang maksimal kalimat (misal 50 kata)
max_len = 50

def encode_and_pad(tokens):
    # Ubah kata jadi angka
    encoded = [word2idx.get(t, 0) for t in tokens]

    # Padding atau Potong
    if len(encoded) < max_len:
        encoded += [0] * (max_len - len(encoded)) # Tambah 0 di belakang
    else:
        encoded = encoded[:max_len] # Potong jika kepanjangan
    return encoded

# Terapkan ke semua data
X_encoded = [encode_and_pad(t) for t in tokenized_texts]

In [37]:
# Ubah ke format Tensor PyTorch
X_tensor = torch.tensor(X_encoded, dtype=torch.long)
y_tensor = torch.tensor(labels, dtype=torch.long)

In [38]:
# Split Train (80%) dan Test (20%)
X_train, X_test, y_train, y_test = train_test_split(
    X_tensor, y_tensor, test_size=0.2, random_state=42, stratify=y_tensor
)

In [39]:
# Buat DataLoader
train_data = list(zip(X_train, y_train))
test_data = list(zip(X_test, y_test))

train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16, shuffle=False)

print(f"Jumlah Data Train: {len(X_train)}")
print(f"Jumlah Data Test: {len(X_test)}")

Jumlah Data Train: 213
Jumlah Data Test: 54


In [40]:
# 7. DEFINISI MODEL LSTM
# ==========================================
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, embedding_matrix):
        super(SentimentLSTM, self).__init__()

        # Layer Embedding (menggunakan bobot dari Word2Vec)
        # freeze=False artinya kita izinkan model mengupdate bobot kata-katanya lagi
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix, freeze=False)

        # Layer LSTM
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)

        # Layer Output (Fully Connected)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Masuk embedding
        embedded = self.embedding(x)

        # Masuk LSTM
        # output: (batch, seq_len, hidden_dim)
        # hidden: (num_layers, batch, hidden_dim)
        _, (hidden, _) = self.lstm(embedded)

        # Kita ambil hidden state terakhir (hidden[-1])
        return self.fc(hidden[-1])

In [41]:
# Inisialisasi Model
model = SentimentLSTM(vocab_size=vocab_size,
                      embed_dim=embedding_dim,
                      hidden_dim=64,   # Bisa diganti 128 jika kurang kompleks
                      output_dim=3,    # 3 Kelas: Negatif, Netral, Positif
                      embedding_matrix=embedding_tensor)

In [42]:
# Cek device (gunakan GPU jika ada)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Menggunakan device: {device}")
model = model.to(device)

Menggunakan device: cuda


In [43]:
# 8. TRAINING MODEL
# ==========================================
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

epochs = 20 # Coba 15-20 epoch untuk LSTM
print("Mulai Training...")

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if (epoch+1) % 5 == 0:
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

print("Training Selesai.")

Mulai Training...
Epoch 5/20, Loss: 1.0675
Epoch 10/20, Loss: 1.0228
Epoch 15/20, Loss: 0.9793
Epoch 20/20, Loss: 0.8593
Training Selesai.


In [44]:
# ==========================================
# 9. EVALUASI
# ==========================================
model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        _, preds = torch.max(outputs, 1)

        all_preds.extend(preds.cpu().numpy())
        all_targets.extend(targets.cpu().numpy())

# Hitung Akurasi
acc = accuracy_score(all_targets, all_preds)
print(f"\nAkurasi LSTM + Word2Vec: {acc:.2%}")

# Tampilkan Laporan Klasifikasi
target_names = ['Negatif', 'Netral', 'Positif'] # Sesuai urutan 0, 1, 2
print("\nClassification Report:")
print(classification_report(all_targets, all_preds, target_names=target_names))


Akurasi LSTM + Word2Vec: 42.59%

Classification Report:
              precision    recall  f1-score   support

     Negatif       0.42      0.80      0.55        20
      Netral       0.43      0.46      0.44        13
     Positif       0.50      0.05      0.09        21

    accuracy                           0.43        54
   macro avg       0.45      0.44      0.36        54
weighted avg       0.45      0.43      0.35        54

