# Klasifikasi Data

## Eksperimen 3

In [9]:
# ==========================================
# 2. IMPORT & PERSIAPAN DATA
# ==========================================
import pandas as pd
import numpy as np
import torch
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from datasets import Dataset

In [10]:
# Load the dataset (change the file name to match your own file in Colab).
# Make sure 'data_bersih_filter_manual.csv' has been uploaded to the Colab session.
df = pd.read_csv('data_bersih_filter_manual.csv')

## Filter Data

In [11]:
# 2. Keep only the labelled rows
# Rows whose 'label_manual' is missing (NaN) are dropped, and the remaining labels
# are cast from float (1.0) to integer (1) in the same chain. astype() returns a
# new frame, so df_labeled is independent of df.
df_labeled = df.dropna(subset=['label_manual']).astype({'label_manual': int})

print(f"Total data setelah filter (hanya yang berlabel): {len(df_labeled)}")
print("Sebaran label:\n", df_labeled['label_manual'].value_counts())

Total data setelah filter (hanya yang berlabel): 267
Sebaran label:
 label_manual
 1    103
-1    101
 0     63
Name: count, dtype: int64


## Stemming

In [12]:
# 3. Stemming with Sastrawi
# Build the stemmer once; it is reused by stemming_text for every row.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stem a single sentence
def stemming_text(text):
    """Return the stemmed form of ``text``.

    Missing input (None / NaN, as detected by ``pd.isna``) yields an empty
    string; anything else is passed to the module-level ``stemmer``.
    """
    return "" if pd.isna(text) else stemmer.stem(text)

# Stem every entry of 'text_clean'. The result goes into a new column,
# 'text_stemmed', so the original text stays available.
df_labeled['text_stemmed'] = df_labeled['text_clean'].map(stemming_text)

In [13]:
# 4. Inspect the result: cleaned vs. stemmed text for the first rows
print(
    "\nContoh hasil stemming:",
    df_labeled[['text_clean', 'text_stemmed']].head(),
    sep="\n",
)


Contoh hasil stemming:
                                          text_clean  \
1  bubar aja makan bergizi gratis itu bagus di ka...   
2  wajib diapresiasi presiden prabowo konsisten p...   
3                                     astaghfirullah   
6  saya aman makan bergizi gratis gratis wajah te...   
7  sdn randuagung utara dapat membagi banyak yang...   

                                        text_stemmed  
1  bubar aja makan gizi gratis itu bagus di kasih...  
2  wajib apresiasi presiden prabowo konsisten pri...  
3                                     astaghfirullah  
6  saya aman makan gizi gratis gratis wajah senyu...  
7   sdn randuagung utara dapat bagi banyak yang enak  


In [14]:
# Make sure the text column contains no null values.
# stemming_text already returns "" for missing input, so this is a safety net.
df_labeled['text_stemmed'] = df_labeled['text_stemmed'].fillna('')

In [15]:
# Feature (stemmed text) and target (manual label) series.
# NOTE(review): X and y are not referenced by the later cells visible in this
# notebook, which work from df_labeled directly — confirm whether they are still needed.
X = df_labeled['text_stemmed']
y = df_labeled['label_manual']

## IndoBERT

Model IndoBERT sudah memiliki "otak" internal yang akan mengubah kata menjadi vektor (angka) secara otomatis di lapisan dalamnya (Embedding Layer & Attention Layers).

In [16]:
# ==========================================
# 5. DATA PREPARATION FOR THE MODEL
# ==========================================

# Manual labels are shifted to 0-based class ids:
# -1 -> 0 (Negatif), 0 -> 1 (Netral), 1 -> 2 (Positif)
label_map = dict(zip((-1, 0, 1), range(3)))
# Class id -> display name, used to turn predictions back into labels
reverse_label_map = dict(enumerate(("Negatif", "Netral", "Positif")))

In [17]:
# Translate the manual labels into model class ids; values absent from label_map become NaN.
df_labeled['label_final'] = df_labeled['label_manual'].map(label_map)

In [18]:
# Check whether any label failed to map (NaN in 'label_final').
unmapped_mask = df_labeled['label_final'].isnull()
if unmapped_mask.any():
    print("Peringatan: Ada label yang tidak dikenali!")
    print(df_labeled[unmapped_mask])
    # .copy() gives an independent frame, matching how df_labeled was first
    # created, so the column assignment below does not raise
    # SettingWithCopyWarning.
    df_labeled = df_labeled.dropna(subset=['label_final']).copy()
    # The NaNs forced the column to float; restore integer class ids.
    df_labeled['label_final'] = df_labeled['label_final'].astype(int)

In [19]:
# Train/validation split (80% / 20%), stratified on the mapped label so both
# parts keep the same class proportions. random_state pins the shuffle.
train_df, val_df = train_test_split(
    df_labeled,
    stratify=df_labeled['label_final'],
    test_size=0.2,
    random_state=42,
)

In [20]:
# Report how many rows ended up in each split
print(
    f"Jumlah Data Train: {len(train_df)}",
    f"Jumlah Data Validasi: {len(val_df)}",
    sep="\n",
)

Jumlah Data Train: 213
Jumlah Data Validasi: 54


In [21]:
# Convert to HuggingFace Datasets, keeping only the input text and the target label.
# preserve_index=False: after train_test_split the frames carry a shuffled,
# non-default index, which from_pandas would otherwise store as an extra
# '__index_level_0__' column.
train_dataset = Dataset.from_pandas(train_df[['text_stemmed', 'label_final']], preserve_index=False)
val_dataset = Dataset.from_pandas(val_df[['text_stemmed', 'label_final']], preserve_index=False)

# Rename to the column names used downstream (text_stemmed -> text, label_final -> label)
train_dataset = train_dataset.rename_column("text_stemmed", "text")
train_dataset = train_dataset.rename_column("label_final", "label")
val_dataset = val_dataset.rename_column("text_stemmed", "text")
val_dataset = val_dataset.rename_column("label_final", "label")

In [22]:
# ==========================================
# 6. TOKENIZATION
# ==========================================
# The checkpoint name is shared by the tokenizer here and the model loaded later.
model_checkpoint = "indobenchmark/indobert-base-p1"
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
    """Tokenize the 'text' field of a batch of examples.

    Sequences are truncated and padded to a fixed length of 128 tokens.
    Returns whatever the module-level ``tokenizer`` produces for the batch.
    """
    texts = examples['text']
    return tokenizer(texts, max_length=128, padding='max_length', truncation=True)

# Tokenize each split in batches, then drop the raw 'text' column so it is not
# passed on to the model (optional but recommended).
encoded_train = train_dataset.map(preprocess_function, batched=True).remove_columns(['text'])
encoded_val = val_dataset.map(preprocess_function, batched=True).remove_columns(['text'])

# Have both datasets return torch tensors
encoded_train.set_format(type="torch")
encoded_val.set_format(type="torch")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/213 [00:00<?, ? examples/s]

Map:   0%|          | 0/54 [00:00<?, ? examples/s]

In [23]:
# ==========================================
# 7. MODEL & METRICS SETUP
# ==========================================
# Seed torch before loading: the classification head is not part of the checkpoint
# and is randomly initialised here, before any Trainer exists to apply its own seed.
# The value matches the random_state used for the train/validation split.
torch.manual_seed(42)

# num_labels=3 because there are 3 classes (Negatif, Netral, Positif)
model = BertForSequenceClassification.from_pretrained(model_checkpoint, num_labels=3)

def compute_metrics(eval_pred):
    """Compute evaluation metrics from an ``(outputs, labels)`` pair.

    The predicted class is the argmax of the outputs along axis 1.

    Returns:
        dict with 'accuracy' and the weighted-average 'f1'.
    """
    logits, gold = eval_pred
    predicted = np.argmax(logits, axis=1)

    return {
        'accuracy': accuracy_score(gold, predicted),
        'f1': f1_score(gold, predicted, average='weighted'),
    }

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# ==========================================
# 8. TRAINING
# ==========================================
batch_size = 16  # Tune to the available GPU memory; lower to 8 on out-of-memory errors

args = TrainingArguments(
    output_dir="indobert-finetuned-sentiment",
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluate and checkpoint once per epoch; reload the most accurate checkpoint at the end
    eval_strategy="epoch",  # previously named evaluation_strategy
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    logging_steps=10,
)

# Wire the model, data, and metrics together.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated and emits a warning on this call.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_train,
    eval_dataset=encoded_val,
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

  trainer = Trainer(
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33makbarrmdhn1304[0m ([33makbarrmdhn1304-university-technology-of-yogyakarta[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0815,0.948905,0.462963,0.39474
2,0.8742,0.916831,0.574074,0.539095
3,0.5438,0.872462,0.611111,0.599489
4,0.396,0.835062,0.592593,0.58387
5,0.2458,0.833488,0.592593,0.583112


TrainOutput(global_step=70, training_loss=0.5967385189873832, metrics={'train_runtime': 250.183, 'train_samples_per_second': 4.257, 'train_steps_per_second': 0.28, 'total_flos': 70053947470080.0, 'train_loss': 0.5967385189873832, 'epoch': 5.0})

In [29]:
# ==========================================
# 10. PREDICTING NEW DATA
# ==========================================
def predict_text(text):
    """Predict the sentiment of a single raw text.

    The text is stemmed with ``stemming_text`` (the model was trained on
    stemmed text), tokenized, and run through the module-level ``model``.

    Args:
        text: Raw input sentence.

    Returns:
        Tuple ``(label, confidence)``: the predicted class name taken from
        ``reverse_label_map`` and the softmax probability of that class.
    """
    stemmed_text = stemming_text(text)

    inputs = tokenizer(stemmed_text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Move the inputs to the same device as the model (GPU/CPU)
    device = model.device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Switch to eval mode so dropout is disabled; otherwise repeated calls on
    # the same text can return different probabilities.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    pred_idx = torch.argmax(probs, dim=-1).item()

    return reverse_label_map[pred_idx], probs[0][pred_idx].item()

# Example usage: one sample sentence per expected sentiment
sample_texts = [
    "Program makan gratis ini sangat membantu masyarakat miskin",
    "Program ini lumayan sedikit membantu",
    "berhentikans aja program ini sampah"
]

print("\n--- Hasil Prediksi ---")
for txt in sample_texts:
    sentiment, confidence = predict_text(txt)
    # One call prints both lines; the trailing "\n" keeps a blank line between samples
    print(
        f"Teks: {txt}",
        f"Prediksi: {sentiment} (Conf: {confidence:.2%})\n",
        sep="\n",
    )


--- Hasil Prediksi ---
Teks: Program makan gratis ini sangat membantu masyarakat miskin
Prediksi: Positif (Conf: 71.81%)

Teks: Program ini lumayan sedikit membantu
Prediksi: Netral (Conf: 54.03%)

Teks: berhentikans aja program ini sampah
Prediksi: Negatif (Conf: 74.99%)

