In [None]:
from google.colab import drive
import os
import pandas as pd
import numpy as np
import torch
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

# Mount Google Drive so the project files stored under MyDrive are reachable from Colab.
drive.mount('/content/drive')
# Root folder of the project on Drive; later cells build their data paths from it.
PROJECT_PATH = "/content/drive/MyDrive/CODE SKRIPSI"

print("Setup Selesai.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Setup Selesai.


In [None]:
def load_slang_dict(filepath):
    """Load a two-column slang dictionary CSV into a plain ``dict``.

    The file is read without a header row and decoded as latin-1; the first
    column is treated as the slang form and the second as its formal
    replacement.

    Args:
        filepath: Path to the CSV file.

    Returns:
        dict mapping each slang form to its formal form. When a slang form
        occurs on several rows, the last row wins.
    """
    slang_table = pd.read_csv(
        filepath,
        header=None,
        names=['slang', 'formal'],
        encoding='latin-1',
    )
    return dict(zip(slang_table['slang'], slang_table['formal']))

def preprocess_text(text, slang_dictionary):
    """Normalize one raw sentence before it is passed to the tokenizer.

    Steps, in order:
      1. Convert to ``str`` and lowercase.
      2. Replace every whitespace-separated token found in
         ``slang_dictionary`` with its mapped value.
      3. Replace @mentions, http(s) URLs and any character that is not an
         ASCII letter, digit or whitespace with a space.
      4. Collapse runs of whitespace and strip both ends.

    NOTE(review): the slang lookup runs before punctuation is removed, so a
    token such as "gw," is not normalized; the URL pattern also stops at
    characters like "-" or "?". Confirm against the preprocessing used when
    the model was trained before changing either.

    Args:
        text: Raw input; any object is accepted and converted with ``str``.
        slang_dictionary: Mapping from slang token to replacement token.

    Returns:
        The cleaned, lowercased string.
    """
    lowered = str(text).lower()
    cleaned = " ".join(slang_dictionary.get(token, token) for token in lowered.split())

    # Each pattern is blanked out with a single space, in this exact order.
    noise_patterns = (
        r'@[A-Za-z0-9_]+',            # user mentions
        r'https?://[A-Za-z0-9./]+',   # links
        r'[^a-zA-Z0-9\s]',            # remaining punctuation / symbols
    )
    for pattern in noise_patterns:
        cleaned = re.sub(pattern, ' ', cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()

# Printed so that running this cell gives visible confirmation that the helpers are defined.
print("Fungsi-fungsi pendukung siap.")

Fungsi-fungsi pendukung siap.


In [None]:
# --- Configuration ---
# Both paths are derived from PROJECT_PATH so that relocating the project
# folder only requires changing that single constant.
PATH_MODEL_TERBAIK = os.path.join(PROJECT_PATH, "revisi/MODEL_FINAL_SIDANG_REVISI")
PATH_DATA_EVALUASI = os.path.join(PROJECT_PATH, "data/human_evaluation_data.csv")

# Per-label decision thresholds applied to the sigmoid probabilities.
# NOTE(review): the key order of this dict is used as the order of the model's
# output columns further down — confirm it matches the label order used in training.
OPTIMAL_THRESHOLDS = {
    'HS': 0.8, 'HS_Individual': 0.85, 'HS_Group': 0.77,
    'HS_Religion': 0.72, 'HS_Race': 0.36, 'HS_Physical': 0.39,
    'HS_Gender': 0.6, 'HS_Other': 0.84, 'HS_Weak': 0.88,
    'HS_Moderate': 0.71, 'HS_Strong': 0.82
}

# --- Load the fine-tuned model, its tokenizer and the slang dictionary ---
print("Memuat model, data, dan peralatan...")
model = AutoModelForSequenceClassification.from_pretrained(PATH_MODEL_TERBAIK)
tokenizer = AutoTokenizer.from_pretrained(PATH_MODEL_TERBAIK)
slang_dict = load_slang_dict(os.path.join(PROJECT_PATH, 'data/new_kamusalay.csv'))

# --- Load and preprocess the evaluation sentences ---
df_eval = pd.read_csv(PATH_DATA_EVALUASI)
# Empty text cells are read as NaN; map them to "" so preprocess_text does not
# turn them into the literal word "nan" via str().
sentences = ["" if pd.isna(s) else s for s in df_eval['text'].tolist()]
# Report the real number of sentences instead of a hard-coded count.
print(f"\nMemproses {len(sentences)} kalimat untuk evaluasi...")
processed_sentences = [preprocess_text(s, slang_dict) for s in sentences]

print("Model sedang melakukan prediksi...")
# Tokenize all sentences as one padded batch, truncated to 128 tokens.
inputs = tokenizer(processed_sentences, return_tensors="pt", padding=True, truncation=True, max_length=128)
# Inference only: no gradients are needed.
with torch.no_grad():
    logits = model(**inputs).logits
# Multi-label setup: each output is squashed independently with a sigmoid.
probabilities = torch.sigmoid(logits)

label_columns = list(OPTIMAL_THRESHOLDS.keys())

# Fail loudly if the model does not emit exactly one output per threshold label.
# Without this check, a model with more outputs would be silently mis-mapped
# (thresholds applied to the wrong columns).
if probabilities.shape[1] != len(label_columns):
    raise ValueError(
        f"Jumlah keluaran model ({probabilities.shape[1]}) tidak sama dengan "
        f"jumlah label pada OPTIMAL_THRESHOLDS ({len(label_columns)})."
    )

predictions_df = pd.DataFrame()

# Apply each label's threshold to obtain the final 0/1 prediction.
for i, label in enumerate(label_columns):
    threshold = OPTIMAL_THRESHOLDS[label]
    predictions_df[label] = (probabilities[:, i] > threshold).int().numpy()

# Label groups that sit beneath the top-level 'HS' flag.
target_labels = ['HS_Individual', 'HS_Group']
category_labels = ['HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other']
level_labels = ['HS_Weak', 'HS_Moderate', 'HS_Strong']

# Column position of every label inside `probabilities`.
label_to_index = dict(zip(label_columns, range(len(label_columns))))
target_indices = [label_to_index[name] for name in target_labels]
category_indices = [label_to_index[name] for name in category_labels]
level_indices = [label_to_index[name] for name in level_labels]


def _most_probable_label(row, group_labels, group_indices):
    """Return the label from `group_labels` with the highest probability on `row`."""
    best_position = torch.argmax(probabilities[row, group_indices]).item()
    return group_labels[best_position]


indeks_hs_1 = predictions_df[predictions_df['HS'] == 1].index

# Rows predicted as hate speech must carry at least one target, at least one
# category and exactly one level.
for row in indeks_hs_1:  # `row` is a row index of predictions_df

    # Target group: if nothing passed its threshold, force the most probable target.
    if predictions_df.loc[row, target_labels].sum() == 0:
        predictions_df.loc[row, _most_probable_label(row, target_labels, target_indices)] = 1

    # Category group: same fallback as for the target group.
    if predictions_df.loc[row, category_labels].sum() == 0:
        predictions_df.loc[row, _most_probable_label(row, category_labels, category_indices)] = 1

    # Level group: keep exactly one level. Among the levels that passed their
    # threshold the priority is Strong > Moderate > Weak; if none passed, fall
    # back to the level with the highest probability.
    # NOTE(review): with several levels above threshold the strongest one wins
    # regardless of its probability — confirm this is intended.
    flagged_levels = [
        level for level in ("HS_Strong", "HS_Moderate", "HS_Weak")
        if predictions_df.loc[row, level] == 1
    ]
    if flagged_levels:
        chosen_level = flagged_levels[0]
    else:
        chosen_level = _most_probable_label(row, level_labels, level_indices)
    for level in level_labels:
        predictions_df.loc[row, level] = 1 if level == chosen_level else 0

# If HS = 0, force every child label to 0 as well.
sub_labels = [name for name in label_columns if name != 'HS']
indeks_non_hs = predictions_df[predictions_df['HS'] == 0].index

if len(indeks_non_hs) > 0:
    predictions_df.loc[indeks_non_hs, sub_labels] = 0

# Combine the original text with the predicted labels, column-wise (rows are aligned on the index).
final_df = pd.concat([df_eval[['text']], predictions_df], axis=1)

# Save the predictions to a CSV file; index=False keeps the row index out of the file.
output_path = os.path.join(PROJECT_PATH, "data/hasil_prediksi_human_eval.csv")
final_df.to_csv(output_path, index=False)

print(f"\nSelesai! Hasil prediksi disimpan di: {output_path}")
display(final_df)

Memuat model, data, dan peralatan...





Memproses 50 kalimat untuk evaluasi...
Model sedang melakukan prediksi...

Selesai! Hasil prediksi disimpan di: /content/drive/MyDrive/CODE SKRIPSI/data/hasil_prediksi_human_eval.csv


Unnamed: 0,text,HS,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,Ini kan dpr yg tolol itu ya??,1,0,1,0,0,0,0,1,0,1,0
1,"hok cuh, najis dh gua. semoga lu dpt adzab di ...",1,1,0,0,0,0,0,1,1,0,0
2,"Lo harus mati, babi",1,1,0,0,0,0,0,1,1,0,0
3,"Tgl merah pun klien minta meeting, dasar cinaüòí",1,0,1,0,1,0,0,0,1,0,0
4,Wajar kan kalo gw pen nampol elu.\nItu pacar l...,1,1,0,0,0,0,1,0,1,0,0
5,Hahahahah kalo daging babi baru haram,0,0,0,0,0,0,0,0,0,0,0
6,Mata sipit dagu lancip kayak korea,0,0,0,0,0,0,0,0,0,0,0
7,kenapa pada ngira gw batak dah üòÇüòÇ mata gw kura...,0,0,0,0,0,0,0,0,0,0,0
8,Otak udang..kepala batu,0,0,0,0,0,0,0,0,0,0,0
9,benci kali sm prabowo sumpah,1,1,0,0,0,0,0,1,1,0,0


## Perbandingan dengan Ground Truth

In [None]:
from sklearn.metrics import classification_report, accuracy_score

PATH_JAWABAN_MODEL = os.path.join(PROJECT_PATH, "data/hasil_prediksi_human_eval.csv")
PATH_GROUND_TRUTH = os.path.join(PROJECT_PATH, "data/groundtruth_human_eval.csv")

print("Memulai analisis perbandingan...")

try:
    df_model = pd.read_csv(PATH_JAWABAN_MODEL)
    df_human = pd.read_csv(PATH_GROUND_TRUTH)
    print(f"Berhasil memuat:\n  - {PATH_JAWABAN_MODEL}\n  - {PATH_GROUND_TRUTH}")
except FileNotFoundError as e:
    print(f"ERROR: File tidak ditemukan. Pastikan file 'groundtruth_human_eval.csv' sudah diupload ke folder 'data'.\n{e}")
    # Bare raise re-raises the active exception with its original traceback.
    raise

# Every column of the prediction file except the sentence itself is a label.
label_columns = [col for col in df_model.columns if col != 'text']

# --- Sanity checks: both files must describe the same rows and labels ---
# The metrics below compare rows purely by position, so a mismatch here would
# otherwise produce meaningless scores without any error.
missing_columns = [col for col in label_columns if col not in df_human.columns]
if missing_columns:
    raise KeyError(f"Kolom label berikut tidak ada di file ground truth: {missing_columns}")

if len(df_model) != len(df_human):
    raise ValueError(
        f"Jumlah baris berbeda: prediksi model = {len(df_model)}, ground truth = {len(df_human)}."
    )

# Only a warning: the texts may differ harmlessly (e.g. whitespace or encoding),
# but a large count suggests that the two files are ordered differently.
if 'text' in df_model.columns and 'text' in df_human.columns:
    jumlah_teks_berbeda = sum(
        str(teks_model).strip() != str(teks_manusia).strip()
        for teks_model, teks_manusia in zip(df_model['text'].tolist(), df_human['text'].tolist())
    )
    if jumlah_teks_berbeda:
        print(
            f"PERINGATAN: {jumlah_teks_berbeda} baris memiliki teks yang berbeda antara file prediksi "
            "dan ground truth. Pastikan urutan baris kedua file sama."
        )

# Extract the label matrices from both DataFrames.
y_pred = df_model[label_columns].values
y_true = df_human[label_columns].values

# Exact Match Ratio: how often the model gets ALL labels of a sentence right.
exact_match_accuracy = accuracy_score(y_true, y_pred)
print("\n" + "="*50)
print("Akurasi Kesamaan Persis (Exact Match Ratio)")
print("="*50)
print(f"Dari {len(df_model)} kalimat, model memberikan prediksi yang 100% sama persis dengan label manusia sebanyak: {exact_match_accuracy * 100:.2f}%")
print("(Metrik ini sangat ketat dan biasanya rendah untuk tugas multi-label)")

# Full per-label classification report.
print("\n" + "="*50)
print("Laporan Klasifikasi Lengkap (Human Evaluation)")
print("="*50)
report = classification_report(
    y_true,
    y_pred,
    target_names=label_columns,
    zero_division=0
)
print(report)

Memulai analisis perbandingan...
Berhasil memuat:
  - /content/drive/MyDrive/CODE SKRIPSI/data/hasil_prediksi_human_eval.csv
  - /content/drive/MyDrive/CODE SKRIPSI/data/groundtruth_human_eval.csv

Akurasi Kesamaan Persis (Exact Match Ratio)
Dari 50 kalimat, model memberikan prediksi yang 100% sama persis dengan label manusia sebanyak: 78.00%
(Metrik ini sangat ketat dan biasanya rendah untuk tugas multi-label)

Laporan Klasifikasi Lengkap (Human Evaluation)
               precision    recall  f1-score   support

           HS       1.00      0.90      0.95        30
HS_Individual       1.00      0.90      0.95        20
     HS_Group       1.00      0.90      0.95        10
  HS_Religion       1.00      1.00      1.00         4
      HS_Race       1.00      1.00      1.00         2
  HS_Physical       0.67      0.40      0.50         5
    HS_Gender       1.00      0.25      0.40         4
     HS_Other       0.82      0.88      0.85        16
      HS_Weak       0.81      0.81      0

In [None]:
print("\n" + "="*50)
print("Analisis Kesalahan Spesifik (Model vs. Manusia)")
print("="*50)


def _format_labels(labels):
    """Render a collection of labels as ``{'A', 'B'}`` in `label_columns` order.

    Printing a set directly uses its iteration order, which for strings can
    change between kernel sessions (hash randomization). Ordering by
    `label_columns` makes the report reproducible.
    """
    ordered = [label for label in label_columns if label in labels]
    return "{" + ", ".join(repr(label) for label in ordered) + "}"


# Positions of the sentences where prediction and ground truth differ in at least one label.
error_indices = [i for i, (p, t) in enumerate(zip(y_pred, y_true)) if not np.array_equal(p, t)]

if not error_indices:
    print("Tidak ada perbedaan antara prediksi model dan ground truth manusia.")
else:
    print(f"Ditemukan {len(error_indices)} dari {len(df_model)} kalimat dengan prediksi yang berbeda.\n")

    # Loop only over the mismatching rows and print their details.
    for i in error_indices:
        print(f"--- Kesalahan pada Kalimat #{i+1} ---")

        # Original sentence.
        kalimat = df_model.loc[i, 'text']
        print(f"Kalimat: \"{kalimat}\"")

        # Active labels as sets, for easy comparison.
        pred_labels = {label for j, label in enumerate(label_columns) if y_pred[i][j] == 1}
        true_labels = {label for j, label in enumerate(label_columns) if y_true[i][j] == 1}

        # Show both label sets side by side.
        print(f"  > Prediksi Model : {_format_labels(pred_labels) if pred_labels else 'Tidak ada'}")
        print(f"  > Jawaban Manusia: {_format_labels(true_labels) if true_labels else 'Tidak ada'}")

        # Labels the model added wrongly / failed to detect.
        false_positives = pred_labels - true_labels
        false_negatives = true_labels - pred_labels

        if false_positives:
            print(f"  > Kesalahan (False Positive): Model salah menambahkan label {_format_labels(false_positives)}")
        if false_negatives:
            print(f"  > Kesalahan (False Negative): Model gagal mendeteksi label {_format_labels(false_negatives)}")

        print("-" * 25 + "\n")


Analisis Kesalahan Spesifik (Model vs. Manusia)
Ditemukan 11 dari 50 kalimat dengan prediksi yang berbeda.

--- Kesalahan pada Kalimat #3 ---
Kalimat: "Lo harus mati, babi"
  > Prediksi Model : {'HS_Other', 'HS', 'HS_Weak', 'HS_Individual'}
  > Jawaban Manusia: {'HS_Other', 'HS', 'HS_Strong', 'HS_Individual'}
  > Kesalahan (False Positive): Model salah menambahkan label {'HS_Weak'}
  > Kesalahan (False Negative): Model gagal mendeteksi label {'HS_Strong'}
-------------------------

--- Kesalahan pada Kalimat #4 ---
Kalimat: "Tgl merah pun klien minta meeting, dasar cinaüòí"
  > Prediksi Model : {'HS_Race', 'HS_Group', 'HS', 'HS_Weak'}
  > Jawaban Manusia: {'HS_Race', 'HS_Moderate', 'HS_Group', 'HS'}
  > Kesalahan (False Positive): Model salah menambahkan label {'HS_Weak'}
  > Kesalahan (False Negative): Model gagal mendeteksi label {'HS_Moderate'}
-------------------------

--- Kesalahan pada Kalimat #9 ---
Kalimat: "Otak udang..kepala batu"
  > Prediksi Model : Tidak ada
  > Jawaban