<a href="https://colab.research.google.com/github/Zero4427/PK/blob/main/Penalaran_Komputer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

File PDF Fidusia: https://drive.google.com/drive/folders/1EbCpBIfKX7xUDIRfBRb0A6YO2w7f6tkz?usp=drive_link

In [None]:
!pip install pdfminer.six

Collecting pdfminer.six
  Downloading pdfminer_six-20250506-py3-none-any.whl.metadata (4.2 kB)
Downloading pdfminer_six-20250506-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m30.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfminer.six
Successfully installed pdfminer.six-20250506


## Tahap 1-2

In [None]:
# ✅ KODE FIXED UNTUK TAHAP 1 & 2 (SCRAPING + CLEANING PUTUSAN MA)

import os
import re
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import date
from pdfminer.high_level import extract_text
from google.colab import drive

# ================ MOUNT GOOGLE DRIVE ==================
# Expose the user's Drive under /content/drive; every stage below reads and
# writes its files there.
drive.mount('/content/drive')

# ================== SETUP =============================
# Headers sent with every HTTP request made by the scraper.
HEADERS = {"User-Agent": "Mozilla/5.0"}

# ================= PATH ==============================
def create_path(folder_name):
    """Return the project folder `folder_name` on Drive, creating it if needed.

    The folder is joined onto the fixed project root
    ``/content/drive/MyDrive/Penalaran Komputer``. An existing folder is
    reused without error.
    """
    project_root = "/content/drive/MyDrive/Penalaran Komputer"
    target_dir = os.path.join(project_root, folder_name)
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

# ================= SCRAPING ===========================
def open_page(link):
    """Fetch `link` and return it parsed as a BeautifulSoup tree.

    The request is attempted up to five times. Failures reported by
    ``requests`` (connection errors, timeouts, HTTP error statuses) lead to a
    3-second pause and another attempt. Any other exception, including a
    keyboard interrupt, propagates to the caller instead of being swallowed.

    Returns:
        The parsed page, or ``None`` when all attempts failed.
    """
    attempts = 5
    for attempt in range(attempts):
        try:
            r = requests.get(link, headers=HEADERS, timeout=30)
            r.raise_for_status()
        except requests.RequestException:
            # Back off before the next try; no point waiting after the last one.
            if attempt < attempts - 1:
                time.sleep(3)
            continue
        # Pause after each successful request to throttle the scraper.
        time.sleep(2)
        return BeautifulSoup(r.text, "lxml")
    return None

def normalize_link(link):
    """Return `link` as an absolute URL on the Mahkamah Agung decision site.

    Links that already start with ``http`` are returned unchanged. Anything
    else is treated as a path on ``putusan3.mahkamahagung.go.id``; a missing
    leading slash is added so the path is never glued onto the host name.
    """
    if link.startswith("http"):
        return link
    path = link if link.startswith("/") else "/" + link
    return "https://putusan3.mahkamahagung.go.id" + path

def get_detail(soup, keyword):
    """Return the value shown next to the label `keyword` in a detail table.

    A ``<td>`` whose whole text equals `keyword` (case-insensitive, outer
    whitespace ignored) is preferred, so that a label such as "Amar" is not
    confused with "Amar Lainnya" or with a longer cell that merely mentions
    the word. If no such cell exists, the first ``<td>`` containing `keyword`
    is used. The value is read from the following sibling ``<td>``, falling
    back to the next element in document order.

    Returns:
        The stripped text of the value element, or "" when nothing was found.
    """
    wanted = keyword.strip().lower()
    cells = soup.find_all("td")

    label = next(
        (c for c in cells if c.get_text(strip=True).lower() == wanted), None
    )
    if label is None:
        # No exact label cell: fall back to substring matching.
        label = next((c for c in cells if wanted in c.get_text().lower()), None)
    if label is None:
        return ""

    value = label.find_next_sibling("td")
    if value is None:
        value = label.find_next()
    return value.get_text(strip=True) if value is not None else ""

def get_pdf(url, path_pdf):
    """Download the file at `url` into the folder `path_pdf`.

    The file is stored under the last path component of `url`.

    Returns:
        ``(file_path, file_name)`` on success, ``(None, None)`` when the
        request or the write to disk fails.
    """
    try:
        # The timeout keeps one stalled connection from hanging the whole run.
        r = requests.get(url, headers=HEADERS, timeout=60)
        r.raise_for_status()
        fname = os.path.basename(url)
        fpath = os.path.join(path_pdf, fname)
        with open(fpath, "wb") as f:
            f.write(r.content)
    except (requests.RequestException, OSError) as err:
        # Report the failure instead of hiding it; the caller skips this case.
        print(f"Failed to download {url}: {err}")
        return None, None
    print(f"Downloaded: {fname}")
    return fpath, fname

def extract_data(link, path_output, path_pdf):
    """Scrape one decision page and return its metadata plus the PDF text.

    Args:
        link: URL or site-relative path of the decision detail page.
        path_output: Not used by this function; kept so existing calls work.
        path_pdf: Folder in which the decision PDF is stored.

    Returns:
        ``(DataFrame, True)`` holding a single row on success, or
        ``(None, False)`` when the page, its detail table, the PDF link, the
        download or the text extraction is unavailable.
    """
    page_url = normalize_link(link)
    soup = open_page(page_url)
    if not soup:
        return None, False

    table = soup.find("table", {"class": "table"})
    if not table:
        return None, False

    heading = table.find("h2")
    judul = heading.text.strip() if heading else ""

    fields = [
        "Nomor", "Tingkat Proses", "Klasifikasi", "Kata Kunci", "Tahun", "Tanggal Register",
        "Lembaga Peradilan", "Jenis Lembaga Peradilan", "Hakim Ketua", "Hakim Anggota",
        "Panitera", "Amar", "Amar Lainnya", "Catatan Amar", "Tanggal Musyawarah",
        "Tanggal Dibacakan", "Kaidah", "Status", "Abstrak"
    ]
    values = [get_detail(table, f) for f in fields]

    link_pdf_tag = soup.find("a", href=re.compile(r"/pdf/"))
    if not link_pdf_tag:
        return None, False
    pdf_url = normalize_link(link_pdf_tag["href"])
    pdf_path, pdf_name = get_pdf(pdf_url, path_pdf)
    if not pdf_path:
        return None, False

    try:
        text_pdf = extract_text(pdf_path)
    except Exception as err:
        # One unreadable PDF must not abort the whole scraping run; report it
        # and let the caller move on to the next decision.
        print(f"Failed to extract text from {pdf_name}: {err}")
        return None, False

    data = [judul] + values + [page_url, pdf_url, pdf_name, text_pdf]
    columns = [
        "judul", "nomor", "tingkat_proses", "klasifikasi", "kata_kunci", "tahun",
        "tanggal_register", "lembaga_peradilan", "jenis_lembaga_peradilan", "hakim_ketua",
        "hakim_anggota", "panitera", "amar", "amar_lainnya", "catatan_amar",
        "tanggal_musyawarah", "tanggal_dibacakan", "kaidah", "status", "abstrak",
        "link", "link_pdf", "file_name_pdf", "text_pdf"
    ]
    return pd.DataFrame([data], columns=columns), True

def run_scraper(max_cases=50):
    """Scrape "fidusia" decisions into a dated CSV file on Drive.

    Walks the paginated category listing, scrapes every decision link that
    has not been seen yet and appends each successful record to
    ``CSV4/putusan_fidusia_<YYYY-MM-DD>.csv``. PDFs are stored in ``PDF4``.

    Args:
        max_cases: Stop once this many decisions have been stored.
    """
    path_out = create_path("CSV4")
    path_pdf = create_path("PDF4")
    today = date.today().strftime("%Y-%m-%d")
    file_csv = os.path.join(path_out, f"putusan_fidusia_{today}.csv")

    listing = "https://putusan3.mahkamahagung.go.id/direktori/index/kategori/fidusia-1"
    link_pattern = re.compile("/direktori/putusan")
    count = 0
    seen = set()
    page = 1

    while count < max_cases:
        url = f"{listing}/page/{page}.html" if page > 1 else f"{listing}.html"
        soup = open_page(url)
        if not soup:
            break

        hrefs = [tag.get("href") for tag in soup.find_all("a", href=link_pattern)]
        new_hrefs = [h for h in dict.fromkeys(hrefs) if h and h not in seen]
        if not new_hrefs:
            # A listing page without unseen links means there is nothing left
            # to scrape; without this check the loop would request pages forever.
            break

        for href in new_hrefs:
            seen.add(href)
            df, ok = extract_data(href, path_out, path_pdf)
            if ok:
                # Write the header only when the file is first created.
                df.to_csv(file_csv, mode='a', header=not os.path.exists(file_csv), index=False)
                count += 1
                if count >= max_cases:
                    break
        page += 1
    print(f"Done. Total: {count} putusan.")

# Run the scraper
run_scraper()

# ===================================
# CSV CLEANING (Stage 2)
# ===================================

# Load the file written by the scraper run above. The name is rebuilt with the
# same date pattern run_scraper() uses, so this cell does not read a stale
# export (or fail) when it is executed on a different day.
scraped_csv = os.path.join(
    "/content/drive/MyDrive/Penalaran Komputer/CSV4",
    f"putusan_fidusia_{date.today().strftime('%Y-%m-%d')}.csv",
)
df = pd.read_csv(scraped_csv)

# Bersihkan kolom text_pdf

# Boilerplate removed from every decision text, applied in this order to the
# lower-cased text. Each entry is (pattern, regex flags).
_MA_BOILERPLATE = (
    # Directory disclaimer block.
    (r'direktori putusan.*?transparansi.*?peradilan\.', re.DOTALL),
    (r'email\s*:\s*\S+@\S+', 0),
    # Whole phone number, including an optional "(ext.NNN)" suffix, so no
    # digits are left behind.
    (r'telp\.?\s*:?\s*(?:\(\d+\)\s*)?\d[\d\s\-]*(?:\(\s*ext\.?\s*\d+\s*\))?', 0),
    (r'nip\.?\s*\d+', 0),
    (r'panitera.*?hakim', re.DOTALL),
    (r'ttd', 0),
    (r'halaman\s*\d+', 0),
)


def clean_text_ma(text):
    """Return `text` lower-cased with court boilerplate removed.

    Removes the directory disclaimer, e-mail and phone details, NIP numbers,
    "panitera ... hakim" spans, "ttd" markers and page numbers, then collapses
    all whitespace runs to single spaces.

    Returns:
        The cleaned text, or "" when `text` is not a string (e.g. NaN/None).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    for pattern, flags in _MA_BOILERPLATE:
        text = re.sub(pattern, '', text, flags=flags)
    return re.sub(r'\s+', ' ', text).strip()

# Store the cleaned text in its own column next to the raw PDF text.
cleaned_column = df['text_pdf'].apply(clean_text_ma)
df['text_pdf_cleaned'] = cleaned_column

# Save the result
cleaned_path = "/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL.csv"
df.to_csv(cleaned_path, index=False)
print("✅ Cleaning selesai. File disimpan ke:", cleaned_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Downloaded: zaf036f88e2724aa9fe2313733363031
Downloaded: zaf036f5b05e7710b42b313731353330
Downloaded: zaefd9fe727948a2b48c303935333533
Downloaded: zaef9daf0300b7b68b72313535343036
Downloaded: zaef81f77303b960bda2303932323035
Downloaded: zaef81411302505abb44313133363336
Downloaded: zaef762976cb76de9ef1303834393533
Downloaded: zaef7181661fd6d8b6b0313033363434
Downloaded: zaef6ff4063f7c8ea930313131323133
Downloaded: zaef215ff8a138fca10c313131353534
Downloaded: zaeedd305923dcf8858a313634333431
Downloaded: zaeec0178268f08ea9eb313630323439
Downloaded: zaee9e3c07f7d36296d5313335383336
Downloaded: zaee2a869368a4aa8b67303735393538
Downloaded: zaee276ee35f6c988128303933323530
Downloaded: zaee214d386309d4931e313431363433
Downloaded: zaee1a19d30d812ab987313032313131
Downloaded: zaeded6e1efe3ab2bb37313430313134
Downloaded: zaedce3a9c50616e9f04323130343234
Downloaded: zaf0

In [None]:
import pandas as pd
import re

# Load the cleaned scraping result
file_path = '/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL.csv'
df = pd.read_csv(file_path)

# Name of the target text column
text_col = 'text_pdf_cleaned'

# Pola regex lanjutan untuk membersihkan watermark, disclaimer, dan duplikasi
def clean_advanced(text):
    """Strip watermarks, disclaimers and closing formalities from a decision.

    All patterns are matched case-insensitively. Everything from
    "putusan tersebut diucapkan" onwards is dropped, whitespace is collapsed
    and the result is lower-cased.

    Returns:
        The cleaned text, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""

    # Court watermark and site identity.
    text = re.sub(r'(mahkamah agung republik indonesia|putusan\.mahkamahagung\.go\.id)', ' ', text, flags=re.IGNORECASE)
    text = re.sub(r'direktori putusan.*?transparansi dan akuntabilitas.*?(?=halaman|\s+)', ' ', text, flags=re.DOTALL|re.IGNORECASE)

    # Signature block of judge/clerk. Matched case-insensitively like every
    # other pattern here, so text that was not lower-cased is handled too.
    text = re.sub(r'ttd\./.*?(?=panitera|hakim|untuk salinan|dr\.|nip)', ' ', text, flags=re.DOTALL|re.IGNORECASE)
    text = re.sub(r'nip\s*\d{5,}', ' ', text, flags=re.IGNORECASE)

    # "halaman x dari x halaman" and similar page markers.
    text = re.sub(r'halaman\s*\d+\s*(dari\s*\d+\s*halaman)?', ' ', text, flags=re.IGNORECASE)

    # Cut everything after the main decision.
    text = re.sub(r'putusan tersebut diucapkan.*', '', text, flags=re.DOTALL|re.IGNORECASE)

    # Normalise whitespace and letter case.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

def extract_amar_putusan(text):
    """Return the verdict section of a decision text.

    The text is lower-cased and whitespace-normalised. The section starts at
    "m e n g a d i l i" or, failing that, at the "demi keadilan ..." formula.
    It ends before the first administrative phrase found, trying the phrases
    in their listed order. When no start marker exists, the whole normalised
    text is returned as is.

    Returns:
        The extracted section, or "" when `text` is not a string.
    """
    if not isinstance(text, str):
        return ""

    normalized = re.sub(r"\s+", " ", text.lower())

    start_markers = (
        "m e n g a d i l i",
        "demi keadilan berdasarkan ketuhanan yang maha esa",
    )
    start_hits = (normalized.find(marker) for marker in start_markers)
    begin = next((pos for pos in start_hits if pos != -1), None)
    if begin is None:
        return normalized  # fallback: nothing to anchor on
    body = normalized[begin:]

    # Administrative phrases that close the verdict section.
    end_markers = (
        "putusan ini diucapkan", "diputuskan dalam rapat",
        "panitera pengganti", "untuk salinan", "nip.",
    )
    end_hits = (body.find(marker) for marker in end_markers)
    cut = next((pos for pos in end_hits if pos != -1), None)
    if cut is not None:
        body = body[:cut]

    return body.strip()

def finalize_cleaning(text):
    """Apply the last cleaning pass to a decision text.

    Lower-cases the text, keeps only the part starting at
    "m e n g a d i l i" when that marker exists, removes leftover
    disclaimer, contact, NIP, "pada hari ..." and "panitera ..." fragments,
    and collapses whitespace.

    Returns:
        The cleaned text, or "" for non-string or blank input.
    """
    if not isinstance(text, str):
        return ""
    if not text.strip():
        return ""

    lowered = text.lower()

    # Keep the text from the verdict marker onwards, if it is present.
    _, marker, tail = lowered.partition('m e n g a d i l i')
    if marker:
        lowered = marker + tail

    # Watermark, disclaimer and closing fragments, removed in this order.
    removals = (
        (r'direktori putusan.*?(?=menolak|memperbaiki|pada hari|putusan nomor|demi keadilan)', re.DOTALL),
        (r'email\s*:\s*\S+@\S+|telp\s*:\s*[\d\s\-\(\)]+', 0),
        (r'nip\.\s*\d+|\bpada hari.*', 0),
        (r'panitera.*?$', re.DOTALL),
    )
    for pattern, flags in removals:
        lowered = re.sub(pattern, '', lowered, flags=flags)

    return re.sub(r'\s+', ' ', lowered).strip()

def cari_amar_alternatif(text):
    """Return `text` lower-cased, starting at the first verdict keyword found.

    The keywords are tried in their listed order; the first one that occurs
    decides where the returned text starts. Without any keyword the full
    lower-cased text is returned.

    Returns:
        The trimmed text, or "" when `text` is not a string (same guard as
        the other cleaning helpers, so NaN cells do not raise).
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()
    keywords = ('m e n g a d i l i', 'putusan pengadilan', 'menjatuhkan pidana', 'memperbaiki putusan')
    for key in keywords:
        idx = text.find(key)
        if idx != -1:
            return text[idx:]
    return text  # fallback: return the full text




# Run the four cleaning passes over text_pdf_cleaned, one after another.
for cleaning_pass in (clean_advanced, extract_amar_putusan, finalize_cleaning, cari_amar_alternatif):
    df['text_pdf_cleaned'] = df['text_pdf_cleaned'].apply(cleaning_pass)

# Save the new result
output_path = '/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL_BERSIH_FIX.csv'
df.to_csv(output_path, index=False)

print("✅ Sukses! File dibersihkan dan disimpan ke:")
print(output_path)


✅ Sukses! File dibersihkan dan disimpan ke:
/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL_BERSIH_FIX.csv


In [None]:
# Preview the case identifier and cleaned text of the first five rows.
df[['nomor', 'text_pdf_cleaned']].head(5)


Unnamed: 0,nomor,text_pdf_cleaned
0,Putusan MAHKAMAH AGUNG Nomor 5415 K/Pid.Sus/20...,m e n g a d i l i:-menolak permohonan kasasi d...
1,Putusan MAHKAMAH AGUNG Nomor 5484 K/Pid.Sus/20...,namun dalam hal-hal tertentu masih dimungkinka...
2,Putusan DILMIL I 06 BANJARMASIN Nomor 11-K/PM....,namun dalam hal-hal tertentu masih dimungkinka...
3,Putusan MAHKAMAH AGUNG Nomor 2319 K/Pid.Sus/20...,m e n g a d i l i: − mengabulkan permohonan ka...
4,Putusan MAHKAMAH AGUNG Nomor 2354 K/Pid.Sus/20...,m e n g a d i l i:-menolak permohonan kasasi d...


## Tahap 3

In [None]:
# ========================================
# TAHAP 3 – CASE RETRIEVAL (TF-IDF + IndoBERT)
# ========================================

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import torch
from transformers import AutoTokenizer, AutoModel

nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_id = stopwords.words('indonesian')

# === Load dataset
df = pd.read_csv('/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL_BERSIH_FIX.csv')

# === Document bodies and their case identifiers.
# Empty cleaned texts are read back from the CSV as NaN; map them to "" first,
# otherwise astype(str) would turn them into the literal document "nan".
documents = df['text_pdf_cleaned'].fillna("").astype(str).tolist()
case_ids = df['nomor'].astype(str).tolist()

# ----------------------------------------
# 🔹 A. TF-IDF Retrieval
# ----------------------------------------
print("=== TF-IDF Retrieval ===")

vectorizer = TfidfVectorizer(max_features=8000, stop_words=stopwords_id)
tfidf_vectors = vectorizer.fit_transform(documents)

def retrieve_tfidf(query: str, k: int = 5):
    """Return the `k` cases most similar to `query` under TF-IDF.

    The query is vectorised with the fitted ``vectorizer`` and compared to
    ``tfidf_vectors`` by cosine similarity.

    Returns:
        A list of ``(case_id, similarity)`` pairs, best match first. The list
        is empty when ``k`` is zero or negative.
    """
    if k <= 0:
        # ``[-0:]`` would select every document instead of none.
        return []
    query_vec = vectorizer.transform([query])
    sim_scores = cosine_similarity(query_vec, tfidf_vectors).flatten()
    top_k_idx = sim_scores.argsort()[::-1][:k]
    return [(case_ids[i], sim_scores[i]) for i in top_k_idx]

# Example query to try the TF-IDF retrieval
query = "Terdakwa menyewakan objek fidusia tanpa persetujuan"
tfidf_results = retrieve_tfidf(query)

print("\nTop-5 Hasil TF-IDF:")
for case_id, score in tfidf_results:
    print(f"Case ID: {case_id} — Similarity: {score:.4f}")


# ----------------------------------------
# 🔹 B. IndoBERT Embedding Retrieval
# ----------------------------------------
print("\n=== IndoBERT Retrieval ===")

# Load the pretrained IndoBERT tokenizer and encoder
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

def get_bert_embedding(text):
    """Embed `text` with IndoBERT using mask-aware mean pooling.

    The input is tokenised with truncation at 512 tokens. The last hidden
    states are averaged over real tokens only, weighted by the attention
    mask. For a single string the mask is all ones, so the result matches a
    plain mean; for a list of strings, padding no longer dilutes the
    shorter texts.

    Returns:
        A NumPy array: one vector for a single string, one row per text for
        a list of strings.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # Sum over real tokens and divide by their count (at least 1).
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze(0).numpy()

# Compute the embedding of every document
bert_vectors = np.array([get_bert_embedding(doc) for doc in documents])

def retrieve_bert(query: str, k: int = 5):
    """Return the `k` cases most similar to `query` under IndoBERT embeddings.

    The query embedding is compared to ``bert_vectors`` by cosine similarity.

    Returns:
        A list of ``(case_id, similarity)`` pairs, best match first. The list
        is empty when ``k`` is zero or negative.
    """
    if k <= 0:
        # ``[-0:]`` would select every document instead of none.
        return []
    query_vec = get_bert_embedding(query).reshape(1, -1)
    sim_scores = cosine_similarity(query_vec, bert_vectors).flatten()
    top_k_idx = sim_scores.argsort()[::-1][:k]
    return [(case_ids[i], sim_scores[i]) for i in top_k_idx]

# Example run with IndoBERT, using the module-level `query`
bert_results = retrieve_bert(query)

print("\nTop-5 Hasil IndoBERT:")
for case_id, score in bert_results:
    print(f"Case ID: {case_id} — Similarity: {score:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


=== TF-IDF Retrieval ===

Top-5 Hasil TF-IDF:
Case ID: Putusan PN SELONG Nomor 60/Pid. Sus/2022/PN Sel Tanggal 8 Juni 2022 —-MARDIATUN HASANAH — Similarity: 0.2377
Case ID: Putusan MAHKAMAH AGUNG Nomor 5855 K/Pid.Sus/2022 Tanggal 8 Nopember 2022 —AKHMAD SYAM, S.Pd., M.M. bin SYAMSUDDIN MUKADDAS; — Similarity: 0.2261
Case ID: Putusan MAHKAMAH AGUNG Nomor 2319 K/Pid.Sus/2024 Tanggal 14 Juni 2024 —PENUNTUT UMUM pada KEJAKSAAN NEGERI KOTA SUKABUMI — Similarity: 0.1482
Case ID: Putusan MAHKAMAH AGUNG Nomor 88 K/PID.SUS/2022 Tanggal 25 Januari 2022 —ZUNAIDI SANDI LABABA — Similarity: 0.1396
Case ID: Putusan MAHKAMAH AGUNG Nomor 1107 K/PID.SUS/2017 Tanggal 20 Nopember 2017 —MOHAMAD SOLEH CHUDORIE AHMAD bin
M. BADRUN — Similarity: 0.0991

=== IndoBERT Retrieval ===

Top-5 Hasil IndoBERT:
Case ID: Putusan MAHKAMAH AGUNG Nomor 3633 K/Pid.Sus/2022 Tanggal 4 Agustus 2022 —MELKI MOONIK alias KIKI — Similarity: 0.5325
Case ID: Putusan MAHKAMAH AGUNG Nomor 6257 K/Pid.Sus/2022 Tanggal 24 Oktober 2022 

### Klasifikasi (Naive Bayes dan SVM)

In [None]:
# Use the case classification as the label. Whitespace is normalised because
# the scraped values can contain line breaks and repeated spaces, which would
# otherwise split one class into several differently spelled labels.
labels = df['klasifikasi'].astype(str).str.split().str.join(" ").tolist()
from sklearn.model_selection import train_test_split

# Reuse the TF-IDF matrix that already exists (tfidf_vectors)
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors, labels, test_size=0.2, random_state=42
)
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

svm = LinearSVC()
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)
print("\n=== Hasil Evaluasi SVM ===")
# zero_division=0 reports 0 for classes that were never predicted, without
# emitting a warning for each of them.
print(classification_report(y_test, y_pred_svm, zero_division=0))
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred_nb = nb.predict(X_test)
print("\n=== Hasil Evaluasi Naive Bayes ===")
print(classification_report(y_test, y_pred_nb, zero_division=0))



=== Hasil Evaluasi SVM ===
                                                                                       precision    recall  f1-score   support

                                                                        Perdata Agama       0.00      0.00      0.00         2
                                                               Pidana Khusus  Fidusia       0.70      1.00      0.82         7
Pidana Khusus  Narkotika dan Psikotropika 
 Pidana Khusus  Narkotika dan Psikotropika       0.00      0.00      0.00         1

                                                                             accuracy                           0.70        10
                                                                            macro avg       0.23      0.33      0.27        10
                                                                         weighted avg       0.49      0.70      0.58        10


=== Hasil Evaluasi Naive Bayes ===
                                            

In [None]:
import os
from sklearn.metrics import precision_recall_fscore_support

# Weighted averages over all classes; classes without predictions count as 0.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred_svm, average='weighted', zero_division=0
)

eval_df = pd.DataFrame([{
    'model': 'SVM',
    'precision': precision,
    'recall': recall,
    'f1_score': f1
}])

eval_path = '/content/drive/MyDrive/Penalaran Komputer/data/eval/svm_fidusia_eval.csv'
# Create the target folder first; to_csv does not create missing directories.
os.makedirs(os.path.dirname(eval_path), exist_ok=True)
eval_df.to_csv(eval_path, index=False)
print(f"✅ Evaluasi disimpan ke: {eval_path}")


✅ Evaluasi disimpan ke: /content/drive/MyDrive/Penalaran Komputer/data/eval/svm_fidusia_eval.csv


In [None]:
# Show the one-row SVM metrics table.
display(eval_df)

Unnamed: 0,model,precision,recall,f1_score
0,SVM,0.49,0.7,0.576471


## Tahap 4

In [None]:
# =========================================
# TAHAP 4 - SOLUTION REUSE (Prediksi Amar Putusan)
# Menggunakan hasil IndoBERT + Voting
# =========================================

import pandas as pd
import numpy as np
import json
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModel
from collections import defaultdict

# === Load the cleaned dataset
df = pd.read_csv('/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL_BERSIH_FIX.csv')
# Empty cleaned texts are read back from the CSV as NaN; map them to "" first,
# otherwise astype(str) would turn them into the literal document "nan".
documents = df['text_pdf_cleaned'].fillna("").astype(str).tolist()
case_ids = df['nomor'].astype(str).tolist()

# === Load IndoBERT
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

# === Fungsi Ambil Embedding BERT
@torch.no_grad()
def get_bert_embedding(text):
    """Embed `text` with IndoBERT using mask-aware mean pooling.

    The input is tokenised with truncation at 512 tokens. The last hidden
    states are averaged over real tokens only, weighted by the attention
    mask. For a single string the mask is all ones, so the result matches a
    plain mean; for a list of strings, padding no longer dilutes the
    shorter texts.

    Returns:
        A NumPy array: one vector for a single string, one row per text for
        a list of strings.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    # Sum over real tokens and divide by their count (at least 1).
    pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    return pooled.squeeze(0).numpy()

# === Build the embedding of every document (slow, but done only once)
doc_embeddings = np.array([get_bert_embedding(doc) for doc in documents])

# === Load the queries from queries_fidusia.json
query_path = "/content/drive/MyDrive/Penalaran Komputer/data/eval/queries_fidusia.json"
with open(query_path, "r", encoding="utf-8") as f:
    queries_data = json.load(f)

# Each JSON item provides its text under the "query" key.
queries = [item["query"] for item in queries_data]

# === Build the mapping case_id -> amar (the "solution" reused for prediction)
case_solutions = {}
for _, row in df.iterrows():
    # Keys are stringified the same way as `case_ids` so lookups always match.
    case_key = str(row["nomor"])

    raw_amar = row.get("amar", "")
    # A missing amar is read from the CSV as NaN and str(NaN) is "nan", which
    # would pass as a real verdict; treat it as empty so the fallback runs.
    amar_text = "" if pd.isna(raw_amar) else str(raw_amar).strip().lower()
    if amar_text:
        case_solutions[case_key] = amar_text
        continue

    # Fallback: take the verdict section from text_pdf_cleaned.
    raw_text = row["text_pdf_cleaned"]
    text = "" if pd.isna(raw_text) else str(raw_text).lower()
    idx = text.find("m e n g a d i l i")
    case_solutions[case_key] = text[idx:] if idx != -1 else "amar tidak ditemukan"

# === Fungsi retrieve dengan IndoBERT
def retrieve_bert(query, k=5):
    """Return the `k` cases most similar to `query` under IndoBERT embeddings.

    The query embedding is compared to ``doc_embeddings`` by cosine
    similarity.

    Returns:
        A list of ``(case_id, similarity)`` pairs, best match first. The list
        is empty when ``k`` is zero or negative.
    """
    if k <= 0:
        # ``[-0:]`` would select every document instead of none.
        return []
    q_vec = get_bert_embedding(query).reshape(1, -1)
    sims = cosine_similarity(q_vec, doc_embeddings).flatten()
    top_k_idx = sims.argsort()[::-1][:k]
    return [(case_ids[i], sims[i]) for i in top_k_idx]

# === Fungsi prediksi amar
def predict_amar(query, use_weighting=True):
    """Predict the amar for `query` by voting among its five nearest cases.

    Each retrieved case votes for its stored amar from ``case_solutions``
    ("amar tidak ditemukan" when it has none). A vote weighs the case's
    similarity when `use_weighting` is true, otherwise 1.

    Returns:
        ``(predicted_amar, top_k)``, where ``top_k`` is the list of
        ``(case_id, similarity)`` pairs that voted.
    """
    neighbours = retrieve_bert(query, k=5)
    tally = defaultdict(float)
    for neighbour_id, similarity in neighbours:
        verdict = case_solutions.get(neighbour_id, "amar tidak ditemukan")
        tally[verdict] += similarity if use_weighting else 1
    winner = max(tally, key=tally.get)
    return winner, neighbours

import os

# === Predict the amar for every loaded query
results = []
for query_id, q in enumerate(queries, start=1):
    pred, top_k = predict_amar(q)
    results.append({
        "query_id": query_id,
        "query": q,
        "predicted_amar": pred,
        "top_5_case_ids": [cid for cid, _ in top_k]
    })

# === Save to CSV
output_path = '/content/drive/MyDrive/Penalaran Komputer/data/results/prediksi_amar_fidusia.csv'
# Create the target folder first; to_csv does not create missing directories.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
pd.DataFrame(results).to_csv(output_path, index=False)

print("✅ Prediksi amar disimpan di:", output_path)


✅ Prediksi amar disimpan di: /content/drive/MyDrive/Penalaran Komputer/data/results/prediksi_amar_fidusia.csv


In [None]:
# Show where the predictions were written.
display(output_path)

'/content/drive/MyDrive/Penalaran Komputer/data/results/prediksi_amar_fidusia.csv'

In [None]:
# Show the predictions as a table.
display(pd.DataFrame(results))

Unnamed: 0,query_id,query,predicted_amar,top_5_case_ids
0,1,Terdakwa menyewakan kendaraan jaminan fidusia ...,tolak,[Putusan MAHKAMAH AGUNG Nomor 3633 K/Pid.Sus/2...
1,2,Pengalihan objek fidusia tanpa persetujuan,lain-lain,[Putusan MAHKAMAH AGUNG Nomor 2319 K/Pid.Sus/2...
2,3,Terdakwa terbukti menyewakan mobil yang dijami...,tolak,[Putusan MAHKAMAH AGUNG Nomor 3633 K/Pid.Sus/2...


## Tahap 5

In [None]:
import pandas as pd
import json
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# --- Load the evaluation queries (JSON) ---
# NOTE(review): Stage 4 generated its predictions from queries_fidusia.json,
# while this file is queries_fidusia_baru.json. Confirm both contain the same
# query texts; queries missing here get no ground truth below.
json_path = '/content/drive/MyDrive/Penalaran Komputer/data/eval/queries_fidusia_baru.json'

# Read as UTF-8 explicitly, like the query file in Stage 4.
with open(json_path, "r", encoding="utf-8") as f:
    queries = json.load(f)

# Mappings: query -> lower-cased ground-truth case ids, and query -> true amar
gt_map = {q['query']: [str(cid).lower() for cid in q['ground_truth']] for q in queries}
amar_map = {q['query']: q['true_amar'] for q in queries}

# --- Load the prediction CSV ---
df_pred = pd.read_csv('/content/drive/MyDrive/Penalaran Komputer/data/results/prediksi_amar_fidusia.csv')

import ast

# --- Retrieval evaluation ---
retrieval_metrics = []

for idx, row in df_pred.iterrows():
    query = row['query']

    raw_ids = str(row['top_5_case_ids']).strip()

    if raw_ids.startswith('[') and raw_ids.endswith(']'):
        # The cell holds the text form of a Python list. literal_eval parses
        # it safely and also handles ids that contain quote characters, where
        # swapping quotes to make the text look like JSON would fail.
        try:
            parsed = ast.literal_eval(raw_ids)
        except (ValueError, SyntaxError):
            parsed = []
        pred_ids = [str(x) for x in parsed] if isinstance(parsed, (list, tuple)) else []
    else:
        # Otherwise treat the cell as a semicolon-separated list.
        pred_ids = [x.strip() for x in raw_ids.split(';') if x.strip()]

    pred_ids = [x.lower() for x in pred_ids]
    true_ids = gt_map.get(query, [])

    true_positives = len(set(pred_ids) & set(true_ids))
    precision = true_positives / len(pred_ids) if pred_ids else 0
    recall = true_positives / len(true_ids) if true_ids else 0
    # Exact F1; defined as 0 when both precision and recall are 0.
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0

    retrieval_metrics.append({
        "query": query,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    })

# Save the retrieval metrics
retrieval_df = pd.DataFrame(retrieval_metrics)
retrieval_df.to_csv('/content/drive/MyDrive/Penalaran Komputer/data/eval/retrieval_metrics.csv', index=False)

# --- Amar classification evaluation ---
df_pred['true_amar'] = df_pred['query'].map(amar_map)

# Drop rows that lack either the true or the predicted value
valid_rows = df_pred.dropna(subset=['true_amar', 'predicted_amar'])

# Stage 4 lower-cases every predicted amar, so both sides are compared
# lower-cased and trimmed; otherwise letter case or stray spaces in the
# ground truth would count as prediction errors.
y_true = valid_rows['true_amar'].astype(str).str.strip().str.lower()
y_pred = valid_rows['predicted_amar'].astype(str).str.strip().str.lower()


pred_metrics = {
    "accuracy": accuracy_score(y_true, y_pred),
    "precision": precision_score(y_true, y_pred, average='macro', zero_division=0),
    "recall": recall_score(y_true, y_pred, average='macro', zero_division=0),
    "f1_score": f1_score(y_true, y_pred, average='macro', zero_division=0)
}

# Save the classification metrics
pd.DataFrame([pred_metrics]).to_csv('/content/drive/MyDrive/Penalaran Komputer/data/eval/prediction_metrics.csv', index=False)

# --- Summary output ---
print("\n✅ retrieval_metrics.csv saved.")
print("✅ prediction_metrics.csv saved.")
print(f"\n🎯 Accuracy: {pred_metrics['accuracy']:.2f} | Precision: {pred_metrics['precision']:.2f} | Recall: {pred_metrics['recall']:.2f} | F1: {pred_metrics['f1_score']:.2f}")



✅ retrieval_metrics.csv saved.
✅ prediction_metrics.csv saved.

🎯 Accuracy: 1.00 | Precision: 1.00 | Recall: 1.00 | F1: 1.00
