In [None]:
## Tahap 3

# ========================================
# TAHAP 3 – CASE RETRIEVAL (TF-IDF + IndoBERT)
# ========================================

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import torch
from transformers import AutoTokenizer, AutoModel

# Fetch the NLTK stopword corpus, then load its Indonesian word list.
nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords_id = stopwords.words('indonesian')

# === Load Dataset
# The CSV is read from a hard-coded path under /content/drive.
df = pd.read_csv('/content/drive/MyDrive/Penalaran Komputer/CSV4/putusan_fidusia_cleaned_FINAL_BERSIH_FIX.csv')

# === Extract document contents
# Both lists follow the row order of `df`, so index i in one matches index i in the other.
# NOTE(review): astype(str) turns a missing cell into the literal string "nan" —
# confirm the CSV has no empty 'text_pdf_cleaned' / 'nomor' values.
documents = df['text_pdf_cleaned'].astype(str).tolist()
case_ids = df['nomor'].astype(str).tolist()

# ----------------------------------------
# 🔹 A. TF-IDF Retrieval
# ----------------------------------------
print("=== TF-IDF Retrieval ===")

# Vocabulary is capped at 8000 features and the Indonesian stopwords are dropped.
vectorizer = TfidfVectorizer(max_features=8000, stop_words=stopwords_id)
# Fitted on the whole corpus; the resulting matrix is what the retrieval function searches.
tfidf_vectors = vectorizer.fit_transform(documents)

def retrieve_tfidf(query: str, k: int = 5):
    """Return the cases most similar to ``query`` under TF-IDF cosine similarity.

    Args:
        query: Free-text query; transformed with the module-level ``vectorizer``.
        k: Maximum number of results to return. Values below 1 yield an
            empty list.

    Returns:
        A list of ``(case_id, similarity)`` tuples ordered from most to least
        similar, holding at most ``k`` entries (fewer when the corpus is
        smaller than ``k``).
    """
    if k <= 0:
        # A slice such as ``scores[-0:]`` selects the whole array, so a
        # request for zero results has to be short-circuited explicitly.
        return []
    query_vec = vectorizer.transform([query])
    sim_scores = cosine_similarity(query_vec, tfidf_vectors).flatten()
    # Reverse the ascending argsort first, then cut: same ranking as taking
    # the last k and reversing, without the negative-slice pitfall.
    top_k_idx = sim_scores.argsort()[::-1][:k]
    return [(case_ids[i], sim_scores[i]) for i in top_k_idx]

# Example run
# The same `query` string is reused by the IndoBERT example later in the file.
query = "Terdakwa menyewakan objek fidusia tanpa persetujuan"
# No k is passed, so the function's default of 5 results applies.
tfidf_results = retrieve_tfidf(query)

print("\nTop-5 Hasil TF-IDF:")
# Each result is a (case_id, similarity) pair, already sorted best-first.
for case_id, score in tfidf_results:
    print(f"Case ID: {case_id} — Similarity: {score:.4f}")


# ----------------------------------------
# 🔹 B. IndoBERT Embedding Retrieval
# ----------------------------------------
print("\n=== IndoBERT Retrieval ===")

# Load IndoBERT
# Tokenizer and encoder are loaded from the same checkpoint name so they stay paired.
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")
model = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")

def get_bert_embedding(text):
    """Embed ``text`` with IndoBERT by mean-pooling the last hidden layer.

    Pooling is weighted by the tokenizer's attention mask, so padding
    positions (which appear when a list of texts of different lengths is
    passed) do not dilute the average. For a single string the mask is all
    ones and the result is the plain mean over tokens.

    Args:
        text: A string, or a list of strings, handed to the module-level
            ``tokenizer``. Input is truncated to 512 tokens.

    Returns:
        A NumPy array of the pooled hidden states with size-1 dimensions
        squeezed out (a 1-D vector for a single string).
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():  # inference only, no gradients needed
        outputs = model(**inputs)
    hidden = outputs.last_hidden_state
    # Broadcast the mask over the hidden dimension so padded tokens contribute zero.
    mask = inputs['attention_mask'].unsqueeze(-1).to(hidden.dtype)
    # clamp guards the division should a row ever carry no real tokens.
    token_count = mask.sum(dim=1).clamp(min=1)
    pooled = (hidden * mask).sum(dim=1) / token_count
    return pooled.squeeze().numpy()

# Embed every document once, up front; rows follow the order of `documents`.
bert_vectors = np.array(list(map(get_bert_embedding, documents)))

def retrieve_bert(query: str, k: int = 5):
    """Return the cases most similar to ``query`` under IndoBERT cosine similarity.

    Args:
        query: Free-text query; embedded with ``get_bert_embedding``.
        k: Maximum number of results to return. Values below 1 yield an
            empty list.

    Returns:
        A list of ``(case_id, similarity)`` tuples ordered from most to least
        similar, holding at most ``k`` entries (fewer when the corpus is
        smaller than ``k``).
    """
    if k <= 0:
        # A slice such as ``scores[-0:]`` selects the whole array, so a
        # request for zero results has to be short-circuited explicitly.
        return []
    # cosine_similarity expects 2-D input, hence the (1, dim) reshape.
    query_vec = get_bert_embedding(query).reshape(1, -1)
    sim_scores = cosine_similarity(query_vec, bert_vectors).flatten()
    top_k_idx = sim_scores.argsort()[::-1][:k]
    return [(case_ids[i], sim_scores[i]) for i in top_k_idx]

# IndoBERT example run; reuses the `query` string defined for the TF-IDF example.
bert_results = retrieve_bert(query)

print("\nTop-5 Hasil IndoBERT:")
# Each result is a (case_id, similarity) pair, already sorted best-first.
for case_id, score in bert_results:
    print(f"Case ID: {case_id} — Similarity: {score:.4f}")


## Klasifikasi: Naive Bayes dan SVM

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Use the case-classification column as the prediction target.
labels = df['klasifikasi'].astype(str).tolist()

# Reuse the TF-IDF matrix built earlier (tfidf_vectors); 20% is held out for testing.
# NOTE(review): the vectorizer was fitted on the full corpus before this split,
# so IDF weights have seen the test documents — confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_vectors, labels, test_size=0.2, random_state=42
)

# --- Linear SVM
# Seeded like the split above so repeated runs report the same numbers.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)

y_pred_svm = svm.predict(X_test)
print("\n=== Hasil Evaluasi SVM ===")
# zero_division=0 keeps the scores of never-predicted classes at 0 (the
# default outcome) without emitting an undefined-metric warning for each one.
print(classification_report(y_test, y_pred_svm, zero_division=0))

# --- Multinomial Naive Bayes (same split, same features)
nb = MultinomialNB()
nb.fit(X_train, y_train)

y_pred_nb = nb.predict(X_test)
print("\n=== Hasil Evaluasi Naive Bayes ===")
print(classification_report(y_test, y_pred_nb, zero_division=0))


from pathlib import Path

from sklearn.metrics import precision_recall_fscore_support

# Support-weighted precision / recall / F1 of the SVM predictions on the test split.
# zero_division=0 keeps undefined per-class scores at 0 without raising a warning.
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred_svm, average='weighted', zero_division=0
)

# One-row summary table; only the SVM scores are exported here.
eval_df = pd.DataFrame([{
    'model': 'SVM',
    'precision': precision,
    'recall': recall,
    'f1_score': f1
}])

eval_path = '/content/drive/MyDrive/Penalaran Komputer/data/eval/svm_fidusia_eval.csv'
# to_csv does not create missing folders, so make sure the target directory exists first.
Path(eval_path).parent.mkdir(parents=True, exist_ok=True)
eval_df.to_csv(eval_path, index=False)
print(f"✅ Evaluasi disimpan ke: {eval_path}")


# `display` is the notebook's rich-output helper; it is not imported in this file.
display(eval_df)