In [None]:
import os
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary
import time
from collections import defaultdict
import sys

# --- Configuration ---
NAMA_FOLDER_INPUT = 'input'
NAMA_FILE_KAMUS_KUSTOM = "kamus_kata_dasar.txt"
# Output folder names carry a "simulated" marker to flag that the metrics are simulated.
NAMA_FOLDER_OUTPUT_COMPARISON_RELATIF = 'output/sastrawi_comparison_simulated_metrics'
NAMA_FOLDER_OUTPUT_CLEANED_RELATIF = 'output/cleaned_simulated_metrics'  # Optional
# --- End of configuration ---

# Every path is resolved against the current working directory.
base_dir = os.getcwd()
print(f"Menggunakan direktori kerja saat ini sebagai basis: {base_dir}")
input_folder_path, kamus_kustom_path, output_folder_comparison_path = (
    os.path.join(base_dir, nama_relatif)
    for nama_relatif in (
        NAMA_FOLDER_INPUT,
        NAMA_FILE_KAMUS_KUSTOM,
        NAMA_FOLDER_OUTPUT_COMPARISON_RELATIF,
    )
)

# The cleaned-only output folder is optional: it stays None when the constant
# above is commented out or set to an empty value.
output_folder_cleaned_path = (
    os.path.join(base_dir, NAMA_FOLDER_OUTPUT_CLEANED_RELATIF)
    if globals().get('NAMA_FOLDER_OUTPUT_CLEANED_RELATIF')
    else None
)

# --- Fungsi Muat Kamus Kustom (sama) ---
def muat_kamus_kustom(filepath):
    """Load the custom root-word dictionary from a UTF-8 text file.

    Each line is stripped and lower-cased; blank lines are ignored.

    Args:
        filepath: Path of the dictionary file, one word per line.

    Returns:
        A set containing the normalised words.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file yields no words at all.
        Exception: Any other error raised while reading is reported and
            re-raised unchanged.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as berkas:
            kandidat = (baris.strip().lower() for baris in berkas)
            kumpulan_kata = {kata for kata in kandidat if kata}
        if len(kumpulan_kata) == 0:
            raise ValueError(f"File kamus kustom '{filepath}' ditemukan tapi kosong.")
        print(f"Berhasil memuat {len(kumpulan_kata)} kata dari kamus kustom: {filepath}")
        return kumpulan_kata
    except FileNotFoundError:
        # Report the problem here, then let the caller decide what to do.
        print(f"ERROR: File kamus kustom '{filepath}' tidak ditemukan!")
        raise
    except ValueError as galat_nilai:
        print(f"ERROR: {galat_nilai}")
        raise
    except Exception as galat:
        print(f"ERROR: Terjadi kesalahan tak terduga saat memuat kamus kustom: {galat}")
        raise


# --- Fungsi Cleaning (sama) ---
def bersihkan_teks_preserve_lines(teks):
    """Normalise text line by line while keeping the line structure.

    For every line (split on '\\n') the text is lower-cased, digit runs and
    ASCII punctuation are removed, runs of spaces/tabs collapse to a single
    space, and surrounding whitespace is stripped.

    Args:
        teks: The raw text to clean.

    Returns:
        The cleaned lines joined back together with '\\n'; the number of
        lines is the same as in the input.
    """
    pola_angka = re.compile(r"\d+")
    pola_tanda_baca = re.compile(r'[' + re.escape(string.punctuation) + ']')
    pola_spasi = re.compile(r'[ \t]+')

    def _bersihkan_baris(baris):
        # Order matters: punctuation is deleted (not replaced by a space)
        # before whitespace is collapsed.
        hasil = pola_angka.sub("", baris.lower())
        hasil = pola_tanda_baca.sub('', hasil)
        return pola_spasi.sub(' ', hasil).strip()

    return '\n'.join(_bersihkan_baris(baris) for baris in teks.split('\n'))

# === 1. Load the custom dictionary and initialise the stemmer ===
try:
    print(f"Mencoba memuat kamus kustom dari: {kamus_kustom_path}...")
    kamus_set_kustom = muat_kamus_kustom(kamus_kustom_path)

    print("Menginisialisasi stemmer Sastrawi DENGAN KAMUS KUSTOM...")
    custom_dictionary = ArrayDictionary(list(kamus_set_kustom))
    factory = StemmerFactory(custom_dictionary)
    # This is the stemmer instance evaluated by the rest of the script.
    stemmer = factory.create_stemmer()
    print("Stemmer dengan kamus kustom SIAP.")
except Exception as e:
    # Nothing below can run without a stemmer, so stop with a non-zero status.
    print(f"\nGAGAL menginisialisasi stemmer: {e}")
    sys.exit(1)

# === Fungsi untuk Menghitung Metrik Simulasi ===
def hitung_simulasi_metrik(list_kata_unik_cleaned, stemmer_obj, kamus_dasar_set):
    """Compute simulated MWC, UI and OI counts against a stand-in reference map.

    NOTE: this is NOT a linguistically valid evaluation. The "reference" stem
    of a word is the word itself when it is in ``kamus_dasar_set``; otherwise
    the stemmer's own output is assumed to be correct.

    Args:
        list_kata_unik_cleaned: Iterable of cleaned words; empty strings are
            skipped.
        stemmer_obj: Object exposing ``stem(word)``; called once per word.
        kamus_dasar_set: Collection of root words, used for membership tests.

    Returns:
        A tuple ``(mwc, ui, oi)`` where
        * ``mwc`` is the number of words whose stem differs from the reference,
        * ``ui`` is the number of reference groups whose words received more
          than one distinct stem,
        * ``oi`` is the number of stemmer groups whose words come from more
          than one distinct reference stem.
    """
    stem_aktual = {}  # word -> stem produced by the stemmer
    stem_acuan = {}   # word -> stem assumed to be correct

    # 1. Stem every word once and build the stand-in reference map.
    for token in list_kata_unik_cleaned:
        if token:
            stem = stemmer_obj.stem(token)
            stem_aktual[token] = stem
            stem_acuan[token] = token if token in kamus_dasar_set else stem

    # 2. MWC: by construction only dictionary words altered by the stemmer count.
    jumlah_mwc = sum(
        1 for token, stem in stem_aktual.items() if stem != stem_acuan[token]
    )

    # 3. Cross-index the two stem assignments in a single pass.
    aktual_per_acuan = defaultdict(set)  # reference stem -> stems produced
    acuan_per_aktual = defaultdict(set)  # produced stem  -> reference stems
    for token, acuan in stem_acuan.items():
        aktual = stem_aktual[token]
        aktual_per_acuan[acuan].add(aktual)
        acuan_per_aktual[aktual].add(acuan)

    def _jumlah_grup_terpecah(pemetaan):
        # A group is in error when it maps to more than one non-None stem.
        return sum(1 for anggota in pemetaan.values() if len(anggota - {None}) > 1)

    jumlah_ui = _jumlah_grup_terpecah(aktual_per_acuan)
    jumlah_oi = _jumlah_grup_terpecah(acuan_per_aktual)
    return jumlah_mwc, jumlah_ui, jumlah_oi
# =============================================

# 2. Create the output folders (or report that they already exist)
if os.path.exists(output_folder_comparison_path):
    print(f"Folder output sudah ada: {output_folder_comparison_path}")
else:
    os.makedirs(output_folder_comparison_path)
    print(f"Folder output dibuat: {output_folder_comparison_path}")

# The cleaned-only folder is optional; its path is None when disabled.
if output_folder_cleaned_path:
    if os.path.exists(output_folder_cleaned_path):
        print(f"Folder sudah ada: {output_folder_cleaned_path}")
    else:
        os.makedirs(output_folder_cleaned_path)
        print(f"Folder dibuat: {output_folder_cleaned_path}")


# Corpus-wide accumulators: token counts per processing stage
total_words_original_all_files = total_words_cleaned_all_files = total_words_stemmed_all_files = 0
# Corpus-wide accumulators: vocabulary (unique words) per processing stage
unique_words_original_total = set()
unique_words_cleaned_total = set()
unique_words_stemmed_total = set()
# Corpus-wide accumulators: simulated metrics, summed over the per-file values
total_mwc_sim_all_files = total_ui_sim_all_files = total_oi_sim_all_files = 0

# 3. Process every file
print(f"\nMemulai proses cleaning dan stemming dari folder: {input_folder_path}")
print(f"Menggunakan KAMUS KUSTOM: {kamus_kustom_path}")
start_time = time.time()

# Main driver: clean and stem every .txt file in the input folder, write one
# comparison report per file, then print corpus-wide statistics.
#
# Corpus totals (basic AND simulated) are committed together only after the
# report of a file has been written successfully, so a file that fails
# half-way never contributes to just part of the totals. Failed files are
# counted and reported in the summary.
try:
    if not os.path.isdir(input_folder_path):
        raise FileNotFoundError(f"Folder input '{input_folder_path}' tidak ditemukan.")
    list_file = os.listdir(input_folder_path)
    processed_files = 0
    skipped_files = 0
    failed_files = 0
    if not list_file:
        print("Folder input kosong.")

    for filename in list_file:
        input_file_path = os.path.join(input_folder_path, filename)
        if os.path.isfile(input_file_path) and filename.lower().endswith('.txt'):
            output_file_comparison_path = os.path.join(output_folder_comparison_path, filename)
            current_output_cleaned_path = None
            if output_folder_cleaned_path:
                current_output_cleaned_path = os.path.join(output_folder_cleaned_path, filename)

            print(f"  -> Memproses file: {filename}...")

            try:
                # Read the file; undecodable bytes are dropped (errors='ignore').
                with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as f_in:
                    original_text = f_in.read()
                original_words_list = original_text.split()
                count_original_words = len(original_words_list)
                set_unique_original = set(original_words_list)
                count_unique_original = len(set_unique_original)

                # Cleaning, optionally saved on its own
                cleaned_text = bersihkan_teks_preserve_lines(original_text)
                if current_output_cleaned_path:
                    with open(current_output_cleaned_path, 'w', encoding='utf-8') as f_clean:
                        f_clean.write(cleaned_text)

                # Stemming of the whole cleaned text
                stemmed_text = stemmer.stem(cleaned_text)

                # Basic statistics for the cleaned and the stemmed text
                valid_cleaned_words = cleaned_text.split()
                valid_stemmed_words = stemmed_text.split()
                count_cleaned_words = len(valid_cleaned_words)
                set_unique_cleaned = set(valid_cleaned_words)
                count_unique_cleaned = len(set_unique_cleaned)
                count_stemmed_words = len(valid_stemmed_words)
                set_unique_stemmed = set(valid_stemmed_words)
                count_unique_stemmed = len(set_unique_stemmed)

                # Simulated metrics for this file, computed on its unique cleaned words
                list_unik_cleaned = list(set_unique_cleaned)
                mwc_sim_file, ui_sim_file, oi_sim_file = hitung_simulasi_metrik(
                    list_unik_cleaned, stemmer, kamus_set_kustom
                )

                # Report header (includes the simulated metrics and their caveats)
                file_header = f"""=============================================
FILE: {filename} - STATISTIK (Kamus Kustom: {os.path.basename(kamus_kustom_path)})
=============================================
A. STATISTIK DASAR:
   Jumlah Kata (Original): {count_original_words}
   Jumlah Kata Unik (Original): {count_unique_original}
   ------------------------------------------
   Jumlah Kata (Setelah Cleaning): {count_cleaned_words}
   Jumlah Kata Unik (Setelah Cleaning): {count_unique_cleaned}
   ------------------------------------------
   Jumlah Kata (Setelah Stemming): {count_stemmed_words}
   Jumlah Kata Unik (Setelah Stemming): {count_unique_stemmed}
---------------------------------------------
B. METRIK EVALUASI SIMULASI (PERKIRAAN SANGAT KASAR!):
   Simulated MWC (Mis-stemmed*): {mwc_sim_file}
   Simulated UI (Under-stemming Groups**): {ui_sim_file}
   Simulated OI (Over-stemming Groups***): {oi_sim_file}
=============================================
CATATAN SANGAT PENTING:
Angka MWC, UI, OI di atas adalah HASIL SIMULASI berdasarkan asumsi bahwa
hasil stemmer dianggap 'benar' jika kata input tidak ada di kamus dasar.
Ini BUKAN evaluasi linguistik yang valid dan TIDAK MENGGUNAKAN GOLD STANDARD
STEMMING yang sebenarnya. Gunakan HANYA untuk perbandingan relatif kasar
dalam eksperimen internal Anda.
*   Simulated MWC: Jumlah kata unik yang stem hasil Sastrawi != stem 'benar'
    menurut peta tiruan (kemungkinan selalu 0).
**  Simulated UI: Jumlah kelompok stem 'benar' (dari peta tiruan) yang
    hasil stem Sastrawi-nya terpecah menjadi >1 stem berbeda.
*** Simulated OI: Jumlah kelompok stem hasil Sastrawi yang berisi kata-kata
    yang berasal dari >1 stem 'benar' berbeda (menurut peta tiruan).

-> Interpretasikan angka Simulasi MWC, UI, OI dengan SANGAT HATI-HATI! <-

"""
                separator_cleaned = "\n--- TEKS SETELAH CLEANING (SEBELUM STEMMING) ---\n"
                separator_stemmed = "\n\n--- TEKS SETELAH STEMMING ---\n"
                final_comparison_content = (
                    file_header + separator_cleaned + cleaned_text + separator_stemmed + stemmed_text
                )

                with open(output_file_comparison_path, 'w', encoding='utf-8') as f_comparison:
                    f_comparison.write(final_comparison_content)

                # Commit ALL totals only now that the report is on disk, so the
                # basic and the simulated totals always cover the same files.
                total_mwc_sim_all_files += mwc_sim_file
                total_ui_sim_all_files += ui_sim_file
                total_oi_sim_all_files += oi_sim_file
                total_words_original_all_files += count_original_words
                unique_words_original_total.update(set_unique_original)
                total_words_cleaned_all_files += count_cleaned_words
                unique_words_cleaned_total.update(set_unique_cleaned)
                total_words_stemmed_all_files += count_stemmed_words
                unique_words_stemmed_total.update(set_unique_stemmed)
                processed_files += 1

            except Exception as e:
                # Best effort: one bad file must not abort the whole run, but
                # it is counted so the summary does not hide the failure.
                failed_files += 1
                print(f"     ERROR saat memproses file {filename}: {e}")
        else:
            # Anything that is not a regular .txt file is skipped and logged.
            if os.path.isfile(input_file_path):
                print(f"  -> Melewati file non-txt: {filename}")
            elif os.path.exists(input_file_path):
                print(f"  -> Melewati item non-file: {filename}")
            else:
                print(f"  -> Path tidak valid: {filename}")
            skipped_files += 1

    end_time = time.time()
    total_time = end_time - start_time

    # Run summary
    print(f"\nProses SELESAI (Kamus Kustom, Metrik Simulasi).")
    print(f"Jumlah file .txt diproses: {processed_files}")
    if skipped_files > 0:
        print(f"Jumlah item dilewati: {skipped_files}")
    if failed_files > 0:
        print(f"Jumlah file gagal diproses: {failed_files}")
    print(f"Hasil perbandingan disimpan di: {output_folder_comparison_path}")
    if output_folder_cleaned_path and os.path.exists(output_folder_cleaned_path):
        print(f"Hasil cleaning saja disimpan di: {output_folder_cleaned_path}")
    print(f"Total waktu eksekusi: {total_time:.2f} detik")

    # Corpus-wide statistics, including the summed simulated metrics
    print("\n--- Statistik TOTAL (Semua File - Kamus Kustom - TERMASUK SIMULASI METRIK) ---")
    count_unique_original_total = len(unique_words_original_total)
    count_unique_cleaned_total = len(unique_words_cleaned_total)
    count_unique_stemmed_total = len(unique_words_stemmed_total)
    print(f"Total kata (tokens) original: {total_words_original_all_files}")
    print(f"Jumlah kata unik (types) original: {count_unique_original_total}")
    print("-" * 30)
    print(f"Total kata (tokens) cleaned: {total_words_cleaned_all_files}")
    print(f"Jumlah kata unik (types) cleaned: {count_unique_cleaned_total}")
    print("-" * 30)
    print(f"Total kata (tokens) stemmed: {total_words_stemmed_all_files}")
    print(f"Jumlah kata unik (types) stemmed: {count_unique_stemmed_total}")
    print("-" * 30)
    print("METRIK EVALUASI SIMULASI TOTAL (PERKIRAAN SANGAT KASAR):")
    print(f"  Total Simulated MWC: {total_mwc_sim_all_files}")
    print(f"  Total Simulated UI Groups: {total_ui_sim_all_files}")
    print(f"  Total Simulated OI Groups: {total_oi_sim_all_files}")
    print("-" * 30)
    # Vocabulary reduction percentages; guarded against division by zero.
    if count_unique_original_total > 0:
        reduction_from_original = (
            (count_unique_original_total - count_unique_stemmed_total) / count_unique_original_total
        ) * 100
        print(f"Reduksi kosakata unik (Original -> Stemmed): {reduction_from_original:.2f}%")
    if count_unique_cleaned_total > 0:
        reduction_from_cleaned = (
            (count_unique_cleaned_total - count_unique_stemmed_total) / count_unique_cleaned_total
        ) * 100
        print(f"Reduksi kosakata unik (Cleaned -> Stemmed): {reduction_from_cleaned:.2f}%")

except FileNotFoundError as fnf_error:
    print(f"ERROR: {fnf_error}")
    print("Pastikan folder input ada.")
except Exception as e:
    print(f"Terjadi kesalahan umum: {e}")

Menggunakan direktori kerja saat ini sebagai basis: /home/xerces/project/stemming-project
Menginisialisasi stemmer Sastrawi...
Stemmer siap.
Folder output perbandingan sudah ada: /home/xerces/project/stemming-project/output/sastrawi_comparison_stats
Folder output cleaned-only sudah ada: /home/xerces/project/stemming-project/output/cleaned_stats

Memulai proses cleaning dan stemming dari folder: /home/xerces/project/stemming-project/input/indo
  -> Memproses file: Kel3_Peran Bimbingan dan Konseling Dalam Pendidikan Karakter    .txt...
  -> Memproses file: Kel6_Benarkah anak-anak butuh mata pelajaran koding dan AI di sekolah.txt...
     WARN: Jumlah kata cleaned (852) != stemmed (857) untuk file Kel6_Benarkah anak-anak butuh mata pelajaran koding dan AI di sekolah.txt. Proxy Stem Collision tidak dihitung.
  -> Memproses file: Dampak Tarif Resiprokal Trump terhadap Industri di Indonesia_1.txt...
  -> Memproses file: Global South dan Ilusi Netralitas_10.txt...
     WARN: Jumlah kata cleane