# In [None]:
import os
import re
import string
import sys
import time
from collections import defaultdict

# --- Import Stemmer ---
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.Dictionary.ArrayDictionary import ArrayDictionary
import nltk
try:
    from nltk.stem.snowball import SnowballStemmer
    nltk.data.find('tokenizers/punkt') # Cek jika 'punkt' ada
except LookupError:
    print("NLTK 'punkt' resource not found. Downloading...")
    nltk.download('punkt', quiet=True)
    from nltk.stem.snowball import SnowballStemmer
# --------------------

# --- CONFIGURATION ---
# Input folders, one per language.
INPUT_FOLDER_INDO = 'input/indo'
INPUT_FOLDER_ENG = 'input/eng'

# Output folders: comparison reports and stemmed-only text (the latter is
# what the retrieval step reads).
OUTPUT_FOLDER_INDO_COMPARE = 'output/indo_comparison_simulated'
OUTPUT_FOLDER_ENG_COMPARE = 'output/eng_comparison_simulated'
OUTPUT_FOLDER_INDO_STEMMED = 'output/indo_stemmed'
OUTPUT_FOLDER_ENG_STEMMED = 'output/eng_stemmed'

# Custom root-word dictionary. Every entry must be lowercase, because it is
# compared against lowercased, cleaned tokens.
root_words_custom = {
    # Indonesian entries
    "politik", "ekonomi", "olahraga", "kesehatan", "pendidikan",
    "sakit", "ajar", "main", "sehat", "kerja", "didik", "guna", "pakai",
    "tahu", "percaya", "ubah", "kaji", "nilai", "teknologi",
    # English entries
    "finance", "government", "education", "sport", "health",
    "play", "study", "run",
}
# ---------------------

# --- Fungsi Helper ---

def bersihkan_teks_preserve_lines(teks):
    """Normalize text line by line without changing the number of lines.

    Every line is lowercased, has its digits and ASCII punctuation removed,
    has runs of spaces/tabs collapsed into a single space, and is trimmed.
    The lines are then joined back with newline characters, so a line that
    ends up empty is kept as an empty line.
    """
    # Character class matching any single character of string.punctuation.
    pola_tanda_baca = re.compile('[' + re.escape(string.punctuation) + ']')

    def _rapikan(baris):
        # Order matters: digits and punctuation are dropped first so that the
        # gaps they leave behind are collapsed by the whitespace step.
        tanpa_angka = re.sub(r"\d+", "", baris.lower())
        tanpa_tanda_baca = pola_tanda_baca.sub('', tanpa_angka)
        return re.sub(r'[ \t]+', ' ', tanpa_tanda_baca).strip()

    return '\n'.join(_rapikan(baris) for baris in teks.split('\n'))

def hitung_simulasi_metrik(list_kata_unik_cleaned, stemmer_obj, kamus_dasar_set):
    """
    Compute simulated MWC, UI and OI counts against a synthetic "gold" map.

    NOTE: this is NOT a linguistically valid evaluation. The gold stem of a
    word is the word itself when it appears in ``kamus_dasar_set`` and
    otherwise whatever the stemmer returned, so only dictionary words that the
    stemmer alters can ever show up as errors.

    Args:
        list_kata_unik_cleaned: iterable of cleaned words. Empty strings are
            skipped and repeated words are only counted once.
        stemmer_obj: any object exposing ``stem(word)`` that returns the stem
            as a string. No particular stemmer library is required.
        kamus_dasar_set: container of root words, queried with ``in``.

    Returns:
        A tuple ``(mwc, ui, oi)`` where
        ``mwc`` is the number of words whose stem differs from their gold stem,
        ``ui`` is the number of gold groups whose members received more than
        one distinct stem, and
        ``oi`` is the number of stem groups whose members have more than one
        distinct gold stem.
    """
    simulated_gold_map = {}
    hasil_stemmer = {}

    # Build the stemmer output and the synthetic gold map side by side.
    for kata in list_kata_unik_cleaned:
        if not kata:
            continue
        hasil_stem = stemmer_obj.stem(kata)
        hasil_stemmer[kata] = hasil_stem
        # Key assumption: for non-dictionary words the stemmer is taken to be
        # correct, so its own output becomes the gold stem.
        simulated_gold_map[kata] = kata if kata in kamus_dasar_set else hasil_stem

    mwc_sim = 0
    gold_groups = defaultdict(set)
    stemmer_groups = defaultdict(set)
    for kata, stem_anggap_benar in simulated_gold_map.items():
        hasil_stem = hasil_stemmer[kata]
        gold_groups[stem_anggap_benar].add(kata)
        stemmer_groups[hasil_stem].add(kata)
        if hasil_stem != stem_anggap_benar:
            mwc_sim += 1  # simulated MWC: output disagrees with the gold map

    # Simulated UI: a gold group that the stemmer split into several stems.
    ui_sim_groups_error = sum(
        1
        for anggota in gold_groups.values()
        if len({hasil_stemmer[kata] for kata in anggota}) > 1
    )

    # Simulated OI: a stemmer group that merges words with different gold stems.
    oi_sim_groups_error = sum(
        1
        for anggota in stemmer_groups.values()
        if len({simulated_gold_map[kata] for kata in anggota}) > 1
    )

    return mwc_sim, ui_sim_groups_error, oi_sim_groups_error

def _stem_teks_per_baris(teks, stemmer_obj):
    """Stem ``teks`` one token at a time while keeping its line breaks.

    ``stemmer_obj.stem`` is called with a single whitespace-delimited token,
    never with a whole line or document. Results are cached per call so a
    repeated token is stemmed only once; tokens whose stem is empty are
    dropped from the output.
    """
    cache_stem = {}
    baris_hasil = []
    for baris in teks.split('\n'):
        stem_baris = []
        for kata in baris.split():
            if kata not in cache_stem:
                cache_stem[kata] = stemmer_obj.stem(kata)
            if cache_stem[kata]:
                stem_baris.append(cache_stem[kata])
        baris_hasil.append(' '.join(stem_baris))
    return '\n'.join(baris_hasil)


def proses_dokumen_set(input_folder, output_folder_compare, output_folder_stemmed, stemmer_obj, kamus_dasar_set, language_name):
    """Process every ``.txt`` file of one language set (Indonesian or English).

    For each file the text is cleaned, stemmed token by token, and written
    twice: the stemmed text alone into ``output_folder_stemmed`` and a report
    (statistics header + cleaned text + stemmed text) into
    ``output_folder_compare``. Both output folders are created if missing.

    Args:
        input_folder: folder scanned (non-recursively) for ``.txt`` files.
        output_folder_compare: destination of the comparison reports.
        output_folder_stemmed: destination of the stemmed-only files.
        stemmer_obj: object exposing ``stem(word)``; it is always called with
            a single token, because word-level stemmers (e.g. NLTK's) would
            otherwise treat the whole document as one word.
        kamus_dasar_set: root-word container forwarded to the metric
            simulation.
        language_name: label used in console messages and report headers.

    Returns:
        A dict with the word totals (``original_words``, ``cleaned_words``,
        ``stemmed_words``), the unions of unique words over all files
        (``unique_original``, ``unique_cleaned``, ``unique_stemmed``), the
        summed simulated metrics (``mwc_sim``, ``ui_sim``, ``oi_sim``) and the
        counters ``processed_files`` / ``skipped_files``. A file that fails
        during processing is reported on the console and counted in neither
        counter.
    """
    print(f"\n--- Memproses Dokumen Bahasa: {language_name} ---")
    print(f"Input: {input_folder}")
    print(f"Output Perbandingan: {output_folder_compare}")
    print(f"Output Stemmed (IR): {output_folder_stemmed}")

    # Create the output folders if they do not exist yet.
    os.makedirs(output_folder_compare, exist_ok=True)
    os.makedirs(output_folder_stemmed, exist_ok=True)

    # Statistics accumulated over the whole set.
    total_stats = {
        'original_words': 0, 'cleaned_words': 0, 'stemmed_words': 0,
        'unique_original': set(), 'unique_cleaned': set(), 'unique_stemmed': set(),
        'mwc_sim': 0, 'ui_sim': 0, 'oi_sim': 0,
        'processed_files': 0, 'skipped_files': 0
    }

    if not os.path.isdir(input_folder):
        print(f"WARNING: Folder input '{input_folder}' tidak ditemukan. Melewati set ini.")
        return total_stats  # empty statistics

    try:
        # Sorted so that the processing order (and the console log) is
        # reproducible; os.listdir gives no ordering guarantee.
        list_file = sorted(os.listdir(input_folder))
        if not list_file:
            print("Folder input kosong.")
            return total_stats

        for filename in list_file:
            input_file_path = os.path.join(input_folder, filename)

            if not (os.path.isfile(input_file_path) and filename.lower().endswith('.txt')):
                # Log why the entry is skipped.
                if os.path.isfile(input_file_path):
                    print(f"  -> Melewati file non-txt: {filename}")
                elif os.path.exists(input_file_path):
                    print(f"  -> Melewati item non-file: {filename}")
                else:
                    print(f"  -> Path tidak valid: {filename}")
                total_stats['skipped_files'] += 1
                continue

            output_file_compare_path = os.path.join(output_folder_compare, filename)
            output_file_stemmed_path = os.path.join(output_folder_stemmed, filename)

            print(f"  -> Memproses: {filename}...")
            # Best effort per file: one bad file must not abort the whole set.
            try:
                # Undecodable bytes are dropped rather than failing the file.
                with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as f_in:
                    original_text = f_in.read()

                # Statistics of the raw text.
                original_words_list = original_text.split()
                count_original_words = len(original_words_list)
                set_unique_original = set(original_words_list)
                count_unique_original = len(set_unique_original)

                # Cleaning.
                cleaned_text = bersihkan_teks_preserve_lines(original_text)

                # Stemming, token by token, keeping the line structure.
                stemmed_text = _stem_teks_per_baris(cleaned_text, stemmer_obj)

                # Store ONLY the stemmed text for the retrieval step.
                with open(output_file_stemmed_path, 'w', encoding='utf-8') as f_stem_out:
                    f_stem_out.write(stemmed_text)

                # Basic statistics of the cleaned and stemmed text.
                valid_cleaned_words = cleaned_text.split()
                valid_stemmed_words = stemmed_text.split()
                count_cleaned_words = len(valid_cleaned_words)
                set_unique_cleaned = set(valid_cleaned_words)
                count_unique_cleaned = len(set_unique_cleaned)
                count_stemmed_words = len(valid_stemmed_words)
                set_unique_stemmed = set(valid_stemmed_words)
                count_unique_stemmed = len(set_unique_stemmed)

                # Simulated metrics over the unique cleaned words.
                mwc_sim_file, ui_sim_file, oi_sim_file = hitung_simulasi_metrik(
                    list(set_unique_cleaned), stemmer_obj, kamus_dasar_set
                )

                # Header of the comparison report.
                file_header = f"""=============================================
FILE: {filename} - STATISTIK & METRIK SIMULASI ({language_name})
=============================================
A. STATISTIK DASAR:
   Jumlah Kata (Original): {count_original_words}
   Jumlah Kata Unik (Original): {count_unique_original}
   ------------------------------------------
   Jumlah Kata (Setelah Cleaning): {count_cleaned_words}
   Jumlah Kata Unik (Setelah Cleaning): {count_unique_cleaned}
   ------------------------------------------
   Jumlah Kata (Setelah Stemming): {count_stemmed_words}
   Jumlah Kata Unik (Setelah Stemming): {count_unique_stemmed}
---------------------------------------------
B. METRIK EVALUASI SIMULASI (PERKIRAAN SANGAT KASAR!):
   Simulated MWC (Mis-stemmed*): {mwc_sim_file}
   Simulated UI (Under-stemming Groups**): {ui_sim_file}
   Simulated OI (Over-stemming Groups***): {oi_sim_file}
=============================================
CATATAN SANGAT PENTING:
Metrik MWC, UI, OI di atas adalah HASIL SIMULASI berdasarkan kamus dasar kustom
dan asumsi internal. Ini BUKAN evaluasi linguistik yang valid. Gunakan HANYA
untuk perbandingan relatif kasar. (Lihat definisi di kode sumber).

-> Interpretasikan angka Simulasi MWC, UI, OI dengan SANGAT HATI-HATI! <-

"""
                separator_cleaned = "\n--- TEKS SETELAH CLEANING (SEBELUM STEMMING) ---\n"
                separator_stemmed = "\n\n--- TEKS SETELAH STEMMING ---\n"
                final_comparison_content = (
                    file_header + separator_cleaned + cleaned_text
                    + separator_stemmed + stemmed_text
                )

                # Write the comparison report.
                with open(output_file_compare_path, 'w', encoding='utf-8') as f_comparison:
                    f_comparison.write(final_comparison_content)

                # Fold this file into the set totals.
                total_stats['original_words'] += count_original_words
                total_stats['cleaned_words'] += count_cleaned_words
                total_stats['stemmed_words'] += count_stemmed_words
                total_stats['unique_original'].update(set_unique_original)
                total_stats['unique_cleaned'].update(set_unique_cleaned)
                total_stats['unique_stemmed'].update(set_unique_stemmed)
                total_stats['mwc_sim'] += mwc_sim_file
                total_stats['ui_sim'] += ui_sim_file
                total_stats['oi_sim'] += oi_sim_file
                total_stats['processed_files'] += 1

            except Exception as e:
                print(f"     ERROR saat memproses file {filename}: {e}")

    except FileNotFoundError:
        print(f"WARNING: Folder input '{input_folder}' tidak ditemukan.")
    except Exception as e:
        print(f"ERROR saat memproses folder {input_folder}: {e}")

    print(f"--- Selesai memproses {language_name} ---")
    return total_stats

def cari_dokumen(query, language, stemmed_docs_folder, stemmer_obj, top_n=5):
    """Rank stemmed documents by how many distinct query stems they contain.

    Args:
        query: free-text query; it is cleaned with
            ``bersihkan_teks_preserve_lines`` and then stemmed token by token.
        language: label used in console messages only.
        stemmed_docs_folder: folder holding the stemmed ``.txt`` documents.
        stemmer_obj: object exposing ``stem(word)``; it is always called with
            a single token so that word-level stemmers (e.g. NLTK's) produce
            stems comparable with the per-word stems stored in the documents.
        top_n: maximum number of results to return.

    Returns:
        A list of ``(filename, score)`` tuples, highest score first and ties
        ordered by filename, where ``score`` is the number of distinct query
        stems found in the document. Documents with score 0 are omitted. An
        empty list is returned when the folder is missing or unreadable, the
        query yields no stems, or nothing matches.
    """
    print(f"\n--- Mencari Dokumen ({language}) untuk Query: '{query}' ---")
    if not os.path.isdir(stemmed_docs_folder):
        print(f"ERROR: Folder dokumen stemmed '{stemmed_docs_folder}' tidak ditemukan.")
        return []

    # 1. Clean the query and stem it one token at a time.
    cleaned_query = bersihkan_teks_preserve_lines(query)
    query_stems = set()
    for kata in cleaned_query.split():
        # split() drops empty stems and separates a stem that came back as
        # several whitespace-delimited pieces.
        query_stems.update(stemmer_obj.stem(kata).split())
    if not query_stems:
        print("Query tidak menghasilkan stem yang valid setelah dibersihkan.")
        return []
    print(f"Stem Query: {query_stems}")

    # 2. Relevance score = number of query stems present in the document.
    doc_scores = {}
    try:
        for filename in os.listdir(stemmed_docs_folder):
            if not filename.lower().endswith('.txt'):
                continue
            filepath = os.path.join(stemmed_docs_folder, filename)
            # Best effort: an unreadable document is reported and skipped.
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    stemmed_content = f.read()
            except Exception as e:
                print(f"  Warning: Gagal membaca/memproses file stemmed {filename}: {e}")
                continue

            score = len(query_stems.intersection(stemmed_content.split()))
            if score > 0:
                doc_scores[filename] = score

    except Exception as e:
        print(f"ERROR saat mengakses folder stemmed {stemmed_docs_folder}: {e}")
        return []

    # 3. Highest score first; equal scores are ordered by filename so the
    #    ranking does not depend on the directory listing order.
    sorted_docs = sorted(doc_scores.items(), key=lambda item: (-item[1], item[0]))

    # 4. Report and return the top N.
    print(f"Dokumen teratas yang relevan (Top {top_n}):")
    if not sorted_docs:
        print("Tidak ada dokumen yang cocok ditemukan.")
        return []

    results = sorted_docs[:top_n]
    for i, (doc, score) in enumerate(results):
        print(f"{i+1}. {doc} (Skor: {score})")
    return results

# --- PROGRAM UTAMA ---
if __name__ == "__main__":
    # Script entry point: build both stemmers, process the Indonesian and
    # English document sets, print a side-by-side summary, then run two
    # example retrieval queries against the stemmed output.
    start_pipeline_time = time.time()

    # Initialise the stemmers.
    print("--- Inisialisasi Stemmer ---")
    try:
        # Sastrawi with the custom dictionary. The stemmer is built directly
        # on the dictionary instead of via StemmerFactory(<dictionary>).
        # NOTE(review): PySastrawi's StemmerFactory appears to define no
        # constructor accepting a dictionary (the old call would raise
        # TypeError) - confirm against the installed Sastrawi version.
        from Sastrawi.Stemmer.Stemmer import Stemmer as SastrawiStemmer

        sastrawi_custom_dict = ArrayDictionary(list(root_words_custom))
        sastrawi_stemmer = SastrawiStemmer(sastrawi_custom_dict)
        print("Stemmer Sastrawi (Kamus Kustom) siap.")

        # Snowball for English.
        snowball_stemmer = SnowballStemmer('english')
        print("Stemmer Snowball (English) siap.")
    except Exception as e:
        print(f"ERROR: Gagal menginisialisasi stemmer: {e}")
        sys.exit(1)
    print("-----------------------------")

    # Process the Indonesian documents.
    stats_indo = proses_dokumen_set(
        INPUT_FOLDER_INDO,
        OUTPUT_FOLDER_INDO_COMPARE,
        OUTPUT_FOLDER_INDO_STEMMED,
        sastrawi_stemmer,
        root_words_custom,  # root words used by the simulated metrics
        "Indonesia"
    )

    # Process the English documents.
    stats_eng = proses_dokumen_set(
        INPUT_FOLDER_ENG,
        OUTPUT_FOLDER_ENG_COMPARE,
        OUTPUT_FOLDER_ENG_STEMMED,
        snowball_stemmer,
        # Same custom dictionary; possibly less relevant for Snowball but kept
        # so both simulations use the same reference.
        root_words_custom,
        "English"
    )

    # --- Summary and comparison of the total statistics ---
    print("\n\n--- RINGKASAN STATISTIK TOTAL & PERBANDINGAN ---")

    def print_stats_comparison(stats_indo, stats_eng):
        """Print a three-column table comparing the two statistics dicts."""
        garis = "-" * 85

        def baris(label, nilai_indo, nilai_eng, spec=""):
            # A single label width for every row keeps the '|' separators
            # aligned with the table header.
            print(f"{label:<40} | {nilai_indo:<20{spec}} | {nilai_eng:<20{spec}}")

        def rata_rata(stats, kunci):
            # Average per processed document; 0 when nothing was processed.
            jumlah_file = stats['processed_files']
            return stats[kunci] / jumlah_file if jumlah_file else 0

        baris('Metrik', 'Indonesia', 'English')
        print(garis)
        baris('Jumlah Dokumen Diproses', stats_indo['processed_files'], stats_eng['processed_files'])
        baris('Jumlah Dokumen Dilewati', stats_indo['skipped_files'], stats_eng['skipped_files'])
        print(garis)
        print("Statistik Kata (Total):")
        baris('  Total Kata Original', stats_indo['original_words'], stats_eng['original_words'])
        baris('  Total Kata Cleaned', stats_indo['cleaned_words'], stats_eng['cleaned_words'])
        baris('  Total Kata Stemmed', stats_indo['stemmed_words'], stats_eng['stemmed_words'])
        print(garis)
        print("Statistik Kata Unik (Total):")
        baris('  Unik Original', len(stats_indo['unique_original']), len(stats_eng['unique_original']))
        baris('  Unik Cleaned', len(stats_indo['unique_cleaned']), len(stats_eng['unique_cleaned']))
        baris('  Unik Stemmed', len(stats_indo['unique_stemmed']), len(stats_eng['unique_stemmed']))
        print(garis)
        print("METRIK EVALUASI SIMULASI (Total - PERKIRAAN KASAR):")
        baris('  Total Simulated MWC', stats_indo['mwc_sim'], stats_eng['mwc_sim'])
        baris('  Total Simulated UI Groups', stats_indo['ui_sim'], stats_eng['ui_sim'])
        baris('  Total Simulated OI Groups', stats_indo['oi_sim'], stats_eng['oi_sim'])
        print(garis)
        print("Rata-rata Metrik Simulasi per Dokumen (Jika > 0 dokumen):")
        baris('  Avg. Simulated MWC', rata_rata(stats_indo, 'mwc_sim'), rata_rata(stats_eng, 'mwc_sim'), '.2f')
        baris('  Avg. Simulated UI Groups', rata_rata(stats_indo, 'ui_sim'), rata_rata(stats_eng, 'ui_sim'), '.2f')
        baris('  Avg. Simulated OI Groups', rata_rata(stats_indo, 'oi_sim'), rata_rata(stats_eng, 'oi_sim'), '.2f')
        print(garis)

    print_stats_comparison(stats_indo, stats_eng)
    print("PERINGATAN: Ingatlah bahwa Metrik MWC, UI, OI adalah simulasi kasar!")

    # --- Example use of the retrieval step ---
    print("\n\n--- CONTOH INFORMATION RETRIEVAL ---")
    contoh_query_indo = "kajian tentang pendidikan teknologi"
    hasil_cari_indo = cari_dokumen(
        contoh_query_indo,
        "Indonesia",
        OUTPUT_FOLDER_INDO_STEMMED,
        sastrawi_stemmer,
        top_n=3
    )

    print("-" * 30)

    contoh_query_eng = "studies about health and education technology"
    hasil_cari_eng = cari_dokumen(
        contoh_query_eng,
        "English",
        OUTPUT_FOLDER_ENG_STEMMED,
        snowball_stemmer,
        top_n=3
    )
    print("--------------------------------------")

    end_pipeline_time = time.time()
    print(f"\nTotal waktu eksekusi pipeline: {end_pipeline_time - start_pipeline_time:.2f} detik")
    print("Pipeline Selesai.")