In [2]:
import os
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import time
from collections import Counter, defaultdict # Import defaultdict

# --- Configuration ---
NAMA_FOLDER_INPUT = 'input/indo'
NAMA_FOLDER_OUTPUT_COMPARISON_RELATIF = 'output/sastrawi_comparison_stats'  # new name
# Delete or comment out the next line if no separate cleaned-only folder is needed
NAMA_FOLDER_OUTPUT_CLEANED_RELATIF = 'output/cleaned_stats'
# --- End of configuration ---

# Every relative folder above is resolved against the current working directory.
base_dir = os.getcwd()
print(f"Menggunakan direktori kerja saat ini sebagai basis: {base_dir}")
input_folder_path, output_folder_comparison_path = (
    os.path.join(base_dir, relative_folder)
    for relative_folder in (NAMA_FOLDER_INPUT, NAMA_FOLDER_OUTPUT_COMPARISON_RELATIF)
)

# The cleaned-only folder is optional: it stays None when the constant has been
# removed from the configuration or is set to an empty value. The lookup goes
# through globals() so that a deleted constant does not raise NameError.
output_folder_cleaned_path = (
    os.path.join(base_dir, NAMA_FOLDER_OUTPUT_CLEANED_RELATIF)
    if globals().get('NAMA_FOLDER_OUTPUT_CLEANED_RELATIF')
    else None
)

# --- Fungsi Cleaning (sama seperti sebelumnya) ---
def bersihkan_teks_preserve_lines(teks):
    """Clean ``teks`` line by line while keeping its line structure.

    Each line (split on the newline character) is lower-cased, stripped of
    digit runs and of every character in ``string.punctuation``, has runs of
    spaces/tabs collapsed to a single space, and is trimmed of surrounding
    whitespace. Punctuation is deleted rather than replaced, so hyphenated
    words are fused together. Characters outside ``string.punctuation``
    (e.g. typographic quotes or dashes) are left untouched.

    Args:
        teks: The raw text to clean.

    Returns:
        The cleaned text with the same number of lines as the input.
    """
    drop_punctuation = str.maketrans('', '', string.punctuation)

    def _clean_line(raw_line):
        without_digits = re.sub(r"\d+", "", raw_line.lower())
        without_punctuation = without_digits.translate(drop_punctuation)
        return re.sub(r'[ \t]+', ' ', without_punctuation).strip()

    return '\n'.join(_clean_line(raw_line) for raw_line in teks.split('\n'))
# --- Akhir Fungsi Cleaning ---

# 1. Initialise the Sastrawi stemmer
print("Menginisialisasi stemmer Sastrawi...")
factory = StemmerFactory()
stemmer = factory.create_stemmer()
print("Stemmer siap.")

# 2. Make sure the output folders exist
if os.path.exists(output_folder_comparison_path):
    print(f"Folder output perbandingan sudah ada: {output_folder_comparison_path}")
else:
    os.makedirs(output_folder_comparison_path)
    print(f"Folder output perbandingan dibuat: {output_folder_comparison_path}")

# The cleaned-only folder is optional; nothing happens when its path is unset.
if output_folder_cleaned_path and os.path.exists(output_folder_cleaned_path):
    print(f"Folder output cleaned-only sudah ada: {output_folder_cleaned_path}")
elif output_folder_cleaned_path:
    os.makedirs(output_folder_cleaned_path)
    print(f"Folder output cleaned-only dibuat: {output_folder_cleaned_path}")

# Corpus-wide evaluation accumulators (reported on the console at the end).
# Vocabularies (types) seen across all files: three independent sets.
unique_words_original_total, unique_words_cleaned_total, unique_words_stemmed_total = (
    set(), set(), set()
)
# Running token counts across all files.
total_words_original_all_files = total_words_cleaned_all_files = total_words_stemmed_all_files = 0
# Sum of the per-file stem-collision counts (over-stemming proxy).
total_stem_collisions_all_files = 0

# 3. Process every .txt file in the input folder


def _stem_preserving_lines(stemmer_obj, text):
    """Stem ``text`` one line at a time and re-join the lines with newlines.

    The cleaning step deliberately keeps the line layout, so the stemmed text
    is produced per line to keep that layout in the comparison report.
    NOTE(review): Sastrawi's ``stem()`` appears to normalise every character
    outside ``[a-z0-9 -]`` (newlines included) to a space, which is why the
    whole text is not passed in a single call -- confirm against the
    installed Sastrawi version.

    Args:
        stemmer_obj: Object exposing ``stem(str) -> str``.
        text: Cleaned text, lines separated by the newline character.

    Returns:
        The stemmed text with one output line per input line.
    """
    return '\n'.join(
        # Empty lines are passed through without calling the stemmer.
        stemmer_obj.stem(line) if line else ''
        for line in text.split('\n')
    )


def _build_stem_to_cleaned_map(stemmer_obj, unique_cleaned_words):
    """Map each stem to the set of distinct cleaned words that produce it.

    Every unique cleaned word is stemmed on its own, so the mapping does not
    depend on the cleaned and stemmed token lists having the same length or
    the same order. Words whose stem is empty are skipped. If the stemmer
    returns several space-separated tokens for one word, that whole string is
    used as the key.

    Args:
        stemmer_obj: Object exposing ``stem(str) -> str``.
        unique_cleaned_words: Iterable of distinct cleaned words.

    Returns:
        A ``defaultdict(set)`` of ``stem -> {cleaned words}``.
    """
    stem_map = defaultdict(set)
    for cleaned_word in unique_cleaned_words:
        stem = stemmer_obj.stem(cleaned_word)
        if stem:
            stem_map[stem].add(cleaned_word)
    return stem_map


print(f"\nMemulai proses cleaning dan stemming dari folder: {input_folder_path}")
start_time = time.time()

try:
    if not os.path.isdir(input_folder_path):
        raise FileNotFoundError(f"Folder input '{input_folder_path}' tidak ditemukan.")

    list_file = os.listdir(input_folder_path)
    processed_files = 0
    skipped_files = 0

    if not list_file:
        print("Folder input kosong.")

    for filename in list_file:
        input_file_path = os.path.join(input_folder_path, filename)

        # Guard clause: anything that is not a regular .txt file is logged and skipped.
        if not (os.path.isfile(input_file_path) and filename.lower().endswith('.txt')):
            if os.path.isfile(input_file_path):
                print(f"  -> Melewati file non-txt: {filename}")
            elif os.path.exists(input_file_path):
                print(f"  -> Melewati item yang bukan file (misal: folder): {filename}")
            else:
                print(f"  -> Path tidak valid atau tidak ditemukan: {filename}")
            skipped_files += 1
            continue

        output_file_comparison_path = os.path.join(output_folder_comparison_path, filename)
        current_output_cleaned_path = None
        if output_folder_cleaned_path:
            current_output_cleaned_path = os.path.join(output_folder_cleaned_path, filename)

        print(f"  -> Memproses file: {filename}...")

        # Per-file boundary: one failing file is reported and must not abort the batch.
        try:
            with open(input_file_path, 'r', encoding='utf-8', errors='ignore') as f_in:
                original_text = f_in.read()

            # Original statistics. str.split() with no argument never yields
            # empty strings, so no extra filtering is required.
            original_words_list = original_text.split()
            count_original_words = len(original_words_list)
            set_unique_original = set(original_words_list)
            count_unique_original = len(set_unique_original)

            # Cleaning
            cleaned_text = bersihkan_teks_preserve_lines(original_text)

            # Save the cleaned-only text (optional)
            if current_output_cleaned_path:
                with open(current_output_cleaned_path, 'w', encoding='utf-8') as f_clean:
                    f_clean.write(cleaned_text)

            # Stemming, line by line so the report keeps the original layout
            stemmed_text = _stem_preserving_lines(stemmer, cleaned_text)

            # Cleaned & stemmed statistics
            valid_cleaned_words = cleaned_text.split()
            valid_stemmed_words = stemmed_text.split()

            count_cleaned_words = len(valid_cleaned_words)
            set_unique_cleaned = set(valid_cleaned_words)
            count_unique_cleaned = len(set_unique_cleaned)

            count_stemmed_words = len(valid_stemmed_words)
            set_unique_stemmed = set(valid_stemmed_words)
            count_unique_stemmed = len(set_unique_stemmed)

            # The token counts can legitimately differ (the run log shows it
            # happening); this is informational only, because the collision
            # proxy below no longer relies on positional alignment.
            if count_cleaned_words != count_stemmed_words:
                print(f"     WARN: Jumlah kata cleaned ({count_cleaned_words}) "
                      f"!= stemmed ({count_stemmed_words}) untuk file {filename}.")

            # "Stem collision" proxy (rough over-stemming indicator): number
            # of stems that come from more than one distinct cleaned word.
            stem_to_cleaned_map = _build_stem_to_cleaned_map(stemmer, set_unique_cleaned)
            num_stem_collisions = sum(
                1 for source_words in stem_to_cleaned_map.values() if len(source_words) > 1
            )

            # Header of the per-file comparison report
            file_header = f"""=============================================
FILE: {filename} - STATISTIK
=============================================
Jumlah Kata (Original): {count_original_words}
Jumlah Kata Unik (Original): {count_unique_original}
---------------------------------------------
Jumlah Kata (Setelah Cleaning): {count_cleaned_words}
Jumlah Kata Unik (Setelah Cleaning): {count_unique_cleaned}
---------------------------------------------
Jumlah Kata (Setelah Stemming): {count_stemmed_words}
Jumlah Kata Unik (Setelah Stemming): {count_unique_stemmed}
---------------------------------------------
Proxy Over-Stemming (Jumlah Stem Collision*): {num_stem_collisions}
=============================================
* Stem Collision: Jumlah stem yang dihasilkan dari >1 kata unik berbeda setelah cleaning.
  Angka tinggi MUNGKIN indikasi over-stemming (perlu cek manual). Dihitung dengan
  men-stem setiap kata unik hasil cleaning satu per satu.

"""
            separator_cleaned = "\n--- TEKS SETELAH CLEANING (SEBELUM STEMMING) ---\n"
            separator_stemmed = "\n\n--- TEKS SETELAH STEMMING ---\n"

            final_comparison_content = (
                file_header +
                separator_cleaned +
                cleaned_text +
                separator_stemmed +
                stemmed_text
            )

            # Write the comparison report
            with open(output_file_comparison_path, 'w', encoding='utf-8') as f_comparison:
                f_comparison.write(final_comparison_content)

            # Update the corpus-wide totals only after the file was fully
            # processed and written, so all totals stay mutually consistent.
            total_words_original_all_files += count_original_words
            unique_words_original_total.update(set_unique_original)
            total_words_cleaned_all_files += count_cleaned_words
            unique_words_cleaned_total.update(set_unique_cleaned)
            total_words_stemmed_all_files += count_stemmed_words
            unique_words_stemmed_total.update(set_unique_stemmed)
            total_stem_collisions_all_files += num_stem_collisions

            processed_files += 1

        except Exception as e:
            print(f"     ERROR saat memproses file {filename}: {e}")

    end_time = time.time()
    total_time = end_time - start_time

    # Completion status: file counts, output locations, elapsed time
    print("\nProses cleaning dan stemming selesai.")
    print(f"Jumlah file .txt yang diproses: {processed_files}")
    if skipped_files > 0:
        print(f"Jumlah item non-txt/subfolder yang dilewati: {skipped_files}")
    print(f"Hasil perbandingan disimpan di: {output_folder_comparison_path}")
    if output_folder_cleaned_path and os.path.exists(output_folder_cleaned_path):
        print(f"Hasil cleaning saja disimpan di: {output_folder_cleaned_path}")
    print(f"Total waktu eksekusi: {total_time:.2f} detik")

    # --- Corpus-wide statistics (all files) ---
    print("\n--- Evaluasi Statistik TOTAL (Semua File) ---")
    count_unique_original_total = len(unique_words_original_total)
    count_unique_cleaned_total = len(unique_words_cleaned_total)
    count_unique_stemmed_total = len(unique_words_stemmed_total)

    print(f"Total kata (tokens) original (semua file): {total_words_original_all_files}")
    print(f"Jumlah kata unik (types) original (semua file): {count_unique_original_total}")
    print("-" * 30)
    print(f"Total kata (tokens) setelah cleaning (semua file): {total_words_cleaned_all_files}")
    print(f"Jumlah kata unik (types) setelah cleaning (semua file): {count_unique_cleaned_total}")
    print("-" * 30)
    print(f"Total kata (tokens) setelah stemming (semua file): {total_words_stemmed_all_files}")
    print(f"Jumlah kata unik (types) setelah stemming (semua file): {count_unique_stemmed_total}")
    print("-" * 30)
    print(f"Total Proxy Stem Collision (semua file): {total_stem_collisions_all_files}")

    # --- Vocabulary reduction percentages ---
    print("-" * 30)
    if count_unique_original_total > 0:
        reduction_from_original = ((count_unique_original_total - count_unique_stemmed_total) / count_unique_original_total) * 100
        print(f"Persentase reduksi kosakata unik (Original -> Stemmed): {reduction_from_original:.2f}%")
    else:
        print("Tidak dapat menghitung reduksi dari Original (kosakata original 0).")

    if count_unique_cleaned_total > 0:
        reduction_from_cleaned = ((count_unique_cleaned_total - count_unique_stemmed_total) / count_unique_cleaned_total) * 100
        print(f"Persentase reduksi kosakata unik (Cleaned -> Stemmed): {reduction_from_cleaned:.2f}%")
    else:
        print("Tidak dapat menghitung reduksi dari Cleaned (kosakata cleaned 0).")

except FileNotFoundError as fnf_error:
    print(f"ERROR: {fnf_error}")
    print("Pastikan folder input ada.")
except Exception as e:
    print(f"Terjadi kesalahan umum: {e}")

Menggunakan direktori kerja saat ini sebagai basis: /home/xerces/project/stemming-project
Menginisialisasi stemmer Sastrawi...
Stemmer siap.
Folder output perbandingan sudah ada: /home/xerces/project/stemming-project/output/sastrawi_comparison_stats
Folder output cleaned-only sudah ada: /home/xerces/project/stemming-project/output/cleaned_stats

Memulai proses cleaning dan stemming dari folder: /home/xerces/project/stemming-project/input/indo
  -> Memproses file: Kel3_Peran Bimbingan dan Konseling Dalam Pendidikan Karakter    .txt...
  -> Memproses file: Kel6_Benarkah anak-anak butuh mata pelajaran koding dan AI di sekolah.txt...
     WARN: Jumlah kata cleaned (852) != stemmed (857) untuk file Kel6_Benarkah anak-anak butuh mata pelajaran koding dan AI di sekolah.txt. Proxy Stem Collision tidak dihitung.
  -> Memproses file: Dampak Tarif Resiprokal Trump terhadap Industri di Indonesia_1.txt...
  -> Memproses file: Global South dan Ilusi Netralitas_10.txt...
     WARN: Jumlah kata cleane