Menggunakan direktori kerja saat ini sebagai basis: /home/xerces/project/stemming-project
Menginisialisasi stemmer Sastrawi...
Stemmer siap.
Folder output perbandingan dibuat: /home/xerces/project/stemming-project/output/sastrawi_comparison
Folder output cleaned-only sudah ada: /home/xerces/project/stemming-project/output/cleaned

Memulai proses cleaning dan stemming dari folder: /home/xerces/project/stemming-project/input/indo
  -> Memproses file: Kel3_Peran Bimbingan dan Konseling Dalam Pendidikan Karakter    .txt...
  -> Memproses file: Kel6_Benarkah anak-anak butuh mata pelajaran koding dan AI di sekolah.txt...
  -> Memproses file: Dampak Tarif Resiprokal Trump terhadap Industri di Indonesia_1.txt...
  -> Memproses file: Global South dan Ilusi Netralitas_10.txt...
  -> Memproses file: Eksistensi Media Massa Nasional_5.txt...
  -> Memproses file: Peran Media Massa dalam Membentuk Opini Publik_5.txt...
  -> Memproses file: Kelompok 8_Ini 5 Bahaya Makanan Junk Food yang Perlu Diwaspad

In [None]:
import os
import re
import string
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import time

# --- Configuration ---
# Input folder, relative to the current working directory.
NAMA_FOLDER_INPUT = 'input'
# Folder that receives the final comparison files (cleaned vs. stemmed).
NAMA_FOLDER_OUTPUT_COMPARISON_RELATIF = 'output/sastrawi_comparison'
# Optional folder for the cleaned-only text. To turn this output off,
# delete or comment out the assignment below (or set it to an empty string).
NAMA_FOLDER_OUTPUT_CLEANED_RELATIF = 'output/cleaned'
# --- End of configuration ---

# Every path is resolved against the current working directory.
base_dir = os.getcwd()
print(f"Menggunakan direktori kerja saat ini sebagai basis: {base_dir}")

input_folder_path = os.path.join(base_dir, NAMA_FOLDER_INPUT)
output_folder_comparison_path = os.path.join(
    base_dir, NAMA_FOLDER_OUTPUT_COMPARISON_RELATIF
)

# The cleaned-only folder is optional: when its constant is missing or empty
# the path is None, so later truthiness checks can simply skip that output.
output_folder_cleaned_path = (
    os.path.join(base_dir, NAMA_FOLDER_OUTPUT_CLEANED_RELATIF)
    if locals().get('NAMA_FOLDER_OUTPUT_CLEANED_RELATIF')
    else None
)


# --- Text cleaning function (keeps the newline structure) ---
def bersihkan_teks_preserve_lines(teks):
    """Clean *teks* one line at a time and keep its line breaks.

    Each line is lowercased, has every run of digits and every character of
    ``string.punctuation`` removed (deleted, not replaced by a space), has
    runs of spaces/tabs collapsed to a single space, and is stripped at both
    ends. The text is split on the newline character and re-joined with it,
    so the number of lines in the result equals the number in the input.
    """
    # Character class matching any single ASCII punctuation mark.
    punctuation_class = '[' + re.escape(string.punctuation) + ']'

    def _clean_line(raw_line):
        lowered = raw_line.lower()
        no_digits = re.sub(r"\d+", "", lowered)
        no_punctuation = re.sub(punctuation_class, '', no_digits)
        single_spaced = re.sub(r'[ \t]+', ' ', no_punctuation)
        return single_spaced.strip()

    return '\n'.join(map(_clean_line, teks.split('\n')))
# --- End of cleaning function ---

# 1. Initialise the Sastrawi stemmer
print("Menginisialisasi stemmer Sastrawi...")
factory = StemmerFactory()
stemmer = factory.create_stemmer()
print("Stemmer siap.")

# 2. Make sure the output folders exist, reporting whether each one was
#    already there or had to be created.
if os.path.exists(output_folder_comparison_path):
    print(f"Folder output perbandingan sudah ada: {output_folder_comparison_path}")
else:
    os.makedirs(output_folder_comparison_path)
    print(f"Folder output perbandingan dibuat: {output_folder_comparison_path}")

# The cleaned-only folder is optional; nothing happens when its path is falsy.
if output_folder_cleaned_path and os.path.exists(output_folder_cleaned_path):
    print(f"Folder output cleaned-only sudah ada: {output_folder_cleaned_path}")
elif output_folder_cleaned_path:
    os.makedirs(output_folder_cleaned_path)
    print(f"Folder output cleaned-only dibuat: {output_folder_cleaned_path}")

# Corpus-wide evaluation accumulators (reported on the console at the end).
# Distinct words seen across all files at each stage: original, cleaned, stemmed.
unique_words_original_total, unique_words_cleaned_total, unique_words_stemmed_total = (
    set(),
    set(),
    set(),
)
# Running token counts across all files at each stage.
total_words_original_all_files = total_words_cleaned_all_files = total_words_stemmed_all_files = 0

def _token_stats(text):
    """Split *text* on whitespace; return (tokens, token count, distinct-token count)."""
    tokens = text.split()
    return tokens, len(tokens), len(set(tokens))


def _stem_keep_lines(text):
    """Stem *text* one line at a time and re-join the lines with newlines.

    Stemming each line separately keeps the stemmed text aligned with the
    line structure of the cleaned text, however the stemmer treats
    whitespace internally. Blank lines are passed through as empty strings
    without calling the stemmer.
    """
    return '\n'.join(
        stemmer.stem(line) if line.strip() else ''
        for line in text.split('\n')
    )


# 3. Process every .txt file in the input folder
print(f"\nMemulai proses cleaning dan stemming dari folder: {input_folder_path}")
start_time = time.time()

try:
    if not os.path.isdir(input_folder_path):
        raise FileNotFoundError(f"Folder input '{input_folder_path}' tidak ditemukan.")

    # os.listdir() returns entries in arbitrary order; sort them so the
    # processing order and the console log are reproducible between runs.
    list_file = sorted(os.listdir(input_folder_path))
    processed_files = 0
    skipped_files = 0
    failed_files = 0  # .txt files that raised while being processed

    if not list_file:
        print("Folder input kosong.")

    for filename in list_file:
        input_file_path = os.path.join(input_folder_path, filename)

        # Guard clause: anything that is not a regular .txt file is logged and skipped.
        if not (os.path.isfile(input_file_path) and filename.lower().endswith('.txt')):
            if os.path.isfile(input_file_path):
                print(f"  -> Melewati file non-txt: {filename}")
            elif os.path.exists(input_file_path):
                print(f"  -> Melewati item yang bukan file (misal: folder): {filename}")
            else:
                # The entry disappeared (or is otherwise unreachable) since listing.
                print(f"  -> Path tidak valid atau tidak ditemukan: {filename}")
            skipped_files += 1
            continue

        output_file_comparison_path = os.path.join(output_folder_comparison_path, filename)
        # Optional cleaned-only output; None when that output is disabled.
        current_output_cleaned_path = (
            os.path.join(output_folder_cleaned_path, filename)
            if output_folder_cleaned_path
            else None
        )

        print(f"  -> Memproses file: {filename}...")

        try:
            # 'utf-8-sig' decodes plain UTF-8 as usual but also drops a leading
            # BOM, which would otherwise stay attached to the first word and
            # distort the statistics. Undecodable bytes are still ignored.
            with open(input_file_path, 'r', encoding='utf-8-sig', errors='ignore') as f_in:
                original_text = f_in.read()

            # Statistics of the raw text (plain whitespace split).
            original_words_list, count_original_words, count_unique_original = (
                _token_stats(original_text)
            )

            # Clean the text while keeping its line structure.
            cleaned_text = bersihkan_teks_preserve_lines(original_text)

            # Save the cleaned-only text (optional).
            if current_output_cleaned_path:
                with open(current_output_cleaned_path, 'w', encoding='utf-8') as f_clean:
                    f_clean.write(cleaned_text)

            # Stem line by line so the stemmed text keeps the same line breaks.
            stemmed_text = _stem_keep_lines(cleaned_text)

            # Statistics after cleaning and after stemming. str.split() with
            # no arguments never yields empty strings, so no filtering is needed.
            cleaned_words_list, count_cleaned_words, count_unique_cleaned = (
                _token_stats(cleaned_text)
            )
            stemmed_words_list, count_stemmed_words, count_unique_stemmed = (
                _token_stats(stemmed_text)
            )

            # Header with the full per-file statistics (original included).
            file_header = f"""=============================================
FILE: {filename} - STATISTIK
=============================================
Jumlah Kata (Original): {count_original_words}
Jumlah Kata Unik (Original): {count_unique_original}
---------------------------------------------
Jumlah Kata (Setelah Cleaning): {count_cleaned_words}
Jumlah Kata Unik (Setelah Cleaning): {count_unique_cleaned}
---------------------------------------------
Jumlah Kata (Setelah Stemming): {count_stemmed_words}
Jumlah Kata Unik (Setelah Stemming): {count_unique_stemmed}
=============================================

"""

            separator_cleaned = "\n--- TEKS SETELAH CLEANING (SEBELUM STEMMING) ---\n"
            separator_stemmed = "\n\n--- TEKS SETELAH STEMMING ---\n"

            final_comparison_content = (
                file_header +
                separator_cleaned +
                cleaned_text +
                separator_stemmed +
                stemmed_text
            )

            # Write the comparison file.
            with open(output_file_comparison_path, 'w', encoding='utf-8') as f_comparison:
                f_comparison.write(final_comparison_content)

            # Update the corpus-wide totals only once the file has been fully
            # processed and written, so failed files never leak into them.
            total_words_original_all_files += count_original_words
            unique_words_original_total.update(original_words_list)
            total_words_cleaned_all_files += count_cleaned_words
            unique_words_cleaned_total.update(cleaned_words_list)
            total_words_stemmed_all_files += count_stemmed_words
            unique_words_stemmed_total.update(stemmed_words_list)

            processed_files += 1

        except Exception as e:
            # Best effort: report the failure, count it, and move on to the next file.
            failed_files += 1
            print(f"     ERROR saat memproses file {filename}: {e}")

    end_time = time.time()
    total_time = end_time - start_time

    print("\nProses cleaning dan stemming selesai.")
    print(f"Jumlah file .txt yang diproses: {processed_files}")
    if failed_files > 0:
        print(f"Jumlah file .txt yang gagal diproses: {failed_files}")
    if skipped_files > 0:
        print(f"Jumlah item non-txt/subfolder yang dilewati: {skipped_files}")
    print(f"Hasil perbandingan disimpan di: {output_folder_comparison_path}")
    if output_folder_cleaned_path and os.path.exists(output_folder_cleaned_path):
        print(f"Hasil cleaning saja disimpan di: {output_folder_cleaned_path}")
    print(f"Total waktu eksekusi: {total_time:.2f} detik")

    # --- Print the corpus-wide evaluation statistics (all files) ---
    print("\n--- Evaluasi Statistik TOTAL (Semua File) ---")
    count_unique_original_total = len(unique_words_original_total)
    count_unique_cleaned_total = len(unique_words_cleaned_total)
    count_unique_stemmed_total = len(unique_words_stemmed_total)

    print(f"Total kata (tokens) original (semua file): {total_words_original_all_files}")
    print(f"Jumlah kata unik (types) original (semua file): {count_unique_original_total}")
    print("-" * 30)
    print(f"Total kata (tokens) setelah cleaning (semua file): {total_words_cleaned_all_files}")
    print(f"Jumlah kata unik (types) setelah cleaning (semua file): {count_unique_cleaned_total}")
    print("-" * 30)
    print(f"Total kata (tokens) setelah stemming (semua file): {total_words_stemmed_all_files}")
    print(f"Jumlah kata unik (types) setelah stemming (semua file): {count_unique_stemmed_total}")

    # --- Vocabulary reduction percentages ---
    # Each ratio is guarded against an empty vocabulary to avoid dividing by zero.
    print("-" * 30)
    if count_unique_original_total > 0:
        reduction_from_original = ((count_unique_original_total - count_unique_stemmed_total) / count_unique_original_total) * 100
        print(f"Persentase reduksi kosakata unik (Original -> Stemmed): {reduction_from_original:.2f}%")
    else:
        print("Tidak dapat menghitung reduksi dari Original (kosakata original 0).")

    if count_unique_cleaned_total > 0:
        reduction_from_cleaned = ((count_unique_cleaned_total - count_unique_stemmed_total) / count_unique_cleaned_total) * 100
        print(f"Persentase reduksi kosakata unik (Cleaned -> Stemmed): {reduction_from_cleaned:.2f}%")
    else:
        print("Tidak dapat menghitung reduksi dari Cleaned (kosakata cleaned 0).")
    # --- End of reduction percentages ---

except FileNotFoundError as fnf_error:
    print(f"ERROR: {fnf_error}")
    print("Pastikan folder input ada.")
except Exception as e:
    # Top-level boundary of the script: report anything unexpected instead of crashing.
    print(f"Terjadi kesalahan umum: {e}")