In [13]:
import pandas as pd
import re  
from loguru import logger
import numpy as np
from datasketch import MinHash, MinHashLSH
from tqdm import tqdm
import fasttext 
import os


logger.add("batak_processing.log", rotation="10 MB", level="INFO")
print("Dependencies loaded successfully.")

Dependencies loaded successfully.


In [14]:

CSV_FILE_PATH = "clean_batak_scraped_pdfs.csv"
OUTPUT_CLEAN = "batak_pdfs_CLEANED.xls"
PRETRAINED_MODEL_PATH = 'lid.176.ftz' 

print("Configuration loaded.")

Configuration loaded.


In [15]:
df_batak = pd.read_csv('clean_batak_scraped_pdfs.csv')

In [16]:
import re

def clean_markdown_text(text: str) -> str:
    """
    (VERSI V3 - SANGAT AGRESIF)
    Membersihkan noise spesifik dari ekstraksi PDF (LaTeX, tag gambar, dll).
    """
    if not isinstance(text, str):
        return "" 
    
    # 1. Hapus tag gambar aneh: !(page0Picture3.jpeg)
    text = re.sub(r'!\(.*?\)', '', text)
    
    # 2. Hapus sintaks LaTeX-like: \sigma, \boldsymbol{...}, \eta

    text = re.sub(r'\\[a-zA-Z]+(?:\{.*?\})?', '', text)
    
    # 3. Hapus karakter Markdown/noise (#, *, _, $, [, ], `)
    text = re.sub(r'[#\*\_`$\[\]]+', '', text) 
    
    # 4. Hapus repeating pipes, backslashes, dan dashes: |||, ---, \\
    text = re.sub(r'[\|\\-]+', ' ', text) 
    
    # 5. Ganti newline (dan tab) jadi spasi tunggal
    text = re.sub(r'\s*\n\s*|\s*\t\s*', ' ', text)
    
    # 6. Hapus spasi berlebih
    text = re.sub(r'\s{2,}', ' ', text)
    
    return text.strip()

In [17]:
def check_quality(text: str) -> bool:
    """
    - Panjang minimal 50 
    - Filter tanda baca di akhir DIHAPUS
    """
    if not text: 
        return False 
    if len(text) < 100: 
        return False 
        

    if len(text) == 0:
        return False
    digit_ratio = sum(c.isdigit() for c in text) / len(text)
    if digit_ratio > 0.3: 
        return False        
    return True

In [18]:
try:
    if not os.path.exists(PRETRAINED_MODEL_PATH):
        print(f"Error: Model fastText '{PRETRAINED_MODEL_PATH}' tidak ditemukan.")
        lid_model = None
    else:
        lid_model = fasttext.load_model(PRETRAINED_MODEL_PATH)
        print("Model deteksi bahasa (fastText) dimuat.")
except Exception as e:
    print(f"Error memuat model fastText: {e}")
    lid_model = None

def detect_language_indo_batak(text: str, model=lid_model, confidence_threshold=0.4):

    if model is None or not text or not isinstance(text, str):
        return False
    
    try:
        predictions = model.predict(text, k=1)
        lang_code = predictions[0][0].replace('__label__', '')
        confidence = predictions[1][0]
        
        is_target_language = (lang_code == 'id' or lang_code == 'bts')
        
        return is_target_language and (confidence >= confidence_threshold)
    except Exception as e:
        logger.warning(f"FastText prediction error: {e}")
        return False

Model deteksi bahasa (fastText) dimuat.


In [None]:
def filter_near_duplicates(df: pd.DataFrame, text_column: str = 'text', threshold: float = 0.90, num_perm: int = 128) -> pd.DataFrame:
    """
    Memfilter duplikat atau dokumen yang sangat mirip menggunakan MinHash LSH.
    """
    print(f"Running Near-Duplicate Cleaning (Threshold={threshold})...")
    initial_count = len(df)
    
    minhashes = {}
    for index, row in tqdm(df.iterrows(), total=initial_count, desc="1/3 Creating MinHashes"):
        text = str(row[text_column])
        m = MinHash(num_perm=num_perm)
        for d in text.split(): # Tokenisasi gunakan .split 
            m.update(d.encode('utf8'))
        minhashes[index] = m

    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for index, m in tqdm(minhashes.items(), desc="2/3 Indexing LSH"):
        lsh.insert(index, m)

    unique_ids = set()
    processed_ids = set() 
    
    for index in tqdm(df.index, desc="3/3 Querying Duplicates"):
        if index in processed_ids:
            continue
            
        similar_items = set(lsh.query(minhashes[index]))
        processed_ids.update(similar_items)
        unique_ids.add(index)
    df_filtered = df.loc[list(unique_ids)]
    removed = initial_count - len(df_filtered)
    print(f"Done. Documents removed (duplicates): {removed}")
    
    return df_filtered.copy()

print("Near-duplicate filter function diinisiasi.")

Near-duplicate filter function diinisiasi.


In [None]:
try:
    
    df_batak = pd.read_csv(CSV_FILE_PATH)
    initial_doc_count = len(df_batak)
    print(f"\nTotal dokumen dimuat: {initial_doc_count}")

    INPUT_COLUMN = 'md_extraction_result'
    CLEAN_COLUMN = 'text_clean'

    df_batak.dropna(subset=[INPUT_COLUMN], inplace=True)
    print(f"Dokumen setelah drop NaN: {len(df_batak)}")

    # 1. Preprocessing (Membersihkan Markdown) 
    print("\nStarting (1) Preprocessing (Markdown Clean)...")
    df_batak[CLEAN_COLUMN] = df_batak[INPUT_COLUMN].apply(clean_markdown_text)
    print("Selesai.")

    # 2. Quality Filtering  
    print("\nStarting (2) Quality Filtering (Versi Santai)...")
    df_batak['is_quality_safe'] = df_batak[CLEAN_COLUMN].apply(check_quality)
    df_clean = df_batak[df_batak['is_quality_safe'] == True].copy()
    print(f"Quality Filtered. Dokumen tersisa: {len(df_clean)}")
    logger.info(f"Quality Filtered: {len(df_batak) - len(df_clean)} dokumen terbuang.")

    # 3. Language Identification (Filter 'id' + 'bts') 
    print("\nStarting (3) Language Identification...")
    if lid_model is not None:
        df_clean['is_target_lang'] = df_clean[CLEAN_COLUMN].apply(detect_language_indo_batak)
        df_lang_filtered = df_clean[df_clean['is_target_lang'] == True].copy()
        print(f"Language Filtered. Dokumen tersisa: {len(df_lang_filtered)}")
        logger.info(f"Language Filtered: {len(df_clean) - len(df_lang_filtered)} dokumen (non-target) terbuang.")
    else:
        print("Model fastText tidak dimuat, melewati langkah Language Filter.")
        df_lang_filtered = df_clean.copy() 

    # 4. Near-Duplicate Filtering
    print("\nStarting (4) Near-Duplicate Filtering...")
    df_lang_filtered.reset_index(drop=True, inplace=True) 
    df_final = filter_near_duplicates(df_lang_filtered, text_column=CLEAN_COLUMN, threshold=0.95, num_perm=128)
    print(f"Near-Duplicates Filtered. Dokumen tersisa: {len(df_final)}")
    logger.info(f"Near-Duplicate Filtered: {len(df_lang_filtered) - len(df_final)} dokumen terbuang.")

    # 5. Penyimpanan Hasil Bersih 
    print("\nStarting (5) Saving cleaned data...")
    final_columns = [CLEAN_COLUMN, INPUT_COLUMN, 'extracted_meaningful_text_v2']
    cols_to_save = [col for col in final_columns if col in df_final.columns]
    
    df_final[cols_to_save].to_json(OUTPUT_CLEAN, orient='records', lines=True)

    final_doc_count = len(df_final)
    print(f"\n--- PROSES SELESAI ---")
    print(f"Dokumen awal: {initial_doc_count}")
    print(f"Dokumen akhir: {final_doc_count}.")
    print(f"Data bersih disimpan di: {OUTPUT_CLEAN}")
    logger.info(f"Proses selesai. DokMen awal: {initial_doc_count}, Dokumen akhir: {final_doc_count}")

except Exception as e:
    print(f"\n--- !!! ERROR DI TENGAH JALAN !!! ---")
    print(f"Error: {e}")
    logger.error(f"Pipeline gagal: {e}")


Total dokumen dimuat: 47
Dokumen setelah drop NaN: 47

Starting (1) Preprocessing (Markdown Clean)...


[32m2025-11-17 13:05:10.050[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m23[0m - [1mQuality Filtered: 0 dokumen terbuang.[0m


Selesai.

Starting (2) Quality Filtering (Versi Santai)...
Quality Filtered. Dokumen tersisa: 47

Starting (3) Language Identification...


[32m2025-11-17 13:05:10.217[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m31[0m - [1mLanguage Filtered: 0 dokumen (non-target) terbuang.[0m


Language Filtered. Dokumen tersisa: 47

Starting (4) Near-Duplicate Filtering...
Running Near-Duplicate Cleaning (Threshold=0.95)...


1/3 Creating MinHashes: 100%|██████████| 47/47 [00:01<00:00, 29.99it/s]
2/3 Indexing LSH: 100%|██████████| 47/47 [00:00<00:00, 72555.13it/s]
3/3 Querying Duplicates: 100%|██████████| 47/47 [00:00<00:00, 96350.09it/s]
[32m2025-11-17 13:05:11.810[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m41[0m - [1mNear-Duplicate Filtered: 0 dokumen terbuang.[0m
[32m2025-11-17 13:05:11.843[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m55[0m - [1mProses selesai. DokMen awal: 47, Dokumen akhir: 47[0m


Done. Documents removed (duplicates): 0
Near-Duplicates Filtered. Dokumen tersisa: 47

Starting (5) Saving cleaned data...

--- PROSES SELESAI ---
Dokumen awal: 47
Dokumen akhir: 47.
Data bersih disimpan di: batak_pdfs_CLEANED.xls
