In [9]:
import pandas as pd
import re  
from loguru import logger
import numpy as np
from datasketch import MinHash, MinHashLSH
from tqdm import tqdm
import fasttext 
import os


# Add a loguru sink that writes to "batak_processing.log" (level INFO and above),
# using a "10 MB" rotation setting, then confirm that the import cell ran.
logger.add("batak_processing.log", rotation="10 MB", level="INFO")
print("Dependencies loaded successfully.")

Dependencies loaded successfully.


In [10]:

# Path of the input CSV (read with pd.read_csv further down in the notebook).
CSV_FILE_PATH = "clean_batak_scraped_pdfs.csv"
# NOTE(review): OUTPUT_CLEAN is not referenced by any cell visible here — confirm it is still needed.
OUTPUT_CLEAN = "batak_pdfs_CLEANED.xls"
# Path of the fastText model file passed to fasttext.load_model.
PRETRAINED_MODEL_PATH = 'lid.176.ftz' 

print("Configuration loaded.")

Configuration loaded.


In [11]:
# Load the input CSV through the configured path instead of repeating the
# file name, so the notebook has a single place to change the input.
df_batak = pd.read_csv(CSV_FILE_PATH)

In [12]:
import re

def clean_markdown_text(text: str) -> str:
    """
    Strip PDF-extraction noise (image tags, LaTeX-like commands, Markdown
    symbols) from ``text`` and collapse its whitespace.

    Args:
        text: Raw extracted text. Anything that is not a ``str`` is treated
            as missing.

    Returns:
        The cleaned, stripped string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        # Image tags such as !(page0Picture3.jpeg)
        (r'!\(.*?\)', ''),
        # LaTeX-like commands: \sigma, \boldsymbol{...}, \eta
        (r'\\[a-zA-Z]+(?:\{.*?\})?', ''),
        # Markdown / noise characters: # * _ ` $ [ ]
        (r'[#\*\_`$\[\]]+', ''),
        # Runs of pipes, backslashes and dashes become one space
        (r'[\|\\-]+', ' '),
        # Newlines and tabs (with surrounding whitespace) become one space
        (r'\s*\n\s*|\s*\t\s*', ' '),
        # Collapse any remaining whitespace runs
        (r'\s{2,}', ' '),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

In [13]:
def check_quality(text: str, min_length: int = 100, max_digit_ratio: float = 0.3) -> bool:
    """
    Decide whether ``text`` is good enough to keep.

    A text passes when it is a string of at least ``min_length`` characters
    and at most ``max_digit_ratio`` of its characters are digits. There is no
    check on trailing punctuation.

    The previous docstring stated a minimum length of 50 while the code
    enforced 100; the default here keeps the enforced value (100).

    Args:
        text: Candidate text.
        min_length: Minimum number of characters required.
        max_digit_ratio: Largest accepted share of digit characters (0..1).

    Returns:
        ``True`` if the text passes both checks, otherwise ``False``.
        Non-string input (``None``, ``NaN``, ...) returns ``False``.
    """
    # Missing values read from a DataFrame are not strings; reject them
    # instead of failing on len().
    if not isinstance(text, str) or not text:
        return False
    if len(text) < min_length:
        return False

    digit_ratio = sum(ch.isdigit() for ch in text) / len(text)
    return digit_ratio <= max_digit_ratio

In [14]:
import pandas as pd
import re
import os
import fasttext
from loguru import logger

# Load the fastText language-identification model into the global `lid_model`.
# `lid_model` ends up as None when the file is missing or loading raises, so
# the failure is only reported via print and the cell itself does not raise.
try:
    if not os.path.exists(PRETRAINED_MODEL_PATH):
        print(f"Error: Model fastText '{PRETRAINED_MODEL_PATH}' tidak ditemukan.")
        lid_model = None
    else:
        lid_model = fasttext.load_model(PRETRAINED_MODEL_PATH)
        print("Model deteksi bahasa (fastText) dimuat.")
except Exception as e:
    print(f"Error memuat model fastText: {e}")
    lid_model = None

def split_text_by_language(text: str, model=None):
    """
    Split ``text`` into a Batak part and an Indonesian part, sentence by sentence.

    The text is split on ``.``; sentences with fewer than 3 words are ignored.
    A sentence goes to the Indonesian part when the model's top label is
    ``id`` with confidence above 0.5; every other sentence goes to the Batak
    part.

    Args:
        text: Document text.
        model: Object with a fastText-style ``predict(text, k=1)`` method.
            When omitted, the notebook-level ``lid_model`` is looked up at
            call time, so a model reloaded in a later cell is picked up
            without re-running this definition.

    Returns:
        ``(batak_text, indonesian_text)``, each joined with ``". "``.
        ``("", "")`` when no model is available or ``text`` is empty or not
        a string.

    Note:
        A later cell in this notebook defines ``split_text_by_language``
        again with different thresholds; whichever cell ran last wins.
    """
    import warnings

    if model is None:
        model = globals().get("lid_model")
    if model is None or not text or not isinstance(text, str):
        return "", ""

    # 1. Remove image/page artifacts such as !(page0Picture3.jpeg)
    text = re.sub(r'!\(page\d+Picture.*?\)', '', text)

    batak_buffer = []
    indo_buffer = []
    failed = 0

    # 2. Split into sentences on full stops
    for raw_sentence in text.split('.'):
        # fastText's predict() rejects input containing a newline, so each
        # sentence is flattened to a single line first. Previously such
        # sentences raised inside the bare `except` and were dropped silently.
        sent = " ".join(raw_sentence.split())

        # Skip sentences shorter than 3 words
        if len(sent.split()) < 3:
            continue

        try:
            # 3. Predict the language of this sentence
            prediction = model.predict(sent, k=1)
            lang_code = prediction[0][0].replace('__label__', '')
            confidence = prediction[1][0]
        except Exception:
            # Best effort: keep going, but count the failure so it is reported.
            failed += 1
            continue

        # Indonesian: label 'id' with confidence > 0.5; everything else is Batak
        if lang_code == 'id' and confidence > 0.5:
            indo_buffer.append(sent)
        else:
            batak_buffer.append(sent)

    if failed:
        warnings.warn(
            f"split_text_by_language: skipped {failed} sentence(s) because "
            "language prediction failed",
            RuntimeWarning,
            stacklevel=2,
        )

    return ". ".join(batak_buffer), ". ".join(indo_buffer)

Model deteksi bahasa (fastText) dimuat.


In [15]:
def filter_near_duplicates(df: pd.DataFrame, text_column: str = 'text', threshold: float = 0.90, num_perm: int = 128) -> pd.DataFrame:
    """
    Drop exact and near-duplicate rows using MinHash LSH.

    Rows are visited in their original order. The first row of each group of
    similar rows is kept, and every row the LSH index returns as similar to
    it is discarded.

    Rows are tracked by position rather than by index label, so the result
    keeps the input order and a DataFrame with repeated index labels is
    handled. Previously survivors were collected in a ``set`` and selected
    with ``df.loc``, which returned them in arbitrary order and broke on
    duplicate labels.

    Args:
        df: Input frame.
        text_column: Column holding the text; values are converted with ``str``.
        threshold: Similarity threshold passed to ``MinHashLSH``.
        num_perm: Number of permutations for ``MinHash`` / ``MinHashLSH``.

    Returns:
        A copy of the surviving rows, in their original order.
    """
    print(f"Running Near-Duplicate Cleaning (Threshold={threshold})...")
    initial_count = len(df)

    # One MinHash per row, built from whitespace-separated tokens.
    minhashes = []
    for value in tqdm(df[text_column], total=initial_count, desc="1/3 Creating MinHashes"):
        signature = MinHash(num_perm=num_perm)
        for token in str(value).split():
            signature.update(token.encode('utf8'))
        minhashes.append(signature)

    # Index every row under its position (unique by construction).
    lsh = MinHashLSH(threshold=threshold, num_perm=num_perm)
    for position, signature in enumerate(tqdm(minhashes, desc="2/3 Indexing LSH")):
        lsh.insert(position, signature)

    kept_positions = []      # ordered list of survivors
    covered_positions = set()  # rows already assigned to some survivor's group

    for position in tqdm(range(initial_count), desc="3/3 Querying Duplicates"):
        if position in covered_positions:
            continue
        # The query result is added wholesale, as before: it covers every
        # row the index reports as similar to this one.
        covered_positions.update(lsh.query(minhashes[position]))
        kept_positions.append(position)

    df_filtered = df.iloc[kept_positions]
    removed = initial_count - len(df_filtered)
    print(f"Done. Documents removed (duplicates): {removed}")

    return df_filtered.copy()

print("Near-duplicate filter function diinisiasi.")

Near-duplicate filter function diinisiasi.


In [19]:
import re

def extract_italic_text(text: str) -> str:
    """
    Collect the text found between matching ``*...*`` or ``_..._`` markers.

    Args:
        text: Markdown-like text. Non-string input is treated as missing.

    Returns:
        The captured fragments joined with single spaces, or ``""`` when
        there are none (or the input is not a string).
    """
    if not isinstance(text, str):
        return ""
    # Capture whatever sits between a marker and the next identical marker.
    matches = re.findall(r'(\*|_)(.*?)\1', text)
    # Doubled markers (`**`, `__`) match with an empty capture; skip those so
    # they do not add stray spaces to the result.
    return " ".join(inner for _, inner in matches if inner.strip())

In [36]:
import pandas as pd
import re
import os
import fasttext

# --- 1. CONFIGURATION ---
# JSON Lines output files written by the pipeline step at the end of this cell.
OUTPUT_BATAK = "batak_dataset_FINAL.jsonl"
OUTPUT_INDO = "indonesia_dataset_FINAL.jsonl"
# Input CSV and fastText model path (same values as in the earlier configuration cell).
CSV_FILE_PATH = "clean_batak_scraped_pdfs.csv" 
PRETRAINED_MODEL_PATH = 'lid.176.ftz'

# Load the model. `lid_model` stays None when the file is missing or loading
# raises; both cases are only reported via print.
lid_model = None
try:
    if os.path.exists(PRETRAINED_MODEL_PATH):
        lid_model = fasttext.load_model(PRETRAINED_MODEL_PATH)
        print("Model FastText berhasil dimuat.")
    else:
        print("WARNING: Model tidak ditemukan.")
except Exception as e:
    print(f"Error loading model: {e}")

# --- 2. CLEANING FUNCTION (REVISED: STRIP EXCLAMATION MARKS & EMPTY PARENTHESES) ---

# Thesis front-matter / administrative boilerplate removed in step 3.
# Every pattern is anchored on word boundaries so it cannot cut pieces out of
# ordinary words (e.g. "boleh", "deskripsi", "abstraksi").
_METADATA_NOISE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r'(?i)\bNIM\s*[\.:]?\s*\d+',      # student number
        r'(?i)\bNIP\s*[\.:]?\s*\d+',      # employee number
        r'(?i)\bNo\.?\s*Reg\s*[\.:]?\s*\d+',
        r'(?i)\bSKRIPSI\b',
        r'(?i)\bDISUSUN OLEH.*',          # removes the rest of the line
        r'(?i)\bOLEH\b\s*:?',
        r'(?i)\bFAKULTAS\s+\w+',
        r'(?i)\bUNIVERSITAS\s+\w+',
        r'(?i)\bPROGRAM STUDI\b',
        r'(?i)\bJURUSAN\b',
        # Chapter headings. The roman numeral is matched case-sensitively so
        # that a word like "babi" is not treated as "BAB" + "I".
        r'\b(?i:BAB)\s*[IVX]+\b',
        r'(?i)\bDipindai dengan.*',       # removes the rest of the line
        r'(?i)\bScanned by.*',            # removes the rest of the line
        r'(?i)\bDAFTAR PUSTAKA\b',
        r'(?i)\bKATA PENGANTAR\b',
        r'(?i)\bABSTRAK\b',
    )
)

# Tokens exempt from the short-word filter in step 10.
_SHORT_WORDS_TO_KEEP = frozenset({'di', 'ke', 'dari', 'dan', 'atau', 'yang'})


def cleaning_ultimate(text: str) -> str:
    """
    Aggressively clean text extracted from scanned thesis PDFs.

    In order: HTML/XML tags, image tags and page/picture labels, thesis
    front-matter boilerplate, leftover image extensions, LaTeX fragments,
    parenthesised content, separator lines, stray symbols, standalone
    numbers, and tokens of two characters or fewer are removed. Whitespace is
    collapsed to single spaces.

    Boilerplate and page/picture patterns only match whole words; earlier
    they also matched inside words, which damaged ordinary vocabulary.

    Args:
        text: Raw extracted text. Non-string input is treated as missing.

    Returns:
        The cleaned text, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    # 1. Remove every HTML/XML tag (<...>)
    text = re.sub(r'<[^>]+>', ' ', text)

    # 2. Remove PDF image tags "!(...)" and page/picture labels such as
    #    "page0Picture3", "Page 12" or "Picture 4" (whole words only).
    text = re.sub(r'!\(.*?\)', ' ', text)
    text = re.sub(r'(?i)\bpage\s*\d*(?:picture\s*\d*)?\b|\bpicture\s*\d*\b', ' ', text)

    # 3. Remove thesis headers and administrative metadata
    for noise in _METADATA_NOISE_PATTERNS:
        text = noise.sub(' ', text)

    # 4. Remove leftover image file extensions
    text = re.sub(r'(?i)(\.jpeg|\.jpg|\.png)', ' ', text)

    # 5. Remove LaTeX: inline math $...$, \mathbf{...}, other \command{...}
    text = re.sub(r'\$.*?\$', ' ', text)
    text = re.sub(r'\\mathbf\s*\{[^}]*\}', ' ', text)
    text = re.sub(r'\\[a-zA-Z]+\s*\{[^}]*\}', ' ', text)

    # 6. Remove parentheses together with their content
    text = re.sub(r'\([^)]*\)', ' ', text)

    # 7. Remove separator lines made of 3+ dashes, underscores or equals signs
    text = re.sub(r'-{3,}', ' ', text)
    text = re.sub(r'_{3,}', ' ', text)
    text = re.sub(r'={3,}', ' ', text)

    # 8. Remove exclamation marks and other stray symbols
    text = re.sub(r'[!\[\]\{\}\*\_#\|\\^~<>=]+', ' ', text)

    # 9. Remove standalone numbers (e.g. page numbers)
    text = re.sub(r'\b\d+\b', ' ', text)

    # 10. Drop tokens of two characters or fewer unless whitelisted.
    # NOTE(review): this also discards every other two-letter token, whatever
    # the language — confirm that is acceptable for the Batak corpus.
    kept_words = [
        word for word in text.split()
        if len(word) > 2 or word.lower() in _SHORT_WORDS_TO_KEEP
    ]

    # 11. Joining the split tokens already yields single-spaced, trimmed text.
    return ' '.join(kept_words)

# --- 3. LANGUAGE SPLIT FUNCTION ---
def split_text_by_language(text: str, model=None):
    """
    Split ``text`` into a Batak part and an Indonesian part, sentence by sentence.

    The text is split on ``.``. Sentences with fewer than 3 words or fewer
    than 15 characters are ignored. A sentence goes to the Indonesian part
    when the model's top label is ``id`` with confidence above 0.6. Any other
    sentence goes to the Batak part unless its label is one of ``ja``,
    ``zh``, ``ko``, ``ru``, ``ar`` or ``en``, in which case it is dropped.

    This definition replaces the ``split_text_by_language`` defined in an
    earlier cell of the notebook.

    Args:
        text: Document text.
        model: Object with a fastText-style ``predict(text, k=1)`` method.
            When omitted, the notebook-level ``lid_model`` is looked up at
            call time, so a reloaded model is picked up without re-running
            this definition.

    Returns:
        ``(batak_text, indonesian_text)``, each joined with ``". "``.
        ``("", "")`` when no model is available or ``text`` is empty or not
        a string.
    """
    import warnings

    if model is None:
        model = globals().get("lid_model")
    if model is None or not text or not isinstance(text, str):
        return "", ""

    # Labels that are neither Indonesian nor plausible Batak residue.
    excluded_langs = ('ja', 'zh', 'ko', 'ru', 'ar', 'en')

    batak_buf, indo_buf = [], []
    failed = 0

    for raw_sentence in text.split('.'):
        # fastText's predict() rejects input containing a newline; flatten the
        # sentence so it is classified instead of being dropped by the
        # exception handler.
        sent = " ".join(raw_sentence.split())

        # Skip short sentences (< 3 words) or short strings (< 15 characters)
        if len(sent.split()) < 3 or len(sent) < 15:
            continue

        try:
            pred = model.predict(sent, k=1)
            lang = pred[0][0].replace('__label__', '')
            conf = pred[1][0]
        except Exception:
            # Best effort: keep going, but count the failure so it is reported.
            failed += 1
            continue

        if lang == 'id' and conf > 0.6:
            # Strict Indonesian
            indo_buf.append(sent)
        elif lang not in excluded_langs:
            # Batak (residual)
            batak_buf.append(sent)

    if failed:
        warnings.warn(
            f"split_text_by_language: skipped {failed} sentence(s) because "
            "language prediction failed",
            RuntimeWarning,
            stacklevel=2,
        )

    return ". ".join(batak_buf), ". ".join(indo_buf)

# --- 4. PIPELINE EXECUTION ---
# Reads the CSV, cleans one text column, splits each document by language and
# writes two JSON Lines files. Any exception is caught and only printed, so
# this cell never raises.
try:
    print("1. Memuat Data CSV...")
    df = pd.read_csv(CSV_FILE_PATH)
    
    # Use 'md_extraction_result' when present; otherwise fall back to the
    # first object-dtype column. Rows missing that column are dropped in place.
    input_col = 'md_extraction_result'
    if input_col not in df.columns:
        input_col = df.select_dtypes(include=['object']).columns[0]
    df.dropna(subset=[input_col], inplace=True)
    
    print("2. BERSIH-BERSIH TOTAL (ANTI HTML, TANDA SERU, KURUNG KOSONG)...")
    df['clean_temp'] = df[input_col].apply(cleaning_ultimate)
    
    print("3. Memisahkan Bahasa...")
    # Each element is a (batak_text, indonesian_text) tuple.
    split_results = df['clean_temp'].apply(split_text_by_language)
    
    # Keep only parts longer than 50 characters. The two lists are filtered
    # independently, so they are not aligned row by row.
    batak_list = [res[0] for res in split_results if len(res[0]) > 50]
    indo_list = [res[1] for res in split_results if len(res[1]) > 50]
    
    # Note: this rebinds `df_batak`, which an earlier cell assigned to the raw CSV.
    df_batak = pd.DataFrame({'text': batak_list})
    df_indo = pd.DataFrame({'text': indo_list})

    print("4. Menyimpan Hasil...")
    # One JSON object per line.
    df_batak.to_json(OUTPUT_BATAK, orient='records', lines=True)
    df_indo.to_json(OUTPUT_INDO, orient='records', lines=True)
    
    print(f"\n--- SUKSES! ---")
    print(f"Batak Bersih: {len(df_batak)} baris -> {OUTPUT_BATAK}")
    print(f"Indo Bersih : {len(df_indo)} baris -> {OUTPUT_INDO}")

except Exception as e:
    print(f"\nCRITICAL ERROR: {e}")

Model FastText berhasil dimuat.
1. Memuat Data CSV...
2. BERSIH-BERSIH TOTAL (ANTI HTML, TANDA SERU, KURUNG KOSONG)...
3. Memisahkan Bahasa...
4. Menyimpan Hasil...

--- SUKSES! ---
Batak Bersih: 47 baris -> batak_dataset_FINAL.jsonl
Indo Bersih : 47 baris -> indonesia_dataset_FINAL.jsonl
