In [7]:
import re
import unicodedata
import emoji
import pandas as pd
from unidecode import unidecode
import ftfy
from cleantext import clean
import nltk
from tqdm import tqdm
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

In [8]:
# Load the comment dataset from the notebook's working directory.
# NOTE(review): the preview printed below lists 'Unnamed: 0.1' and 'Unnamed: 0' columns,
# which look like index columns written by earlier to_csv calls - confirm whether they
# should be dropped or consumed via index_col.
df = pd.read_csv('komentar.csv')

In [9]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0.1,Unnamed: 0,comment_id,video_id,author,comment_text,published_at,like_count
0,0,UgwQfy0VpXXIRhaTF5B4AaABAg,YZ4N8jH5R_M,@nurjannah8803,Di acaranya indra harusnya serame ini suaranya...,2025-11-11T04:23:30Z,0
1,1,Ugx4wAAoBODqxmyAwE14AaABAg,YZ4N8jH5R_M,@pupungpamungkas5557,iya mirip teh rosa hehe,2025-11-11T04:18:27Z,0
2,2,UgyJEH-tNGIabbj3YXN4AaABAg,YZ4N8jH5R_M,@kurt___,üëç,2025-11-10T17:47:47Z,0
3,3,UgwMidWTArDF-YcnQzV4AaABAg,YZ4N8jH5R_M,@rhiiyoanggara6419,Terlalu dar der dor üòÖüòÖüòÖ,2025-11-10T16:52:14Z,0
4,4,UgzbQiD2vSzb217TdIR4AaABAg,YZ4N8jH5R_M,@samuelmanalumanalu8681,Ga enak ga emosi pak Jarwo...,2025-11-10T15:38:29Z,0


In [10]:
class JudolTextCleaner:
    def __init__(self, domain_number_strategy: str = 'preserve', number_replacement_strategy: str = 'smart') -> None:
        """
        Set up the cleaner: Sastrawi helpers, strategy options and lookup tables.

        Both strategy values are only stored on the instance here; the code
        that interprets them is outside this method.

        Parameters:
        domain_number_strategy (str): How to treat digits attached to a domain name
            - 'remove': drop the trailing digits (pstoto99 -> pstoto)
            - 'preserve': keep the digits as a separate token (pstoto99 -> pstoto 99) [RECOMMENDED]
            - 'separate_token': use a special token (pstoto99 -> pstoto [DOMAIN_NUMBER])
        number_replacement_strategy (str): How to treat digits in the middle of a word
            - 'aggressive': replace every digit with a letter (insan4d -> insanad)
            - 'smart': replace only digits that form a judol word, keep the others (insan4d -> insan4d) [RECOMMENDED]
            - 'preserve': keep all original digits
        """

        # Build the Sastrawi stemmer and stopword remover for Indonesian
        factory = StemmerFactory()
        self.stemmer = factory.create_stemmer()
        stopword_factory = StopWordRemoverFactory()
        self.stopword_remover = stopword_factory.create_stop_word_remover()
        
        # Strategy configuration (stored as given; no validation is done here)
        self.domain_number_strategy = domain_number_strategy
        self.number_replacement_strategy = number_replacement_strategy
        
        # Mapping untuk number replacement
        self.number_map = {
            '0': 'o', '1': 'i', '2': 'z', '3': 'e', '4': 'a',
            '5': 's', '6': 'g', '7': 't', '8': 'b', '9': 'g',
            '!': 'i', '@': 'a', '$': 's', '+': 't'
        }

        # Emoji mapping untuk angka dan huruf
        self.emoji_numbers = {
            '1Ô∏è‚É£': '1', '2Ô∏è‚É£': '2', '3Ô∏è‚É£': '3', '4Ô∏è‚É£': '4', '5Ô∏è‚É£': '5', 
            '6Ô∏è‚É£': '6', '7Ô∏è‚É£': '7', '8Ô∏è‚É£': '8', '9Ô∏è‚É£': '9', '0Ô∏è‚É£': '0',
            '‚ûÄ': '1', '‚ûÅ': '2', '‚ûÇ': '3', '‚ûÉ': '4', '‚ûÑ': '5',
            '‚ûÖ': '6', '‚ûÜ': '7', '‚ûá': '8', '‚ûà': '9', 'üÑã': '0',
            'ü•á': '1', 'ü•à': '2', 'ü•â': '3', 'üèÜ': ' trophy ', 'üéØ': ' target ',
            '‚ù∂': '1', '‚ù∑': '2', '‚ù∏': '3', '‚ùπ': '4', '‚ù∫': '5',
            '‚ùª': '6', '‚ùº': '7', '‚ùΩ': '8', '‚ùæ': '9', '‚ùø': '10',
        }
        
        self.emoji_letters = {
            'üá¶': 'a', 'üáß': 'b', 'üá®': 'c', 'üá©': 'd', 'üá™': 'e',
            'üá´': 'f', 'üá¨': 'g', 'üá≠': 'h', 'üáÆ': 'i', 'üáØ': 'j',
            'üá∞': 'k', 'üá±': 'l', 'üá≤': 'm', 'üá≥': 'n', 'üá¥': 'o',
            'üáµ': 'p', 'üá∂': 'q', 'üá∑': 'r', 'üá∏': 's', 'üáπ': 't',
            'üá∫': 'u', 'üáª': 'v', 'üáº': 'w', 'üáΩ': 'x', 'üáæ': 'y',
            'üáø': 'z', 'üÖ∞': 'a', 'üÖ±': 'b', 'üÖ≤': 'c', 'üÖ≥': 'd',
            'üÖ¥': 'e', 'üÖµ': 'f', 'üÖ∂': 'g', 'üÖ∑': 'h', 'üÖ∏': 'i',
            'üÖπ': 'j', 'üÖ∫': 'k', 'üÖª': 'l', 'üÖº': 'm', 'üÖΩ': 'n',
            'üÖæ': 'o', 'üÖø': 'p', 'üÜÄ': 'q', 'üÜÅ': 'r', 'üÜÇ': 's',
            'üÜÉ': 't', 'üÜÑ': 'u', 'üÜÖ': 'v', 'üÜÜ': 'w', 'üÜá': 'x',
            'üÜà': 'y', 'üÜâ': 'z', 'üÖê': 'a', 'üÖë': 'b', 'üÖí': 'c',
            'üÖì': 'd', 'üÖî': 'e', 'üÖï': 'f', 'üÖñ': 'g', 'üÖó': 'h',
            'üÖò': 'i', 'üÖô': 'j', 'üÖö': 'k', 'üÖõ': 'l', 'üÖú': 'm',
            'üÖù': 'n', 'üÖû': 'o', 'üÖü': 'p', 'üÖ†': 'q', 'üÖ°': 'r',
            'üÖ¢': 's', 'üÖ£': 't', 'üÖ§': 'u', 'üÖ•': 'v', 'üÖ¶': 'w',
            'üÖß': 'x', 'üÖ®': 'y', 'üÖ©': 'z', '‚í∂': 'a', '‚í∑': 'b',
            '‚í∏': 'c', '‚íπ': 'd', '‚í∫': 'e', '‚íª': 'f', '‚íº': 'g',
            '‚íΩ': 'h', '‚íæ': 'i', '‚íø': 'j', '‚ìÄ': 'k', '‚ìÅ': 'l',
            '‚ìÇ': 'm', '‚ìÉ': 'n', '‚ìÑ': 'o', '‚ìÖ': 'p', '‚ìÜ': 'q',
            '‚ìá': 'r', '‚ìà': 's', '‚ìâ': 't', '‚ìä': 'u', '‚ìã': 'v',
            '‚ìå': 'w', '‚ìç': 'x', '‚ìé': 'y', '‚ìè': 'z',
        }

        # Leet speak patterns khusus bahasa Indonesia
        self.leet_speak_indonesia = {
            'b9s4n': 'bosan', 'b0s4n': 'bosan', 'b05an': 'bosan',
            '7un9kad': 'rungkad', 'tun9kad': 'rungkad', 'rungk4d': 'rungkad',
            'runk4d': 'rungkad', 'j04n': 'join', 'j01n': 'join',
            'buru4n': 'buruan', 'b3s4r': 'besar', 'k3c11': 'kecill',
            'murah4n': 'murahan', 'g4c0r': 'gacor', 'g4cor': 'gacor',
            'm4nt4p': 'mantap', 'm4nt4b': 'mantap', 's3r1u': 'serius',
            's3r1ous': 'serius', 'wd': 'wd', 'dp': 'deposit', 'd3p0': 'depo',
            'm4nd4ng': 'mandang', 's4k1t': 'sakit', 'b4ng3t': 'banget',
            'k3r3n': 'keren', 'h4ncur': 'hancur', 'm4ntu1': 'mantul',
            'p4st1': 'pasti', 't0p': 'top', 'pr0': 'pro', 'n0ob': 'noob',
            'c0b4': 'coba',
        }
        
        # Extended character normalization mapping - DIPERLUAS dan DIPERBAIKI
        self.extended_char_map = {
            # Latin Extended characters
            '√§': 'a', '√Ñ': 'a', '√•': 'a', '√Ö': 'a', '√¶': 'ae', '√Ü': 'ae',
            '√ß': 'c', '√á': 'c', '√∞': 'd', '√ê': 'd', '√´': 'e', '√ã': 'e',
            '√Ø': 'i', '√è': 'i', '√±': 'n', '√ë': 'n', '√∂': 'o', '√ñ': 'o',
            '√∏': 'o', '√ò': 'o', '√º': 'u', '√ú': 'u', '√ø': 'y', '≈∏': 'y',
            '≈æ': 'z', '≈Ω': 'z', '≈°': 's', '≈†': 's', 'ƒç': 'c', 'ƒå': 'c',
            'ƒá': 'c', 'ƒÜ': 'c', 'ƒü': 'g', 'ƒû': 'g', '≈ü': 's', '≈û': 's',
            'ƒ±': 'i', 'ƒ∞': 'i',
            
            # Greek letters yang sering digunakan sebagai pengganti
            'Œ±': 'a', 'Œ≤': 'b', 'Œ≥': 'g', 'Œ¥': 'd', 'Œµ': 'e', 'Œ∂': 'z',
            'Œ∑': 'h', 'Œ∏': 'th', 'Œπ': 'i', 'Œ∫': 'k', 'Œª': 'l', 'Œº': 'm',
            'ŒΩ': 'n', 'Œæ': 'x', 'Œø': 'o', 'œÄ': 'p', 'œÅ': 'r', 'œÉ': 's',
            'œÑ': 't', 'œÖ': 'u', 'œÜ': 'ph', 'œá': 'ch', 'œà': 'ps', 'œâ': 'w',
            'Œë': 'a', 'Œí': 'b', 'Œì': 'g', 'Œî': 'd', 'Œï': 'e', 'Œñ': 'z',
            'Œó': 'h', 'Œò': 'th', 'Œô': 'i', 'Œö': 'k', 'Œõ': 'l', 'Œú': 'm',
            'Œù': 'n', 'Œû': 'x', 'Œü': 'o', 'Œ†': 'p', 'Œ°': 'r', 'Œ£': 's',
            'Œ§': 't', 'Œ•': 'u', 'Œ¶': 'ph', 'Œß': 'ch', 'Œ®': 'ps', 'Œ©': 'w',
            
            # Cyrillic characters yang sering digunakan
            '–∞': 'a', '–±': 'b', '–≤': 'v', '–≥': 'g', '–¥': 'd', '–µ': 'e',
            '—ë': 'e', '–∂': 'zh', '–∑': 'z', '–∏': 'i', '–π': 'y', '–∫': 'k',
            '–ª': 'l', '–º': 'm', '–Ω': 'n', '–æ': 'o', '–ø': 'p', '—Ä': 'r',
            '—Å': 's', '—Ç': 't', '—É': 'u', '—Ñ': 'f', '—Ö': 'h', '—Ü': 'ts',
            '—á': 'ch', '—à': 'sh', '—â': 'sch', '—ä': '', '—ã': 'y', '—å': '',
            '—ç': 'e', '—é': 'yu', '—è': 'ya',
            '–ê': 'a', '–ë': 'b', '–í': 'v', '–ì': 'g', '–î': 'd', '–ï': 'e',
            '–Å': 'e', '–ñ': 'zh', '–ó': 'z', '–ò': 'i', '–ô': 'y', '–ö': 'k',
            '–õ': 'l', '–ú': 'm', '–ù': 'n', '–û': 'o', '–ü': 'p', '–†': 'r',
            '–°': 's', '–¢': 't', '–£': 'u', '–§': 'f', '–•': 'h', '–¶': 'ts',
            '–ß': 'ch', '–®': 'sh', '–©': 'sch', '–™': '', '–´': 'y', '–¨': '',
            '–≠': 'e', '–Æ': 'yu', '–Ø': 'ya',
            
            # Mathematical alphanumeric symbols
            'ùêÄ': 'a', 'ùêÅ': 'b', 'ùêÇ': 'c', 'ùêÉ': 'd', 'ùêÑ': 'e', 'ùêÖ': 'f',
            'ùêÜ': 'g', 'ùêá': 'h', 'ùêà': 'i', 'ùêâ': 'j', 'ùêä': 'k', 'ùêã': 'l',
            'ùêå': 'm', 'ùêç': 'n', 'ùêé': 'o', 'ùêè': 'p', 'ùêê': 'q', 'ùêë': 'r',
            'ùêí': 's', 'ùêì': 't', 'ùêî': 'u', 'ùêï': 'v', 'ùêñ': 'w', 'ùêó': 'x',
            'ùêò': 'y', 'ùêô': 'z', 'ùêö': 'a', 'ùêõ': 'b', 'ùêú': 'c', 'ùêù': 'd',
            'ùêû': 'e', 'ùêü': 'f', 'ùê†': 'g', 'ùê°': 'h', 'ùê¢': 'i', 'ùê£': 'j',
            'ùê§': 'k', 'ùê•': 'l', 'ùê¶': 'm', 'ùêß': 'n', 'ùê®': 'o', 'ùê©': 'p',
            'ùê™': 'q', 'ùê´': 'r', 'ùê¨': 's', 'ùê≠': 't', 'ùêÆ': 'u', 'ùêØ': 'v',
            'ùê∞': 'w', 'ùê±': 'x', 'ùê≤': 'y', 'ùê≥': 'z',
            
            'ùê¥': 'a', 'ùêµ': 'b', 'ùê∂': 'c', 'ùê∑': 'd', 'ùê∏': 'e', 'ùêπ': 'f',
            'ùê∫': 'g', 'ùêª': 'h', 'ùêº': 'i', 'ùêΩ': 'j', 'ùêæ': 'k', 'ùêø': 'l',
            'ùëÄ': 'm', 'ùëÅ': 'n', 'ùëÇ': 'o', 'ùëÉ': 'p', 'ùëÑ': 'q', 'ùëÖ': 'r',
            'ùëÜ': 's', 'ùëá': 't', 'ùëà': 'u', 'ùëâ': 'v', 'ùëä': 'w', 'ùëã': 'x',
            'ùëå': 'y', 'ùëç': 'z', 'ùëé': 'a', 'ùëè': 'b', 'ùëê': 'c', 'ùëë': 'd',
            'ùëí': 'e', 'ùëì': 'f', 'ùëî': 'g', '‚Ñé': 'h', 'ùëñ': 'i', 'ùëó': 'j',
            'ùëò': 'k', 'ùëô': 'l', 'ùëö': 'm', 'ùëõ': 'n', 'ùëú': 'o', 'ùëù': 'p',
            'ùëû': 'q', 'ùëü': 'r', 'ùë†': 's', 'ùë°': 't', 'ùë¢': 'u', 'ùë£': 'v',
            'ùë§': 'w', 'ùë•': 'x', 'ùë¶': 'y', 'ùëß': 'z',
            
            'ùíú': 'a', '‚Ñ¨': 'b', 'ùíû': 'c', 'ùíü': 'd', '‚Ñ∞': 'e', '‚Ñ±': 'f',
            'ùí¢': 'g', '‚Ñã': 'h', '‚Ñê': 'i', 'ùí•': 'j', 'ùí¶': 'k', '‚Ñí': 'l',
            '‚Ñ≥': 'm', 'ùí©': 'n', 'ùí™': 'o', 'ùí´': 'p', 'ùí¨': 'q', '‚Ñõ': 'r',
            'ùíÆ': 's', 'ùíØ': 't', 'ùí∞': 'u', 'ùí±': 'v', 'ùí≤': 'w', 'ùí≥': 'x',
            'ùí¥': 'y', 'ùíµ': 'z', 'ùí∂': 'a', 'ùí∑': 'b', 'ùí∏': 'c', 'ùíπ': 'd',
            '‚ÑØ': 'e', 'ùíª': 'f', '‚Ñä': 'g', 'ùíΩ': 'h', 'ùíæ': 'i', 'ùíø': 'j',
            'ùìÄ': 'k', 'ùìÅ': 'l', 'ùìÇ': 'm', 'ùìÉ': 'n', '‚Ñ¥': 'o', 'ùìÖ': 'p',
            'ùìÜ': 'q', 'ùìá': 'r', 'ùìà': 's', 'ùìâ': 't', 'ùìä': 'u', 'ùìã': 'v',
            'ùìå': 'w', 'ùìç': 'x', 'ùìé': 'y', 'ùìè': 'z',
            
            'ùìê': 'a', 'ùìë': 'b', 'ùìí': 'c', 'ùìì': 'd', 'ùìî': 'e', 'ùìï': 'f',
            'ùìñ': 'g', 'ùìó': 'h', 'ùìò': 'i', 'ùìô': 'j', 'ùìö': 'k', 'ùìõ': 'l',
            'ùìú': 'm', 'ùìù': 'n', 'ùìû': 'o', 'ùìü': 'p', 'ùì†': 'q', 'ùì°': 'r',
            'ùì¢': 's', 'ùì£': 't', 'ùì§': 'u', 'ùì•': 'v', 'ùì¶': 'w', 'ùìß': 'x',
            'ùì®': 'y', 'ùì©': 'z', 'ùì™': 'a', 'ùì´': 'b', 'ùì¨': 'c', 'ùì≠': 'd',
            'ùìÆ': 'e', 'ùìØ': 'f', 'ùì∞': 'g', 'ùì±': 'h', 'ùì≤': 'i', 'ùì≥': 'j',
            'ùì¥': 'k', 'ùìµ': 'l', 'ùì∂': 'm', 'ùì∑': 'n', 'ùì∏': 'o', 'ùìπ': 'p',
            'ùì∫': 'q', 'ùìª': 'r', 'ùìº': 's', 'ùìΩ': 't', 'ùìæ': 'u', 'ùìø': 'v',
            'ùîÄ': 'w', 'ùîÅ': 'x', 'ùîÇ': 'y', 'ùîÉ': 'z',
            
            'ùîÑ': 'a', 'ùîÖ': 'b', '‚Ñ≠': 'c', 'ùîá': 'd', 'ùîà': 'e', 'ùîâ': 'f',
            'ùîä': 'g', '‚Ñå': 'h', '‚Ñë': 'i', 'ùîç': 'j', 'ùîé': 'k', 'ùîè': 'l',
            'ùîê': 'm', 'ùîë': 'n', 'ùîí': 'o', 'ùîì': 'p', 'ùîî': 'q', '‚Ñú': 'r',
            'ùîñ': 's', 'ùîó': 't', 'ùîò': 'u', 'ùîô': 'v', 'ùîö': 'w', 'ùîõ': 'x',
            'ùîú': 'y', '‚Ñ®': 'z', 'ùîû': 'a', 'ùîü': 'b', 'ùî†': 'c', 'ùî°': 'd',
            'ùî¢': 'e', 'ùî£': 'f', 'ùî§': 'g', 'ùî•': 'h', 'ùî¶': 'i', 'ùîß': 'j',
            'ùî®': 'k', 'ùî©': 'l', 'ùî™': 'm', 'ùî´': 'n', 'ùî¨': 'o', 'ùî≠': 'p',
            'ùîÆ': 'q', 'ùîØ': 'r', 'ùî∞': 's', 'ùî±': 't', 'ùî≤': 'u', 'ùî≥': 'v',
            'ùî¥': 'w', 'ùîµ': 'x', 'ùî∂': 'y', 'ùî∑': 'z',
            
            'ùï¨': 'a', 'ùï≠': 'b', 'ùïÆ': 'c', 'ùïØ': 'd', 'ùï∞': 'e', 'ùï±': 'f',
            'ùï≤': 'g', 'ùï≥': 'h', 'ùï¥': 'i', 'ùïµ': 'j', 'ùï∂': 'k', 'ùï∑': 'l',
            'ùï∏': 'm', 'ùïπ': 'n', 'ùï∫': 'o', 'ùïª': 'p', 'ùïº': 'q', 'ùïΩ': 'r',
            'ùïæ': 's', 'ùïø': 't', 'ùñÄ': 'u', 'ùñÅ': 'v', 'ùñÇ': 'w', 'ùñÉ': 'x',
            'ùñÑ': 'y', 'ùñÖ': 'z', 'ùñÜ': 'a', 'ùñá': 'b', 'ùñà': 'c', 'ùñâ': 'd',
            'ùñä': 'e', 'ùñã': 'f', 'ùñå': 'g', 'ùñç': 'h', 'ùñé': 'i', 'ùñè': 'j',
            'ùñê': 'k', 'ùñë': 'l', 'ùñí': 'm', 'ùñì': 'n', 'ùñî': 'o', 'ùñï': 'p',
            'ùññ': 'q', 'ùñó': 'r', 'ùñò': 's', 'ùñô': 't', 'ùñö': 'u', 'ùñõ': 'v',
            'ùñú': 'w', 'ùñù': 'x', 'ùñû': 'y', 'ùñü': 'z',
            
            'ùñ†': 'a', 'ùñ°': 'b', 'ùñ¢': 'c', 'ùñ£': 'd', 'ùñ§': 'e', 'ùñ•': 'f',
            'ùñ¶': 'g', 'ùñß': 'h', 'ùñ®': 'i', 'ùñ©': 'j', 'ùñ™': 'k', 'ùñ´': 'l',
            'ùñ¨': 'm', 'ùñ≠': 'n', 'ùñÆ': 'o', 'ùñØ': 'p', 'ùñ∞': 'q', 'ùñ±': 'r',
            'ùñ≤': 's', 'ùñ≥': 't', 'ùñ¥': 'u', 'ùñµ': 'v', 'ùñ∂': 'w', 'ùñ∑': 'x',
            'ùñ∏': 'y', 'ùñπ': 'z', 'ùñ∫': 'a', 'ùñª': 'b', 'ùñº': 'c', 'ùñΩ': 'd',
            'ùñæ': 'e', 'ùñø': 'f', 'ùóÄ': 'g', 'ùóÅ': 'h', 'ùóÇ': 'i', 'ùóÉ': 'j',
            'ùóÑ': 'k', 'ùóÖ': 'l', 'ùóÜ': 'm', 'ùóá': 'n', 'ùóà': 'o', 'ùóâ': 'p',
            'ùóä': 'q', 'ùóã': 'r', 'ùóå': 's', 'ùóç': 't', 'ùóé': 'u', 'ùóè': 'v',
            'ùóê': 'w', 'ùóë': 'x', 'ùóí': 'y', 'ùóì': 'z',
            
            'ùóî': 'a', 'ùóï': 'b', 'ùóñ': 'c', 'ùóó': 'd', 'ùóò': 'e', 'ùóô': 'f',
            'ùóö': 'g', 'ùóõ': 'h', 'ùóú': 'i', 'ùóù': 'j', 'ùóû': 'k', 'ùóü': 'l',
            'ùó†': 'm', 'ùó°': 'n', 'ùó¢': 'o', 'ùó£': 'p', 'ùó§': 'q', 'ùó•': 'r',
            'ùó¶': 's', 'ùóß': 't', 'ùó®': 'u', 'ùó©': 'v', 'ùó™': 'w', 'ùó´': 'x',
            'ùó¨': 'y', 'ùó≠': 'z', 'ùóÆ': 'a', 'ùóØ': 'b', 'ùó∞': 'c', 'ùó±': 'd',
            'ùó≤': 'e', 'ùó≥': 'f', 'ùó¥': 'g', 'ùóµ': 'h', 'ùó∂': 'i', 'ùó∑': 'j',
            'ùó∏': 'k', 'ùóπ': 'l', 'ùó∫': 'm', 'ùóª': 'n', 'ùóº': 'o', 'ùóΩ': 'p',
            'ùóæ': 'q', 'ùóø': 'r', 'ùòÄ': 's', 'ùòÅ': 't', 'ùòÇ': 'u', 'ùòÉ': 'v',
            'ùòÑ': 'w', 'ùòÖ': 'x', 'ùòÜ': 'y', 'ùòá': 'z',
            
            'ùòà': 'a', 'ùòâ': 'b', 'ùòä': 'c', 'ùòã': 'd', 'ùòå': 'e', 'ùòç': 'f',
            'ùòé': 'g', 'ùòè': 'h', 'ùòê': 'i', 'ùòë': 'j', 'ùòí': 'k', 'ùòì': 'l',
            'ùòî': 'm', 'ùòï': 'n', 'ùòñ': 'o', 'ùòó': 'p', 'ùòò': 'q', 'ùòô': 'r',
            'ùòö': 's', 'ùòõ': 't', 'ùòú': 'u', 'ùòù': 'v', 'ùòû': 'w', 'ùòü': 'x',
            'ùò†': 'y', 'ùò°': 'z', 'ùò¢': 'a', 'ùò£': 'b', 'ùò§': 'c', 'ùò•': 'd',
            'ùò¶': 'e', 'ùòß': 'f', 'ùò®': 'g', 'ùò©': 'h', 'ùò™': 'i', 'ùò´': 'j',
            'ùò¨': 'k', 'ùò≠': 'l', 'ùòÆ': 'm', 'ùòØ': 'n', 'ùò∞': 'o', 'ùò±': 'p',
            'ùò≤': 'q', 'ùò≥': 'r', 'ùò¥': 's', 'ùòµ': 't', 'ùò∂': 'u', 'ùò∑': 'v',
            'ùò∏': 'w', 'ùòπ': 'x', 'ùò∫': 'y', 'ùòª': 'z',
            
            'ùòº': 'a', 'ùòΩ': 'b', 'ùòæ': 'c', 'ùòø': 'd', 'ùôÄ': 'e', 'ùôÅ': 'f',
            'ùôÇ': 'g', 'ùôÉ': 'h', 'ùôÑ': 'i', 'ùôÖ': 'j', 'ùôÜ': 'k', 'ùôá': 'l',
            'ùôà': 'm', 'ùôâ': 'n', 'ùôä': 'o', 'ùôã': 'p', 'ùôå': 'q', 'ùôç': 'r',
            'ùôé': 's', 'ùôè': 't', 'ùôê': 'u', 'ùôë': 'v', 'ùôí': 'w', 'ùôì': 'x',
            'ùôî': 'y', 'ùôï': 'z', 'ùôñ': 'a', 'ùôó': 'b', 'ùôò': 'c', 'ùôô': 'd',
            'ùôö': 'e', 'ùôõ': 'f', 'ùôú': 'g', 'ùôù': 'h', 'ùôû': 'i', 'ùôü': 'j',
            'ùô†': 'k', 'ùô°': 'l', 'ùô¢': 'm', 'ùô£': 'n', 'ùô§': 'o', 'ùô•': 'p',
            'ùô¶': 'q', 'ùôß': 'r', 'ùô®': 's', 'ùô©': 't', 'ùô™': 'u', 'ùô´': 'v',
            'ùô¨': 'w', 'ùô≠': 'x', 'ùôÆ': 'y', 'ùôØ': 'z',
            
            'ùô∞': 'a', 'ùô±': 'b', 'ùô≤': 'c', 'ùô≥': 'd', 'ùô¥': 'e', 'ùôµ': 'f',
            'ùô∂': 'g', 'ùô∑': 'h', 'ùô∏': 'i', 'ùôπ': 'j', 'ùô∫': 'k', 'ùôª': 'l',
            'ùôº': 'm', 'ùôΩ': 'n', 'ùôæ': 'o', 'ùôø': 'p', 'ùöÄ': 'q', 'ùöÅ': 'r',
            'ùöÇ': 's', 'ùöÉ': 't', 'ùöÑ': 'u', 'ùöÖ': 'v', 'ùöÜ': 'w', 'ùöá': 'x',
            'ùöà': 'y', 'ùöâ': 'z', 'ùöä': 'a', 'ùöã': 'b', 'ùöå': 'c', 'ùöç': 'd',
            'ùöé': 'e', 'ùöè': 'f', 'ùöê': 'g', 'ùöë': 'h', 'ùöí': 'i', 'ùöì': 'j',
            'ùöî': 'k', 'ùöï': 'l', 'ùöñ': 'm', 'ùöó': 'n', 'ùöò': 'o', 'ùöô': 'p',
            'ùöö': 'q', 'ùöõ': 'r', 'ùöú': 's', 'ùöù': 't', 'ùöû': 'u', 'ùöü': 'v',
            'ùö†': 'w', 'ùö°': 'x', 'ùö¢': 'y', 'ùö£': 'z',
            
            # Special symbols and brackets
            '„Äê': ' ', '„Äë': ' ', '„Äé': ' ', '„Äè': ' ', '„Äñ': ' ', '„Äó': ' ',
            '„Äå': ' ', '„Äç': ' ', 'ÔΩ¢': ' ', 'ÔΩ£': ' ', '„Äî': ' ', '„Äï': ' ',
            '„Äà': ' ', '„Äâ': ' ', '„Ää': ' ', '„Äã': ' ', '¬´': ' ', '¬ª': ' ',
            '„Äù': ' ', '„Äû': ' ', 'ÔºÇ': ' ', '‚Äü': ' ', '„Äü': ' ',
            'Ôºö': ' ', 'Ôºõ': ' ', 'Ôºå': ' ', '„ÄÇ': ' ', '„ÄÅ': ' ',
            'ÔºÅ': ' ', 'Ôºü': ' ', 'ÔΩû': ' ', '‚Äß': ' ', '„Éª': ' ',
            '¬¢': ' ', '@': ' ', '¬Æ': ' ', '¬©': ' ', '‚Ñ¢': ' ', '?': ' ',
            '‚ôú': ' ', '‚òÜ': ' ', 'üéØ': ' ', 'üêü': ' ', '‚ùà': ' ', '‚ú∑': ' ',
            'üéÄ': ' ', 'üíÆ': 'o', 'üèµ': 'o', '|': ' ', '!': ' ', '¬§': ' ',
            '*': ' ', "'": ' ', '~': ' ', '`': ' ', '¬Ø': ' ', '‚Ä¢': ' ', 
            ',': ' ', '¬∏': ' ', '¬¥': ' ', 'Œî': 'a', '·óØ': 'w', '·ó©': 'a',
            '‚Ä†': 't', '‰∏Ö': 't', '‚ìÑ': 'o', '~': ' ', '`': ' ', '¬¥': ' ',
            
            # TAMBAHAN BARU UNTUK PERBAIKAN ARWANATOTO:
            # Greek and special characters untuk "arwanatoto"
            '≈ò': 'r', 'Œ¨': 'a', 
            '«ü': 'a', ' Ä': 'r', '’°': 'w', '’º': 'n', '»∂': 't', '÷Ö': 'o',
            '√±': 'n', 
            'A“â': 'a', 'R“â': 'r', 'W“â': 'w', 'N“â': 'n', 'T“â': 't', 'O“â': 'o',
            
            # Special decorated characters
            'ùíú': 'a', 'ùëÖ': 'r', 'ùí≤': 'w', 'ùíú': 'a', 'ùí©': 'n', 'ùíØ': 't', 
            'üíÆ': 'o', 'üèµ': 'o', 'üç¨': 'o', '‚ô°': 'o', 'üíû': 'o',
            
            # Mathematical symbols
            'ùêÄ': 'a', 'ùêë': 'r', 'ùêñ': 'w', 'ùêÄ': 'a', 'ùêç': 'n', 'ùêì': 't', 'ùêé': 'o',
            'ùìê': 'a', 'ùì°': 'r', 'ùì¶': 'w', 'ùìê': 'a', 'ùìù': 'n', 'ùì£': 't', 'ùìû': 'o',
            'ùîÑ': 'a', '‚Ñú': 'r', 'ùîö': 'w', 'ùîÑ': 'a', 'ùîë': 'n', 'ùîó': 't', 'ùîí': 'o',
            
            # Tambahkan lebih banyak variant
            'üÖê': 'a', 'üÖ°': 'r', 'üÖ¶': 'w', 'üÖù': 'n', 'üÖ£': 't', 'üÖû': 'o',
            '‚í∂': 'a', '‚ìá': 'r', '‚ìå': 'w', '‚ìÉ': 'n', '‚ìâ': 't', '‚ìÑ': 'o',
            
            # Special case characters
            'œÉ': 'o', 'ùìΩ': 't', 'ùêé': 'o', 'ùïí': 'a', '—Ç': 't', 'ŒÆ': 'n',
            
            # Emoji dan simbol khusus
            'ü•á': '1', 'üèÜ': ' trophy ', 'üéØ': ' target ', 'üíé': ' diamond ',
            'üí∞': ' money ', 'üí∏': ' money ', 'ü§ë': ' money ', 'üíµ': ' money ',
            'üí¥': ' money ', 'üí∂': ' money ', 'üí∑': ' money ', 'üí≥': ' card ',
            'üíπ': ' chart ', '‚Üó': ' up ', '‚¨Ü': ' up ', '‚Üò': ' down ', 
            '‚¨á': ' down ', '‚¨Ö': ' left ', '‚û°': ' right ', '‚Üî': ' both ',
            'üîù': ' top ', 'üîô': ' back ', 'üîõ': ' on ', 'üîú': ' soon ',
            'üîö': ' end ', '‚úÖ': ' yes ', '‚úî': ' yes ', '‚úì': ' yes ',
            '‚ùå': ' no ', '‚úñ': ' no ', '‚ùé': ' no ', '‚ö†': ' warning ',
        }

        # Common judol domains untuk pattern recognition
        self.judol_domains = [
            'pstoto', 'toto', 'slot', 'poker', 'judi', 'bonus', 'arwana', 
            'pulau', 'win', 'casino', 'situs', 'bandar', 'sabung', 'taruhan',
            'insan', 'lazadatoto', 'paste4d', 'pandora4d', 'naga4d', 'hoki4d',
            'sendal4d', 'garudahoki', 'togel62', 'arwanatoto', 'pstoto99',
            'sgi88', 'sgi', 'sg188', 'sgi808', 'sgi888', 'sekali4d'  # TAMBAHKAN SEKALI4D
        ]

        # Brand names yang harus dipertahankan sebagai SATU KATA - DIPERBAIKI
        self.preserved_brands = {
            'insan4d': 'insan4d', 'pandora4d': 'pandora4d', 'naga4d': 'naga4d',
            'hoki4d': 'hoki4d', 'jaya4d': 'jaya4d', 'mega4d': 'mega4d',
            'super4d': 'super4d', 'lazadatoto': 'lazadatoto', 'lazada4d': 'lazada4d',
            'lazada88': 'lazada88', 'lazada77': 'lazada77', 'paste4d': 'paste4d',
            'pstoto99': 'pstoto99', 'pstoto88': 'pstoto88', 'pstoto77': 'pstoto77',
            'arwanatoto': 'arwanatoto', 'pulauwin': 'pulauwin', 'sendal4d': 'sendal4d',
            'garudahoki': 'garudahoki', 'togel62': 'togel62', 
            'sgi88': 'sgi88', 'sg188': 'sg188', 'sgi808': 'sgi808', 'sgi888': 'sgi888',
            'pstoto': 'pstoto', 'sekali4d': 'sekali4d'  # TAMBAHKAN SEKALI4D
        }

        # Common judol words untuk reconstruction - DIPERBAIKI
        self.judol_words_for_reconstruction = [
            'pulauwin', 'pulau', 'win', 'arwanatoto', 'arwana', 'toto',
            'lazadatoto', 'lazada', 'pstoto', 'pstoto99', 'pstoto88', 'pstoto77',
            'insan4d', 'pandora4d', 'paste4d', 'situs', 'slot', 'judi', 'togel', 
            'poker', 'bonus', 'deposit', 'withdraw', 'jackpot', 'freespin', 
            'casino', 'bandar', 'sabung', 'taruhan', 'rungkad', 'bosan', 'join', 
            'buruan', 'gacor', 'mantap', 'sendal4d', 'garudahoki', 'garuda', 
            'hoki', 'togel62', 'sgi88', 'sgi', 'sg188', 'sgi808', 'sgi888',
            'sekali4d'  # TAMBAHKAN SEKALI4D
        ]

        # Words yang mengandung angka tapi harus dipertahankan (brand names dengan angka) - DIPERBAIKI
        self.preserve_number_words = {
            'sendal4d', 'insan4d', 'pandora4d', 'naga4d', 'hoki4d', 'jaya4d',
            'mega4d', 'super4d', 'lazada4d', 'paste4d', 'pstoto99', 'pstoto88', 'pstoto77',
            'lazada88', 'garudahoki', 'togel62', 
            'sgi88', 'sg188', 'sgi808', 'sgi888', 'sekali4d'  # TAMBAHKAN SEKALI4D
        }

        # Common word combinations yang sering dipisah - DIPERBAIKI
        self.common_combinations = {
            'ga ruda ho ki': 'garudahoki',
            'ga ruda hoki': 'garudahoki',
            'garuda ho ki': 'garudahoki',
            'garuda hoki': 'garudahoki',
            'ga rudahoki': 'garudahoki',
            'pula uwin': 'pulauwin',
            'pulau win': 'pulauwin',
            'arwana toto': 'arwanatoto',
            'arwana to to': 'arwanatoto',
            'lazada toto': 'lazadatoto',
            'sendal 4d': 'sendal4d',
            'insan 4d': 'insan4d',
            'togel 62': 'togel62',
            'psto to': 'pstoto',
            'pstoto 99': 'pstoto99',
            'ps toto': 'pstoto',
            'sgi 88': 'sgi88',
            'sg 188': 'sg188',
            'sgi 808': 'sgi808',
            'sgi 888': 'sgi888',
            'di sgi88': 'di sgi88',
            'di pstoto99': 'di pstoto99',
            'sekali 4d': 'sekali4d',  # TAMBAHKAN SEKALI4D
        }

        # IMPORTANT WORDS yang TIDAK BOLEH dihapus oleh stopword removal
        self.important_words = {
            'saya', 'kamu', 'dia', 'kami', 'kita', 'mereka', 'ini', 'itu',
            'harapan', 'cuman', 'hanya', 'sekali', 'selalu', 'pernah', 'ingin',
            'mau', 'akan', 'bisa', 'dapat', 'boleh', 'harus', 'perlu', 'bisa',
            'membuat', 'menjadi', 'ubah', 'transformasi', 'dari', 'jadi',
            'pengantar', 'surat', 'manajer', 'direktur', 'bos', 'ketua',
            'manager', 'karyawan', 'pegawai', 'kerja', 'pekerjaan',
            'transaksi', 'deposit', 'withdraw', 'bonus', 'jackpot',
            'menang', 'kalah', 'untung', 'rugi', 'profit', 'hasil',
            'bertransformasi', 'berubah', 'hidup', 'nasib', 'kehidupan',
            'kaya', 'miskin', 'sukses', 'gagal', 'berhasil', 'pendapatan',
            'penghasilan', 'gaji', 'uang', 'duit', 'modal', 'investasi'
        }

        # Custom stopword list yang lebih selektif
        self.custom_stopwords = {
            'yang', 'di', 'ke', 'dari', 'pada', 'dalam', 'untuk', 'dengan', 
            'adalah', 'atau', 'tapi', 'dan', 'jika', 'karena', 'serta', 
            'oleh', 'itu', 'ini', 'saja', 'hanya', 'pun', 'lah', 'kah',
            'tah', 'pun', 'nya', 'ku', 'mu', 'kau', 'kami', 'kita', 'mereka',
            'saya', 'kamu', 'dia', 'beliau', 'para', 'si', 'sang', 'itu',
            'hal', 'per', 'oleh', 'agar', 'supaya', 'meski', 'walau',
            'sebab', 'karena', 'jika', 'kalau', 'apabila', 'seandainya',
            'agar', 'supaya', 'guna', 'untuk', 'demi', 'sebagai', 'laksana',
            'bak', 'ibarat', 'serupa', 'tanpa', 'dengan', 'secara', 'sambil',
            'seraya', 'selagi', 'sementara', 'ketika', 'tatkala', 'sewaktu',
            'sebelum', 'sesudah', 'setelah', 'hingga', 'sampai', 'semenjak',
            'sedari', 'seraya', 'sambil', 'seraya', 'sambil', 'seraya'
        }

    # ===== IMPROVED STOPWORD REMOVAL =====
    def selective_stopword_removal(self, text):
        """Stopword removal yang selektif - hanya menghapus stopwords umum"""
        words = text.split()
        filtered_words = []
        
        for word in words:
            # Jangan hapus jika:
            # 1. Termasuk important words
            # 2. Adalah brand/judol word  
            # 3. Mengandung angka
            # 4. Panjang kata > 3 karakter
            # 5. Bukan stopword custom
            word_lower = word.lower()
            if (word_lower in self.important_words or
                any(brand in word_lower for brand in self.preserved_brands) or
                any(char.isdigit() for char in word) or
                len(word) > 3 or
                word_lower not in self.custom_stopwords):
                filtered_words.append(word)
        
        return ' '.join(filtered_words)

    # ===== IMPROVED WORD SEGMENTATION =====
    def improved_word_segmentation(self, text):
        """Segmentasi kata yang lebih baik untuk kasus seperti 'disgi88membuat'"""
        # Pattern untuk memisahkan kata yang menempel pada brand
        patterns = [
            # Kasus: prefix + brand + kata (disgi88membuat -> di sgi88 membuat)
            (r'(\b\w{1,2})(sgi88|sg188|sgi808|sgi888)(\w+)\b', r'\1 \2 \3'),
            (r'(\b\w{1,2})(pstoto|arwanatoto|garudahoki)(\w+)\b', r'\1 \2 \3'),
            
            # Kasus: kata + brand (membuatsgi88 -> membuat sgi88)
            (r'(\b\w+)(sgi88|sg188|sgi808|sgi888)(\w{1,2}\b)', r'\1 \2 \3'),
            (r'(\b\w+)(pstoto|arwanatoto|garudahoki)(\w{1,2}\b)', r'\1 \2 \3'),
            
            # Kasus: brand langsung gabung dengan kata
            (r'\b(sgi88|sg188|sgi808|sgi888)(\w{3,})\b', r'\1 \2'),
            (r'\b(\w{3,})(sgi88|sg188|sgi808|sgi888)\b', r'\1 \2'),
            
            # ‚úÖ PERBAIKI: Pattern untuk Togel62, Sendal4d, Sekali4d - LEBIH SPESIFIK
            # Kasus: kata + togel62 (membuattogel62 -> membuat togel62)
            (r'\b(\w{3,})(togel62)(\w*)\b', r'\1 \2 \3'),
            # Kasus: togel62 + kata (togel62membuat -> togel62 membuat)  
            (r'\b(togel62)(\w{3,})\b', r'\1 \2'),
            # Kasus: prefix pendek + togel62 (ditogel62 -> di togel62)
            (r'\b(\w{1,2})(togel62)(\w*)\b', r'\1 \2 \3'),
            
            # Pattern yang sama untuk sendal4d dan sekali4d
            (r'\b(\w{3,})(sendal4d)(\w*)\b', r'\1 \2 \3'),
            (r'\b(sendal4d)(\w{3,})\b', r'\1 \2'),
            (r'\b(\w{1,2})(sendal4d)(\w*)\b', r'\1 \2 \3'),
            
            (r'\b(\w{3,})(sekali4d)(\w*)\b', r'\1 \2 \3'),
            (r'\b(sekali4d)(\w{3,})\b', r'\1 \2'),
            (r'\b(\w{1,2})(sekali4d)(\w*)\b', r'\1 \2 \3'),
            
            # Kasus umum: prefix + kata
            (r'\b(di)(\w{3,})\b', r'\1 \2'),  # dimembuat -> di membuat
            (r'\b(ke)(\w{3,})\b', r'\1 \2'),  # kemana -> ke mana
            (r'\b(se)(\w{3,})\b', r'\1 \2'),  # semahal -> se mahal
        ]
        
        for pattern, replacement in patterns:
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        
        return text
        
    # ===== TEXT DECORATION CLEANING =====
    def remove_text_decorations(self, text):
        """Replace each run of ornamental symbols with a single space.

        Any maximal sequence of characters from the decoration class below
        (pipe, exclamation mark, asterisk, apostrophe, tilde, backtick, comma
        and the listed non-ASCII symbols) is collapsed into one space.

        Parameters:
        text (str): Text to clean.

        Returns:
        str: Text with decoration runs replaced by spaces.
        """
        decoration_run = re.compile(
            r'[|!¬§*\'~`¬Ø,¬∏√∏¬∫¬∞‚àô‚ñ™‚ñ†‚ñ°‚ñ¢‚ñ£‚ñ§‚ñ•‚ñ¶‚ñß‚ñ®‚ñ©‚ñ™‚ñ´‚ñ¨‚ñ≠‚ñÆ‚ñØ‚ñ∞‚ñ±‚ñ≤‚ñ≥‚ñ¥‚ñµ‚ñ∂‚ñ∑‚ñ∏‚ñπ‚ñ∫‚ñª‚ñº‚ñΩ‚ñæ‚ñø‚óÄ‚óÅ‚óÇ‚óÉ‚óÑ‚óÖ‚óÜ‚óá‚óà‚óâ‚óä‚óã‚óå‚óç‚óé‚óè‚óê‚óë‚óí‚óì‚óî‚óï‚óñ‚óó‚óò‚óô‚óö‚óõ‚óú‚óù‚óû‚óü‚ó†‚ó°‚ó¢‚ó£‚ó§‚ó•‚ó¶‚óß‚ó®‚ó©‚ó™‚ó´‚ó¨‚ó≠‚óÆ‚óØ‚ó∞‚ó±‚ó≤‚ó≥‚ó¥‚óµ‚ó∂‚ó∑‚ó∏‚óπ‚ó∫‚óª‚óº‚óΩ‚óæ‚óø]+'
        )
        return decoration_run.sub(' ', text)

    # ===== EMOJI HANDLING METHODS =====
    def replace_emoji_numbers(self, text):
        """Ganti emoji angka dengan angka biasa"""
        for emoji_num, normal_num in self.emoji_numbers.items():
            text = text.replace(emoji_num, normal_num)
        return text

    def replace_emoji_letters(self, text):
        """Ganti emoji huruf dengan huruf biasa"""
        for emoji_letter, normal_letter in self.emoji_letters.items():
            text = text.replace(emoji_letter, normal_letter)
        return text

    def handle_emoji_characters(self, text):
        """Convert digit/letter emoji to plain characters, then blank out every other emoji.

        Steps:
        1. Apply the ``self.emoji_numbers`` table.
        2. Apply the ``self.emoji_letters`` table.
        3. Replace each remaining emoji with a single space.

        Step 3 removes emoji directly instead of converting them to ``:name:``
        codes and deleting codes that match ``:[a-z_]+:``. That lowercase-only
        pattern left residue for emoji whose names contain digits, capitals or
        punctuation (e.g. ``:1st_place_medal:``) and also deleted ordinary
        user text that happened to be written between colons.

        Parameters:
        text (str): Text to process.

        Returns:
        str: Text with mapped emoji converted and all other emoji replaced by spaces.
        """
        # Step 1: emoji digits -> plain digits
        text = self.replace_emoji_numbers(text)
        
        # Step 2: emoji letters -> plain letters
        text = self.replace_emoji_letters(text)
        
        # Step 3: blank out whatever emoji are left, independent of their names
        text = emoji.replace_emoji(text, replace=' ')
        
        return text

    # ===== BASIC CLEANING METHODS =====
    def remove_emojis(self, text):
        """Replace every emoji in ``text`` with a single space.

        Parameters:
        text (str): Text to clean.

        Returns:
        str: Result of ``emoji.replace_emoji`` with ``' '`` as the replacement.
        """
        without_emoji = emoji.replace_emoji(text, replace=' ')
        return without_emoji

    def fix_encoding(self, text):
        """Repair encoding problems in ``text`` by delegating to ``ftfy.fix_text``.

        Parameters:
        text (str): Possibly mis-decoded text.

        Returns:
        str: Whatever ``ftfy.fix_text`` returns for the input.
        """
        repaired = ftfy.fix_text(text)
        return repaired

    def normalize_unicode(self, text):
        """Return the NFKD (compatibility decomposition) form of ``text``.

        Parameters:
        text (str): Text to normalize.

        Returns:
        str: NFKD-normalized text; accents become separate combining marks.
        """
        decomposed = unicodedata.normalize('NFKD', text)
        return decomposed

    def remove_special_chars(self, text):
        """Replace every character that is not a word character or whitespace with a space.

        Letters, digits and the underscore count as word characters, so tokens
        that mix letters and digits (brand names) pass through unchanged.

        Parameters:
        text (str): Text to clean.

        Returns:
        str: Text with each special character replaced by one space.
        """
        special_char = re.compile(r'[^\w\s\d]')
        return special_char.sub(' ', text)

    def remove_urls(self, text):
        """Delete URLs and bare domain names from ``text``.

        Two passes are made:
        1. Tokens that start with ``http`` or ``www.`` are removed.
        2. Tokens containing ``.com``, ``.net``, ``.org``, ``.id`` or ``.io`` as
           a complete label are removed, together with any path that follows.

        The word boundary after the TLD keeps ordinary text that merely has a
        period followed by those letters (``'keren.idola'``,
        ``'selesai.network'``) from being deleted as if it were a domain.

        Parameters:
        text (str): Text to clean.

        Returns:
        str: Text with matches deleted (no space is inserted in their place).
        """
        # Pass 1: scheme- or www-prefixed URLs
        text = re.sub(r'http\S+|www\.\S+', '', text)
        # Pass 2: bare domains; \b requires the TLD to end where the word ends
        text = re.sub(r'\S+\.(?:com|net|org|id|io)\b\S*', '', text)
        return text

    def remove_phone_numbers(self, text):
        """Delete phone-number-like digit sequences from ``text``.

        A match is an optional ``+`` followed by four groups of at least two
        digits, each pair of groups optionally separated by one whitespace
        character or hyphen.

        Parameters:
        text (str): Text to clean.

        Returns:
        str: Text with matches deleted (no space is inserted in their place).
        """
        phone_like = re.compile(r'[\+]?[0-9]{2,}[\s\-]?[0-9]{2,}[\s\-]?[0-9]{2,}[\s\-]?[0-9]{2,}')
        return phone_like.sub('', text)

    def clean_whitespace_characters(self, text):
        """Strip invisible zero-width characters and turn ``&nbsp;`` entities into spaces.

        Characters U+200B through U+200D and U+FEFF are deleted; each literal
        ``&nbsp;`` is replaced by one regular space.

        Parameters:
        text (str): Text to clean.

        Returns:
        str: Cleaned text.
        """
        without_zero_width = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
        return without_zero_width.replace('&nbsp;', ' ')

    # ===== IMPROVED CHARACTER NORMALIZATION =====
    def enhanced_character_normalization(self, text):
        """Reduce decorated, accented and non-Latin characters to plain ASCII text.

        Pipeline, in order:
        1. ``self.handle_emoji_characters`` converts or removes emoji.
        2. NFKD normalization separates base characters from diacritics.
        3. Every key of ``self.extended_char_map`` is replaced by its value.
        4. Combining marks are dropped.
        5. ``unidecode`` transliterates whatever is left.
        6. Whitespace runs collapse to one space and the ends are trimmed.

        NOTE(review): steps 1 and 2 run before the table lookup, so table keys
        that are emoji or that NFKD rewrites may never match - confirm the
        intended order.

        Parameters:
        text (str): Text to normalize.

        Returns:
        str: Normalized text.
        """
        # 1. Emoji first
        normalized = self.handle_emoji_characters(text)

        # 2. Compatibility decomposition
        normalized = unicodedata.normalize('NFKD', normalized)

        # 3. Table-driven replacements, in table order
        for source_char in self.extended_char_map:
            normalized = normalized.replace(source_char, self.extended_char_map[source_char])

        # 4. Keep only non-combining characters
        base_chars = [ch for ch in normalized if unicodedata.combining(ch) == 0]
        normalized = ''.join(base_chars)

        # 5. Transliterate the remainder
        ascii_text = unidecode(normalized)

        # 6. Collapse and trim whitespace
        return re.sub(r'\s+', ' ', ascii_text).strip()

    def clean_brackets_and_special_chars(self, text):
        """Replace each character from a fixed bracket/symbol class with a space.

        Every single occurrence of a listed character becomes one space, so a
        run of such characters yields a run of spaces.
        """
        # NOTE(review): the literal characters inside this class look
        # mis-encoded in this copy of the file -- confirm the intended code
        # points against the original notebook before editing the pattern.
        text = re.sub(r'[„Äê„Äë„Äé„Äè„Äñ„Äó„Äå„ÄçÔΩ¢ÔΩ£„Äî„Äï„Äà„Äâ„Ää¬ª¬´„Äù„ÄûÔºÇ‚Äü„ÄüÔºöÔºõÔºå„ÄÇ„ÄÅÔºÅÔºüÔΩû‚Äß„Éª¬¢@¬Æ¬©‚Ñ¢]', ' ', text)
        return text

    # ===== IMPROVED BRAND RECOGNITION =====
    def enhanced_brand_recognition(self, text):
        """Normalize obfuscated spellings of known brand names.

        Each pattern accepts common look-alike digits for letters (5 for s,
        9 for g, 1 for i/l, 0 for o, 7 for t, 3 for e, 4/@ for a) and an
        optional gap before the numeric suffix, and rewrites the match to the
        canonical lowercase brand. Patterns are applied in order,
        case-insensitively, and only on whole tokens (word boundaries).

        The sgi88 patterns carry a guard so they do not fire on "sg188": since
        "1" is an accepted look-alike for "i", the unguarded pattern rewrote
        every "sg188" to "sgi88", which made the dedicated sg188 rules
        unreachable and merged two distinct brands.

        Returns the text with recognized brands canonicalized.
        """
        # Negative lookahead: the token at this position is not an sg188 spelling.
        not_sg188 = r'(?![s5][g9]188\b)'

        brand_patterns = {
            # SGI88 variations (guarded against the separate sg188 brand)
            r'\b' + not_sg188 + r'[sS5][gG9][iI1]88\b': 'sgi88',
            r'\b' + not_sg188 + r'[sS5][gG9][iI1]\s*88\b': 'sgi88',
            r'\b[sS5][gG9][iI1]808\b': 'sgi808',
            r'\b[sS5][gG9][iI1]888\b': 'sgi888',
            r'\b[sS5][gG9]188\b': 'sg188',
            r'\b[sS5][gG9]\s*188\b': 'sg188',

            # PSTOTO99 variations
            r'\b[pP][sS5][tT7][oO0][tT7][oO0]99\b': 'pstoto99',
            r'\b[pP][sS5][tT7][oO0][tT7][oO0]\s*99\b': 'pstoto99',
            r'\bpstoto\s*99\b': 'pstoto99',

            # Togel62 variations
            r'\b[tT7][oO0][gG9][eE3][lL1]62\b': 'togel62',
            r'\b[tT7][oO0][gG9][eE3][lL1]\s*62\b': 'togel62',

            # Sendal4d variations
            r'\b[sS5][eE3][nN][dD][aA4@][lL1]4[dD]\b': 'sendal4d',
            r'\b[sS5][eE3][nN][dD][aA4@][lL1]\s*4[dD]\b': 'sendal4d',

            # Sekali4d variations
            r'\b[sS5][eE3][kK][aA4@][lL1][iI1]4[dD]\b': 'sekali4d',
            r'\b[sS5][eE3][kK][aA4@][lL1][iI1]\s*4[dD]\b': 'sekali4d',
        }

        for pattern, replacement in brand_patterns.items():
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)

        return text

    # ===== BRAND PATTERN HANDLING =====
    def fix_specific_brand_patterns(self, text):
        """Strip decorative bracket spans/symbols and canonicalize known brand spellings.

        First, every (non-greedy) span matched by a bracket pattern, and each
        listed symbol, is replaced with a single space -- the bracketed content
        itself is removed too. Then each brand pattern is substituted
        case-insensitively, in table order, with its canonical form. The brand
        patterns have no word boundaries, so they also match inside longer
        tokens.
        """
        # Bracket pairs and standalone symbols: removed outright.
        # NOTE(review): these literals look mis-encoded in this copy of the
        # file -- confirm the intended characters before editing.
        bracket_patterns = [
            r'„Äê.*?„Äë', r'„Äé.*?„Äè', r'„Äñ.*?„Äó', r'„Äå.*?„Äç', r'ÔΩ¢.*?ÔΩ£',
            r'„Äî.*?„Äï', r'„Äà.*?„Äâ', r'„Ää.*?„Äã', r'¬´.*?¬ª', r'@¬¢', r'¬Æ', r'¬©', r'‚Ñ¢'
        ]
        
        for pattern in bracket_patterns:
            text = re.sub(pattern, ' ', text)
        
        # Brand names: normalize the spelling but keep each brand as ONE word.
        brand_patterns = {
            # SGI88 family
            r'sgi\s*88': 'sgi88',
            r'sg\s*188': 'sg188', 
            r'sgi\s*808': 'sgi808',
            r'sgi\s*888': 'sgi888',
            
            # PSTOTO99
            r'pstoto\s*99': 'pstoto99',
            r'ps\s*toto\s*99': 'pstoto99',
            
            # Togel62
            r'togel\s*62': 'togel62',
            r't0gel\s*62': 'togel62',
            
            # Sendal4d
            r'sendal\s*4d': 'sendal4d',
            r'sendal\s*4\s*d': 'sendal4d',
            
            # Sekali4d
            r'sekali\s*4d': 'sekali4d',
            r'sekali\s*4\s*d': 'sekali4d',
            
            # "cari di google": rewritten as three separate words
            r'cari\s*di\s*google': 'cari di google',
            r'cari\s*di\s*g[o0][o0]gle': 'cari di google',
            # NOTE(review): the next two patterns appear mis-encoded (likely
            # accented variants of "cari di google") -- TODO confirm.
            r'√ß√§ri\s*di\s*g√∂√∂gle': 'cari di google',
            r'√ßari\s*di\s*google': 'cari di google',
            
            # "lazadatoto": kept as ONE word
            r'lazada\s*toto': 'lazadatoto',
            r'lazada\s*t[o0]t[o0]': 'lazadatoto',
            r'lazada\s*4d': 'lazada4d',
            
            # "garudahoki": kept as ONE word
            r'ga\s*ruda\s*ho\s*ki': 'garudahoki',
            r'ga\s*ruda\s*hoki': 'garudahoki',
            r'garuda\s*ho\s*ki': 'garudahoki',
            r'garuda\s*hoki': 'garudahoki',
        }
        
        # Applied in insertion order; every substitution is case-insensitive.
        for pattern, replacement in brand_patterns.items():
            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        
        return text

    def fix_brand_spacing(self, text):
        """Insert a space between a known brand name and text glued to it.

        Four case-insensitive substitutions run in order:
        1. two or more word characters directly before any brand;
        2. two or more word characters directly after any brand;
        3. a single word character before togel62 / sendal4d / sekali4d;
        4. a single word character after togel62 / sendal4d / sekali4d.

        The brand alternation is ordered longest-first. With the shorter
        "sgi88" listed ahead of "sgi888", the regex engine split "sgi888abc"
        into "sgi88 8abc" instead of "sgi888 abc".

        Returns the text with the separating spaces inserted.
        """
        brands = ('togel62', 'sendal4d', 'sekali4d', 'sgi88',
                  'sg188', 'sgi808', 'sgi888', 'pstoto99')
        # Only these brands are split from a single adjacent character.
        single_char_brands = ('togel62', 'sendal4d', 'sekali4d')

        # Longest alternative first, so a brand that is a prefix of another
        # (sgi88 / sgi888) cannot pre-empt the longer one.
        any_brand = '|'.join(sorted(brands, key=len, reverse=True))
        short_list = '|'.join(single_char_brands)

        brand_spacing_patterns = (
            r'\b(\w{2,})(' + any_brand + r')\b',
            r'\b(' + any_brand + r')(\w{2,})\b',
            r'\b(\w{1})(' + short_list + r')\b',
            r'\b(' + short_list + r')(\w{1})\b',
        )

        for pattern in brand_spacing_patterns:
            text = re.sub(pattern, r'\1 \2', text, flags=re.IGNORECASE)

        return text

    def preserve_brand_names_in_text(self, text):
        """Pertahankan brand names sebagai satu kata dalam teks - VERSI DIPERBAIKI"""
        # ‚úÖ PERBAIKI: Jangan tambahkan brand baru di sini, gunakan yang sudah ada di __init__
        
        # Normalize spacing terlebih dahulu
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Urutkan dari yang terpanjang ke terpendek untuk menghindari partial matching
        sorted_brands = sorted(self.preserved_brands.keys(), key=len, reverse=True)
        
        for brand in sorted_brands:
            preserved_form = self.preserved_brands[brand]
            
            # ‚úÖ PERBAIKI: Gunakan pattern yang lebih spesifik
            # Pattern 1: Brand sebagai kata utuh dengan boundaries
            pattern1 = r'\b' + re.escape(brand) + r'\b'
            text = re.sub(pattern1, preserved_form, text, flags=re.IGNORECASE)
            
            # Pattern 2: Brand dengan spasi internal (sgi 88 -> sgi88)
            if any(char.isdigit() for char in brand):
                # Untuk brand dengan angka, buat pattern dengan optional spaces
                chars = list(brand)
                spaced_pattern = r'\s*'.join(re.escape(char) for char in chars)
                pattern2 = r'\b' + spaced_pattern + r'\b'
                text = re.sub(pattern2, preserved_form, text, flags=re.IGNORECASE)
        
        return text

    # ===== IMPROVED WORD COMBINATION FIXING =====
    def fix_common_combinations(self, text):
        """Perbaiki kombinasi kata yang sering dipisah"""
        # Normalize spacing terlebih dahulu
        text = re.sub(r'\s+', ' ', text).strip()
        
        # Urutkan kombinasi dari yang terpanjang ke terpendek
        sorted_combinations = sorted(self.common_combinations.items(), 
                                   key=lambda x: len(x[0]), reverse=True)
        
        for combination, replacement in sorted_combinations:
            # Gunakan regex untuk matching yang lebih fleksibel
            pattern = r'\b' + re.escape(combination) + r'\b'
            if re.search(pattern, text, re.IGNORECASE):
                text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
        
        return text

    # ===== IMPROVED NUMBER REPLACEMENT =====
    def smart_number_replacement(self, text):
        """Ganti angka dengan huruf hanya untuk kata-kata tertentu, pertahankan brand names"""
        # Step 1: Preserve brand names dengan angka terlebih dahulu
        text = self.preserve_brand_names_in_text(text)
        
        # Step 2: Untuk kata lainnya, gunakan replacement berdasarkan strategi
        if self.number_replacement_strategy == 'aggressive':
            # Ganti semua angka kecuali dalam brand names yang sudah dipreserve
            words = text.split()
            processed_words = []
            
            for word in words:
                # Jika word adalah brand name yang dipreserve, skip
                if word in self.preserved_brands.values():
                    processed_words.append(word)
                    continue
                
                # Jika word mengandung angka dan bukan brand name, lakukan replacement
                if any(char.isdigit() for char in word) and not word.isdigit():
                    processed_word = ''
                    for char in word:
                        if char in self.number_map:
                            processed_word += self.number_map[char]
                        else:
                            processed_word += char
                    processed_words.append(processed_word)
                else:
                    processed_words.append(word)
            
            text = ' '.join(processed_words)
            
        elif self.number_replacement_strategy == 'smart':
            # Hanya ganti angka pada kata-kata leet speak yang diketahui
            for leet_word, normal_word in self.leet_speak_indonesia.items():
                text = re.sub(r'\b' + re.escape(leet_word) + r'\b', normal_word, text, flags=re.IGNORECASE)
            
        # 'preserve' strategy tidak melakukan apa-apa terhadap angka
        
        return text

    def comprehensive_number_replacement(self, text):
        """Ganti angka dengan huruf yang sesuai - VERSI DIPERBAIKI"""
        # Step 1: Handle domain numbers berdasarkan strategi
        text = self.preserve_common_domains(text)
        
        # Step 2: Preserve brand names dengan angka
        text = self.preserve_brand_names_in_text(text)
        
        # Step 3: Decode leet speak Indonesia
        text = self.decode_leet_speak_indonesia(text)
        
        # Step 4: Smart number replacement berdasarkan strategi
        text = self.smart_number_replacement(text)
        
        return text

    # ===== WORD RECONSTRUCTION METHODS =====
    def advanced_word_reconstruction(self, text):
        """Re-join brand names that were typed with gaps between their parts.

        For each brand there is a list of regexes describing spaced-out
        spellings ("p u l a u w i n", "pula uwin", ...). The regexes are
        applied in table order, case-insensitively, and every match is
        replaced by the canonical brand.
        """
        variants_by_brand = (
            ('sgi88', (
                r'\b(s)\s*(g)\s*(i)\s*(8)\s*(8)\b',
                r'\b(s\s*g\s*i\s*8\s*8)\b',
                r'\b(sg)\s*(i88)\b',
                r'\b(sgi)\s*(88)\b',
            )),
            ('pstoto99', (
                r'\b(p)\s*(s)\s*(t)\s*(o)\s*(t)\s*(o)\s*(9)\s*(9)\b',
                r'\b(p\s*s\s*t\s*o\s*t\s*o\s*9\s*9)\b',
                r'\b(pstoto)\s*(99)\b',
                r'\b(ps)\s*(toto)\s*(99)\b',
            )),
            ('togel62', (
                r'\b(t)\s*(o)\s*(g)\s*(e)\s*(l)\s*(6)\s*(2)\b',
                r'\b(t\s*o\s*g\s*e\s*l\s*6\s*2)\b',
                r'\b(togel)\s*(62)\b',
            )),
            ('sendal4d', (
                r'\b(s)\s*(e)\s*(n)\s*(d)\s*(a)\s*(l)\s*(4)\s*(d)\b',
                r'\b(s\s*e\s*n\s*d\s*a\s*l\s*4\s*d)\b',
                r'\b(sendal)\s*(4d)\b',
            )),
            ('sekali4d', (
                r'\b(s)\s*(e)\s*(k)\s*(a)\s*(l)\s*(i)\s*(4)\s*(d)\b',
                r'\b(s\s*e\s*k\s*a\s*l\s*i\s*4\s*d)\b',
                r'\b(sekali)\s*(4d)\b',
            )),
            ('pulauwin', (
                r'\b(p)\s*(u)\s*(l)\s*(a)\s*(u)\s*(w)\s*(i)\s*(n)\b',
                r'\b(p\s*u\s*l\s*a\s*u\s*w\s*i\s*n)\b',
                r'\b(pula)\s*(uwin)\b',
                r'\b(pulau)\s*(win)\b',
            )),
            ('arwanatoto', (
                r'\b(a)\s*(r)\s*(w)\s*(a)\s*(n)\s*(a)\s*(t)\s*(o)\s*(t)\s*(o)\b',
                r'\b(arwana)\s*(toto)\b',
            )),
            ('garudahoki', (
                r'\b(g)\s*(a)\s*(r)\s*(u)\s*(d)\s*(a)\s*(h)\s*(o)\s*(k)\s*(i)\b',
                r'\b(garuda)\s*(hoki)\b',
            )),
        )

        for canonical, variants in variants_by_brand:
            for variant in variants:
                text = re.sub(variant, canonical, text, flags=re.IGNORECASE)

        return text

    def detect_and_fix_word_separation(self, text):
        """Deteksi dan perbaiki pemisahan kata dengan algoritma yang lebih robust"""
        # Step 1: Fix common combinations first
        text = self.fix_common_combinations(text)
        
        # Step 2: Original algorithm
        words = text.split()
        if len(words) < 2:
            return text
        
        i = 0
        result_words = []
        
        while i < len(words):
            current_word = words[i]
            
            # Skip jika kata sudah panjang atau mengandung angka
            if len(current_word) > 3 or any(char.isdigit() for char in current_word):
                result_words.append(current_word)
                i += 1
                continue
            
            # Coba gabung dengan kata berikutnya
            if i + 1 < len(words):
                next_word = words[i + 1]
                combined = current_word + next_word
                combined_lower = combined.lower()
                
                # Cek apakah gabungan membentuk kata judol
                is_judol_word = any(
                    judol_word == combined_lower or 
                    judol_word.startswith(combined_lower) or
                    combined_lower.startswith(judol_word)
                    for judol_word in self.judol_words_for_reconstruction
                )
                
                if is_judol_word and len(combined) >= 4:
                    # Cari kata judol yang paling tepat
                    best_match = None
                    for judol_word in self.judol_words_for_reconstruction:
                        if judol_word.startswith(combined_lower) or combined_lower.startswith(judol_word):
                            best_match = judol_word
                            break
                    
                    if best_match:
                        result_words.append(best_match)
                        i += 2  # Skip kedua kata
                        continue
            
            result_words.append(current_word)
            i += 1
        
        return ' '.join(result_words)

    def reconstruct_separated_words(self, text):
        """Rekonstruksi kata yang sengaja dipisah seperti 'p u l a u w i n' atau 'pula uwin'"""
        words = text.split()
        
        if len(words) <= 1:
            return text
        
        reconstructed_words = []
        i = 0
        
        while i < len(words):
            current_word = words[i]
            
            # Coba gabung dengan kata berikutnya untuk membentuk kata judol
            combined = current_word
            j = i + 1
            found_combination = False
            
            while j <= len(words):
                # Cek kombinasi saat ini
                current_combination = ''.join(words[i:j])
                current_combination_lower = current_combination.lower()
                
                # Cek apakah kombinasi ini adalah kata judol atau bagian darinya
                is_judol_combination = any(
                    judol_word.startswith(current_combination_lower) or 
                    current_combination_lower in judol_word or
                    judol_word.startswith(current_combination_lower.replace(' ', ''))
                    for judol_word in self.judol_words_for_reconstruction
                )
                
                # Jika kombinasi membentuk kata judol yang lengkap
                exact_match = any(
                    judol_word == current_combination_lower.replace(' ', '')
                    for judol_word in self.judol_words_for_reconstruction
                )
                
                if exact_match:
                    # Found exact match! Gunakan kata judol yang benar
                    matched_word = next(
                        judol_word for judol_word in self.judol_words_for_reconstruction 
                        if judol_word == current_combination_lower.replace(' ', '')
                    )
                    reconstructed_words.append(matched_word)
                    i = j
                    found_combination = True
                    break
                elif is_judol_combination and j < len(words):
                    # Masih mungkin bisa digabung lebih lanjut
                    j += 1
                else:
                    # Tidak bisa digabung lebih lanjut
                    break
            
            if not found_combination:
                # Jika tidak ada kombinasi yang ditemukan, gunakan kata asli
                reconstructed_words.append(current_word)
                i += 1
        
        return ' '.join(reconstructed_words)

    # ===== LEET SPEAK DECODING =====
    def decode_leet_speak_indonesia(self, text):
        """Decode leet speak khusus bahasa Indonesia"""
        # Step 1: Replace known leet speak patterns
        for leet_word, normal_word in self.leet_speak_indonesia.items():
            text = re.sub(r'\b' + re.escape(leet_word) + r'\b', normal_word, text, flags=re.IGNORECASE)
        
        return text

    def smart_contextual_replacement(self, word):
        """Ganti angka dengan huruf berdasarkan konteks - HANYA untuk non-brand words"""
        # Jika word adalah brand name yang dipreserve, return asli
        if word.lower() in [brand.lower() for brand in self.preserved_brands.values()]:
            return word
        
        if not any(char.isdigit() for char in word) or word.isdigit():
            return word
        
        # Convert to lowercase untuk processing
        word_lower = word.lower()
        result = []
        i = 0
        
        while i < len(word_lower):
            char = word_lower[i]
            
            if char in self.number_map:
                # Special case untuk '7' (bisa 't' atau 'r')
                if char == '7':
                    # '7un9kad' -> 'rungkad' (7u -> ru)
                    if i + 1 < len(word_lower) and word_lower[i + 1] == 'u':
                        result.append('r')
                    else:
                        result.append('t')
                else:
                    result.append(self.number_map[char])
            else:
                result.append(char)
            i += 1
        
        return ''.join(result)

    def advanced_leet_decode(self, text):
        """Decode leet speak dengan algoritma yang lebih advanced"""
        words = text.split()
        decoded_words = []
        
        for word in words:
            # Skip jika sudah berupa brand yang di-preserve
            if word in self.preserved_brands.values():
                decoded_words.append(word)
                continue
            
            # Skip jika hanya angka
            if word.isdigit():
                decoded_words.append(word)
                continue
            
            # Coba decode dengan pattern yang diketahui dulu
            original_word = word
            word_lower = word.lower()
            
            # Decode known patterns
            for leet_pattern, normal_word in self.leet_speak_indonesia.items():
                if leet_pattern in word_lower:
                    word = word_lower.replace(leet_pattern, normal_word)
                    break
            
            # Jika masih ada angka dan BUKAN brand name, gunakan contextual replacement
            if any(char.isdigit() for char in word) and word_lower not in [brand.lower() for brand in self.preserved_brands.values()]:
                word = self.smart_contextual_replacement(word)
            
            decoded_words.append(word)
        
        return ' '.join(decoded_words)

    # ===== DOMAIN NUMBER HANDLING =====
    def handle_domain_numbers(self, text):
        """Handle angka di domain dengan strategi yang berbeda"""
        if self.domain_number_strategy == 'remove':
            for domain in self.judol_domains:
                pattern = r'\b(' + domain + r')(\d{2,3})\b'
                text = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
            
        elif self.domain_number_strategy == 'preserve':
            for domain in self.judol_domains:
                pattern = r'\b(' + domain + r')(\d{2,3})\b'
                text = re.sub(pattern, r'\1 \2', text, flags=re.IGNORECASE)
            
        elif self.domain_number_strategy == 'separate_token':
            for domain in self.judol_domains:
                pattern = r'\b(' + domain + r')(\d{2,3})\b'
                text = re.sub(pattern, r'\1 [DOMAIN_NUMBER]', text, flags=re.IGNORECASE)
            
        return text

    def preserve_common_domains(self, text):
        """Preserve common domain patterns yang mengandung angka"""
        return self.handle_domain_numbers(text)

    # ===== TEXT NORMALIZATION =====
    def normalize_case(self, text):
        """Return `text` converted to lowercase."""
        lowered = text.lower()
        return lowered

    def remove_stopwords_id(self, text):
        """Hapus stopwords bahasa Indonesia - GUNAKAN YANG SELEKTIF"""
        return self.selective_stopword_removal(text)

    def stem_text(self, text):
        """Stemming bahasa Indonesia"""
        return self.stemmer.stem(text)

    def remove_repeated_chars(self, text):
        """Shorten runs of three or more identical characters to two.

        Used to tame elongated words ("gacoooor" -> "gacoor"). Digits are
        deliberately excluded: collapsing them rewrote the brand "sgi888" to
        "sgi88" and amounts such as "1000000" to "100". Newlines are excluded
        as before.
        """
        # [^\d\n] is the original "." (any character except newline) minus digits.
        text = re.sub(r'([^\d\n])\1{2,}', r'\1\1', text)
        return text

    def handle_repeated_words(self, text):
        """Collapse a word repeated back-to-back into a single occurrence.

        The comparison is case-sensitive and the repeats must be separated
        only by whitespace ("wd wd wd" -> "wd").
        """
        consecutive_repeat = r'\b(\w+)(?:\s+\1\b)+'
        return re.sub(consecutive_repeat, r'\1', text)

    # ===== SPACING CLEANING =====
    def fix_advanced_spacing(self, text):
        """Collapse every whitespace run to one space and trim both ends."""
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    # ===== SPECIAL CHARACTER CLEANING =====
    def clean_special_characters(self, text):
        """Replace a fixed set of ASCII symbols with spaces, then tidy whitespace.

        Each listed symbol (such as "@", "#", "*", brackets, quotes or
        slashes) becomes a space; afterwards whitespace runs are collapsed
        and the result is trimmed.
        """
        symbols = r'[@#\$%\^&\*\(\)_\+=\[\]\{\};:"\\|<>/~`]'
        spaced = re.sub(symbols, ' ', text)
        return re.sub(r'\s+', ' ', spaced).strip()

    # ===== IMPROVED CLEANING PIPELINE =====
    def clean_comprehensive(self, text, aggressive=True):
        """
        Pipeline cleaning komprehensif untuk teks judol - VERSI DIPERBAIKI
        """
        if not isinstance(text, str) or not text.strip():
            return ""
        
        # ‚úÖ PERBAIKI: URUTAN YANG LEBIH OPTIMAL
        steps = [
            self.fix_encoding,
            self.remove_text_decorations,
            self.enhanced_character_normalization,
            self.clean_brackets_and_special_chars,
            self.handle_emoji_characters,
            self.clean_whitespace_characters,
            
            # ‚úÖ FASE 1: Brand Recognition & Segmentation
            self.enhanced_brand_recognition,      # Kenali brand patterns
            self.improved_word_segmentation,      # Pisahkan kata yang menempel
            self.fix_brand_spacing,               # Pastikan spasi konsisten
            
            # ‚úÖ FASE 2: Word Reconstruction  
            self.fix_common_combinations,         # Gabungkan kombinasi umum
            self.advanced_word_reconstruction,    # Rekonstruksi kata terpisah
            self.detect_and_fix_word_separation,  # Deteksi pemisahan kata
            self.reconstruct_separated_words,     # Rekonstruksi kata terpisah
            
            # ‚úÖ FASE 3: Brand Preservation & Cleaning
            self.preserve_brand_names_in_text,    # ‚úÖ DIPINDAH: Sekarang setelah reconstruction
            self.fix_specific_brand_patterns,     # Perbaiki pattern brand khusus
            
            # ‚úÖ FASE 4: General Cleaning
            self.remove_urls,
            self.remove_phone_numbers,
            self.comprehensive_number_replacement,
            self.remove_special_chars,
            self.remove_repeated_chars,
            self.handle_repeated_words,
            self.normalize_case,
            self.fix_advanced_spacing,
        ]
        
        # Add aggressive cleaning steps if enabled
        if aggressive:
            aggressive_steps = [
                self.selective_stopword_removal,
                self.stem_text,
                self.fix_advanced_spacing,
            ]
            steps.extend(aggressive_steps)
        
        # Execute cleaning pipeline
        cleaned_text = text
        for step in steps:
            try:
                previous_text = cleaned_text
                cleaned_text = step(cleaned_text)
                
                # Debug: Cetak perubahan jika ada
                if previous_text != cleaned_text:
                    print(f"After {step.__name__}: {cleaned_text}")
                    
                if not cleaned_text.strip():
                    return ""
            except Exception as e:
                print(f"Error in {step.__name__}: {e}")
                continue
        
        return cleaned_text

    # ===== BATCH PROCESSING =====
    def clean_dataset(self, df, text_column='text', new_column='cleaned_text', aggressive=True):
        """
        Clean a whole DataFrame column with a progress bar.

        Parameters:
        df (DataFrame): pandas DataFrame; a column is added to it in place
        text_column (str): name of the column holding the raw text
        new_column (str): name of the column that receives the cleaned text
        aggressive (bool): forwarded to `clean_comprehensive`

        Returns the same DataFrame object that was passed in.
        """
        def _clean_one(value):
            return self.clean_comprehensive(value, aggressive=aggressive)

        # Registers `progress_apply` on pandas objects.
        tqdm.pandas(desc="Cleaning texts")
        df[new_column] = df[text_column].progress_apply(_clean_one)

        return df

    # ===== ANALYSIS METHODS =====
    def analyze_cleaning_result(self, original_text, cleaned_text):
        """Analisis hasil cleaning"""
        analysis = {
            'original_length': len(original_text),
            'cleaned_length': len(cleaned_text),
            'reduction_ratio': round((len(original_text) - len(cleaned_text)) / len(original_text) * 100, 2) if original_text else 0,
            'original_words': len(original_text.split()),
            'cleaned_words': len(cleaned_text.split()),
            'contains_judol_keywords': self.contains_judol_keywords(cleaned_text)
        }
        return analysis

    def contains_judol_keywords(self, text):
        """Check `text` for known gambling keywords.

        Matching is a case-insensitive substring test, so a keyword also
        counts when it appears inside a longer word.

        Returns a tuple `(found, keywords)`: a bool and the list of matched
        keywords in the order they appear in the keyword table.
        """
        judol_keywords = (
            'situs', 'slot', 'judi', 'togel', 'poker', 'casino',
            'taruhan', 'betting', 'deposit', 'withdraw', 'bonus',
            'jackpot', 'freespin', 'bandar', 'sabung', 'gambling',
            'pstoto', 'toto', 'arwana', 'pulauwin', 'lazadatoto',
            'insan4d', 'paste4d', 'pandora4d', 'bosan', 'rungkad',
            'sendal4d', 'garudahoki', 'togel62', 'arwanatoto', 'pstoto99',
            'sgi88', 'sg188', 'sgi808', 'sgi888', 'sekali4d',
        )

        lowered = text.lower()
        hits = []
        for keyword in judol_keywords:
            if keyword in lowered:
                hits.append(keyword)
        return bool(hits), hits

In [11]:
# Build the cleaner with its default strategy arguments.
cleaner = JudolTextCleaner()

In [13]:
# Clean every comment in place; the cells below expect the cleaned text in
# "comment_text". Series.apply keeps each result aligned with its own row
# label, so this does not depend on the frame having a default 0..n-1 index
# (the previous loop wrote to df.loc[i - 1] and silently relied on that).
df["comment_text"] = df["comment_text"].apply(cleaner.clean_comprehensive)

After remove_text_decorations: Di acaranya indra harusnya serame ini suaranya jadi tambah seru  sunyi bgt üòÇ
After enhanced_character_normalization: Di acaranya indra harusnya serame ini suaranya jadi tambah seru sunyi bgt
After improved_word_segmentation: Di acaranya indra harusnya se rame ini suaranya jadi tambah seru sunyi bgt
After normalize_case: di acaranya indra harusnya se rame ini suaranya jadi tambah seru sunyi bgt
After selective_stopword_removal: acaranya indra harusnya se rame ini suaranya jadi tambah seru sunyi bgt
After stem_text: acara indra harus se rame ini suara jadi tambah seru sunyi bgt
After enhanced_character_normalization: 
After enhanced_character_normalization: Terlalu dar der dor
After normalize_case: terlalu dar der dor
After remove_special_chars: Ga enak ga emosi pak Jarwo   
After remove_repeated_chars: Ga enak ga emosi pak Jarwo  
After normalize_case: ga enak ga emosi pak jarwo  
After fix_advanced_spacing: ga enak ga emosi pak jarwo
After fix_encoding

In [16]:
# Spot-check the row labelled 100 after cleaning.
df.loc[100]

Unnamed: 0                                                    100
comment_id                             UgxRG9kG1g8JNLVilKl4AaABAg
video_id                                              YZ4N8jH5R_M
author                                             @othoysrecipes
comment_text    video ini seru tapi percaya deh pstoto99 tuh t...
published_at                                 2025-11-06T21:22:04Z
like_count                                                      0
Name: 100, dtype: object

In [17]:
# Column dtypes and non-null counts after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2061 entries, 0 to 2060
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    2061 non-null   int64 
 1   comment_id    2061 non-null   object
 2   video_id      2061 non-null   object
 3   author        2059 non-null   object
 4   comment_text  2061 non-null   object
 5   published_at  2061 non-null   object
 6   like_count    2061 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 112.8+ KB


In [20]:
# Count the comments whose text is empty (or whitespace-only) after cleaning
# and report their share of all rows.
empty_comments = df[df['comment_text'].str.strip() == '']
print(f"Jumlah komentar yang kosong: {len(empty_comments)}")
print(f"Persentase: {(len(empty_comments) / len(df)) * 100:.2f}%")

Jumlah komentar yang kosong: 57
Persentase: 2.77%


In [22]:
# Keep only rows whose stripped comment text is non-empty, then renumber the
# index from 0. Note: this rebinds `df`, so re-running earlier cells that read
# `df` will see the filtered frame.
df = df[df['comment_text'].str.strip().astype(bool)].reset_index(drop=True)


In [23]:
# Re-inspect the frame after the empty comments were dropped.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2004 entries, 0 to 2003
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    2004 non-null   int64 
 1   comment_id    2004 non-null   object
 2   video_id      2004 non-null   object
 3   author        2002 non-null   object
 4   comment_text  2004 non-null   object
 5   published_at  2004 non-null   object
 6   like_count    2004 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 109.7+ KB


In [24]:
# index=False: the row index carries no information here, and writing it adds
# another "Unnamed: 0" column every time the file is saved and reloaded (the
# input frame already contains such a leftover column).
df.to_csv('cleaned_commentar.csv', index=False)