In [64]:
import pandas as pd
import re
from typing import Tuple, List

class FinalJudiClassifier:
    def __init__(self):
        # Brand judi yang diketahui
        self.gambling_brands = {
            'pesiar88', 'mini1221', 'mbak4d2', 'sekali4d', 'squad777', 'sor76', 'rtpwin',
            'glowin88', 'harta138', 'generasi88', 'procuan33', 'seru69', 'targetplay303',
            'sis77', 'buruanwd98', 'sikat88', 'lining4d', 'timah33', 'dewajudi818',
            'royal505', 'cahayatoto', 'tiket200', 'evostoto', 'mega177', 'enakmega177',
            'qscbd88', 'holywin99', 'arch', 'upahslot', 'maxswin', 'dayak777', 'intan777',
            'winbet679', 'pusatwin', 'g3d3', 'emas24', 'emas24karat', 'bilik', 'jepe',
            'dedetoto', 'poker88', 'lazadatoto', 'dedetoto129', 'supertoto99', 'lucky777',
            'raja4d', 'mega121', 'batak87', 'togel62', 'togel99', 'poker22', 'bibit168', 'dora77'
        }
        
        # Kata promosi judi (diperluas)
        self.promotion_words = {
            'mantap', 'bagus', 'asik', 'seru', 'gacor', 'jackpot', 'menang', 'wd', 'withdraw',
            'deposit', 'depo', 'bonus', 'freebet', 'cuan', 'profit', 'untung', 'maxwin',
            'gampang', 'mudah', 'cepat', 'langsung', 'auto', 'pasti', 'jamin', 'recommend',
            'saran', 'coba', 'daftar', 'bergabung', 'join', 'link', 'situs', 'website',
            'scatter', 'wild', 'fitur', 'game', 'putar', 'spin', 'rtp', 'payline', 'symbol',
            'cair', 'gacir', 'jepe', 'nomplok', 'receh', 'hoki', 'hasil', 'makswin', 'maxwin',
            'target', 'gede', 'jalan', 'usaha', 'handal', 'buka', 'gabung', 'takluk', 'virtual',
            'thebast', 'broo', 'z0nk', 'kasih', 'dapet', 'eh', 'malah'
        }
        
        # Kata peringatan/anti judi (diperluas)
        self.warning_words = {
            'haram', 'dosa', 'bahaya', 'henti', 'stop', 'jangan', 'jauh', 'tipu', 'bodong',
            'scam', 'korban', 'hancur', 'miskin', 'hutang', 'celaka', 'larang', 'blokir',
            'tolong', 'sadar', 'tobat', 'ampun', 'lindung', 'selamat', 'sumpah', 'janji',
            'musnah', 'jauhi', 'peringatan', 'awas', 'waspada', 'rugi', 'kalah', 'bosan',
            'rungkad', 'cape', 'sesal', 'menyesal', 'kecewa', 'palsu', 'tipuan', 'cuma',
            'kirain', 'zonk', 'habis', 'waktu', 'kuliah', 'gratis', 'pak', 'sugiri', 'percaya'
        }
        
        # Pattern untuk brand baru
        self.brand_patterns = [
            r'\b[a-z]{4,}\d{2,}\b',
            r'\b[a-z]+\d{3,}\b',
            r'\b(win|bet|slot|toto|togel|poker|casino|judi|qq|dadu|mega|royal|hoki|bola)\d+\b',
            r'\b[a-z]+\d*[a-z]+\d*\b',
        ]

    def preprocess_text(self, text: str) -> str:
        if pd.isna(text):
            return ""
        text = str(text).lower()
        # Normalisasi angka-huruf (h0k1 -> hoki, bet4d -> bet)
        text = re.sub(r'(\d+)', ' ', text)
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def detect_brands(self, text: str) -> List[str]:
        """Deteksi brand dengan pattern matching yang lebih baik"""
        brands = []
        
        # Brand yang diketahui
        for brand in self.gambling_brands:
            if brand in text:
                brands.append(brand)
        
        # Pattern matching untuk brand baru
        original_text = text
        for pattern in self.brand_patterns:
            matches = re.findall(pattern, original_text)
            for match in matches:
                if len(match) >= 5 and match not in brands:
                    brands.append(match)
        
        # Deteksi pattern khusus seperti "garudahoki", "plazabola"
        compound_words = re.findall(r'\b[a-z]{5,}[a-z]+\b', original_text)
        for word in compound_words:
            if any(keyword in word for keyword in ['hoki', 'toto', 'togel', 'poker', 'bola', 'bet', 'slot', 'win']):
                if word not in brands and len(word) >= 6:
                    brands.append(word)
        
        return brands

    def analyze_sentiment(self, text: str) -> Tuple[List[str], List[str], bool]:
        """Analisis sentimen dengan konteks yang lebih baik"""
        promo_found = []
        warn_found = []
        
        words = text.split()
        
        for word in words:
            if word in self.promotion_words:
                # Cek konteks negatif
                if not self._is_negative_context(text, word):
                    promo_found.append(word)
            
            if word in self.warning_words:
                warn_found.append(word)
        
        # Deteksi transaksi (pattern lebih luas)
        has_transaction = any(re.search(pattern, text) for pattern in [
            r'(wd|withdraw)', r'(depo|deposit)', r'menang', r'jackpot', r'jepe',
            'bonus', 'cuan', 'untung', 'profit', 'maxwin', 'makswin'
        ])
        
        if has_transaction and not self._is_negative_context(text, ""):
            promo_found.append('transaction')
        
        # Deteksi hidden promotion
        has_hidden_promo = any(re.search(pattern, text) for pattern in [
            r'bosan.*(coba|daftar|main)', r'rungkad.*(coba|daftar|main)',
            r'cape.*(coba|daftar|main)', r'auto.*cair', r'kirain.*zonk.*kasih',
            r'gabung.*takluk', r'buka.*jalan', r'hasil.*usaha', r'baru.*gabung.*langsung'
        ])
        
        return promo_found, warn_found, has_hidden_promo

    def _is_negative_context(self, text: str, keyword: str) -> bool:
        """Cek konteks negatif dengan pattern yang lebih baik"""
        negative_patterns = [
            r'scatter.*tipu', r'wild.*janji', r'bonus.*palsu',
            r'jangan.*percaya', r'henti.*judi', r'stop.*main',
            r'tipu.*bandar', r'cuma.*janji', r'kirain.*zonk',
            r'haram.*bahaya', r'rugi.*kalah', r'bosan.*rungkad',
            r'wild.*scatter.*cuma.*janji', r'scatter.*palsu',
            r'kuliah.*gratis', r'pak.*toto.*sugiri'
        ]
        
        return any(re.search(pattern, text) for pattern in negative_patterns)

    def classify_comment(self, text: str) -> Tuple[str, str, float, List[str], List[str]]:
        if pd.isna(text) or text == "":
            return "bukan", "empty_text", 0.7, [], []
            
        processed_text = self.preprocess_text(text)
        
        # Deteksi brand
        brands = self.detect_brands(text)
        
        # Analisis sentimen
        promo_words, warn_words, has_hidden_promo = self.analyze_sentiment(processed_text)
        
        # Decision tree yang diperbaiki
        confidence = 0.7
        
        # CASE 1: Hidden promotion -> JUDOL (priority tinggi)
        if has_hidden_promo:
            return "judol", "hidden_promotion", 0.95, brands, promo_words
        
        # CASE 2: Ada brand + promosi -> JUDOL
        if brands and promo_words:
            return "judol", "brand_with_promotion", 0.9, brands, promo_words
        
        # CASE 3: Kata peringatan kuat -> BUKAN (override)
        if len(warn_words) >= 2 or any(word in warn_words for word in ['haram', 'henti', 'jangan', 'tipu', 'palsu']):
            return "bukan", "strong_warning", 0.9, brands, warn_words
        
        # CASE 4: Konteks negatif spesifik -> BUKAN
        if self._is_negative_context(processed_text, ""):
            return "bukan", "negative_context", 0.8, brands, warn_words
        
        # CASE 5: Brand mencurigakan tanpa konteks negatif -> JUDOL
        if brands and not warn_words:
            suspicious_brands = [b for b in brands if any(c.isdigit() for c in b) or any(kw in b for kw in ['toto', 'togel', 'poker', 'slot', 'hoki'])]
            if suspicious_brands:
                return "judol", "suspicious_brand", 0.8, brands, []
        
        # CASE 6: Pattern transaksi jelas -> JUDOL
        if ('transaction' in promo_words or len(promo_words) >= 3) and not warn_words:
            return "judol", "strong_promotion_pattern", 0.85, brands, promo_words
        
        # CASE 7: Default -> BUKAN
        return "bukan", "insufficient_evidence", 0.7, brands, []

def comprehensive_test():
    classifier = FinalJudiClassifier()
    
    # Semua test cases dari sebelumnya
    all_test_cases = [
        # Masalah sebelumnya
        "scatter itu cuma tipuan bandar",
        "scatter 5x dapat bonus 100rb",
        "bosan rungkad main di dedetoto129",
        "henti judi sebelum terlambat",
        "auto cair main di batak 87",
        "jangan tertipu scatter palsu",
        "wild dan scatter cuma janji",
        
        # Test cases scatter dari sebelumnya
        "scatter sering keluar di pesiar88",
        "game ini scatter nya gacor banget",
        "scatter tipu jangan percaya",
        
        # Edge cases dari sebelumnya
        "depo 100k wd 5jt di mini1221",
        "baru deposit 200rb langsung menang jackpot",
        "judi itu haram dan bahaya",
        
        # Kasus baru yang bermasalah
        "togel62",
        "togel99", 
        "poker22",
        "gabung takluk dunia virtual sama mbak4d2",
        "kuliah gratis dr pak toto sugiri",
        "baru gabung langsung hoki bonus dapet jepe jalan garudahoki thebast broo",
        "kirain depo 20k bakal zonk garudahoki eh malah kasih 37jt",
        "main di situs google auto maxwin",
        "main di plazabola saja",
        "yang lain habis waktu kita hasil bibit168 bola magesiti",
        "main di situs google pasti makswin",
        "main dora77 buka jalan dari jual pulsa jadi usaha handal",
        "h0k1e banget bet4d target jeipe gede makin seru main",
        
        # Test cases tambahan
        "garudahoki thebast broo",
        "plazabola situs terbaik",
    ]
    
    print("COMPREHENSIVE FINAL CLASSIFIER TEST:")
    print("=" * 70)
    
    results = []
    for comment in all_test_cases:
        label, reason, confidence, brands, patterns = classifier.classify_comment(comment)
        results.append({
            'comment': comment,
            'label': label,
            'reason': reason,
            'confidence': confidence,
            'brands': brands,
            'patterns': patterns
        })
        
        print(f"'{comment}'")
        print(f"→ {label.upper()} (Confidence: {confidence:.2f}, Reason: {reason})")
        if brands:
            print(f"  Brands: {brands}")
        if patterns:
            print(f"  Patterns: {patterns}")
        print("---")
    
    # Statistics
    judol_count = sum(1 for r in results if r['label'] == 'judol')
    bukan_count = sum(1 for r in results if r['label'] == 'bukan')
    
    print(f"\n=== STATISTICS ===")
    print(f"Total test cases: {len(results)}")
    print(f"JUDOL: {judol_count} ({judol_count/len(results)*100:.1f}%)")
    print(f"BUKAN: {bukan_count} ({bukan_count/len(results)*100:.1f}%)")
    
    # Problem cases analysis - FIXED VERSION
    problem_cases = [
        "scatter itu cuma tipuan bandar", 
        "wild dan scatter cuma janji", 
        "henti judi sebelum terlambat", 
        "h0k1e banget bet4d target jeipe gede makin seru main",
        "kuliah gratis dr pak toto sugiri", 
        "main di plazabola saja",
        "mona4d top banget layan cepet"
    ]
    
    print(f"\n=== PROBLEM CASES ANALYSIS ===")
    for case in problem_cases:
        # Cari result yang sesuai dengan case
        matching_results = [r for r in results if r['comment'] == case]
        if matching_results:
            result = matching_results[0]
            print(f"'{case}' → {result['label'].upper()} (Reason: {result['reason']})")
        else:
            print(f"'{case}' → NOT FOUND IN RESULTS")

if __name__ == "__main__":
    comprehensive_test()

COMPREHENSIVE FINAL CLASSIFIER TEST:
'scatter itu cuma tipuan bandar'
  Brands: ['scatter', 'tipuan', 'bandar']
  Patterns: ['cuma', 'tipuan']
---
'scatter 5x dapat bonus 100rb'
→ JUDOL (Confidence: 0.90, Reason: brand_with_promotion)
  Brands: ['scatter', 'dapat', 'bonus']
  Patterns: ['scatter', 'bonus', 'transaction']
---
'bosan rungkad main di dedetoto129'
→ JUDOL (Confidence: 0.95, Reason: hidden_promotion)
  Brands: ['dedetoto', 'dedetoto129', 'bosan', 'rungkad']
---
'henti judi sebelum terlambat'
  Brands: ['henti', 'sebelum', 'terlambat']
  Patterns: ['henti']
---
'auto cair main di batak 87'
→ JUDOL (Confidence: 0.95, Reason: hidden_promotion)
  Brands: ['batak']
  Patterns: ['auto', 'cair']
---
'jangan tertipu scatter palsu'
  Brands: ['jangan', 'tertipu', 'scatter', 'palsu']
  Patterns: ['jangan', 'palsu']
---
'wild dan scatter cuma janji'
  Brands: ['scatter', 'janji']
  Patterns: ['cuma', 'janji']
---
'scatter sering keluar di pesiar88'
→ JUDOL (Confidence: 0.90, Reason: b