In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib

In [15]:
df = pd.read_csv("output/labeled_dataset_20251112_124437.csv")

In [16]:
df = df[['comment_text', 'label_ultimate']].dropna()

In [17]:
# 2Ô∏è‚É£ Split dataset
X = df['comment_text']
y = df['label_ultimate']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [19]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        lowercase=True,
        stop_words='english',
        ngram_range=(1, 2),  # unigram + bigram
        max_df=0.9,
        min_df=5
    )),
    ('clf', MultinomialNB())
])

In [20]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),  # unigram + bigram
    stop_words=None,     # sudah dihandle di preprocessing
    min_df=2,            # ignore terms that appear in less than 2 documents
    max_df=0.8           # ignore terms that appear in more than 80% of documents
)

In [21]:
pipeline.fit(X_train, y_train)

In [22]:
y_pred = pipeline.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9127957931638914

Confusion Matrix:
 [[1625   25]
 [ 174  458]]

Classification Report:
               precision    recall  f1-score   support

       bukan       0.90      0.98      0.94      1650
       judol       0.95      0.72      0.82       632

    accuracy                           0.91      2282
   macro avg       0.93      0.85      0.88      2282
weighted avg       0.92      0.91      0.91      2282



In [23]:
models = {
    'LogisticRegression': LogisticRegression(
        class_weight='balanced',
        random_state=42,
        max_iter=1000
    ),
    'RandomForest': RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
        n_estimators=100
    )
}

‚úÖ Model saved as judol_classifier_model.pkl


In [26]:
# 7Ô∏è‚É£ Contoh prediksi
sample_texts = [
    "sgi88 slot bonus 100% deposit 25rb saja",
    "hati-hati dengan judi online, saya bangkrut karenanya",
    "grok681h bakal running cycle ini udah ga perlu tanya",
    "jangan main judi online",
    'makan bakso',
    'main bulutangkis'
]

predictions = pipeline.predict(sample_texts)
for text, pred in zip(sample_texts, predictions):
    print(f"[{pred}] {text}")

[judol] sgi88 slot bonus 100% deposit 25rb saja
[judol] hati-hati dengan judi online, saya bangkrut karenanya
[judol] grok681h bakal running cycle ini udah ga perlu tanya
[judol] jangan main judi online
[bukan] makan bakso
[judol] main bulutangkis


In [31]:
import pandas as pd
import re
from tqdm import tqdm

class AdvancedJudiLabelingEngine:
    def __init__(self):
        # 1Ô∏è‚É£ Strong brands
        self.strong_brands = [
            'pesiar88', 'mbak4d2', 'g3d3', 'sor76', 'squad777', 'inigrok681h',
            'tapidora77', 'cobadora77', 'denyut69', 'gadaob4t', 'major189', 'starstruck',
            'tkp189', 'grokk681h', 'dora77', 'pakcoy', 'derr', 'sgi88','sg188','sgi808',
            'sgi888','sgi','sg','pstoto','pstoto99','pstoto88','pstoto77','psto',
            'arwanatoto','arwana','toto','pulauwin','pulau','win','lazadatoto','lazada4d',
            'lazada88','lazada77','lazada','visi4d','visi','jaya4d','mega4d','super4d',
            'ultra4d','prime4d','royal4d','king4d','queen4d','pro4d','max4d','gold4d',
            'silver4d','bronze4d','new4d','neo4d','alpha4d','beta4d','omega4d','delta4d',
            'city4d','metro4d','urban4d','capital4d','luck4d','fortune4d','rich4d','wealth4d',
            'star4d','moon4d','sun4d','galaxy4d','speed4d','quick4d','fast4d','instant4d',
            'insan4d','pandora4d','naga4d','hoki4d','paste4d','sendal4d','sekali4d',
            'togel62','garudahoki','garuda','hoki','dewapoker','pokermasa','masapoker',
            'karturapi','dominoqq','bandarqq','capsasusun','cemeonline','berkahslot','berkah',
            'slot','mini1221','mini12211','mini','mini88','mini77','mini99','mini55','mini33',
            'mini22','mini11','mini123','mini321','zeus','bibit168','bibit169','cilik168',
            'grok681h','tapigrok681h','samagrok681h','xrpgrok681h','hpgrok681h'
        ]

        # 2Ô∏è‚É£ Pattern detection
        self.patterns = [
            r'mini\d+', r'maxi\d+', r'mega\d+', r'super\d+', r'pro\d+',
            r'royal\d+', r'king\d+', r'queen\d+', r'\b\d{4,}\b',
            r'[a-z]{3,}\d{2,}', r'\w*slot\w*', r'\w*togel\w*',
            r'\w*judi\w*', r'\w*poker\w*', r'\w*casino\w*',
            r'[a-z]{3,}4d', r'[a-z]{3,}\s*4[dD]',
            r'\b[a-z]\d+[a-z]\d*\b',      
            r'\b[a-z]+\d+[a-z]+\d*\b',    
            r'\b[a-z]+\d{3,}\b',          
        ]

        # 3Ô∏è‚É£ Domain keywords
        self.domain_keywords = [
            'togel','slot','judi','poker','casino','taruhan','betting','bola','scatter',
            'jackpot','menang','rezeki','untung','profit','bonus','main','eth','btc','bnb',
            'portofolio','buy','sell','pump','market'
        ]

    # -------------------------
    # Detection functions
    # -------------------------
    def detect_strong_brands(self, text):
        text_lower = text.lower()
        return [brand for brand in self.strong_brands if brand in text_lower]

    def detect_patterns(self, text):
        text_lower = text.lower()
        found = []
        for pattern in self.patterns:
            found.extend(re.findall(pattern, text_lower))
        return found

    def rule_based_labeling(self, text):
        """Labeling berbasis rule"""
        brands = self.detect_strong_brands(text)
        patterns = self.detect_patterns(text)
        domain_hits = [kw for kw in self.domain_keywords if kw in text.lower()]
        details = {'strong_brands': brands, 'patterns': patterns, 'domain_hits': domain_hits}

        # Rule: strong brand + domain = judol
        if brands and domain_hits:
            return 'judol', details
        # Rule: patterns + domain = judol
        if patterns and domain_hits:
            return 'judol', details
        # Default: bukan judol
        return 'bukan', details

    # -------------------------
    # Label dataset
    # -------------------------
    def label_dataset(self, df, text_column='comment_text'):
        """Label seluruh dataset"""
        df = df.dropna(subset=[text_column]).reset_index(drop=True)
        df['label_ultimate'] = None
        df['strong_brands_detected'] = ''
        df['patterns_detected'] = ''

        for idx, row in tqdm(df.iterrows(), total=len(df)):
            text = row[text_column]
            label, details = self.rule_based_labeling(text)
            df.at[idx, 'label_ultimate'] = label
            df.at[idx, 'strong_brands_detected'] = ','.join(details['strong_brands'])
            df.at[idx, 'patterns_detected'] = ','.join(details['patterns'])
        return df

    # -------------------------
    # Analysis function
    # -------------------------
    def analyze_labeling_results(self, df):
        """Analisis distribusi label dan brand"""
        print("\n" + "="*60)
        print("üìä ADVANCED LABELING RESULTS ANALYSIS")
        print("="*60)
        label_counts = df['label_ultimate'].value_counts()
        for label, count in label_counts.items():
            pct = count / len(df) * 100
            print(f"{label:10} : {count:5} ({pct:.2f}%)")

        print("\nüîç Strong Brands Detection (Top 10):")
        all_brands = df['strong_brands_detected'].str.split(',').explode()
        brand_counts = all_brands.value_counts()
        for brand, count in brand_counts.head(10).items():
            print(f"{brand:20} : {count:4}")

# =======================
# Example usage
# =======================
if __name__ == "__main__":
    # Load CSV
    df = pd.read_csv('labeled_comments.csv')

    # Initialize labeling engine
    labeler = AdvancedJudiLabelingEngine()

    # Label dataset
    labeled_df = labeler.label_dataset(df)

    # Analyze results
    labeler.analyze_labeling_results(labeled_df)

    # Show sample
    print("\nSample labeled data:")
    print(labeled_df[['comment_text','label_ultimate','strong_brands_detected','patterns_detected']])


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 11409/11409 [00:01<00:00, 6138.98it/s]



üìä ADVANCED LABELING RESULTS ANALYSIS
bukan      : 10752 (94.24%)
judol      :   657 (5.76%)

üîç Strong Brands Detection (Top 10):
                     : 9761
zeus                 :  426
pesiar88             :  377
toto                 :  226
slot                 :  121
grok681h             :  104
win                  :   99
mbak4d2              :   95
pulau                :   92
sekali4d             :   55

Sample labeled data:
                                            comment_text label_ultimate  \
0               jauh yang nama judi online kawan percaya          judol   
1      buat apa untung mboja mereka kalau ga karna ju...          judol   
2      sekali4d maxswin bilang jp pala kou modal aj b...          bukan   
3      ada nama judol dayak 777 heyy orang dayak nama...          judol   
4      tpi klo sya mh maen slot ga prnah ngarepin mna...          judol   
...                                                  ...            ...   
11404                                