In [None]:
!git clone https://huggingface.co/yazidsupriadi/bot-detection-indobert

Cloning into 'bot-detection-indobert'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (3/3), done.[K
remote: Total 72 (delta 0), reused 0 (delta 0), pack-reused 69 (from 1)[K
Unpacking objects: 100% (72/72), 1.91 MiB | 9.14 MiB/s, done.


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import numpy as np
import joblib
import gradio as gr

# ================================
# Define model
# ================================
class IndoBERTBotClassifier(nn.Module):
    """IndoBERT encoder with an MLP head over text and numeric features.

    The encoder's first-token hidden state is concatenated with the numeric
    feature vector and fed through a two-layer classifier that emits one raw
    logit per example (no sigmoid is applied inside the model).

    Args:
        num_numerical_features: Width of the numeric feature vector that is
            appended to the text embedding. Defaults to 5, the value that was
            previously hard-coded.
    """

    def __init__(self, num_numerical_features=5):
        super().__init__()
        self.bert = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
        # Take the embedding width from the encoder config rather than
        # hard-coding 768, so the head always matches the loaded encoder.
        hidden_size = self.bert.config.hidden_size
        # The Sequential layout is intentionally unchanged: its parameter names
        # (classifier.0.*, classifier.3.*) must keep matching saved state dicts.
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size + num_numerical_features, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask, numerical_features):
        """Return the raw logit(s) for a batch.

        Args:
            input_ids: Token ids passed straight to the encoder.
            attention_mask: Attention mask passed straight to the encoder.
            numerical_features: 2-D tensor concatenated to the text embedding
                along dim 1; its width must equal ``num_numerical_features``.

        Returns:
            The classifier output with all size-1 dimensions squeezed away, so
            a single-example batch yields a 0-dim tensor.
        """
        bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 as the sentence representation.
        pooled_output = bert_output.last_hidden_state[:, 0, :]
        combined = torch.cat((pooled_output, numerical_features), dim=1)
        logits = self.classifier(combined)
        return logits.squeeze()

# ================================
# Load tokenizer, model, scaler
# ================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

model = IndoBERTBotClassifier()
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint in the cloned repo cannot run code while loading.
# (The keyword requires a reasonably recent PyTorch release.)
state_dict = torch.load(
    "./bot-detection-indobert/pytorch_model.bin",
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)
model.to(device)
model.eval()  # switch off dropout for inference

# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# scaler.pkl from a repository you trust.
scaler = joblib.load("./bot-detection-indobert/scaler.pkl")

print("✅ Model, tokenizer, scaler loaded successfully.")

# ================================
# Prediction function
# ================================
def predict_gradio(text, favorite_count, retweet_count, reply_count, quote_count, tweet_per_day, model_choice):
    """Classify one tweet as "Bot" or "Human" and format the result for the UI.

    Args:
        text: Tweet text to tokenize.
        favorite_count: Numeric activity feature.
        retweet_count: Numeric activity feature.
        reply_count: Numeric activity feature.
        quote_count: Numeric activity feature.
        tweet_per_day: Numeric activity feature.
        model_choice: Label echoed in the output; it does not select a model.

    Returns:
        A three-line string with the model label, the predicted class and the
        sigmoid output of the model formatted to four decimals.
    """
    # tweet_per_day goes first even though it is the last numeric argument.
    # NOTE(review): presumably this is the column order the scaler was fitted
    # with - confirm against the training code.
    numeric_features = [tweet_per_day, favorite_count, retweet_count, reply_count, quote_count]

    # Tokenize to a fixed length of 128 tokens (padded / truncated).
    tokens = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    # Scale the numeric features as a single-row batch.
    numeric_scaled = scaler.transform(np.array([numeric_features]))
    numeric_tensor = torch.tensor(numeric_scaled, dtype=torch.float32).to(device)

    # Predict: the model returns a raw logit, so the sigmoid is applied here.
    with torch.no_grad():
        logits = model(input_ids, attention_mask, numeric_tensor)
        prob = torch.sigmoid(logits).item()
        label = "Bot" if prob > 0.5 else "Human"

    # The emoji were previously stored mis-encoded and rendered as garbage.
    return f"""📝 Model: {model_choice}
📌 Prediction: {label}
🔢 Confidence: {prob:.4f}"""

# ================================
# Gradio interface
# ================================
# The input components are listed in the same order as predict_gradio's
# positional parameters; Gradio passes their values positionally.
demo = gr.Interface(
    fn=predict_gradio,
    inputs=[
        gr.Textbox(label="Teks Tweet"),
        gr.Slider(0, 10000, step=1, label="Favorite Count"),
        gr.Slider(0, 10000, step=1, label="Retweet Count"),
        gr.Slider(0, 10000, step=1, label="Reply Count"),
        gr.Slider(0, 10000, step=1, label="Quote Count"),
        gr.Slider(0.0, 100.0, step=0.1, label="Tweet Per Day"),
        # Preselect the only choice: without a default the radio submits None
        # and the result text would read "Model: None".
        gr.Radio(["IndoBERT"], value="IndoBERT", label="Pilih Model")
    ],
    outputs=gr.Textbox(label="Hasil Prediksi"),
    title="Deteksi Akun Bot (IndoBERT + Fitur Numerik)",
    description="Prediksi apakah sebuah akun merupakan bot berdasarkan teks tweet dan fitur aktivitas.",
    # NOTE(review): newer Gradio releases appear to rename this argument to
    # `flagging_mode` - confirm against the installed version.
    allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch()


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

‚úÖ Model, tokenizer, scaler loaded successfully.




It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://99637b3f76fb065e47.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
!git clone https://huggingface.co/yazidsupriadi/indo_lstm_bot

Cloning into 'indo_lstm_bot'...
remote: Enumerating objects: 581, done.[K
remote: Counting objects: 100% (578/578), done.[K
remote: Compressing objects: 100% (578/578), done.[K
remote: Total 581 (delta 269), reused 0 (delta 0), pack-reused 3 (from 1)[K
Receiving objects: 100% (581/581), 542.62 KiB | 8.48 MiB/s, done.
Resolving deltas: 100% (269/269), done.
Filtering content: 100% (3/3), 953.09 MiB | 35.78 MiB/s, done.


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import numpy as np
import gradio as gr

class IndoBERT_LSTM(nn.Module):
    """Text-only classifier: IndoBERT token states summarised by an LSTM.

    The encoder runs without gradient tracking; its per-token hidden states
    feed a single-layer LSTM whose final hidden state goes through a linear
    layer and a sigmoid.
    """

    def __init__(self):
        super().__init__()
        encoder = AutoModel.from_pretrained("indobenchmark/indobert-base-p1")
        self.bert = encoder
        self.lstm = nn.LSTM(
            input_size=encoder.config.hidden_size,
            hidden_size=128,
            batch_first=True,
        )
        self.classifier = nn.Linear(in_features=128, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask, numerical_features=None):
        """Return the sigmoid output(s) for a batch.

        ``numerical_features`` is accepted but never read. The result has all
        size-1 dimensions squeezed away.
        """
        # The encoder is evaluated under no_grad, so no gradients reach it.
        with torch.no_grad():
            encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            token_states = encoded.last_hidden_state
        _outputs, (h_n, _c_n) = self.lstm(token_states)
        # h_n[-1] is the final hidden state of the last LSTM layer.
        probabilities = self.sigmoid(self.classifier(h_n[-1]))
        return probabilities.squeeze()

# ================================
# Load assets
# ================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("indobenchmark/indobert-base-p1")

model = IndoBERT_LSTM()
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint in the cloned repo cannot run code while loading.
# (The keyword requires a reasonably recent PyTorch release.)
state_dict = torch.load(
    "./indo_lstm_bot/model_epoch_10.pth",
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)
model.to(device)
model.eval()  # inference mode

print("✅ IndoBERT + LSTM model loaded successfully.")

# ================================
# Prediction function
# ================================
def predict_gradio(text, favorite_count, retweet_count, reply_count, quote_count, tweet_per_day, model_choice):
    """Classify one tweet as "Bot" or "Human" from its text alone.

    Args:
        text: Tweet text to tokenize.
        favorite_count, retweet_count, reply_count, quote_count, tweet_per_day:
            Accepted so the signature matches the UI inputs, but not used.
        model_choice: Label echoed in the output; it does not select a model.

    Returns:
        A three-line string with the model label, the predicted class and the
        model output formatted to four decimals.
    """
    # The numeric inputs are still accepted for UI consistency but are ignored.
    tokens = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    with torch.no_grad():
        # The model's result is used directly as the probability; no extra
        # sigmoid is applied here.
        prob = model(input_ids, attention_mask).item()
    label = "Bot" if prob > 0.5 else "Human"

    # The emoji were previously stored mis-encoded and rendered as garbage.
    return f"""📝 Model: {model_choice}
📌 Prediction: {label}
🔢 Confidence: {prob:.4f}"""

# ================================
# Gradio Interface
# ================================
# The input components are listed in the same order as predict_gradio's
# positional parameters; Gradio passes their values positionally.
# NOTE(review): the title and description mention numeric features, but the
# prediction function in this cell ignores the slider values - confirm the
# wording is intended.
demo = gr.Interface(
    fn=predict_gradio,
    inputs=[
        gr.Textbox(label="Teks Tweet"),
        gr.Slider(0, 10000, step=1, label="Favorite Count"),
        gr.Slider(0, 10000, step=1, label="Retweet Count"),
        gr.Slider(0, 10000, step=1, label="Reply Count"),
        gr.Slider(0, 10000, step=1, label="Quote Count"),
        gr.Slider(0.0, 100.0, step=0.1, label="Tweet Per Day"),
        # Preselect the only choice: without a default the radio submits None
        # and the result text would read "Model: None".
        gr.Radio(["IndoBERT +LSTM"], value="IndoBERT +LSTM", label="Pilih Model")
    ],
    outputs=gr.Textbox(label="Hasil Prediksi"),
    title="Deteksi Akun Bot (IndoBERT + LSTM + Fitur Numerik)",
    description="Prediksi apakah sebuah akun merupakan bot berdasarkan teks tweet dan fitur aktivitas.",
    # NOTE(review): newer Gradio releases appear to rename this argument to
    # `flagging_mode` - confirm against the installed version.
    allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch()


‚úÖ IndoBERT + LSTM model loaded successfully.




It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a4a5a46a7023a98047.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


In [None]:
!git clone https://huggingface.co/yazidsupriadi/mbert_lstm_bot

Cloning into 'mbert_lstm_bot'...
remote: Enumerating objects: 322, done.[K
remote: Counting objects: 100% (319/319), done.[K
remote: Compressing objects: 100% (319/319), done.[K
remote: Total 322 (delta 141), reused 0 (delta 0), pack-reused 3 (from 1)[K
Receiving objects: 100% (322/322), 514.48 KiB | 5.65 MiB/s, done.
Resolving deltas: 100% (141/141), done.
Filtering content: 100% (3/3), 1.32 GiB | 21.98 MiB/s, done.


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
import numpy as np
import joblib
import gradio as gr

# =========================================
# MODEL DEFINITION: mBERT + LSTM + NUMERIC
# =========================================
class mBERT_LSTM(nn.Module):
    """Multilingual BERT + LSTM text branch fused with a numeric-feature MLP.

    The encoder runs without gradient tracking. Its per-token hidden states go
    through a single-layer LSTM; the final hidden state is concatenated with a
    32-wide projection of the numeric features, then passed through a linear
    layer and a sigmoid.

    Args:
        num_numerical_features: Width of the numeric feature vector.
    """

    def __init__(self, num_numerical_features):
        super().__init__()
        encoder = AutoModel.from_pretrained("bert-base-multilingual-cased")
        self.bert = encoder
        self.lstm = nn.LSTM(
            input_size=encoder.config.hidden_size,
            hidden_size=128,
            batch_first=True,
        )

        # Numeric branch: num_numerical_features -> 64 -> 32, ReLU after each.
        numeric_layers = [
            nn.Linear(num_numerical_features, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
        ]
        self.num_fc = nn.Sequential(*numeric_layers)

        # The head consumes the LSTM state (128) plus the numeric branch (32).
        self.classifier = nn.Linear(in_features=128 + 32, out_features=1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input_ids, attention_mask, numerical_features):
        """Return the sigmoid output(s) for a batch, size-1 dims squeezed away."""
        # The encoder is evaluated under no_grad, so no gradients reach it.
        with torch.no_grad():
            encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
            token_states = encoded.last_hidden_state
        _outputs, (h_n, _c_n) = self.lstm(token_states)
        text_repr = h_n[-1]  # final hidden state of the last LSTM layer

        numeric_repr = self.num_fc(numerical_features)
        fused = torch.cat((text_repr, numeric_repr), dim=1)
        probabilities = self.sigmoid(self.classifier(fused))
        return probabilities.squeeze()

# ================================
# LOAD MODEL, TOKENIZER, SCALER
# ================================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

model = mBERT_LSTM(num_numerical_features=5)
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint in the cloned repo cannot run code while loading.
# (The keyword requires a reasonably recent PyTorch release.)
state_dict = torch.load(
    "./mbert_lstm_bot/model_epoch_10.pt",
    map_location=device,
    weights_only=True,
)
model.load_state_dict(state_dict)
model.to(device)
model.eval()  # inference mode

# NOTE(review): joblib.load unpickles arbitrary Python objects; only load
# scaler.pkl from a repository you trust.
scaler = joblib.load("./mbert_lstm_bot/scaler.pkl")
print("✅ mBERT + LSTM model loaded successfully.")

# ================================
# INFERENCE FUNCTION
# ================================
def predict_gradio(text, favorite_count, retweet_count, reply_count, quote_count, tweet_per_day, model_choice):
    """Classify one tweet as "Bot" or "Human" and format the result for the UI.

    Args:
        text: Tweet text to tokenize.
        favorite_count: Numeric activity feature.
        retweet_count: Numeric activity feature.
        reply_count: Numeric activity feature.
        quote_count: Numeric activity feature.
        tweet_per_day: Numeric activity feature.
        model_choice: Label echoed in the output; it does not select a model.

    Returns:
        A three-line string with the model label, the predicted class and the
        model output formatted to four decimals.
    """
    # tweet_per_day goes first even though it is the last numeric argument.
    # NOTE(review): presumably this is the column order the scaler was fitted
    # with - confirm against the training code.
    numeric_features = [tweet_per_day, favorite_count, retweet_count, reply_count, quote_count]
    numeric_scaled = scaler.transform(np.array([numeric_features]))
    numeric_tensor = torch.tensor(numeric_scaled, dtype=torch.float32).to(device)

    # Tokenize to a fixed length of 128 tokens (padded / truncated).
    tokens = tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
    input_ids = tokens['input_ids'].to(device)
    attention_mask = tokens['attention_mask'].to(device)

    with torch.no_grad():
        # The model's result is used directly as the probability; no extra
        # sigmoid is applied here.
        prob = model(input_ids, attention_mask, numeric_tensor).item()
    label = "Bot" if prob > 0.5 else "Human"

    # The emoji were previously stored mis-encoded and rendered as garbage.
    return f"""📝 Model: {model_choice}
📌 Prediction: {label}
🔢 Confidence: {prob:.4f}"""

# ================================
# GRADIO INTERFACE
# ================================
# The input components are listed in the same order as predict_gradio's
# positional parameters; Gradio passes their values positionally.
demo = gr.Interface(
    fn=predict_gradio,
    inputs=[
        gr.Textbox(label="Teks Tweet"),
        gr.Slider(0, 10000, step=1, label="Favorite Count"),
        gr.Slider(0, 10000, step=1, label="Retweet Count"),
        gr.Slider(0, 10000, step=1, label="Reply Count"),
        gr.Slider(0, 10000, step=1, label="Quote Count"),
        gr.Slider(0.0, 100.0, step=0.1, label="Tweet Per Day"),
        # Preselect the only choice: without a default the radio submits None
        # and the result text would read "Model: None".
        gr.Radio(["mBERT + LSTM"], value="mBERT + LSTM", label="Pilih Model")
    ],
    outputs=gr.Textbox(label="Hasil Prediksi"),
    title="Deteksi Bot (mBERT + LSTM + Fitur Numerik)",
    description="Prediksi akun bot di platform X menggunakan model mBERT + LSTM dengan fitur aktivitas akun.",
    # NOTE(review): newer Gradio releases appear to rename this argument to
    # `flagging_mode` - confirm against the installed version.
    allow_flagging="never"
)

if __name__ == "__main__":
    demo.launch()


‚úÖ mBERT + LSTM model loaded successfully.




It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://38c9112bc2c9e52815.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
