# McHacks 26 - Bot or Not

### Imports

In [25]:
import numpy as np
import matplotlib as plt
from pathlib import Path
import json
import pandas as pd
import re
from transformers import AutoTokenizer
from sklearn.preprocessing import StandardScaler

### Data processing

In [26]:
DATA_DIR = Path("data")

def get_version(path):
    try:
        return int(path.stem.split(".")[-1])
    except ValueError:
        return None

posts_users_files = sorted(DATA_DIR.glob("dataset.posts&users.*.json"), key=get_version)
if not posts_users_files:
    raise FileNotFoundError("No dataset.posts&users.*.json files found in data/")

combined = {}
bots_by_lang = {}

for path in posts_users_files:
    with path.open() as f:
        data = json.load(f)
    lang = data.get("lang")

    combined.setdefault(lang, {"posts": [], "users": [], "sources": []})
    combined[lang]["posts"].extend(data.get("posts", []))
    combined[lang]["users"].extend(data.get("users", []))
    combined[lang]["sources"].append(path.name)

    version = get_version(path)
    if version is not None:
        bots_path = DATA_DIR / f"dataset.bots.{version}.txt"
        if bots_path.exists():
            bots_by_lang.setdefault(lang, set()).update(bots_path.read_text().splitlines())

posts_en = pd.DataFrame(combined.get("en", {}).get("posts", []))
users_en = pd.DataFrame(combined.get("en", {}).get("users", []))
bot_ids_en = bots_by_lang.get("en", set())
if not users_en.empty:
    users_en["is_bot"] = users_en["id"].isin(bot_ids_en)

posts_fr = pd.DataFrame(combined.get("fr", {}).get("posts", []))
users_fr = pd.DataFrame(combined.get("fr", {}).get("users", []))
bot_ids_fr = bots_by_lang.get("fr", set())
if not users_fr.empty:
    users_fr["is_bot"] = users_fr["id"].isin(bot_ids_fr)

print("EN sources:", combined.get("en", {}).get("sources", []))
print(f"EN posts: {len(posts_en):,} users: {len(users_en):,} bot_ids: {len(bot_ids_en):,}")
print("FR sources:", combined.get("fr", {}).get("sources", []))
print(f"FR posts: {len(posts_fr):,} users: {len(users_fr):,} bot_ids: {len(bot_ids_fr):,}")


EN sources: ['dataset.posts&users.30.json', 'dataset.posts&users.32.json']
EN posts: 15,765 users: 546 bot_ids: 129
FR sources: ['dataset.posts&users.31.json', 'dataset.posts&users.33.json']
FR posts: 9,004 users: 343 bot_ids: 55


### Experiment parameters

In [27]:
EXPERIMENT_CONFIG = {
    "tokenizer_name": "vinai/bertweet-base",
    "max_length": 96,
    "dedupe_users": True,
    "dedupe_posts": True,
    "scale_meta_features": True,
    "use_topic_features": True,
    "topic_match_mode": "word",  # options: "contains", "word"
    "test_size": 0.20,
    "random_seed": 42,
    "validation_split": 0.15,
    "epochs": 8,
    "batch_size": 128,
    "learning_rate": 0.001,
    "prediction_threshold": 0.62,
    "use_class_weights": False,
    "embedding_dim": 96,
    "gru_units": 48,
    "aux_dense_units": 32,
    "head_dense_units": 48,
    "dropout_text": 0.40,
    "dropout_aux": 0.30,
    "dropout_head": 0.40,
    "early_stopping_patience": 1,
    "reduce_lr_patience": 1,
    "reduce_lr_factor": 0.50,
    "reduce_lr_min_lr": 1e-5,
}

print("Experiment config loaded:")
for key, value in EXPERIMENT_CONFIG.items():
    print(f"- {key}: {value}")


Experiment config loaded:
- tokenizer_name: vinai/bertweet-base
- max_length: 96
- dedupe_users: True
- dedupe_posts: True
- scale_meta_features: True
- use_topic_features: True
- topic_match_mode: word
- test_size: 0.2
- random_seed: 42
- validation_split: 0.15
- epochs: 8
- batch_size: 128
- learning_rate: 0.001
- prediction_threshold: 0.62
- use_class_weights: False
- embedding_dim: 96
- gru_units: 48
- aux_dense_units: 32
- head_dense_units: 48
- dropout_text: 0.4
- dropout_aux: 0.3
- dropout_head: 0.4
- early_stopping_patience: 1
- reduce_lr_patience: 1
- reduce_lr_factor: 0.5
- reduce_lr_min_lr: 1e-05


### Tokenizing the text

In [28]:
import re

try:
    from transformers import AutoTokenizer
except ImportError as exc:
    raise ImportError("Install transformers first: pip install transformers") from exc

from sklearn.preprocessing import StandardScaler

if "EXPERIMENT_CONFIG" not in globals():
    raise ValueError("Run the Experiment parameters cell first.")

TOKENIZER_NAME = str(EXPERIMENT_CONFIG["tokenizer_name"])
MAX_LENGTH = int(EXPERIMENT_CONFIG["max_length"])
DEDUPE_USERS = bool(EXPERIMENT_CONFIG["dedupe_users"])
DEDUPE_POSTS = bool(EXPERIMENT_CONFIG["dedupe_posts"])
SCALE_META_FEATURES = bool(EXPERIMENT_CONFIG["scale_meta_features"])

def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.replace("\n", " ").strip()
    text = re.sub(r"\s+", " ", text)
    return text

def add_text_features(df):
    out = df.copy()
    out["text_clean"] = out["text"].fillna("").map(clean_text)
    out["char_count"] = out["text_clean"].str.len()
    out["word_count"] = out["text_clean"].str.split().str.len()
    out["url_count"] = out["text_clean"].str.count(r"https?://\S+|www\.\S+")
    out["mention_count"] = out["text_clean"].str.count(r"@\w+")
    out["hashtag_count"] = out["text_clean"].str.count(r"#\w+")
    out["exclamation_count"] = out["text_clean"].str.count(r"!")
    out["question_count"] = out["text_clean"].str.count(r"\?")
    return out

if posts_en.empty or users_en.empty:
    raise ValueError("Run the data processing cell first to load English data.")

users_en_labeled = (
    users_en.drop_duplicates(subset="id", keep="last").copy()
    if DEDUPE_USERS
    else users_en.copy()
)
posts_en_unique = (
    posts_en.drop_duplicates(subset="id", keep="last").copy()
    if DEDUPE_POSTS
    else posts_en.copy()
)

label_map_en = users_en_labeled.set_index("id")["is_bot"]
train_en = posts_en_unique.copy()
train_en["is_bot"] = train_en["author_id"].map(label_map_en)
train_en = train_en.dropna(subset=["is_bot"]).copy()
train_en["is_bot"] = train_en["is_bot"].astype("int64")

train_en = add_text_features(train_en)

tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, use_fast=True)
encodings_en = tokenizer(
    train_en["text_clean"].tolist(),
    truncation=True,
    padding="max_length",
    max_length=MAX_LENGTH,
    return_attention_mask=True,
)

feature_cols_en = [
    "char_count",
    "word_count",
    "url_count",
    "mention_count",
    "hashtag_count",
    "exclamation_count",
    "question_count",
]
X_meta_en = train_en[feature_cols_en].to_numpy(dtype=np.float32)
y_en = train_en["is_bot"].to_numpy(dtype=np.int64)

if SCALE_META_FEATURES:
    scaler_en = StandardScaler()
    X_meta_en_scaled = scaler_en.fit_transform(X_meta_en).astype(np.float32)
else:
    scaler_en = None
    X_meta_en_scaled = X_meta_en.copy()

print(f"Tokenizer: {TOKENIZER_NAME}")
print(f"English labeled posts: {len(train_en):,}")
print(f"Token tensor shape: {np.asarray(encodings_en['input_ids']).shape}")
print(f"Meta feature shape: {X_meta_en_scaled.shape}, label shape: {y_en.shape}")
print(f"Dedupe users/posts: {DEDUPE_USERS}/{DEDUPE_POSTS}")
print(f"Scale metadata features: {SCALE_META_FEATURES}")




Tokenizer: vinai/bertweet-base
English labeled posts: 15,765
Token tensor shape: (15765, 96)
Meta feature shape: (15765, 7), label shape: (15765,)
Dedupe users/posts: True/True
Scale metadata features: True


## Train-Test split for model

In [29]:
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.utils.class_weight import compute_class_weight

try:
    import tensorflow as tf
except ImportError as exc:
    raise ImportError("Install tensorflow first: pip install tensorflow") from exc

if "EXPERIMENT_CONFIG" not in globals():
    raise ValueError("Run the Experiment parameters cell first.")

TEST_SIZE = float(EXPERIMENT_CONFIG["test_size"])
RANDOM_SEED = int(EXPERIMENT_CONFIG["random_seed"])
VALIDATION_SPLIT = float(EXPERIMENT_CONFIG["validation_split"])
EPOCHS = int(EXPERIMENT_CONFIG["epochs"])
BATCH_SIZE = int(EXPERIMENT_CONFIG["batch_size"])
LEARNING_RATE = float(EXPERIMENT_CONFIG["learning_rate"])
PREDICTION_THRESHOLD = float(EXPERIMENT_CONFIG["prediction_threshold"])
USE_CLASS_WEIGHTS = bool(EXPERIMENT_CONFIG["use_class_weights"])
USE_TOPIC_FEATURES = bool(EXPERIMENT_CONFIG["use_topic_features"])
TOPIC_MATCH_MODE = str(EXPERIMENT_CONFIG["topic_match_mode"])

EMBEDDING_DIM = int(EXPERIMENT_CONFIG["embedding_dim"])
GRU_UNITS = int(EXPERIMENT_CONFIG["gru_units"])
AUX_DENSE_UNITS = int(EXPERIMENT_CONFIG["aux_dense_units"])
HEAD_DENSE_UNITS = int(EXPERIMENT_CONFIG["head_dense_units"])
DROPOUT_TEXT = float(EXPERIMENT_CONFIG["dropout_text"])
DROPOUT_AUX = float(EXPERIMENT_CONFIG["dropout_aux"])
DROPOUT_HEAD = float(EXPERIMENT_CONFIG["dropout_head"])

EARLY_STOPPING_PATIENCE = int(EXPERIMENT_CONFIG["early_stopping_patience"])
REDUCE_LR_PATIENCE = int(EXPERIMENT_CONFIG["reduce_lr_patience"])
REDUCE_LR_FACTOR = float(EXPERIMENT_CONFIG["reduce_lr_factor"])
REDUCE_LR_MIN_LR = float(EXPERIMENT_CONFIG["reduce_lr_min_lr"])

if not (0.0 < TEST_SIZE < 1.0):
    raise ValueError("test_size must be between 0 and 1.")
if not (0.0 <= VALIDATION_SPLIT < 1.0):
    raise ValueError("validation_split must be in [0, 1).")
if TOPIC_MATCH_MODE not in {"contains", "word"}:
    raise ValueError('topic_match_mode must be "contains" or "word".')

def load_english_topic_keywords():
    topic_keywords = {}
    for source_name in combined.get("en", {}).get("sources", []):
        source_path = DATA_DIR / source_name
        with source_path.open() as f:
            payload = json.load(f)
        for topic_item in payload.get("metadata", {}).get("topics", []):
            topic = str(topic_item.get("topic", "")).strip().lower()
            if not topic:
                continue
            keywords = {
                str(keyword).strip().lower()
                for keyword in topic_item.get("keywords", [])
                if str(keyword).strip()
            }
            keywords.add(topic)
            topic_keywords.setdefault(topic, set()).update(keywords)
    return {topic: sorted(values, key=len, reverse=True) for topic, values in topic_keywords.items()}

def add_topic_features(df, topic_keywords, match_mode):
    out = df.copy()
    text_lower = out["text_clean"].str.lower()
    topic_cols = []
    for topic, keywords in topic_keywords.items():
        col = f"topic_{topic}"
        topic_cols.append(col)
        if not keywords:
            out[col] = 0
            continue
        if match_mode == "word":
            pattern = "|".join(rf"\\b{re.escape(keyword)}\\b" for keyword in keywords)
        else:
            pattern = "|".join(re.escape(keyword) for keyword in keywords)
        out[col] = text_lower.str.contains(pattern, regex=True).astype(np.int8)
    return out, topic_cols

if USE_TOPIC_FEATURES:
    topic_keywords_en = load_english_topic_keywords()
    train_en_model, topic_feature_cols_en = add_topic_features(train_en, topic_keywords_en, TOPIC_MATCH_MODE)
else:
    train_en_model = train_en.copy()
    topic_feature_cols_en = []

input_ids_en = np.asarray(encodings_en["input_ids"], dtype=np.int32)
attention_mask_en = np.asarray(encodings_en["attention_mask"], dtype=np.float32)
X_topic_en = (
    train_en_model[topic_feature_cols_en].to_numpy(dtype=np.float32)
    if topic_feature_cols_en
    else np.zeros((len(train_en_model), 0), dtype=np.float32)
)
X_aux_en = np.concatenate([X_meta_en_scaled, X_topic_en], axis=1)

idx = np.arange(len(y_en))
train_idx, test_idx = train_test_split(
    idx,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=y_en,
)

X_train_ids, X_test_ids = input_ids_en[train_idx], input_ids_en[test_idx]
X_train_mask, X_test_mask = attention_mask_en[train_idx], attention_mask_en[test_idx]
X_train_aux, X_test_aux = X_aux_en[train_idx], X_aux_en[test_idx]
y_train, y_test = y_en[train_idx], y_en[test_idx]

class_weight_dict = None
if USE_CLASS_WEIGHTS:
    classes = np.unique(y_train)
    class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
    class_weight_dict = {int(c): float(w) for c, w in zip(classes, class_weights)}

def build_multifeature_model(
    vocab_size,
    seq_len,
    aux_dim,
    embedding_dim,
    gru_units,
    aux_dense_units,
    head_dense_units,
    dropout_text,
    dropout_aux,
    dropout_head,
    learning_rate,
):
    ids_input = tf.keras.layers.Input(shape=(seq_len,), dtype="int32", name="input_ids")
    mask_input = tf.keras.layers.Input(shape=(seq_len,), dtype="float32", name="attention_mask")
    aux_input = tf.keras.layers.Input(shape=(aux_dim,), dtype="float32", name="aux_features")

    x = tf.keras.layers.Embedding(vocab_size, embedding_dim, name="token_embedding")(ids_input)
    mask = tf.keras.layers.Reshape((seq_len, 1))(mask_input)
    x = tf.keras.layers.Multiply()([x, mask])
    x = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(gru_units))(x)
    x = tf.keras.layers.Dropout(dropout_text)(x)

    aux = tf.keras.layers.Dense(aux_dense_units, activation="relu")(aux_input)
    aux = tf.keras.layers.Dropout(dropout_aux)(aux)

    merged = tf.keras.layers.Concatenate()([x, aux])
    merged = tf.keras.layers.Dense(head_dense_units, activation="relu")(merged)
    merged = tf.keras.layers.Dropout(dropout_head)(merged)
    output = tf.keras.layers.Dense(1, activation="sigmoid")(merged)

    model = tf.keras.Model(inputs=[ids_input, mask_input, aux_input], outputs=output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss="binary_crossentropy",
        metrics=[tf.keras.metrics.BinaryAccuracy(name="accuracy"), tf.keras.metrics.AUC(name="auc")],
    )
    return model

model_en = build_multifeature_model(
    vocab_size=tokenizer.vocab_size,
    seq_len=MAX_LENGTH,
    aux_dim=X_train_aux.shape[1],
    embedding_dim=EMBEDDING_DIM,
    gru_units=GRU_UNITS,
    aux_dense_units=AUX_DENSE_UNITS,
    head_dense_units=HEAD_DENSE_UNITS,
    dropout_text=DROPOUT_TEXT,
    dropout_aux=DROPOUT_AUX,
    dropout_head=DROPOUT_HEAD,
    learning_rate=LEARNING_RATE,
)

callbacks = []
if EARLY_STOPPING_PATIENCE > 0:
    callbacks.append(
        tf.keras.callbacks.EarlyStopping(
            monitor="val_auc",
            mode="max",
            patience=EARLY_STOPPING_PATIENCE,
            restore_best_weights=True,
        )
    )
if REDUCE_LR_PATIENCE > 0:
    callbacks.append(
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor="val_auc",
            mode="max",
            factor=REDUCE_LR_FACTOR,
            patience=REDUCE_LR_PATIENCE,
            min_lr=REDUCE_LR_MIN_LR,
        )
    )

history_en = model_en.fit(
    {
        "input_ids": X_train_ids,
        "attention_mask": X_train_mask,
        "aux_features": X_train_aux,
    },
    y_train,
    validation_split=VALIDATION_SPLIT,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    class_weight=class_weight_dict,
    callbacks=callbacks,
    verbose=1,
)

y_prob = model_en.predict(
    {
        "input_ids": X_test_ids,
        "attention_mask": X_test_mask,
        "aux_features": X_test_aux,
    },
    verbose=0,
).ravel()
y_pred = (y_prob >= PREDICTION_THRESHOLD).astype(np.int64)

print(f"Topic features enabled: {USE_TOPIC_FEATURES}")
print(f"Topic match mode: {TOPIC_MATCH_MODE}")
print("Topic columns:", topic_feature_cols_en)
print("Train/Test sizes:", len(train_idx), len(test_idx))
print(f"Test Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"Test ROC-AUC: {roc_auc_score(y_test, y_prob):.4f}")
print(classification_report(y_test, y_pred, digits=4))


Epoch 1/8
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 162ms/step - accuracy: 0.7372 - auc: 0.7434 - loss: 0.5359 - val_accuracy: 0.8679 - val_auc: 0.8553 - val_loss: 0.3691 - learning_rate: 0.0010
Epoch 2/8
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 162ms/step - accuracy: 0.8961 - auc: 0.9008 - loss: 0.3012 - val_accuracy: 0.8800 - val_auc: 0.8580 - val_loss: 0.3575 - learning_rate: 0.0010
Epoch 3/8
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 159ms/step - accuracy: 0.9257 - auc: 0.9430 - loss: 0.2240 - val_accuracy: 0.8663 - val_auc: 0.8460 - val_loss: 0.3965 - learning_rate: 0.0010
Topic features enabled: True
Topic match mode: word
Topic columns: ['topic_pop', 'topic_nba', 'topic_movies', 'topic_nhl']
Train/Test sizes: 12612 3153
Test Accuracy: 0.8890
Test ROC-AUC: 0.8805
              precision    recall  f1-score   support

           0     0.8862    0.9628    0.9229      2175
           1     0.8975    0.7249    0.802