# French Account-Level Model (PyTorch)

Account-level bot detection for the French split using a hierarchical tweet encoder with `transformers.AutoModel` (PyTorch backend).


## 1. Imports and Config


In [None]:
import json
import random
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

try:
    import torch
    import torch.nn as nn
    from torch.utils.data import Dataset, DataLoader
except ImportError as exc:
    raise ImportError("Install torch first: pip install torch") from exc

try:
    from transformers import AutoTokenizer, AutoModel
except ImportError as exc:
    raise ImportError("Install transformers first: pip install transformers") from exc

# Every tunable value used by the notebook lives in this one mapping.
CONFIG = dict(
    # Data selection / text encoding
    target_lang="fr",
    model_name="cardiffnlp/twitter-xlm-roberta-base",
    max_length=96,
    max_tweets_per_account=16,
    min_tweets_per_account=1,
    # Splitting (both sizes are fractions of the full account set)
    test_size=0.20,
    val_size=0.15,
    random_seed=42,
    # Optimisation
    batch_size=8,
    epochs=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    dropout=0.30,
    # Decision-threshold grid searched on the validation split
    threshold_min=0.05,
    threshold_max=0.95,
    threshold_steps=181,
    # Model / loss switches
    trainable_encoder=True,
    use_weighted_loss=True,
)

# Seed the Python, NumPy and PyTorch generators from the single configured seed.
_seed = CONFIG["random_seed"]
random.seed(_seed)
np.random.seed(_seed)
torch.manual_seed(_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(_seed)

# Use the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", DEVICE)

print("Config loaded:")
for name, value in CONFIG.items():
    print(f"- {name}: {value}")


KeyboardInterrupt: 

## 2. Load French Competition Data


In [None]:
DATA_DIR = Path("data").resolve()


def _version_from_path(path):
    try:
        return int(path.stem.split(".")[-1])
    except ValueError:
        return None


posts_records = []
users_records = []
bot_ids = set()
fr_sources = []


def _version_sort_key(path):
    """Sort key: numerically versioned files first (ascending), unversioned last.

    ``_version_from_path`` returns ``None`` for names without a numeric
    suffix; using it directly as a sort key would raise ``TypeError`` when
    ``None`` is compared with an ``int``.
    """
    version = _version_from_path(path)
    return (version is None, version if version is not None else 0, path.name)


for path in sorted(DATA_DIR.glob("dataset.posts&users.*.json"), key=_version_sort_key):
    # Read explicitly as UTF-8 so non-ASCII tweet text does not depend on the
    # platform's default encoding.
    with path.open(encoding="utf-8") as f:
        payload = json.load(f)

    # Keep only files whose declared language matches the configured target.
    lang = str(payload.get("lang", "")).strip().lower()
    if lang != CONFIG["target_lang"]:
        continue

    fr_sources.append(path.name)
    posts_records.extend(payload.get("posts", []))
    users_records.extend(payload.get("users", []))

    # The matching ground-truth bot list shares the numeric version suffix.
    version = _version_from_path(path)
    if version is not None:
        bot_path = DATA_DIR / f"dataset.bots.{version}.txt"
        if bot_path.exists():
            bot_lines = bot_path.read_text(encoding="utf-8").splitlines()
            bot_ids.update(line.strip() for line in bot_lines if line.strip())

# Fail with an actionable message instead of a KeyError on the missing "id" column.
if not users_records:
    raise RuntimeError(
        f"No users found for lang={CONFIG['target_lang']!r} under {DATA_DIR}; "
        "expected files named dataset.posts&users.<N>.json"
    )

posts_df = pd.DataFrame(posts_records)
users_df = pd.DataFrame(users_records).drop_duplicates(subset=["id"]).copy()
# Label each user by membership in the collected bot-id set.
users_df["is_bot"] = users_df["id"].isin(bot_ids).astype(np.int64)

print("French sources:", fr_sources)
print(f"Posts: {len(posts_df):,}")
print(f"Users: {len(users_df):,}")
print(f"Bot ids: {len(bot_ids):,}")


French sources: ['dataset.posts&users.31.json', 'dataset.posts&users.33.json']
Posts: 9,004
Users: 343
Bot ids: 55


## 3. Build Account Dataset


In [None]:
URL_RE = re.compile(r"https?://\S+|www\.\S+")
MENTION_RE = re.compile(r"@\w+")
HASHTAG_RE = re.compile(r"#(\w+)")
WS_RE = re.compile(r"\s+")

# (pattern, replacement) pairs applied in this order by clean_text.
_TOKEN_REWRITES = (
    (URL_RE, " <url> "),
    (MENTION_RE, " <user> "),
    (HASHTAG_RE, r" \1 "),
)


def clean_text(text):
    """Normalise one tweet for the encoder.

    The text is lower-cased, URLs become ``<url>``, ``@mentions`` become
    ``<user>``, the ``#`` is dropped from hashtags (the word is kept), and
    all whitespace runs collapse to a single space.

    Args:
        text: Raw tweet text; any non-``str`` value is treated as missing.

    Returns:
        The cleaned string, or ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    cleaned = text.replace("\n", " ").strip().lower()
    for pattern, replacement in _TOKEN_REWRITES:
        cleaned = pattern.sub(replacement, cleaned)
    return WS_RE.sub(" ", cleaned).strip()


def _meta_float(value, default=0.0):
    """Coerce a user-metadata value to a finite float.

    ``float(x or 0)`` does not work for missing values coming out of pandas:
    NaN is truthy, so it survives the ``or`` and ends up in the feature
    matrix. This helper maps None, NaN, +/-inf and unparsable values to
    ``default`` instead.
    """
    number = pd.to_numeric(value, errors="coerce")
    try:
        number = float(number)
    except (TypeError, ValueError):
        return float(default)
    return number if np.isfinite(number) else float(default)


def _meta_text(value):
    """Return a user-metadata value as text, mapping None/NaN to ``""``.

    Without the explicit NaN check, ``str(nan or "")`` yields ``"nan"`` and a
    missing description/location/username would look like a 3-character one.
    """
    if isinstance(value, str):
        return value
    try:
        if value is None or bool(pd.isna(value)):
            return ""
    except (TypeError, ValueError):
        pass  # list-like value: pd.isna is element-wise, fall through to str()
    return str(value or "")


# Clean every post once and drop those that end up empty.
posts = posts_df.copy()
posts["text_clean"] = posts["text"].map(clean_text)
posts = posts[posts["text_clean"].str.len() > 0].copy()

user_lookup = users_df.set_index("id")

account_rows = []
for author_id, grp in posts.groupby("author_id"):
    # Posts whose author has no user record cannot be labelled; skip them.
    if author_id not in user_lookup.index:
        continue

    user = user_lookup.loc[author_id]

    g = grp.copy()
    if "created_at" in g.columns:
        g = g.sort_values("created_at")

    texts = g["text_clean"].tolist()
    if len(texts) < CONFIG["min_tweets_per_account"]:
        continue

    # Keep only the last N tweets in sorted order for the text encoder; the
    # aggregate statistics below are still computed over all of the posts.
    if len(texts) > CONFIG["max_tweets_per_account"]:
        texts = texts[-CONFIG["max_tweets_per_account"] :]

    raw_text = g["text"].fillna("")
    clean_texts = g["text_clean"]

    n_posts = len(g)
    lengths = clean_texts.str.len().to_numpy(dtype=np.float32)

    account_rows.append(
        {
            "author_id": author_id,
            "is_bot": int(user["is_bot"]),
            "texts": texts,
            "n_posts": float(n_posts),
            "log_n_posts": float(np.log1p(n_posts)),
            "mean_text_len": float(np.mean(lengths)) if len(lengths) else 0.0,
            "std_text_len": float(np.std(lengths)) if len(lengths) else 0.0,
            "url_rate": float(clean_texts.str.contains("<url>", regex=False).mean()),
            "mention_rate": float(clean_texts.str.contains("<user>", regex=False).mean()),
            "hashtag_rate": float(raw_text.str.contains("#").mean()),
            "tweet_count_meta": _meta_float(user.get("tweet_count", 0)),
            "z_score_meta": _meta_float(user.get("z_score", 0)),
            "description_len": float(len(_meta_text(user.get("description", "")))),
            "has_location": float(bool(_meta_text(user.get("location", "")).strip())),
            "username_len": float(len(_meta_text(user.get("username", "")))),
        }
    )

accounts_df = pd.DataFrame(account_rows)
# Shuffle once with the configured seed so row order carries no signal.
accounts_df = accounts_df.sample(frac=1.0, random_state=CONFIG["random_seed"]).reset_index(drop=True)

print(f"Accounts kept: {len(accounts_df):,}")
print(accounts_df["is_bot"].value_counts(dropna=False).rename("n_accounts"))


Accounts kept: 343
is_bot
0    288
1     55
Name: n_accounts, dtype: int64


## 4. Split, Tokenize, Build Dataloaders


In [None]:
# Numeric account-level columns, grouped by where they come from. The
# concatenation order below is the column order of the feature matrix.
_ACTIVITY_FEATURES = ["n_posts", "log_n_posts", "mean_text_len", "std_text_len"]
_CONTENT_RATE_FEATURES = ["url_rate", "mention_rate", "hashtag_rate"]
_PROFILE_FEATURES = [
    "tweet_count_meta",
    "z_score_meta",
    "description_len",
    "has_location",
    "username_len",
]

feature_cols = [*_ACTIVITY_FEATURES, *_CONTENT_RATE_FEATURES, *_PROFILE_FEATURES]


def split_accounts(df):
    """Split account rows into stratified train / validation / test index arrays.

    Sizes and the seed are read from ``CONFIG`` (``test_size``, ``val_size``,
    ``random_seed``). Both splits are stratified on ``df["is_bot"]``.

    Args:
        df: DataFrame with an ``is_bot`` column.

    Returns:
        ``(train_idx, val_idx, test_idx)`` arrays of positional row indices
        into ``df``.
    """
    seed = CONFIG["random_seed"]
    labels = df["is_bot"].to_numpy(dtype=np.int64)
    positions = np.arange(len(df))

    # Step 1: hold out the test portion.
    train_val_idx, test_idx = train_test_split(
        positions,
        test_size=CONFIG["test_size"],
        random_state=seed,
        stratify=labels,
    )

    # Step 2: val_size is expressed relative to the full dataset, so rescale
    # it to a fraction of what remains after the test split.
    val_fraction = CONFIG["val_size"] / (1.0 - CONFIG["test_size"])
    train_idx, val_idx = train_test_split(
        train_val_idx,
        test_size=val_fraction,
        random_state=seed,
        stratify=labels[train_val_idx],
    )
    return train_idx, val_idx, test_idx


def encode_accounts(text_lists, tokenizer, max_tweets, max_len):
    """Tokenise each account's tweets into fixed-size, zero-padded arrays.

    Args:
        text_lists: One list of tweet strings per account.
        tokenizer: Callable accepting a list of strings plus the keyword
            arguments used below and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` arrays.
        max_tweets: Number of tweet slots per account; when an account has
            more, only the last ``max_tweets`` are kept.
        max_len: Token length every tweet is padded / truncated to.

    Returns:
        ``(input_ids, attention, tweet_mask)`` with shapes
        ``(n, max_tweets, max_len)``, ``(n, max_tweets, max_len)`` (both
        int64) and ``(n, max_tweets)`` (float32, 1.0 where a slot is filled).
    """
    n_accounts = len(text_lists)
    token_shape = (n_accounts, max_tweets, max_len)
    input_ids = np.zeros(token_shape, dtype=np.int64)
    attention = np.zeros(token_shape, dtype=np.int64)
    tweet_mask = np.zeros((n_accounts, max_tweets), dtype=np.float32)

    for row, raw_texts in enumerate(text_lists):
        # Drop non-strings and blank strings.
        kept = [t for t in raw_texts if isinstance(t, str) and t.strip()]
        # An account with nothing left still gets one (empty) tweet slot.
        if not kept:
            kept = [""]
        if len(kept) > max_tweets:
            kept = kept[-max_tweets:]

        encoded = tokenizer(
            kept,
            padding="max_length",
            truncation=True,
            max_length=max_len,
            return_attention_mask=True,
            return_tensors="np",
        )

        n_filled = min(len(kept), max_tweets)
        input_ids[row, :n_filled] = encoded["input_ids"][:n_filled]
        attention[row, :n_filled] = encoded["attention_mask"][:n_filled]
        tweet_mask[row, :n_filled] = 1.0

    return input_ids, attention, tweet_mask


class AccountDataset(Dataset):
    """Map-style dataset yielding one account per item as a dict of tensors.

    The constructor copies the given arrays into tensors: token ids and
    attention mask as ``torch.long``; tweet mask, account features and
    labels as ``torch.float32``.
    """

    # Attribute names that double as the keys of each returned item.
    _FIELDS = ("input_ids", "attention_mask", "tweet_mask", "account_features", "labels")

    def __init__(self, input_ids, attention_mask, tweet_mask, account_features, labels):
        def as_long(array):
            return torch.tensor(array, dtype=torch.long)

        def as_float(array):
            return torch.tensor(array, dtype=torch.float32)

        self.input_ids = as_long(input_ids)
        self.attention_mask = as_long(attention_mask)
        self.tweet_mask = as_float(tweet_mask)
        self.account_features = as_float(account_features)
        self.labels = as_float(labels)

    def __len__(self):
        """Number of accounts, taken from the label tensor."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``idx``-th slice of every stored tensor, keyed by field name."""
        return {field: getattr(self, field)[idx] for field in self._FIELDS}


train_idx, val_idx, test_idx = split_accounts(accounts_df)

# Materialise each split as its own frame with a fresh 0..n-1 index.
train_acc, val_acc, test_acc = (
    accounts_df.iloc[part].reset_index(drop=True) for part in (train_idx, val_idx, test_idx)
)

print(f"Train/Val/Test accounts: {len(train_acc)} / {len(val_acc)} / {len(test_acc)}")


def _feature_matrix(acc_df):
    """Return the ``feature_cols`` columns of ``acc_df`` as a float32 array."""
    return acc_df[feature_cols].to_numpy(dtype=np.float32)


def _tokenize_accounts(acc_df):
    """Run ``encode_accounts`` on ``acc_df["texts"]`` with the configured sizes."""
    return encode_accounts(
        acc_df["texts"].tolist(),
        tokenizer,
        CONFIG["max_tweets_per_account"],
        CONFIG["max_length"],
    )


# Scale numeric account features: statistics are fitted on train only and
# then applied unchanged to validation and test.
scaler = StandardScaler()
X_train_feat = scaler.fit_transform(_feature_matrix(train_acc)).astype(np.float32)
X_val_feat = scaler.transform(_feature_matrix(val_acc)).astype(np.float32)
X_test_feat = scaler.transform(_feature_matrix(test_acc)).astype(np.float32)

y_train, y_val, y_test = (
    acc_df["is_bot"].to_numpy(dtype=np.int64) for acc_df in (train_acc, val_acc, test_acc)
)

tokenizer = AutoTokenizer.from_pretrained(CONFIG["model_name"])

X_train_ids, X_train_attn, X_train_tmask = _tokenize_accounts(train_acc)
X_val_ids, X_val_attn, X_val_tmask = _tokenize_accounts(val_acc)
X_test_ids, X_test_attn, X_test_tmask = _tokenize_accounts(test_acc)

train_ds = AccountDataset(X_train_ids, X_train_attn, X_train_tmask, X_train_feat, y_train)
val_ds = AccountDataset(X_val_ids, X_val_attn, X_val_tmask, X_val_feat, y_val)
test_ds = AccountDataset(X_test_ids, X_test_attn, X_test_tmask, X_test_feat, y_test)

# Only the training loader shuffles; evaluation loaders keep row order so
# predictions line up with the rows of val_acc / test_acc.
_batch_size = CONFIG["batch_size"]
train_loader = DataLoader(train_ds, batch_size=_batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=_batch_size, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=_batch_size, shuffle=False)


Train/Val/Test accounts: 222 / 52 / 69


## 5. Build Account-Level Model (PyTorch)


In [None]:
class AccountBotModel(nn.Module):
    """Hierarchical account classifier.

    Each tweet is encoded independently with a pretrained transformer; the
    first-token embedding of every tweet is pooled across the account in two
    ways (learned attention pooling and masked mean pooling), concatenated
    with the numeric account features, and passed through a small MLP that
    emits one logit per account.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        feat_dim: Number of numeric account features.
        dropout: Dropout probability used in the MLP head.
        trainable_encoder: When False, all encoder parameters are frozen.
    """

    def __init__(self, model_name, feat_dim, dropout=0.3, trainable_encoder=True):
        super().__init__()
        self.encoder = AutoModel.from_pretrained(model_name)
        if not trainable_encoder:
            for p in self.encoder.parameters():
                p.requires_grad = False

        hidden = self.encoder.config.hidden_size
        self.attn = nn.Linear(hidden, 1)  # scores each tweet for attention pooling
        # Head input = attention pool (hidden) + mean pool (hidden) + account features.
        self.norm = nn.LayerNorm(hidden * 2 + feat_dim)
        self.fc1 = nn.Linear(hidden * 2 + feat_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.out = nn.Linear(128, 1)
        self.drop = nn.Dropout(dropout)
        self.act = nn.GELU()

    def forward(self, input_ids, attention_mask, tweet_mask, account_features):
        """Compute one logit per account.

        Args:
            input_ids: ``(batch, n_tweets, seq_len)`` token ids.
            attention_mask: ``(batch, n_tweets, seq_len)`` token-level mask.
            tweet_mask: ``(batch, n_tweets)``; values > 0 mark real tweets.
            account_features: ``(batch, feat_dim)`` numeric features.

        Returns:
            Tensor of shape ``(batch,)`` with raw (pre-sigmoid) logits.
        """
        bsz, n_tweets, seq_len = input_ids.shape
        n_slots = bsz * n_tweets
        hidden = self.attn.in_features

        flat_ids = input_ids.reshape(n_slots, seq_len)
        flat_attn = attention_mask.reshape(n_slots, seq_len)
        valid = tweet_mask.reshape(n_slots) > 0

        # Encode only real tweets. Padding slots have an all-zero attention
        # mask; running them through the encoder wastes compute, and any
        # non-finite value produced there would leak into both pools because
        # 0 * NaN is NaN. Their embeddings are simply left at zero.
        if bool(valid.any()):
            out = self.encoder(input_ids=flat_ids[valid], attention_mask=flat_attn[valid])
            cls_valid = out.last_hidden_state[:, 0, :]
            cls_flat = cls_valid.new_zeros((n_slots, hidden))
            cls_flat[valid] = cls_valid
        else:
            cls_flat = account_features.new_zeros((n_slots, hidden))
        cls = cls_flat.view(bsz, n_tweets, hidden)

        # Attention pool across tweets. Padding slots get the most negative
        # value representable in the current dtype, which (unlike a literal
        # -1e9) also works under half precision.
        attn_logits = self.attn(cls).squeeze(-1)
        attn_logits = attn_logits.masked_fill(tweet_mask <= 0, torch.finfo(attn_logits.dtype).min)
        attn_w = torch.softmax(attn_logits, dim=1)
        attn_pool = torch.einsum("bt,bth->bh", attn_w, cls)

        # Mean pool across valid tweets; clamp avoids division by zero for an
        # account with no valid tweet.
        denom = tweet_mask.sum(dim=1, keepdim=True).clamp(min=1.0)
        mean_pool = (cls * tweet_mask.unsqueeze(-1)).sum(dim=1) / denom

        x = torch.cat([attn_pool, mean_pool, account_features], dim=-1)
        x = self.norm(x)
        x = self.drop(self.act(self.fc1(x)))
        x = self.drop(self.act(self.fc2(x)))
        logits = self.out(x).squeeze(-1)
        return logits


# Build the model from the shared configuration and move it to the chosen device.
_model_kwargs = dict(
    model_name=CONFIG["model_name"],
    feat_dim=len(feature_cols),
    dropout=CONFIG["dropout"],
    trainable_encoder=CONFIG["trainable_encoder"],
)
model = AccountBotModel(**_model_kwargs).to(DEVICE)

# Count only parameters that will receive gradients.
n_params = sum(param.numel() for param in model.parameters() if param.requires_grad)
print(f"Trainable params: {n_params:,}")


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mXLMRobertaModel LOAD REPORT[0m from: cardiffnlp/twitter-xlm-roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.decoder.bias            | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.decoder.weight          | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
pooler.dense.weight             | MISSING    | 
pooler.dense.bias               | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Trainable params: 278,477,082


## 6. Train Loop


In [None]:
def competition_score(y_true, y_pred):
    """Compute the competition metric and the confusion counts behind it.

    The score is ``4 * TP - FN - 2 * FP``; true negatives do not contribute.

    Args:
        y_true: Ground-truth labels (1 = bot, 0 = human).
        y_pred: Predicted labels with the same encoding.

    Returns:
        ``(score, tp, fn, fp, tn)`` as Python ints.
    """
    truth = np.asarray(y_true, dtype=np.int64)
    guess = np.asarray(y_pred, dtype=np.int64)

    is_bot, is_human = truth == 1, truth == 0
    flagged, cleared = guess == 1, guess == 0

    tp = int(np.count_nonzero(is_bot & flagged))
    fn = int(np.count_nonzero(is_bot & cleared))
    fp = int(np.count_nonzero(is_human & flagged))
    tn = int(np.count_nonzero(is_human & cleared))

    score = 4 * tp - fn - 2 * fp
    return score, tp, fn, fp, tn


def predict_probs(model, loader):
    """Run ``model`` over ``loader`` in eval mode and collect probabilities.

    Args:
        model: Module called as ``model(input_ids, attention_mask,
            tweet_mask, account_features)`` and returning logits.
        loader: Iterable of batches shaped like ``AccountDataset`` items.

    Returns:
        ``(probs, labels)`` NumPy arrays concatenated over all batches, where
        ``probs`` is the sigmoid of the model logits.
    """
    input_keys = ("input_ids", "attention_mask", "tweet_mask", "account_features")
    prob_chunks, label_chunks = [], []

    model.eval()
    with torch.no_grad():
        for batch in loader:
            # Only the model inputs need to live on the compute device.
            model_inputs = [batch[key].to(DEVICE) for key in input_keys]
            batch_labels = batch["labels"].cpu().numpy()

            batch_logits = model(*model_inputs)

            prob_chunks.append(torch.sigmoid(batch_logits).cpu().numpy())
            label_chunks.append(batch_labels)

    return np.concatenate(prob_chunks), np.concatenate(label_chunks)


def best_threshold_from_val(y_true, probs):
    """Grid-search the decision threshold that maximises the competition score.

    The grid is ``np.linspace(threshold_min, threshold_max, threshold_steps)``
    from ``CONFIG``. Ties on score are broken by fewer false positives, then
    fewer false negatives, then the lower threshold.

    Args:
        y_true: Ground-truth labels.
        probs: Predicted bot probabilities.

    Returns:
        ``(best_threshold, table)`` where ``table`` is the full grid as a
        DataFrame sorted best-first.
    """
    grid = np.linspace(CONFIG["threshold_min"], CONFIG["threshold_max"], CONFIG["threshold_steps"])

    # One record per candidate: the threshold followed by (score, tp, fn, fp, tn).
    records = [
        (thr, *competition_score(y_true, (probs >= thr).astype(np.int64)))
        for thr in grid
    ]

    table = pd.DataFrame(records, columns=["threshold", "score", "tp", "fn", "fp", "tn"])
    table = table.sort_values(
        by=["score", "fp", "fn", "threshold"],
        ascending=[False, True, True, True],
    )
    return float(table.iloc[0]["threshold"]), table


# Fine-tune the model with BCE-with-logits loss, selecting the checkpoint by
# the thresholded competition score on the validation split.

# Optional class weighting: weight positives by the negative/positive ratio
# of the training labels.
pos_weight = None
if CONFIG["use_weighted_loss"]:
    neg = float((y_train == 0).sum())
    pos = float((y_train == 1).sum())
    if pos > 0:
        pos_weight = torch.tensor([neg / pos], dtype=torch.float32, device=DEVICE)

criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG["learning_rate"], weight_decay=CONFIG["weight_decay"])

# Early-stopping bookkeeping.
best_state = None
best_val_score = -10**9  # sentinel lower than any score seen, so epoch 1 always improves
best_val_threshold = 0.5
patience = 2  # number of consecutive non-improving epochs tolerated
no_improve = 0

for epoch in range(1, CONFIG["epochs"] + 1):
    model.train()
    losses = []

    for batch in train_loader:
        ids = batch["input_ids"].to(DEVICE)
        attn = batch["attention_mask"].to(DEVICE)
        tmask = batch["tweet_mask"].to(DEVICE)
        feats = batch["account_features"].to(DEVICE)
        labels = batch["labels"].to(DEVICE)

        optimizer.zero_grad()
        logits = model(ids, attn, tmask, feats)
        loss = criterion(logits, labels)
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        losses.append(float(loss.item()))

    # Validate: pick this epoch's best threshold on the validation split and
    # score the predictions made at that threshold.
    val_prob, val_true = predict_probs(model, val_loader)
    val_thr, val_df = best_threshold_from_val(val_true, val_prob)
    val_pred = (val_prob >= val_thr).astype(np.int64)
    val_score, val_tp, val_fn, val_fp, val_tn = competition_score(val_true, val_pred)

    print(
        f"Epoch {epoch}: loss={np.mean(losses):.4f} "
        f"val_score={val_score} thr={val_thr:.3f} TP={val_tp} FN={val_fn} FP={val_fp}"
    )

    if val_score > best_val_score:
        best_val_score = int(val_score)
        best_val_threshold = float(val_thr)
        # Snapshot the weights on CPU; clone() detaches the copy from later updates.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            print("Early stopping on validation score")
            break

# Restore the best-scoring checkpoint before any downstream evaluation.
if best_state is not None:
    model.load_state_dict(best_state)

print(f"Best val score: {best_val_score} @ threshold={best_val_threshold:.3f}")


## 7. Test Evaluation


In [None]:
# Score both held-out splits with the model as it stands after training.
val_prob, y_val_eval = predict_probs(model, val_loader)
test_prob, y_test_eval = predict_probs(model, test_loader)

# The decision threshold is chosen on validation predictions only and then
# applied unchanged to the test split.
best_threshold, threshold_table = best_threshold_from_val(y_val_eval, val_prob)
y_test_pred = (test_prob >= best_threshold).astype(np.int64)

test_score, test_tp, test_fn, test_fp, test_tn = competition_score(y_test_eval, y_test_pred)
# Upper bound on the score: every true bot caught (+4 each) with no false positives.
max_possible = 4 * int((y_test_eval == 1).sum())

print(f"Selected threshold: {best_threshold:.3f}")
print(f"Test score: {test_score}/{max_possible}")
print(f"TP={test_tp} FN={test_fn} FP={test_fp} TN={test_tn}")
print(classification_report(y_test_eval, y_test_pred, digits=4))

# Per-account results; rows follow test_acc order because the test loader does not shuffle.
account_eval = pd.DataFrame(
    dict(
        author_id=test_acc["author_id"].to_numpy(),
        true_is_bot=y_test_eval.astype(np.int64),
        pred_prob=test_prob.astype(np.float32),
        pred_is_bot=y_test_pred.astype(np.int64),
    )
)


: 

## 8. Error Analysis


In [None]:
# Label every test account with its confusion-matrix cell.
_is_fn = (account_eval["true_is_bot"] == 1) & (account_eval["pred_is_bot"] == 0)
_is_fp = (account_eval["true_is_bot"] == 0) & (account_eval["pred_is_bot"] == 1)
_correct_label = np.where(account_eval["true_is_bot"] == 1, "TP", "TN")
account_eval["error_type"] = np.where(_is_fn, "FN", np.where(_is_fp, "FP", _correct_label))

print("Account error summary:")
print(account_eval["error_type"].value_counts().rename_axis("error_type").reset_index(name="n_accounts"))

fp_accounts = account_eval.loc[account_eval["error_type"] == "FP", "author_id"].tolist()
fn_accounts = account_eval.loc[account_eval["error_type"] == "FN", "author_id"].tolist()

print("\nFP account ids:", fp_accounts)
print("FN account ids:", fn_accounts)

# All cleaned tweets per author, for pulling example texts.
post_lookup = posts.groupby("author_id")["text_clean"].apply(list)

# Up to 3 tweets from each of the first 5 FP accounts, then the same for FN.
rows = [
    {"author_id": aid, "error_type": label, "text_clean": txt}
    for label, account_ids in (("FP", fp_accounts), ("FN", fn_accounts))
    for aid in account_ids[:5]
    for txt in post_lookup.get(aid, [])[:3]
]

hard_examples = pd.DataFrame(rows)
if len(hard_examples):
    print("\nHard-case sample tweets:")
    print(hard_examples.to_string(index=False))


: 

## 9. Optional Export


In [None]:
# Write the test-split predictions to disk: a CSV with one row per account
# and a plain-text file listing the ids predicted as bots.
ARTIFACTS_DIR = Path("artifacts").resolve()
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

submission = account_eval[["author_id", "pred_is_bot"]].copy()
submission_path = ARTIFACTS_DIR / "submission_fr_account_split.csv"
submission.to_csv(submission_path, index=False)

bots_txt = ARTIFACTS_DIR / "submission_fr_account_split.bots.txt"
# Use a dedicated name: `bot_ids` already holds the ground-truth bot set built
# while loading the data, and rebinding it here would silently change what
# that name means for any cell re-run afterwards.
predicted_bot_ids = submission.loc[submission["pred_is_bot"] == 1, "author_id"].astype(str).tolist()
# One id per line, newline-terminated; the file is empty when nothing is flagged.
bots_txt.write_text("".join(f"{bot_id}\n" for bot_id in predicted_bot_ids), encoding="utf-8")

print(f"Saved: {submission_path}")
print(f"Saved: {bots_txt}")


: 