### Preprocessing and Setup

In [2]:
import os, json, gzip, io, random
from pathlib import Path
from typing import List, Optional
import numpy as np
import pandas as pd
import html
from bs4 import BeautifulSoup

RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
random.seed(RANDOM_SEED)

DATA_DIRS = [
    r".\\bluesky\\dataset\\labeled_all_posts.json",
    r".\\truthsocial\\dataset\\labeled_all_posts.json",
]

LABELS = ["Left","Right","Neutral"]
LABEL2ID = {k:i for i,k in enumerate(LABELS)}
ID2LABEL = {v:k for k,v in LABEL2ID.items()}

def _open_text(path: Path):
    if str(path).endswith(".gz"):
        return io.TextIOWrapper(gzip.open(path, "rb"), encoding="utf-8", errors="ignore")
    return open(path, "r", encoding="utf-8", errors="ignore")

def _iter_paths(root: Path):
    ok = (".json",".jsonl",".ndjson",".json.gz",".jsonl.gz",".ndjson.gz")
    for p in root.rglob("*"):
        if any("".join(p.suffixes).lower().endswith(ext) for ext in ok):
            yield p

def _iter_records(path: Path):
    with _open_text(path) as f:
        head = f.read(2048)
        f.seek(0)
        first = head.lstrip()[:1]
        if first == "[":
            try:
                data = json.load(f)
                if isinstance(data, list):
                    for x in data:
                        if isinstance(x, dict): yield x
                elif isinstance(data, dict):
                    yield data
            except Exception:
                for line in f:
                    line=line.strip()
                    if not line: continue
                    try:
                        x=json.loads(line)
                        if isinstance(x, dict): yield x
                    except: pass
        else:
            for line in f:
                line=line.strip()
                if not line: continue
                try:
                    x=json.loads(line)
                    if isinstance(x, dict): yield x
                except: pass

def _clean_text(html_content):
    """
    Strips all HTML tags and unescapes entities from a string.
    """
    if not html_content or not isinstance(html_content, str):
        return None

    soup = BeautifulSoup(html_content, "html.parser")
    text_with_entities = soup.get_text(separator=" ", strip=True)
    clean_text = html.unescape(text_with_entities)
    return clean_text or None

def _extract_text(rec: dict) -> Optional[str]:
    """
    Heuristics across Bluesky/TruthSocial scrapes.
    Try your common fields here; add more if needed.
    """
    
    # Bluesky record style:
    if "record" in rec:
        record = rec.get("record") or {}
        text = record.get("text")
        if isinstance(text, str) and text.strip():
            return text.strip()
        
    # Truth social:
    if "content" in rec:
        cleaned = _clean_text(rec.get("content"))
        if cleaned:
            return cleaned
    
    return None

def load_dataframe(roots: List[str], min_len: int = 5, include_topic_prefix: bool = True) -> pd.DataFrame:
    rows = []
    for d in roots:
        root = Path(d)
        if not root.exists():
            print(f"[WARN] Missing: {root}")
            continue

        if root.is_file():
            paths = [root]
        else:
            paths = _iter_paths(root)

        for path in paths:
            for rec in _iter_records(path):
                meta = rec.get("__meta__", {}) or {}
                label = str(meta.get("llm_label") or "").strip().capitalize()
                if label not in LABELS:  # skip unknown labels
                    continue
                txt = _extract_text(rec)
                if not txt: 
                    continue
                
                topic = meta.get("topic") or ""
                if include_topic_prefix and topic:
                    txt = f"Topic: {topic}. Post: {txt}"

                platform = meta.get("platform") or ""
                matched_keyword = meta.get("matched_keyword") or ""
                
                author_did = ""

                if platform == "bluesky":
                    author = rec.get("author") or {}
                    did = author.get("did")
                    if did:
                        # prefix to avoid collisions across platforms
                        author_did = f"bsky:{did}"
                elif platform == "truthsocial":
                    account = rec.get("account") or {}
                    acc_id = account.get("id")
                    if acc_id:
                        author_did = f"truth:{acc_id}"
                
                post_id = rec.get("id") or rec.get("cid") or ""

                if len(txt) < min_len:
                    continue

                rows.append({
                    "text": txt,
                    "label": label,
                    "y": LABEL2ID[label],
                    "topic": topic,
                    "platform": platform,
                    "matched_keyword": matched_keyword,
                    "author_did": author_did,
                    "post_id": post_id,
                })
    df = pd.DataFrame(rows).drop_duplicates(subset=["text","post_id"], keep="first")
    print(df.shape, df["label"].value_counts(dropna=False))
    return df

def stratified_splits(df: pd.DataFrame, test_size=0.2, val_size=0.1, group_col: Optional[str]=None, seed=RANDOM_SEED):
    """
    Returns df_train, df_val, df_test.
    If group_col is provided (e.g., 'author_did' or 'platform'), we avoid leakage by grouping.
    """
    from sklearn.model_selection import train_test_split, GroupShuffleSplit

    if group_col and group_col in df.columns and df[group_col].astype(bool).any():
        gss = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
        idx = np.arange(len(df))
        train_idx, test_idx = next(gss.split(idx, groups=df[group_col]))
        df_train_full, df_test = df.iloc[train_idx], df.iloc[test_idx]

        # val from train_full
        gss2 = GroupShuffleSplit(n_splits=1, test_size=val_size/(1.0-test_size), random_state=seed)
        idx2 = np.arange(len(df_train_full))
        tr_idx, val_idx = next(gss2.split(idx2, groups=df_train_full[group_col].values))
        df_train, df_val = df_train_full.iloc[tr_idx], df_train_full.iloc[val_idx]
    else:
        from sklearn.model_selection import train_test_split
        df_train_full, df_test = train_test_split(
            df, test_size=test_size, random_state=seed, stratify=df["y"]
        )
        df_train, df_val = train_test_split(
            df_train_full, test_size=val_size/(1.0-test_size), random_state=seed, stratify=df_train_full["y"]
        )
    for name, part in [("train",df_train),("val",df_val),("test",df_test)]:
        print(name, part.shape, part["label"].value_counts())
    return df_train.reset_index(drop=True), df_val.reset_index(drop=True), df_test.reset_index(drop=True)

In [3]:
# load and split dataset
df = load_dataframe(DATA_DIRS, include_topic_prefix=True)
df_train, df_val, df_test = stratified_splits(df, group_col="author_did")

(131808, 8) label
Neutral    53778
Left       44332
Right      33698
Name: count, dtype: int64
train (93014, 8) label
Neutral    38107
Left       30928
Right      23979
Name: count, dtype: int64
val (13240, 8) label
Neutral    5448
Left       4407
Right      3385
Name: count, dtype: int64
test (25554, 8) label
Neutral    10223
Left        8997
Right       6334
Name: count, dtype: int64


### Naive Bayes

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
import joblib

# pipeline
pipe = Pipeline([
    ("tfidf", TfidfVectorizer(
        lowercase=True,
        ngram_range=(1,2),
        min_df=5,
        max_df=0.7,
        strip_accents="unicode",
        sublinear_tf=True,
        token_pattern=r"[A-Za-z][A-Za-z0-9_\-']+"
    )),
    ("nb", ComplementNB(alpha=0.5))
])

param_grid = {"nb__alpha": [0.1, 0.3, 0.5, 1.0]}
gs = GridSearchCV(pipe, param_grid, scoring="f1_macro", cv=3, n_jobs=-1, verbose=1)
gs.fit(df_train["text"], df_train["label"])

print("Best params:", gs.best_params_)
print("Best Mean Cross-Validation Score on train:", gs.best_score_)

# evaluate on test
best = gs.best_estimator_
y_pred = best.predict(df_test["text"])
print(classification_report(df_test["label"], y_pred, digits=3))
print(confusion_matrix(df_test["label"], y_pred, labels=LABELS))

# saving artifacts
os.makedirs("models_nb", exist_ok=True)
joblib.dump(best, "models_nb/nb_tfidf.joblib")
with open("models_nb/labels.txt","w") as f: f.write("\n".join(LABELS))

(131808, 8) label
Neutral    53778
Left       44332
Right      33698
Name: count, dtype: int64
train (93014, 8) label
Neutral    38107
Left       30928
Right      23979
Name: count, dtype: int64
val (13240, 8) label
Neutral    5448
Left       4407
Right      3385
Name: count, dtype: int64
test (25554, 8) label
Neutral    10223
Left        8997
Right       6334
Name: count, dtype: int64
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Best params: {'nb__alpha': 0.1}
Best Mean Cross-Validation Score on train: 0.3756423319278246
              precision    recall  f1-score   support

        Left      0.592     0.765     0.667      8997
     Neutral      0.715     0.583     0.643     10223
       Right      0.671     0.593     0.630      6334

    accuracy                          0.650     25554
   macro avg      0.660     0.647     0.646     25554
weighted avg      0.661     0.650     0.648     25554

[[6881  675 1441]
 [1648 3755  931]
 [3097 1164 5962]]


### BERT

In [4]:
import torch, numpy as np
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                        DataCollatorWithPadding, TrainingArguments, Trainer)
from sklearn.metrics import accuracy_score, f1_score

device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256

# build HF datasets
def to_hf(df):
    # keep numeric label and rename it to 'labels'
    tmp = df[["text", "y", "topic", "platform"]].copy()
    tmp = tmp.rename(columns={"y": "labels"})
    return Dataset.from_pandas(tmp, preserve_index=False)

hf = DatasetDict({
    "train": to_hf(df_train),
    "validation": to_hf(df_val),
    "test": to_hf(df_test),
})


tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize(batch):
    return tok(batch["text"], truncation=True, max_length=MAX_LEN)

hf_tok = hf.map(
    tokenize,
    batched=True,
    remove_columns=["text", "topic", "platform"],  # 'labels' is kept
)
data_collator = DataCollatorWithPadding(tokenizer=tok)

def compute_metrics(pred):
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    f1_macro = f1_score(labels, preds, average="macro")

    out = {"accuracy": acc, "f1": f1_macro}

    # per-class F1
    for i, name in ID2LABEL.items():
        out[f"f1_{name}"] = f1_score(
            (labels == i).astype(int),
            (preds == i).astype(int),
            average="binary",
            zero_division=0,
        )
    return out

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    id2label=ID2LABEL,
    label2id=LABEL2ID
).to(device)

args = TrainingArguments(
    output_dir="models_distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=200,
    save_steps=2000,
    seed=RANDOM_SEED,
    dataloader_num_workers=4,
)

# handle imbalance with class weights
use_class_weights = True
class_counts = df_train["y"].value_counts().reindex(range(len(LABELS)), fill_value=0).values
weights = torch.tensor(len(df_train)/np.maximum(class_counts,1), dtype=torch.float32, device=device)
weights = weights / weights.sum() * len(LABELS)  # normalize around 1

class WeightedTrainer(Trainer):
    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # labels
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        if self.class_weights is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()

        loss = loss_fct(
            logits.view(-1, model.config.num_labels),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=hf_tok["train"],
    eval_dataset=hf_tok["validation"],
    tokenizer=tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=weights,
)

trainer.train()
print("Val metrics:", trainer.evaluate(hf_tok["validation"]))
print("Test metrics:", trainer.evaluate(hf_tok["test"]))

# saving model
trainer.save_model("models_distilbert/best")
tok.save_pretrained("models_distilbert/best")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 93014/93014 [00:05<00:00, 16972.69 examples/s]
Map: 100%|██████████| 13240/13240 [00:00<00:00, 21028.37 examples/s]
Map: 100%|██████████| 25554/25554 [00:01<00:00, 17054.36 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  super().__init__(*args, **kwargs)


Step,Training Loss
200,0.9566
400,0.8306
600,0.7677
800,0.7558
1000,0.7147
1200,0.703
1400,0.6971
1600,0.6451
1800,0.6465
2000,0.6426


Val metrics: {'eval_loss': 0.6404696106910706, 'eval_accuracy': 0.7536253776435046, 'eval_f1': 0.751164508545172, 'eval_f1_Left': 0.74834728955487, 'eval_f1_Right': 0.7342911043389929, 'eval_f1_Neutral': 0.7708551317416532, 'eval_runtime': 25.664, 'eval_samples_per_second': 515.898, 'eval_steps_per_second': 8.066, 'epoch': 3.0}
Test metrics: {'eval_loss': 0.6434869170188904, 'eval_accuracy': 0.7495108397902481, 'eval_f1': 0.7475576787316847, 'eval_f1_Left': 0.7557268483335142, 'eval_f1_Right': 0.731244211176289, 'eval_f1_Neutral': 0.7557019766852509, 'eval_runtime': 29.6035, 'eval_samples_per_second': 863.21, 'eval_steps_per_second': 13.512, 'epoch': 3.0}


('models_distilbert/best\\tokenizer_config.json',
 'models_distilbert/best\\special_tokens_map.json',
 'models_distilbert/best\\vocab.txt',
 'models_distilbert/best\\added_tokens.json',
 'models_distilbert/best\\tokenizer.json')