### Preprocessing and Setup

In [14]:
import os, json, gzip, io, random
from pathlib import Path
from typing import List, Optional
import numpy as np
import pandas as pd
import html
from bs4 import BeautifulSoup

# Single seed shared by NumPy's Generator and the stdlib `random` module.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Inputs handed to load_dataframe(). Despite the name, the entries here are
# paths to individual JSON files.
DATA_DIRS = [
    "./bluesky/dataset/bluesky_with_noisy_labels.json",
    "./truthsocial/dataset/truthsocial_with_noisy_labels.json",
]

# Two label vocabularies: the 3-way LLM labels and the 2-way noisy labels.
LABELS_LLM = ["Left","Right","Neutral"]
LABELS_NOISY = ["Left", "Right"]
# name -> integer id (position in the list above) and the inverse lookups.
LABEL2ID_LLM = {k:i for i,k in enumerate(LABELS_LLM)}
ID2LABEL_LLM = {v:k for k,v in LABEL2ID_LLM.items()}
LABEL2ID_NOISY = {k:i for i,k in enumerate(LABELS_NOISY)}
ID2LABEL_NOISY = {v:k for k,v in LABEL2ID_NOISY.items()}

def _open_text(path: Path):
    """Open *path* for reading as UTF-8 text.

    Paths ending in ``.gz`` are decompressed on the fly. Bytes that cannot be
    decoded are dropped (``errors="ignore"``) instead of raising.
    """
    compressed = str(path).endswith(".gz")
    if not compressed:
        return open(path, "r", encoding="utf-8", errors="ignore")
    binary_stream = gzip.open(path, "rb")
    return io.TextIOWrapper(binary_stream, encoding="utf-8", errors="ignore")

def _iter_paths(root: Path):
    """Yield every JSON-like file found recursively under *root*.

    A file qualifies when its combined suffixes end (case-insensitively) with
    one of ``.json``, ``.jsonl``, ``.ndjson`` or their ``.gz`` variants.
    Results are yielded in whatever order ``Path.rglob`` produces them.
    """
    ok = (".json", ".jsonl", ".ndjson", ".json.gz", ".jsonl.gz", ".ndjson.gz")
    for p in root.rglob("*"):
        # rglob("*") also returns directories; a directory that happens to be
        # named like "dump.json" must not be handed to the file reader.
        if p.is_file() and "".join(p.suffixes).lower().endswith(ok):
            yield p

def _iter_records(path: Path):
    """Yield the dict records stored in a JSON-array or JSON-lines file.

    The first non-blank character decides the format: ``[`` means the whole
    file is parsed as one JSON array, anything else is read line by line
    (one JSON value per line). If the array parse fails, the file is re-read
    line by line as a fallback. Non-dict values and unparsable lines are
    skipped silently.
    """
    # Errors treated as "this text is not usable JSON"; json.JSONDecodeError
    # is a ValueError subclass.
    parse_errors = (ValueError, RecursionError)

    with _open_text(path) as f:
        head = f.read(2048)
        f.seek(0)
        if head.lstrip()[:1] == "[":
            try:
                data = json.load(f)
            except parse_errors:
                # json.load consumed the stream up to EOF; rewind so the
                # line-wise fallback below actually sees the content.
                f.seek(0)
            else:
                # Text starting with "[" parses to a list.
                for x in data:
                    if isinstance(x, dict):
                        yield x
                return

        for line in f:
            line = line.strip()
            if not line:
                continue
            # Keep the yield outside the try block so that closing the
            # generator (GeneratorExit raised at the yield) is never swallowed.
            try:
                x = json.loads(line)
            except parse_errors:
                continue
            if isinstance(x, dict):
                yield x

def _clean_text(html_content):
    """
    Reduce an HTML fragment to plain text.

    Tags are removed with BeautifulSoup (text nodes joined by single spaces
    and stripped), then HTML entities are unescaped. Returns None for empty
    or non-string input and when no text remains.
    """
    if not (html_content and isinstance(html_content, str)):
        return None

    parsed = BeautifulSoup(html_content, "html.parser")
    plain = html.unescape(parsed.get_text(separator=" ", strip=True))
    return plain if plain else None

def _extract_text(rec: dict) -> Optional[str]:
    """
    Pull the post text out of a scraped record.

    Two layouts are tried in order:
    1. Bluesky style: ``rec["record"]["text"]``, returned stripped when it is
       a non-blank string.
    2. Truth Social style: ``rec["content"]``, passed through ``_clean_text``.

    Returns None when neither yields text. Add further layouts here if needed.
    """
    if "record" in rec:
        body = (rec["record"] or {}).get("text")
        if isinstance(body, str):
            stripped = body.strip()
            if stripped:
                return stripped

    if "content" not in rec:
        return None
    return _clean_text(rec["content"]) or None

def load_dataframe(
    roots: List[str],
    min_len: int = 5,
    include_topic_prefix: bool = True,
) -> pd.DataFrame:
    """Load labelled posts from JSON files or directories into one DataFrame.

    A record is kept only if it carries both a valid LLM label
    (``__meta__.llm_label``) and a valid noisy label (top-level
    ``noisy_label``). Label strings are matched case-insensitively and
    normalised to the spellings used in ``LABELS_LLM`` / ``LABELS_NOISY``.
    Records with no extractable text, or text shorter than ``min_len``
    characters, are skipped. Progress and per-root statistics are printed.

    An exception raised while reading a file is printed with its traceback
    and processing continues with the next file.

    Args:
        roots: Paths to JSON files, or directories that are searched
            recursively. Paths that do not exist are reported and skipped.
        min_len: Minimum length of the extracted text, measured before the
            optional topic prefix is added.
        include_topic_prefix: When true and the record has a topic, the text
            becomes ``"Topic: <topic>. Post: <text>"``.

    Returns:
        A DataFrame with the columns ``text``, ``llm_label``, ``noisy_label``,
        ``topic``, ``platform``, ``matched_keyword``, ``author_did`` and
        ``post_id``, de-duplicated on ``(text, post_id)``. The columns are
        present even when no record was kept.
    """
    import traceback

    # Accepted (lower-cased) label values and their canonical spelling.
    llm_canonical = {"left": "Left", "right": "Right", "neutral": "Neutral"}
    noisy_canonical = {"left": "Left", "right": "Right"}
    # Fixed schema, so an empty result still has the columns callers index.
    columns = [
        "text", "llm_label", "noisy_label", "topic",
        "platform", "matched_keyword", "author_did", "post_id",
    ]

    rows = []

    # Per-root counters, keyed by the root path as given.
    file_stats = {}

    for d in roots:
        root = Path(d)
        if not root.exists():
            print(f"[ERROR] File/Directory does not exist: {root}")
            continue

        print(f"\n{'='*60}")
        print(f"Processing: {root}")
        print(f"Exists: {root.exists()}")
        print(f"Is file: {root.is_file()}")
        print(f"File size: {root.stat().st_size} bytes")

        # Fresh counters for this root.
        stats = {
            "total_records": 0,
            "valid_records": 0,
            "skipped_no_label": 0,
            "skipped_no_text": 0,
            "platform_count": {},
            "noisy_label_dist": {"left": 0, "right": 0, "other": 0, "none": 0},
            "llm_label_dist": {"left": 0, "right": 0, "neutral": 0, "other": 0},
        }
        file_stats[str(root)] = stats

        paths = [root] if root.is_file() else list(_iter_paths(root))
        print(f"Files to process: {len(paths)}")

        for path in paths:
            print(f"  Processing file: {path}")
            record_count = 0

            try:
                for rec in _iter_records(path):
                    record_count += 1
                    stats["total_records"] += 1

                    # A missing or null "__meta__" is treated as empty.
                    meta = rec.get("__meta__", {}) or {}

                    # Show the structure of the first two records of each file.
                    if record_count <= 2:
                        print(f"\n    Record {record_count} structure:")
                        # Only the first 10 keys are shown.
                        print(f"      Top-level keys: {list(rec.keys())[:10]}")
                        if "__meta__" in rec:
                            print(f"      __meta__ keys: {list(meta.keys())}")
                        print(f"      noisy_label: {rec.get('noisy_label')}")
                        if "__meta__" in rec:
                            print(f"      platform: {meta.get('platform')}")
                            print(f"      llm_label: {meta.get('llm_label')}")

                    platform = (meta.get("platform") or "").lower()

                    # Count records per platform.
                    if platform:
                        stats["platform_count"][platform] = \
                            stats["platform_count"].get(platform, 0) + 1

                    # ====== Normalise the LLM label ======
                    raw_llm = meta.get("llm_label")
                    llm_label = None
                    if isinstance(raw_llm, str):
                        s = raw_llm.strip().lower()
                        if s in llm_canonical:
                            stats["llm_label_dist"][s] += 1
                            llm_label = llm_canonical[s]
                        else:
                            stats["llm_label_dist"]["other"] += 1

                    # ====== Normalise the noisy label (and count it) ======
                    raw_noisy = rec.get("noisy_label")
                    noisy_label = None
                    if raw_noisy is None:
                        stats["noisy_label_dist"]["none"] += 1
                    elif isinstance(raw_noisy, str):
                        s = raw_noisy.strip().lower()
                        noisy_label = noisy_canonical.get(s)
                        if noisy_label is not None:
                            stats["noisy_label_dist"][s] += 1
                        else:
                            stats["noisy_label_dist"]["other"] += 1

                    # Both labels must be valid for the record to be kept.
                    if llm_label not in LABELS_LLM or noisy_label not in LABELS_NOISY:
                        stats["skipped_no_label"] += 1
                        continue

                    # ====== Text extraction and cleaning ======
                    txt = _extract_text(rec)
                    if not txt or len(txt) < min_len:
                        stats["skipped_no_text"] += 1
                        continue

                    stats["valid_records"] += 1

                    topic = meta.get("topic") or ""
                    if include_topic_prefix and topic:
                        txt = f"Topic: {topic}. Post: {txt}"

                    matched_keyword = meta.get("matched_keyword") or ""

                    # Platform-prefixed author id; empty when unavailable.
                    author_did = ""
                    if platform == "bluesky":
                        author = rec.get("author") or {}
                        did = author.get("did")
                        if did:
                            author_did = f"bsky:{did}"
                    elif platform == "truthsocial":
                        account = rec.get("account") or {}
                        acc_id = account.get("id")
                        if acc_id:
                            author_did = f"truth:{acc_id}"

                    post_id = rec.get("id") or rec.get("cid") or ""

                    rows.append({
                        "text": txt,
                        "llm_label": llm_label,
                        "noisy_label": noisy_label,
                        "topic": topic,
                        "platform": platform,
                        "matched_keyword": matched_keyword,
                        "author_did": author_did,
                        "post_id": post_id,
                    })

            except Exception as e:
                # Best effort: report the failure and move on to the next file.
                print(f"  [ERROR] Failed to process file {path}: {e}")
                traceback.print_exc()

            print(f"    Processed {record_count} records from this file")

    # Print the detailed per-root statistics.
    print("\n" + "="*60)
    print("=== Detailed File Statistics ===")
    print("="*60)

    for file_key, summary in file_stats.items():
        print(f"\nFile: {file_key}")
        print(f"  Total records: {summary['total_records']}")
        print(f"  Valid records kept: {summary['valid_records']}")
        print(f"  Skipped (no valid labels): {summary['skipped_no_label']}")
        print(f"  Skipped (no text): {summary['skipped_no_text']}")
        print(f"  Platforms: {summary['platform_count']}")
        print(f"  Noisy label distribution: {summary['noisy_label_dist']}")
        print(f"  LLM label distribution: {summary['llm_label_dist']}")

    # Passing `columns` keeps the schema intact when `rows` is empty; without
    # it the empty frame has no columns and callers indexing
    # df["llm_label"] fail with KeyError.
    df = pd.DataFrame(rows, columns=columns).drop_duplicates(
        subset=["text", "post_id"], keep="first"
    )

    print("\n" + "="*60)
    print("=== Final DataFrame Summary ===")
    print("="*60)
    print(f"Total rows after deduplication: {len(df)}")

    if len(df) > 0:
        print("\n=== Platform vs Noisy Label ===")
        print(pd.crosstab(df["platform"], df["noisy_label"], margins=True))

        print("\n=== Platform Distribution ===")
        print(df["platform"].value_counts())

    return df

def stratified_splits(df: pd.DataFrame, test_size=0.2, val_size=0.1, group_col: Optional[str]=None, seed=RANDOM_SEED):
    """
    Split *df* into train / validation / test parts and print their label counts.

    ``test_size`` and ``val_size`` are fractions of the whole frame. Two modes:

    - ``group_col`` names a column that has at least one truthy value
      (e.g. 'author_did' or 'platform'): rows sharing a group value stay in
      the same part (GroupShuffleSplit), which avoids leakage across parts.
      This mode does not stratify.
    - otherwise: a plain split stratified on the column ``"y"``.

    Returns (df_train, df_val, df_test), each with a fresh 0..n-1 index.
    """
    from sklearn.model_selection import train_test_split, GroupShuffleSplit

    split_by_group = group_col and group_col in df.columns and df[group_col].astype(bool).any()

    if split_by_group:
        outer = GroupShuffleSplit(n_splits=1, test_size=test_size, random_state=seed)
        rest_pos, test_pos = next(outer.split(np.arange(len(df)), groups=df[group_col]))
        df_train_full = df.iloc[rest_pos]
        df_test = df.iloc[test_pos]

        # Carve validation out of the remainder; val_size is relative to the
        # whole frame, so rescale it to the part that is left.
        inner = GroupShuffleSplit(n_splits=1, test_size=val_size/(1.0-test_size), random_state=seed)
        tr_pos, val_pos = next(
            inner.split(np.arange(len(df_train_full)), groups=df_train_full[group_col].values)
        )
        df_train = df_train_full.iloc[tr_pos]
        df_val = df_train_full.iloc[val_pos]
    else:
        df_train_full, df_test = train_test_split(
            df,
            test_size=test_size,
            random_state=seed,
            stratify=df["y"],
        )
        df_train, df_val = train_test_split(
            df_train_full,
            test_size=val_size/(1.0-test_size),
            random_state=seed,
            stratify=df_train_full["y"],
        )

    print("\n=== Split Statistics ===")
    parts = {"train": df_train, "val": df_val, "test": df_test}
    for name, part in parts.items():
        print(f"\n{name.upper()}: shape={part.shape}")
        print(f"  LLM labels: {part['llm_label'].value_counts().to_dict()}")
        print(f"  Noisy labels: {part['noisy_label'].value_counts().to_dict()}")

    return (
        df_train.reset_index(drop=True),
        df_val.reset_index(drop=True),
        df_test.reset_index(drop=True),
    )

    


In [24]:
# 1. Load every record from the configured input files.
df_all = load_dataframe(DATA_DIRS)

# 2. Defensive re-filter (load_dataframe already skips records without valid labels).
df_all = df_all[
    df_all["llm_label"].isin(LABELS_LLM) &
    df_all["noisy_label"].isin(LABELS_NOISY)
].reset_index(drop=True)

# 3. Use the LLM label as the stratification target ("y" is read by stratified_splits).
df_all["y"] = df_all["llm_label"].map(LABEL2ID_LLM)

df_train, df_val, df_test = stratified_splits(
    df_all,
    test_size=0.2,
    val_size=0.1,
    group_col=None,   # None = stratified split; pass e.g. "author_did" to keep each author inside one split
    seed=RANDOM_SEED,
)

print("\n" + "="*50)
print("=== Final Summary ===")
print("="*50)
print(f"Train: {len(df_train)}")
print(f"Val: {len(df_val)}")
print(f"Test: {len(df_test)}")


Processing: bluesky/dataset/bluesky_with_noisy_labels.json
Exists: True
Is file: True
File size: 431365692 bytes
Files to process: 1
  Processing file: bluesky/dataset/bluesky_with_noisy_labels.json

    Record 1 structure:
      Top-level keys: ['author', 'cid', 'indexed_at', 'record', 'uri', 'embed', 'labels', 'like_count', 'quote_count', 'reply_count']
      __meta__ keys: ['platform', 'topic', 'framing', 'matched_keyword', 'llm_label']
      noisy_label: left
      platform: bluesky
      llm_label: Left

    Record 2 structure:
      Top-level keys: ['author', 'cid', 'indexed_at', 'record', 'uri', 'embed', 'labels', 'like_count', 'quote_count', 'reply_count']
      __meta__ keys: ['platform', 'topic', 'framing', 'matched_keyword', 'llm_label']
      noisy_label: left
      platform: bluesky
      llm_label: Neutral
    Processed 94264 records from this file

Processing: truthsocial/dataset/truthsocial_with_noisy_labels.json
Exists: True
Is file: True
File size: 298493511 bytes
Fi

### Naive Bayes

In [16]:
import os
import json
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
import joblib

# 假设你已经运行了之前的数据加载代码，得到了 df_train, df_val, df_test

def train_and_evaluate_nb(df_train, df_val, df_test, label_col, labels_list, model_name):
    """
    Train and evaluate a TF-IDF + Complement Naive Bayes text classifier.

    Args:
    - df_train: training split (needs a "text" column and ``label_col``)
    - df_val: validation split
    - df_test: test split
    - label_col: name of the label column ('llm_label' or 'noisy_label')
    - labels_list: label names (LABELS_LLM or LABELS_NOISY); fixes the order
      used in the classification report and the confusion matrix
    - model_name: sub-directory of ``models_nb/`` that receives the artefacts

    Returns:
        (best_model, results): the grid search's ``best_estimator_`` and a
        dict with the chosen parameters, scores and test-set predictions
        (the same dict that is written to ``results.json``).
    """
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"Training {model_name} with {label_col}")
    print(f"{rule}")

    # Features are the post texts; targets come from the requested column.
    X_train, y_train = df_train["text"], df_train[label_col]
    X_val, y_val = df_val["text"], df_val[label_col]
    X_test, y_test = df_test["text"], df_test[label_col]

    # Show the class balance of the training targets.
    print(f"\nTraining label distribution:")
    print(y_train.value_counts())

    # Pipeline: word uni/bi-gram TF-IDF followed by Complement Naive Bayes.
    vectorizer = TfidfVectorizer(
        lowercase=True,
        ngram_range=(1, 2),
        min_df=5,
        max_df=0.7,
        strip_accents="unicode",
        sublinear_tf=True,
        token_pattern=r"[A-Za-z][A-Za-z0-9_\-']+",
    )
    pipe = Pipeline([("tfidf", vectorizer), ("nb", ComplementNB())])

    # Tune the smoothing parameter with 3-fold CV on macro-F1.
    gs = GridSearchCV(
        estimator=pipe,
        param_grid={"nb__alpha": [0.1, 0.3, 0.5, 1.0]},
        scoring="f1_macro",
        cv=3,
        n_jobs=-1,
        verbose=1,
    )

    print(f"\nPerforming grid search...")
    gs.fit(X_train, y_train)

    print(f"Best params: {gs.best_params_}")
    print(f"Best Mean Cross-Validation F1 Score on train: {gs.best_score_:.4f}")

    best_model = gs.best_estimator_

    # Validation-set accuracy.
    val_acc = accuracy_score(y_val, best_model.predict(X_val))
    print(f"\nValidation Accuracy: {val_acc:.4f}")

    # Test-set accuracy, per-class report and confusion matrix.
    y_test_pred = best_model.predict(X_test)
    test_acc = accuracy_score(y_test, y_test_pred)

    print(f"\n--- Test Set Results ---")
    print(f"Test Accuracy: {test_acc:.4f}")
    print(f"\nClassification Report:")
    print(classification_report(y_test, y_test_pred, labels=labels_list, digits=3))

    print(f"\nConfusion Matrix:")
    cm = confusion_matrix(y_test, y_test_pred, labels=labels_list)
    print(f"Labels: {labels_list}")
    print(cm)

    # Persist the fitted pipeline.
    out_dir = f"models_nb/{model_name}"
    os.makedirs(out_dir, exist_ok=True)
    model_path = f"{out_dir}/nb_tfidf.joblib"
    joblib.dump(best_model, model_path)

    # Persist the label names, one per line.
    labels_path = f"{out_dir}/labels.txt"
    with open(labels_path, "w") as f:
        f.write("\n".join(labels_list))

    # Persist scores and raw test predictions as JSON.
    results = {
        "model_name": model_name,
        "label_col": label_col,
        "best_params": gs.best_params_,
        "best_cv_score": float(gs.best_score_),
        "val_accuracy": float(val_acc),
        "test_accuracy": float(test_acc),
        "test_predictions": {
            "y_true": y_test.tolist(),
            "y_pred": y_test_pred.tolist(),
        },
    }

    results_path = f"{out_dir}/results.json"
    with open(results_path, "w") as f:
        json.dump(results, f, indent=2)

    print(f"\nModel saved to: {model_path}")
    print(f"Labels saved to: {labels_path}")
    print(f"Results saved to: {results_path}")

    return best_model, results

def compare_models(results_llm, results_noisy):
    """Print a side-by-side summary of the two models' result dicts.

    Both arguments are result dicts as returned by ``train_and_evaluate_nb``.
    A prediction agreement rate is printed in addition, but only when both
    label sets are binary and contain the same labels.
    """
    print(f"\n{'='*60}")
    print("Model Comparison")
    print(f"{'='*60}")

    pair = (results_llm, results_noisy)
    comparison = pd.DataFrame({
        "Model": ["LLM Labels", "Noisy Labels"],
        "Best Alpha": [r["best_params"]["nb__alpha"] for r in pair],
        "CV F1 Score": [r["best_cv_score"] for r in pair],
        "Val Accuracy": [r["val_accuracy"] for r in pair],
        "Test Accuracy": [r["test_accuracy"] for r in pair],
    })

    print(comparison.to_string(index=False))

    # Agreement between the two models' predictions only makes sense when
    # both tasks are binary.
    both_binary = len(LABELS_LLM) == 2 and len(LABELS_NOISY) == 2
    if not both_binary:
        return

    y_pred_llm = results_llm["test_predictions"]["y_pred"]
    y_pred_noisy = results_noisy["test_predictions"]["y_pred"]

    # Predictions are directly comparable only if the label sets coincide.
    if set(LABELS_LLM) != set(LABELS_NOISY):
        return

    agreement = sum(a == b for a, b in zip(y_pred_llm, y_pred_noisy))
    agreement_rate = agreement / len(y_pred_llm)
    print(f"\nPrediction Agreement Rate: {agreement_rate:.4f}")

# 主程序
# Script entry point: load data, split once, train both NB models, compare them.
if __name__ == "__main__":
    # Relies on load_dataframe, stratified_splits and the label constants
    # defined in the preprocessing cell above.

    # Load the data (repeated here in case it has not been loaded yet).
    df_all = load_dataframe(DATA_DIRS)

    # Keep only rows whose two labels are both valid.
    df_all = df_all[
        df_all["llm_label"].isin(LABELS_LLM) &
        df_all["noisy_label"].isin(LABELS_NOISY)
    ].reset_index(drop=True)

    # Stratify on the LLM label.
    df_all["y"] = df_all["llm_label"].map(LABEL2ID_LLM)

    # Split once so that both models share the same train/val/test sets.
    df_train, df_val, df_test = stratified_splits(
        df_all,
        test_size=0.2,
        val_size=0.1,
        group_col=None,
        seed=RANDOM_SEED,
    )

    print(f"\nDataset sizes:")
    print(f"Train: {len(df_train)}")
    print(f"Val: {len(df_val)}")
    print(f"Test: {len(df_test)}")

    # Train the model on the LLM labels.
    model_llm, results_llm = train_and_evaluate_nb(
        df_train, df_val, df_test,
        label_col="llm_label",
        labels_list=LABELS_LLM,
        model_name="llm_labels"
    )

    # Train the model on the noisy labels.
    model_noisy, results_noisy = train_and_evaluate_nb(
        df_train, df_val, df_test,
        label_col="noisy_label",
        labels_list=LABELS_NOISY,
        model_name="noisy_labels"
    )

    # Compare the two models.
    compare_models(results_llm, results_noisy)

    # Extra analysis: how the noisy labels are distributed.
    print(f"\n{'='*60}")
    print("Noisy Label Analysis")
    print(f"{'='*60}")

    # Noisy-label distribution within each LLM label (test set only).
    print("\nCrosstab: LLM Label vs Noisy Label (in test set):")
    print(pd.crosstab(df_test["llm_label"], df_test["noisy_label"], margins=True))

    # Accuracy of the noisy-label model on each LLM-label subset of the test set.
    for llm_label in LABELS_LLM:
        subset = df_test[df_test["llm_label"] == llm_label]
        if len(subset) > 0:
            # Score the noisy-label model's predictions against the noisy labels.
            y_true_subset = subset["noisy_label"]
            X_subset = subset["text"]
            y_pred_subset = model_noisy.predict(X_subset)
            acc = accuracy_score(y_true_subset, y_pred_subset)
            print(f"\nNoisy model accuracy on LLM={llm_label} subset: {acc:.4f} (n={len(subset)})")


Processing: bluesky/dataset/bluesky_with_noisy_labels.json
Exists: True
Is file: True
File size: 431365692 bytes
Files to process: 1
  Processing file: bluesky/dataset/bluesky_with_noisy_labels.json

    Record 1 structure:
      Top-level keys: ['author', 'cid', 'indexed_at', 'record', 'uri', 'embed', 'labels', 'like_count', 'quote_count', 'reply_count']
      __meta__ keys: ['platform', 'topic', 'framing', 'matched_keyword', 'llm_label']
      noisy_label: left
      platform: bluesky
      llm_label: Left

    Record 2 structure:
      Top-level keys: ['author', 'cid', 'indexed_at', 'record', 'uri', 'embed', 'labels', 'like_count', 'quote_count', 'reply_count']
      __meta__ keys: ['platform', 'topic', 'framing', 'matched_keyword', 'llm_label']
      noisy_label: left
      platform: bluesky
      llm_label: Neutral
    Processed 94264 records from this file

Processing: truthsocial/dataset/truthsocial_with_noisy_labels.json
Exists: True
Is file: True
File size: 298493511 bytes
Fi

In [17]:
# Spell out the difference between the two tasks explicitly.
# NOTE(review): the accuracy figures below are hard-coded string literals,
# presumably copied from an earlier run — confirm they still match the
# current results before relying on them.
print("="*60)
print("Task Comparison")
print("="*60)

print("\nTask 1 - Platform Classification (Noisy Labels):")
print("  - Binary classification: Bluesky vs TruthSocial")
print("  - What model learns: Platform-specific writing styles, topics, vocabulary")
print("  - Accuracy: 89.85%")

print("\nTask 2 - Political Stance Detection (LLM Labels):")
print("  - 3-way classification: Left vs Right vs Neutral")
print("  - What model learns: Actual political ideology from content")
print("  - Accuracy: 68.93%")

# Inspect the actual political diversity on each platform:
# share of each LLM label among the test-set posts of that platform.
print("\n" + "="*60)
print("Actual Political Diversity in Each Platform")
print("="*60)

for platform in ['bluesky', 'truthsocial']:
    subset = df_test[df_test['platform'] == platform]
    print(f"\n{platform.upper()} (n={len(subset)}):")
    distribution = subset['llm_label'].value_counts(normalize=True)
    for label, pct in distribution.items():
        print(f"  {label}: {pct:.1%}")

Task Comparison

Task 1 - Platform Classification (Noisy Labels):
  - Binary classification: Bluesky vs TruthSocial
  - What model learns: Platform-specific writing styles, topics, vocabulary
  - Accuracy: 89.85%

Task 2 - Political Stance Detection (LLM Labels):
  - 3-way classification: Left vs Right vs Neutral
  - What model learns: Actual political ideology from content
  - Accuracy: 68.93%

Actual Political Diversity in Each Platform

BLUESKY (n=18661):
  Neutral: 44.6%
  Left: 44.5%
  Right: 10.9%

TRUTHSOCIAL (n=7693):
  Right: 61.1%
  Neutral: 31.6%
  Left: 7.3%


In [18]:
# How accurate is the "platform implies stance" assumption?
print("="*60)
print("Platform Assumption Analysis")
print("="*60)

# Stance predicted from the platform alone: bluesky -> Left, truthsocial -> Right.
# This adds a 'platform_assumption' column to df_test in place.
df_test['platform_assumption'] = df_test['platform'].map({
    'bluesky': 'Left',
    'truthsocial': 'Right'
})

# Restrict to posts the LLM labelled Left or Right (Neutral is excluded).
df_binary = df_test[df_test['llm_label'].isin(['Left', 'Right'])]

correct_assumption = (df_binary['platform_assumption'] == df_binary['llm_label']).sum()
total_binary = len(df_binary)

print(f"\nPlatform assumption accuracy (excluding Neutral):")
print(f"  Correct: {correct_assumption}/{total_binary} = {correct_assumption/total_binary:.1%}")

# Same check per platform: share of non-Neutral posts that carry the assumed label.
for platform in ['bluesky', 'truthsocial']:
    platform_binary = df_binary[df_binary['platform'] == platform]
    if platform == 'bluesky':
        accuracy = (platform_binary['llm_label'] == 'Left').mean()
        print(f"\nBluesky users actually Left-leaning: {accuracy:.1%}")
    else:
        accuracy = (platform_binary['llm_label'] == 'Right').mean()
        print(f"TruthSocial users actually Right-leaning: {accuracy:.1%}")

Platform Assumption Analysis

Platform assumption accuracy (excluding Neutral):
  Correct: 13009/15605 = 83.4%

Bluesky users actually Left-leaning: 80.3%
TruthSocial users actually Right-leaning: 89.4%


In [22]:
# Extract the features learned by the model.
from sklearn.feature_extraction.text import TfidfVectorizer

# Use the trained noisy-label model.
vectorizer = model_noisy.named_steps['tfidf']
classifier = model_noisy.named_steps['nb']

# Per-class feature weights of the fitted classifier.
# NOTE(review): for ComplementNB, feature_log_prob_ holds weights derived from
# the *complement* of each class rather than plain per-class log-probabilities
# — confirm the rankings below mean what the headings say.
feature_names = vectorizer.get_feature_names_out()
feature_log_prob = classifier.feature_log_prob_

# Find the terms that separate the two classes.
print("="*60)
print("Platform-Specific Features Learned by Model")
print("="*60)

# Highest-weighted terms for the 'Left' class (Bluesky).
# NOTE(review): rows 0 / 1 are assumed to follow classifier.classes_ order
# ('Left', 'Right') — verify.
left_scores = feature_log_prob[0]  # Left class
# The top 20 indices are computed (descending weight); only the first 10 are printed.
top_left_indices = left_scores.argsort()[-20:][::-1]
print("\nTop features for 'Left' (actually Bluesky):")
for idx in top_left_indices[:10]:
    print(f"  {feature_names[idx]}")

# Highest-weighted terms for the 'Right' class (TruthSocial).
right_scores = feature_log_prob[1]  # Right class
top_right_indices = right_scores.argsort()[-20:][::-1]
print("\nTop features for 'Right' (actually TruthSocial):")
for idx in top_right_indices[:10]:
    print(f"  {feature_names[idx]}")

Platform-Specific Features Learned by Model

Top features for 'Left' (actually Bluesky):
  job creators
  fuck
  fucking
  from ecosearch
  asshole
  lgbtqnation com
  together scantopray
  lgbtqnation
  see medicare
  today www

Top features for 'Right' (actually TruthSocial):
  com users
  statuses
  rt https
  national intelligence
  presssec
  policy immigration
  pardons were
  pardons to
  administration mark
  report were


In [20]:
# A more meaningful analysis: how strongly platform and stance are associated.
print("\n" + "="*60)
print("Meaningful Analysis: Platform-Politics Correlation")
print("="*60)

# Correlation between platform and LLM-assigned stance.
from sklearn.metrics import matthews_corrcoef

# Binary case only: keep posts labelled Left or Right.
df_binary = df_test[df_test['llm_label'].isin(['Left', 'Right'])]

# Encode both variables as 0/1.
platform_encoded = df_binary['platform'].map({'bluesky': 0, 'truthsocial': 1})
politics_encoded = df_binary['llm_label'].map({'Left': 0, 'Right': 1})

correlation = matthews_corrcoef(platform_encoded, politics_encoded)
print(f"Matthews Correlation Coefficient: {correlation:.3f}")
print("(1.0 = perfect correlation, 0 = no correlation, -1.0 = perfect inverse correlation)")

# Conditional probabilities P(stance | platform). The denominator is every
# test post of the platform (Neutral included), so Left and Right need not sum to 1.
print("\nConditional Probabilities:")
for platform in ['bluesky', 'truthsocial']:
    for political in ['Left', 'Right']:
        count = len(df_test[(df_test['platform']==platform) & (df_test['llm_label']==political)])
        total = len(df_test[df_test['platform']==platform])
        prob = count / total if total > 0 else 0
        print(f"P({political} | {platform}): {prob:.3f}")


Meaningful Analysis: Platform-Politics Correlation
Matthews Correlation Coefficient: 0.665
(1.0 = perfect correlation, 0 = no correlation, -1.0 = perfect inverse correlation)

Conditional Probabilities:
P(Left | bluesky): 0.445
P(Right | bluesky): 0.109
P(Left | truthsocial): 0.073
P(Right | truthsocial): 0.611


In [19]:
# Print a few suggestions for further analysis.
print("\n" + "="*60)
print("Research Insights")
print("="*60)

print("\n1. Platform Echo Chamber Effect:")
print("   - How much does platform choice correlate with political ideology?")
print("   - Your data can quantify this!")

print("\n2. Cross-Platform Political Minorities:")
# Test-set counts of posts whose LLM label runs against the platform's assumed stance.
df_minorities = pd.DataFrame({
    'Right-wing on Bluesky': [len(df_test[(df_test['platform']=='bluesky') & (df_test['llm_label']=='Right')])],
    'Left-wing on TruthSocial': [len(df_test[(df_test['platform']=='truthsocial') & (df_test['llm_label']=='Left')])]
})
# Transposed so each count is printed on its own row.
print(df_minorities.T.to_string(header=False))

print("\n3. Model Robustness Question:")
print("   - Can a model trained on platform-balanced data generalize better?")
print("   - Your LLM-label model is actually learning cross-platform patterns!")

print("\n4. Feature Importance:")
print("   - What words distinguish platforms vs. political stances?")


Research Insights

1. Platform Echo Chamber Effect:
   - How much does platform choice correlate with political ideology?
   - Your data can quantify this!

2. Cross-Platform Political Minorities:
Right-wing on Bluesky     2036
Left-wing on TruthSocial   560

3. Model Robustness Question:
   - Can a model trained on platform-balanced data generalize better?
   - Your LLM-label model is actually learning cross-platform patterns!

4. Feature Importance:
   - What words distinguish platforms vs. political stances?


### BERT

In [1]:
import torch, numpy as np
from datasets import Dataset, DatasetDict
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                        DataCollatorWithPadding, TrainingArguments, Trainer)
from sklearn.metrics import accuracy_score, f1_score

device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = "distilbert-base-uncased"
MAX_LEN = 256

# This cell trains on the binary noisy labels: the classification head, the
# per-class metrics and the class weights below are all sized by LABELS_NOISY.
# The "y" column must NOT be used as the target here: it is built from
# LABEL2ID_LLM (ids 0..2, used for stratification), so id 2 ("Neutral") would
# be out of range for a 2-logit head.
BERT_LABEL_COL = "noisy_label"


def _encode_labels(df):
    """Return the integer class ids of *df*'s noisy labels (via LABEL2ID_NOISY)."""
    # astype fails loudly if a label is missing from the mapping (NaN).
    return df[BERT_LABEL_COL].map(LABEL2ID_NOISY).astype("int64")


# build HF datasets
def to_hf(df):
    """Convert a split DataFrame to a HF Dataset with an integer 'labels' column."""
    tmp = df[["text", "topic", "platform"]].copy()
    tmp["labels"] = _encode_labels(df)
    return Dataset.from_pandas(tmp, preserve_index=False)

hf = DatasetDict({
    "train": to_hf(df_train),
    "validation": to_hf(df_val),
    "test": to_hf(df_test),
})


tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize(batch):
    """Tokenize a batch of texts, truncating to MAX_LEN tokens (no padding here)."""
    return tok(batch["text"], truncation=True, max_length=MAX_LEN)

hf_tok = hf.map(
    tokenize,
    batched=True,
    remove_columns=["text", "topic", "platform"],  # 'labels' is kept
)
# Padding is applied per batch by the collator.
data_collator = DataCollatorWithPadding(tokenizer=tok)

def compute_metrics(pred):
    """Return accuracy, macro-F1 and a one-vs-rest F1 per noisy-label class."""
    logits, labels = pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    f1_macro = f1_score(labels, preds, average="macro")

    out = {"accuracy": acc, "f1": f1_macro}

    # per-class F1 (each class scored as "this class vs the rest")
    for i, name in ID2LABEL_NOISY.items():
        out[f"f1_{name}"] = f1_score(
            (labels == i).astype(int),
            (preds == i).astype(int),
            average="binary",
            zero_division=0,
        )
    return out

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS_NOISY),
    id2label=ID2LABEL_NOISY,
    label2id=LABEL2ID_NOISY
).to(device)

args = TrainingArguments(
    output_dir="models_distilbert",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=200,
    save_steps=2000,
    seed=RANDOM_SEED,
    dataloader_num_workers=4,
)

# handle imbalance with class weights (inverse class frequency of the
# training targets; set use_class_weights = False for an unweighted loss)
use_class_weights = True
class_counts = (
    _encode_labels(df_train)
    .value_counts()
    .reindex(range(len(LABELS_NOISY)), fill_value=0)
    .values
)
# np.maximum(..., 1) guards against division by zero for an absent class.
weights = torch.tensor(len(df_train)/np.maximum(class_counts,1), dtype=torch.float32, device=device)
weights = weights / weights.sum() * len(LABELS_NOISY)  # normalize around 1

class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy with optional per-class weights."""

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Tensor of per-class weights, or None for the unweighted loss.
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute (weighted) cross-entropy from the model's logits.

        Extra keyword arguments are accepted and ignored. Returns the loss,
        or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        # labels
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        if self.class_weights is not None:
            loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
        else:
            loss_fct = torch.nn.CrossEntropyLoss()

        loss = loss_fct(
            logits.view(-1, model.config.num_labels),
            labels.view(-1)
        )
        return (loss, outputs) if return_outputs else loss

trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=hf_tok["train"],
    eval_dataset=hf_tok["validation"],
    tokenizer=tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    class_weights=weights if use_class_weights else None,
)

trainer.train()
print("Val metrics:", trainer.evaluate(hf_tok["validation"]))
print("Test metrics:", trainer.evaluate(hf_tok["test"]))

# saving model
trainer.save_model("models_distilbert/best")
tok.save_pretrained("models_distilbert/best")


KeyboardInterrupt: 