### Import and setup

In [2]:
import json, gzip, io, math
from pathlib import Path
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt

# Input locations, one per platform. Each entry may name a single file or a
# directory; the aggregation loop handles both.
# Forward slashes are used because pathlib accepts them on Windows and POSIX
# alike, whereas backslash-separated paths only resolve on Windows.
DATA_DIRS = [
    "./bluesky/dataset/labeled_all_posts.json",
    "./truthsocial/dataset/labeled_all_posts.json",
]

# Output directory for the CSV tables and PNG charts written by later cells.
SAVE_DIR = Path("./label_eda")
# Create it up front (with parents); an already existing directory is fine.
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Search keywords per policy topic, grouped by framing bucket.
# Structure: {topic: {"left" | "right" | "neutral": [keyword, ...]}}.
# The framing-map cell lower-cases and strips these keywords to build the
# keyword -> topic and keyword -> bucket lookups.
search_terms = {
    "Abortion and Reproductive Policy": {
        "left": ["reproductive rights", "pro-choice", "abortion access", "women's rights"],
        "right": ["pro-life", "unborn", "sanctity of life", "heartbeat bill"],
        "neutral": ["abortion law", "abortion policy", "Roe v. Wade", "Planned Parenthood"],
    },
    "Gun Policy and Firearms Regulation": {
        "left": ["gun control", "assault weapons ban", "background checks"],
        "right": ["gun rights", "Second Amendment", "2A", "constitutional carry"],
        "neutral": ["firearms policy", "gun laws", "gun ownership"],
    },
    "Climate and Environmental Policy": {
        "left": ["climate crisis", "renewable energy", "green new deal"],
        "right": ["energy independence", "fossil fuels", "climate hoax"],
        "neutral": ["climate change", "carbon emissions", "environmental regulation", "Paris Agreement"],
    },
    "Healthcare and Insurance Reform": {
        "left": ["universal healthcare", "Medicare for All", "public option"],
        "right": ["health freedom", "government overreach", "private insurance"],
        "neutral": ["healthcare reform", "Affordable Care Act", "health insurance policy"],
    },
    "Immigration and Border Policy": {
        "left": ["immigrant rights", "DACA", "asylum seekers", "family separation"],
        "right": ["border crisis", "illegal immigration", "build the wall"],
        "neutral": ["immigration policy", "border security", "visa policy"],
    },
    "LGBTQ+ and Civil Rights": {
        "left": ["LGBTQ+ rights", "trans rights", "marriage equality"],
        "right": ["religious freedom", "parental rights", "traditional values"],
        "neutral": ["civil rights", "anti-discrimination", "gender identity policy"],
    },
    "Voting and Election Policy": {
        "left": ["voting rights", "voter suppression", "expand mail-in voting"],
        "right": ["election integrity", "voter fraud", "secure elections"],
        "neutral": ["election reform", "voter ID laws", "ballot access"],
    },
    "Economic Inequality and Taxation": {
        "left": ["wealth tax", "economic justice", "raise minimum wage"],
        "right": ["tax burden", "job creators", "free market"],
        "neutral": ["tax policy", "income inequality", "economic mobility"],
    },
    "Policing and Criminal Justice Reform": {
        "left": ["police reform", "defund the police", "mass incarceration"],
        "right": ["law and order", "back the blue", "crime wave"],
        "neutral": ["criminal justice reform", "public safety", "police accountability"],
    },
    "Free Speech and Content Regulation": {
        "left": ["hate speech", "content moderation", "misinformation"],
        "right": ["free speech", "censorship", "cancel culture"],
        "neutral": ["First Amendment", "online speech", "platform policy"],
    },
    "Affirmative Action and Education Policy": {
        "left": ["affirmative action", "diversity in education", "racial equity"],
        "right": ["merit-based admissions", "colorblind policy", "reverse discrimination"],
        "neutral": ["college admissions", "education policy", "Supreme Court decision"],
    },
    "Drug Policy and Substance Regulation": {
        "left": ["drug decriminalization", "harm reduction", "marijuana legalization"],
        "right": ["war on drugs", "fentanyl crisis", "tough on crime"],
        "neutral": ["drug policy", "opioid epidemic", "controlled substances"],
    },
    "Foreign Policy and National Defense": {
        "left": ["diplomacy", "humanitarian aid", "anti-war"],
        "right": ["military strength", "America First", "peace through strength"],
        "neutral": ["foreign policy", "NATO", "defense spending", "Ukraine war"],
    },
    "Technology and Internet Regulation": {
        "left": ["tech accountability", "data privacy", "AI regulation"],
        "right": ["free market innovation", "anti-censorship", "Big Tech bias"],
        "neutral": ["technology policy", "Section 230", "AI ethics"],
    },
    "Race and Education Curriculum": {
        "left": ["racial justice education", "anti-racism curriculum", "diversity training"],
        "right": ["critical race theory", "parental rights in education", "woke indoctrination"],
        "neutral": ["education curriculum", "teaching history", "school policy"],
    },
    "Sex Education and Family Policy": {
        "left": ["comprehensive sex education", "LGBTQ-inclusive curriculum"],
        "right": ["abstinence education", "parental consent", "family values"],
        "neutral": ["sex education policy", "school curriculum", "health education"],
    },
    "Basic Income and Welfare Programs": {
        "left": ["universal basic income", "social safety net", "poverty relief"],
        "right": ["welfare dependency", "personal responsibility", "work requirements"],
        "neutral": ["welfare policy", "income support", "economic assistance"],
    },
    "Trade and Economic Policy": {
        "left": ["fair trade", "labor rights", "climate-friendly trade"],
        "right": ["free trade", "tariffs", "America First trade"],
        "neutral": ["trade policy", "import/export", "trade agreements"],
    },
    "Social Security and Retirement Policy": {
        "left": ["protect social security", "expand benefits"],
        "right": ["entitlement reform", "reduce spending", "privatize social security"],
        "neutral": ["retirement policy", "social security funding", "aging population"],
    },
    "National Security and Civil Liberties": {
        "left": ["surveillance reform", "privacy rights", "anti-war movement"],
        "right": ["national security", "border protection", "patriotism"],
        "neutral": ["counterterrorism", "cybersecurity", "civil liberties"],
    },
}

### Build framing map

In [3]:
# Reverse lookups from a normalized (stripped, lower-cased) keyword to the
# topic it was listed under and to its framing bucket
# ('left' | 'right' | 'neutral'). A keyword listed more than once keeps the
# last topic/bucket seen.
kw_to_topic = {}
kw_to_bucket = {}
for topic, buckets in search_terms.items():
    for bucket, kws in buckets.items():
        normalized = [kw.strip().lower() for kw in kws]
        kw_to_topic.update(dict.fromkeys(normalized, topic))
        kw_to_bucket.update(dict.fromkeys(normalized, bucket))

### Helper functions

In [5]:
def iter_paths(root: Path):
    """Yield every JSON-like data file found recursively under ``root``.

    A file qualifies when its name ends (case-insensitively) with ``.json``,
    ``.jsonl`` or ``.ndjson``, or with one of the gzip-compressed variants
    ``.json.gz``, ``.jsonl.gz`` or ``.ndjson.gz``.

    Args:
        root: Directory to search.

    Yields:
        ``Path`` objects of the matching regular files, in ``rglob`` order.
    """
    exts = (".json", ".jsonl", ".ndjson", ".json.gz", ".jsonl.gz", ".ndjson.gz")
    for p in root.rglob("*"):
        # rglob also returns directories; a directory that happens to be
        # named like "raw.json" must not be handed to the file reader.
        if p.name.lower().endswith(exts) and p.is_file():
            yield p

def open_text(path: Path):
    """Open ``path`` for reading as UTF-8 text, decompressing gzip on the fly.

    Args:
        path: File to open. Names ending in ``.gz`` (any letter case) are
            read through ``gzip``.

    Returns:
        A seekable text-mode file object; the caller is responsible for
        closing it (it works as a context manager).
    """
    # "utf-8-sig" reads plain UTF-8 unchanged and drops a leading BOM, which
    # would otherwise hide the first "[" / "{" from format detection.
    if str(path).lower().endswith(".gz"):
        return gzip.open(path, "rt", encoding="utf-8-sig")
    return open(path, "r", encoding="utf-8-sig")


def _iter_json_lines(lines, strip_trailing_comma=False):
    """Yield the dict parsed from each line; skip blank, invalid or non-dict lines."""
    for line in lines:
        line = line.strip()
        if strip_trailing_comma:
            # A valid JSON document never ends with ",", so this only helps
            # lines taken from the inside of a pretty-printed array.
            line = line.rstrip(",").rstrip()
        if not line:
            continue
        try:
            obj = json.loads(line)
        except (ValueError, RecursionError):
            continue
        if isinstance(obj, dict):
            yield obj


def iter_records_from_file(path: Path):
    """Yield the dict records stored in a JSON array file or a JSON-lines file.

    The format is sniffed from the first 2048 characters: if the first
    non-whitespace character is ``[`` the file is parsed as one JSON array,
    otherwise it is read line by line as JSON-lines. Elements or lines that
    are not JSON objects are skipped, as are lines that fail to parse.

    If the array cannot be parsed as a whole (e.g. the file is truncated),
    the file is re-read line by line and every line that is a complete JSON
    object, optionally followed by a comma, is salvaged.

    Args:
        path: File to read; opened through ``open_text``.

    Yields:
        One ``dict`` per record.
    """
    with open_text(path) as f:
        head = f.read(2048)
        f.seek(0)

        if head.lstrip()[:1] != "[":
            yield from _iter_json_lines(f)
            return

        try:
            data = json.load(f)
        except (ValueError, RecursionError):
            # json.load has consumed the stream; rewind, otherwise the
            # line-by-line fallback would start at EOF and yield nothing.
            f.seek(0)
            yield from _iter_json_lines(f, strip_trailing_comma=True)
            return

        # A document starting with "[" always decodes to a list.
        for obj in data:
            if isinstance(obj, dict):
                yield obj

def normalize_label(x):
    """Map a raw label onto 'Left', 'Right' or 'Neutral' where possible.

    Falsy input yields ``None``. Otherwise the value is stringified and
    stripped; known spellings (case-insensitive) are mapped to the canonical
    label, and anything else is returned stripped but otherwise unchanged.
    """
    if not x:
        return None
    stripped = str(x).strip()
    aliases = {
        "left": "Left",
        "l": "Left",
        "right": "Right",
        "r": "Right",
        "neutral": "Neutral",
        "centre": "Neutral",
        "center": "Neutral",
        "centrist": "Neutral",
        "n": "Neutral",
    }
    return aliases.get(stripped.lower(), stripped)

def safe_meta(meta):
    """Pull the fields used by the analysis out of a record's metadata mapping.

    Returns a ``(topic, matched_keyword, platform, label)`` tuple. The first
    three are read as-is via ``meta.get`` (``None`` when absent); ``label`` is
    the ``llm_label`` entry passed through ``normalize_label``.
    """
    raw_fields = [meta.get(name) for name in ("topic", "matched_keyword", "platform")]
    return (*raw_fields, normalize_label(meta.get("llm_label")))

### Aggregation

In [6]:
# Tally of (platform, topic, matched_keyword, framing_bucket, llm_label) tuples.
C_kw = Counter()

# Coarser tally of (platform, topic, llm_label) tuples.
C_topic = Counter()

# n_seen counts every record read; n_used only those with all required fields.
n_seen = n_used = 0

for d in DATA_DIRS:
    root = Path(d)
    if not root.exists():
        print(f"[WARN] Missing: {root}")
        continue

    # An entry may name a single file or a directory to search recursively.
    paths = [root] if root.is_file() else iter_paths(root)

    for path in paths:
        for rec in iter_records_from_file(path):
            n_seen += 1
            meta = rec.get("__meta__", {}) or {}
            topic, matched_keyword, platform, label = safe_meta(meta)
            # Records missing any of the four fields are not counted.
            if not all((topic, matched_keyword, platform, label)):
                continue

            # Framing bucket of the keyword; None when the keyword is not in
            # the search-term table. The topic is taken from the metadata.
            kw_norm = matched_keyword.strip().lower()
            framing = kw_to_bucket.get(kw_norm)

            C_kw.update([(platform, topic, matched_keyword, framing, label)])
            C_topic.update([(platform, topic, label)])
            n_used += 1

print(f"Processed records: seen={n_seen:,}, used={n_used:,}")

Processed records: seen=132,459, used=132,459


### Dataframe setup

In [7]:
# Keyword-level counts, one row per distinct
# (platform, topic, keyword, framing bucket, label) combination.
# Keywords without a framing bucket are labelled "unknown".
df_kw = pd.DataFrame(
    [
        (platform_, topic_, keyword_, bucket_ or "unknown", label_, n)
        for (platform_, topic_, keyword_, bucket_, label_), n in C_kw.items()
    ],
    columns=[
        "platform",
        "topic",
        "matched_keyword",
        "framing_bucket",
        "llm_label",
        "count",
    ],
)

# Topic-level counts, one row per distinct (platform, topic, label).
df_topic = pd.DataFrame(
    [(*key, n) for key, n in C_topic.items()],
    columns=["platform", "topic", "llm_label", "count"],
)

def add_props(df, group_cols, count_col="count", prop_col="prop"):
    """Return a copy of ``df`` with each row's share of its group total.

    Args:
        df: Input frame; it is not modified.
        group_cols: Column name(s) defining the groups (NaN keys are kept as
            their own group).
        count_col: Column holding the counts to normalise.
        prop_col: Name of the column to add.

    Returns:
        A new DataFrame with ``prop_col`` = ``count_col`` / group sum of
        ``count_col``. Rows whose group sums to zero get NaN.
    """
    totals = df.groupby(group_cols, dropna=False)[count_col].transform("sum")
    out = df.copy()
    # Mask zero totals with NaN rather than pd.NA so the division stays
    # numeric (float64) instead of degrading the column to object dtype.
    out[prop_col] = out[count_col] / totals.where(totals != 0)
    return out

# "prop" = share of each label within its (platform, topic, keyword) group.
df_kw = add_props(df_kw, ["platform", "topic", "matched_keyword"])
# "prop" = share of each label within its (platform, topic) group.
df_topic = add_props(df_topic, ["platform", "topic"])


### Framing vs Label agreement
For keywords that belong to a framing bucket (left/right/neutral), what % of LLM labels match that bucket?


In [10]:
def matches_framing(row):
    """Return 1 if the row's LLM label equals its framing bucket, else 0.

    Both ``row["framing_bucket"]`` and ``row["llm_label"]`` are compared as
    lower-cased strings, and only the buckets 'left', 'right' and 'neutral'
    can ever match (so "unknown" buckets always give 0).
    """
    bucket = str(row["framing_bucket"]).lower()
    label = str(row["llm_label"]).lower()
    if bucket not in ("left", "right", "neutral"):
        return 0
    return int(label == bucket)

# 1 where the row's LLM label equals the keyword's framing bucket, else 0.
df_kw["align"] = df_kw.apply(matches_framing, axis=1)

# Alignment rate per keyword = count-weighted share of aligned rows.
# Computed with two grouped sums rather than groupby.apply: it gives the same
# numbers, avoids a Python call per group, and does not rely on the
# `include_groups` argument that only newer pandas releases accept.
_align_keys = ["platform", "topic", "matched_keyword", "framing_bucket"]
_align_sums = (df_kw
               .assign(_aligned=df_kw["count"] * df_kw["align"])
               .groupby(_align_keys, dropna=False)[["_aligned", "count"]]
               .sum())
align_kw = ((_align_sums["_aligned"] / _align_sums["count"])
            .reset_index(name="alignment_rate"))

# Framing-bucket-level label distributions: counts summed over keywords, then
# "prop" = share of each label within its (platform, topic, framing bucket).
framing_dist = (df_kw
    .groupby(["platform", "topic", "framing_bucket", "llm_label"], dropna=False)["count"]
    .sum().reset_index())
framing_dist = add_props(framing_dist, ["platform", "topic", "framing_bucket"])

### Cross-platform comparison and KL divergence

In [12]:
def label_vector(df, key_cols, labels=("Left", "Right", "Neutral")):
    """Build a Laplace-smoothed label distribution for every key in ``df``.

    Args:
        df: Frame with an ``llm_label`` column, a ``count`` column and the
            columns named in ``key_cols``.
        key_cols: List of column names identifying a distribution.
        labels: Labels that make up the distribution. Rows carrying any other
            label are ignored.

    Returns:
        ``{key: {label: probability}}`` where ``key`` is the group key
        produced by ``groupby(key_cols)`` and the probabilities are
        ``(count + 1) / (total + len(labels))``; they sum to 1 for each key.
    """
    labels = list(labels)
    out = {}
    base = df.groupby(list(key_cols) + ["llm_label"])["count"].sum().reset_index()
    for key, g in base.groupby(key_cols):
        by_label = dict(zip(g["llm_label"], g["count"]))
        counts = [int(by_label.get(lab, 0)) for lab in labels]
        # Total over the listed labels only: counting other labels in the
        # denominator would leave a vector that does not sum to 1.
        tot = sum(counts)
        out[key] = {
            lab: (c + 1) / (tot + len(labels))
            for lab, c in zip(labels, counts)
        }
    return out

def kl(p, q):
    """Return KL(p || q) in nats for two probability sequences of equal order.

    A small constant is added to numerator and denominator inside the
    logarithm so that zero probabilities do not raise.
    """
    tiny = 1e-12
    terms = []
    for p_i, q_i in zip(p, q):
        ratio = (p_i + tiny) / (q_i + tiny)
        terms.append(p_i * math.log(ratio))
    return sum(terms)

# Smoothed label distribution for every (platform, topic) pair.
topic_vecs = label_vector(df_topic, ["platform", "topic"])

# One row per topic that has a distribution on both platforms:
# KL(truthsocial || bluesky) plus the two [Left, Right, Neutral] vectors.
labels = ["Left", "Right", "Neutral"]
rows_kl = []
for topic in df_topic["topic"].unique():
    p = topic_vecs.get(("truthsocial", topic))
    q = topic_vecs.get(("bluesky", topic))
    if not (p and q):
        continue
    P = [p[l] for l in labels]
    Q = [q[l] for l in labels]
    rows_kl.append(
        {
            "topic": topic,
            "kl_truth_vs_blue": kl(P, Q),
            "truth_LRN": P,
            "blue_LRN": Q,
        }
    )

df_kl = pd.DataFrame(rows_kl)

### Saving CSV outputs

In [None]:
# (frame, sort columns, output file name) for every table that is always written.
_exports = [
    (df_kw, ["topic", "matched_keyword", "platform", "llm_label"], "keyword_label_distribution.csv"),
    (df_topic, ["topic", "platform", "llm_label"], "topic_label_distribution.csv"),
    (align_kw, ["topic", "matched_keyword", "platform"], "alignment_rate_by_keyword.csv"),
    (framing_dist, ["topic", "framing_bucket", "platform", "llm_label"], "framing_bucket_distributions.csv"),
]
for _frame, _order, _csv_name in _exports:
    _frame.sort_values(_order).to_csv(SAVE_DIR / _csv_name, index=False)

# The KL table is only written when at least one topic was comparable.
if not df_kl.empty:
    df_kl.sort_values("kl_truth_vs_blue", ascending=False).to_csv(
        SAVE_DIR / "topic_kl_divergence_truth_vs_bluesky.csv", index=False
    )

In [None]:
# Keep the N keywords with the largest total count across platforms and labels.
N = 20
kw_totals = df_kw.groupby(["matched_keyword"])["count"].sum()
top_kw = kw_totals.sort_values(ascending=False).head(N).index.tolist()
plot_kw = df_kw[df_kw["matched_keyword"].isin(top_kw)]

def stacked_bar(df, x_col, y_col="count", hue_col="llm_label", title="", fname="plot.png"):
    """Save a stacked bar chart of ``y_col`` per ``x_col``, stacked by ``hue_col``.

    Args:
        df: Long-format frame containing ``x_col``, ``y_col`` and ``hue_col``.
        x_col: Column whose values become the bars.
        y_col: Column summed to give each segment's height.
        hue_col: Column whose values become the stacked segments.
        title: Chart title.
        fname: File name of the PNG, written inside ``SAVE_DIR``.

    Segment order: when any of 'Left', 'Right', 'Neutral' occurs, those three
    always come first in that order (a missing one is added as zeros so that
    colours stay comparable between charts), followed by any other values in
    sorted order. Otherwise all values are shown in sorted order. Nothing is
    written when there is no data to plot.
    """
    # Wide format: one row per x value, one column per hue value.
    wide = df.pivot_table(index=x_col, columns=hue_col, values=y_col,
                          aggfunc="sum", fill_value=0)
    if wide.empty:
        print(f"[WARN] Nothing to plot for {fname}")
        return

    canonical = ["Left", "Right", "Neutral"]
    extras = sorted((c for c in wide.columns if c not in canonical), key=str)
    if any(c in wide.columns for c in canonical):
        order = canonical + extras
    else:
        order = extras
    wide = wide.reindex(columns=order, fill_value=0)

    ax = wide.plot(kind="bar", stacked=True, figsize=(12, 6))
    ax.set_xlabel(x_col.replace("_", " "))
    ax.set_ylabel("Count")
    ax.set_title(title)

    # Work on the figure that owns the axes rather than pyplot's notion of
    # the "current" figure, then release it.
    fig = ax.get_figure()
    fig.tight_layout()
    fig.savefig(SAVE_DIR / fname, dpi=150)
    plt.close(fig)

# Label mix for the most frequent keywords, all platforms pooled.
stacked_bar(
    plot_kw,
    "matched_keyword",
    title="Label distribution for top keywords (all platforms)",
    fname="top_keywords_stacked_labels.png",
)

# One topic-level chart per platform.
for plat in df_topic["platform"].unique():
    dfp = df_topic.loc[df_topic["platform"] == plat]
    stacked_bar(
        dfp,
        "topic",
        title=f"Topic label distribution — {plat}",
        fname=f"topic_labels_{plat}.png",
    )

# One framing-bucket chart per platform.
for plat in df_kw["platform"].unique():
    dfp = framing_dist.loc[framing_dist["platform"] == plat]
    stacked_bar(
        dfp,
        "framing_bucket",
        title=f"Framing-bucket label distribution — {plat}",
        fname=f"framing_bucket_labels_{plat}.png",
    )