In [3]:
!pip install scikit-learn




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
# JUPYTER-FRIENDLY DATA PREP FOR STANCE CLASSIFICATION
# - Merge multiple JSON files (each a list of dicts with 'stance')
# - Add `case` from filename
# - Dedup by `id`, shuffle, save merged JSON/JSONL
# - Stratified split (train/val/test) by `stance`
# - Save label_map, class_counts, class_weights, per_case_counts, summary

from pathlib import Path
from typing import List, Dict, Any, Tuple
import json, random
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split

def _load_json_list(path: Path):
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, list):
        raise ValueError(f"{path} is not a JSON list")
    return data

def _normalize_record(rec: Dict[str, Any], case_name: str) -> Dict[str, Any]:
    out = dict(rec)
    out["case"] = case_name
    if isinstance(out.get("stance"), str):
        out["stance"] = out["stance"].strip().lower()
    return out

def _write_json(path: Path, obj: Any):
    with path.open("w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def _write_jsonl(path: Path, rows: List[Dict[str, Any]]):
    with path.open("w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps(r, ensure_ascii=False) + "\n")

def _compute_class_weights(labels: List[str]) -> Dict[str, float]:
    counts = Counter(labels)
    total = sum(counts.values())
    k = len(counts)
    return {lbl: (total / (k * cnt)) for lbl, cnt in counts.items() if cnt > 0}

def prepare_merge_and_split(
    inputs: List[str],
    output_dir: str,
    seed: int = 42,
    val_size: float = 0.10,
    test_size: float = 0.10,
    shuffle: bool = True,
) -> Dict[str, Any]:
    """
    Merge stance-labelled JSON files, write dataset statistics, and create
    stratified train/val/test splits.

    Args:
        inputs: list of JSON file paths (each a list of dicts with 'stance').
            Relative paths are resolved against the current working directory.
        output_dir: where to save outputs (created if missing)
        seed: seed for the shuffle and for the stratified splits
        val_size: fraction of labelled rows for validation (0 disables it)
        test_size: fraction of labelled rows for test (0 disables it)
        shuffle: shuffle the merged records before saving/splitting

    Returns:
        dict with paths, counts, weights, and split sizes (also written to
        ``summary.txt`` in ``output_dir``).

    Raises:
        ValueError: if a split fraction is negative, if
            ``val_size + test_size`` is not in (0, 1), if an input file is
            not a JSON list, or if no record carries a 'stance' label.
        FileNotFoundError: if any input path does not exist.
    """
    # Fail fast on bad arguments, before anything is written to disk.
    if val_size < 0 or test_size < 0:
        raise ValueError("val_size and test_size must be non-negative.")
    holdout = val_size + test_size
    if not (0 < holdout < 1):
        raise ValueError("val_size + test_size must be in (0,1).")

    # Check every input up front so one error reports all missing files and
    # the directory that relative paths were resolved against.
    input_paths = [Path(p) for p in inputs]
    missing = [str(p) for p in input_paths if not p.exists()]
    if missing:
        raise FileNotFoundError(
            f"Input file(s) not found (relative paths resolve against {Path.cwd()}): {missing}"
        )

    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    random.seed(seed)

    # Load + tag case from filename
    merged = []
    for pth in input_paths:
        case_name = pth.stem.replace("tweets_", "").replace("_final_for_bert", "")
        data = _load_json_list(pth)
        merged.extend(_normalize_record(r, case_name) for r in data)

    # Dedup by 'id'. Records without an id cannot be compared, so they are
    # all kept instead of collapsing into a single "None" entry.
    seen = set()
    deduped = []
    for r in merged:
        tid = r.get("id")
        if tid is not None:
            if tid in seen:
                continue
            seen.add(tid)
        deduped.append(r)

    # Shuffle
    if shuffle:
        random.shuffle(deduped)

    # Ensure keys
    required = ["id", "text", "clean_text", "stance", "case"]
    for r in deduped:
        for k in required:
            r.setdefault(k, None)

    # Save merged
    merged_json = out_dir / "merged_scapegoat_stance.json"
    merged_jsonl = out_dir / "merged_scapegoat_stance.jsonl"
    _write_json(merged_json, deduped)
    _write_jsonl(merged_jsonl, deduped)

    # Labels, maps, counts
    labels = [r["stance"] for r in deduped if r.get("stance") is not None]
    if not labels:
        raise ValueError("No 'stance' labels found.")

    label_order = sorted(set(labels))  # deterministic
    label_map = {lbl: i for i, lbl in enumerate(label_order)}
    _write_json(out_dir / "label_map.json", label_map)

    class_counts = Counter(labels)
    _write_json(out_dir / "class_counts.json", class_counts)

    class_weights = _compute_class_weights(labels)
    _write_json(out_dir / "class_weights.json", class_weights)

    # Per-case stance counts (unlabelled records are counted under None).
    per_case = defaultdict(Counter)
    for r in deduped:
        per_case[r["case"]][r.get("stance")] += 1
    per_case_out = {case: dict(cnt) for case, cnt in per_case.items()}
    _write_json(out_dir / "per_case_counts.json", per_case_out)

    # Stratified splits over labelled rows only; X holds row indices.
    rows = [r for r in deduped if r.get("stance") in label_map]
    y = [r["stance"] for r in rows]
    X = list(range(len(rows)))

    X_train, X_tmp, _, y_tmp = train_test_split(
        X, y, test_size=holdout, random_state=seed, stratify=y
    )

    # train_test_split rejects a test_size of 0 or 1, so when one of the two
    # hold-out parts is disabled the whole hold-out goes to the other part.
    if test_size == 0:
        X_val, X_test = list(X_tmp), []
    elif val_size == 0:
        X_val, X_test = [], list(X_tmp)
    else:
        test_ratio = test_size / holdout
        X_val, X_test, _, _ = train_test_split(
            X_tmp, y_tmp, test_size=test_ratio, random_state=seed, stratify=y_tmp
        )

    train_rows = [rows[i] for i in X_train]
    val_rows   = [rows[i] for i in X_val]
    test_rows  = [rows[i] for i in X_test]

    _write_jsonl(out_dir / "merged_train.jsonl", train_rows)
    _write_jsonl(out_dir / "merged_val.jsonl",   val_rows)
    _write_jsonl(out_dir / "merged_test.jsonl",  test_rows)

    # Summary (also save)
    summary = {
        "total_after_dedup": len(deduped),
        "label_order": label_order,
        "class_counts": dict(class_counts),
        "class_weights": {k: round(v, 6) for k, v in class_weights.items()},
        "per_case_counts": per_case_out,
        "splits": {"train": len(train_rows), "val": len(val_rows), "test": len(test_rows)},
        "files": {
            "merged_json": str(merged_json),
            "merged_jsonl": str(merged_jsonl),
            "label_map": str(out_dir / "label_map.json"),
            "class_counts": str(out_dir / "class_counts.json"),
            "class_weights": str(out_dir / "class_weights.json"),
            "per_case_counts": str(out_dir / "per_case_counts.json"),
            "train": str(out_dir / "merged_train.jsonl"),
            "val": str(out_dir / "merged_val.jsonl"),
            "test": str(out_dir / "merged_test.jsonl"),
        },
    }
    with (out_dir / "summary.txt").open("w", encoding="utf-8") as f:
        f.write(json.dumps(summary, ensure_ascii=False, indent=2) + "\n")

    # Nice print
    print(f"Total after dedup: {summary['total_after_dedup']}")
    for lbl in summary["label_order"]:
        print(f"  {lbl}: {summary['class_counts'].get(lbl,0)}  (weight={summary['class_weights'][lbl]:.4f})")
    print(f"Splits -> Train: {summary['splits']['train']}  Val: {summary['splits']['val']}  Test: {summary['splits']['test']}")
    print(f"Wrote outputs to: {out_dir.resolve()}")

    return summary


In [5]:
# Base data directory, relative to the kernel's working directory. If the
# notebook is launched from a different folder, adjust this single line.
DATA_DIR = Path("../data")

# One processed JSON file per case.
CASE_FILES = [
    "tweets_monark_final_for_bert.json",
    "tweets_wagner_schwartz_final_for_bert.json",
]

summary = prepare_merge_and_split(
    inputs=[str(DATA_DIR / "processed" / name) for name in CASE_FILES],
    output_dir=str(DATA_DIR / "train"),
    seed=42,
    val_size=0.10,
    test_size=0.10,
    shuffle=True,
)

summary  # shows paths, counts, weights, split sizes


FileNotFoundError: [Errno 2] No such file or directory: '..\\data\\processed\\tweets_monark_final_for_bert.json'

In [None]:
# Sanity check for the FileNotFoundError raised by the previous cell: the
# relative path is resolved against the kernel's current working directory,
# so this prints False when that directory has no ../data/processed file
# with this name.
import os
print(os.path.exists("../data/processed/tweets_monark_final_for_bert.json"))
