binary system with string bins

In [14]:
# Mirror plots (A on top, B mirrored below) from a single CSV,
# split by regulation (upregulated, downregulated) for EACH experiment.
#
# - Uses column "replicate" directly as group (values "A" or "B")
# - For every (experiment, regulation) subset, creates a mirror plot
# - Within each bin, keeps only TOP_K most abundant proteoforms by max SNR
# - Proteoforms sorted by SNR (descending)
# - hit_number kept as strings (e.g., "20_10"), annotated above/below bars
#
# NEW:
# - Custom Y-axis ranges via Y_LIMITS (tuple or dict per regulation)
# - Larger, configurable fonts via FONT_SIZES
# - SHOW_TITLE / SHOW_LEGEND toggles

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import re

# --------------------
# CONFIG
# --------------------
# Input and output currently share one folder; the report CSV is read from IN_DIR
# and the PNG files are written to OUT_DIR.
IN_DIR = Path(r"F:/binary/final")
OUT_DIR = Path(r"F:/binary/final")
CSV_IN = IN_DIR / "report.csv"

# How many proteoforms (ranked by max SNR) are shown per bin.
TOP_K = 5

# Bins listed here are drawn first, in this order; any other bin follows alphabetically.
BIN_ORDER = [
    "soluble_fraction",
    "insoluble_fraction",
]

# Font sizes (points) for each text element of the figure.
FONT_SIZES = dict(
    title=20,
    axis_label=16,
    tick=16,
    legend=14,
    bin_header=20,
    hit_anno=14,
)

# Display toggles for the figure title and the legend.
SHOW_TITLE = False
SHOW_LEGEND = False

# Custom Y-limits as (ymin, ymax); y=0 is the mirror line.
#   * tuple -> one range for every plot, e.g. (-50, 200)
#   * dict  -> one range per regulation; keys 'upregulated' and/or 'downregulated'
#   * None, or a missing dict key -> matplotlib autoscale
Y_LIMITS = {
    "upregulated": (-1000, 1000),
    "downregulated": (-1000, 1000),
}

# --------------------
# HELPERS
# --------------------
# One to three underscore-separated integers, with optional surrounding whitespace.
_hit_re = re.compile(r"^\s*(\d+)(?:_(\d+))?(?:_(\d+))?\s*$")

def parse_hit_token(token: str):
    """
    Convert a hit token such as "20", "20_10" or "20_10_3" into a tuple of ints.

    Returns None when `token` is not a string or does not match that pattern.
    """
    matched = _hit_re.match(token) if isinstance(token, str) else None
    if matched is None:
        return None
    # Optional groups that did not participate in the match are None; skip them.
    numbers = [int(part) for part in matched.groups() if part is not None]
    return tuple(numbers)

def pick_best_hit(series: pd.Series):
    """
    Choose the hit token to report for a group of rows.

    Every non-null value is stringified. Among the tokens that parse_hit_token
    accepts, the one with the largest parsed tuple wins (the first one on ties).
    If no token parses, the first non-null value is returned as text.
    Returns None when the series holds no non-null values.
    """
    top_key = None
    chosen = None
    for raw in series.dropna():
        text = str(raw)
        key = parse_hit_token(text)
        if key is None:
            # Unparseable tokens only serve as a fallback when nothing is chosen yet.
            if chosen is None:
                chosen = text
            continue
        if top_key is None or key > top_key:
            top_key = key
            chosen = text
    return chosen

def norm_regulation(val) -> str | None:
    """
    Normalize regulation labels to 'upregulated' or 'downregulated'.
    Returns None for anything else (those rows will be ignored).
    """
    if pd.isna(val):
        return None
    s = str(val).strip().lower()
    s = s.replace("-", "").replace("_", "").replace(" ", "")
    if s.startswith("up"):
        return "upregulated"
    if s.startswith("down"):
        return "downregulated"
    return None

def sanitize_for_path(s: str) -> str:
    """Replace every run of characters outside [A-Za-z0-9._-] with a single underscore."""
    unsafe_run = re.compile(r"[^A-Za-z0-9._-]+")
    return unsafe_run.sub("_", s)

def _bin_sort_key(x: str):
    """
    Sort key for bin names.

    Bins listed in BIN_ORDER come first, in that order; all other bins
    follow, sorted alphabetically.
    """
    if x in BIN_ORDER:
        return (0, BIN_ORDER.index(x))
    return (1, x)

def _pretty_bin_label(x: str) -> str:
    """Title-case with spaces for nicer plot headers."""
    return str(x).replace("_", " ").title()

def _resolve_y_limits(reg_name: str):
    """
    Look up the Y-axis range configured in Y_LIMITS for `reg_name`.

    Returns a (ymin, ymax) tuple, or None when the axis should autoscale
    (Y_LIMITS is None, has no usable entry for `reg_name`, or is malformed).
    """
    def _is_pair(value) -> bool:
        return isinstance(value, tuple) and len(value) == 2

    # A single tuple applies to every plot.
    if _is_pair(Y_LIMITS):
        return Y_LIMITS
    # A dict is consulted per regulation name.
    if isinstance(Y_LIMITS, dict):
        candidate = Y_LIMITS.get(reg_name)
        if _is_pair(candidate):
            return candidate
    return None

def _hit_annotation(hit) -> str:
    """Return the text to draw for a hit token, or "" when it is missing or empty."""
    # pd.isna covers None, float NaN and pd.NA; without this check a NaN token
    # is truthy and would be drawn as the literal text "nan".
    if pd.isna(hit) or not hit:
        return ""
    return str(hit)


def _group_values(bin_rows: pd.DataFrame, group: str) -> dict:
    """Map proteoform -> (snr, hit_number) for the rows of one group within one bin."""
    rows = bin_rows[bin_rows["group"] == group]
    return {
        name: (float(snr), hit)
        for name, snr, hit in zip(rows["proteoform"], rows["snr"], rows["hit_number"])
    }


def make_mirror_plot(df_sub: pd.DataFrame, exp_name: str, reg_name: str, top_k: int, out_dir: Path):
    """
    Create one mirror plot (group A upwards, group B mirrored downwards) and save it as PNG.

    Args:
        df_sub: Rows already filtered to a single (experiment, regulation).
            Columns read here: "bin", "proteoform", "group", "snr", "hit_number".
        exp_name: Experiment label; used in the title and (sanitized) in the file name.
        reg_name: Regulation label; used in the title, the file name and to look up
            custom Y-limits via _resolve_y_limits.
        top_k: Number of proteoforms kept per bin, ranked by max SNR across groups.
        out_dir: Directory for the PNG; created if it does not exist.

    Returns:
        Path of the saved PNG, or None when there is nothing to plot.

    Styling is read from the module-level FONT_SIZES, SHOW_TITLE and SHOW_LEGEND.
    """
    if df_sub.empty:
        return None

    # Aggregate to max SNR per (bin, proteoform, group) and pick best hit_number token
    agg = (
        df_sub.groupby(["bin", "proteoform", "group"], as_index=False)
              .agg(snr=("snr", "max"), hit_number=("hit_number", pick_best_hit))
    )
    if agg.empty:
        return None

    # Build concatenated layout with TOP-K per bin (categorical)
    bins_sorted = sorted(agg["bin"].unique(), key=_bin_sort_key)
    x_positions, labels, bin_boundaries, bin_centers = [], [], [], []
    A_heights, B_heights, A_hits, B_hits = [], [], [], []

    cursor = 0       # x position of the next bin's first bar
    gap = 1          # empty slots left between consecutive bins
    bar_width = 0.8

    for b in bins_sorted:
        sub = agg[agg["bin"] == b]

        # Proteoforms of this bin ranked by max SNR across groups (desc), cut to top_k
        order = (
            sub.groupby("proteoform")["snr"]
               .max()
               .sort_values(ascending=False)
               .index
               .tolist()
        )[:top_k]

        values_a = _group_values(sub, "A")
        values_b = _group_values(sub, "B")

        for p in order:
            # A proteoform absent from a group gets a zero-height bar and no annotation.
            snr_a, hit_a = values_a.get(p, (0.0, ""))
            A_heights.append(snr_a)
            A_hits.append(hit_a)
            if p in values_b:
                snr_b, hit_b = values_b[p]
                B_heights.append(-snr_b)  # mirror negative
                B_hits.append(hit_b)
            else:
                B_heights.append(0.0)
                B_hits.append("")

        n = len(order)
        x_positions.extend(range(cursor, cursor + n))
        labels.extend(order)
        bin_boundaries.append(cursor - 0.5)
        bin_centers.append(cursor + (n - 1) / 2 if n > 0 else cursor)
        cursor += n + gap

    if not x_positions:
        return None

    # Closing boundary, placed like the others: half a slot before the next bin start.
    bin_boundaries.append(cursor - 0.5)

    # PLOT
    fig_w = max(14, 0.35 * len(x_positions))
    fig, ax = plt.subplots(figsize=(fig_w, 10))
    try:
        ax.bar(x_positions, A_heights, width=bar_width, label="Group A")
        ax.bar(x_positions, B_heights, width=bar_width, label="Group B")

        # Annotation offsets: 2% of the tallest bar on each side (at least 0.02)
        pos_offset = 0.02 * max(1.0, max(A_heights, default=1.0))
        neg_offset = 0.02 * max(1.0, abs(min(B_heights, default=-1.0)))

        for x, a_height, b_height, a_hit, b_hit in zip(
            x_positions, A_heights, B_heights, A_hits, B_hits
        ):
            if a_height:
                ax.text(
                    x, a_height + pos_offset,
                    _hit_annotation(a_hit),
                    ha="center", va="bottom", fontsize=FONT_SIZES["hit_anno"], rotation=90
                )
            if b_height:
                ax.text(
                    x, b_height - neg_offset,
                    _hit_annotation(b_hit),
                    ha="center", va="top", fontsize=FONT_SIZES["hit_anno"], rotation=90
                )

        ax.set_xticks(x_positions)
        ax.set_xticklabels(labels, rotation=90, fontsize=FONT_SIZES["tick"])
        fig.subplots_adjust(bottom=0.30)

        for xb in bin_boundaries:
            ax.axvline(x=xb, linewidth=1)

        # Apply custom Y-limits (if given) before placing the bin headers,
        # because the header height is derived from the final axis range.
        custom_lim = _resolve_y_limits(reg_name)
        if custom_lim is not None:
            ax.set_ylim(custom_lim)

        ymin, ymax = ax.get_ylim()
        header_y = ymax - 0.05 * (ymax - ymin)
        for c, b in zip(bin_centers, bins_sorted):
            ax.text(
                c, header_y, _pretty_bin_label(b),
                ha="center", va="top", fontsize=FONT_SIZES["bin_header"]
            )

        ax.axhline(0, linewidth=1)
        ax.set_ylabel("Signal-to-noise ratio (SNR)", fontsize=FONT_SIZES["axis_label"])

        # Title & legend toggles (a fresh Axes has neither, so "off" needs no action)
        if SHOW_TITLE:
            ax.set_title(
                f"{exp_name} — {reg_name.capitalize()} (Top {top_k} per bin, by SNR)",
                fontsize=FONT_SIZES["title"]
            )
        if SHOW_LEGEND:
            ax.legend(loc="upper right", fontsize=FONT_SIZES["legend"])

        ax.tick_params(axis="y", labelsize=FONT_SIZES["tick"])
        fig.tight_layout()

        out_dir.mkdir(parents=True, exist_ok=True)
        out_name = f"mirror_{sanitize_for_path(exp_name)}_{reg_name}_top{top_k}.png"
        out_path = out_dir / out_name
        fig.savefig(out_path, dpi=150, bbox_inches="tight")
    finally:
        # Always release the figure, even if drawing or saving fails, so that
        # figures do not pile up when this is called in a loop.
        plt.close(fig)
    return out_path

# --------------------
# LOAD & CLEAN
# --------------------
OUT_DIR.mkdir(parents=True, exist_ok=True)

df = pd.read_csv(CSV_IN)

# The proteoform label may arrive under several column names. Renaming two
# source columns to the same target would leave duplicate "proteoform" columns
# (df["proteoform"] would then be a DataFrame and the .str calls below would
# fail), so exactly one source is used: an existing "proteoform" column first,
# then "PFR", then "Identified_proteoform".
if "proteoform" not in df.columns:
    for _source_col in ("PFR", "Identified_proteoform"):
        if _source_col in df.columns:
            df = df.rename(columns={_source_col: "proteoform"})
            break

# Fail early with a readable message instead of a KeyError/AttributeError later on.
_required_cols = ["bin", "proteoform", "snr", "replicate", "regulation", "experiment", "hit_number"]
_missing_cols = [c for c in _required_cols if c not in df.columns]
if _missing_cols:
    raise SystemExit(f"CSV is missing required column(s): {', '.join(_missing_cols)}")

# Ensure SNR numeric (unparseable values become NaN) and normalize regulation
df["snr"] = pd.to_numeric(df["snr"], errors="coerce")
df["regulation_norm"] = df["regulation"].apply(norm_regulation)

# Drop unusable rows BEFORE any astype(str): stringifying turns a missing value
# into the literal text "nan", which dropna can no longer detect.
df = df.dropna(
    subset=["bin", "proteoform", "snr", "replicate", "experiment", "regulation_norm"]
).copy()

# Treat 'bin' as categorical text (e.g., 'soluble_fraction', 'insoluble_fraction')
df["bin"] = df["bin"].astype(str).str.strip().str.lower()

# Column "replicate" is used directly as the group ("A" or "B")
df["group"] = df["replicate"].astype(str).str.upper().str.strip()

# Normalize proteoform labels (drop trailing ".0")
df["proteoform"] = df["proteoform"].astype(str).str.replace(r"\.0$", "", regex=True)

# Normalize experiment labels to strings (for grouping and filenames)
df["experiment"] = df["experiment"].astype(str).str.strip()

# Only groups A and B take part in the mirror plots
df = df[df["group"].isin(["A", "B"])]

# (Optional) restrict to known bins if you want to ignore anything else:
# df = df[df["bin"].isin(BIN_ORDER)]

if df.empty:
    raise SystemExit("No data left after cleaning (check CSV columns and values).")

# --------------------
# LOOP OVER (experiment, regulation)
# --------------------
# Paths of all PNG files written in this run.
outputs = []
for exp_name in sorted(df["experiment"].unique()):
    exp_rows = df[df["experiment"] == exp_name]
    for reg_name in ("upregulated", "downregulated"):
        sub = exp_rows[exp_rows["regulation_norm"] == reg_name].copy()
        if not sub.empty:
            out = make_mirror_plot(sub, exp_name=exp_name, reg_name=reg_name, top_k=TOP_K, out_dir=OUT_DIR)
            # make_mirror_plot returns None when it had nothing to draw.
            if out is None:
                continue
            print(f"Saved → {out}")
            outputs.append(str(out))

if len(outputs) == 0:
    print("No plots were generated (no matching data for the requested (experiment, regulation) subsets).")


Saved → F:\binary\final\mirror_1_upregulated_top5.png
Saved → F:\binary\final\mirror_1_downregulated_top5.png
