In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from pathlib import Path
import polars as pl

In [None]:
# Prerequisite: the data archives (either every *_compressed.tar.zst file or the
# single all_groups_lean.tar.zst file) have already been decompressed.
CWD = Path(".").resolve()  # absolute working directory; the figure is written here
BASE = CWD.parent  # one level up; the per-group data folders are expected here

print(f"Folder where groups (data) folder should be: {BASE}")
print(f"Current working directory (where figure will be saved): {CWD}")

In [None]:
# Group identifiers (taxon + organelle suffix: "_mit" or "_plt").
# This list order is also the panel order of the boxplot grid.
groups = [
    "fungi_mit",
    "metazoans_mit",
    "plants_mit",
    "plants_plt",
    "protists_mit",
    "protists_plt",
    "green_algae_mit",
    "green_algae_plt",
]

# Violin plot order: every mitochondrial group first, then every plastid group,
# each keeping its relative order from `groups`.
violin_order = [name for name in groups if name.endswith("_mit")] + [
    name for name in groups if name.endswith("_plt")
]

# Collapse the four strand-polarity codes into two bins.
polarity_map = {
    **dict.fromkeys(("++", "--"), "same"),
    **dict.fromkeys(("+-", "-+"), "opposite"),
}

# --- Scatter (jittered point) appearance ---
SCATTER_ALPHA = 0.35
SCATTER_SIZE = 10
JITTER_SD = 0.06  # standard deviation of the horizontal jitter

# --- Paired-line appearance ---
# Upper bound on paired lines drawn per group (300–800 recommended; 0 disables them).
MAX_PAIRED_LINES_PER_GROUP = 600
# Lower the alpha for very large N (e.g., 0.08–0.12 for metazoans_mit).
LINE_ALPHA = 0.12
LINE_WIDTH = 0.7

# Single seeded generator shared by all random draws, for reproducibility.
RNG = np.random.default_rng(seed=42)

# Human-readable subplot titles, keyed by group identifier.
boxplot_titles = dict(
    fungi_mit="Fungi (mitochondria)",
    metazoans_mit="Metazoans (mitochondria)",
    plants_mit="Plants (mitochondria)",
    protists_mit="Protists (mitochondria)",
    green_algae_mit="Green algae (mitochondria)",
    plants_plt="Plants (plastid)",
    protists_plt="Protists (plastid)",
    green_algae_plt="Green algae (plastid)",
)

In [None]:
# Load and summarize all groups.
#
# For every group this builds:
#   wide            – one row per genome (AN) with the per-genome median of
#                     log10(Length) in the "same" and "opposite" polarity bins
#                     (null where the genome has no IGR in that bin)
#   wide_paired     – the subset of `wide` with a finite median in BOTH bins
#   delta_med_log10 – opposite − same, computed on `wide_paired` only
#   n               – number of paired genomes (= len(delta_med_log10))
group_data = {}
for g in groups:
    df = pl.read_csv(BASE / g / "summary_igs_intergenic.tsv", separator="\t").select(
        ["AN", "Polarity", "Length"]
    )

    # Polarity codes missing from polarity_map become null here ...
    df = df.with_columns(
        pl.col("Polarity")
        .replace_strict(polarity_map, default=None)
        .alias("polarity_bin")
    )

    # ... and are dropped before aggregation so they cannot surface as an
    # extra column in the pivot below.
    med = (
        df.filter(pl.col("polarity_bin").is_not_null())
        .group_by(["AN", "polarity_bin"])
        .agg(pl.col("Length").log10().median().alias("median_log10_length"))
    )

    wide = med.pivot(index="AN", on="polarity_bin", values="median_log10_length")

    # A group with no rows at all in one bin would otherwise lack that column.
    absent_bins = [b for b in ("same", "opposite") if b not in wide.columns]
    if absent_bins:
        wide = wide.with_columns(
            [pl.lit(None, dtype=pl.Float64).alias(b) for b in absent_bins]
        )

    # group_by does not guarantee row order; sort so that every RNG-driven step
    # downstream (jitter, paired-line sampling) is reproducible across runs.
    wide = wide.sort("AN")

    # Paired genomes only: a null (bin absent) or non-finite median cannot
    # yield a usable Δ. Nulls propagate through is_finite() and are filtered out.
    wide_paired = wide.filter(
        pl.col("same").is_finite() & pl.col("opposite").is_finite()
    )
    delta = wide_paired["opposite"].to_numpy() - wide_paired["same"].to_numpy()

    group_data[g] = {
        "group": g,
        "wide": wide,
        "wide_paired": wide_paired,
        "delta_med_log10": delta,
        "n": len(delta),
    }

# For Panel A
delta_violin_data = [group_data[g]["delta_med_log10"] for g in violin_order]
n_by_group = [group_data[g]["n"] for g in violin_order]

In [None]:
# Composite figure geometry (all sizes in inches): Panel A on top, Panel B below.
dpi = 300
width = 13
violin_height = width / 2  # Panel A keeps a 2:1 width-to-height ratio -> 6.5
boxplot_height = 17  # Panel B; shared by its 4 subplot rows (nominally 4.25 each)
total_height = violin_height + boxplot_height

fig = plt.figure(dpi=dpi, figsize=(width, total_height))

# Two stacked slots whose heights are proportional to the panel heights above.
gs = GridSpec(
    2,
    1,
    figure=fig,
    hspace=0.25,
    height_ratios=[violin_height, boxplot_height],
)

In [None]:
# ==========
# Panel A: Δ violin plot
# ==========
axA = fig.add_subplot(gs[0, 0])

positions = np.arange(1, len(violin_order) + 1)

# Plot finite Δ values only: NaN/inf entries (e.g. a genome that lacks one
# polarity bin) would otherwise poison the violin and box statistics.
# The n shown on the x axis is counted from exactly what is drawn.
plot_data = [np.asarray(d, dtype=float) for d in delta_violin_data]
plot_data = [d[np.isfinite(d)] for d in plot_data]
plot_n = [d.size for d in plot_data]

vp = axA.violinplot(
    plot_data,
    positions=positions,
    widths=0.85,
    showmeans=False,
    showmedians=True,
    showextrema=False,
)
for body in vp["bodies"]:
    body.set_alpha(0.5)

# Narrow boxplot overlaid on each violin
axA.boxplot(
    plot_data,
    positions=positions,
    widths=0.25,
    showfliers=False,
    medianprops=dict(linewidth=2, color="black"),
)
axA.axhline(0, linewidth=1)  # Δ = 0 reference line

axA.set_xticks(positions)
axA.set_xticklabels(
    [
        g.replace("_mit", "").replace("_plt", "").replace("_", " ").capitalize()
        + f"\n(n={n})"
        for g, n in zip(violin_order, plot_n)
    ],
    rotation=0,
)
axA.set_ylabel(
    r"$\Delta = \mathrm{med}(\log_{10}(\mathrm{opposite})) - \mathrm{med}(\log_{10}(\mathrm{same}))$"
)
axA.set_title("Genome-level Δ IGR length by taxonomic group and organelle type")
axA.grid(axis="y", alpha=0.2)

# Add secondary grouping labels.
# The split is derived from the "_mit" suffix in violin_order instead of a
# hard-coded index, so it follows any change to that list (this assumes the
# mitochondrial groups are listed contiguously before the plastid ones).
is_mit = np.array([g.endswith("_mit") for g in violin_order], dtype=bool)
mit_positions = positions[is_mit]
plt_positions = positions[~is_mit]

for label, pos in (("Mitochondria", mit_positions), ("Plastid", plt_positions)):
    if pos.size == 0:
        continue
    axA.text(
        np.mean(pos),
        -0.15,  # axes-fraction y: just below the tick labels
        label,
        ha="center",
        va="top",
        fontweight="bold",
        fontsize=12,
        transform=axA.get_xaxis_transform(),
    )

# Add subtle separating line halfway between the last mitochondrial and the
# first plastid position
if mit_positions.size and plt_positions.size:
    axA.axvline(
        (mit_positions.max() + plt_positions.min()) / 2,
        color="gray",
        linestyle="--",
        alpha=0.3,
        linewidth=1.5,
    )

# Panel label A (figure-fraction coordinates)
fig.text(0.125, 0.895, "A", fontsize=16, fontweight="bold", va="top")

In [None]:
# ==========
# Panel B: grid of boxplots (2 columns; 4 rows for the 8 groups)
# ==========
def _finite_column(frame, column):
    """Return the finite values of `frame[column]` as a NumPy array.

    Nulls, NaN and ±inf are dropped; a missing column yields an empty array.
    """
    if column not in frame.columns:
        return np.array([])
    values = frame[column].drop_nulls().to_numpy()
    return values[np.isfinite(values)]


n_cols = 2
n_rows = -(-len(groups) // n_cols)  # ceiling division
subgs = gs[1, 0].subgridspec(n_rows, n_cols, hspace=0.35, wspace=0.25)

for idx, g in enumerate(groups):
    r, c = divmod(idx, n_cols)
    ax = fig.add_subplot(subgs[r, c])

    wide = group_data[g]["wide"]

    # Each box uses every genome that has a value in that bin (unpaired)
    same_vals = _finite_column(wide, "same")
    opp_vals = _finite_column(wide, "opposite")

    # Boxplots
    ax.boxplot(
        [same_vals, opp_vals],
        tick_labels=["same", "opposite"],
        showfliers=False,
        medianprops=dict(linewidth=2, color="black"),
    )

    # Jittered points
    for i, arr in enumerate([same_vals, opp_vals], start=1):
        if arr.size == 0:
            continue
        x = RNG.normal(i, JITTER_SD, size=len(arr))
        ax.scatter(x, arr, s=SCATTER_SIZE, alpha=SCATTER_ALPHA)

    # Downsampled paired lines. Only genomes with a finite value in BOTH bins
    # are candidates, so the per-group line budget is never spent on rows
    # that cannot be drawn.
    if MAX_PAIRED_LINES_PER_GROUP > 0 and {"same", "opposite"} <= set(wide.columns):
        pairs = np.column_stack(
            [wide["same"].to_numpy(), wide["opposite"].to_numpy()]
        ).astype(float)
        pairs = pairs[np.isfinite(pairs).all(axis=1)]

        n_pairs = pairs.shape[0]
        if n_pairs > 0:
            n_draw = min(MAX_PAIRED_LINES_PER_GROUP, n_pairs)
            sel = RNG.choice(n_pairs, size=n_draw, replace=False)

            # y has shape (2, n_draw): one call draws one line per column
            ax.plot(
                [1, 2],
                pairs[sel].T,
                alpha=LINE_ALPHA,
                linewidth=LINE_WIDTH,
                zorder=0,
                color="gray",
            )

    ax.set_title(f"{boxplot_titles[g]}", fontsize=10)
    ax.set_ylabel(r"$\mathrm{median}(\log_{10}(\mathrm{IGR\ length}))$")

    # Reduce clutter: only the bottom row carries the x-axis label
    if r < n_rows - 1:
        ax.set_xlabel("")
    else:
        ax.set_xlabel("Polarity")

# Panel label B (figure-fraction coordinates)
fig.text(0.125, 0.62, "B", fontsize=16, fontweight="bold", va="top")

In [None]:
# Write the composite figure next to the notebook (TIFF chosen by the file suffix).
out_path = CWD / "figure1.tiff"
# Reuse the `dpi` set when the figure was created, so the saved resolution
# cannot drift from the layout settings.
fig.savefig(out_path, dpi=dpi, bbox_inches="tight")
print(f"Saved composite figure to: {out_path}")