In [None]:
import glob
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from adjustText import adjust_text
from matplotlib.ticker import MaxNLocator
from scipy.stats import pearsonr, spearmanr

In [30]:
# Concatenate the normalized bin summaries of all genera (this also works with a
# single genus, e.g. only Actinokineospora; the
# 'df_bins_normalized_plot_Actinokineospora.csv' file is generated in notebook 4).
folder = "/results/normalized_bin_summary/"
out_path = os.path.join(folder, "normalized_bin_summary_merged.csv")

# Find and sort all per-genus CSVs in the folder. out_path lives in the same
# folder, so it is excluded explicitly: otherwise a merged table written there
# would be picked up by the "*.csv" pattern on the next run and read back as if
# it were one more genus, duplicating every row.
merged_name = os.path.basename(out_path)
files = sorted(
    fp for fp in glob.glob(os.path.join(folder, "*.csv"))
    if os.path.basename(fp) != merged_name
)
if not files:
    raise FileNotFoundError(f"No CSV files found in {folder}")

def genus_from_filename(path: str) -> str:
    """Return the genus encoded as the last "_"-separated token of a file name.

    The directory part and the final extension of ``path`` are discarded first.
    If the remaining stem contains no underscore, the whole stem is returned.
    """
    stem, _ext = os.path.splitext(os.path.basename(path))
    # rpartition yields ('', '', stem) when there is no "_", so index 2 is
    # always "everything after the last underscore (or the whole stem)".
    return stem.rpartition("_")[2]

# Load every CSV, tag its rows with the genus parsed from the file name, and
# stack all tables into one frame with a fresh 0..n-1 index.
dfs = [pd.read_csv(fp).assign(Genus=genus_from_filename(fp)) for fp in files]
merged_df = pd.concat(dfs, ignore_index=True)

In [31]:
# Directory for figure output.
# NOTE(review): the next cell reassigns output_dir to "/results/figures/" before
# anything is written, so this value appears to be unused — confirm which path
# is the intended one.
output_dir = "/results/normalized_bin_summary/figures/"

In [37]:
# ============================
# Inputs
# ============================
# merged_df is expected to provide the columns
# ['Codon','0','1','2','3','4','5','6','7','8','9','Normalized_Rarity','Genus']
# (swap in another variable below if the merged table is named differently).

output_dir = "/results/figures/"
os.makedirs(output_dir, exist_ok=True)

# Plot colour (hex code) for each genus
color_codes = dict(
    Streptomyces="#1f77b4",
    Nocardia="#ff7f0e",
    Rhodococcus="#2ca02c",
    Amycolatopsis="#d62728",
    Actinokineospora="#9467bd",
    Micromonospora="#8c564b",
    Kitasatospora="#e377c2",
    Pseudonocardia="#7f7f7f",
    Kribbella="#bcbd22",
    Saccharopolyspora="#17becf",
    Saccharothrix="#9edae5",
)

# ============================
# Prep
# ============================
# Bin columns are the ones whose label consists only of digits; keep them as
# strings and order them numerically (so "10" would sort after "9").
bin_cols = sorted(
    (str(col) for col in merged_df.columns if str(col).isdigit()),
    key=int,
)

# Work on a copy: coerce the rarity and bin columns to numbers (unparseable
# entries become NaN), then drop rows without a usable rarity value.
numeric_cols = ["Normalized_Rarity", *bin_cols]
df = (
    merged_df
    .assign(**{col: pd.to_numeric(merged_df[col], errors="coerce") for col in numeric_cols})
    .dropna(subset=["Normalized_Rarity"])
    .reset_index(drop=True)
)

# Minimum number of points required before correlations/regression are computed
MIN_N_FOR_STATS = 3
# Appearance of the codon text labels
LABEL_FONTSIZE = 6
LABEL_ALPHA = 0.9

# Fixed x-axis for all panels/figures
X_MIN_FIXED = 0.0
X_MAX_FIXED = 1.1

def make_fixed_axes(n_panels: int = 11, n_cols: int = 3):
    """Create a figure with a grid of shared-axis panels.

    The grid has ``n_cols`` columns and as many rows as needed to hold
    ``n_panels`` panels. Panels are filled row by row; the leftover slots at the
    end of the last row are hidden. With the defaults this is a 4×3 grid in
    which the first 11 axes are used and the bottom-right one is hidden.

    Parameters
    ----------
    n_panels : int, default 11
        Number of panels that will be drawn into.
    n_cols : int, default 3
        Number of columns in the grid.

    Returns
    -------
    fig : matplotlib.figure.Figure
    axes2d : numpy.ndarray
        2-D array (rows × columns) of every Axes in the grid, hidden ones
        included.
    axes_to_use : list
        The first ``n_panels`` Axes in row-major order.

    Raises
    ------
    ValueError
        If ``n_panels`` or ``n_cols`` is smaller than 1.
    """
    if n_panels < 1 or n_cols < 1:
        raise ValueError("n_panels and n_cols must both be >= 1")

    n_rows = -(-n_panels // n_cols)  # ceiling division
    fig, axes2d = plt.subplots(
        n_rows, n_cols,
        figsize=(4.2*n_cols, 3.6*n_rows),
        dpi=300, sharex=True, sharey=True,
        constrained_layout=True,
        squeeze=False,  # always get a 2-D array, even for a single row/column
    )
    flat_axes = list(axes2d.ravel())
    axes_to_use = flat_axes[:n_panels]
    for spare_ax in flat_axes[n_panels:]:
        spare_ax.set_visible(False)  # hide unused slot
    return fig, axes2d, axes_to_use

# ============================
# Per-bin facet figures + correlation table
# ============================
# For every bin column, draw one figure with one panel per genus: a scatter of
# Normalized_Rarity against the bin's value, a dashed least-squares line, the
# Pearson/Spearman statistics in the panel title, and text labels on the codons
# with the lowest and highest rarity. The statistics of all bins are collected
# and written to a single CSV at the end.

# adjust_text is imported unconditionally in the imports cell, but the flag that
# guards its use is not assigned anywhere in the notebook as shown, so a fresh
# kernel fails with a NameError. Derive the flag from whether the function is
# really in scope.
_HAS_ADJUSTTEXT = callable(globals().get("adjust_text"))

# Number of codons labelled at each end of the rarity range
N_EXTREME_LABELS = 5
# Second title line used when no statistics could be computed
NA_TITLE_SUFFIX = "$r$=NA, $p$=NA | $\\rho$=NA, $p$=NA"


def _empty_stats(bin_col, genus):
    """Return the stats record for a (bin, genus) pair with every statistic NaN."""
    return {
        "Bin": bin_col, "Genus": genus,
        "pearson_r": np.nan, "pearson_p": np.nan,
        "spearman_r": np.nan, "spearman_p": np.nan,
        "slope": np.nan, "intercept": np.nan,
    }


def _label_extreme_codons(ax, x, y, codons, context):
    """Label the N_EXTREME_LABELS lowest-x and highest-x points with their codon.

    ``context`` is a short description of the panel that is only used in the
    warning emitted when the label de-overlapping step fails.
    """
    order = np.argsort(x)
    # most common = smallest x; rarest = largest x. np.unique removes the
    # overlap that occurs when a panel has fewer than 2*N_EXTREME_LABELS points.
    label_idx = np.unique(np.concatenate([order[:N_EXTREME_LABELS],
                                          order[-N_EXTREME_LABELS:]]))
    texts = [
        ax.text(x[i], y[i], codons[i],
                fontsize=LABEL_FONTSIZE, alpha=LABEL_ALPHA,
                ha="center", va="center", zorder=3)
        for i in label_idx
    ]
    if not (_HAS_ADJUSTTEXT and texts):
        return
    try:
        adjust_text(
            texts, ax=ax,
            only_move={'points': 'y', 'text': 'xy'},
            arrowprops=dict(arrowstyle='-', lw=0.3, color='gray', alpha=0.6),
            expand_points=(1.05, 1.15),
            expand_text=(1.05, 1.15),
        )
    except Exception as exc:
        # Best effort: the labels stay where they were placed, but the failure
        # is reported instead of being swallowed silently.
        warnings.warn(f"adjust_text failed for {context}; labels left unadjusted: {exc!r}",
                      stacklevel=2)


all_stats = []

for bin_col in bin_cols:
    # Order genera by number of usable rows (largest first)
    genus_groups = [(g, gdf.dropna(subset=["Normalized_Rarity", bin_col]))
                    for g, gdf in df.groupby("Genus")]
    genus_groups.sort(key=lambda t: -len(t[1]))

    # Shared y-limits based on all data for this bin
    y_all = pd.to_numeric(df[bin_col], errors="coerce")
    y_min, y_max = y_all.min(), y_all.max()
    # set_ylim rejects NaN/Inf, which is what min()/max() give for an all-NaN bin
    has_y_limits = bool(np.isfinite(y_min) and np.isfinite(y_max))

    fig, axes2d, axes_to_use = make_fixed_axes()
    per_bin_stats = []

    # zip() below stops at the shorter sequence, so genera beyond the number of
    # panels get neither a plot nor a stats row; make that visible.
    if len(genus_groups) > len(axes_to_use):
        dropped = [g for g, _sub in genus_groups[len(axes_to_use):]]
        warnings.warn(
            f"Bin {bin_col}: only {len(axes_to_use)} panels available; "
            f"skipping genera: {', '.join(map(str, dropped))}"
        )

    for ax, (genus, sub) in zip(axes_to_use, genus_groups):
        ax.set_xlim(X_MIN_FIXED, X_MAX_FIXED)
        if has_y_limits:
            ax.set_ylim(y_min, y_max)

        # Defaults for panels without (enough) data; overwritten below when the
        # statistics can be computed.
        record = _empty_stats(bin_col, genus)
        title = f"{genus}\n{NA_TITLE_SUFFIX}"

        if len(sub) > 0:
            x = sub["Normalized_Rarity"].to_numpy()
            y = sub[bin_col].to_numpy()
            codons = sub["Codon"].astype(str).to_numpy()
            color = color_codes.get(genus, None)

            # Scatter
            ax.scatter(x, y, s=12, alpha=0.6, c=color, edgecolors="none", zorder=2)

            # Stats and regression; correlations are undefined for constant
            # x or y, hence the standard-deviation checks.
            if len(x) >= MIN_N_FOR_STATS and np.std(x) > 0 and np.std(y) > 0:
                pr, pp = pearsonr(x, y)
                sr, sp = spearmanr(x, y)

                a, b = np.polyfit(x, y, 1)
                xx = np.linspace(X_MIN_FIXED, X_MAX_FIXED, 100)
                ax.plot(xx, a*xx + b, linestyle="--", linewidth=1, c=color, zorder=1)

                title = f"{genus}\n$r$={pr:.2f}, $p$={pp:.1e} | $\\rho$={sr:.2f}, $p$={sp:.1e}"
                record.update({
                    "pearson_r": pr, "pearson_p": pp,
                    "spearman_r": sr, "spearman_p": sp,
                    "slope": a, "intercept": b,
                })

            _label_extreme_codons(ax, x, y, codons, context=f"{genus}, bin {bin_col}")

        per_bin_stats.append(record)

        # Cosmetics
        ax.set_title(title, fontsize=9)
        ax.grid(True, alpha=0.3)
        ax.xaxis.set_major_locator(MaxNLocator(nbins=5))
        ax.yaxis.set_major_locator(MaxNLocator(nbins=5))

    # Hide any unused axes if there are fewer genera than panels
    num_plotted = min(len(genus_groups), len(axes_to_use))
    for ax in axes_to_use[num_plotted:]:
        ax.set_visible(False)

    # Shared labels
    fig.supxlabel("Normalized rarity", fontsize=12)
    fig.supylabel(f"Proportion in bin {bin_col}", fontsize=12)

    out_png = os.path.join(output_dir, f"bin_{bin_col}_facets_by_genus_select.png")
    fig.savefig(out_png, dpi=350, bbox_inches="tight")
    plt.close(fig)  # free the figure; one is created per bin

    all_stats.extend(per_bin_stats)

# Save per-bin/per-genus stats
stats_df = pd.DataFrame(all_stats)
stats_csv = os.path.join(output_dir, "per_bin_per_genus_correlations.csv")
stats_df.to_csv(stats_csv, index=False)

print(f"Saved {len(bin_cols)} figures to: {output_dir}")
print(f"Saved correlation table to: {stats_csv}")

Saved 10 figures to: /Users/annasve/Desktop/data/ncbi_genomes/analysis/additional_figures/bin_facets_by_genus_select10
Saved correlation table to: /Users/annasve/Desktop/data/ncbi_genomes/analysis/additional_figures/bin_facets_by_genus_select10/per_bin_per_genus_correlations.csv
