# Analysis

This notebook includes code for analyzing and plotting processed library activity measurements.

In [None]:
import os

# Size the polars thread pool from the SLURM allocation when there is one.
# Outside a SLURM job (e.g. a local Jupyter session) SLURM_CPUS_PER_TASK is not
# set, so fall back to the number of CPUs Python can see instead of raising KeyError.
# This is done here, ahead of the `import polars` in the next cell.
available_threads = os.environ.get("SLURM_CPUS_PER_TASK") or str(os.cpu_count() or 1)
os.environ["POLARS_MAX_THREADS"] = available_threads
print(f"{available_threads} thread(s) available")

# File extension (and therefore format) used by every `figure.savefig` call below.
FIGURE_EXPORT_TYPE = "svg"

In [None]:
from IPython.display import display, Markdown, HTML

%matplotlib inline
import matplotlib_inline.backend_inline
matplotlib_inline.backend_inline.set_matplotlib_formats("jpeg")

import matplotlib
import matplotlib.pyplot as pyplot
import seaborn
from matplotlib_venn import venn2
from statannotations.Annotator import Annotator

import numpy
import polars
import pandas
import scipy
import statsmodels.api as statsmods

import json

# Turn on polars' global string cache; presumably needed so the Categorical
# columns read from separate files below can be joined/compared — TODO confirm.
polars.toggle_string_cache(True)
# Limit printed/displayed tables to 10 rows; the result is bound to `_` so the
# cell does not echo it.
_ = polars.Config.set_tbl_rows(10)

In [None]:
"""
Style Matplotlib plots
"""

# rcParams overrides applied to every figure in this notebook via `pyplot.style.use`.
plot_style = {
    # Fonts
    "font.size": 12,
    "font.family": "sans-serif",
    "font.sans-serif": ["Inter"],
    # Figure defaults (individual cells usually pass their own `figsize`)
    "figure.figsize": [2, 2],
    "figure.dpi": 150,
    "savefig.dpi": 300,
    "figure.facecolor": (1, 1, 1, 0),  # RGBA with alpha 0, i.e. transparent background
    "text.usetex": False,
    "lines.markersize": 3,
    # Axes: bold titles, no top/right spines, no background fill
    "axes.titleweight": "bold",
    "axes.labelweight": 600,
    "axes.labelsize": 9,
    "axes.facecolor": "none",
    "axes.spines.top": False,
    "axes.spines.right": False,
    "axes.linewidth": 1.5,
    "grid.color": "#303030",
    "grid.alpha": 0.1,
    # Ticks: major ticks only
    "xtick.labelsize": 9,
    "xtick.major.width": 1.5,
    "xtick.minor.width": 1,
    "xtick.minor.visible": False,
    "ytick.labelsize": 9,
    "ytick.major.width": 1.5,
    "ytick.minor.width": 1,
    "ytick.minor.visible": False,
    # Frames and legends
    "figure.frameon": False,
    "legend.frameon": False,
    "legend.fancybox": False,
    "legend.fontsize": 9,
    "legend.scatterpoints": 1,
    "legend.markerscale": 1,
    "legend.handlelength": 1.0,
    "legend.handletextpad": 0.2,
    "axes.prop_cycle": matplotlib.cycler(color=["k", "b", "g", "r", "c", "y"]),
    "hatch.linewidth": 4,
}

pyplot.style.use(plot_style)
# Switch off interactive mode: figures in this notebook are shown explicitly with
# `display(figure)` and then closed.
_ = pyplot.ioff()

# Two-colour linear colormaps starting from white ...
cm_white_to_gray = matplotlib.colors.LinearSegmentedColormap.from_list("white_to_gray", ["#ffffff", "#404040"])
cm_white_to_red = matplotlib.colors.LinearSegmentedColormap.from_list("white_to_red", ["#ffffff", "#ff0000"])
cm_white_to_blue = matplotlib.colors.LinearSegmentedColormap.from_list("white_to_blue", ["#ffffff", "#0000ff"])
cm_white_to_green = matplotlib.colors.LinearSegmentedColormap.from_list("white_to_green", ["#ffffff", "#009000"])

# ... and variants starting from a light tint of the end colour instead of pure white.
cm_light_to_gray = matplotlib.colors.LinearSegmentedColormap.from_list("light_to_gray", ["#e0e0e0", "#404040"])
cm_light_to_red = matplotlib.colors.LinearSegmentedColormap.from_list("light_to_red", ["#ffe0e0", "#ff0000"])
cm_light_to_blue = matplotlib.colors.LinearSegmentedColormap.from_list("light_to_blue", ["#e0e0ff", "#0000ff"])
cm_light_to_green = matplotlib.colors.LinearSegmentedColormap.from_list("light_to_green", ["#e0ffe0", "#009000"])

# Keyword arguments unpacked into `seaborn.boxplot(...)` in a later cell so that
# every box element (whiskers, box edge, fliers, median, caps) is drawn in black.
boxplot_style_args = {
    "whiskerprops": {
        "color": "#000000"
    },
    "boxprops": {
        "edgecolor": "#000000",
    },
    "flierprops": {
        "marker": "o",
        "markerfacecolor": "#000000",
        "markersize": 1,
        "markeredgecolor": None
    },
    "medianprops": {
        "color": "#000000"
    },
    "capprops": {
        "color": "#000000"
    }
}

In [None]:
"""
Common plotting functions
"""

# Adapted from https://stackoverflow.com/a/53865762

def calc_weights_for_scatter(x, y, bins=20):
    """Estimate a local point density for every (x, y) pair.

    The points are counted on a 2D histogram and the counts are interpolated
    back onto the individual points, giving one weight per point that can be
    used to colour a scatterplot by density.

    Parameters:
        x, y: coordinates of the points; both must be exactly `numpy.ndarray`.
        bins: forwarded unchanged to `numpy.histogram2d`.

    Returns:
        A `(weights, normalizer)` tuple: the per-point weights (NaNs from the
        interpolation replaced by 0.0) and a `matplotlib.colors.Normalize`
        spanning the minimum and maximum weight.

    Raises:
        ValueError: if `x` or `y` is not a `numpy.ndarray`.
    """
    if not (type(x) is numpy.ndarray and type(y) is numpy.ndarray):
        raise ValueError("x and y must be `numpy.ndarray`s")

    counts, x_edges, y_edges = numpy.histogram2d(x, y, bins=bins, density=False)

    # Interpolate on the bin centres (midpoints of consecutive edges).
    x_centers = 0.5*(x_edges[1:] + x_edges[:-1])
    y_centers = 0.5*(y_edges[1:] + y_edges[:-1])
    query_points = numpy.vstack([x, y]).T

    weights = scipy.interpolate.interpn(
        points=(x_centers, y_centers),
        values=counts,
        xi=query_points,
        method="splinef2d",
        bounds_error=False,
    )
    # With bounds_error=False, points that cannot be interpolated come back as NaN.
    weights[numpy.isnan(weights)] = 0.0

    return weights, matplotlib.colors.Normalize(vmin=numpy.min(weights), vmax=numpy.max(weights))

def reorder_points_and_weights(x, y, w, f = None):
    """Sort points by ascending weight and package them for `Axes.scatter`.

    Parameters:
        x, y: point coordinates; must be exactly `numpy.ndarray`.
        w: one weight per point; must be exactly `numpy.ndarray`.
        f: optional callable applied to `x`, `y` and `w` (in that order)
            before sorting.

    Returns:
        A dict with keys "x", "y" and "c" holding the (optionally transformed)
        arrays reordered by ascending weight, so it can be unpacked straight
        into `scatter(**result)`.

    Raises:
        ValueError: if any of `x`, `y`, `w` is not a `numpy.ndarray`.
    """
    for candidate in (x, y, w):
        if type(candidate) is not numpy.ndarray:
            raise ValueError("x, y, and w must be `numpy.ndarray`s")

    if f is None:
        xs, ys, ws = x, y, w
    else:
        xs, ys, ws = f(x), f(y), f(w)

    # Ascending weight order: the heaviest points come last in the arrays.
    ranking = numpy.argsort(ws)
    return dict(x=xs[ranking], y=ys[ranking], c=ws[ranking])

# Maps the short identifiers used in the data tables to the labels shown on
# figures. Besides the genotype codes it also carries the display name for the
# "chip_peak_id" column, because the whole table is passed to `DataFrame.rename`
# in later cells.
NAME_TRANSLATION_TABLE = dict(
    WT="+/+",
    Rhet="R90W/+",
    Rhom="R90W/R90W",
    Ehet="E168d2/+",
    Ehom="E168d2/E168d2",
    CrxKO="-/-",
    chip_peak_id="CRE",
)

def translate_genotype_to_label(genotype):
    """Return the figure label for a genotype code.

    Raises:
        KeyError: if `genotype` is not a key of `NAME_TRANSLATION_TABLE`.
    """
    label = NAME_TRANSLATION_TABLE[genotype]
    return label

In [None]:
"""
Load library metadata
"""

# Per-library-member metadata; `variant_type` is read as a Categorical column.
library_metadata = polars.read_csv(
    "Library_Details/library_metadata.tsv",
    sep="\t",
    dtypes={"variant_type": polars.Categorical},
)

# Per-ChIP-peak metadata; the two annotation columns are read as Categoricals.
chip_peak_metadata = polars.read_csv(
    "Library_Details/chip_peak_metadata.tsv",
    sep="\t",
    dtypes=dict(
        atac_crx_dependent=polars.Categorical,
        epigenome_group=polars.Categorical,
    ),
)

# Show each table under its own heading.
for heading, frame in (
    ("### Library Metadata", library_metadata),
    ("### ChIP Peak Metadata", chip_peak_metadata),
):
    display(Markdown(heading))
    display(frame)

In [None]:
"""
Read in processed counts, and join with library metadata
"""

# Read the processed counts and annotate every row with its library-member
# metadata (matched on `library_id`) and then its ChIP peak metadata (matched on
# `chip_peak_id`). Left joins keep rows that have no metadata match.
processed_samples_counts = (
    polars.read_parquet("MPRA/processed_library_counts.parquet")
    .join(library_metadata, on="library_id", how="left")
    .join(chip_peak_metadata, on="chip_peak_id", how="left")
)

display(processed_samples_counts)

---

# GLOBAL PATTERNS OF REPORTER EXPRESSION ACROSS GENOTYPES

---

In [None]:
# For every library, and separately for the "wildtype" and "allCrxSites" variants,
# draw a 2x5 grid of log-log scatterplots comparing mean CRE activity between
# pairs of genotypes. Each panel also shows the scrambled-sequence controls, a
# robust fit through them, the basal construct, and a hatched "out-of-bounds" area.
for library_name, library_df in processed_samples_counts.sort(["library"]).groupby("library", maintain_order=True):
    display(Markdown(f"## [{library_name}] Scatterplot correlation of CRE activity"))
    
    for variant_type, cmap, base_color in [("wildtype", cm_light_to_blue, "#0000ff"), ("allCrxSites", cm_light_to_red, "#ff0000")]:
    
        # One row per library member, one column per genotype. 2e-5 is added to every
        # activity before pivoting, presumably so zero activities stay finite under log10.
        data = library_df.filter(polars.col("variant_type") == variant_type).with_columns([
            polars.col("activity_mean") + 2e-5
        ]).sort(["genotype", "library_id"]).pivot(values="activity_mean", index="library_id", columns="genotype", maintain_order=True).to_pandas()

        # Same table for the scrambled controls.
        # NOTE(review): this does not depend on `variant_type`, so it is recomputed
        # identically on each pass of this loop.
        scrambled_data = library_df.filter(polars.col("variant_type") == "scrambled").with_columns([
            polars.col("activity_mean") + 2e-5
        ]).sort(["genotype", "library_id"]).pivot(values="activity_mean", index="library_id", columns="genotype", maintain_order=True).to_pandas()
        
        # (x-axis genotype, y-axis genotype) for each panel, filled row-major.
        genotype_comparisons = [
            ("WT", "Rhet"),
            ("WT", "Ehet"),
            ("WT", "Rhom"),
            ("WT", "Ehom"),
            ("WT", "CrxKO"),
            ("Rhet", "Rhom"),
            ("Ehet", "Ehom"),
            ("Rhet", "Ehet"),
            ("Rhom", "Ehom")
        ]

        figure, axes = pyplot.subplots(nrows=2, ncols=5, sharex=False, sharey=False, figsize=(15, 6), layout="compressed")
        # Axis limits shared by x and y in every panel.
        bounds = (1e-5, 1e3)
        
        for index, ((genotype1, genotype2), axis) in enumerate(zip(genotype_comparisons, axes.flat)):

            # Define grid of bins for evaluating heatmap and lines
            bins = numpy.geomspace(start=bounds[0], stop=bounds[1], num=50)

            x = data[genotype1].to_numpy()
            y = data[genotype2].to_numpy()
            
            # Compute robust regression of scrambled sequences
            # (log10(y) regressed on log10(x) with an intercept column added first).
            scrambled_x = scrambled_data[genotype1].to_numpy()
            scrambled_y = scrambled_data[genotype2].to_numpy()
            log_scrambled_x_with_intercept = statsmods.add_constant(numpy.log10(scrambled_x))

            rlm_results = statsmods.RLM(numpy.log10(scrambled_y), log_scrambled_x_with_intercept, M=statsmods.robust.norms.TrimmedMean(), missing="raise").fit()
            # NOTE(review): only `slope` is used below (in the legend label); `intercept` is unused.
            intercept, slope = rlm_results.params
            # Fitted line, evaluated on the bin grid and converted back from log10 space.
            axis.plot(bins, numpy.power(10, rlm_results.predict(statsmods.add_constant(numpy.log10(bins)), transform=False)), color="#00000070", linestyle=(0, (5, 10)), linewidth=0.5, marker=None, label=f"m={slope:.2f}")
            
            # Absolute log10 residual of every CRE relative to the scrambled fit; the band
            # drawn around the fit is +/- the 80th percentile of those residuals.
            data_rlm_residuals = numpy.abs(numpy.log10(y) - rlm_results.predict(statsmods.add_constant(numpy.log10(x))))
            data_rlm_residuals_bound = numpy.percentile(data_rlm_residuals, 80)
            # Only the first panel (index 0) contributes the band's legend entry.
            axis.plot(bins, numpy.power(10, rlm_results.predict(statsmods.add_constant(numpy.log10(bins)), transform=False) + data_rlm_residuals_bound), color=base_color + "70", linestyle=(0, (2, 2)), linewidth=0.5, marker=None, label="80th %ile residual" if index == 0 else "_nolegend_")
            axis.plot(bins, numpy.power(10, rlm_results.predict(statsmods.add_constant(numpy.log10(bins)), transform=False) - data_rlm_residuals_bound), color=base_color + "70", linestyle=(0, (2, 2)), linewidth=0.5, marker=None, label="_nolegend_")

            # Calculate R^2 (ordinary least squares on the log10 values; reported in the legend)
            fit = scipy.stats.linregress(numpy.log10(x), numpy.log10(y))
            # NOTE(review): `fit_func` is never called in this cell.
            fit_func = lambda x: fit.slope * x + fit.intercept
            
            # Plot the CREs of the current variant type as a scatterplot colored by density
            weights, color_normalizer = calc_weights_for_scatter(x, y, bins=bins)
            axis.scatter(**reorder_points_and_weights(x, y, weights), cmap=cmap, norm=color_normalizer, s=6, marker="o", rasterized=True, label=f"R²={fit.rvalue**2:.2f}")

            # Plot scrambled sequences
            axis.scatter(scrambled_data[genotype1], scrambled_data[genotype2], marker="+", color="#000000", s=6, alpha=1, linewidth=0.5, label="scrambled" if index == 0 else "_nolegend_", rasterized=True)
            
            # Plot basal (first matching row per genotype; the 2e-5 offset is not applied here)
            g1_basal_mean = library_df.filter((polars.col("genotype") == genotype1) & (polars.col("variant_type") == "basal"))["activity_mean"][0]
            g2_basal_mean = library_df.filter((polars.col("genotype") == genotype2) & (polars.col("variant_type") == "basal"))["activity_mean"][0]
            axis.scatter(g1_basal_mean, g2_basal_mean, marker="+", color="#ff7000", s=16, linewidth=1, label="basal" if index == 0 else "_nolegend_")
            
            # Plot out-of-bounds: hatch the L-shaped region where either coordinate is
            # below 2e-5, i.e. below the offset added to the activities above.
            axis.add_patch(matplotlib.patches.Polygon(
                xy=[(bounds[1], 2e-5), (2e-5, 2e-5), (2e-5, bounds[1]), (0, bounds[1]), (0, 0), (bounds[1], 0)],
                facecolor="#00000000",
                edgecolor="#00000008",
                hatch=r"//"
            ))
            
            # Format plot
            axis.set(xlim=bounds, ylim=bounds, xlabel=translate_genotype_to_label(genotype1), ylabel=translate_genotype_to_label(genotype2))
            axis.set_xscale("log", base=10)
            axis.set_yscale("log", base=10)
            axis.xaxis.set_major_locator(pyplot.LogLocator(numticks=5))
            axis.yaxis.set_major_locator(pyplot.LogLocator(numticks=5))
            axis.tick_params(which="major", bottom=True, top=False, left=True, right=False, labelbottom=True, labeltop=False, labelleft=True, labelright=False)
            axis.tick_params(which="minor", bottom=False, top=False, left=False, right=False)
            
            axis.set_aspect(aspect="equal", adjustable=None, anchor="C")

            handles, labels = axis.get_legend_handles_labels()
            legend = axis.legend(handles, labels, loc="upper left")

        # The grid has 10 panels but only 9 comparisons: blank out the unused one(s).
        for axis in axes.flat[len(genotype_comparisons):]:
            axis.axis("off")
            
        figure.savefig(f"Figures/expression_across_genotypes_{variant_type}_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
        display(Markdown(f"### {variant_type}"))
        display(figure)
        pyplot.close(figure)

In [None]:
# For every library and variant type, draw a 2x5 grid of 5x5 heatmaps. Each heatmap
# counts, for a pair of genotypes, how many CREs fall into every combination of
# activity classes (columns: class in the first genotype; rows: class in the second).

# (first genotype, second genotype) for each panel, filled row-major.
genotype_comparisons = [
    ("WT", "Rhet"),
    ("WT", "Ehet"),
    ("WT", "Rhom"),
    ("WT", "Ehom"),
    ("WT", "CrxKO"),
    ("Rhet", "Rhom"),
    ("Ehet", "Ehom"),
    ("Rhet", "Ehet"),
    ("Rhom", "Ehom")
]

# Column order of every heatmap; the rows use the reverse of this order.
activity_classes = ["strong_silencer", "weak_silencer", "inactive", "weak_enhancer", "strong_enhancer"]

for library_name, library_df in processed_samples_counts.sort(["library"]).groupby("library", maintain_order=True):
    display(Markdown(f"## [{library_name}] Heatmap correlation of CRE classification"))

    for variant_type, cmap in [("wildtype", cm_white_to_blue), ("allCrxSites", cm_white_to_red)]:

        figure, axes = pyplot.subplots(nrows=2, ncols=5, sharex=False, sharey=False, figsize=(20, 8), layout="compressed")

        # The set of CREs in a given class depends only on (genotype, class), so each
        # set is built once here instead of being re-filtered for every cell of every
        # heatmap.
        compared_genotypes = dict.fromkeys(genotype for pair in genotype_comparisons for genotype in pair)
        class_members = {
            (genotype, activity_class): set(
                library_df.filter(
                    (polars.col("genotype") == genotype) &
                    (polars.col("variant_type") == variant_type) &
                    (polars.col("activity_class") == activity_class)
                )["chip_peak_id"]
            )
            for genotype in compared_genotypes
            for activity_class in activity_classes
        }

        for (genotype1, genotype2), axis in zip(genotype_comparisons, axes.flat):
            # Outer keys become DataFrame columns (genotype1 classes); inner keys become
            # the row index (genotype2 classes, in reversed order).
            raw_intersections = {
                genotype1_activity_class: {
                    genotype2_activity_class: len(
                        class_members[genotype1, genotype1_activity_class] &
                        class_members[genotype2, genotype2_activity_class]
                    )
                    for genotype2_activity_class in reversed(activity_classes)
                }
                for genotype1_activity_class in activity_classes
            }
            intersections = pandas.DataFrame(raw_intersections)
            seaborn.heatmap(intersections, ax=axis, cmap=cmap, square=True, annot=True, fmt="d", cbar=False)
            axis.set(xlabel=translate_genotype_to_label(genotype1), ylabel=translate_genotype_to_label(genotype2))

            axis.set_aspect("equal", adjustable=None, anchor="C")

        # The grid has 10 panels but only 9 comparisons: blank out the unused one(s).
        for axis in axes.flat[len(genotype_comparisons):]:
            axis.axis("off")

        figure.savefig(f"Figures/activity_class_intersections_across_genotypes_{variant_type}_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
        display(Markdown(f"### {variant_type}"))
        display(figure)
        pyplot.close(figure)

In [None]:
# For every library, draw one panel per genotype showing, for each CRE, the
# activity with intact CRX motifs ("wildtype", x-axis) against the fold change in
# activity when all CRX motifs are mutated ("allCrxSites" / "wildtype", y-axis).
for library_name, library_df in processed_samples_counts.sort(["library"]).groupby("library", maintain_order=True):
    display(Markdown(f"## [{library_name}] Scatterplot correlation of wt CRE vs mutant CRE activity"))

    # One panel per genotype, left to right.
    genotypes = [
        "WT",
        "Rhet",
        "Ehet",
        "Rhom",
        "Ehom",
        "CrxKO"
    ]

    figure, axes = pyplot.subplots(nrows=1, ncols=6, sharex=False, sharey=True, figsize=(18, 4), layout="compressed")
    x_bounds = (1e-5, 1e3)
    y_bounds = (1e-7, 1e4)

    # Define grid of bins for evaluating the density colouring. Each axis uses its
    # own bounds (the x grid previously stopped at the upper *y* bound).
    x_bins = numpy.geomspace(start=x_bounds[0], stop=x_bounds[1], num=50)
    y_bins = numpy.geomspace(start=y_bounds[0], stop=y_bounds[1], num=50)

    for index, (genotype, axis) in enumerate(zip(genotypes, axes.flat)):

        # One row per CRE with a "wildtype" and an "allCrxSites" activity column
        # (both offset by 2e-5, presumably to keep zeros finite under log10) plus
        # their ratio.
        data = library_df.filter((polars.col("genotype") == genotype) & ((polars.col("variant_type") == "wildtype") | (polars.col("variant_type") == "allCrxSites"))).with_columns([
            polars.col("activity_mean") + 2e-5
        ]).pivot(values="activity_mean", index="chip_peak_id", columns="variant_type").with_columns([
            (polars.col("allCrxSites")/polars.col("wildtype")).alias("foldChange")
        ]).to_pandas()

        x = data["wildtype"].to_numpy()
        y = data["foldChange"].to_numpy()

        # Calculate R^2 (least squares on the log10 values; reported in the legend)
        fit = scipy.stats.linregress(numpy.log10(x), numpy.log10(y))

        # Plot CREs as a scatterplot colored by density
        weights, color_normalizer = calc_weights_for_scatter(x, y, bins=(x_bins, y_bins))
        axis.scatter(**reorder_points_and_weights(x, y, weights), cmap=cm_light_to_gray, norm=color_normalizer, s=6, marker="o", rasterized=True, label=f"R²={fit.rvalue**2:.2f}")

        # Plot basal (first matching row) with guide lines at the basal activity
        # and at a fold change of 1. Only the first panel adds the legend entry.
        basal_mean = library_df.filter((polars.col("genotype") == genotype) & (polars.col("variant_type") == "basal"))["activity_mean"][0]
        axis.scatter(basal_mean, 1, marker="+", color="#ff7000", s=16, linewidth=1, label="basal" if index == 0 else "_nolegend_")
        axis.axvline(x=basal_mean, color="#ff700080", linewidth=1.5, linestyle=":")
        axis.axhline(y=1, color="#ff700080", linewidth=1.5, linestyle=":")

        # Plot out-of-bounds: hatched region bounded by x = 2e-5 and by the line
        # through (2e-5, 1) and (1e3, 2e-5/1e3).
        axis.add_patch(matplotlib.patches.Polygon(
            xy=[(1e3, 2e-5/1e3), (2e-5, 1), (2e-5, 1e4), (0, 1e4), (0, 0)],
            facecolor="#00000000",
            edgecolor="#00000008",
            hatch=r"//"
        ))

        # Format plot
        axis.set(xlim=x_bounds, ylim=y_bounds, title=translate_genotype_to_label(genotype), xlabel="Intact motif activity", ylabel="Killed motif fold change")
        axis.xaxis.label.set_color("#0000ff")
        axis.yaxis.label.set_color("#ff0000")
        axis.set_xscale("log", base=10)
        axis.set_yscale("log", base=10)
        axis.xaxis.set_major_locator(pyplot.LogLocator(numticks=5))
        axis.yaxis.set_major_locator(pyplot.LogLocator(numticks=7))
        axis.tick_params(which="major", bottom=True, top=False, left=True, right=False, labelbottom=True, labeltop=False, labelleft=True, labelright=False)
        axis.tick_params(which="minor", bottom=False, top=False, left=False, right=False)
        axis.set_aspect(aspect="equal", adjustable=None, anchor="C")

        handles, labels = axis.get_legend_handles_labels()
        legend = axis.legend(handles, labels, loc="upper left")

    # Each figure combines both variant types, so the output is keyed by library
    # only. No `variant_type` is defined in this cell; using one would silently
    # pick up whatever value an earlier cell's loop left behind.
    figure.savefig(f"Figures/expression_wt_vs_crxMut_motifs_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
    display(figure)
    pyplot.close(figure)

In [None]:
# For every library, draw one 5x5 heatmap per genotype counting how many CREs fall
# into each combination of activity class with intact CRX motifs ("wildtype",
# columns) and with all CRX motifs mutated ("allCrxSites", rows).
for library_name, library_df in processed_samples_counts.sort(["library"]).groupby("library", maintain_order=True):
    display(Markdown(f"## [{library_name}] Heatmap correlation of wt CRE vs mutant CRE classification"))

    # One panel per genotype, left to right.
    genotypes = [
        "WT",
        "Rhet",
        "Ehet",
        "Rhom",
        "Ehom",
        "CrxKO"
    ]

    # Column order of every heatmap; the rows use the reverse of this order.
    activity_classes = ["strong_silencer", "weak_silencer", "inactive", "weak_enhancer", "strong_enhancer"]

    figure, axes = pyplot.subplots(nrows=1, ncols=6, sharex=False, sharey=False, figsize=(24, 4), layout="compressed")

    for genotype, axis in zip(genotypes, axes.flat):

        # CRE ids per (variant type, activity class) for this genotype. Built once
        # per panel rather than re-filtering the table for each heatmap cell.
        class_members = {
            (motif_variant, activity_class): set(
                library_df.filter(
                    (polars.col("genotype") == genotype) &
                    (polars.col("variant_type") == motif_variant) &
                    (polars.col("activity_class") == activity_class)
                )["chip_peak_id"]
            )
            for motif_variant in ("wildtype", "allCrxSites")
            for activity_class in activity_classes
        }

        # Outer keys become DataFrame columns (intact-motif class); inner keys
        # become the row index (mutated-motif class, in reversed order).
        raw_intersections = {
            wtMotif_activity_class: {
                mutMotif_activity_class: len(
                    class_members["wildtype", wtMotif_activity_class] &
                    class_members["allCrxSites", mutMotif_activity_class]
                )
                for mutMotif_activity_class in reversed(activity_classes)
            }
            for wtMotif_activity_class in activity_classes
        }
        intersections = pandas.DataFrame(raw_intersections)
        seaborn.heatmap(intersections, ax=axis, cmap=cm_white_to_gray, square=True, annot=True, fmt="d", cbar=False)
        axis.set(xlabel="Intact Motif CREs", ylabel="Killed Motif CREs", title=f"{genotype}")
        axis.xaxis.label.set_color("#0000ff")
        axis.yaxis.label.set_color("#ff0000")

        axis.set_aspect("equal", adjustable=None, anchor="C")

    # Blank out any panel without a genotype (none with the current 6-panel grid).
    for axis in axes.flat[len(genotypes):]:
        axis.axis("off")

    # No per-variant heading is shown here: each figure covers both variant types
    # and no `variant_type` is defined in this cell.
    figure.savefig(f"Figures/activity_class_intersections_wt_vs_crxMut_motifs_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
    display(figure)
    pyplot.close(figure)

In [None]:
# For every library, draw a heatmap of mean activity of the intact ("wildtype")
# CREs with one row per CRE and one column per genotype.
for library_name, library_df in processed_samples_counts.sort(["library"]).groupby("library", maintain_order=True):
    display(Markdown(f"## [{library_name}] Heatmap of CRE activity across all genotypes"))

    processed_data = library_df.filter(polars.col("variant_type") == "wildtype")

    # CRE x genotype table of activities (offset by 2e-5), columns in display
    # order, rows sorted by descending CrxKO/WT activity ratio, and column names
    # translated to their figure labels.
    heatmap_data = processed_data.with_columns([
        polars.col("activity_mean") + 2e-5
    ]).pivot(values="activity_mean", index="chip_peak_id", columns="genotype").select([
        "chip_peak_id",
        "WT",
        "Rhet",
        "Ehet",
        "Rhom",
        "Ehom",
        "CrxKO"
    ]).sort(polars.col("CrxKO")/polars.col("WT"), descending=True).rename(NAME_TRANSLATION_TABLE)

    # "CRE" is the translated name of the `chip_peak_id` column.
    activity_df = heatmap_data.to_pandas().set_index("CRE")
    display(activity_df)

    figure, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(1.5, 6))

    # `ax=` is seaborn's parameter for the target Axes (as used by the other
    # heatmap cells). `axes=` is not a heatmap parameter, so the plot would only
    # land on this Axes because it happens to be pyplot's current one.
    axes = seaborn.heatmap(ax=axes, data=activity_df, cmap="PiYG", norm=matplotlib.colors.LogNorm(vmin=1e-4, vmax=1e3, clip=True), yticklabels=False, rasterized=True, cbar_kws={"fraction": 0.08})
    axes.collections[0].colorbar.ax.set_ylabel("Transcriptional Activity", rotation=270, labelpad=10, fontsize=10)
    figure.savefig(f"Figures/element_activity_heatmap_across_genotypes_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
    display(figure)
    pyplot.close(figure)

---

# DEREPRESSED SILENCERS

---

In [None]:
# element_subgroups[library][subgroup_name] -> Series of `chip_peak_id`s.
# Every subgroup is defined from the activity class of the intact ("wildtype")
# CRE in the WT genotype, optionally combined with its class in Ehom or CrxKO.
element_subgroups = {}

def _is_silencer(column):
    """Expression: `column` is "strong_silencer" or "weak_silencer"."""
    return (polars.col(column) == "strong_silencer") | (polars.col(column) == "weak_silencer")

def _is_enhancer(column):
    """Expression: `column` is "weak_enhancer" or "strong_enhancer"."""
    return (polars.col(column) == "weak_enhancer") | (polars.col(column) == "strong_enhancer")

def _is_strong_enhancer(column):
    """Expression: `column` is "strong_enhancer"."""
    return polars.col(column) == "strong_enhancer"

for library_name, library_df in processed_samples_counts.sort(["library"]).groupby("library", maintain_order=True):

    subgroups = element_subgroups.setdefault(library_name, {})

    wildtype_cres = library_df.filter(polars.col("variant_type") == "wildtype")

    def _activity_classes_in(genotype):
        """Table of (chip_peak_id, activity_class_<genotype>) for the intact CREs."""
        return wildtype_cres.filter(polars.col("genotype") == genotype).select([
            polars.col("chip_peak_id"),
            polars.col("activity_class").alias(f"activity_class_{genotype}"),
        ])

    wt_classes = _activity_classes_in("WT")

    # Left joins: every WT row is kept; CREs without a row in the other genotype
    # get a null class there and so match none of the filters below.
    wt_vs_e168d2 = wt_classes.join(_activity_classes_in("Ehom"), how="left", on="chip_peak_id")
    wt_vs_crxko = wt_classes.join(_activity_classes_in("CrxKO"), how="left", on="chip_peak_id")

    # Classified from WT alone.
    subgroups["wt_silencers"] = wt_classes.filter(
        _is_silencer("activity_class_WT")
    )["chip_peak_id"]

    subgroups["wt_strong_enhancers"] = wt_classes.filter(
        _is_strong_enhancer("activity_class_WT")
    )["chip_peak_id"]

    # Classified from WT together with E168d2/E168d2 (Ehom).
    subgroups["e168d2_derepressed_silencers"] = wt_vs_e168d2.filter(
        _is_silencer("activity_class_WT") & _is_enhancer("activity_class_Ehom")
    )["chip_peak_id"]

    subgroups["e168d2_always_silencers"] = wt_vs_e168d2.filter(
        _is_silencer("activity_class_WT") & _is_silencer("activity_class_Ehom")
    )["chip_peak_id"]

    subgroups["e168d2_always_enhancers"] = wt_vs_e168d2.filter(
        _is_enhancer("activity_class_WT") & _is_enhancer("activity_class_Ehom")
    )["chip_peak_id"]

    subgroups["e168d2_always_strong_enhancers"] = wt_vs_e168d2.filter(
        _is_strong_enhancer("activity_class_WT") & _is_strong_enhancer("activity_class_Ehom")
    )["chip_peak_id"]

    # Classified from WT together with the Crx knockout.
    subgroups["crxko_derepressed_silencers"] = wt_vs_crxko.filter(
        _is_silencer("activity_class_WT") & _is_enhancer("activity_class_CrxKO")
    )["chip_peak_id"]

# Draw a two-set Venn diagram for selected pairs of subgroups in each library.
# NOTE(review): the library names are hard-coded here, whereas the other loops in
# this cell iterate over `element_subgroups`; a library missing from the data
# would raise KeyError below.
for library_name in ["rho", "hsp68"]:

    display(Markdown(f"## [{library_name}] Subgroup intersections"))

    subgroup_comparisons = [
        ("e168d2_derepressed_silencers", "crxko_derepressed_silencers"),
        ("wt_strong_enhancers", "e168d2_always_enhancers"),
        ("wt_silencers", "e168d2_always_silencers"),
    ]

    # Fill colours applied to `venn.patches[0]`, `[1]` and `[2]` respectively.
    region_colors = ("#29C732", "#FF9AB6", "#94B174")

    for (subgroup1, subgroup2) in subgroup_comparisons:
        display(Markdown(f"### {subgroup1} and {subgroup2}"))

        figure, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(4, 3))

        venn = venn2([set(element_subgroups[library_name][subgroup1]), set(element_subgroups[library_name][subgroup2])], ax=axes)
        venn.get_label_by_id("A").set_text(subgroup1)
        venn.get_label_by_id("B").set_text(subgroup2)

        for patch, facecolor in zip(venn.patches[:3], region_colors):
            # matplotlib-venn stores None for a region with no elements. That is
            # guaranteed for at least one comparison above: every
            # "e168d2_always_silencers" element is also in "wt_silencers", so that
            # diagram has no B-only region.
            if patch is None:
                continue
            patch.set_facecolor(facecolor)
            patch.set_alpha(1.0)

        figure.savefig(f"Figures/overlap_{subgroup1}_vs_{subgroup2}_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
        display(figure)
        pyplot.close(figure)
 
# For every library, write the sequences of each subgroup to a FASTA file and
# display a bullet list with the subgroup sizes and output paths.
for library, subgroups in element_subgroups.items():
    display(Markdown(f"## [{library}] Subgroup sizes"))

    summary_lines = []
    for subgroup, subgroup_ids in subgroups.items():
        # Intact ("wildtype") sequences of the CREs in this subgroup.
        # NOTE(review): rows are matched on `chip_peak_id` only, not on library —
        # confirm `library_metadata` cannot hold the same peak for several libraries.
        matching_CRE_sequences = library_metadata.filter(
            (polars.col("chip_peak_id").is_in(subgroup_ids)) &
            (polars.col("variant_type") == "wildtype")
        ).select(["chip_peak_id", "CRE_sequence"])

        output_file_name = f"Motif_Analysis/{subgroup}_{library}.fa"
        with open(output_file_name, "w") as out:
            # One FASTA record per CRE: ">id" header line, then the sequence line.
            out.writelines(
                f">{chip_peak_id}\n{CRE_sequence}\n"
                for (chip_peak_id, CRE_sequence) in matching_CRE_sequences.iter_rows()
            )

        summary_lines.append(
            f" - {subgroup}: {len(subgroup_ids)} elements (wrote CRE sequences to `{output_file_name}`)\n"
        )

    display(Markdown("".join(summary_lines)))

In [None]:
# For every library and selected subgroup, draw boxplots of CRE activity per
# genotype, split by motif status ("wildtype" vs "allCrxSites"), with the basal
# construct overlaid as orange points.
for library_name, library_df in processed_samples_counts.sort(["library"]).groupby("library", maintain_order=True):
    display(Markdown(f"## [{library_name}] Activity of subgroups"))

    # Left-to-right order of the genotype labels on the x-axis (shared by the
    # boxplot, the basal overlay and the out-of-bounds patch).
    genotype_order = ["+/+", "R90W/+", "E168d2/+", "R90W/R90W", "E168d2/E168d2", "-/-"]

    # Basal construct activity per genotype, with genotypes translated to labels.
    # NOTE(review): basal rows are selected by `library_id` here but by
    # `variant_type == "basal"` in the scatterplot cells — confirm both select the
    # same rows.
    basal_data = library_df.filter(
        (polars.col("library_id") == "basal")
    ).with_columns([
        polars.col("activity_mean") + 2e-5
    ]).with_columns([
        polars.col("genotype").apply(translate_genotype_to_label)
    ]).select([
        "genotype",
        "activity_mean",
    ]).to_pandas()

    for subgroup_name in ["e168d2_derepressed_silencers", "e168d2_always_silencers", "e168d2_always_enhancers"]:

        display(Markdown(f"### {subgroup_name}"))

        # Activities (offset by 2e-5) of the subgroup's CREs, intact and mutated.
        processed_data = library_df.filter(
            ((polars.col("variant_type") == "wildtype") | (polars.col("variant_type") == "allCrxSites")) &
            polars.col("chip_peak_id").is_in(element_subgroups[library_name][subgroup_name])
        ).with_columns([
            polars.col("activity_mean") + 2e-5
        ]).with_columns([
            polars.col("genotype").apply(translate_genotype_to_label)
        ]).to_pandas()

        figure, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(2.5, 3))

        plot_data_params = dict(
            data=processed_data,
            x="genotype",
            y="activity_mean",
            hue="variant_type",
            order=genotype_order,
            hue_order=["wildtype", "allCrxSites"],
        )
        _ = seaborn.boxplot(ax=axes, **plot_data_params, notch=False, palette=["#6060ff", "#ff3030"], linewidth=1, **boxplot_style_args)

        _ = seaborn.swarmplot(ax=axes, data=basal_data, x="genotype", y="activity_mean", order=genotype_order, s=5, color="#ff7000")

        # Plot out-of-bounds: hatch the band below 2e-5 across the full width of the
        # categorical axis. The lower-left corner is at x = -0.5 so that the patch is
        # a rectangle (it was previously closed at x = 0, clipping that corner).
        left_edge = -0.5
        right_edge = len(genotype_order) - 0.5
        axes.add_patch(matplotlib.patches.Polygon(
            xy=[(left_edge, 2e-5), (right_edge, 2e-5), (right_edge, 0), (left_edge, 0)],
            facecolor="#00000000",
            edgecolor="#00000008",
            hatch=r"//"
        ))

        axes.set(ylim=(1e-5, 1e2), xlabel=None, ylabel="Transcriptional Activity")
        axes.set_yscale("log")
        axes.tick_params(axis="x", rotation=90)

        figure.savefig(f"Figures/subgroup_activity_{subgroup_name}_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
        display(figure)
        pyplot.close(figure)

---

# COOPERATIVITY OF CRX SITES

---

In [None]:
# For every library, compare (per CRE with exactly two CRX sites, per genotype)
# the summed activity change from mutating each site alone with the activity
# change from mutating both sites, and show the ratio as a CRE x genotype heatmap.
for library_name, library_df in processed_samples_counts.sort(["library"]).groupby("library", maintain_order=True):
    display(Markdown(f"## [{library_name}] Cooperativity of CREs"))

    # Keep two-site CREs and derive which site a single-site construct refers to
    # from the suffix of its `library_id` (null when neither suffix matches).
    # NOTE(review): the second suffix is "2_double" without the leading underscore
    # used for "_1_double" — confirm against the library_id naming scheme.
    data = library_df.with_columns([
        polars.col("activity_mean") + 2e-5
    ]).filter(polars.col("num_crx_sites") == 2).sort(["chip_peak_id", "genotype", "variant_type"]).with_columns([
        polars.when(polars.col("library_id").str.ends_with("_1_double")).then(1).when(polars.col("library_id").str.ends_with("2_double")).then(2).otherwise(None).alias("single_site_num"),
    ])

    # One row per (genotype, CRE) holding the intact activity (`activity_mean`) and,
    # via the join suffixes, the site-1, site-2 and all-sites construct activities.
    # Inner joins drop CREs that lack any of the four constructs.
    processed_data = data.filter(polars.col("variant_type") == "wildtype").join(
        data.filter((polars.col("variant_type") == "singleCrxSite") & (polars.col("single_site_num") == 1)),
        how="inner", on=["genotype", "chip_peak_id"], suffix="_single1"
    ).join(
        data.filter((polars.col("variant_type") == "singleCrxSite") & (polars.col("single_site_num") == 2)),
        how="inner", on=["genotype", "chip_peak_id"], suffix="_single2"
    ).join(
        data.filter((polars.col("variant_type") == "allCrxSites")),
        how="inner", on=["genotype", "chip_peak_id"], suffix="_allSites"
    ).with_columns([
        # ((intact - single1) + (intact - single2)) / (intact - allSites); NaN results
        # (e.g. 0/0) are replaced by 0.
        (((polars.col("activity_mean") - polars.col("activity_mean_single1")) + (polars.col("activity_mean") - polars.col("activity_mean_single2")))/(polars.col("activity_mean") - polars.col("activity_mean_allSites"))).fill_nan(0).alias("double_single_activity_difference")
    ])

    # CRE x genotype table of the ratio, columns in display order, rows sorted by
    # descending CrxKO/WT value, names translated to figure labels ("CRE" is the
    # translated `chip_peak_id`).
    heatmap_data = processed_data.pivot(values="double_single_activity_difference", index="chip_peak_id", columns="genotype").select([
        "chip_peak_id",
        "WT",
        "Rhet",
        "Ehet",
        "Rhom",
        "Ehom",
        "CrxKO"
    ]).sort(polars.col("CrxKO")/polars.col("WT"), descending=True).rename(NAME_TRANSLATION_TABLE).to_pandas().set_index("CRE")
    display(heatmap_data)

    figure, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(1.5, 3))

    # `ax=` is seaborn's parameter for the target Axes (as used by the other
    # heatmap cells). `axes=` is not a heatmap parameter, so the plot would only
    # land on this Axes because it happens to be pyplot's current one.
    # NOTE(review): the "label" given in `cbar_kws` is replaced by the explicit
    # `set_ylabel` call on the colorbar axis on the next line.
    axes = seaborn.heatmap(ax=axes, data=heatmap_data, cmap="PiYG", norm=matplotlib.colors.LogNorm(vmin=1e-3, vmax=1e3), yticklabels=False, rasterized=True, cbar_kws={"fraction": 0.08, "label": "Transcriptional Activity"})
    axes.collections[0].colorbar.ax.set_ylabel("(Δa$_{site1}$ + Δa$_{site2}$) / (Δa$_{site1+2}$)", rotation=270, labelpad=10, fontsize=10)
    figure.savefig(f"Figures/cooperativity_across_genotypes_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
    display(figure)
    pyplot.close(figure)

---

# CHROMATIN STATE

---

In [None]:
# Stacked horizontal bars, one figure per library: for wildtype CREs measured in the "WT"
# genotype, the fraction of CREs in each activity class, with one bar per
# (epigenome_group, atac_crx_dependent) combination.

# Stacking order of the activity classes (ascending activity_class_rank as assigned below).
# Each class is paired with a fixed color of the five-color diverging palette, so the color of
# a class does not depend on which classes happen to be present.
ACTIVITY_CLASS_STACK_ORDER = ("strong_silencer", "weak_silencer", "inactive", "weak_enhancer", "strong_enhancer")

for library_name, library_df in processed_samples_counts.filter((polars.col("variant_type") == "wildtype") & (polars.col("genotype") == "WT")).sort("library").groupby("library", maintain_order=True):
    display(Markdown(f"## [{library_name}] Fraction of Activity Classifications by chromatin annotation of CRE genomic origin in +/+"))

    figure, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(5.5, 3.5))

    # Count CREs per (epigenome_group, atac_crx_dependent, activity_class) and convert the
    # counts into fractions of each (epigenome_group, atac_crx_dependent) bar.
    data = library_df.with_columns([
        polars.col("activity_mean") + 2e-5,
        polars.when(polars.col("activity_class") == "strong_enhancer").then(4)
        .when(polars.col("activity_class") == "weak_enhancer").then(3)
        .when(polars.col("activity_class") == "inactive").then(2)
        .when(polars.col("activity_class") == "weak_silencer").then(1)
        .when(polars.col("activity_class") == "strong_silencer").then(0).alias("activity_class_rank")
    ]).sort(["epigenome_group", "atac_crx_dependent", "activity_class_rank"]).groupby(["epigenome_group", "atac_crx_dependent", "activity_class"], maintain_order=True).agg([
        polars.count().alias("num_CREs")
    ]).with_columns([
        polars.sum("num_CREs").over(["epigenome_group", "atac_crx_dependent"]).alias("total_CREs")
    ]).with_columns([
        (polars.col("num_CREs")/polars.col("total_CREs")).alias("fraction_CREs")
    ])

    # Collect the fractions per bar, keyed by bar label and activity class. Stacking against an
    # explicit bar list (with 0 for classes a bar does not contain) keeps every segment aligned
    # with its own bar even when a class is missing from some bars.
    fractions_by_bar = {}
    for row in data.to_dicts():
        bar_label = f"{row['epigenome_group']}_{row['atac_crx_dependent']}"
        fractions_by_bar.setdefault(bar_label, {})[row["activity_class"]] = row["fraction_CREs"]

    bar_labels = list(fractions_by_bar)  # insertion order == sorted row order of `data`
    stacked_width = numpy.zeros(len(bar_labels))
    class_palette = seaborn.diverging_palette(280, 120, l=80, n=5, center="light")

    # Rows whose activity_class is not one of the five known classes are not drawn.
    for activity_class_name, color in zip(ACTIVITY_CLASS_STACK_ORDER, class_palette):
        segment_widths = numpy.array([fractions_by_bar[bar].get(activity_class_name, 0.0) for bar in bar_labels])
        if not segment_widths.any():
            # Class absent from this library: no bars and no legend entry, color stays reserved.
            continue
        axes.barh(y=bar_labels, width=segment_widths, left=stacked_width, label=f"{activity_class_name}", facecolor=color)
        stacked_width = stacked_width + segment_widths

    axes.set(xlim=(0, 1), xlabel="Fraction", ylabel="Chromatin Group")
    axes.legend(loc="right", title="Activity Class", facecolor="#ffffff", bbox_to_anchor=(1.4,0.5))
    axes.invert_yaxis()

    figure.savefig(f"Figures/fraction_activity_classes_by_epigenome_group_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
    display(figure)
    pyplot.close(figure)

In [None]:
# Single stacked horizontal bar per library: how the CREs listed in
# element_subgroups[library]["e168d2_derepressed_silencers"] are distributed across
# epigenome groups, according to chip_peak_metadata.
for library_name in ["rho", "hsp68"]:
    display(Markdown(f"## [{library_name}] Derepressed silencer epigenome group classifications"))

    figure, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(5.5, 0.6))

    # One row per epigenome group with its share of the selected CREs.
    data = chip_peak_metadata.filter(
        polars.col("chip_peak_id").is_in(element_subgroups[library_name]["e168d2_derepressed_silencers"])
    ).sort(["epigenome_group"]).groupby(["epigenome_group"], maintain_order=True).agg([
        polars.count().alias("num_CREs")
    ]).with_columns([
        (polars.col("num_CREs")/polars.sum("num_CREs")).alias("fraction_CREs")
    ])

    # One color per epigenome group. At least three colors are requested so the colors are
    # the same as husl_palette(3) for up to three groups, while additional groups still get
    # a segment instead of being silently dropped by zip().
    group_palette = seaborn.husl_palette(max(3, data.height))

    # Running right edge of the stack; each segment starts where the previous one ended.
    stacked_width = 0.0
    for (group_name, group_data), color in zip(data.groupby("epigenome_group", maintain_order=True), group_palette):
        axes.barh(y=["Derepressed Silencers"], width=group_data["fraction_CREs"], left=stacked_width, label=f"{group_name}", facecolor=color)
        stacked_width = stacked_width + group_data["fraction_CREs"]

    axes.set(xlim=(0, 1), ylabel=None, xlabel="Fraction")
    # The segments are epigenome groups, so the legend is titled accordingly.
    axes.legend(loc="right", title="Chromatin Group", facecolor="#ffffff", bbox_to_anchor=(1.6, 0.5))

    figure.savefig(f"Figures/fraction_activity_classes_derepressed_silencers_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
    display(figure)
    pyplot.close(figure)

In [None]:
# One figure per (library, epigenome group): box plots of wildtype CRE activity split by
# atac_crx_dependent ("dependent" / "independent") and colored by genotype, with the basal
# construct's activity overlaid as points in the "dependent" column.
for (library_name, epigenome_group_name), library_df in processed_samples_counts.filter((polars.col("variant_type") == "wildtype")).sort(["library", "epigenome_group"]).groupby(["library", "epigenome_group"], maintain_order=True):
    display(Markdown(f"## [{library_name}] CRX-dependent accessibility of CRE genomic origin (Group {epigenome_group_name})"))
    figure, axes = pyplot.subplots(nrows=1, ncols=1, figsize=(5.5, 4))

    # Offset activity by 2e-5 and map genotype names through translate_genotype_to_label
    # (defined elsewhere; expected to yield the labels listed in hue_order below).
    data = library_df.with_columns([
        polars.col("activity_mean") + 2e-5,
        polars.col("genotype").apply(translate_genotype_to_label),
    ]).to_pandas()

    # Basal rows of the same library, prepared the same way and pinned to the "dependent"
    # x category so they can be drawn on the same categorical axis.
    basal_data = processed_samples_counts.filter(
        (polars.col("library_id") == "basal")
        & (polars.col("library") == library_name)
    ).with_columns([
        polars.col("activity_mean") + 2e-5,
        polars.col("genotype").apply(translate_genotype_to_label),
    ]).select([
        "genotype",
        "activity_mean",
        polars.lit("dependent").alias("atac_crx_dependent")
    ]).to_pandas()

    plot_data_params = dict(
        data=data,
        x="atac_crx_dependent",
        y="activity_mean",
        hue="genotype",
        order=["dependent", "independent"],
        hue_order=["+/+", "R90W/+", "E168d2/+", "R90W/R90W", "E168d2/E168d2", "-/-"]
    )
    _ = seaborn.boxplot(ax=axes, **plot_data_params, notch=False, palette=seaborn.husl_palette(6), linewidth=1, dodge=True,  **boxplot_style_args)

    # NOTE(review): both `palette` and `color` are passed; with a hue variable the palette
    # presumably takes precedence and color="#ff7000" has no effect -- confirm the intent.
    _ = seaborn.stripplot(ax=axes, data=basal_data, x=plot_data_params["x"], y=plot_data_params["y"], hue=plot_data_params["hue"], order=plot_data_params["order"], hue_order=plot_data_params["hue_order"], palette=seaborn.husl_palette(6), s=5, color="#ff7000")

    # Plot out-of-bounds: hatched rectangle below the 2e-5 offset, spanning the full
    # categorical axis from x = -0.5 to x = len(order) - 0.5.
    out_of_bounds_right = len(plot_data_params["order"]) - 0.5
    axes.add_patch(matplotlib.patches.Polygon(
        xy=[(-0.5, 2e-5), (out_of_bounds_right, 2e-5), (out_of_bounds_right, 0), (-0.5, 0)],
        facecolor="#00000000",
        edgecolor="#00000008",
        hatch=r"//"
    ))

    axes.set(ylim=(1e-5, 3e2), xlabel="ATAC Accessibility", ylabel="Transcriptional Activity")
    axes.set_yscale("log")

    # The box plot and the strip plot each register labelled artists per genotype; keep only
    # the first handle for every label so each genotype is listed once in the legend.
    legend_handles, legend_labels = axes.get_legend_handles_labels()
    first_handle_by_label = {}
    for legend_handle, legend_label in zip(legend_handles, legend_labels):
        first_handle_by_label.setdefault(legend_label, legend_handle)
    axes.legend(
        handles=list(first_handle_by_label.values()),
        labels=list(first_handle_by_label.keys()),
        loc="right",
        title=f"Group {epigenome_group_name}",
        facecolor="#ffffff",
        bbox_to_anchor=(1.6,0.5),
    )

    figure.savefig(f"Figures/activity_by_atac_crx_dependence_group{epigenome_group_name}_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
    display(figure)
    pyplot.close(figure)

---

# INFORMATION CONTENT ANALYSIS

---

In [None]:
# Load the tab-separated information-content table and show it.
# It is joined onto the library metadata by "library_id" in the next cell.
information_content = polars.read_csv(
    "Library_Details/library_information_content.tsv",
    sep="\t",
)
display(information_content)

In [None]:
# One three-panel figure per library: box plots of information content, predicted CRX
# occupancy and total occupancy for three CRE subgroups, with pairwise Mann-Whitney
# annotations drawn by statannotations.
for library_name in ["rho", "hsp68"]:
    display(Markdown(f"## [{library_name}] Subgroup information content and occupancy"))

    # Label each wildtype CRE with its subgroup. The branches are evaluated in order, so a CRE
    # listed as a derepressed silencer is never labelled "+/+ Strong Enhancers" or
    # "+/+ Silencers". CREs in none of the subgroups are dropped before the join.
    data = library_metadata.filter(polars.col("variant_type") == "wildtype").with_columns([
        polars.when(polars.col("chip_peak_id").is_in(element_subgroups[library_name]["e168d2_derepressed_silencers"])).then("E168d2 Derepressed Silencers")
        .when(polars.col("chip_peak_id").is_in(element_subgroups[library_name]["wt_strong_enhancers"])).then("+/+ Strong Enhancers")
        .when(polars.col("chip_peak_id").is_in(element_subgroups[library_name]["wt_silencers"]) & polars.col("chip_peak_id").is_in(element_subgroups[library_name]["e168d2_derepressed_silencers"]).is_not()).then("+/+ Silencers")
        .otherwise(None).alias("derepression_status")
    ]).drop_nulls("derepression_status").join(information_content, on="library_id", how="left").sort("derepression_status")

    data = data.to_pandas()
    # Assign the result back to the column: remove_unused_categories() returns a new Series,
    # and assigning it to the `.cat` accessor would leave the DataFrame untouched.
    data["variant_type"] = data["variant_type"].cat.remove_unused_categories()

    # (column to plot, y-axis label) for each panel, left to right.
    subplot_details = [
        ("info_content", "Information Content"),
        ("predicted_occupancy_CRX", "Predicted CRX Occupancy"),
        ("total_occupancy", "Total Occupancy")
    ]

    figure, axes = pyplot.subplots(nrows=1, ncols=len(subplot_details), figsize=(2.35*len(subplot_details), 3.5))

    for (yvar, ylabel), axis in zip(subplot_details, axes.flat):
        # Shared between the box plot and the Annotator so both see the same data layout.
        plot_data_params = dict(
            data=data,
            x="derepression_status",
            y=yvar,
            order=["+/+ Strong Enhancers", "E168d2 Derepressed Silencers", "+/+ Silencers"],
        )
        _ = seaborn.boxplot(ax=axis, **plot_data_params, notch=False, palette=["#6060ff"], linewidth=1, dodge=True, **boxplot_style_args)

        axis.set(xlabel=None, ylabel=ylabel)
        axis.tick_params(axis="x", rotation=20)
        for label in axis.get_xticklabels():
            label.set_ha("right")
        # Replace any automatically created legend with an empty one.
        axis.legend([],[], frameon=False)

        # Heading for the verbose test output printed by the annotator below.
        display(Markdown(f"### {ylabel}"))
        annotator = Annotator(axis, **plot_data_params, pairs=(
            ("+/+ Strong Enhancers", "+/+ Silencers"),
            ("+/+ Strong Enhancers", "E168d2 Derepressed Silencers"),
            ("+/+ Silencers", "E168d2 Derepressed Silencers"),
        ))
        annotator.configure(test="Mann-Whitney", verbose=True)
        annotator.apply_test()
        annotator.annotate()

    figure.tight_layout()
    figure.savefig(f"Figures/information_content_and_occupancy_E168d2_derepressed_silencers_{library_name}.{FIGURE_EXPORT_TYPE}", bbox_inches="tight")
    display(figure)
    pyplot.close(figure)

In [None]:
# Compare, per library, the fraction of CREs overlapping a BCOR peak between pairs of element
# subgroups using a two-sample proportions z-test.

# Import the submodule explicitly: a bare `import statsmodels` does not by itself load
# statsmodels.stats.proportion. This still binds the name `statsmodels`.
import statsmodels.stats.proportion

# A CRX ChIP peak counts as overlapping BCOR when nearby_BCOR_in_P0 is greater than zero.
# The original note reads: column value is the number of replicates (0, 1, or 2) in which a
# BCOR peak overlapped the given CRX ChIP peak (presumably referring to nearby_BCOR_in_P0).
bcor_overlap = polars.read_csv("BCOR/CRX_BCOR_overlap.tsv", sep="\t").with_columns([
    (polars.col("nearby_BCOR_in_P0") > 0).alias("overlaps_BCOR")
])

display(Markdown("## CRX ChIP Peak Overlap with BCOR Cut&Run"))
display(bcor_overlap)


def count_bcor_overlapping(chip_peak_ids):
    """Return the number of rows of bcor_overlap whose chip_peak_id is in chip_peak_ids and whose overlaps_BCOR is true."""
    return bcor_overlap.filter(
        polars.col("chip_peak_id").is_in(chip_peak_ids) & polars.col("overlaps_BCOR")
    ).height


# Pairs of element_subgroups keys to compare.
# NOTE(review): if e168d2_derepressed_silencers is a subset of wt_silencers (the
# information-content cell explicitly excludes the former from the latter), the two samples
# overlap and the z-test's independence assumption would not hold -- confirm.
comparisons = [
    ("e168d2_derepressed_silencers", "wt_silencers"),
]

for library_name in ["rho", "hsp68"]:
    display(Markdown(f"### {library_name}"))

    for subgroup1, subgroup2 in comparisons:
        display(Markdown(f"#### {subgroup1} vs. {subgroup2}"))

        # Denominators are the subgroup sizes; numerators only count peaks found in bcor_overlap.
        num_elems_1 = len(element_subgroups[library_name][subgroup1])
        num_bcor_overlapping_1 = count_bcor_overlapping(element_subgroups[library_name][subgroup1])

        num_elems_2 = len(element_subgroups[library_name][subgroup2])
        num_bcor_overlapping_2 = count_bcor_overlapping(element_subgroups[library_name][subgroup2])

        # An empty subgroup has no defined proportion; report it instead of dividing by zero.
        if num_elems_1 == 0 or num_elems_2 == 0:
            print(f"Skipping z-test: empty subgroup ({subgroup1}: {num_elems_1}, {subgroup2}: {num_elems_2})")
            continue

        print(f"{num_bcor_overlapping_1}/{num_elems_1} ({num_bcor_overlapping_1/num_elems_1:0.2%}) vs. {num_bcor_overlapping_2}/{num_elems_2} ({num_bcor_overlapping_2/num_elems_2:0.2%})")

        zstat, pval = statsmodels.stats.proportion.proportions_ztest([num_bcor_overlapping_1, num_bcor_overlapping_2], [num_elems_1, num_elems_2])

        print(f"z={zstat:.2e}, p={pval:.2e}")