In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import matplotlib.gridspec as gridspec
from matplotlib.ticker import MultipleLocator, FuncFormatter
import seaborn as sns
from scipy.ndimage import gaussian_filter1d
from scipy.optimize import linear_sum_assignment
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances, silhouette_score
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler,normalize
from sklearn.impute import KNNImputer
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score, pairwise_distances
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.manifold import TSNE
import umap
import igraph as ig
import leidenalg as la
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this is a global filter, so deprecation notices raised by the
# libraries used below are hidden as well — consider narrowing it.
warnings.filterwarnings('ignore')

from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D 

pd.set_option("display.max_columns", None)  # None removes the column limit, so df.head() shows every column

##### Preprocessing
The following functions are defined in this section:
1. 'preprocess_dataframe'
2. 'filter_reads_per_gene_middle_bin_name'
3. 'bin_then_matrix'
4. 'process_into_matrix'
5. 'handling_NaN'
6. 'preprocess_long_for_plot'
7. 'plot_reads_long'
8. 'get_genes_list'

In [None]:
def plot_violin_scatter(
    results_df,
    metrics=('silhouette', 'calinski_harabasz', 'davies_bouldin',
             'leiden_quality', 'modularity', 'n_clusters')
):
    """
    Draw one violin + strip figure per metric, comparing pipelines across genes.

    NOTE: a later cell of this notebook defines ``plot_violin_scatter`` again
    (with a ``base_metrics`` parameter). When cells are run top to bottom that
    later definition replaces this one.

    Parameters
    ----------
    results_df : pandas.DataFrame
        Must contain 'gene' and 'pipeline' columns plus one column per metric
        (e.g. the output of run_pipelines_on_genes()).
    metrics : tuple of str
        Metric columns to visualize; names missing from ``results_df`` are
        ignored.

    Returns
    -------
    figs : dict
        Dictionary of matplotlib Figures, keyed by metric name. A metric whose
        values are all NaN produces no figure.
    """
    available = [name for name in metrics if name in results_df.columns]

    # Long format: one row per (gene, pipeline, metric); failed runs (NaN) are dropped.
    long_df = results_df.melt(
        id_vars=['gene', 'pipeline'],
        value_vars=available,
        var_name='metric',
        value_name='value',
    )
    long_df = long_df.dropna(subset=['value'])

    figs = {}
    for metric_name, metric_rows in long_df.groupby('metric'):
        fig, ax = plt.subplots(figsize=(7, 5))

        # Distribution per pipeline ...
        sns.violinplot(
            data=metric_rows, x='pipeline', y='value',
            inner=None, cut=0, ax=ax,
        )
        # ... with the individual genes overlaid as jittered points.
        sns.stripplot(
            data=metric_rows, x='pipeline', y='value',
            color='k', size=4, alpha=0.6, jitter=0.2, ax=ax,
        )

        ax.set_title(f'Comparison by {metric_name}')
        ax.set_xlabel('')
        ax.set_ylabel(metric_name)
        ax.grid(axis='y', linestyle='--')
        plt.tight_layout()

        figs[metric_name] = fig

    return figs


In [None]:
def plot_compare_pipelines_grid(
    results_df: pd.DataFrame,
    pipelines_order=('pca_euclidean', 'cosine_no_pca'),
    metrics=('silhouette', 'calinski_harabasz', 'davies_bouldin',
             'leiden_quality', 'modularity', 'n_clusters'),
    kind='violin',                # 'violin' or 'box'
    show_points=True,
    connect_pairs=True,           # draw line per gene connecting the two pipelines
    figsize=None,
    point_kwargs=None,
    violin_kwargs=None,
    box_kwargs=None
):
    """
    Make ONE figure with one subplot per metric; within each subplot,
    show pipelines side-by-side.

    NOTE: a later cell of this notebook defines ``plot_compare_pipelines_grid``
    again (with ``base_metrics`` / ``include_silhouettes``); when cells are run
    top to bottom that later definition replaces this one.

    Parameters
    ----------
    results_df : pd.DataFrame
        Must contain columns ['gene', 'pipeline', <metrics...>].
    pipelines_order : tuple
        Pipelines to display, in left-to-right order. Rows for other pipelines
        are ignored.
    metrics : tuple
        Which metrics to plot; names missing from ``results_df`` are skipped.
    kind : {'violin', 'box'}
        Plot type for distributions.
    show_points : bool
        Whether to overlay individual gene points.
    connect_pairs : bool
        Whether to connect paired genes across pipelines. Only applied when
        exactly two pipelines are requested.
    figsize : tuple or None
        Figure size; computed from the number of metrics if None.
    point_kwargs : dict or None
        Overrides for the ``sns.stripplot`` overlay. Defaults are
        ``dodge=False, jitter=0.15, color='k', size=4, alpha=0.6``.
    violin_kwargs, box_kwargs : dict or None
        Style options passed to ``sns.violinplot`` / ``sns.boxplot``.

    Returns
    -------
    fig, axes : matplotlib Figure and the per-metric Axes (a list when there
        is a single metric).

    Raises
    ------
    ValueError
        If no requested metric is present, no row matches ``pipelines_order``,
        or ``kind`` is not 'violin' or 'box'.
    """
    # --- Validate inputs before any figure is allocated ---
    metrics = [m for m in metrics if m in results_df.columns]
    if not metrics:
        raise ValueError("No requested metrics found in results_df.")

    order = list(pipelines_order)
    # .copy() so the column assignment below never writes into a view of results_df.
    df = results_df.loc[results_df['pipeline'].isin(order)].copy()
    if df.empty:
        raise ValueError("results_df has no rows for the requested pipelines_order.")

    if kind not in ('violin', 'box'):
        raise ValueError("kind must be 'violin' or 'box'")

    # Ensure pipelines are ordered consistently
    df['pipeline'] = pd.Categorical(df['pipeline'], categories=order, ordered=True)

    # --- Melt to long format (failed runs = NaN are dropped) ---
    df_long = (
        df.melt(
            id_vars=['gene', 'pipeline'],
            value_vars=metrics,
            var_name='metric',
            value_name='value'
        )
        .dropna(subset=['value'])
    )

    # --- Set up figure ---
    n_cols = len(metrics)
    if figsize is None:
        figsize = (3.2 * n_cols, 4.2)  # scale width by number of metrics

    fig, axes = plt.subplots(1, n_cols, figsize=figsize, sharey=False)
    if n_cols == 1:
        axes = [axes]

    # --- Style dictionaries: defaults first, caller overrides on top ---
    strip_kwargs = dict(dodge=False, jitter=0.15, color='k', size=4, alpha=0.6)
    if point_kwargs:
        strip_kwargs.update(point_kwargs)
    if violin_kwargs is None:
        violin_kwargs = dict(inner=None, cut=0, linewidth=0)
    if box_kwargs is None:
        box_kwargs = dict(fliersize=0, linewidth=1.2)

    # --- Color palette ---
    palette = sns.color_palette('Set2', n_colors=len(order))
    pipeline_colors = dict(zip(order, palette))

    # --- Plot each metric ---
    for ax, metric_name in zip(axes, metrics):
        sub = df_long[df_long['metric'] == metric_name]

        # Main distribution plot
        if kind == 'violin':
            sns.violinplot(
                data=sub,
                x='pipeline', y='value',
                order=order,
                palette=pipeline_colors,
                ax=ax, **violin_kwargs
            )
        else:
            sns.boxplot(
                data=sub,
                x='pipeline', y='value',
                order=order,
                palette=pipeline_colors,
                ax=ax, **box_kwargs
            )

        # Overlay points
        if show_points:
            sns.stripplot(
                data=sub,
                x='pipeline', y='value',
                order=order,
                ax=ax, **strip_kwargs
            )

        # Connect paired points (same gene across pipelines)
        if connect_pairs and len(order) == 2:
            p0, p1 = order

            # Pivot to wide format (gene x pipeline). reindex guarantees both
            # pipeline columns exist even when one of them has no value for
            # this metric, so the dropna below cannot raise KeyError.
            wide = (
                sub.astype({'pipeline': object})
                .pivot_table(index='gene', columns='pipeline',
                             values='value', aggfunc='first')
                .reindex(columns=[p0, p1])
                .dropna(how='any')
            )

            # x = 0 / 1 are the categorical positions of p0 / p1 given `order`.
            for y0, y1 in zip(wide[p0].to_numpy(), wide[p1].to_numpy()):
                ax.plot([0, 1], [y0, y1],
                        color='gray', alpha=0.35, linewidth=1)

        # Axis formatting
        ax.set_title(metric_name.replace('_', ' ').title())
        ax.set_xlabel('')
        ax.grid(axis='y', linestyle='--', alpha=0.25)

    plt.tight_layout()
    return fig, axes


In [None]:
def silhouettes_multi(
    X_space,
    clusters,
    metric_main='cosine',
    extra_metrics=('correlation', 'euclidean')
):
    """
    Score one clustering with the silhouette coefficient under several metrics.

    NOTE: a later cell of this notebook defines ``silhouettes_multi`` again;
    when cells are run top to bottom that later definition replaces this one.

    Args:
        X_space (array-like): Feature matrix (n_samples x n_features).
        clusters (array-like): Cluster labels.
        metric_main (str): Primary metric to evaluate (default: 'cosine').
        extra_metrics (tuple): Additional metrics to test
            (default: ('correlation', 'euclidean')).

    Returns:
        dict: ``'silhouette_<metric>' -> score``, with the primary metric's
        key first. Each score comes from a direct ``silhouette_score`` call,
        or from a precomputed distance matrix if the direct call raises.
    """
    scores = {}

    # Primary metric first, then the extras, all through the same code path.
    for metric_name in (metric_main, *extra_metrics):
        key = f'silhouette_{metric_name}'
        try:
            scores[key] = silhouette_score(X_space, clusters, metric=metric_name)
        except Exception:
            # Direct call failed: retry on an explicit pairwise distance matrix.
            distances = pairwise_distances(X_space, metric=metric_name)
            scores[key] = silhouette_score(distances, clusters, metric='precomputed')

    return scores


In [None]:
# -----------------------------
# 1) Multi-metric silhouette
# -----------------------------
def silhouettes_multi(
    X_space,
    clusters,
    metric_main: str = 'cosine',
    extra_metrics: tuple = ('correlation', 'euclidean')
) -> dict:
    """
    Compute silhouette scores for multiple distance metrics.

    Parameters
    ----------
    X_space : array-like
        Feature matrix (n_samples x n_features) the clustering was run on.
    clusters : array-like
        Cluster label per sample.
    metric_main : str
        Primary metric; its key comes first in the returned dict.
    extra_metrics : tuple of str
        Further metrics to evaluate. A metric equal to ``metric_main`` (or
        repeated) is evaluated only once.

    Returns
    -------
    dict
        Keys like 'silhouette_cosine', 'silhouette_correlation', ... mapped to
        Python floats. Falls back to precomputed pairwise distances when the
        direct ``silhouette_score`` call raises.
    """
    def _one(metric_name: str) -> float:
        try:
            return float(silhouette_score(X_space, clusters, metric=metric_name))
        except Exception:
            # Direct call failed: retry on an explicit distance matrix.
            D = pairwise_distances(X_space, metric=metric_name)
            return float(silhouette_score(D, clusters, metric='precomputed'))

    results = {}
    for name in (metric_main, *extra_metrics):
        key = f'silhouette_{name}'
        if key in results:
            # Already scored (e.g. metric_main also listed among the extras):
            # the result would be identical, so skip the recomputation.
            continue
        results[key] = _one(name)

    return results


# ----------------------------------------------
# 2) Integrate multi-silhouette into your result
# ----------------------------------------------
def add_silhouette_columns(
    results_df: pd.DataFrame,
    embed_dict: dict
) -> pd.DataFrame:
    """
    Add multiple silhouette columns to results_df.

    - results_df must contain 'gene' and 'pipeline' columns (rows are runs).
    - embed_dict maps (gene, pipeline) -> dict describing the run:
          embed_dict[(gene, pipeline)] = {
              'X': X_pca,          # feature matrix the clustering was run on
              'labels': clusters,  # cluster label per row of 'X'
              'metric': metric     # distance metric used in the run (e.g., 'cosine'
                                   # or 'euclidean'); defaults to 'cosine' if absent
          }

    Rows are left without scores (NaN) when their key is missing from
    embed_dict or when the silhouette is undefined for the run, i.e. unless
    2 <= n_labels <= n_samples - 1.

    Returns a new DataFrame (results_df is not modified) with 'silhouette_*'
    columns. If such a column already exists it is updated in place of being
    duplicated: freshly computed scores win, previous values are kept for rows
    where nothing was computed.
    """
    out = results_df.copy()

    per_row_scores = []
    for gene, pipeline in zip(out['gene'], out['pipeline']):
        key = (gene, pipeline)
        if key not in embed_dict:
            per_row_scores.append({})
            continue

        entry = embed_dict[key]
        X_space = entry['X']
        labels = entry['labels']
        metric_main = entry.get('metric', 'cosine')

        # silhouette_score requires 2 <= n_labels <= n_samples - 1; anything
        # else would raise inside silhouettes_multi and abort the whole loop.
        n_samples = X_space.shape[0]
        n_labels = len(np.unique(labels))
        if n_labels < 2 or n_labels > n_samples - 1:
            per_row_scores.append({})
            continue

        per_row_scores.append(
            silhouettes_multi(
                X_space=X_space,
                clusters=labels,
                metric_main=metric_main,
                extra_metrics=('correlation', 'euclidean')  # extend if you want more
            )
        )

    # One row per run, aligned on the same index as `out`.
    sil_df = pd.DataFrame(per_row_scores, index=out.index)

    for col in sil_df.columns:
        fresh = sil_df[col]
        if col in out.columns:
            # Column already present: overwrite only where a new score exists.
            fresh = fresh.where(fresh.notna(), out[col].to_numpy())
        out[col] = fresh

    return out


# -------------------------------------------------------
# 3) Plotting: violin+scatter that auto-includes silhouettes
# -------------------------------------------------------
def plot_violin_scatter(
    results_df: pd.DataFrame,
    base_metrics: tuple = ('silhouette', 'calinski_harabasz', 'davies_bouldin',
                           'leiden_quality', 'modularity', 'n_clusters'),
):
    """
    Violin + scatter (strip) plots comparing pipelines for each metric across genes.
    Auto-includes any columns that start with 'silhouette_' (e.g., silhouette_cosine).

    Parameters
    ----------
    results_df : pd.DataFrame
        Must contain 'gene' and 'pipeline' columns plus the metric columns.
    base_metrics : tuple of str
        Metrics to plot in addition to the auto-detected 'silhouette_*'
        columns; names missing from ``results_df`` are skipped.

    Returns
    -------
    dict
        Metric name -> matplotlib Figure. Metrics whose values are all NaN
        produce no figure.

    Raises
    ------
    ValueError
        If none of the requested or auto-detected metrics is present.
    """
    # collect requested metrics + any silhouette_* columns present
    sil_cols = [c for c in results_df.columns if c.startswith('silhouette_')]
    requested = [m for m in base_metrics if m in results_df.columns]
    # dict.fromkeys de-duplicates while keeping order: a silhouette_* column
    # that is also listed in base_metrics must not be melted twice.
    metrics = list(dict.fromkeys(requested + sil_cols))
    if not metrics:
        raise ValueError("No requested metrics found in results_df.")

    # Melt to long format
    df_long = results_df.melt(
        id_vars=['gene', 'pipeline'],
        value_vars=metrics,
        var_name='metric',
        value_name='value'
    ).dropna(subset=['value'])

    figs = {}
    for metric_name, sub in df_long.groupby('metric'):
        # Explicit figure/axes so both layers are guaranteed to share one Axes.
        fig, ax = plt.subplots(figsize=(7, 5))
        sns.violinplot(
            data=sub,
            x='pipeline',
            y='value',
            inner=None,
            cut=0,
            ax=ax
        )
        sns.stripplot(
            data=sub,
            x='pipeline',
            y='value',
            color='k',
            size=4,
            alpha=0.6,
            jitter=0.2,
            ax=ax
        )
        ax.set_title(f'Comparison by {metric_name}')
        ax.set_xlabel('')
        ax.set_ylabel(metric_name)
        ax.grid(axis='y', linestyle='--')
        plt.tight_layout()
        figs[metric_name] = fig
    return figs


# ----------------------------------------------------------------------
# 4) One-figure grid: side-by-side pipelines and all silhouette columns
# ----------------------------------------------------------------------
def plot_compare_pipelines_grid(
    results_df: pd.DataFrame,
    pipelines_order=('pca_euclidean', 'cosine_no_pca'),
    base_metrics=('calinski_harabasz', 'davies_bouldin',
                  'leiden_quality', 'modularity', 'n_clusters'),
    include_silhouettes: bool = True,
    kind='violin',                # 'violin' or 'box'
    show_points=True,
    connect_pairs=True,           # draw line per gene connecting the two pipelines
    figsize=None,
    point_kwargs=None,
    violin_kwargs=None,
    box_kwargs=None
):
    """
    Make ONE figure with one subplot per metric (auto-includes silhouette_* if present).
    Within each subplot, show pipelines side-by-side.

    Parameters
    ----------
    results_df : pd.DataFrame
        Must contain columns ['gene', 'pipeline', <metrics...>].
    pipelines_order : tuple
        Pipelines to display, in left-to-right order; other rows are ignored.
    base_metrics : tuple
        Metrics to plot; names missing from ``results_df`` are skipped. The
        default does not list a plain 'silhouette' column.
    include_silhouettes : bool
        Also plot every column whose name starts with 'silhouette_'.
    kind : {'violin', 'box'}
        Plot type for distributions.
    show_points : bool
        Whether to overlay individual gene points.
    connect_pairs : bool
        Whether to connect paired genes across pipelines (only when exactly
        two pipelines are requested).
    figsize : tuple or None
        Figure size; computed from the number of metrics if None.
    point_kwargs : dict or None
        Overrides for the ``sns.stripplot`` overlay. Defaults are
        ``dodge=False, jitter=0.15, color='k', size=4, alpha=0.6``.
    violin_kwargs, box_kwargs : dict or None
        Style options passed to ``sns.violinplot`` / ``sns.boxplot``.

    Returns
    -------
    fig, axes : matplotlib Figure and the per-metric Axes (a list when there
        is a single metric).

    Raises
    ------
    ValueError
        If no metric is available, no row matches ``pipelines_order``, or
        ``kind`` is not 'violin' or 'box'.
    """
    # gather metrics to plot
    metrics = list(base_metrics)
    if include_silhouettes:
        metrics += [c for c in results_df.columns if c.startswith('silhouette_')]
    # Keep only existing columns; dict.fromkeys drops repeats (a silhouette_*
    # column also named in base_metrics) while preserving order.
    metrics = list(dict.fromkeys(m for m in metrics if m in results_df.columns))
    if not metrics:
        raise ValueError("No requested metrics found in results_df.")

    order = list(pipelines_order)
    # .copy() so the column assignment below never writes into a view of results_df.
    df = results_df.loc[results_df['pipeline'].isin(order)].copy()
    if df.empty:
        raise ValueError("results_df has no rows for the requested pipelines_order.")

    # Validate before a figure is allocated.
    if kind not in ('violin', 'box'):
        raise ValueError("kind must be 'violin' or 'box'")

    df['pipeline'] = pd.Categorical(df['pipeline'], categories=order, ordered=True)

    # Melt long
    df_long = (
        df.melt(
            id_vars=['gene', 'pipeline'],
            value_vars=metrics,
            var_name='metric',
            value_name='value'
        )
        .dropna(subset=['value'])
    )

    # Figure
    n_cols = len(metrics)
    if figsize is None:
        figsize = (3.2 * n_cols, 4.2)
    fig, axes = plt.subplots(1, n_cols, figsize=figsize, sharey=False)
    if n_cols == 1:
        axes = [axes]

    # Style dictionaries: defaults first, caller overrides on top.
    strip_kwargs = dict(dodge=False, jitter=0.15, color='k', size=4, alpha=0.6)
    if point_kwargs:
        strip_kwargs.update(point_kwargs)
    if violin_kwargs is None:
        violin_kwargs = dict(inner=None, cut=0, linewidth=0)
    if box_kwargs is None:
        box_kwargs = dict(fliersize=0, linewidth=1.2)

    palette = sns.color_palette('Set2', n_colors=len(order))
    pipeline_colors = dict(zip(order, palette))

    for ax, metric_name in zip(axes, metrics):
        sub = df_long[df_long['metric'] == metric_name]

        if kind == 'violin':
            sns.violinplot(
                data=sub, x='pipeline', y='value',
                order=order,
                palette=pipeline_colors, ax=ax, **violin_kwargs
            )
        else:
            sns.boxplot(
                data=sub, x='pipeline', y='value',
                order=order,
                palette=pipeline_colors, ax=ax, **box_kwargs
            )

        if show_points:
            sns.stripplot(
                data=sub, x='pipeline', y='value',
                order=order, ax=ax, **strip_kwargs
            )

        if connect_pairs and len(order) == 2:
            p0, p1 = order
            # reindex guarantees both pipeline columns exist even when one of
            # them has no value for this metric, so dropna cannot raise KeyError.
            wide = (
                sub.astype({'pipeline': object})
                .pivot_table(index='gene', columns='pipeline', values='value', aggfunc='first')
                .reindex(columns=[p0, p1])
                .dropna(how='any')
            )
            # x = 0 / 1 are the categorical positions of p0 / p1 given `order`.
            for y0, y1 in zip(wide[p0].to_numpy(), wide[p1].to_numpy()):
                ax.plot([0, 1], [y0, y1], color='gray', alpha=0.35, linewidth=1)

        ax.set_title(metric_name.replace('_', ' ').title())
        ax.set_xlabel('')
        ax.grid(axis='y', linestyle='--', alpha=0.25)

    plt.tight_layout()
    return fig, axes

In [None]:
## (unused) 3.1 ALTERNATE PIPELINE FOR CLUSTERING

def filter_by_missingness(
    df: pd.DataFrame,
    min_bin_non_nan_frac: float = 0.2,  # keep bins (columns) seen in >= 20% of reads
    min_read_non_nan_frac: float = 0.5  # keep reads (rows) with >= 50% bins observed
) -> pd.DataFrame:
    """
    Drop sparsely observed bins (columns) and reads (rows).

    Both fractions are measured on the unfiltered matrix: a bin is kept when
    its share of non-NaN reads reaches ``min_bin_non_nan_frac``, a read when
    its share of non-NaN bins reaches ``min_read_non_nan_frac``.

    Returns a copy. Matrices with fewer than 2 rows or 2 columns are returned
    unfiltered.
    """
    n_reads, n_bins = df.shape
    if n_reads < 2 or n_bins < 2:
        return df.copy()

    observed = df.notna()
    bin_ok = observed.mean(axis=0) >= float(min_bin_non_nan_frac)
    read_ok = observed.mean(axis=1) >= float(min_read_non_nan_frac)

    return df.loc[read_ok, bin_ok].copy()


def masked_pearson_correlation_matrix(
    df: pd.DataFrame,
    min_overlap: int = 30
) -> pd.DataFrame:
    """
    Pearson correlation between every pair of reads (rows of ``df``), using
    only the bins that are non-NaN in both reads.

    Returns an (N x N) DataFrame labelled by ``df.index``; a pair sharing
    fewer than ``min_overlap`` bins gets NaN.
    """
    # DataFrame.corr works column-wise, so put the reads on the columns first.
    reads_as_columns = df.transpose()
    return reads_as_columns.corr(method='pearson', min_periods=int(min_overlap))


def overlap_matrix(df: pd.DataFrame) -> np.ndarray:
    """
    Compute the number of shared non-NaN bins for each read pair.

    Args:
        df: rows = reads, columns = bins; NaN marks an unobserved bin.

    Returns:
        (N x N) integer array; entry [i, j] counts the bins observed in both
        read i and read j (the diagonal holds each read's own observed count).
    """
    # Cast the observation mask to integers before multiplying: a matrix
    # product of two boolean arrays stays boolean ("any shared bin") instead
    # of counting the shared bins.
    observed = df.notna().to_numpy(dtype=bool).astype(np.int64)  # shape (N, D)
    return observed @ observed.T


def build_masked_corr_knn_graph(
    df: pd.DataFrame,
    k: int = 15,
    min_overlap: int = 30,
    positive_only: bool = True,
    shrink_c: float = 30.0,  # overlap-based shrink: w *= n / (n + shrink_c)
    mutual: bool = True
):
    """
    Turn masked Pearson correlations between reads into a weighted kNN graph.

    Args:
        df: rows = reads, columns = bins, values = methylation fractions (NaN allowed).
        k: maximum number of neighbours kept per read.
        min_overlap: pairs sharing fewer bins than this get weight 0.
        positive_only: clip negative correlations to 0. Only strictly positive
            weights are ever selected as neighbours, so negative correlations
            never produce an edge either way.
        shrink_c: if not None and > 0, weights are multiplied by
            overlap / (overlap + shrink_c).
        mutual: keep an edge only when each read is in the other's top-k list;
            otherwise keep every top-k edge.

    Returns:
        igraph.Graph (undirected, edge attribute "weight"), the weight list in
        edge order, and the neighbour index list per node.

    Raises:
        ValueError: if ``df`` has fewer than 2 rows.
    """
    n_reads = df.shape[0]
    if n_reads < 2:
        raise ValueError("Need at least 2 reads to build a graph.")

    # 1) Pairwise-complete correlation and shared-bin counts, both (N x N)
    corr = masked_pearson_correlation_matrix(df, min_overlap=min_overlap)
    shared = overlap_matrix(df)

    # 2) Similarity weights: correlation where the overlap is sufficient, else 0
    sim = np.where(shared >= int(min_overlap), corr.to_numpy(), 0.0)
    if positive_only:
        sim = np.maximum(sim, 0.0)
    if shrink_c is not None and shrink_c > 0:
        # Softly down-weight pairs supported by few shared bins.
        sim = sim * (shared / (shared + float(shrink_c)))
    np.fill_diagonal(sim, 0.0)  # no self-edges

    # 3) Per read: up to k strongest strictly-positive neighbours
    neighbors, weights = [], []
    for row in sim:
        ranked = np.argsort(row)[::-1]          # strongest first
        ranked = ranked[row[ranked] > 0]        # zeros (and NaN) never qualify
        nearest = ranked[:k]
        neighbors.append(nearest.tolist())
        weights.append(row[nearest].tolist())

    # 4) Undirected edge table keyed by (smaller index, larger index)
    edges = {}
    if mutual:
        member_of = [set(ns) for ns in neighbors]
        for i, ns in enumerate(neighbors):
            for j in ns:
                if i not in member_of[j]:
                    continue  # j does not list i back: not a mutual pair
                w = max(sim[i, j], sim[j, i])
                if w > 0:
                    pair = (min(i, j), max(i, j))
                    edges[pair] = max(edges.get(pair, 0.0), w)
    else:
        for i, (ns, ws) in enumerate(zip(neighbors, weights)):
            for j, w in zip(ns, ws):
                pair = (min(i, j), max(i, j))
                edges[pair] = max(edges.get(pair, 0.0), w)

    e_list = list(edges)
    w_list = list(edges.values())

    g = ig.Graph(n=n_reads, edges=e_list, directed=False)
    g.es["weight"] = w_list

    return g, w_list, neighbors


def alternate_clustering(
    df: pd.DataFrame,
    apply_filter: bool = True,
    min_bin_non_nan_frac: float = 0.2,
    min_read_non_nan_frac: float = 0.5,
    transform: str | None = None,  # None | 'arcsine' | 'logit'
    min_overlap: int = 30,
    k: int = 15,
    positive_only: bool = True,
    shrink_c: float = 30.0,
    mutual: bool = True,
    leiden_resolution: float = 1.0,
    seed: int = 42
):
    """
    Pipeline: Masked Pearson correlation (no imputation)
    -> weighted mutual kNN graph -> Leiden clustering.

    Args:
        df: rows = reads, columns = bins, values in [0,1] with NaNs allowed.

    Returns:
        clusters (np.ndarray of shape (N,)),
        graph (igraph.Graph),
        df_used (pd.DataFrame after optional filtering/transform)
    """
    if apply_filter:
        df_used = filter_by_missingness(
            df,
            min_bin_non_nan_frac=min_bin_non_nan_frac,
            min_read_non_nan_frac=min_read_non_nan_frac
        )
    else:
        df_used = df.copy()

    if df_used.shape[0] < 2 or df_used.shape[1] < 2:
        raise ValueError("Not enough reads or bins after filtering.")

    # Optional transform for proportions
    if transform is not None:
        if transform == 'arcsine':
            arr = np.arcsin(np.sqrt(np.clip(df_used.to_numpy(float), 0.0, 1.0)))
            df_used = pd.DataFrame(arr, index=df_used.index, columns=df_used.columns)
        elif transform == 'logit':
            Xc = np.clip(df_used.to_numpy(float), 1e-3, 1 - 1e-3)
            arr = np.log(Xc / (1 - Xc))
            df_used = pd.DataFrame(arr, index=df_used.index, columns=df_used.columns)
        else:
            raise ValueError("transform must be None, 'arcsine', or 'logit'")

    # Build graph from masked correlation
    g, w_list, _ = build_masked_corr_knn_graph(
        df_used,
        k=k,
        min_overlap=min_overlap,
        positive_only=positive_only,
        shrink_c=shrink_c,
        mutual=mutual
    )

    # Leiden clustering
    part = la.find_partition(
        g,
        la.RBConfigurationVertexPartition,
        weights=g.es["weight"],
        resolution_parameter=float(leiden_resolution),
        seed=int(seed)
    )

    clusters = np.array(part.membership, dtype=int)
    return clusters, g, df_used


In [None]:
# weighting_mapping.py

# ------------------------------
# Column coercion / sorting util
# ------------------------------
def _coerce_sort(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalise bin columns: strip and coerce the labels to int, collapse
    duplicate labels by their mean, and sort the columns ascending.

    Returns a new DataFrame; ``df`` is not modified.

    Raises:
        ValueError: if any column label cannot be parsed as a number.
    """
    out = df.copy()
    cols = pd.Series(out.columns, dtype=str).str.strip()
    nums = pd.to_numeric(cols, errors='coerce')
    if nums.isna().any():
        bad = cols[nums.isna()].tolist()
        raise ValueError(f"Non-numeric bin columns found: {bad}")
    out.columns = nums.astype(int).to_numpy()
    if out.columns.duplicated().any():
        # Average columns sharing a label. Done through a transpose because
        # DataFrame.groupby(axis=1) is deprecated in recent pandas releases.
        out = out.T.groupby(level=0).mean().T
    return out.sort_index(axis=1)


# ------------------------------
# Build positional weights
# ------------------------------
# def build_positional_weights(
#     columns,                      # iterable of bin positions (e.g., DataFrame.columns)
#     window_bp: int = 500,         # half-window around center to emphasize (e.g., +/- 500 bp)
#     center: int = 0,              # Pol2 position (0 if already centered)
#     mode: str = "gaussian",       # 'gaussian' or 'box'
#     inside_weight: float = 1.0,   # weight inside window (for 'box')
#     outside_weight: float = 0.2,  # weight outside window (for 'box')
#     sigma_bp: float | None = None,# Gaussian sigma in bp (defaults to window_bp/2)
#     normalize: bool = True        # normalize to mean ~ 1.0
# ) -> pd.Series:
#     """
#     Create a positional weight vector indexed by columns (positions), emphasizing +/- window_bp around center.
#     Returns a pd.Series aligned to the provided columns.
#     """
#     pos = pd.to_numeric(pd.Series(list(columns), dtype=str).str.strip(), errors="coerce").astype(int)
#     if mode == "gaussian":
#         # Gaussian weighting: exp(- (pos - center)^2 / (2 * sigma^2))
#         sigma = (window_bp / 2.0) if sigma_bp is None else float(sigma_bp)
#         w = np.exp(-((pos - center) ** 2) / (2.0 * sigma ** 2))
#         w = outside_weight + (inside_weight - outside_weight) * w
#     elif mode == "box":
#         w = np.where(np.abs(pos - center) <= window_bp, inside_weight, outside_weight)
#     else:
#         raise ValueError("mode must be 'gaussian' or 'box'")

#     if normalize:
#         mean_val = np.mean(w) if np.mean(w) > 0 else 1.0
#         w = w / mean_val
#     return pd.Series(w, index=pos)


# ---------------------------------------------------------
# 1. Masked cosine similarity with NaN handling (intersection)
# ---------------------------------------------------------
def masked_cosine_cross(
    G_df: pd.DataFrame,
    P_df: pd.DataFrame,
    min_overlap: int = 10,
    center_rows: bool = True,
    weights=None  # pd.Series indexed by columns (positions), or array-like
) -> np.ndarray:
    """
    Compute m x p cosine similarities between rows of G and rows of P.

    Columns of both frames are normalised with ``_coerce_sort`` and restricted
    to their intersection. For every row pair only the bins that are finite in
    both rows are used; pairs sharing fewer than ``min_overlap`` such bins get
    similarity 0.

    Args:
        G_df, P_df: rows = profiles, columns = bin positions.
        min_overlap: minimum number of jointly observed bins per pair.
        center_rows: subtract each row's mean over the shared bins first.
        weights: optional positional weights; columns are scaled by
            sqrt(weight), negative weights are treated as 0.
            - pd.Series: looked up by bin position; missing bins get 1.0.
            - array-like with one entry per shared bin: used positionally.
            - array-like of any other length: must have one entry per column
              of ``P_df``; it is aligned through the same label coercion as
              the data, and missing bins get 1.0.

    Returns:
        np.ndarray of shape (m, p).

    Raises:
        ValueError: if G and P share no bin.
    """
    Gc = _coerce_sort(G_df)
    Pc = _coerce_sort(P_df)

    common = np.intersect1d(Gc.columns.values, Pc.columns.values)
    if common.size == 0:
        raise ValueError("No overlapping bins between G and P.")
    Gc = Gc.reindex(columns=common)
    Pc = Pc.reindex(columns=common)

    # Apply sqrt(weights) as column scaling
    if weights is not None:
        if isinstance(weights, pd.Series):
            # ensure constant weights where missing
            w_common = weights.reindex(common).astype(float).fillna(1.0).to_numpy()
        else:
            w = np.asarray(weights, float)
            if w.shape[0] != common.size:
                # One weight per original P_df column. Coerce those labels the
                # way _coerce_sort does so string labels such as '10' still
                # match the integer bins in `common`.
                labels = pd.to_numeric(
                    pd.Series(P_df.columns, dtype=str).str.strip(), errors='coerce'
                ).astype(int)
                w_by_bin = pd.Series(w, index=labels.to_numpy())
                if w_by_bin.index.duplicated().any():
                    # Duplicate bins were averaged in the data; do the same here.
                    w_by_bin = w_by_bin.groupby(level=0).mean()
                w_common = w_by_bin.reindex(common).fillna(1.0).to_numpy()
            else:
                w_common = w
        sw = np.sqrt(np.maximum(w_common, 0.0))
        Gc = Gc.mul(sw, axis=1)
        Pc = Pc.mul(sw, axis=1)

    G = Gc.to_numpy(float)
    P = Pc.to_numpy(float)
    m = G.shape[0]
    p = P.shape[0]
    S = np.zeros((m, p), float)

    G_mask = np.isfinite(G)
    P_mask = np.isfinite(P)
    need = int(min_overlap)

    for i in range(m):
        gi, mi = G[i], G_mask[i]
        for j in range(p):
            ov = mi & P_mask[j]
            if int(ov.sum()) < need:
                continue  # S[i, j] stays 0.0
            # Boolean indexing copies, so centering never touches G or P.
            vi, vj = gi[ov], P[j][ov]
            if center_rows:
                vi = vi - vi.mean()
                vj = vj - vj.mean()
            denom = np.linalg.norm(vi) * np.linalg.norm(vj)
            if denom > 0:
                S[i, j] = float(np.dot(vi, vj) / denom)

    return S


# ------------------------------------------
# 2. Pearson similarity (weighted and naive)
# ------------------------------------------
def _zscore_rows(M: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    """
    Standardise every row to zero mean and unit standard deviation, ignoring NaNs.

    Standard deviations below ``eps`` are replaced by ``eps`` so (near-)constant
    rows do not divide by zero. NaN entries stay NaN.
    """
    values = np.asarray(M, float)
    row_mean = np.nanmean(values, axis=1, keepdims=True)
    row_std = np.nanstd(values, axis=1, keepdims=True)
    row_std[row_std < eps] = eps
    return (values - row_mean) / row_std

def _pairwise_pearson(G: np.ndarray, P: np.ndarray, min_overlap: int = 2) -> np.ndarray:
    """
    Pearson correlation between every row of G and every row of P.

    For each pair only positions finite in both rows are used; pairs with
    fewer than ``min_overlap`` such positions get 0. A row that is constant on
    the shared positions is only centred (not scaled), which yields 0.

    Returns an array of shape (rows of G, rows of P).
    """
    G = np.asarray(G, float)
    P = np.asarray(P, float)
    n_g, _ = G.shape
    n_p = P.shape[0]
    out = np.zeros((n_g, n_p), float)

    g_finite = np.isfinite(G)
    p_finite = np.isfinite(P)
    required = int(min_overlap)

    for i, j in np.ndindex(n_g, n_p):
        both = g_finite[i] & p_finite[j]
        n_shared = both.sum()
        if n_shared < required:
            continue  # entry keeps its initial 0.0

        x, y = G[i][both], P[j][both]
        x_sd, y_sd = x.std(), y.std()
        x = (x - x.mean()) / (x_sd if x_sd > 0 else 1.0)
        y = (y - y.mean()) / (y_sd if y_sd > 0 else 1.0)
        out[i, j] = float(np.dot(x, y) / n_shared)

    return out

def _pairwise_weighted_pearson_df(
    G_df: pd.DataFrame, P_df: pd.DataFrame, weights: pd.Series, min_overlap: int = 2
) -> np.ndarray:
    """
    Weighted Pearson correlation between rows of G_df and P_df using per-bin weights.

    Columns of both frames are normalised with ``_coerce_sort`` and restricted
    to their intersection. For every row pair only bins finite in both rows
    are used; pairs with fewer than ``min_overlap`` such bins get 0.

    Args:
        G_df, P_df: rows = profiles, columns = bin positions.
        weights: pd.Series indexed by bin position, or None for uniform
            weights. Bins absent from the Series (or holding NaN) get weight
            1.0, matching ``masked_cosine_cross``; negative weights count as 0.
        min_overlap: minimum number of jointly observed bins per pair.

    Returns:
        np.ndarray of shape (rows of G_df, rows of P_df). Entries are 0 when
        the total weight or either weighted variance is not positive.

    Raises:
        ValueError: if G and P share no bin.
    """
    Gc = _coerce_sort(G_df)
    Pc = _coerce_sort(P_df)
    common = np.intersect1d(Gc.columns.values, Pc.columns.values)
    if common.size == 0:
        raise ValueError("No overlapping bins between G and P.")
    Gc = Gc.reindex(columns=common)
    Pc = Pc.reindex(columns=common)
    if weights is None:
        w = np.ones(common.size, float)
    else:
        # Fill bins without a weight with 1.0: a NaN weight would otherwise
        # propagate through every sum below and turn the whole entry into NaN.
        w = weights.reindex(common).astype(float).fillna(1.0).to_numpy()
        w = np.where(w < 0, 0.0, w)

    G = Gc.to_numpy(float)
    P = Pc.to_numpy(float)
    m = G.shape[0]
    p = P.shape[0]
    S = np.zeros((m, p), float)

    G_mask = np.isfinite(G)
    P_mask = np.isfinite(P)
    need = int(min_overlap)

    for i in range(m):
        gi, mi = G[i], G_mask[i]
        for j in range(p):
            mask = mi & P_mask[j]
            if mask.sum() < need:
                continue  # S[i, j] stays 0.0
            wi = w[mask]
            xi = gi[mask]
            yj = P[j][mask]
            W = wi.sum()
            if W <= 0:
                continue  # every shared bin has zero weight
            # Weighted means, variances and covariance over the shared bins.
            mu_x = (wi * xi).sum() / W
            mu_y = (wi * yj).sum() / W
            var_x = (wi * (xi - mu_x) ** 2).sum() / W
            var_y = (wi * (yj - mu_y) ** 2).sum() / W
            denom = np.sqrt(var_x * var_y)
            if denom <= 0:
                continue  # one of the rows is constant under these weights
            cov_xy = (wi * (xi - mu_x) * (yj - mu_y)).sum() / W
            S[i, j] = float(cov_xy / denom)

    return S


# ------------------------------------------
# 3. Unified similarity API (cosine/pearson)
# ------------------------------------------
def compute_similarity_matrix(
    G,
    P,
    method: str = 'pearson',
    use_abs: bool = False,
    center_rows: bool = True,
    weights=None,            # pd.Series (positional) or array-like; for cosine and weighted pearson
    min_overlap: int = 10
):
    """
    Compute similarity S and distance D between rows of G and rows of P.

    Parameters
    ----------
    G, P : pandas.DataFrame or array-like
        Row-wise profiles. For 'cosine' both must be DataFrames.
    method : {'pearson', 'cosine'}
        - 'pearson': if G and P are DataFrames and `weights` is a Series, the
          weighted Pearson helper is used. Otherwise unweighted Pearson is
          computed and `weights` is ignored; when both inputs are DataFrames
          their columns are first aligned to the intersection.
        - 'cosine' : masked cosine via `masked_cosine_cross`.
    use_abs : bool
        If True, S is replaced by |S| before the distance is derived.
    center_rows : bool
        Forwarded to `masked_cosine_cross` (cosine only).
    weights : pandas.Series, array-like or None
        Forwarded to the weighted Pearson / masked cosine helpers.
    min_overlap : int
        Forwarded to the pairwise helpers.

    Returns
    -------
    S : numpy.ndarray
        Similarity matrix (rows of G x rows of P).
    D : numpy.ndarray
        1 - S, with negative entries set to 0.
    prot_ids : list
        P.index when P is a DataFrame, otherwise 0..n_rows(P)-1.

    Raises
    ------
    ValueError
        If `method` is unknown, or 'cosine' is requested with non-DataFrame input.
    """
    if method not in ('pearson', 'cosine'):
        raise ValueError("method must be 'pearson' or 'cosine'.")

    g_is_df = isinstance(G, pd.DataFrame)
    p_is_df = isinstance(P, pd.DataFrame)
    both_df = g_is_df and p_is_df

    if method == 'cosine':
        if not both_df:
            raise ValueError("For masked cosine, pass G and P as DataFrames (columns = positions).")
        S = masked_cosine_cross(G_df=G, P_df=P, min_overlap=min_overlap, center_rows=center_rows, weights=weights)
    elif both_df and isinstance(weights, pd.Series):
        S = _pairwise_weighted_pearson_df(G_df=G, P_df=P, weights=weights, min_overlap=min_overlap)
        S = np.nan_to_num(S, nan=0.0)
    else:
        if both_df:
            # Coerce/sort each frame once, then align both to their shared columns.
            Gc = _coerce_sort(G)
            Pc = _coerce_sort(P)
            common = np.intersect1d(Gc.columns.values, Pc.columns.values)
            G_arr = Gc.reindex(columns=common).to_numpy(float)
            P_arr = Pc.reindex(columns=common).to_numpy(float)
        else:
            # Mixed / array input: no column alignment is possible here.
            G_arr = _coerce_sort(G).to_numpy(float) if g_is_df else np.asarray(G, float)
            P_arr = _coerce_sort(P).to_numpy(float) if p_is_df else np.asarray(P, float)
        S = _pairwise_pearson(_zscore_rows(G_arr), _zscore_rows(P_arr), min_overlap=min_overlap)
        S = np.nan_to_num(S, nan=0.0)

    if use_abs:
        S = np.abs(S)
    D = 1.0 - S
    D[D < 0] = 0.0
    prot_ids = list(P.index) if p_is_df else list(range(np.asarray(P).shape[0]))
    return S, D, prot_ids


# -----------------------------------------------------
# 4. Hungarian assignment (one-to-one per gene)
# -----------------------------------------------------
def assign_gene_clusters_to_consensus(
    G,
    meta: pd.DataFrame,
    P,
    method: str = 'pearson',
    use_abs: bool = False,
    center_rows: bool = True,
    weights=None,
    min_overlap: int = 10,
    min_similarity: float | None = None,
    allow_unassigned: bool = True,
    capacity_per_type: int = 1
):
    """
    Map gene clusters (rows of G) to bulk prototypes (rows of P), gene by gene,
    with the Hungarian algorithm on the distance matrix 1 - similarity.

    Parameters
    ----------
    G, P, method, use_abs, center_rows, weights, min_overlap
        Forwarded to `compute_similarity_matrix`.
    meta : pandas.DataFrame
        One row per row of G, in the same order. Must contain 'gene_id' or
        'gene_tss' (the former wins if both exist) and 'cluster_id'; 'size' is
        optional (NaN when absent). Its index is ignored: rows are matched to
        G by position.
    min_similarity : float or None
        With `allow_unassigned`, the cost of leaving a cluster unassigned is
        1 - min_similarity. When None, the cost is the largest distance of the
        gene's block plus 0.05.
    allow_unassigned : bool
        If True, one dummy "unassigned" column per cluster is appended to the
        cost matrix.
    capacity_per_type : int
        How many clusters of one gene may share the same prototype.

    Returns
    -------
    assign_df : pandas.DataFrame
        Columns: <gene column>, 'cluster_id', 'size', 'consensus_id' (None when
        unassigned), 'similarity', 'second_best_similarity', 'margin'
        (second-best cost minus assigned cost).
    S : numpy.ndarray
        Full similarity matrix.
    D_base : numpy.ndarray
        Full distance matrix.

    Raises
    ------
    ValueError
        If meta has no gene column or its length differs from the rows of G.
    """
    S, D_base, prot_ids = compute_similarity_matrix(
        G=G, P=P, method=method, use_abs=use_abs,
        center_rows=center_rows, weights=weights, min_overlap=min_overlap
    )

    # Rows of meta correspond to rows of G by position, so drop whatever index
    # meta carries: group labels below are then valid row positions of D_base.
    meta_df = meta.reset_index(drop=True)
    if len(meta_df) != D_base.shape[0]:
        raise ValueError(
            f"meta has {len(meta_df)} rows but G has {D_base.shape[0]}; they must be aligned row by row."
        )
    if 'size' not in meta_df.columns:
        meta_df['size'] = np.nan
    gene_col = 'gene_id' if 'gene_id' in meta_df.columns else ('gene_tss' if 'gene_tss' in meta_df.columns else None)
    if gene_col is None:
        raise ValueError("meta must contain 'gene_id' or 'gene_tss'.")

    capacity = max(int(capacity_per_type), 1)

    assignments = []
    for gene_id, labels in meta_df.groupby(gene_col).groups.items():
        rows = np.asarray(labels, dtype=int)      # global row positions of this gene
        D = D_base[rows, :]
        n_prot = D.shape[1]

        # np.tile lays the copies out as [0..n-1, 0..n-1, ...]; the column ->
        # prototype map must use the same layout (np.repeat would not).
        col_map = np.tile(np.arange(n_prot), capacity)
        D_aug = np.tile(D, reps=(1, capacity))

        if allow_unassigned:
            if min_similarity is not None:
                penalty = 1.0 - float(min_similarity)
            elif D_aug.size:
                penalty = float(np.nanmax(D_aug) + 0.05)
            else:
                penalty = 0.05
            n_rows = D_aug.shape[0]
            dummy = np.full((n_rows, n_rows), penalty, dtype=float)
            D_final = np.hstack([D_aug, dummy])
            col_map = np.concatenate([col_map, np.full(n_rows, -1, dtype=int)])  # -1 = unassigned
        else:
            D_final = D_aug

        r_idx, c_idx = linear_sum_assignment(D_final)

        for r, c in zip(r_idx, c_idx):
            cons_col = int(col_map[c])
            best_cost = D_final[r, c]
            row_costs = D_final[r]
            if cons_col >= 0:
                # Capacity copies of the chosen prototype are not an alternative.
                alternatives = col_map != cons_col
            else:
                alternatives = np.arange(row_costs.size) != c
            second_best_cost = np.min(row_costs[alternatives]) if alternatives.any() else np.nan
            has_second = bool(np.isfinite(second_best_cost))

            # `r` indexes the gene's sub-matrix; translate it back to the
            # global row before reading this cluster's metadata.
            meta_row = meta_df.iloc[rows[r]]
            assignments.append({
                gene_col: gene_id,
                'cluster_id': meta_row['cluster_id'],
                'size': meta_row['size'],
                'consensus_id': None if cons_col < 0 else prot_ids[cons_col],
                'similarity': float(1.0 - best_cost),
                'second_best_similarity': float(1.0 - second_best_cost) if has_second else np.nan,
                'margin': float(second_best_cost - best_cost) if has_second else np.nan
            })

    assign_df = pd.DataFrame(
        assignments,
        columns=[gene_col, 'cluster_id', 'size', 'consensus_id',
                 'similarity', 'second_best_similarity', 'margin']
    )
    return assign_df, S, D_base



In [None]:
def plot_gene_mapping_heatmap(
    assign_df: pd.DataFrame,
    S: np.ndarray,                  # full similarity matrix returned by assign_gene_clusters_to_consensus
    meta: pd.DataFrame,             # meta passed to assign (aligned to rows of G)
    P,                              # bulk prototypes (DataFrame for cosine, array for pearson)
    gene_id: str,
    sort_rows: bool = True,         # sort by assigned prototype then by max similarity
    cmap: str = "viridis",
    vmin: float = 0.0,
    vmax: float = 1.0,
):
    """
    Plot a heatmap of similarities (rows = gene clusters, cols = bulk prototypes)
    for a single gene and overlay markers for the Hungarian assignment.

    Parameters
    ----------
    assign_df : pandas.DataFrame
        Assignment table with a gene column ('gene_tss' or 'gene_id'),
        'cluster_id' and 'consensus_id'.
    S : numpy.ndarray
        Full similarity matrix; row k of S corresponds to the k-th row of
        `meta` (by position, whatever index `meta` carries).
    meta : pandas.DataFrame
        Per-cluster metadata with the gene column and 'cluster_id'.
    P : pandas.DataFrame or numpy.ndarray
        Bulk prototypes; column labels of the heatmap are P.index for a
        DataFrame and 0..P.shape[0]-1 for an array.
    gene_id : str
        Gene whose clusters are shown.
    sort_rows : bool
        If True, rows are ordered by assigned prototype column (unassigned
        first), then by decreasing maximum similarity.
    cmap, vmin, vmax
        Forwarded to `seaborn.heatmap`.

    Returns
    -------
    (fig, ax)

    Raises
    ------
    ValueError
        If `meta` has no row for `gene_id`.
    """
    # Prototype IDs (columns of S)
    prot_ids = list(P.index) if isinstance(P, pd.DataFrame) else list(range(P.shape[0]))

    # The assignment step names its gene column 'gene_id' or 'gene_tss'; accept both.
    assign_gene_col = "gene_tss" if "gene_tss" in assign_df.columns else "gene_id"
    if assign_gene_col in meta.columns:
        meta_gene_col = assign_gene_col
    else:
        meta_gene_col = "gene_tss" if "gene_tss" in meta.columns else "gene_id"

    # Row POSITIONS (not index labels) of the selected gene: S is a plain array.
    row_pos = np.flatnonzero((meta[meta_gene_col] == gene_id).to_numpy())
    if row_pos.size == 0:
        raise ValueError(f"No rows in meta for gene_id={gene_id}.")

    S_sub = S[row_pos, :]
    row_labels = meta["cluster_id"].iloc[row_pos].astype(int).tolist()
    col_labels = prot_ids

    # First column position of every prototype label.
    col_index = {}
    for j, label in enumerate(col_labels):
        col_index.setdefault(label, j)

    ass_sub = assign_df[assign_df[assign_gene_col] == gene_id]
    assigned_pairs = [(int(cid), cons) for cid, cons in zip(ass_sub["cluster_id"], ass_sub["consensus_id"])]
    ass_map = dict(assigned_pairs)  # cluster_id -> assigned consensus_id

    if sort_rows:
        def _row_key(i):
            j_ass = col_index.get(ass_map.get(row_labels[i]), -1)
            row = S_sub[i]
            # An all-NaN row must not inject NaN into the sort key.
            best = float(np.nanmax(row)) if (~np.isnan(row)).any() else float("-inf")
            # negative so that higher similarity sorts earlier within a group
            return (j_ass, -best)

        order = sorted(range(len(row_labels)), key=_row_key)
        S_plot = S_sub[order, :]
        row_labels_plot = [row_labels[i] for i in order]
    else:
        S_plot = S_sub
        row_labels_plot = row_labels

    # Create heatmap
    fig, ax = plt.subplots(
        figsize=(1.0 + 0.4 * len(col_labels), 0.6 + 0.4 * len(row_labels_plot))
    )
    sns.heatmap(
        S_plot,
        xticklabels=col_labels,
        yticklabels=row_labels_plot,
        cmap=cmap,
        vmin=vmin,
        vmax=vmax,
        cbar_kws={"label": "similarity"},
        ax=ax,
    )
    ax.set_title(f"Mapping of {gene_id} clusters to bulk prototypes")
    ax.set_xlabel("bulk consensus ID")
    ax.set_ylabel("gene cluster ID")

    # Overlay assignment markers at the centre of each assigned cell.
    row_index_map = {cid: i for i, cid in enumerate(row_labels_plot)}
    for cid, cons_id in assigned_pairs:
        i = row_index_map.get(cid)
        j = col_index.get(cons_id)
        if i is None or j is None:   # unassigned cluster or unknown prototype
            continue
        ax.scatter(j + 0.5, i + 0.5, s=60, facecolors="none", edgecolors="white", linewidths=1.5)

    fig.tight_layout()
    return fig, ax


def plot_assignment_summary(assign_df: pd.DataFrame):
    """
    Bar chart: number of gene clusters assigned to each bulk prototype (across all genes).

    Rows whose 'consensus_id' is missing (unassigned clusters) are dropped
    before counting. When nothing is assigned, an empty axes carrying a short
    message is returned instead of attempting a bar plot of an empty Series.

    Returns
    -------
    (fig, ax)
    """
    assigned = assign_df.dropna(subset=["consensus_id"])
    counts = assigned.groupby("consensus_id").size().sort_values(ascending=False)

    fig, ax = plt.subplots(figsize=(8, 3))
    if counts.empty:
        ax.text(0.5, 0.5, "no assigned clusters", ha="center", va="center", transform=ax.transAxes)
    else:
        counts.plot(kind="bar", ax=ax, color="#4c72b0")
    ax.set_ylabel("# gene clusters assigned")
    ax.set_xlabel("bulk consensus ID")
    ax.set_title("Assignment summary across genes")
    fig.tight_layout()
    return fig, ax


def recolor_meta_by_bulk(assign_df, meta_df, bulk_color_map, gene_id):
    """
    Return a copy of meta_df whose 'cluster_color' is replaced, for the clusters
    of `gene_id`, by the colour of the assigned bulk prototype.

    Parameters
    ----------
    assign_df : pandas.DataFrame
        Assignment table with a gene column ('gene_tss' or 'gene_id'),
        'cluster_id' and 'consensus_id'.
    meta_df : pandas.DataFrame
        Cluster metadata with 'cluster_id' and, optionally, 'cluster_color'.
        If it also carries the gene column, only the rows of `gene_id` are
        recoloured; otherwise every row is assumed to belong to `gene_id`.
    bulk_color_map : dict
        consensus_id -> colour.
    gene_id
        Gene whose assignments are applied.

    Returns
    -------
    pandas.DataFrame
        Copy of meta_df. A row keeps its original colour (None when there is no
        'cluster_color' column) if its cluster is unassigned or its prototype
        has no entry in `bulk_color_map`. meta_df itself is not modified.
    """
    gene_col = "gene_tss" if "gene_tss" in assign_df.columns else "gene_id"
    ass_sub = assign_df[assign_df[gene_col] == gene_id]
    ass_map = {int(cid): cons for cid, cons in zip(ass_sub["cluster_id"], ass_sub["consensus_id"])}

    m = meta_df.copy()
    if gene_col in m.columns:
        # Cluster ids are only unique within a gene: never recolour other genes' rows.
        is_target = (m[gene_col] == gene_id).tolist()
    else:
        is_target = [True] * len(m)
    if "cluster_color" in m.columns:
        original_colors = m["cluster_color"].tolist()
    else:
        original_colors = [None] * len(m)

    colors = []
    for target, cid, fallback in zip(is_target, m["cluster_id"].tolist(), original_colors):
        if target:
            cons = ass_map.get(int(cid), None)
            colors.append(bulk_color_map.get(cons, fallback))
        else:
            colors.append(fallback)
    return m.assign(cluster_color=colors)


def plot_assignment_confidence(assign_df: pd.DataFrame, gene_id: str | None = None):
    """
    Scatter plot of the assigned (best) similarity against the assignment margin.

    The 'margin' column produced by `assign_gene_clusters_to_consensus` is
    second-best cost minus assigned cost, i.e. assigned similarity minus
    second-best similarity. A small margin means an ambiguous assignment.

    Parameters
    ----------
    assign_df : pandas.DataFrame
        Assignment table with 'similarity' and 'margin'; a gene column
        ('gene_tss' or 'gene_id') is needed only when `gene_id` is given.
    gene_id : str or None
        If given, only that gene's assignments are plotted.

    Returns
    -------
    (fig, ax)
    """
    df = assign_df
    if gene_id is not None:
        gene_col = "gene_tss" if "gene_tss" in assign_df.columns else "gene_id"
        df = df[df[gene_col] == gene_id]

    fig, ax = plt.subplots(figsize=(6, 4))
    ax.scatter(df["similarity"], df["margin"], s=30, alpha=0.7)
    ax.set_xlabel("best similarity")
    ax.set_ylabel("margin (best − second-best similarity)")
    ax.set_title("Assignment confidence" + (f" ({gene_id})" if gene_id else ""))
    ax.grid(alpha=0.3)
    fig.tight_layout()
    return fig, ax


In [None]:
# Load (or reload) IPython's autoreload extension and use mode 2, which
# re-imports all modules before each cell is executed.
%reload_ext autoreload
%autoreload 2

In [None]:
import os
import sys

# Absolute path of the "scripts" folder that sits one level above the current
# working directory (note the ../); registered once on the module search path.
scripts_dir = os.path.abspath("../scripts")
if sys.path.count(scripts_dir) == 0:
    sys.path.append(scripts_dir)


In [None]:
from functions_needed_for_pipeline import (calculate_pol2_position, 
    preprocess_dataframe, 
    filter_reads_per_gene_middle_bin_name, 
    bin_then_matrix, 
    process_into_matrix, 
    handling_NaN, 
    preprocess_long_for_plot, 
    plot_reads_long, 
    get_genes_list, 
    get_groups_df_list,
    clustering_final, 
    compute_cluster_metrics,
    plot_umap, 
    dict_id_cluster_color, 
    dict_to_df, 
    merge, 
    start_end_center, 
    summary, 
    compute_bulk_centroids, 
    compute_gene_centroids, 
    plot_centroids_with_shading,
    build_positional_weights
)

#### Opening the text document that contains the column descriptions

In [None]:
# Read the whole column-description text file into one string and show it.
with open("../data/gc_peak2_all_colnames.txt", "r") as f:
    description_colnames = f.read()

print(description_colnames)  

#### Useful hard-coded variables:
- `treatment_groups` list
- `hex_colors` list

In [None]:
## LIST CONTAINING THE GROUP NAMES
treatment_groups = [
        'BT474_mV_72hNHWD',
        'BT474_mV_72hSTC15',
        'BT474_mV_high_Untreated',
        'BT474_mV_low_Untreated',
        'BT474_mV_Untreated_Unsorted']

## ARBITRARILY CHOSEN COLORS FOR THE CLUSTERS (THEY NEED TO HAVE A HIGH VALUE AND CHROMA TO STAND OUT ON A GREY BACKGROUND)
## 26 hex colors in total; passed below as `hex_colors` / `palette` to the plotting helpers.
hex_colors = [
    "#0e67a7", "#ff7f0e", "#a0e468", "#d62728", "#9467bd",
    "#672417", "#e377c2", "#f5f523", "#28e0f5", "#3214a8",
    "#ca9d16", "#04a887", "#8c564b", "#17becf", "#bcbd22",
    "#2ca02c", "#1f77b4", "#ff9896", "#c5b0d5", "#98df8a",
    "#ffbb78", "#aec7e8", "#7f7f7f", "#c49c94", "#dbdb8d",
    "#9edae5"
]

##### Opening the data file
The dataframe is named `df`.

In [None]:
# Load the main data table. Fields are separated by runs of whitespace:
# sep=r"\s+" is the documented equivalent of delim_whitespace=True, which
# pandas has deprecated.
df = pd.read_csv("../data/gc_peak2_all.txt", sep=r"\s+")

print(f'The shape of the dataframe is of {df.shape}')
#print(df.dtypes)
#print(df.columns)

df.head()

Loading the bulk dataframe (named `df_bulk`)

In [None]:
# Load the bulk table (whitespace-separated). sep=r"\s+" replaces the
# deprecated delim_whitespace=True.
df_bulk = pd.read_csv("../data/camille_average_profile_pol2_by_cluster.txt", sep=r"\s+")

# Report the shape of the bulk table itself (this previously printed df.shape).
print(f'The shape of the dataframe is of {df_bulk.shape}')
#print(df.dtypes)
#print(df.columns)
# is_constant = df_bulk.groupby('cluster')['cov'].nunique().eq(1)

# print("cov constant per cluster:", is_constant.all())

df_bulk.head()

##### Pipeline


In [None]:
#### BINNING THE BULK DATA TO BE ABLE TO DO REASSIGNMENT

d = bin_then_matrix(
    df_bulk,
    indexes=['cluster'],
    bin_size=50
)

# ---- Mean methylation per (cluster, bin) ----
meth_agg = (
    d.groupby(['cluster', 'bin'], as_index=False)
    .agg(meth_mean=('meth', 'mean'))
)

# ---- Coverage per (cluster, bin) ----
# Take the first cov value of each cluster (assumes cov is constant within a
# cluster) and repeat it for every bin of that cluster.
cov_cluster = d.groupby('cluster')['cov'].first()
cov_rows = []
for cid, sub in meth_agg.groupby('cluster'):
    cov_val = float(cov_cluster.get(cid, np.nan))
    for b in sub['bin'].unique():
        cov_rows.append({'cluster': cid, 'bin': b, 'cov_bin': cov_val})
cov_agg = pd.DataFrame(cov_rows)

# ---- Merge meth and coverage ----
agg = pd.merge(meth_agg, cov_agg, on=['cluster', 'bin'], how='left')

# ---- Pivot to wide profiles (rows = clusters, columns = bins) ----
profiles_df_bulk_binned = agg.pivot(index='cluster', columns='bin', values='meth_mean')
coverage_df_bulk_binned = agg.pivot(index='cluster', columns='bin', values='cov_bin')

# ---- Sort columns and set names ----
positions_bulk_binned = np.array(sorted(profiles_df_bulk_binned.columns.astype(int)), dtype=int)
profiles_df_bulk_binned = profiles_df_bulk_binned.reindex(columns=positions_bulk_binned)
coverage_df_bulk_binned = coverage_df_bulk_binned.reindex(columns=positions_bulk_binned)

profiles_df_bulk_binned.index.name = 'cluster'
profiles_df_bulk_binned.columns = pd.Index(positions_bulk_binned, name='C_pos')
coverage_df_bulk_binned.index.name = 'cluster'
coverage_df_bulk_binned.columns = pd.Index(positions_bulk_binned, name='C_pos')

# ---- Per-cluster metadata: total coverage and its share (in %) of all clusters ----
n_metric = cov_cluster.astype(float)
total_cov = float(cov_cluster.sum())
proportions = (cov_cluster / (total_cov if total_cov > 0 else 1.0)) * 100.0

meta_rows = []
for cid in profiles_df_bulk_binned.index:
    meta_rows.append({
        'cluster_id': cid if isinstance(cid, (int, np.integer)) else str(cid),
        'cov_total': float(n_metric.get(cid, np.nan)),
        'proportion': float(proportions.get(cid, 0.0))
    })
meta_df_bulk_binned = pd.DataFrame(meta_rows).sort_values('cluster_id').reset_index(drop=True)

# Show all four results. Bare names on separate lines only render the last
# one, and a trailing comma turns the line into a throw-away tuple.
display(
    profiles_df_bulk_binned,
    meta_df_bulk_binned,
    coverage_df_bulk_binned,
    positions_bulk_binned,
)


In [None]:
# Frequency of each value in the two strand columns of the raw table.
# NOTE(review): 'motif_stand' may be a misspelling of 'motif_strand' -- confirm
# against the actual column names of df before renaming anything.
print(df['read_strand'].value_counts())
print(df['motif_stand'].value_counts())


In [None]:
## RUNNING THE CODE ON THE ACTUAL DATASET -- applying preprocess_dataframe
# Column names handed to preprocess_dataframe as `columns_to_drop`.
columns_to_drop=['read_strand', 'N', 'n_meth_motif',
       'perc_meth_in_motif','peak_cov', 'peak_meth', 'C_in_motif',
       'n', 'read_meth_mean', 'motif_id', 'gene2', 'bound', 'cov', 'peakid', 'meth_C_in_motif','read_meth_C_in_motif']

# df_proc is the preprocessed table used by the later cells (matrix building, merging with cluster labels).
df_proc = preprocess_dataframe(df, columns_to_drop)

df_proc.head()

In [None]:
# WHOLE PREPROCESSED DF BINNED INTO A METHYLATION MATRIX (needed to filter the genes we want to work on)
# gene=None processes every gene at once; bin_size=50 matches the bin size used for the bulk data.

df_whole_matrix= process_into_matrix(
    df_proc,
    gene=None, #the id taken from the column gene_tss, only if we want to process per gene
    bin_size= 50)

df_whole_matrix.head()

In [None]:
# # WHOLE PREPROCESSED DF THAT IS UNBINNED UNDER METHYLATION MATRIX (i need it plot the average methylation profile unbinned)

# df_whole_matrix_unbinned= process_into_matrix(df_proc,gene=None,bin_size= 1)
# df_whole_matrix_unbinned.head(50)

In [None]:
# FILTERING AND KEEPING ONLY THE GENES THAT ARE SUITABLE (ENOUGH OVERLAPPING READS AND LONG ENOUGH)
# Thresholds passed to the helper: min_reads=50, min_bins=20, and the middle bin is required.

df_filtered, filtered_regions = filter_reads_per_gene_middle_bin_name(
    df_whole_matrix, 
    middle_bin_name=None, 
    min_reads =50, 
    min_bins =20,
    require_middle_bin=True)

# 'gene_tss' is a level of df_filtered's index; count the distinct genes that survived the filter.
print(f'THe shape of the filtered dataframe is of {df_filtered.shape}')
print(f'There is {df_filtered.index.get_level_values("gene_tss").nunique()} different genes in this dataframe')

df_filtered.head()

In [None]:
# GETTING THE FILTERED GENE NAMES AND THEIR SUB-DATAFRAMES
# gene_names and gene_df are indexed with the same position in the gene-selection cell below.
gene_names, gene_df = get_genes_list(df_filtered)
print(gene_names)


# Disabled: same extraction for the unbinned matrix.
# gene_names_unbinned, gene_df_unbinned = get_genes_list(df_whole_matrix_unbinned)
# print(gene_names_unbinned)

In [None]:
# Distinct gene identifiers (index level 'gene_tss') that passed the filtering step.
genes_to_keep = df_filtered.index.get_level_values('gene_tss').unique()

# Disabled: restrict the unbinned matrix to the same genes.
# df_whole_matrix_unbinned = df_whole_matrix_unbinned[df_whole_matrix_unbinned.index.get_level_values('gene_tss').isin(genes_to_keep)]

# df_whole_matrix_unbinned.head()

In [None]:
# SELECTING ONE GENE FROM THE LIST OF FILTERED GENES
# A single index drives both lookups so the gene name and its sub-dataframe can
# never point at different genes.
# NOTE(review): the earlier comment read "for i=19, SETD1A_30958295_30958295",
# but the index actually used is 14 -- confirm which gene is intended.
GENE_INDEX = 14
gene = gene_names[GENE_INDEX]
df_gene = gene_df[GENE_INDEX]
print(gene)


# df_gene_unbinned = gene_df_unbinned[80]
# print(df_gene_unbinned)

In [None]:
#PLotting for a gene - no color, whole gene, all reads

# df_to_plot= preprocess_long_for_plot(df_proc)
# plot_reads_long(df_to_plot,filters={'gene_tss':gene}, facet_by=None, color_by=None, hex_colors=None)

In [None]:
# plotting for a gene per group - no. color, all reads, faceting by group
# plot_reads_long(df_to_plot,filters={'gene_tss':gene}, facet_by= 'group', color_by=None, hex_colors=None)

##### Running the clustering on the actual data

In [None]:
# CLUSTERING WITH PCA + EUCLIDEAN + LAPLACIAN KERNEL
# Runs clustering_final on the selected gene's matrix (df_gene); the five returned
# objects are kept under *_PCA_euclidean names and reused by the steps below.

df_PCA_euclidean, X_PCA_euclidean, partition_PCA_euclidean, clusters_PCA_euclidean, metrics_PCA_euclidean = clustering_final(
        df_gene,
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,
        pca_or_not=True,
        n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
        metric='euclidean',          # 'euclidean' or 'cosine'
        transform='arcsine',            # 'none', 'logit', or 'arcsine'
        kernel_type='laplacian',
        leiden_resolution=0.8,
        seed=42
    )

# print(clusters_PCA_euclidean)

# PLOTTING UMAP FOR PCA + EUCLIDEAN + LAPLACIAN KERNEL
# The embedding matrix returned above is plotted, colored by the cluster labels.
embedding_PCA_euclidean, fig_PCA_euclidean = plot_umap(
        X_PCA_euclidean,
        clusters_PCA_euclidean,
        n_neighbors=25,
        min_dist=0.1,
        metric='euclidean',      # for Pipeline 2, use 'euclidean'
        transform=None,          # None | 'logit' | 'arcsine' (use only if X are raw proportions)
        n_pcs=None,              # set if X are raw features; None if X are already PCs
        standardize=False,       # True if using raw features without PCA
        seed=42,
        palette=hex_colors,            # optional list/array or matplotlib colormap name
        title="PCA + euclidean + laplacian"
    )



# CREATING A NEW DATAFRAME WITH THE CLUSTER ASSIGNMENTS AND COLORS
# read dict -> dataframe -> merged onto the preprocessed table df_proc
read_dict_PCA_euclidean = dict_id_cluster_color(df_PCA_euclidean, clusters_PCA_euclidean, hex_colors)
df_dict_PCA_euclidean = dict_to_df(read_dict_PCA_euclidean)
df_test_PCA_euclidean= merge(df_proc, df_dict_PCA_euclidean)

# PREPROCESSING THE DF FOR LONG READS PLOTTING
df_test_plot_PCA_euclidean = preprocess_long_for_plot(df_test_PCA_euclidean,
                                                          include_locus_cluster=True,
                                                          filter_outliers= True,
                                                          max_span_bp=None,     # letting the filtering be based on span_quantile
                                                          span_quantile=0.99,
                                                          require_center_inside=True,
                                                          min_cpg=1,
                                                          cpg_window_bp=5000
                                                          )

# PLOTTING THE LONG READS FOR PCA + EUCLIDEAN + LAPLACIAN KERNEL
# Only the three untreated groups are kept by the 'group' filter; reads are colored by locus_cluster.
plot_reads_long(
        df_test_plot_PCA_euclidean,
        filters={
            # "gene_tss": gene, 
            'group':['BT474_mV_high_Untreated',
            'BT474_mV_low_Untreated',
            'BT474_mV_Untreated_Unsorted']},
        # facet_by='group',
        color_by="locus_cluster",
        gene=gene,
        hex_colors=hex_colors,
        figsize=(12,6)
    )

#ISOLATING ONLY THE INTERESTING GROUPS FOR SUMMARY STATS
# df_only_interesting_groups_PCA_euclidean = df_PCA_euclidean[df_PCA_euclidean.index.get_level_values('group').isin([
#         'BT474_mV_high_Untreated',
#         'BT474_mV_low_Untreated',
#         'BT474_mV_Untreated_Unsorted'
#     ])]

# df_only_interesting_groups_PCA_euclidean = merge(df_only_interesting_groups_PCA_euclidean, df_dict_PCA_euclidean)
# clusters_of_interest_PCA_euclidean = df_only_interesting_groups_PCA_euclidean['locus_cluster'].tolist() 
# summary(df_only_interesting_groups_PCA_euclidean, clusters_of_interest_PCA_euclidean, gene=gene)


# COMPUTING THE GENE CENTROIDS TO PLOT AVERAGE PROFILES PER CLUSTER
profiles_df_PCA_euclidean, coverage_df_PCA_euclidean, meta_df_PCA_euclidean, positions_PCA_euclidean = compute_gene_centroids(
    df_PCA_euclidean,
    df_dict_PCA_euclidean,
    gene_tss=gene,
)

# PLOTTING THE AVERAGE PROFILES PER CLUSTER FOR PCA + EUCLIDEAN + LAPLACIAN KERNEL
plot_centroids_with_shading(
    profiles_df_PCA_euclidean,
    positions_PCA_euclidean,
    meta_df_PCA_euclidean,
    coverage_df_PCA_euclidean,
    hex_colors,
    proportional_height=True,
    smooth_sigma=0,
    # A space separates the gene name from the rest of the title (it used to
    # be glued on with a bare string concatenation).
    title=f"{gene} Average DNA Methylation Profiles per Cluster",
    missingness_threshold=0.3,
)

In [None]:
# Build per-bin weights that emphasize the bins around position 0 (center=0; positions are
# taken from df_gene.columns). Gaussian mode, mean weight normalized to ~1.0.
# NOTE(review): this comment used to say "±500 bp around Pol2", but window_bp is 300 --
# confirm which window is intended.
weights_pos = build_positional_weights(
    columns=df_gene.columns,
    window_bp=300,
    center=0,
    mode='gaussian',
    inside_weight=1.0,
    outside_weight=0.2,
    sigma_bp=None,           # default window_bp/2
    normalize_mean=True      # average weight ~ 1.0
)

# CLUSTERING WITH PCA + EUCLIDEAN + LAPLACIAN KERNEL, WITH POSITIONAL BIN WEIGHTS
# Same settings as the unweighted run, plus pos_weights / weight_stage.
df_used, X_emb, part, labels, metrics = clustering_final(
    df=df_gene,
    n_neighbors=15,
    nan_threshold=0.7,
    nan_method='drop',
    scaling=False,
    pca_or_not=True,            # weighting applied pre-PCA by default
    n_pcs=None,
    metric='euclidean',         # or 'cosine'
    transform='arcsine',
    kernel_type='laplacian',
    leiden_resolution=0.8,
    seed=42,
    pos_weights=weights_pos,    # <-- emphasize central bins
    weight_stage='pre_pca'
)
print(part) 

# Cluster label per row, taken from the partition's membership list.
clusters = np.asarray(part.membership, dtype=int)

print (clusters)
# PLOTTING UMAP FOR PCA + EUCLIDEAN + LAPLACIAN KERNEL + WEIGHTED BINS
# NOTE(review): this reuses the names embedding_PCA_euclidean / fig_PCA_euclidean and so
# overwrites the unweighted results from the previous cell -- confirm this is intended.
# NOTE(review): gene='Gene' passes a literal string, not the `gene` variable -- confirm.
embedding_PCA_euclidean, fig_PCA_euclidean = plot_umap(
        X_emb,
        clusters,
        n_neighbors=25,
        min_dist=0.1,
        metric='euclidean',      # for Pipeline 2, use 'euclidean'
        transform=None,          # None | 'logit' | 'arcsine' (use only if X are raw proportions)
        n_pcs=None,              # set if X are raw features; None if X are already PCs
        standardize=False,       # True if using raw features without PCA
        seed=42,
        palette=hex_colors,            # optional list/array or matplotlib colormap name
        title="PCA + euclidean + laplacian + weighted bins",
        gene = 'Gene'
    )



# CREATING A NEW DATAFRAME WITH THE CLUSTER ASSIGNMENTS AND COLORS
# read dict -> dataframe -> merged onto the preprocessed table df_proc
read_dict = dict_id_cluster_color(df_used, clusters, hex_colors)
df_dict = dict_to_df(read_dict)
df_test= merge(df_proc, df_dict)

# PREPROCESSING THE DF FOR LONG READS PLOTTING
df_test_plot = preprocess_long_for_plot(df_test,
                                                          include_locus_cluster=True,
                                                          filter_outliers= True,
                                                          max_span_bp=None,     # letting the filtering be based on span_quantile
                                                          span_quantile=0.99,
                                                          require_center_inside=True,
                                                          min_cpg=1,
                                                          cpg_window_bp=5000
                                                          )

# PLOTTING THE LONG READS FOR PCA + EUCLIDEAN + LAPLACIAN KERNEL + WEIGHTED BINS
# Only the three untreated groups are kept by the 'group' filter; reads are colored by locus_cluster.
plot_reads_long(
        df_test_plot,
        filters={
            # "gene_tss": gene, 
            'group':['BT474_mV_high_Untreated',
            'BT474_mV_low_Untreated',
            'BT474_mV_Untreated_Unsorted']},
        # facet_by='group',
        color_by="locus_cluster",
        gene=gene,
        hex_colors=hex_colors,
        figsize=(12,6)
    )

#ISOLATING ONLY THE INTERESTING GROUPS FOR SUMMARY STATS
# df_only_interesting_groups_PCA_euclidean = df_PCA_euclidean[df_PCA_euclidean.index.get_level_values('group').isin([
#         'BT474_mV_high_Untreated',
#         'BT474_mV_low_Untreated',
#         'BT474_mV_Untreated_Unsorted'
#     ])]

# df_only_interesting_groups_PCA_euclidean = merge(df_only_interesting_groups_PCA_euclidean, df_dict_PCA_euclidean)
# clusters_of_interest_PCA_euclidean = df_only_interesting_groups_PCA_euclidean['locus_cluster'].tolist() 
# summary(df_only_interesting_groups_PCA_euclidean, clusters_of_interest_PCA_euclidean, gene=gene)


# COMPUTING THE GENE CENTROIDS TO PLOT AVERAGE PROFILES PER CLUSTER (weighted-bin clustering)
profiles_df, coverage_df, meta_df, positions = compute_gene_centroids(
    df_used,
    df_dict,
    gene_tss=gene,
)

# PLOTTING THE AVERAGE PROFILES PER CLUSTER FOR PCA + EUCLIDEAN + LAPLACIAN KERNEL + WEIGHTED BINS
plot_centroids_with_shading(
    profiles_df,
    positions,
    # Use the metadata computed just above for THIS clustering; the cell used
    # to pass meta_df_PCA_euclidean, i.e. the metadata of the unweighted run.
    meta_df,
    coverage_df,
    hex_colors,
    proportional_height=True,
    smooth_sigma=0,
    # A space separates the gene name from the rest of the title.
    title=f"{gene} Average DNA Methylation Profiles per Cluster",
    missingness_threshold=0.3,
)

In [None]:
# NOTE(review): the bare name on the next line is not the cell's last expression, so it is not displayed.
read_dict_PCA_euclidean
# GETTING THE DICTIONARY MAPPING CLUSTER TO COLOR
# Each value of read_dict_PCA_euclidean carries a "cluster" and a "color" entry; the result is keyed by int cluster.
cluster_to_color = {int(v["cluster"]): v["color"] for v in read_dict_PCA_euclidean.values()}
cluster_to_color 


In [None]:
#adding the order column for plotting purposes later
df_test_plot_PCA_euclidean['order']=df_test_plot_PCA_euclidean['cluster']
df_test_plot_PCA_euclidean.head()

In [None]:
# def plot_single_molecule_occupancy(
#     df,
#     cluster_colors,                 # dict keyed by int cluster: {0:"#...",1:"#...",...}
#     xlim=(-1000, 1000),
#     point_size=3,
#     figsize_cm=(12, 10),
#     dpi=300
# ):
#     required = {"readid", "C_start_shifted", "meth", "locus_cluster", "order"}
#     missing = required - set(df.columns)
#     if missing:
#         raise ValueError(f"Missing columns: {', '.join(sorted(missing))}")

#     d = df.copy()

#     # Unique (readid, order) → y positions
#     per_read = (
#         d[["readid", "order"]]
#         .dropna()
#         .astype({"order": int})
#         .sort_values("order")
#         .drop_duplicates(subset="readid", keep="first")
#     )
#     if per_read.empty:
#         raise ValueError("No (readid, order) pairs available to assign y positions.")

#     y_map = dict(zip(per_read["readid"], range(1, len(per_read) + 1)))
#     d = d.loc[d["readid"].isin(y_map)].copy()
#     d["ypoint"] = d["readid"].map(y_map).astype(int) + 1
#     d["C_start_shifted"] = d["C_start_shifted"].astype(float) + 1

#     # Point colors: white for meth==1; else cluster color
#     d["color"] = np.where(
#         d["meth"] == 1,
#         "white",
#         [cluster_colors.get(int(c), "black") for c in d["locus_cluster"]]
#     )

#     # Stripe color per read from its cluster
#     stripes = (
#         d[["readid", "ypoint", "locus_cluster"]]
#         .drop_duplicates(subset="readid")
#         .assign(stripe_color=lambda x: [cluster_colors.get(int(c), "black") for c in x["locus_cluster"]])
#     )

#     # Plot
#     w_in, h_in = (figsize_cm[0] / 2.54, figsize_cm[1] / 2.54)
#     fig, ax = plt.subplots(figsize=(w_in, h_in), dpi=dpi)

#     for _, row in stripes.iterrows():
#         ax.hlines(y=row["ypoint"], xmin=xlim[0], xmax=xlim[1],
#                   colors=row["stripe_color"], linewidth=2, alpha=1.0, zorder=0)

#     ax.scatter(d["C_start_shifted"], d["ypoint"],
#                s=point_size, c=d["color"], marker="o", linewidths=0, zorder=1)

#     ax.set_xlim(*xlim)
#     ax.set_xlabel("distance from motif")
#     ax.set_ylabel("")
#     ax.set_yticks([])
#     for spine in ("top", "right", "left"):
#         ax.spines[spine].set_visible(False)

#     return fig, ax

In [None]:
# FUNCTION TO PLOT SINGLE MOLECULE OCCUPANCY WITH ORDER-BASED STRIPE COLORS --TV PLOT 
# Can work on a specific locus (per gene) or on the whole dataframe (all genes)

def plot_single_molecule_occupancy_rlike(
    df,
    cluster_colors,                 # dict keyed by int cluster: {1:"#...", 2:"#...", ..., 14:"#..."}
    x_col="C_start_shifted",
    xlim=(-1000, 1000),
    point_size=0.6,                 # ~ R cex=0.1; adjust for density
    stripe_lw=2.0,
    figsize_cm=(12, 10),
    dpi=300,
    n_clusters=14                   # number of orders/clusters
):
    """
    Single-molecule occupancy plot: one horizontal stripe per read, one dot per site.

    Reads are stacked along y grouped by ascending ``locus_cluster``; inside a
    cluster they keep their order of first appearance in ``df``. Every read gets
    a full-width stripe and every row of ``df`` gets a dot at ``(x_col, read y)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with at least the columns ``readid``, ``x_col``,
        ``meth`` and ``locus_cluster``. Rows whose ``readid`` or
        ``locus_cluster`` is missing are ignored.
    cluster_colors : dict
        Maps integer cluster id -> matplotlib colour. Clusters without an entry
        are drawn in black.
    x_col : str
        Column holding the x coordinate of each site.
    xlim : tuple of (float, float)
        X-axis limits; stripes span exactly this range.
    point_size : float
        Marker area passed to ``Axes.scatter``.
    stripe_lw : float
        Line width of the per-read stripes.
    figsize_cm : tuple of (float, float)
        Figure width and height in centimetres.
    dpi : int
        Figure resolution.
    n_clusters : int
        Size of the palette used for stripes. When ``cluster_colors`` has an
        entry for every id in ``1..n_clusters``, stripes use that palette in
        reversed order (cluster 1 gets the last colour); otherwise each stripe
        uses its own cluster colour.

    Returns
    -------
    (matplotlib.figure.Figure, matplotlib.axes.Axes)

    Raises
    ------
    ValueError
        If a required column is missing or no read can be placed on the y axis.
    """
    required = {"readid", x_col, "meth", "locus_cluster"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Missing columns: {', '.join(sorted(missing))}")

    # Rows without a read id or a cluster cannot be positioned or coloured;
    # dropping them up front also keeps the integer cast below from failing on NaN.
    d = df.dropna(subset=["readid", "locus_cluster"]).copy()
    d["locus_cluster"] = d["locus_cluster"].astype(int)

    # One row per read, ordered by cluster. The stable sort preserves the order
    # of first appearance inside each cluster.
    per_read = (
        d[["readid", "locus_cluster"]]
        .drop_duplicates(subset="readid")
        .sort_values("locus_cluster", kind="stable")
        .reset_index(drop=True)
    )
    if per_read.empty:
        raise ValueError("No read IDs available to assign y positions.")

    # Ranks start at 1 and are shifted by +1 (offset kept from the R original).
    per_read["ypoint"] = np.arange(1, len(per_read) + 1, dtype=float) + 1.0
    y_map = dict(zip(per_read["readid"], per_read["ypoint"]))
    d["ypoint"] = d["readid"].map(y_map)
    d[x_col] = d[x_col].astype(float) + 1.0                   # R offset +1

    # Point colours: white where meth == 1, otherwise the row's cluster colour.
    d["color"] = [
        "white" if m == 1 else cluster_colors.get(int(c), "black")
        for m, c in zip(d["meth"], d["locus_cluster"])
    ]

    # Stripe colours: reversed palette when it is complete, else per-cluster colour.
    palette_list = [cluster_colors.get(i) for i in range(1, n_clusters + 1)]
    use_reversed = bool(palette_list) and all(c is not None for c in palette_list)
    reverse_colors = palette_list[::-1]  # order 1 gets last color, n_clusters gets first

    def _stripe_color(cluster_id):
        # Ids outside 1..n_clusters have no slot in the reversed palette, so
        # they fall back to the plain lookup instead of indexing out of range.
        if use_reversed and 1 <= cluster_id <= n_clusters:
            return reverse_colors[cluster_id - 1]
        return cluster_colors.get(cluster_id, "black")

    per_read["stripe_color"] = [_stripe_color(int(c)) for c in per_read["locus_cluster"]]

    # plot
    w_in, h_in = (figsize_cm[0] / 2.54, figsize_cm[1] / 2.54)
    fig, ax = plt.subplots(figsize=(w_in, h_in), dpi=dpi)

    # All stripes in a single call (one LineCollection) rather than one call per read.
    ax.hlines(y=per_read["ypoint"].to_numpy(), xmin=xlim[0], xmax=xlim[1],
              colors=per_read["stripe_color"].tolist(),
              linewidth=stripe_lw, alpha=1.0, zorder=0)

    ax.scatter(d[x_col], d["ypoint"],
               s=point_size, c=d["color"].tolist(), marker="o", linewidths=0, zorder=1)

    ax.set_xlim(*xlim)
    ax.set_xlabel("distance from motif")
    ax.set_ylabel("")
    ax.set_yticks([])
    for spine in ("top", "right", "left"):
        ax.spines[spine].set_visible(False)

    return fig, ax

In [None]:
# Draw the single-molecule occupancy plot for the PCA + Euclidean clustering result.
# `df_test_plot_PCA_euclidean` is assigned inside the per-gene PCA + Euclidean loop cell,
# so this call plots whichever gene that loop processed last.
# NOTE(review): `cluster_to_color` is not defined in this part of the notebook; confirm it is
# a dict keyed by integer cluster id, as the function's signature comment expects.
plot_single_molecule_occupancy_rlike(
    df=df_test_plot_PCA_euclidean,
    cluster_colors=cluster_to_color,
    xlim=(-1100, 1100),
    stripe_lw=1.5,
    point_size=0.6,
    figsize_cm=(12, 5),
    dpi=300
)

In [None]:
# Display the metadata table (third value returned by compute_gene_centroids in the
# PCA + Euclidean loop); it holds the last gene processed by that loop.
meta_df_PCA_euclidean

In [None]:
# Display the profiles table (first value returned by compute_gene_centroids in the
# PCA + Euclidean loop); it holds the last gene processed by that loop.
profiles_df_PCA_euclidean

In [None]:
# Display the table built by dict_to_df from the read/cluster/colour dictionary in the
# PCA + Euclidean loop; it holds the last gene processed by that loop.
df_dict_PCA_euclidean

In [None]:
# Display the first value returned by clustering_final in the PCA + Euclidean loop;
# it holds the last gene processed by that loop.
df_PCA_euclidean

In [None]:
# # NOW REPEATING THE WHOLE PROCESS FOR THE UNBINNED DATAFRAME
# df_PCA_euclidean, X_PCA_euclidean_unbinned, partition_PCA_euclidean_unbinned, clusters_PCA_euclidean_unbinned, metrics_PCA_euclidean_unbinned = clustering_final(
#         df_gene,
#         n_neighbors=9,
#         nan_threshold=0.1,
#         nan_method='drop',
#         scaling=False,
#         pca_or_not=True,
#         n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
#         metric='euclidean',          # 'euclidean' or 'cosine'
#         transform='arcsine',            # 'none', 'logit', or 'arcsine'
#         kernel_type='laplacian',
#         leiden_resolution=0.8,
#         seed=42
#     )

# read_dict_PCA_euclidean_unbinned = dict_id_cluster_color(df_gene_unbinned, clusters_PCA_euclidean_unbinned, hex_colors)

# df_dict_PCA_euclidean_unbinned = dict_to_df(read_dict_PCA_euclidean_unbinned)
# df_test_PCA_euclidean_unbinned= merge(df_proc, df_dict_PCA_euclidean_unbinned)

# df_test_plot_PCA_euclidean_unbinned = preprocess_long_for_plot(df_test_PCA_euclidean_unbinned,
#                                                           include_locus_cluster=True,
#                                                           filter_outliers= True,
#                                                           max_span_bp=None,     #i'm letting the filtering be based on the span_quantile
#                                                           span_quantile=0.99,
#                                                           require_center_inside=True,
#                                                           min_cpg=1,
#                                                           cpg_window_bp=5000
#                                                           )


# plot_reads_long(
#         df_test_plot_PCA_euclidean_unbinned,
#         filters={
#             # "gene_tss": gene, 
#             'group':['BT474_mV_high_Untreated',
#             'BT474_mV_low_Untreated',
#             'BT474_mV_Untreated_Unsorted']},
#         # facet_by='group',
#         color_by="locus_cluster",
#         gene=gene,
#         hex_colors=hex_colors,
#         figsize =(12,5)
#     )

# plot_single_molecule_occupancy_rlike(
#     df=df_test_plot_PCA_euclidean_unbinned,
#     cluster_colors=cluster_to_color,
#     xlim=(-1100, 1100),
#     stripe_lw=1.5,
#     point_size=0.6,
#     figsize_cm=(12, 5),
#     dpi=300
# )

# profiles_df_unbinned, coverage_df_unbinned_euclidean, meta_df_unbinned_euclidean, positions_unbinned_euclidean= compute_gene_centroids(
#                                                                     df_gene_unbinned,
#                                                                     df_dict_PCA_euclidean_unbinned,
#                                                                     gene_tss='SETD1A_30958295_30958295',
#                                                                    )

# plot_centroids_with_shading(
#     profiles_df_unbinned,
#     positions_unbinned_euclidean,
#     meta_df_unbinned_euclidean,
#     coverage_df_unbinned_euclidean,
#     hex_colors,
#     smooth_sigma = 0,
#     title=gene +'Average DNA Methylation Profiles per Cluster',
#     missingness_threshold=0.5,
# )


In [None]:
# Loop over all genes and run the whole analysis pipeline: PCA + Euclidean + Laplacian kernel.
# Every iteration overwrites the *_PCA_euclidean globals, so after the loop they hold the
# results of the last gene only.
for i in range(len(gene_df)):
    gene = gene_names[i]
    df_gene = gene_df[i]

    print(gene)

    # Cluster this gene's matrix with the PCA + Euclidean configuration.
    # NOTE(review): pca_or_not=True is combined with n_pcs=None, while the inline comment
    # says "None = no PCA"; confirm how clustering_final resolves this combination.
    df_PCA_euclidean, X_PCA_euclidean, partition_PCA_euclidean, clusters_PCA_euclidean, metrics_PCA_euclidean = clustering_final(
        df_gene,
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,
        pca_or_not=True,
        n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
        metric='euclidean',          # 'euclidean' or 'cosine' 
        transform='arcsine',            # 'none', 'logit', or 'arcsine'
        kernel_type='laplacian',
        leiden_resolution=0.8,
        seed=42
    )

    # embedding_PCA_euclidean, fig_PCA_euclidean = plot_umap(
    #     X_PCA_euclidean,
    #     clusters_PCA_euclidean,
    #     n_neighbors=25,
    #     min_dist=0.1,
    #     metric='euclidean',      
    #     transform=None,          # None | 'logit' | 'arcsine' (use only if X are raw proportions)
    #     n_pcs=None,              # set if X are raw features; None if X are already PCs
    #     standardize=False,       # True if using raw features without PCA
    #     seed=42,
    #     palette=hex_colors,            # optional list/array or matplotlib colormap name
    #     title="PCA + Euclidean + logit + Laplacian",
    #     gene=gene
    # )

    # print(metrics_PCA_euclidean)

    # multi_sil_PCA_euclidean= silhouettes_multi(X_PCA_euclidean, clusters_PCA_euclidean, metric_main='euclidean', extra_metrics=['cosine', 'correlation', 'jaccard'])

    # print(f"Silhouette (PCA + Euclidean): {metrics_PCA_euclidean['silhouette']:.3f}")
    # print(f"Silhouette (PCA + Euclidean) - Cosine: {multi_sil_PCA_euclidean['silhouette_cosine']:.3f}")
    # print(f"Silhouette (PCA + Euclidean) - Correlation: {multi_sil_PCA_euclidean['silhouette_correlation']:.3f}")
    # print(f"Silhouette (PCA + Euclidean) - Jaccard: {multi_sil_PCA_euclidean['silhouette_jaccard']:.3f}")


    # Build the read -> cluster/colour dictionary from the clustering output.
    read_dict_PCA_euclidean = dict_id_cluster_color(df_PCA_euclidean, clusters_PCA_euclidean, hex_colors)

    # Turn the dictionary into a table and join it onto df_proc with the notebook's merge helper.
    df_dict_PCA_euclidean = dict_to_df(read_dict_PCA_euclidean)
    df_test_PCA_euclidean= merge(df_proc, df_dict_PCA_euclidean)

    # Long format for plotting
    df_test_plot_PCA_euclidean = preprocess_long_for_plot(df_test_PCA_euclidean,
                                                          include_locus_cluster=True,
                                                          filter_outliers= True,
                                                          max_span_bp=None,     # span filtering is driven by span_quantile instead
                                                          span_quantile=0.99,
                                                          require_center_inside=True,
                                                          min_cpg=1,
                                                          cpg_window_bp=5000
                                                          )

    # Plot the reads of the three untreated groups, coloured by locus cluster
    plot_reads_long(
        df_test_plot_PCA_euclidean,
        filters={
            # "gene_tss": gene, 
            'group':['BT474_mV_high_Untreated',
            'BT474_mV_low_Untreated',
            'BT474_mV_Untreated_Unsorted']},
        # facet_by='group',
        color_by="locus_cluster",
        gene=gene,
        hex_colors=hex_colors
    )

    # Keep only rows whose 'group' index level is one of the three untreated groups.
    df_only_interesting_groups_PCA_euclidean = df_PCA_euclidean[df_PCA_euclidean.index.get_level_values('group').isin([
        'BT474_mV_high_Untreated',
        'BT474_mV_low_Untreated',
        'BT474_mV_Untreated_Unsorted'
    ])]

    # These two results only feed the summary() call below, which is currently commented out.
    df_only_interesting_groups_PCA_euclidean = merge(df_only_interesting_groups_PCA_euclidean, df_dict_PCA_euclidean)
    clusters_of_interest_PCA_euclidean = df_only_interesting_groups_PCA_euclidean['locus_cluster'].tolist() 
    # summary(df_only_interesting_groups_PCA_euclidean, clusters_of_interest_PCA_euclidean, gene=gene)

    # Per-cluster centroid profiles for this gene, then plot them.
    profiles_df_PCA_euclidean, coverage_df_PCA_euclidean, meta_df_PCA_euclidean, positions_PCA_euclidean= compute_gene_centroids(
                                                                        df_PCA_euclidean,
                                                                        df_dict_PCA_euclidean,
                                                                        gene_tss=gene,
                                                                    )

    plot_centroids_with_shading(
        profiles_df_PCA_euclidean,
        positions_PCA_euclidean,
        meta_df_PCA_euclidean,
        coverage_df_PCA_euclidean,
        hex_colors,
        smooth_sigma = 0,
        title= gene +':Average DNA Methylation Profiles per Cluster - EUCLIDEAN',
        missingness_threshold=0.5,
    )

In [None]:
# Loop over all genes and run the whole analysis pipeline: no PCA + cosine + Laplacian kernel.
# Every iteration overwrites the *_cosine globals, so after the loop they hold the results
# of the last gene only.
for i in range(len(gene_df)):
    gene = gene_names[i]
    df_gene = gene_df[i]

    print(gene)
    # Cluster this gene's matrix with the cosine configuration (PCA disabled).
    df_cosine, X_cosine, partition_cosine, clusters_cosine, metrics_cosine = clustering_final(
        df_gene,
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,
        pca_or_not=False,
        n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
        metric='cosine',          # 'euclidean' or 'cosine'
        transform='arcsine',            # 'none', 'logit', or 'arcsine'
        kernel_type='laplacian',
        leiden_resolution=0.8,
        seed=42
    )

    # UMAP of the clustering input, coloured by cluster.
    embedding_cosine, fig_cosine = plot_umap(
        X_cosine,
        clusters_cosine,
        n_neighbors=25,
        min_dist=0.1,
        metric='cosine',      # for Pipeline 2, use 'euclidean'
        transform=None,          # None | 'logit' | 'arcsine' (use only if X are raw proportions)
        n_pcs=None,              # set if X are raw features; None if X are already PCs
        standardize=False,       # True if using raw features without PCA
        seed=42,
        palette=hex_colors,            # optional list/array or matplotlib colormap name
        title="NO PCA + cosine + laplacian",
        gene=gene
    )

    # print(metrics_cosine)

    # multi_sil_cosine= silhouettes_multi(X_cosine, clusters_cosine, metric_main='cosine', extra_metrics=['euclidean', 'correlation', 'jaccard'])

    # print(f"Silhouette (Cosine): {metrics_cosine['silhouette']:.3f}")
    # print(f"Silhouette (Cosine) - Euclidean: {multi_sil_cosine['silhouette_euclidean']:.3f}")
    # print(f"Silhouette (Cosine) - Correlation: {multi_sil_cosine['silhouette_correlation']:.3f}")
    # print(f"Silhouette (Cosine) - Jaccard: {multi_sil_cosine['silhouette_jaccard']:.3f}")


    # Build the read -> cluster/colour dictionary from the clustering output.
    read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, hex_colors)

    # Turn the dictionary into a table and join it onto df_proc with the notebook's merge helper.
    df_dict_cosine = dict_to_df(read_dict_cosine)
    df_test_cosine= merge(df_proc, df_dict_cosine)

    # Long format for plotting
    df_test_plot_cosine = preprocess_long_for_plot(df_test_cosine,
                                                    include_locus_cluster=True,
                                                    filter_outliers= True,
                                                    max_span_bp=None,     # span filtering is driven by span_quantile instead
                                                    span_quantile=0.99,
                                                    require_center_inside=True,
                                                    min_cpg=1,
                                                    cpg_window_bp=5000
                                                    )

    # Plot the reads of the three untreated groups, coloured by locus cluster
    plot_reads_long(
        df_test_plot_cosine,
        filters={
            # "gene_tss": gene, 
            'group':['BT474_mV_high_Untreated',
            'BT474_mV_low_Untreated',
            'BT474_mV_Untreated_Unsorted']},
        # facet_by='group',
        color_by="locus_cluster",
        gene= gene,
        hex_colors=hex_colors
    )

    # Keep only rows whose 'group' index level is one of the three untreated groups.
    df_only_interesting_groups_cosine = df_cosine[df_cosine.index.get_level_values('group').isin([
        'BT474_mV_high_Untreated',
        'BT474_mV_low_Untreated',
        'BT474_mV_Untreated_Unsorted'
    ])]

    # Attach the cluster mapping and collect the per-row cluster labels for summary().
    df_only_interesting_groups_cosine = merge(df_only_interesting_groups_cosine, df_dict_cosine)
    clusters_of_interest_cosine = df_only_interesting_groups_cosine['locus_cluster'].tolist() 
    
    summary(df_only_interesting_groups_cosine, clusters_of_interest_cosine)

    # Per-cluster centroid profiles for this gene, then plot them.
    profiles_df_cosine, coverage_df_cosine, meta_df_cosine, positions_cosine = compute_gene_centroids(
                                                                        df_cosine,
                                                                        df_dict_cosine,
                                                                        gene_tss=gene,
                                                                    )

    plot_centroids_with_shading(
        profiles_df_cosine,
        positions_cosine,
        meta_df_cosine,
        coverage_df_cosine,
        hex_colors,
        smooth_sigma = 0,
        title= gene + ' : Average DNA Methylation Profiles per Cluster - COSINE ',
        missingness_threshold=0.5,
    )

In [None]:
# Loop over all genes and run the whole analysis pipeline:
# no PCA + cosine + Laplacian kernel + Gaussian positional weights.
# Same steps as the plain cosine loop, except that clustering_final receives per-column
# positional weights. Figure titles say so, so the outputs of the two cells can be told apart.
# Every iteration overwrites the *_cosine globals; after the loop they hold the last gene only.
for i in range(len(gene_df)):
    gene = gene_names[i]
    df_gene = gene_df[i]

    print(gene)

    # Per-column weights built from this gene's column labels (Gaussian mode, centred on 0).
    weights_pos = build_positional_weights(
        columns=df_gene.columns,
        window_bp=400,
        center=0,
        mode='gaussian',
        inside_weight=1.0,
        outside_weight=0.2,
        sigma_bp=None,           # default window_bp/2
        normalize_mean=True      # average weight ~ 1.0
    )

    # Cluster with the cosine configuration, passing the positional weights.
    # NOTE(review): weight_stage='pre_pca' is used with pca_or_not=False; confirm that
    # clustering_final still applies pos_weights in that case.
    df_cosine, X_cosine, part, clusters_cosine, metrics_cosine = clustering_final(
        df_gene,
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,
        pca_or_not=False,
        n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
        metric='cosine',          # 'euclidean' or 'cosine'
        transform='arcsine',            # 'none', 'logit', or 'arcsine'
        kernel_type='laplacian',
        leiden_resolution=0.8,
        seed=42,
        pos_weights=weights_pos,    # <-- emphasize central bins
        weight_stage='pre_pca'
    )

    # Membership of the returned partition object as a plain integer array.
    partition_cosine = np.asarray(part.membership, dtype=int)

    # UMAP of the clustering input, coloured by cluster.
    embedding_cosine, fig_cosine = plot_umap(
        X_cosine,
        clusters_cosine,
        n_neighbors=25,
        min_dist=0.1,
        metric='cosine',      # for Pipeline 2, use 'euclidean'
        transform=None,          # None | 'logit' | 'arcsine' (use only if X are raw proportions)
        n_pcs=None,              # set if X are raw features; None if X are already PCs
        standardize=False,       # True if using raw features without PCA
        seed=42,
        palette=hex_colors,            # optional list/array or matplotlib colormap name
        title="NO PCA + cosine + laplacian + positional weights",
        gene=gene
    )

    # print(metrics_cosine)

    # multi_sil_cosine= silhouettes_multi(X_cosine, clusters_cosine, metric_main='cosine', extra_metrics=['euclidean', 'correlation', 'jaccard'])

    # print(f"Silhouette (Cosine): {metrics_cosine['silhouette']:.3f}")
    # print(f"Silhouette (Cosine) - Euclidean: {multi_sil_cosine['silhouette_euclidean']:.3f}")
    # print(f"Silhouette (Cosine) - Correlation: {multi_sil_cosine['silhouette_correlation']:.3f}")
    # print(f"Silhouette (Cosine) - Jaccard: {multi_sil_cosine['silhouette_jaccard']:.3f}")

    # Build the read -> cluster/colour dictionary from the clustering output.
    read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, hex_colors)

    # Turn the dictionary into a table and join it onto df_proc with the notebook's merge helper.
    df_dict_cosine = dict_to_df(read_dict_cosine)
    df_test_cosine = merge(df_proc, df_dict_cosine)

    # Long format for plotting
    df_test_plot_cosine = preprocess_long_for_plot(
        df_test_cosine,
        include_locus_cluster=True,
        filter_outliers=True,
        max_span_bp=None,     # span filtering is driven by span_quantile instead
        span_quantile=0.99,
        require_center_inside=True,
        min_cpg=1,
        cpg_window_bp=5000
    )

    # Plot the reads of the three untreated groups, coloured by locus cluster
    plot_reads_long(
        df_test_plot_cosine,
        filters={
            # "gene_tss": gene,
            'group': ['BT474_mV_high_Untreated',
                      'BT474_mV_low_Untreated',
                      'BT474_mV_Untreated_Unsorted']},
        # facet_by='group',
        color_by="locus_cluster",
        gene=gene,
        hex_colors=hex_colors
    )

    # Keep only rows whose 'group' index level is one of the three untreated groups.
    df_only_interesting_groups_cosine = df_cosine[df_cosine.index.get_level_values('group').isin([
        'BT474_mV_high_Untreated',
        'BT474_mV_low_Untreated',
        'BT474_mV_Untreated_Unsorted'
    ])]

    # Attach the cluster mapping and collect the per-row cluster labels for summary().
    df_only_interesting_groups_cosine = merge(df_only_interesting_groups_cosine, df_dict_cosine)
    clusters_of_interest_cosine = df_only_interesting_groups_cosine['locus_cluster'].tolist()

    summary(df_only_interesting_groups_cosine, clusters_of_interest_cosine)

    # Per-cluster centroid profiles for this gene, then plot them.
    profiles_df_cosine, coverage_df_cosine, meta_df_cosine, positions_cosine = compute_gene_centroids(
        df_cosine,
        df_dict_cosine,
        gene_tss=gene,
    )

    plot_centroids_with_shading(
        profiles_df_cosine,
        positions_cosine,
        meta_df_cosine,
        coverage_df_cosine,
        hex_colors,
        smooth_sigma=0,
        title=gene + ' : Average DNA Methylation Profiles per Cluster - COSINE (position-weighted) ',
        missingness_threshold=0.5,
    )

In [None]:
# Single-gene run on gene index 14 with the cosine configuration. Unlike the loop cells,
# NaNs are handled with nan_method='impute' here instead of 'drop'.
df_cosine, X_cosine, partition_cosine, clusters_cosine, metrics_cosine = clustering_final(
        gene_df[14],
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='impute',
        scaling=False,
        pca_or_not=False,
        n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
        metric='cosine',          # 'euclidean' or 'cosine'
        transform='arcsine',            # 'none', 'logit', or 'arcsine'
        kernel_type='laplacian',
        leiden_resolution=0.8,
        seed=42
    )

# UMAP of the clustering input, coloured by cluster.
embedding_cosine, fig_cosine = plot_umap(
        X_cosine,
        clusters_cosine,
        n_neighbors=25,
        min_dist=0.1,
        metric='cosine',      # for Pipeline 2, use 'euclidean'
        transform=None,          # None | 'logit' | 'arcsine' (use only if X are raw proportions)
        n_pcs=None,              # set if X are raw features; None if X are already PCs
        standardize=False,       # True if using raw features without PCA
        seed=42,
        palette=hex_colors,            # optional list/array or matplotlib colormap name
        title="NO PCA + cosine + laplacian",
        gene= gene_names[14]
    )

print(metrics_cosine)

# multi_sil_cosine= silhouettes_multi(X_cosine, clusters_cosine, metric_main='cosine', extra_metrics=['euclidean', 'correlation', 'jaccard'])

# print(f"Silhouette (Cosine): {metrics_cosine['silhouette']:.3f}")
# print(f"Silhouette (Cosine) - Euclidean: {multi_sil_cosine['silhouette_euclidean']:.3f}")
# print(f"Silhouette (Cosine) - Correlation: {multi_sil_cosine['silhouette_correlation']:.3f}")
# print(f"Silhouette (Cosine) - Jaccard: {multi_sil_cosine['silhouette_jaccard']:.3f}")


# Build the read -> cluster/colour dictionary, tabulate it and join it onto df_proc.
read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, hex_colors)

df_dict_cosine = dict_to_df(read_dict_cosine)
df_test_cosine= merge(df_proc, df_dict_cosine)

# Centroids computed twice with the same cluster mapping:
#   *_cos / coverage_df / meta_df / positions  -> from df_cosine (clustering_final output)
#   *_gene                                     -> from gene_df[14] (the matrix passed to clustering_final)
profiles_df_cos, coverage_df, meta_df, positions = compute_gene_centroids(
    df_reads=df_cosine,        # reads × bins methylation matrix
    df_map=df_dict_cosine,        # mapping of readid → cluster + color
    gene_tss= gene_names[14],
    extra_meta_cols=['gene_tss', 'group', 'cluster']  # add other non-bin columns here if present
)

profiles_df_gene, coverage_df_gene, meta_df_gene, positions_gene = compute_gene_centroids(
    df_reads= gene_df[14],        # reads × bins methylation matrix
    df_map=df_dict_cosine,        # mapping of readid → cluster + color
    gene_tss= gene_names[14],
    extra_meta_cols=['gene_tss', 'group', 'cluster']  # add other non-bin columns here if present
)

# Plot both centroid sets. Both figures carry the same title: the first is the df_cosine
# version, the second the gene_df[14] version.
plot_centroids_with_shading(
    profiles_df_cos,
    positions,
    meta_df,
    coverage_df,
    hex_colors,
    smooth_sigma = 0,
    title='Average DNA Methylation Profiles per Cluster',
    missingness_threshold=0.5,
)

plot_centroids_with_shading(
    profiles_df_gene,
    positions_gene,
    meta_df_gene,
    coverage_df_gene,
    hex_colors,
    smooth_sigma = 0,
    title='Average DNA Methylation Profiles per Cluster',
    missingness_threshold=0.5,
)

# Cell output: first 50 rows of the input matrix for gene index 14.
gene_df[14].head(50)

In [None]:
# First 50 rows of the table returned by clustering_final (compare with gene_df[14] above).
df_cosine.head(50)

In [None]:
# First rows of the read -> cluster/colour table produced by dict_to_df.
df_dict_cosine.head()

In [None]:
# df_bulk has columns: ['cluster','C_pos','meth','cov'] for the whole dataset

# Compute the bulk centroid tables (profiles, coverage, metadata, positions).
P_df, coverage_bulk_df, meta_bulk_df, positions_bulk = compute_bulk_centroids(df_bulk, use_cov_for_proportion=True)

# Plot the bulk centroids. Unlike the per-gene calls elsewhere in the notebook, no
# coverage table is passed here.
fig, axes = plot_centroids_with_shading(
    P_df=P_df,
    positions=positions_bulk,
    meta_df=meta_bulk_df,
    hex_colors=hex_colors,
    title="Average DNA Methylation Profiles per Cluster (Bulk)"
)

# Cell output: first 15 rows of the bulk profile table.
P_df.head(15)

In [None]:
# Display the bulk metadata table returned by compute_bulk_centroids.
meta_bulk_df

In [None]:
# Plot the binned bulk centroid profiles.
# NOTE(review): profiles_df_bulk_binned, positions_bulk_binned and meta_df_bulk_binned are
# not created in this cell or in the cells shown around it; they must already exist.

fig, axes = plot_centroids_with_shading(
    P_df=profiles_df_bulk_binned,
    positions=positions_bulk_binned,
    meta_df=meta_df_bulk_binned, # careful: this is not the meta table returned by compute_gene_centroids
    hex_colors=hex_colors,
    title="Average DNA Methylation Profiles per Cluster (Bulk) (Binned)",
    proportional_height=False
)

# Cell output: first 15 rows of the binned bulk profile table.
profiles_df_bulk_binned.head(15)

In [None]:
# NOTE(review): read_dict_bulk is built from the cosine clustering outputs and is not used
# afterwards in this cell; the next line reads read_dict_cosine instead. Confirm whether
# dict_to_df was meant to receive read_dict_bulk.
read_dict_bulk = dict_id_cluster_color(df_cosine, clusters_cosine, hex_colors)

# Rebuild the mapping table from read_dict_cosine and join it onto df_proc.
df_dict_cosine = dict_to_df(read_dict_cosine)
df_test_cosine= merge(df_proc, df_dict_cosine)

In [None]:
# Binned bulk profiles as a float NumPy array.
P_array = profiles_df_bulk_binned.to_numpy(float)

# Optional per-bin weights for cosine
# weights = coverage_df.sum(axis=0).to_numpy(float) / (coverage_df.sum(axis=0).mean() + 1e-9)


In [None]:
# Assign the gene-level cluster profiles to the binned bulk profiles (unweighted cosine).
# NOTE(review): `profiles_df` is only assigned inside the loop cells further down, whereas the
# single-gene cell above names its outputs profiles_df_cos / profiles_df_gene. On a fresh
# kernel this cell depends on one of those loops having run; confirm which table G should be.
assign_df, S, D = assign_gene_clusters_to_consensus(
    G=profiles_df,               # DataFrame (rows = gene clusters, cols = common positions)
    meta=meta_df,         # aligned to G rows
    P=profiles_df_bulk_binned,               # DataFrame (rows = bulk clusters, cols = common positions)
    method='cosine',
    center_rows=True,
    # weights=weights,  # optional pd.Series indexed by common positions
    min_overlap=10,
    min_similarity=0,
    allow_unassigned=True,
    capacity_per_type=1,
)

# Cell output: the assignment table.
assign_df

In [None]:
# 1) Metadata subset for the heatmap, with 'n_reads' renamed to 'size'
meta_for_assign = meta_df[['gene_tss', 'cluster_id', 'n_reads']].rename(columns={'n_reads': 'size'})

# 2) Heatmap for one gene (gene index 14)
fig, ax = plot_gene_mapping_heatmap(
    assign_df=assign_df,
    S=S,
    meta=meta_for_assign,
    P=profiles_df_bulk_binned,
    gene_id= gene_names[14],
    sort_rows=False
)

# 3) Assignment summary across genes
# fig2, ax2 = plot_assignment_summary(assign_df)

# 4) Recolor meta for plotting centroids: bulk cluster id -> bulk cluster colour
bulk_color_map = {int(row['cluster_id']): row['cluster_color']
                     for _, row in meta_bulk_df.iterrows()}  # build from bulk meta
meta_colored = recolor_meta_by_bulk(assign_df, meta_df, bulk_color_map, gene_id=gene_names[14])

# 5) Assignment confidence
# fig3, ax3 = plot_assignment_confidence(assign_df, gene_id=gene)

# Plot the gene_df[14]-based centroids using the recoloured metadata.
plot_centroids_with_shading(
    profiles_df_gene,
    positions_gene,
    meta_colored,
    coverage_df_gene,
    hex_colors,
    smooth_sigma = 0,
    title='Average DNA Methylation Profiles per Cluster',
    missingness_threshold=0.5,
    
)

In [None]:
# Display the recoloured metadata table returned by recolor_meta_by_bulk.
meta_colored

In [None]:
# Build Gaussian positional weights around center=0 with window_bp=200, from the bulk
# profile columns (P_df.columns).
# NOTE(review): an earlier version of this comment said "+/- 500 bp"; the code passes
# window_bp=200. Confirm the intended window.
weights_pos = build_positional_weights(columns=P_df.columns, window_bp=200, center=0, mode='gaussian',
                                       inside_weight=1.0, outside_weight=0.5, normalize_mean=True)

# Option A: Cosine with positional weights (no PCA, masked cosine):
# NOTE(review): `profiles_df` is only assigned inside the loop cells further down; confirm
# which table G should be. This call also overwrites S and D from the unweighted assignment.
assign_weighted_df, S, D = assign_gene_clusters_to_consensus(
    G=profiles_df,
    meta=meta_df,
    P=profiles_df_bulk_binned,
    method='cosine',
    center_rows=True,            # rows are centered here (pattern-only); set False to keep raw levels
    weights=weights_pos,          # emphasize the window around center (window_bp=200 above)
    min_overlap=10,               # ensure enough shared bins
    min_similarity=0,           # similarity threshold used together with allow_unassigned
    allow_unassigned=True,
    capacity_per_type=1
)

# Cell output: the weighted assignment table.
assign_weighted_df

In [None]:
# 1) Metadata subset for the heatmap, with 'n_reads' renamed to 'size'
meta_for_assign_weighted = meta_df[['gene_tss', 'cluster_id', 'n_reads']].rename(columns={'n_reads': 'size'})

# 2) Heatmap for one gene (gene index 14), using the position-weighted assignment
fig, ax = plot_gene_mapping_heatmap(
    assign_df=assign_weighted_df,
    S=S,
    meta=meta_for_assign_weighted,
    P=profiles_df_bulk_binned,
    gene_id=gene_names[14],
    sort_rows=False
)

# 3) Assignment summary across genes
# fig2, ax2 = plot_assignment_summary(assign_weighted_df)

# 4) Recolor meta for plotting centroids.
# The colour map is built from the bulk metadata (bulk cluster id -> bulk cluster colour),
# as in the unweighted cell; the previous '#hexcolor' placeholder is not a valid colour.
bulk_color_map = {int(row['cluster_id']): row['cluster_color']
                  for _, row in meta_bulk_df.iterrows()}
# Recolour the same gene as the heatmap above, not whatever `gene` a previous loop left behind.
meta_colored = recolor_meta_by_bulk(assign_weighted_df, meta_df, bulk_color_map, gene_id=gene_names[14])

# 5) Assignment confidence
# fig3, ax3 = plot_assignment_confidence(assign_df, gene_id=gene)


In [None]:
# colours = [
#     "#FF9999",  # warm soft red
#     "#FFB380",  # coral orange
#     "#FFE680",  # golden pastel yellow
#     "#A8E6CF",  # mint green
#     "#81C7F5",  # light sky blue
#     "#9FA8DA",  # soft indigo blue
#     "#CBA3E3",  # lavender purple
#     "#FF9EC4",  # rose pink
#     "#FFBC8A",  # peach apricot
#     "#7FD1AE",  # seafoam green
# ]


In [None]:
# Uniform grey palette: eleven identical entries, so every cluster gets the same
# neutral colour until it is recoloured later.
colours = ["#969494"] * 11


In [None]:
## UNWEIGHTED CLUSTERING + UNWEIGHTED ASSIGNMENT LOOP OVER ALL GENES
# For every gene: cluster (cosine, no PCA), compute per-cluster centroid profiles, assign
# them to the binned bulk profiles, draw the mapping heatmap and the recoloured centroid
# plot, and record the mean assignment similarity.
P_array = profiles_df_bulk_binned.to_numpy(float)
print(len(gene_df))

# Loop-invariant inputs, built once instead of once per gene.
Pc = _coerce_sort(profiles_df_bulk_binned)
bulk_color_map = {int(row['cluster_id']): row['cluster_color']
                  for _, row in meta_bulk_df.iterrows()}  # build from bulk meta

sim = []  # mean assignment similarity, one entry per gene
for i in range(len(gene_df)):
    gene = gene_names[i]
    df_gene = gene_df[i]

    df_cosine, X_cosine, partition_cosine, clusters_cosine, metrics_cosine = clustering_final(
        df_gene,
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,
        pca_or_not=False,
        n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
        metric='cosine',          # 'euclidean' or 'cosine'
        transform='arcsine',            # 'none', 'logit', or 'arcsine'
        kernel_type='laplacian',
        leiden_resolution=0.8,
        seed=42
    )

    # Read -> cluster mapping; the uniform `colours` palette is used here because the
    # clusters are recoloured from the bulk assignment below.
    read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, colours)
    df_dict_cosine = dict_to_df(read_dict_cosine)
    df_test_cosine = merge(df_proc, df_dict_cosine)

    # Centroids are computed from df_gene (the matrix passed to clustering_final),
    # not from the df_cosine table it returns.
    profiles_df, coverage_df, meta_df, positions = compute_gene_centroids(
        df_reads=df_gene,        # reads × bins methylation matrix
        df_map=df_dict_cosine,        # mapping of readid → cluster + color
        gene_tss=gene,
        extra_meta_cols=['gene_tss', 'group', 'cluster']  # add other non-bin columns here if present
    )

    Gc = _coerce_sort(profiles_df)

    assign_df, S, D = assign_gene_clusters_to_consensus(
        G=Gc,               # DataFrame (rows = gene clusters, cols = common positions)
        meta=meta_df,         # aligned to G rows
        P=Pc,               # DataFrame (rows = bulk clusters, cols = common positions)
        method='cosine',
        center_rows=True,
        weights=None,  # optional pd.Series indexed by common positions
        min_overlap=20,
        min_similarity=0.5,
        allow_unassigned=True,
        capacity_per_type=1,
    )

    meta_for_assign = meta_df[['gene_tss', 'cluster_id', 'n_reads']].rename(columns={'n_reads': 'size'})

    fig, ax = plot_gene_mapping_heatmap(
        assign_df=assign_df,
        S=S,
        meta=meta_for_assign,
        P=Pc,
        gene_id=gene,
        sort_rows=False
    )

    meta_colored = recolor_meta_by_bulk(assign_df, meta_df, bulk_color_map, gene_id=gene)

    # print(meta_colored)

    plot_centroids_with_shading(
        profiles_df,
        positions,
        meta_colored,
        coverage_df,
        hex_colors,
        proportional_height=False,
        smooth_sigma = 0,
        title= gene + ' Average DNA Methylation Profiles per Cluster - reassigned colors - unweighted clustering - unweighted assignment',
        missingness_threshold=0.5,
    )

    print( assign_df['similarity'].mean())
    sim.append(assign_df['similarity'].mean())

# Average of the per-gene mean similarities.
print(np.mean(sim))


In [None]:
## UNWEIGHTED CLUSTERING + WEIGHTED BOX ASSIGNMENT LOOP OVER ALL GENES
# Per gene: cluster the reads without positional weights, average every cluster into a
# centroid profile, then match the gene clusters to the bulk consensus profiles using
# box-shaped positional weights. The mean assignment similarity of each gene is
# collected in `sim` and averaged once the loop is done.
P_array = profiles_df_bulk_binned.to_numpy(float)  # not referenced again inside this cell

# Clustering settings, identical for every gene.
cosine_clustering_kwargs = dict(
    n_neighbors=15,
    nan_threshold=0.7,
    nan_method='drop',
    scaling=False,
    pca_or_not=False,
    n_pcs=None,             # None = no PCA; int for #PCs; float in (0,1) for variance ratio
    metric='cosine',        # 'euclidean' or 'cosine'
    transform='arcsine',    # 'none', 'logit', or 'arcsine'
    kernel_type='laplacian',
    leiden_resolution=0.8,
    seed=42,
    pos_weights=None,       # no positional weighting during clustering
    weight_stage=None,
)

# Box-shaped positional weights for the assignment step (columns are supplied per gene).
box_weight_kwargs = dict(
    window_bp=400,
    center=0,
    mode='box',
    inside_weight=1.0,
    outside_weight=0.5,
    normalize_mean=True,
)

# Gene-cluster -> bulk-consensus matching settings, identical for every gene.
consensus_assignment_kwargs = dict(
    method='cosine',
    center_rows=True,        # original note: True = pattern-only comparison
    min_overlap=20,          # original note: ensure enough shared bins
    min_similarity=0.5,      # original note: allow 'unassigned' if below threshold
    allow_unassigned=True,
    capacity_per_type=1,
)

sim = []
for i in range(len(gene_df)):
    gene, df_gene = gene_names[i], gene_df[i]

    # 1) Cluster the reads of this gene.
    df_cosine, X_cosine, part, clusters_cosine, metrics_cosine = clustering_final(
        df_gene, **cosine_clustering_kwargs
    )

    # 2) read id -> cluster/colour table, plus its merge with the processed reads.
    read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, colours)
    df_dict_cosine = dict_to_df(read_dict_cosine)
    df_test_cosine = merge(df_proc, df_dict_cosine)

    # 3) Per-cluster centroid profiles for this gene.
    profiles_df, coverage_df, meta_df, positions = compute_gene_centroids(
        df_reads=df_gene,            # reads × bins methylation matrix
        df_map=df_dict_cosine,       # mapping of readid → cluster + color
        gene_tss=gene,
        extra_meta_cols=['gene_tss', 'group', 'cluster'],  # add other non-bin columns here if present
    )

    # 4) Weights are built on the column intersection of gene and bulk profiles,
    #    i.e. only after both frames went through _coerce_sort.
    Gc = _coerce_sort(profiles_df)
    Pc = _coerce_sort(profiles_df_bulk_binned)
    common = np.intersect1d(Gc.columns.values, Pc.columns.values)
    weights_pos = build_positional_weights(columns=common, **box_weight_kwargs)

    # 5) Match gene clusters to the bulk consensus clusters with those weights.
    assign_weighted_df, S, D = assign_gene_clusters_to_consensus(
        G=Gc,
        meta=meta_df,
        P=Pc,
        weights=weights_pos,     # box weights built above (window_bp=400)
        **consensus_assignment_kwargs
    )

    meta_for_assign = meta_df[['gene_tss', 'cluster_id', 'n_reads']].rename(
        columns={'n_reads': 'size'}
    )

    fig, ax = plot_gene_mapping_heatmap(
        assign_df=assign_weighted_df,
        S=S,
        meta=meta_for_assign,
        P=Pc,
        gene_id=gene,
        sort_rows=False,
    )

    # 6) Re-colour the gene clusters with the colours of the bulk meta table.
    bulk_color_map = {
        int(bulk_row['cluster_id']): bulk_row['cluster_color']
        for _, bulk_row in meta_bulk_df.iterrows()
    }
    meta_colored = recolor_meta_by_bulk(
        assign_weighted_df, meta_df, bulk_color_map, gene_id=gene
    )

    plot_centroids_with_shading(
        profiles_df,
        positions,
        meta_colored,
        coverage_df,
        hex_colors,
        proportional_height=False,
        smooth_sigma=0,
        title=gene + ' Average DNA Methylation Profiles per Cluster - reassigned colors - unweighted clustering - weighted box assignment',
        missingness_threshold=0.5,
    )

    # 7) Report and store the mean similarity of this gene's assignments.
    mean_similarity = assign_weighted_df['similarity'].mean()
    print(mean_similarity)
    sim.append(mean_similarity)

print(np.mean(sim))



In [None]:
## UNWEIGHTED CLUSTERING + WEIGHTED GAUSSIAN ASSIGNMENT LOOP OVER ALL GENES
# Per gene: cluster the reads without positional weights, average every cluster into a
# centroid profile, then match the gene clusters to the bulk consensus profiles using
# gaussian positional weights. The mean assignment similarity of each gene is
# collected in `sim` and averaged once the loop is done.
P_array = profiles_df_bulk_binned.to_numpy(float)  # not referenced again inside this cell

# Clustering settings, identical for every gene.
cosine_clustering_kwargs = dict(
    n_neighbors=15,
    nan_threshold=0.7,
    nan_method='drop',
    scaling=False,
    pca_or_not=False,
    n_pcs=None,             # None = no PCA; int for #PCs; float in (0,1) for variance ratio
    metric='cosine',        # 'euclidean' or 'cosine'
    transform='arcsine',    # 'none', 'logit', or 'arcsine'
    kernel_type='laplacian',
    leiden_resolution=0.8,
    seed=42,
    pos_weights=None,       # no positional weighting during clustering
    weight_stage=None,
)

# Gaussian positional weights for the assignment step (columns are supplied per gene).
gaussian_weight_kwargs = dict(
    window_bp=400,
    center=0,
    mode='gaussian',
    inside_weight=1.0,
    outside_weight=0.2,
    normalize_mean=True,
)

# Gene-cluster -> bulk-consensus matching settings, identical for every gene.
consensus_assignment_kwargs = dict(
    method='cosine',
    center_rows=True,        # original note: True = pattern-only comparison
    min_overlap=20,          # original note: ensure enough shared bins
    min_similarity=0.5,      # original note: allow 'unassigned' if below threshold
    allow_unassigned=True,
    capacity_per_type=1,
)

sim = []

for i in range(len(gene_df)):
    gene, df_gene = gene_names[i], gene_df[i]

    # 1) Cluster the reads of this gene.
    df_cosine, X_cosine, part, clusters_cosine, metrics_cosine = clustering_final(
        df_gene, **cosine_clustering_kwargs
    )

    # 2) read id -> cluster/colour table, plus its merge with the processed reads.
    read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, colours)
    df_dict_cosine = dict_to_df(read_dict_cosine)
    df_test_cosine = merge(df_proc, df_dict_cosine)

    # 3) Per-cluster centroid profiles for this gene.
    profiles_df, coverage_df, meta_df, positions = compute_gene_centroids(
        df_reads=df_gene,            # reads × bins methylation matrix
        df_map=df_dict_cosine,       # mapping of readid → cluster + color
        gene_tss=gene,
        extra_meta_cols=['gene_tss', 'group', 'cluster'],  # add other non-bin columns here if present
    )

    # 4) Weights are built on the column intersection of gene and bulk profiles,
    #    i.e. only after both frames went through _coerce_sort.
    Gc = _coerce_sort(profiles_df)
    Pc = _coerce_sort(profiles_df_bulk_binned)
    common = np.intersect1d(Gc.columns.values, Pc.columns.values)
    weights_pos = build_positional_weights(columns=common, **gaussian_weight_kwargs)

    # 5) Match gene clusters to the bulk consensus clusters with those weights.
    assign_weighted_df, S, D = assign_gene_clusters_to_consensus(
        G=Gc,
        meta=meta_df,
        P=Pc,
        weights=weights_pos,     # gaussian weights built above (window_bp=400)
        **consensus_assignment_kwargs
    )

    meta_for_assign = meta_df[['gene_tss', 'cluster_id', 'n_reads']].rename(
        columns={'n_reads': 'size'}
    )

    fig, ax = plot_gene_mapping_heatmap(
        assign_df=assign_weighted_df,
        S=S,
        meta=meta_for_assign,
        P=Pc,
        gene_id=gene,
        sort_rows=False,
    )

    # 6) Re-colour the gene clusters with the colours of the bulk meta table.
    bulk_color_map = {
        int(bulk_row['cluster_id']): bulk_row['cluster_color']
        for _, bulk_row in meta_bulk_df.iterrows()
    }
    meta_colored = recolor_meta_by_bulk(
        assign_weighted_df, meta_df, bulk_color_map, gene_id=gene
    )

    plot_centroids_with_shading(
        profiles_df,
        positions,
        meta_colored,
        coverage_df,
        hex_colors,
        proportional_height=False,
        smooth_sigma=0,
        title=gene + ' Average DNA Methylation Profiles per Cluster - reassigned colors - unweighted clustering - weighted gaussian assignment',
        missingness_threshold=0.5,
    )

    # 7) Report and store the mean similarity of this gene's assignments.
    mean_similarity = assign_weighted_df['similarity'].mean()
    print(mean_similarity)
    sim.append(mean_similarity)

print(np.mean(sim))



In [None]:
## WEIGHTED CLUSTERING + UNWEIGHTED ASSIGNMENT LOOP OVER ALL GENES
# Per gene: cluster the reads with gaussian positional weights, average every cluster
# into a centroid profile, then match the gene clusters to the bulk consensus profiles
# WITHOUT positional weights. The mean assignment similarity of each gene is
# collected in `sim` and averaged once the loop is done.
P_array = profiles_df_bulk_binned.to_numpy(float)  # not referenced again inside this cell

# Gaussian positional weights used for clustering (columns are supplied per gene).
clustering_weight_kwargs = dict(
    window_bp=400,
    center=0,
    mode='gaussian',
    inside_weight=1.0,
    outside_weight=0.2,
    sigma_bp=None,           # original note: default window_bp/2
    normalize_mean=True,     # original note: average weight ~ 1.0
)

# Clustering settings shared by every gene (pos_weights is passed per gene).
# NOTE(review): weight_stage is 'post_pca' although pca_or_not=False, while the two
# weighted-assignment cells use 'pre_pca' - confirm against clustering_final that this
# is the intended stage.
cosine_clustering_kwargs = dict(
    n_neighbors=15,
    nan_threshold=0.7,
    nan_method='drop',
    scaling=False,
    pca_or_not=False,
    n_pcs=None,             # None = no PCA; int for #PCs; float in (0,1) for variance ratio
    metric='cosine',        # 'euclidean' or 'cosine'
    transform='arcsine',    # 'none', 'logit', or 'arcsine'
    kernel_type='laplacian',
    leiden_resolution=0.8,
    seed=42,
    weight_stage='post_pca',
)

# Gene-cluster -> bulk-consensus matching settings, identical for every gene.
consensus_assignment_kwargs = dict(
    method='cosine',
    center_rows=True,        # original note: True = pattern-only comparison
    weights=None,            # unweighted assignment: no positional emphasis
    min_overlap=20,          # original note: ensure enough shared bins
    min_similarity=0.5,      # original note: allow 'unassigned' if below threshold
    allow_unassigned=True,
    capacity_per_type=1,
)

sim = []
for i in range(len(gene_df)):
    gene, df_gene = gene_names[i], gene_df[i]

    # 1) Positional weights over this gene's columns, used only for clustering.
    weights_pos = build_positional_weights(
        columns=df_gene.columns, **clustering_weight_kwargs
    )

    # 2) Cluster the reads of this gene with those weights.
    df_cosine, X_cosine, part, clusters_cosine, metrics_cosine = clustering_final(
        df_gene,
        pos_weights=weights_pos,     # <-- emphasize central bins
        **cosine_clustering_kwargs
    )

    # 3) read id -> cluster/colour table, plus its merge with the processed reads.
    read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, colours)
    df_dict_cosine = dict_to_df(read_dict_cosine)
    df_test_cosine = merge(df_proc, df_dict_cosine)

    # 4) Per-cluster centroid profiles for this gene.
    profiles_df, coverage_df, meta_df, positions = compute_gene_centroids(
        df_reads=df_gene,            # reads × bins methylation matrix
        df_map=df_dict_cosine,       # mapping of readid → cluster + color
        gene_tss=gene,
        extra_meta_cols=['gene_tss', 'group', 'cluster'],  # add other non-bin columns here if present
    )

    # 5) Coerce/sort both profile frames. `common` is still computed, but the
    #    unweighted assignment below does not consume it (the box-weight variant that
    #    used it is disabled in this cell).
    Gc = _coerce_sort(profiles_df)
    Pc = _coerce_sort(profiles_df_bulk_binned)
    common = np.intersect1d(Gc.columns.values, Pc.columns.values)

    # 6) Match gene clusters to the bulk consensus clusters, unweighted.
    assign_weighted_df, S, D = assign_gene_clusters_to_consensus(
        G=Gc,
        meta=meta_df,
        P=Pc,
        **consensus_assignment_kwargs
    )

    meta_for_assign = meta_df[['gene_tss', 'cluster_id', 'n_reads']].rename(
        columns={'n_reads': 'size'}
    )

    fig, ax = plot_gene_mapping_heatmap(
        assign_df=assign_weighted_df,
        S=S,
        meta=meta_for_assign,
        P=Pc,
        gene_id=gene,
        sort_rows=False,
    )

    # 7) Re-colour the gene clusters with the colours of the bulk meta table.
    bulk_color_map = {
        int(bulk_row['cluster_id']): bulk_row['cluster_color']
        for _, bulk_row in meta_bulk_df.iterrows()
    }
    meta_colored = recolor_meta_by_bulk(
        assign_weighted_df, meta_df, bulk_color_map, gene_id=gene
    )

    plot_centroids_with_shading(
        profiles_df,
        positions,
        meta_colored,
        coverage_df,
        hex_colors,
        proportional_height=False,
        smooth_sigma=0,
        title=gene + ' Average DNA Methylation Profiles per Cluster - reassigned colors - weighted clustering - unweighted assignment',
        missingness_threshold=0.5,
    )

    # 8) Report and store the mean similarity of this gene's assignments.
    mean_similarity = assign_weighted_df['similarity'].mean()
    print(mean_similarity)
    sim.append(mean_similarity)

print(np.mean(sim))


In [None]:
## WEIGHTED CLUSTERING + WEIGHTED GAUSSIAN ASSIGNMENT LOOP OVER ALL GENES
# Per gene: cluster the reads with gaussian positional weights, average every cluster
# into a centroid profile, then match the gene clusters to the bulk consensus profiles
# with gaussian positional weights as well. The mean assignment similarity of each gene
# is collected in `sim` and averaged once the loop is done.
P_array = profiles_df_bulk_binned.to_numpy(float)  # not referenced again inside this cell

# Gaussian weight shape shared by the clustering and the assignment step.
gaussian_weight_kwargs = dict(
    window_bp=400,
    center=0,
    mode='gaussian',
    inside_weight=1.0,
    outside_weight=0.2,
    normalize_mean=True,     # original note: average weight ~ 1.0
)

# Clustering settings shared by every gene (pos_weights is passed per gene).
cosine_clustering_kwargs = dict(
    n_neighbors=15,
    nan_threshold=0.7,
    nan_method='drop',
    scaling=False,
    pca_or_not=False,
    n_pcs=None,             # None = no PCA; int for #PCs; float in (0,1) for variance ratio
    metric='cosine',        # 'euclidean' or 'cosine'
    transform='arcsine',    # 'none', 'logit', or 'arcsine'
    kernel_type='laplacian',
    leiden_resolution=0.8,
    seed=42,
    weight_stage='pre_pca',
)

# Gene-cluster -> bulk-consensus matching settings, identical for every gene.
consensus_assignment_kwargs = dict(
    method='cosine',
    center_rows=True,        # original note: True = pattern-only comparison
    min_overlap=20,          # original note: ensure enough shared bins
    min_similarity=0.5,      # original note: allow 'unassigned' if below threshold
    allow_unassigned=True,
    capacity_per_type=1,
)

sim = []
for i in range(len(gene_df)):
    gene, df_gene = gene_names[i], gene_df[i]

    # 1) Positional weights over this gene's columns, used for clustering.
    weights_pos = build_positional_weights(
        columns=df_gene.columns,
        sigma_bp=None,           # original note: default window_bp/2
        **gaussian_weight_kwargs
    )

    # 2) Cluster the reads of this gene with those weights.
    df_cosine, X_cosine, part, clusters_cosine, metrics_cosine = clustering_final(
        df_gene,
        pos_weights=weights_pos,     # <-- emphasize central bins
        **cosine_clustering_kwargs
    )

    partition_cosine = np.asarray(part.membership, dtype=int)

    # 3) read id -> cluster/colour table, plus its merge with the processed reads.
    read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, colours)
    df_dict_cosine = dict_to_df(read_dict_cosine)
    df_test_cosine = merge(df_proc, df_dict_cosine)

    # 4) Per-cluster centroid profiles for this gene.
    profiles_df, coverage_df, meta_df, positions = compute_gene_centroids(
        df_reads=df_gene,            # reads × bins methylation matrix
        df_map=df_dict_cosine,       # mapping of readid → cluster + color
        gene_tss=gene,
        extra_meta_cols=['gene_tss', 'group', 'cluster'],  # add other non-bin columns here if present
    )

    # 5) Assignment weights are rebuilt on the column intersection of gene and bulk
    #    profiles, i.e. only after both frames went through _coerce_sort.
    #    `weights_pos` is deliberately rebound from clustering to assignment weights.
    Gc = _coerce_sort(profiles_df)
    Pc = _coerce_sort(profiles_df_bulk_binned)
    common = np.intersect1d(Gc.columns.values, Pc.columns.values)
    weights_pos = build_positional_weights(columns=common, **gaussian_weight_kwargs)

    # 6) Match gene clusters to the bulk consensus clusters with those weights.
    assign_weighted_df, S, D = assign_gene_clusters_to_consensus(
        G=Gc,
        meta=meta_df,
        P=Pc,
        weights=weights_pos,     # gaussian weights built above (window_bp=400)
        **consensus_assignment_kwargs
    )

    meta_for_assign_weighted = meta_df[['gene_tss', 'cluster_id', 'n_reads']].rename(
        columns={'n_reads': 'size'}
    )

    fig, ax = plot_gene_mapping_heatmap(
        assign_df=assign_weighted_df,
        S=S,
        meta=meta_for_assign_weighted,
        P=Pc,
        gene_id=gene,
        sort_rows=False,
    )

    # 7) Re-colour the gene clusters with the colours of the bulk meta table.
    bulk_color_map = {
        int(bulk_row['cluster_id']): bulk_row['cluster_color']
        for _, bulk_row in meta_bulk_df.iterrows()
    }
    meta_colored = recolor_meta_by_bulk(
        assign_weighted_df, meta_df, bulk_color_map, gene_id=gene
    )

    # Title typo fixed: previously read "weigthed gausian assignment".
    plot_centroids_with_shading(
        profiles_df,
        positions,
        meta_colored,
        coverage_df,
        hex_colors,
        smooth_sigma=0,
        title=gene + ' Average DNA Methylation Profiles per Cluster - reassigned colors - weighted clustering - weighted gaussian assignment',
        missingness_threshold=0.5,
        proportional_height=False,
    )

    # 8) Report and store the mean similarity of this gene's assignments.
    mean_similarity = assign_weighted_df['similarity'].mean()
    print(mean_similarity)
    sim.append(mean_similarity)

print(np.mean(sim))


In [None]:
## WEIGHTED CLUSTERING + WEIGHTED BOX ASSIGNMENT LOOP OVER ALL GENES
# Per gene: cluster the reads with gaussian positional weights, average every cluster
# into a centroid profile, then match the gene clusters to the bulk consensus profiles
# with box-shaped positional weights. The mean assignment similarity of each gene is
# collected in `sim` and averaged once the loop is done.
P_array = profiles_df_bulk_binned.to_numpy(float)  # not referenced again inside this cell

# Gaussian positional weights used for clustering (columns are supplied per gene).
clustering_weight_kwargs = dict(
    window_bp=400,
    center=0,
    mode='gaussian',
    inside_weight=1.0,
    outside_weight=0.2,
    sigma_bp=None,           # original note: default window_bp/2
    normalize_mean=True,     # original note: average weight ~ 1.0
)

# Box-shaped positional weights used for the assignment (columns supplied per gene).
box_weight_kwargs = dict(
    window_bp=400,
    center=0,
    mode='box',
    inside_weight=1.0,
    outside_weight=0.5,
    normalize_mean=True,
)

# Clustering settings shared by every gene (pos_weights is passed per gene).
cosine_clustering_kwargs = dict(
    n_neighbors=15,
    nan_threshold=0.7,
    nan_method='drop',
    scaling=False,
    pca_or_not=False,
    n_pcs=None,             # None = no PCA; int for #PCs; float in (0,1) for variance ratio
    metric='cosine',        # 'euclidean' or 'cosine'
    transform='arcsine',    # 'none', 'logit', or 'arcsine'
    kernel_type='laplacian',
    leiden_resolution=0.8,
    seed=42,
    weight_stage='pre_pca',
)

# Gene-cluster -> bulk-consensus matching settings, identical for every gene.
consensus_assignment_kwargs = dict(
    method='cosine',
    center_rows=True,        # original note: True = pattern-only comparison
    min_overlap=20,          # original note: ensure enough shared bins
    min_similarity=0.5,      # original note: allow 'unassigned' if below threshold
    allow_unassigned=True,
    capacity_per_type=1,
)

sim = []
for i in range(len(gene_df)):
    gene, df_gene = gene_names[i], gene_df[i]

    # 1) Positional weights over this gene's columns, used for clustering.
    weights_pos = build_positional_weights(
        columns=df_gene.columns, **clustering_weight_kwargs
    )

    # 2) Cluster the reads of this gene with those weights.
    df_cosine, X_cosine, part, clusters_cosine, metrics_cosine = clustering_final(
        df_gene,
        pos_weights=weights_pos,     # <-- emphasize central bins
        **cosine_clustering_kwargs
    )

    partition_cosine = np.asarray(part.membership, dtype=int)

    # 3) read id -> cluster/colour table, plus its merge with the processed reads.
    read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, colours)
    df_dict_cosine = dict_to_df(read_dict_cosine)
    df_test_cosine = merge(df_proc, df_dict_cosine)

    # 4) Per-cluster centroid profiles for this gene.
    profiles_df, coverage_df, meta_df, positions = compute_gene_centroids(
        df_reads=df_gene,            # reads × bins methylation matrix
        df_map=df_dict_cosine,       # mapping of readid → cluster + color
        gene_tss=gene,
        extra_meta_cols=['gene_tss', 'group', 'cluster'],  # add other non-bin columns here if present
    )

    # 5) Assignment weights are rebuilt on the column intersection of gene and bulk
    #    profiles, i.e. only after both frames went through _coerce_sort.
    #    `weights_pos` is deliberately rebound from clustering to assignment weights.
    Gc = _coerce_sort(profiles_df)
    Pc = _coerce_sort(profiles_df_bulk_binned)
    common = np.intersect1d(Gc.columns.values, Pc.columns.values)
    weights_pos = build_positional_weights(columns=common, **box_weight_kwargs)

    # 6) Match gene clusters to the bulk consensus clusters with those weights.
    assign_weighted_df, S, D = assign_gene_clusters_to_consensus(
        G=Gc,
        meta=meta_df,
        P=Pc,
        weights=weights_pos,     # box weights built above (window_bp=400)
        **consensus_assignment_kwargs
    )

    meta_for_assign_weighted = meta_df[['gene_tss', 'cluster_id', 'n_reads']].rename(
        columns={'n_reads': 'size'}
    )

    fig, ax = plot_gene_mapping_heatmap(
        assign_df=assign_weighted_df,
        S=S,
        meta=meta_for_assign_weighted,
        P=Pc,
        gene_id=gene,
        sort_rows=False,
    )

    # 7) Re-colour the gene clusters with the colours of the bulk meta table.
    bulk_color_map = {
        int(bulk_row['cluster_id']): bulk_row['cluster_color']
        for _, bulk_row in meta_bulk_df.iterrows()
    }
    meta_colored = recolor_meta_by_bulk(
        assign_weighted_df, meta_df, bulk_color_map, gene_id=gene
    )

    # Title typo fixed: previously read "weigthed box assignment".
    plot_centroids_with_shading(
        profiles_df,
        positions,
        meta_colored,
        coverage_df,
        hex_colors,
        smooth_sigma=0,
        title=gene + ' Average DNA Methylation Profiles per Cluster - reassigned colors - weighted clustering - weighted box assignment',
        missingness_threshold=0.5,
        proportional_height=False,
    )

    # 8) Report and store the mean similarity of this gene's assignments.
    mean_similarity = assign_weighted_df['similarity'].mean()
    print(mean_similarity)
    sim.append(mean_similarity)

print(np.mean(sim))


  

In [None]:
# Define pipeline configurations, run them on every gene and compare the metrics.

# Columns used to build the positional weights of the 'cosine_weighted' pipeline.
# This cell used to read `df_gene.columns`, i.e. whatever frame a previously executed
# loop cell left behind in the kernel. When the notebook is run top to bottom that is
# the last entry of `gene_df`, so it is referenced explicitly to remove the hidden state.
# NOTE(review): a single weight vector is shared by all genes - this assumes every
# per-gene frame has the same columns; confirm.
weighted_pipeline_columns = gene_df[-1].columns

# Metrics compared across pipelines (used for the NaN check and for the grid plot).
comparison_metrics = ('silhouette', 'calinski_harabasz', 'davies_bouldin',
                      'leiden_quality', 'modularity', 'n_clusters')

# NOTE(review): 'euclidean' and 'cosine' use leiden_resolution=1.0 while
# 'cosine_weighted' uses 0.8 - confirm that this difference is intended for the comparison.
configs = [
    # --- PCA + Euclidean (no scaling) ---
    dict(
        name='euclidean',
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,
        pca_or_not=True,
        n_pcs=None,  # NOTE(review): originally annotated "keep 95% variance"; depends on how clustering_final treats None when pca_or_not=True - confirm
        metric='euclidean',
        transform='arcsine',  # can also use 'logit' or 'none'
        kernel_type='laplacian',
        leiden_resolution=1.0,
        seed=42
    ),

    # --- No PCA + Cosine (shape-based) ---
    dict(
        name='cosine',
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,  # keep raw 0..1; cosine similarity handles scale
        pca_or_not=False,
        n_pcs=None,
        metric='cosine',
        transform='arcsine',  # same transform for fairness
        kernel_type='laplacian',
        leiden_resolution=1.0,
        seed=42
    ),

    # --- No PCA + Cosine + Positional Weights ---
    dict(
        name='cosine_weighted',
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,
        pca_or_not=False,
        n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
        metric='cosine',          # 'euclidean' or 'cosine'
        transform='arcsine',            # 'none', 'logit', or 'arcsine'
        kernel_type='laplacian',
        leiden_resolution=0.8,
        seed=42,
        pos_weights=build_positional_weights(
            columns=weighted_pipeline_columns,
            window_bp=400,
            center=0,
            mode='gaussian',
            inside_weight=1.0,
            outside_weight=0.2,
            sigma_bp=None,           # default window_bp/2
            normalize_mean=True      # average weight ~ 1.0
        ),
        weight_stage='post_pca'
    )

]

# Run the clustering pipelines on the per-gene frames (gene_names[i] <-> gene_df[i]).
results_df, outputs = run_pipelines_on_genes(gene_names, gene_df, configs)

# How many result rows each pipeline produced.
print(results_df['pipeline'].value_counts())

# For every metric, report per pipeline whether it is NaN for ALL genes.
for m in comparison_metrics:
    if m in results_df.columns:
        print(m, results_df.pivot_table(index='gene', columns='pipeline', values=m, aggfunc='first').isna().all())
# Plot violin + scatter comparison of clustering metrics
# figs = plot_violin_scatter(results_df)

# A bare expression in the middle of a cell is not echoed, so the preview is
# displayed explicitly.
display(results_df.head(20))

fig, axes = plot_compare_pipelines_grid(results_df,
                                        pipelines_order=('euclidean', 'cosine', 'cosine_weighted'),
                                        metrics=comparison_metrics,
                                        kind='violin',  # or 'box'
                                        show_points=True,
                                        connect_pairs=True)

In [None]:
# === Compute and plot multi-metric silhouette scores for the existing results ===
# Requires that an earlier cell already ran
#   results_df, outputs = run_pipelines_on_genes(gene_names, gene_df, configs)
# and that outputs[gene][pipeline] holds the keys 'X_pca' and 'clusters'.

# 1) Build the embedding dict: (gene, pipeline) -> {X, labels, metric}
# Pipelines whose name is itself a metric map onto that metric; any other pipeline
# name (e.g. 'cosine_weighted') falls back to the 'metric' entry of its config.
metric_map = dict(euclidean='euclidean', cosine='cosine', correlation='correlation')

embed_dict = {}
for gene in gene_names:
    if gene in outputs:
        for cfg in configs:
            pipeline = cfg['name']  # e.g. 'euclidean', 'cosine', 'cosine_weighted'
            if pipeline in outputs[gene]:
                run = outputs[gene][pipeline]
                # Only runs that expose both the embedding and the labels are usable.
                if 'X_pca' in run and 'clusters' in run:
                    embed_dict[(gene, pipeline)] = {
                        'X': run['X_pca'],
                        'labels': run['clusters'],
                        'metric': metric_map.get(pipeline, cfg.get('metric', 'euclidean'))
                    }

# 2) Add extra silhouette columns (silhouette_cosine, silhouette_correlation, silhouette_euclidean)
results_with_sil = add_silhouette_columns(results_df, embed_dict)

# 3A) One plot per metric.
# NOTE(review): called with the default `metrics`; whether the new 'silhouette_*'
# columns are picked up depends on the plot_violin_scatter definition in effect - confirm.
figs_dict = plot_violin_scatter(results_with_sil)
# Save (optional)
# for metric_name, fig in figs_dict.items():
#     fig.savefig(f"violin_{metric_name}.png", dpi=300, bbox_inches='tight')

# 3B) One grid figure. Only the base metrics are passed explicitly here; the
# 'silhouette_*' columns are not listed in `metrics`.
fig, axes = plot_compare_pipelines_grid(
    results_df=results_with_sil,
    pipelines_order=('euclidean', 'cosine', 'cosine_weighted'),
    metrics=('calinski_harabasz', 'davies_bouldin', 'leiden_quality', 'modularity', 'n_clusters'),
    kind='violin',
    show_points=True,
    connect_pairs=True,
)


# -------------------------------------------------------------------------
# With the older plot_compare_pipelines_grid signature (without
# include_silhouettes), pass the silhouette_* columns explicitly:
# -------------------------------------------------------------------------
# sil_cols = [c for c in results_with_sil.columns if c.startswith('silhouette_')]
# fig, axes = plot_compare_pipelines_grid(
#     results_df=results_with_sil,
#     pipelines_order=('euclidean', 'cosine'),
#     metrics=tuple(['calinski_harabasz','davies_bouldin','leiden_quality','modularity','n_clusters'] + sil_cols),
#     kind='violin',
#     show_points=True,
#     connect_pairs=True
# )

In [None]:
# Swap the 'gene_tss' and 'group' index levels of the filtered frame and sort by the
# resulting index, so rows are ordered by the new level arrangement.
# (An optional `.reset_index(level='gene_tss')` would move 'gene_tss' into a column.)
df_group_first = df_filtered.swaplevel(i='gene_tss', j='group').sort_index()

df_group_first.head()

In [None]:
# Split the group-first frame into parallel lists: group names and per-group frames.
group_names, group_df = get_groups_df_list(df_group_first)

# Only the last expression of a cell is echoed; the first group's frame was silently
# discarded before, so it is displayed explicitly.
display(group_df[0])
group_names[0]

In [None]:
# Split the first group's frame into parallel lists of gene names and per-gene frames,
# then show the frame of the first gene as a quick check.
genes_of_first_group = get_genes_list(group_df[0])
gene_names_group, gene_df_group = genes_of_first_group
gene_df_group[0]

In [None]:
# For every group, and every gene inside that group: cluster the reads
# (PCA + euclidean), plot the per-cluster centroid profiles and plot the single reads
# coloured by cluster.
for i in range(len(group_df)):
    gene_names_group, gene_df_group = get_genes_list(group_df[i])
    for k in range(len(gene_df_group)):
        gene = gene_names_group[k]
        df_group = gene_df_group[k]

        # Label used in figure titles. The original title glued the group name and the
        # gene together without any separator.
        group_gene_label = group_names[i] + ' - ' + gene

        # `df_group` is rebound to the frame returned by clustering_final.
        df_group, X_group, partition_group, clusters_group, metrics_group = clustering_final(
            df_group,
            n_neighbors=10,
            nan_threshold=0.7,
            nan_method='drop',
            scaling=False,
            pca_or_not=True,
            n_pcs=None,              # None = no PCA; int for #PCs; float in (0,1) for variance ratio
            metric='euclidean',      # 'euclidean' or 'cosine'
            transform='arcsine',     # 'none', 'logit', or 'arcsine'
            kernel_type='laplacian',
            leiden_resolution=1.1,
            seed=42
        )

        # (A UMAP visualisation via plot_umap was disabled in this loop.)

        # read id -> cluster/colour table, plus its merge with the processed reads.
        read_dict_group = dict_id_cluster_color(df_group, clusters_group, hex_colors)

        df_dict_group = dict_to_df(read_dict_group)
        df_test_group = merge(df_proc, df_dict_group)

        profiles_df_group, coverage_df_group, meta_df_group, positions_group = compute_gene_centroids(
            df_group,
            df_dict_group,
            group=group_names[i],
        )

        plot_centroids_with_shading(
            profiles_df_group,
            positions_group,
            meta_df_group,
            coverage_df_group,
            hex_colors,
            smooth_sigma=0,
            title='Average DNA Methylation Profiles per Cluster - GROUP: ' + group_gene_label,
            missingness_threshold=0.5,
        )

        # Long format for plotting
        df_test_plot_group = preprocess_long_for_plot(
            df_test_group,
            include_locus_cluster=True,
            filter_outliers=True,
            max_span_bp=2000,     # fixed span cap; the span_quantile filter is disabled here
            # span_quantile=0.999,
            require_center_inside=True,
            min_cpg=1,
            cpg_window_bp=5000
        )

        # Plot the reads, labelled with the group/gene of this iteration (previously the
        # fixed string 'all genes').
        # NOTE(review): assumes `merge` keeps only the reads listed in df_dict_group - confirm.
        plot_reads_long(
            df_test_plot_group,
            color_by="locus_cluster",
            gene=group_gene_label,
            hex_colors=hex_colors
        )

           

In [None]:
# Inspect `df_group` as left behind by the last iteration of the group/gene loop
# (there it is the frame returned by clustering_final).
df_group

In [None]:
# First 50 rows of the centroid profiles returned by compute_gene_centroids in the
# last iteration of the group/gene loop.
profiles_df_group.head(50)

In [None]:
# First 50 rows of the cluster metadata returned by compute_gene_centroids in the
# last iteration of the group/gene loop.
meta_df_group.head(50)

In [None]:
# Positions returned by compute_gene_centroids in the last iteration of the
# group/gene loop.
positions_group

In [None]:
# Re-plot the centroid profiles of the LAST group/gene processed by the loop above:
# every *_group variable here is whatever that loop's final iteration left behind.
plot_centroids_with_shading(
    profiles_df_group,
    positions_group,
    meta_df_group,
    coverage_df_group,
    hex_colors,
    smooth_sigma = 0,  # presumably disables smoothing - TODO confirm in plot_centroids_with_shading
    title='Average DNA Methylation Profiles per Cluster',
    missingness_threshold=0.5,
)

In [None]:
# Cluster ALL filtered reads at once (cosine metric on PCA output, no transform), then
# plot the reads coloured by cluster and the per-cluster centroid profiles.
df_filtered_c, X_filtered_c, partition_filtered_c, clusters_filtered_c, metrics_filtered_c = clustering_final(
        df_filtered,
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,
        pca_or_not=True,
        n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
        metric='cosine',          # 'euclidean' or 'cosine' 
        transform='none',            # 'none', 'logit', or 'arcsine'
        kernel_type='laplacian',
        leiden_resolution=1.1,
        seed=42
    )

# Disabled: UMAP view of the clustering.
# embedding_filtered_c, fig_filtered_c = plot_umap(
#         X_filtered_c,
#         clusters_filtered_c,
#         n_neighbors=25,
#         min_dist=0.1,
#         metric='cosine',      
#         transform=None,          # None | 'logit' | 'arcsine' (use only if X are raw proportions)
#         n_pcs=None,              # set if X are raw features; None if X are already PCs
#         standardize=False,       # True if using raw features without PCA
#         seed=42,
#         palette=hex_colors,            # optional list/array or matplotlib colormap name
#         title="ALL FILTERED GENES Cosine",
#         gene='ALL GENES'
#     )

# Disabled: silhouette scores under several distance metrics.
# print(metrics_filtered_c)

# multi_sil_filtered_c= silhouettes_multi(X_filtered_c, clusters_filtered_c, metric_main='cosine', extra_metrics=['euclidean', 'correlation', 'jaccard'])

# print(f"Silhouette (Cosine): {metrics_filtered_c['silhouette']:.3f}")
# print(f"Silhouette (Cosine) - Euclidean: {multi_sil_filtered_c['silhouette_euclidean']:.3f}")
# print(f"Silhouette (Cosine) - Correlation: {multi_sil_filtered_c['silhouette_correlation']:.3f}")
# print(f"Silhouette (Cosine) - Jaccard: {multi_sil_filtered_c['silhouette_jaccard']:.3f}")


# read id -> cluster/colour table, plus its merge with the processed reads.
read_dict_filtered_c = dict_id_cluster_color(df_filtered_c, clusters_filtered_c, hex_colors)

df_dict_filtered_c = dict_to_df(read_dict_filtered_c)
df_test_filtered_c= merge(df_proc, df_dict_filtered_c)

# Long format for plotting
df_test_plot_filtered_c = preprocess_long_for_plot(df_test_filtered_c,
                                                          include_locus_cluster=True,
                                                          filter_outliers= True,
                                                          max_span_bp=None,     # no fixed span cap: outlier filtering relies on span_quantile
                                                          span_quantile=0.99,
                                                          require_center_inside=True,
                                                          min_cpg=1,
                                                          cpg_window_bp=5000
                                                          )

    # Plot the reads of the three untreated groups, coloured by cluster

plot_reads_long(
        df_test_plot_filtered_c,
        filters={
            # "gene_tss": gene, 
            'group':['BT474_mV_high_Untreated',
            'BT474_mV_low_Untreated',
            'BT474_mV_Untreated_Unsorted']},
        # facet_by='group',
        color_by="locus_cluster",
        gene='all genes - cosine',
        hex_colors=hex_colors,
        figsize= (12,10)
    )

# Disabled: single-molecule occupancy plot (a later cell runs it with cluster colours).
# plot_single_molecule_occupancy_rlike(
#     df=df_test_plot_filtered_c,
#     # cluster_colors=cluster_to_color,
#     xlim=(-1100, 1100),
#     stripe_lw=1.5,
#     point_size=0.6,
#     figsize_cm=(12, 10),
#     dpi=300,
    
# )

# Rows of the clustered frame belonging to the three untreated groups.
# Not used further in this cell: its only consumers are the disabled lines below.
df_only_interesting_groups_filtered_c = df_filtered_c[df_filtered_c.index.get_level_values('group').isin([
        'BT474_mV_high_Untreated',
        'BT474_mV_low_Untreated',
        'BT474_mV_Untreated_Unsorted'
    ])]

# df_only_interesting_groups_filtered_c = merge(df_only_interesting_groups_filtered_c, df_dict_filtered_c)
# clusters_of_interest_filtered_c = df_only_interesting_groups_filtered_c['locus_cluster'].tolist() 
# summary(df_only_interesting_groups_filtered_c, clusters_of_interest_filtered_c, gene='all genes - cosine')

# NOTE(review): `gene` is not defined in this cell; it is whatever an earlier loop cell
# left in the kernel (hidden state). It is passed as gene_tss and prefixed to the plot
# title even though the clustering above ran on all filtered genes - confirm this is
# intended, or replace it with an explicit value.
profiles_df_filtered_c, coverage_df_filtered_c, meta_df_filtered_c, positions_filtered_c = compute_gene_centroids(
                                                                        df_filtered_c,
                                                                        df_dict_filtered_c,
                                                                        gene_tss=gene,
                                                                    )

plot_centroids_with_shading(
        profiles_df_filtered_c,
        positions_filtered_c,
        meta_df_filtered_c,
        coverage_df_filtered_c,
        hex_colors,
        smooth_sigma = 0,
        title= gene + ' : Average DNA Methylation Profiles per Cluster - COSINE ',
        missingness_threshold=0.5,
    )

In [None]:
# Cluster id (cast to int) -> colour, gathered from the 'cluster' and 'color'
# fields of every entry in read_dict_filtered_c.
cluster_to_color = dict(
    (int(entry["cluster"]), entry["color"])
    for entry in read_dict_filtered_c.values()
)

# Expose the 'cluster' column under the name 'order' as well.
df_test_plot_filtered_c['order'] = df_test_plot_filtered_c['cluster']

# Show the mapping as the cell output.
cluster_to_color

In [None]:
# Single-molecule occupancy plot of the long-format read table, coloured with
# the cluster -> colour mapping built above.
#
# Disabled alternative (same df / cluster_colors), kept for reference:
# plot_single_molecule_occupancy(df=df_test_plot_filtered_c, cluster_colors=cluster_to_color,
#                                xlim=(-1000, 1000), point_size=2, figsize_cm=(15, 9), dpi=280)
plot_single_molecule_occupancy_rlike(
    df=df_test_plot_filtered_c,
    cluster_colors=cluster_to_color,
    n_clusters=len(cluster_to_color),   # one entry per distinct cluster id in the mapping
    xlim=(-1100, 1100),
    figsize_cm=(12, 10),
    dpi=300,
    point_size=0.6,
    stripe_lw=1.5,
)

In [None]:
# Position of the gene analysed in this cell; the same index is used for both
# gene_df (its read matrix) and gene_names (its label), so it is set once here.
gene_idx_cosine = 78

# Clustering without PCA: arcsine transform, cosine metric, laplacian kernel.
df_cosine, X_cosine, partition_cosine, clusters_cosine, metrics_cosine = clustering_final(
        gene_df[gene_idx_cosine],
        n_neighbors=15,
        nan_threshold=0.7,
        nan_method='drop',
        scaling=False,
        pca_or_not=False,
        n_pcs=None,           # None = no PCA; int for #PCs; float in (0,1) for variance ratio
        metric='cosine',          # 'euclidean' or 'cosine'
        transform='arcsine',            # 'none', 'logit', or 'arcsine'
        kernel_type='laplacian',
        leiden_resolution=0.8,
        seed=42
    )

# UMAP of the clustering input, coloured by the cluster labels.
embedding_cosine, fig_cosine = plot_umap(
        X_cosine,
        clusters_cosine,
        n_neighbors=25,
        min_dist=0.1,
        metric='cosine',      # for Pipeline 2, use 'euclidean'
        transform=None,          # None | 'logit' | 'arcsine' (use only if X are raw proportions)
        n_pcs=None,              # set if X are raw features; None if X are already PCs
        standardize=False,       # True if using raw features without PCA
        seed=42,
        palette=hex_colors,            # optional list/array or matplotlib colormap name
        title="NO PCA + cosine + laplacian",
        gene=gene_names[gene_idx_cosine]
    )

# print(metrics_cosine)

# multi_sil_cosine= silhouettes_multi(X_cosine, clusters_cosine, metric_main='cosine', extra_metrics=['euclidean', 'correlation', 'jaccard'])

# print(f"Silhouette (Cosine): {metrics_cosine['silhouette']:.3f}")
# print(f"Silhouette (Cosine) - Euclidean: {multi_sil_cosine['silhouette_euclidean']:.3f}")
# print(f"Silhouette (Cosine) - Correlation: {multi_sil_cosine['silhouette_correlation']:.3f}")
# print(f"Silhouette (Cosine) - Jaccard: {multi_sil_cosine['silhouette_jaccard']:.3f}")


# Per-read {cluster, colour} mapping, as a dict and as a DataFrame, then attached
# to the processed table.
read_dict_cosine = dict_id_cluster_color(df_cosine, clusters_cosine, hex_colors)

df_dict_cosine = dict_to_df(read_dict_cosine)
df_test_cosine = merge(df_proc, df_dict_cosine)

# Per-cluster centroid profiles for the same gene that was clustered above.
profiles_df, coverage_df, meta_df, positions = compute_gene_centroids(
    df_reads=gene_df[gene_idx_cosine],        # reads × bins methylation matrix
    df_map=df_dict_cosine,        # mapping of readid → cluster + color
    gene_tss=gene_names[gene_idx_cosine],
    extra_meta_cols=['gene_tss', 'group', 'cluster']  # add other non-bin columns here if present
)

plot_centroids_with_shading(
    profiles_df,
    positions,
    meta_df,
    coverage_df,
    hex_colors,
    smooth_sigma=0,
    title='Average DNA Methylation Profiles per Cluster',
    missingness_threshold=0.5,
)

In [None]:
# Match the filtered-cosine gene-cluster profiles against the binned bulk
# profiles, with center_rows switched on.
assign_df_bulk_to_filtered, S, D = assign_gene_clusters_to_consensus(
    G=profiles_df_filtered_c,        # rows = gene clusters, cols = common positions
    P=profiles_df_bulk_binned,       # rows = bulk clusters, cols = common positions
    meta=meta_df_filtered_c,         # aligned to G rows
    method='cosine',
    center_rows=True,
    min_overlap=40,
    min_similarity=0,
    capacity_per_type=1,
    allow_unassigned=True,
    # weights=weights,               # optional pd.Series indexed by common positions
)

# Metadata passed to the mapping plots: three columns of the centroid metadata,
# with 'n_reads' exposed under the name 'size'.
meta_for_assign_bulk_to_filtered = (
    meta_df_filtered_c
    .loc[:, ['gene_tss', 'cluster_id', 'n_reads']]
    .rename(columns={'n_reads': 'size'})
)

# Show the assignment table as the cell output.
assign_df_bulk_to_filtered

In [None]:
# Heatmap of the gene-cluster -> bulk-cluster mapping for one gene.
# NOTE(review): gene_id is hard-coded here while the recolor step below uses
# `gene`; confirm both refer to the same gene.
fig, ax = plot_gene_mapping_heatmap(
    assign_df=assign_df_bulk_to_filtered,
    S=S,
    meta=meta_for_assign_bulk_to_filtered,
    P=profiles_df_bulk_binned,
    gene_id='SETD1A_30958295_30958295',
    sort_rows=False
)

# Assignment summary across genes
fig2, ax2 = plot_assignment_summary(assign_df_bulk_to_filtered)

# One colour per bulk (consensus) cluster, keyed by the row index of the same
# bulk profile frame that was passed as P above. Colours are taken from
# hex_colors and cycled if there are more bulk clusters than colours.
# (Previously every id was mapped to the literal placeholder '#hexcolor', which
# is not a valid colour, and the ids came from `P_df` rather than the P used here.)
# Assumes hex_colors is an indexable sequence of colour strings - TODO confirm.
bulk_color_map = {
    cons_id: hex_colors[i % len(hex_colors)]
    for i, cons_id in enumerate(profiles_df_bulk_binned.index)
}

# Recolor the centroid metadata according to the assigned bulk cluster.
meta_colored = recolor_meta_by_bulk(assign_df_bulk_to_filtered, meta_df_filtered_c, bulk_color_map, gene_id=gene)


In [None]:
# Match the filtered-cosine gene-cluster profiles against the binned bulk
# profiles, this time with center_rows switched off.
# Running this cell rebinds assign_df_bulk_to_filtered, S and D.
assign_df_bulk_to_filtered, S, D = assign_gene_clusters_to_consensus(
    G=profiles_df_filtered_c,        # rows = gene clusters, cols = common positions
    P=profiles_df_bulk_binned,       # rows = bulk clusters, cols = common positions
    meta=meta_df_filtered_c,         # aligned to G rows
    method='cosine',
    center_rows=False,
    min_overlap=40,
    min_similarity=0,
    capacity_per_type=1,
    allow_unassigned=True,
    # weights=weights,               # optional pd.Series indexed by common positions
)

# Metadata passed to the mapping plots: three columns of the centroid metadata,
# with 'n_reads' exposed under the name 'size'.
meta_for_assign_bulk_to_filtered = (
    meta_df_filtered_c
    .loc[:, ['gene_tss', 'cluster_id', 'n_reads']]
    .rename(columns={'n_reads': 'size'})
)

# Show the assignment table as the cell output.
assign_df_bulk_to_filtered

In [None]:
# Heatmap of the gene-cluster -> bulk-cluster mapping for one gene.
# NOTE(review): gene_id is hard-coded here while the recolor step below uses
# `gene`; confirm both refer to the same gene.
fig, ax = plot_gene_mapping_heatmap(
    assign_df=assign_df_bulk_to_filtered,
    S=S,
    meta=meta_for_assign_bulk_to_filtered,
    P=profiles_df_bulk_binned,
    gene_id='SETD1A_30958295_30958295',
    sort_rows=False
)

# Assignment summary across genes
fig2, ax2 = plot_assignment_summary(assign_df_bulk_to_filtered)

# One colour per bulk (consensus) cluster, keyed by the row index of the same
# bulk profile frame that was passed as P above. Colours are taken from
# hex_colors and cycled if there are more bulk clusters than colours.
# (Previously every id was mapped to the literal placeholder '#hexcolor', which
# is not a valid colour, and the ids came from `P_df` rather than the P used here.)
# Assumes hex_colors is an indexable sequence of colour strings - TODO confirm.
bulk_color_map = {
    cons_id: hex_colors[i % len(hex_colors)]
    for i, cons_id in enumerate(profiles_df_bulk_binned.index)
}

# Recolor the centroid metadata according to the assigned bulk cluster.
meta_colored = recolor_meta_by_bulk(assign_df_bulk_to_filtered, meta_df_filtered_c, bulk_color_map, gene_id=gene)
