## Visualizing probes of interest

### imports

In [3]:
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.express as px
from plotly.subplots import make_subplots

### setup env for multithreading

In [4]:
os.environ["OMP_NUM_THREADS"] = "16"
os.environ["OPENBLAS_NUM_THREADS"] = "16"
os.environ["MKL_NUM_THREADS"] = "16"
os.environ["VECLIB_MAXIMUM_THREADS"] = "16"
os.environ["NUMEXPR_NUM_THREADS"] = "16"

### Data Loading and Preparation

In [2]:
pcawg_prim_window_meth_df_merged = pd.read_csv("../_OUTPUTS_/merged_sith_meth_pcawg_prim_window.tsv", sep = "\t")
pcawg_sith_corr_meth_df_merged = pd.read_csv("../_OUTPUTS_/merged_sith_meth_pcawg_sith_corr.tsv", sep = "\t")
pcawg_iqr_corr_meth_df_merged = pd.read_csv("../_OUTPUTS_/merged_sith_meth_pcawg_iqr_corr.tsv", sep = "\t")

## Visualizations

In [12]:
####################
# 1) The same bins/labels for sith_level
####################
bins = [-float('inf'), 0.7, 0.8, float('inf')]
labels = ['<0.7', '0.7-0.8', '>0.8']

####################
# 2) Load CSV into a DataFrame
####################
analysis_results_df = pd.read_csv("../_OUTPUTS_/pcawg_probes_of_interest.csv")

####################
# 3) Helper function from earlier snippet
####################
def get_probes_for_method(dataset_name, method_name, top_n=10):
    subset = analysis_results_df[
        (analysis_results_df['dataset'] == dataset_name) &
        (analysis_results_df['method'] == method_name)
    ].copy()
    
    if method_name == "cluster_7":
        subset = subset.sort_values('cluster_label').dropna(subset=['cluster_label', 'probe']).head(top_n)
        return dict(zip(subset['cluster_label'], subset['probe']))
    else:
        subset = subset.dropna(subset=['probe']).head(top_n)
        return subset['probe'].tolist()

####################
# 4) Define a small mapping from method ID -> descriptive label (optional)
####################
method_mapping = {
    "var_1": "1: variance",
    "corr_2": "2: correlation with SITH",
    "anova_3": "3: ANOVA p-value",
    "rf_impute_4a": "4a: Random Forest (imputation)",
    "rf_drop_4b": "4b: Random Forest (dropped rows)",
    "common_5": "5: common interest",
    "combined_6": "6: combined variance & ANOVA",
    "cluster_7": "7: clustering",
    "range_8": "8: range",
    "threshold_9": "9: high/low proportions",
    "diffmeth_10": "10: differential methylation",
    "aggregate_11": "11: aggregate score"
}

####################
# 5) Prepare the same datasets list
####################
datasets = [
    {
        'name': 'pcawg_prim_window',
        'df': pcawg_prim_window_meth_df_merged,
        'sort_col': 'SITH'
    },
    {
        'name': 'pcawg_sith_corr',
        'df': pcawg_sith_corr_meth_df_merged,
        'sort_col': 'SITH'
    },
    {
        'name': 'pcawg_iqr_corr',
        'df': pcawg_iqr_corr_meth_df_merged,
        'sort_col': 'INT_IQR'
    }
]

####################
# 6) The create_probe_heatmap(...) function remains the same
####################

def create_probe_heatmap(df, probes, title, score_col="SITH", y_tick_font_size=None):
    """
    Creates and displays a heatmap for the given set of probes with a sidebar for a score.
    
    Parameters:
      - df: The filtered and sorted DataFrame.
      - probes: Either a list of probe IDs or a dict (in which case its keys are used).
      - title: Title for the heatmap.
      - score_col: The column name for the score sidebar ("SITH" or "INT_IQR").
      - y_tick_font_size: Optional font size for the y-axis tick labels.
    """
    # Use keys if probes is a dict; otherwise assume probes is a list.
    probe_list = list(probes.keys()) if isinstance(probes, dict) else probes

    if not probe_list:
        print(f"Probe set '{title}' is empty. Skipping visualization.")
        return

    # Ensure the chosen probes exist in the DataFrame.
    available_probes = [p for p in probe_list if p in df.columns]
    missing_probes = set(probe_list) - set(available_probes)
    if missing_probes:
        print(f"Warning: The following probes for '{title}' are missing and will be skipped: {missing_probes}")
    if not available_probes:
        print(f"No available probes to display for '{title}'. Skipping visualization.")
        return

    # Prepare the data matrix for the main heatmap.
    data_matrix = df[available_probes]

    # Create main heatmap of methylation values.
    heatmap = go.Heatmap(
        z=data_matrix.values,
        x=available_probes,
        y=df['icgc_sample_id'],
        colorscale='RdBu',
        reversescale=True,
        colorbar=dict(title='Methylation Level'),
    )

    # Create a sidebar for the score.
    score_bar = go.Heatmap(
        z=df[score_col].values[:, np.newaxis],
        x=[score_col],
        y=df['icgc_sample_id'],
        colorscale='Viridis',
        colorbar=dict(title=score_col),
        showscale=False,
    )

    # If score_col is "SITH", add the sith_level sidebar.
    add_sith_level = (score_col == "SITH")
    if add_sith_level:
        sith_levels_unique = df['sith_level'].unique()
        sith_level_map = {level: idx for idx, level in enumerate(sith_levels_unique)}
        sith_levels_numeric = df['sith_level'].map(sith_level_map).values

        colors = px.colors.qualitative.Set1[:len(sith_levels_unique)]
        colorscale_custom = []
        for i, color in enumerate(colors):
            scale_value = i / (len(colors) - 1) if len(colors) > 1 else 0
            colorscale_custom.append([scale_value, color])

        sith_level_bar = go.Heatmap(
            z=sith_levels_numeric[:, np.newaxis],
            x=['Sith Level'],
            y=df['icgc_sample_id'],
            colorscale=colorscale_custom,
            colorbar=dict(title='Sith Level'),
            showscale=False,
            zmin=0,
            zmax=len(colors)-1,
        )

    # Determine number of subplot columns and subplot titles.
    n_cols = 3 if add_sith_level else 2
    subplot_titles = (f'Heatmap: {title}', score_col) + (("Sith Level",) if add_sith_level else ())

    # Create subplots.
    fig = make_subplots(
        rows=1, cols=n_cols,
        column_widths=[0.8] + ([0.1, 0.1] if add_sith_level else [0.2]),
        subplot_titles=subplot_titles,
        horizontal_spacing=0.02,
        shared_yaxes=True,
    )

    fig.add_trace(heatmap, row=1, col=1)
    fig.add_trace(score_bar, row=1, col=2)
    if add_sith_level:
        fig.add_trace(sith_level_bar, row=1, col=3)

    # Update layout and axes.
    fig.update_layout(
        height=800,
        width=1200,
        showlegend=False,
        title_text=title,
    )

    tick_step = max(1, len(available_probes) // 50)
    fig.update_xaxes(
        row=1, col=1,
        tickmode='array',
        tickvals=np.arange(0, len(available_probes), tick_step),
        ticktext=[available_probes[i] for i in range(0, len(available_probes), tick_step)],
        tickangle=90,
        title_text='Probes'
    )
    # Use the provided y_tick_font_size if given, else default to 10.
    y_tick_font = dict(size=y_tick_font_size) if y_tick_font_size is not None else dict(size=10)
    fig.update_yaxes(
        automargin=True,
        tickfont=y_tick_font,
        title_text='Samples'
    )

    if add_sith_level:
        legend_items = []
        for level, idx in sith_level_map.items():
            color = colors[idx]
            legend_items.append(
                go.Scatter(
                    x=[None],
                    y=[None],
                    mode='markers',
                    marker=dict(size=10, color=color),
                    legendgroup=str(level),
                    showlegend=True,
                    name=str(level),
                )
            )
        for item in legend_items:
            fig.add_trace(item)
        fig.update_layout(
            legend_title_text='Sith Level',
            legend=dict(itemsizing='constant'),
        )

    fig.show()
    html_filename = f"../_OUTPUTS_/heatmaps/{title.replace(' ', '_')}.html"
    fig.write_html(html_filename)
    print(f"Saved heatmap to {html_filename}")


####################
# 7) Generate Heatmaps, but pull each “method’s” probes from CSV
####################
all_methods = [
    "var_1",
    "corr_2",
    "anova_3",
    "rf_impute_4a",
    "rf_drop_4b",
    "common_5",
    "combined_6",
    "cluster_7",
    "range_8",
    "threshold_9",
    "diffmeth_10",
    "aggregate_11"
]

for dset in datasets:
    name = dset['name']
    df = dset['df'].copy()  # work on a copy
    sort_col = dset['sort_col']
    
    print("\n========================")
    print(f"Processing dataset: {name}")
    print("========================\n")
    
    # Add the sith_level column
    df['sith_level'] = pd.cut(df['SITH'], bins=bins, labels=labels)
    
    # pcawg_sith_corr -> two subsets
    if name == "pcawg_sith_corr":
        filters = [
            ("sample_code==1",  df[df['sample_code'] == 1]),
            ("sample_code>=2", df[df['sample_code'] >= 2])
        ]
        for f_desc, df_filtered in filters:
            sorted_df = df_filtered.sort_values(sort_col, ascending=False).reset_index(drop=True)
            score_col = "SITH"
            y_tick_font_size = 2 if f_desc == "sample_code==1" else None
            
            # For each method, fetch top probes from CSV
            for method_id in all_methods:
                # If you want a nicer method title, combine:
                method_title = method_mapping.get(method_id, method_id)
                full_title = f"{name} - {f_desc} - {method_title}"
                probes = get_probes_for_method(name, method_id, top_n=10)
                
                create_probe_heatmap(
                    sorted_df,
                    probes,
                    full_title,
                    score_col=score_col,
                    y_tick_font_size=y_tick_font_size
                )
    else:
        # pcawg_prim_window -> sample_code==1
        # pcawg_iqr_corr -> sample_code>1
        # others: no special filter
        if name == "pcawg_prim_window":
            df = df[df['sample_code'] == 1]
            y_tick_font_size = 2
        elif name == "pcawg_iqr_corr":
            df = df[df['sample_code'] > 1]
            y_tick_font_size = None
        else:
            y_tick_font_size = None
        
        sorted_df = df.sort_values(sort_col, ascending=False).reset_index(drop=True)
        
        # For pcawg_iqr_corr use "INT_IQR", else "SITH"
        score_col = "INT_IQR" if name == "pcawg_iqr_corr" else "SITH"
        
        for method_id in all_methods:
            method_title = method_mapping.get(method_id, method_id)
            full_title = f"{name} - {method_title}"
            probes = get_probes_for_method(name, method_id, top_n=10)
            
            create_probe_heatmap(
                sorted_df,
                probes,
                full_title,
                score_col=score_col,
                y_tick_font_size=y_tick_font_size
            )



Processing dataset: pcawg_prim_window



Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_1:_variance.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_2:_correlation_with_SITH.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_3:_ANOVA_p-value.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_4a:_Random_Forest_(imputation).html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_4b:_Random_Forest_(dropped_rows).html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_5:_common_interest.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_6:_combined_variance_&_ANOVA.html
No available probes to display for 'pcawg_prim_window - 7: clustering'. Skipping visualization.


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_8:_range.html
Probe set 'pcawg_prim_window - 9: high/low proportions' is empty. Skipping visualization.


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_10:_differential_methylation.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_prim_window_-_11:_aggregate_score.html

Processing dataset: pcawg_sith_corr



Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_1:_variance.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_2:_correlation_with_SITH.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_3:_ANOVA_p-value.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_4a:_Random_Forest_(imputation).html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_4b:_Random_Forest_(dropped_rows).html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_5:_common_interest.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_6:_combined_variance_&_ANOVA.html
No available probes to display for 'pcawg_sith_corr - sample_code==1 - 7: clustering'. Skipping visualization.


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_8:_range.html
Probe set 'pcawg_sith_corr - sample_code==1 - 9: high/low proportions' is empty. Skipping visualization.


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_10:_differential_methylation.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code==1_-_11:_aggregate_score.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_1:_variance.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_2:_correlation_with_SITH.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_3:_ANOVA_p-value.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_4a:_Random_Forest_(imputation).html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_4b:_Random_Forest_(dropped_rows).html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_5:_common_interest.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_6:_combined_variance_&_ANOVA.html
No available probes to display for 'pcawg_sith_corr - sample_code>=2 - 7: clustering'. Skipping visualization.


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_8:_range.html
Probe set 'pcawg_sith_corr - sample_code>=2 - 9: high/low proportions' is empty. Skipping visualization.


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_10:_differential_methylation.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_sith_corr_-_sample_code>=2_-_11:_aggregate_score.html

Processing dataset: pcawg_iqr_corr



Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_1:_variance.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_2:_correlation_with_SITH.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_3:_ANOVA_p-value.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_4a:_Random_Forest_(imputation).html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_4b:_Random_Forest_(dropped_rows).html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_5:_common_interest.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_6:_combined_variance_&_ANOVA.html
No available probes to display for 'pcawg_iqr_corr - 7: clustering'. Skipping visualization.


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_8:_range.html
Probe set 'pcawg_iqr_corr - 9: high/low proportions' is empty. Skipping visualization.


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_10:_differential_methylation.html


Saved heatmap to ../_OUTPUTS_/heatmaps/pcawg_iqr_corr_-_11:_aggregate_score.html
