# Telomere Analysis Pipeline

**Workflow:**
1. **Generate** - Build telomere_analysis CSV from FASTQ/FASTA files
2. **Plot** - Run histograms, mutational signatures, Spearman correlations, pairwise heatmap, trendlines, and curve fitting

Edit the **Configuration** cell below to set paths, then run cells in order.

In [4]:
# Imports
import gzip
import json
from collections import defaultdict
import csv
import os
import numpy as np
import glob
import HTSeq
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.optimize import curve_fit
from scipy import stats
from scipy.stats import spearmanr, linregress
import numbers
from datetime import datetime

## Configuration

Set these paths to match your setup. Run from the **analysis** directory or adjust paths accordingly.

In [6]:
# Configuration - edit these paths.
# _ANALYSIS_DIR: folder containing this notebook (analysis/). Works when run from repo root or from analysis/.
if '__file__' in dir():
    _ANALYSIS_DIR = os.path.dirname(os.path.abspath(__file__))
else:
    _cwd = os.getcwd()
    _ANALYSIS_DIR = os.path.join(_cwd, 'analysis') if os.path.isdir(os.path.join(_cwd, 'analysis')) and os.path.basename(_cwd) != 'analysis' else _cwd

PATTERNS_PATH = os.path.join(_ANALYSIS_DIR, "telomere_patterns_3x.json")
# FASTQ/FASTA input: greider_data_download is a sibling of analysis/ (same level as analysis in the repo)
FASTQ_DIR = os.path.abspath(os.path.join(_ANALYSIS_DIR, "..", "greider_data_download"))
METADATA_PATH = None  # None = auto-detect greider_methods_table_s2_outliers_removed.csv
CSV_OUT = None  # None = auto-derived from patterns version (e.g. telomere_analysis_2x_repeat.csv)

# Output directories (same as main.py)
_TRENDLINES_DIR = os.path.join(_ANALYSIS_DIR, 'trendlines')
_SPEARMAN_CORRELATIONS_DIR = os.path.join(_ANALYSIS_DIR, 'spearman_correlations')
_HISTOGRAM_DIR = os.path.join(_ANALYSIS_DIR, 'histograms')
print("Configuration complete")

Configuration complete


## Generate CSV (from generate_csv.py)

Load patterns, count mutations in FASTQ/FASTA files, and write the combined CSV.

In [7]:
def load_patterns(patterns_file_path):
    """Load patterns and general_mutation_map from the given patterns JSON path."""
    with open(patterns_file_path, 'r') as f:
        data = json.load(f)
    version = data.get('version', 'unknown')
    return data['patterns'], data['general_mutation_map'], version


def read_sequence_file(file_path: str):
    """Read FASTQ or FASTA file and yield sequences."""
    open_func = gzip.open if file_path.endswith('.gz') else open
    mode = 'rt' if file_path.endswith('.gz') else 'r'
    is_fasta = any(ext in file_path.lower() for ext in ['.fasta', '.fa', '.fas'])
    with open_func(file_path, mode) as f:
        if is_fasta:
            current_sequence = ""
            for line in f:
                line = line.strip()
                if line.startswith('>'):
                    if current_sequence:
                        yield current_sequence
                    current_sequence = ""
                else:
                    current_sequence += line
            if current_sequence:
                yield current_sequence
        else:
            while True:
                header = f.readline().strip()
                if not header:
                    break
                sequence = f.readline().strip()
                _ = f.readline()
                _ = f.readline()
                yield sequence


def count_patterns(sequence: str, pattern: str) -> int:
    return sequence.count(pattern)


def count_total_reads(file_path: str) -> int:
    """Count the total number of reads in a FASTQ or FASTA file."""
    count = 0
    for _ in HTSeq.FastqReader(file_path):
        count += 1
    return count


def load_age_data(metadata_file_path=None):
    """Load age data from greider_methods_table_s2_outliers_removed.csv."""
    age_data = {}
    if metadata_file_path is None:
        if os.path.exists('greider_methods_table_s2_outliers_removed.csv'):
            metadata_file_path = 'greider_methods_table_s2_outliers_removed.csv'
        elif os.path.exists('../analysis/greider_methods_table_s2_outliers_removed.csv'):
            metadata_file_path = '../analysis/greider_methods_table_s2_outliers_removed.csv'
        elif os.path.exists('analysis/greider_methods_table_s2_outliers_removed.csv'):
            metadata_file_path = 'analysis/greider_methods_table_s2_outliers_removed.csv'
        else:
            raise FileNotFoundError("greider_methods_table_s2_outliers_removed.csv not found in current directory, analysis directory, or parent directory")
    with open(metadata_file_path, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            fastq_name = row['fastq file name'].replace('_', '.')
            age_data[fastq_name] = row['Age (Years)']
    return age_data


def load_length_data(metadata_file_path=None):
    """Load length data from greider_methods_table_s2_outliers_removed.csv."""
    length_data = {}
    if metadata_file_path is None:
        if os.path.exists('greider_methods_table_s2_outliers_removed.csv'):
            metadata_file_path = 'greider_methods_table_s2_outliers_removed.csv'
        elif os.path.exists('../analysis/greider_methods_table_s2_outliers_removed.csv'):
            metadata_file_path = '../analysis/greider_methods_table_s2_outliers_removed.csv'
        elif os.path.exists('analysis/greider_methods_table_s2_outliers_removed.csv'):
            metadata_file_path = 'analysis/greider_methods_table_s2_outliers_removed.csv'
        else:
            raise FileNotFoundError("greider_methods_table_s2_outliers_removed.csv not found in current directory, analysis directory, or parent directory")
    with open(metadata_file_path, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            fastq_name = row['fastq file name'].replace('_', '.')
            length_data[fastq_name] = row['Mean Telomere Length (bps)']
    return length_data


def get_sequence_files(directory: str):
    """Get all FASTQ and FASTA files in the given directory."""
    sequence_files = []
    sequence_files.extend(glob.glob(os.path.join(directory, "*.fastq")))
    sequence_files.extend(glob.glob(os.path.join(directory, "*.fastq.gz")))
    sequence_files.extend(glob.glob(os.path.join(directory, "*.fasta")))
    sequence_files.extend(glob.glob(os.path.join(directory, "*.fasta.gz")))
    sequence_files.extend(glob.glob(os.path.join(directory, "*.fa")))
    sequence_files.extend(glob.glob(os.path.join(directory, "*.fa.gz")))
    sequence_files.extend(glob.glob(os.path.join(directory, "*.fas")))
    sequence_files.extend(glob.glob(os.path.join(directory, "*.fas.gz")))
    return sorted(sequence_files)

In [8]:
def generate_csv(
    data_dir: str,
    output_callback=None,
    metadata_file_path=None,
    patterns_file_path=None,
    output_csv_path=None,
):
    """Generate a single CSV file from sequence data containing both raw counts and normalized (per-1k) metrics."""
    if patterns_file_path is None:
        raise ValueError("patterns_file_path is required.")
    patterns, general_mutation_map, patterns_version = load_patterns(patterns_file_path)
    if output_csv_path is None:
        safe_version = patterns_version.replace(' ', '_')
        csv_path = f"telomere_analysis_{safe_version}.csv"
    else:
        csv_path = output_csv_path
        parent = os.path.dirname(csv_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    sequence_files = get_sequence_files(data_dir)
    if not sequence_files:
        message = f"No FASTQ or FASTA files found in {data_dir} directory"
        print(message)
        if output_callback:
            output_callback(message)
        return
    try:
        age_data = load_age_data(metadata_file_path)
        length_data = load_length_data(metadata_file_path)
    except FileNotFoundError:
        age_data = {}
        length_data = {}
    fieldnames = ['FileName', 'Age', 'Telomere_Length', 'Total_Reads', 'c_strand', 'g_strand']
    mutation_keys = []
    for group in ['g_strand_mutations', 'c_strand_mutations']:
        for subkey in patterns[group].keys():
            mutation_keys.append(f"{group}_{subkey}")
    fieldnames.extend(mutation_keys)
    fieldnames.extend([f"{k}_per_1k" for k in mutation_keys])
    general_mutation_headers = []
    for strand, mutmap in general_mutation_map.items():
        for mut in mutmap:
            general_mutation_headers.append(f"{strand}_{mut}_per_1k")
    fieldnames.extend(general_mutation_headers)
    fieldnames.extend([
        'composite_transition_per_1k', 'composite_transversion_per_1k',
        'g_strand_mutations_sum_per_1k', 'c_strand_mutations_sum_per_1k',
        'log_telomere_length', 'telomere_length_bin', 'mutation_rate_normalized_by_length',
    ])
    summed_per_1k_headers = []
    for strand, mutmap in general_mutation_map.items():
        for mut, subtypes in mutmap.items():
            summed_per_1k_headers.append(f"{strand}_{mut}_sum_per_1k")
    fieldnames.extend(summed_per_1k_headers)
    fieldnames.append('total_mutations_per_1k_strand_specific')
    fieldnames.extend([
        'total_mutations_over_total_g_strand_per_1k',
        'total_mutations_over_total_c_strand_per_1k',
    ])
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for file_path in sequence_files:
            counts = defaultdict(int)
            total_reads = count_total_reads(file_path)
            for sequence in read_sequence_file(file_path):
                counts['c_strand'] += count_patterns(sequence, patterns['c_strand'])
                counts['g_strand'] += count_patterns(sequence, patterns['g_strand'])
                for group in ['g_strand_mutations', 'c_strand_mutations']:
                    for subkey, subpattern in patterns[group].items():
                        counts[f"{group}_{subkey}"] += count_patterns(sequence, subpattern)
            filename = os.path.basename(file_path)
            filename_base = filename
            for ext in ['.fastq.gz', '.fastq', '.fasta.gz', '.fasta', '.fa.gz', '.fa', '.fas.gz', '.fas']:
                if filename_base.endswith(ext):
                    filename_base = filename_base[:-len(ext)]
                    break
            age = age_data.get(filename_base, '')
            length = length_data.get(filename_base, '')
            g_strand_total = counts['g_strand']
            c_strand_total = counts['c_strand']
            g_strand_mutations_total = sum(counts[k] for k in counts if k.startswith('g_strand_mutations_'))
            c_strand_mutations_total = sum(counts[k] for k in counts if k.startswith('c_strand_mutations_'))
            g_strand_normalizer = g_strand_total + g_strand_mutations_total
            c_strand_normalizer = c_strand_total + c_strand_mutations_total
            def per_1k_strand_specific(val, strand_normalizer):
                return (val / strand_normalizer) * 1000 if strand_normalizer > 0 else 0
            def per_1k(val, total):
                return (val / total) * 1000 if total > 0 else 0
            row = {
                'FileName': filename, 'Age': age, 'Telomere_Length': length,
                'Total_Reads': total_reads, 'c_strand': c_strand_total, 'g_strand': g_strand_total,
            }
            for k in mutation_keys:
                row[k] = counts.get(k, 0)
                if k.startswith('g_strand_mutations_'):
                    row[f"{k}_per_1k"] = per_1k_strand_specific(counts.get(k, 0), g_strand_normalizer)
                elif k.startswith('c_strand_mutations_'):
                    row[f"{k}_per_1k"] = per_1k_strand_specific(counts.get(k, 0), c_strand_normalizer)
                else:
                    row[f"{k}_per_1k"] = per_1k_strand_specific(counts.get(k, 0), g_strand_normalizer)
            for strand, mutmap in general_mutation_map.items():
                for mut, subtypes in mutmap.items():
                    total = sum(counts.get(f"{strand}_mutations_{subtype}", 0) for subtype in subtypes)
                    if strand == 'g_strand':
                        row[f"{strand}_{mut}_per_1k"] = per_1k_strand_specific(total, g_strand_normalizer)
                    else:
                        row[f"{strand}_{mut}_per_1k"] = per_1k_strand_specific(total, c_strand_normalizer)
            for strand, mutmap in general_mutation_map.items():
                for mut, subtypes in mutmap.items():
                    per_1k_sum = sum(row.get(f"{strand}_mutations_{subtype}_per_1k", 0) for subtype in subtypes)
                    row[f"{strand}_{mut}_sum_per_1k"] = per_1k_sum
            total_mutations = sum(counts[k] for k in mutation_keys)
            if g_strand_normalizer > 0 or c_strand_normalizer > 0:
                g_weight = g_strand_mutations_total / total_mutations if total_mutations > 0 else 0
                c_weight = c_strand_mutations_total / total_mutations if total_mutations > 0 else 0
                weighted_normalizer = (g_weight * g_strand_normalizer) + (c_weight * c_strand_normalizer)
                row['total_mutations_per_1k_strand_specific'] = per_1k_strand_specific(total_mutations, weighted_normalizer)
            else:
                row['total_mutations_per_1k_strand_specific'] = 0
            row['total_mutations_over_total_g_strand_per_1k'] = per_1k_strand_specific(total_mutations, g_strand_normalizer)
            row['total_mutations_over_total_c_strand_per_1k'] = per_1k_strand_specific(total_mutations, c_strand_normalizer)
            row['g_strand_mutations_sum_per_1k'] = sum(row.get(k, 0) for k in row if k.startswith('g_strand_mutations') and k.endswith('_per_1k'))
            row['c_strand_mutations_sum_per_1k'] = sum(row.get(k, 0) for k in row if k.startswith('c_strand_mutations') and k.endswith('_per_1k'))
            try:
                row['log_telomere_length'] = np.log1p(float(row['Telomere_Length'])) if row['Telomere_Length'] else 0
            except Exception:
                row['log_telomere_length'] = 0
            try:
                length_val = float(row['Telomere_Length'])
                if length_val < 5000:
                    row['telomere_length_bin'] = 'short'
                elif length_val < 8000:
                    row['telomere_length_bin'] = 'medium'
                else:
                    row['telomere_length_bin'] = 'long'
            except Exception:
                row['telomere_length_bin'] = 'unknown'
            try:
                telomere_length = float(row['Telomere_Length'])
                if telomere_length > 0:
                    row['mutation_rate_normalized_by_length'] = row['g_strand_mutations_sum_per_1k'] / telomere_length
                else:
                    row['mutation_rate_normalized_by_length'] = 0
            except Exception:
                row['mutation_rate_normalized_by_length'] = 0
            writer.writerow(row)
            messages = [
                f"\nProcessing {filename}:", f"Age: {age}", f"Telomere Length: {length}",
                f"Total Reads: {total_reads}", f"{patterns_version} c-strand total: {c_strand_total}",
                f"{patterns_version} g-strand total: {g_strand_total}",
                f"G-strand mutations total: {g_strand_mutations_total}",
                f"C-strand mutations total: {c_strand_mutations_total}",
                f"G-strand normalizer ({patterns_version} + mutations): {g_strand_normalizer}",
                f"C-strand normalizer ({patterns_version} + mutations): {c_strand_normalizer}",
                f"Total mutations found: {total_mutations}",
            ]
            if counts['g_strand'] == 0 and counts['c_strand'] == 0:
                messages.append(f"Warning: No telomere sequences found in {filename}")
            for message in messages:
                print(message)
                if output_callback:
                    output_callback(message)
    return csv_path

In [None]:
# Run generate CSV (equivalent to: main.py generate --patterns ... --fastq-dir ... --metadata ... --csv-out ...)
def _default_csv_path_from_patterns(patterns_file_path):
    """Construct the default telomere_analysis CSV path based on patterns version."""
    with open(patterns_file_path) as f:
        version = json.load(f).get('version', 'unknown')
    safe_version = str(version).replace(' ', '_')
    return os.path.join(_ANALYSIS_DIR, f'telomere_analysis_{safe_version}.csv')

print("[1/2] Generating telomere_analysis CSV ...")
output_path = CSV_OUT if CSV_OUT else _default_csv_path_from_patterns(PATTERNS_PATH)
csv_path_result = generate_csv(
    data_dir=FASTQ_DIR,
    metadata_file_path=METADATA_PATH,
    patterns_file_path=PATTERNS_PATH,
    output_csv_path=output_path,
)
csv_path = csv_path_result if csv_path_result else output_path
print(f"CSV written to: {csv_path}")

[1/2] Generating telomere_analysis CSV ...

Processing JH100.F49.NB67.fastq.gz:
Age: 22
Telomere Length: 7188
Total Reads: 4473
3x_repeat c-strand total: 60582
3x_repeat g-strand total: 626822
G-strand mutations total: 39385
C-strand mutations total: 3083
G-strand normalizer (3x_repeat + mutations): 666207
C-strand normalizer (3x_repeat + mutations): 63665
Total mutations found: 42468

Processing JH101.F47.NB69.fastq.gz:
Age: 36
Telomere Length: 6863
Total Reads: 4091
3x_repeat c-strand total: 52252
3x_repeat g-strand total: 540563
G-strand mutations total: 31345
C-strand mutations total: 3136
G-strand normalizer (3x_repeat + mutations): 571908
C-strand normalizer (3x_repeat + mutations): 55388
Total mutations found: 34481

Processing JH102.F47.NB01.fastq.gz:
Age: 27
Telomere Length: 6677
Total Reads: 7271
3x_repeat c-strand total: 91923
3x_repeat g-strand total: 991850
G-strand mutations total: 59486
C-strand mutations total: 6185
G-strand normalizer (3x_repeat + mutations): 1051336
C

## Plotting (from plotting.py)

Run all plotting pipelines. Uses `csv_path` from the generate step above. If you already have a CSV, set `csv_path` manually and run the plot cells.

In [None]:
def _get_patterns_version(patterns_file_path):
    """Read version string from patterns JSON."""
    try:
        with open(patterns_file_path) as f:
            return json.load(f).get('version', 'unknown')
    except Exception:
        return 'unknown'

def _default_csv_path_from_patterns(patterns_file_path):
    """Construct the default telomere_analysis CSV path based on patterns version."""
    with open(patterns_file_path) as f:
        version = json.load(f).get('version', 'unknown')
    safe_version = str(version).replace(' ', '_')
    return os.path.join(_ANALYSIS_DIR, f'telomere_analysis_{safe_version}.csv')

# Ensure csv_path is available (use from generate step, or resolve from patterns)
try:
    csv_path
except NameError:
    csv_path = _default_csv_path_from_patterns(PATTERNS_PATH)

In [None]:
def _plot_histograms_by_age_group(data, output_path):
    """Plot boxplots of mutation rate variables in 10-year age bins (2x2 grid)."""
    sns.set_style("whitegrid")
    sns.set_palette("husl")
    fig, axes = plt.subplots(2, 2, figsize=(20, 12))
    fig.suptitle('Mutation Rates by Age Groups (10-year bins)', fontsize=16, fontweight='bold')
    variables = ['total_mutations_over_total_g_strand_per_1k', 'g_strand_A>G_sum_per_1k', 'g_strand_T>G_sum_per_1k', 'g_strand_T>C_sum_per_1k']
    titles = ['Total Mutations Normalized', 'G > A Mutation Rate Normalized', 'T > G Mutation Rate Normalized', 'T > C Mutation Rate Normalized']
    for i, (var, title) in enumerate(zip(variables, titles)):
        row, col = i // 2, i % 2
        ax = axes[row, col]
        plot_data = data.dropna(subset=['Age', var])
        if len(plot_data) > 0:
            age_bins = np.arange(0, plot_data['Age'].max() + 10, 10)
            age_labels = [f'{int(b)}-{int(b+9)}' for b in age_bins[:-1]]
            plot_data = plot_data.copy()
            plot_data['Age_Group'] = pd.cut(plot_data['Age'], bins=age_bins, labels=age_labels, include_lowest=True)
            sns.boxplot(data=plot_data, x='Age_Group', y=var, ax=ax)
            ax.set_xlabel('Age Group (years)')
            ax.set_ylabel('Mutations per 1000bp')
            plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
        else:
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center', transform=ax.transAxes, fontsize=12)
        ax.set_title(title, fontweight='bold')
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()

def _plot_mutations_per_file(data, output_path):
    """Bar plot of total mutations per file."""
    sns.set_style("whitegrid")
    mutation_columns = [col for col in data.columns if 'mutations' in col and 'per_1k' not in col]
    data_with_totals = data.copy()
    data_with_totals['Total_Mutations'] = data[mutation_columns].sum(axis=1)
    plot_data = data_with_totals.dropna(subset=['FileName', 'Total_Mutations'])
    plt.figure(figsize=(16, 10))
    if len(plot_data) > 0:
        plot_data = plot_data.sort_values('Total_Mutations', ascending=True)
        bars = plt.bar(range(len(plot_data)), plot_data['Total_Mutations'], color='steelblue', alpha=0.7, edgecolor='black', linewidth=0.5)
        plt.title('Total Number of Mutations per File', fontsize=16, fontweight='bold', pad=20)
        plt.xlabel('Files'); plt.ylabel('Total Number of Mutations')
        plt.xticks(range(len(plot_data)), plot_data['FileName'], rotation=45, ha='right')
        max_val = plot_data['Total_Mutations'].max()
        for bar in bars:
            plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max_val*0.01, f'{int(bar.get_height())}', ha='center', va='bottom', fontsize=8)
        stats_text = f"Mean: {plot_data['Total_Mutations'].mean():.0f}\nMedian: {plot_data['Total_Mutations'].median():.0f}\nMin: {plot_data['Total_Mutations'].min():.0f}\nMax: {max_val:.0f}"
        plt.text(0.02, 0.98, stats_text, transform=plt.gca().transAxes, fontsize=10, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))
        plt.grid(axis='y', alpha=0.3)
    else:
        plt.text(0.5, 0.5, 'No data available', ha='center', va='center', transform=plt.gca().transAxes, fontsize=12)
        plt.title('Total Number of Mutations per File', fontweight='bold')
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()
    return len(plot_data)

def plot_histograms_from_csv(csv_path, output_dir=None):
    """Reproduce histogram-style plots given a telomere analysis CSV path."""
    if output_dir is None:
        output_dir = os.path.dirname(csv_path) or '.'
    os.makedirs(output_dir, exist_ok=True)
    data = pd.read_csv(csv_path)
    hist_path = os.path.join(output_dir, "histogram.png")
    per_file_path = os.path.join(output_dir, "mutations_per_file_histogram.png")
    _plot_histograms_by_age_group(data, hist_path)
    num_files = _plot_mutations_per_file(data, per_file_path)
    print(f"Histogram plot saved as '{hist_path}'")
    print(f"Mutations per file histogram saved as '{per_file_path}'")
    print(f"Processed {num_files} files")

In [None]:
def plot_trendlines(data, output_path, variables, titles, version):
    """Plot linear trendlines of mutation rate variables vs Age in 2x2 grid."""
    sns.set_style("whitegrid")
    sns.set_palette("husl")
    fig, axes = plt.subplots(2, 2, figsize=(20, 12))
    fig.suptitle(f'Mutation Rates vs Age [{version}]', fontsize=16, fontweight='bold')
    for i, (var, title) in enumerate(zip(variables, titles)):
        row, col = i // 2, i % 2
        ax = axes[row, col]
        plot_data = data.dropna(subset=['Age', var])
        if len(plot_data) > 0:
            sns.scatterplot(data=plot_data, x='Age', y=var, ax=ax, alpha=0.6)
            if len(plot_data) > 1:
                sns.regplot(data=plot_data, x='Age', y=var, ax=ax, scatter=False, line_kws={'color': 'blue', 'linestyle': '--'})
                slope, intercept, r_value, p_value, std_err = linregress(plot_data['Age'], plot_data[var])
                ax.set_title(f"{title}\nR² = {(r_value**2):.3f}", fontweight='bold')
            else:
                ax.set_title(title, fontweight='bold')
            ax.set_xlabel('Age'); ax.set_ylabel('Mutations per 1000bp')
        else:
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center', transform=ax.transAxes, fontsize=12)
            ax.set_title(title, fontweight='bold')
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()

def plot_spearman_trendlines(data, output_path, variables, titles, version):
    """Plot Spearman correlation scatterplots vs Age in 2x2 grid."""
    sns.set_style("whitegrid")
    sns.set_palette("husl")
    fig, axes = plt.subplots(2, 2, figsize=(20, 12))
    fig.suptitle(f"Spearman's Rank Correlation: Mutation Rates vs Age [{version}]", fontsize=16, fontweight='bold')
    for i, (var, title) in enumerate(zip(variables, titles)):
        row, col = i // 2, i % 2
        ax = axes[row, col]
        plot_data = data.dropna(subset=['Age', var])
        if len(plot_data) > 1:
            corr, pval = spearmanr(plot_data['Age'], plot_data[var])
            sns.scatterplot(data=plot_data, x='Age', y=var, ax=ax, alpha=0.6)
            ax.set_title(f"{title}\nSpearman r = {corr:.2f}, p = {pval:.2g}", fontweight='bold')
        else:
            ax.text(0.5, 0.5, 'No data available', ha='center', va='center', transform=ax.transAxes, fontsize=12)
            ax.set_title(title, fontweight='bold')
        ax.set_xlabel('Age'); ax.set_ylabel('Mutations per 1000bp')
    plt.tight_layout()
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()

def plot_trendlines_main(csv_path, trendline_output_path, spearman_output_path, variables, titles, patterns_file_path):
    """Main entry for 2x2 trendline and Spearman plots."""
    version = _get_patterns_version(patterns_file_path)
    data = pd.read_csv(csv_path)
    plot_trendlines(data, trendline_output_path, variables, titles, version)
    plot_spearman_trendlines(data, spearman_output_path, variables, titles, version)
    print(f"Trendline plot saved as '{trendline_output_path}'")
    print(f"Spearman correlation plot saved as '{spearman_output_path}'")

In [None]:
def plot_mutational_signature_row(row, mutation_types, mutation_columns, output_path, version):
    sns.set_style("whitegrid")
    sns.set_palette("husl")
    bar_heights, bar_colors, bar_labels = [], [], []
    all_columns = []
    for mut_type, contexts in mutation_columns.items():
        for context, cols in contexts.items():
            existing_cols = [col for col in cols if col in row.index]
            all_columns.extend(existing_cols)
    if not all_columns:
        print(f"Warning: No valid mutation columns found for {row.get('FileName', 'unknown')}")
        return
    total_mutations = row[all_columns].sum()
    for mut_label, color in mutation_types:
        contexts = mutation_columns[mut_label]
        for context_name, cols in contexts.items():
            for i, col in enumerate(cols):
                if col in row.index:
                    value = row[col]
                    percentage = (value / total_mutations) * 100 if total_mutations > 0 else 0
                    bar_heights.append(percentage)
                    bar_colors.append(color)
                    bar_labels.append(f"{mut_label} {context_name} pos{i+1}")
    if not bar_heights:
        print(f"Warning: No valid data found for {row.get('FileName', 'unknown')}")
        return
    x = np.arange(len(bar_heights))
    fig, ax = plt.subplots(figsize=(16, 10))
    sns.barplot(x=x, y=bar_heights, palette=bar_colors, ax=ax, edgecolor='black', linewidth=0.5)
    for i, label in enumerate(bar_labels):
        ax.text(i, -max(bar_heights)*0.02, label, ha='center', va='center', color='black', fontsize=9, fontweight='normal', rotation=45)
    ax.set_xticks([])
    ax.set_yticks(np.linspace(0, max(bar_heights), 6))
    ax.set_xlim(-0.5, len(bar_heights) - 0.5)
    ax.set_ylim(-max(bar_heights)*0.1, max(bar_heights) + max(bar_heights)*0.2)
    ax.spines['top'].set_visible(False); ax.spines['right'].set_visible(False)
    ax.spines['bottom'].set_visible(False); ax.spines['left'].set_visible(False)
    ax.yaxis.grid(True, linestyle='--', alpha=0.3)
    ax.set_axisbelow(True)
    ax.set_ylabel('Percentage of Single Base Modifications', fontsize=14, fontweight='bold')
    age = row['Age'] if 'Age' in row else 'N/A'
    filename = row['FileName'] if 'FileName' in row else 'sample'
    ax.set_title(f"Mutational Signatures by Position and Strand Context [{version}]\nFile: {filename} | Age: {age} years", fontsize=18, fontweight='bold', pad=30)
    plt.tight_layout(rect=[0, 0.15, 1, 0.95])
    plt.savefig(output_path, dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()

def plot_mutational_signatures(csv_path, patterns_file_path):
    sns.set_theme(style="whitegrid", font_scale=1.1)
    version = _get_patterns_version(patterns_file_path)
    df = pd.read_csv(csv_path)
    mutation_types = [('C>A','blue'),('C>G','black'),('C>T','red'),('G>A','gray'),('G>C','green'),('G>T','pink')]
    mutation_columns = {
        'C>A': {'C-strand': ['c_strand_mutations_C>A_c1','c_strand_mutations_C>A_c2','c_strand_mutations_C>A_c3'], 'G-strand': ['g_strand_mutations_G>T_g1','g_strand_mutations_G>T_g2','g_strand_mutations_G>T_g3']},
        'C>G': {'C-strand': ['c_strand_mutations_C>G_c1','c_strand_mutations_C>G_c2','c_strand_mutations_C>G_c3'], 'G-strand': ['g_strand_mutations_G>C_g1','g_strand_mutations_G>C_g2','g_strand_mutations_G>C_g3']},
        'C>T': {'C-strand': ['c_strand_mutations_C>T_c1','c_strand_mutations_C>T_c2','c_strand_mutations_C>T_c3'], 'G-strand': ['g_strand_mutations_G>A_g1','g_strand_mutations_G>A_g2','g_strand_mutations_G>A_g3']},
        'G>A': {'G-strand': ['g_strand_mutations_G>A_g1','g_strand_mutations_G>A_g2','g_strand_mutations_G>A_g3'], 'C-strand': ['c_strand_mutations_C>G_c1','c_strand_mutations_C>G_c2','c_strand_mutations_C>G_c3']},
        'G>C': {'G-strand': ['g_strand_mutations_G>C_g1','g_strand_mutations_G>C_g2','g_strand_mutations_G>C_g3'], 'C-strand': ['c_strand_mutations_C>T_c1','c_strand_mutations_C>T_c2','c_strand_mutations_C>T_c3']},
        'G>T': {'G-strand': ['g_strand_mutations_G>T_g1','g_strand_mutations_G>T_g2','g_strand_mutations_G>T_g3'], 'C-strand': ['c_strand_mutations_C>A_c1','c_strand_mutations_C>A_c2','c_strand_mutations_C>A_c3']},
    }
    os.makedirs('plots', exist_ok=True)
    for idx, row in df.iterrows():
        filename = str(row['FileName']) if 'FileName' in row else f'sample_{idx}'
        filename_base = os.path.splitext(filename)[0]
        output_path = os.path.join('plots', f'{filename_base}.png')
        plot_mutational_signature_row(row, mutation_types, mutation_columns, output_path, version)

def plot_mutational_signatures_main(patterns_file_path):
    csv_path = _default_csv_path_from_patterns(patterns_file_path)
    plot_mutational_signatures(csv_path, patterns_file_path)
    print("Mutational signature plots saved in 'plots/' directory")

In [None]:
def plot_spearman_with_age(csv_path, patterns_file_path):
    """Plot scatter of each numeric column vs Age with Spearman correlation."""
    sns.set_theme(style="whitegrid", font_scale=1.1)
    df = pd.read_csv(csv_path)
    output_dir = "spearman's plots"
    os.makedirs(output_dir, exist_ok=True)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if 'Age' not in numeric_cols:
        print("No 'Age' column found.")
        return
    numeric_cols = [c for c in numeric_cols if c != 'Age']
    df = df.dropna(subset=['Age'])
    spearman_results = []
    version = _get_patterns_version(patterns_file_path)
    for col in numeric_cols:
        sub_df = df.dropna(subset=[col])
        if sub_df.shape[0] < 2:
            continue
        x, y = sub_df['Age'], sub_df[col]
        corr, pval = stats.spearmanr(x, y)
        spearman_results.append({'Column': col, 'Spearman_r': corr, 'p_value': pval})
        plt.figure(figsize=(8, 6))
        ax = sns.scatterplot(x=x, y=y)
        if len(x) > 1:
            sns.regplot(x=x, y=y, scatter=False, ci=None, line_kws={'color': 'red', 'linestyle': '--'}, ax=ax)
        ax.set_xlabel('Age (years)'); ax.set_ylabel(col)
        ax.set_title(f"Spearman's ρ = {corr:.2f} (p={pval:.2g})\n{col} vs Age [{version}]", fontsize=14)
        plt.tight_layout()
        safe_col = col.replace('/', '_').replace(' ', '_').replace('>', 'to').replace('<', 'lt').replace(':', '_')
        plt.savefig(os.path.join(output_dir, f"{safe_col}_vs_Age.png"), dpi=200)
        plt.close()
    pd.DataFrame(spearman_results).to_csv(os.path.join(output_dir, "spearman_results.csv"), index=False)

def plot_spearman_with_age_main(patterns_file_path):
    csv_path = _default_csv_path_from_patterns(patterns_file_path)
    plot_spearman_with_age(csv_path, patterns_file_path)
    print("Spearman plots saved in \"spearman's plots/\" directory")

def plot_mutation_r_heatmap(csv_path, target_col='Age', patterns_file_path=None):
    """Heatmap of Spearman r values between per_1k mutation columns and target."""
    df = pd.read_csv(csv_path)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if target_col not in numeric_cols:
        print(f"No '{target_col}' column found.")
        return
    mutation_cols = [c for c in numeric_cols if 'per_1k' in c or 'per1k' in c]
    if 'total_mutations_per_1k_reads' in df.columns and 'total_mutations_per_1k_reads' not in mutation_cols:
        mutation_cols.append('total_mutations_per_1k_reads')
    total_mut_col = next((c for c in numeric_cols if 'total_mutation' in c and c not in mutation_cols), None)
    if total_mut_col:
        mutation_cols.append(total_mut_col)
    frameshift_cols = [c for c in numeric_cols if 'frameshift' in c.lower() and 'per_1k' in c]
    mutation_cols.extend([c for c in frameshift_cols if c not in mutation_cols])
    if not mutation_cols:
        print("No per_1k mutation columns found.")
        return
    df = df.dropna(subset=[target_col])
    r_values = []
    for col in mutation_cols:
        sub_df = df.dropna(subset=[col])
        r_values.append(stats.spearmanr(sub_df[col], sub_df[target_col])[0] if sub_df.shape[0] >= 2 else float('nan'))
    r_df = pd.DataFrame({'Mutation': mutation_cols, 'Spearman_r': r_values}).set_index('Mutation')
    plt.figure(figsize=(max(8, len(mutation_cols)*0.4), 2.5))
    sns.heatmap(r_df.T, annot=True, cmap='coolwarm', center=0, cbar_kws={'label': "Spearman's r"})
    version = _get_patterns_version(patterns_file_path) if patterns_file_path else 'unknown'
    plt.title(f"Spearman r values: Normalized Mutations vs {target_col} [{version}]")
    plt.yticks(rotation=0)
    plt.tight_layout()
    output_dir = "spearman's plots"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, f"mutation_r_heatmap_vs_{target_col}.png")
    plt.savefig(output_path, dpi=200)
    plt.close()
    print(f"Mutation r heatmap saved as {output_path}")

def plot_pairwise_r_heatmap(csv_path, patterns_file_path=None):
    """Pairwise Spearman r heatmap between per_1k, Age, telomere columns."""
    df = pd.read_csv(csv_path)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    mutation_cols = [c for c in numeric_cols if 'per_1k' in c or 'per1k' in c]
    if 'total_mutations_per_1k_reads' in df.columns and 'total_mutations_per_1k_reads' not in mutation_cols:
        mutation_cols.append('total_mutations_per_1k_reads')
    total_mut_col = next((c for c in numeric_cols if 'total_mutation' in c and c not in mutation_cols), None)
    if total_mut_col:
        mutation_cols.append(total_mut_col)
    frameshift_cols = [c for c in numeric_cols if 'frameshift' in c.lower() and 'per_1k' in c]
    mutation_cols.extend([c for c in frameshift_cols if c not in mutation_cols])
    if 'Total_Reads' in df.columns and 'Total_Reads' in numeric_cols and 'Total_Reads' not in mutation_cols:
        mutation_cols.append('Total_Reads')
    if 'Age' in df.columns and 'Age' in numeric_cols and 'Age' not in mutation_cols:
        mutation_cols.append('Age')
    telomere_cols = [c for c in df.columns if 'telomere' in c.lower() and c in numeric_cols and c not in mutation_cols]
    mutation_cols.extend(telomere_cols)
    if not mutation_cols:
        print("No relevant columns for pairwise heatmap.")
        return
    n = len(mutation_cols)
    r_matrix = np.zeros((n, n))
    for i, col1 in enumerate(mutation_cols):
        for j, col2 in enumerate(mutation_cols):
            sub_df = df[[col1, col2]].dropna()
            r_val = stats.spearmanr(sub_df[col1], sub_df[col2])[0] if sub_df.shape[0] >= 2 else np.nan
            r_matrix[i, j] = float(r_val) if isinstance(r_val, numbers.Number) and not isinstance(r_val, (list, tuple, np.ndarray)) else np.nan
    mask = np.eye(n, dtype=bool)
    fig_width, fig_height = max(10, n*0.7), max(8, n*0.7)
    plt.figure(figsize=(fig_width, fig_height))
    ax = sns.heatmap(r_matrix, annot=True, fmt=".2f", cmap="coolwarm", center=0, mask=mask,
                     xticklabels=mutation_cols, yticklabels=mutation_cols, cbar_kws={'label': "Spearman's r"}, annot_kws={"size": 8})
    for i in range(n):
        ax.add_patch(plt.Rectangle((i, i), 1, 1, fill=False, edgecolor='black', lw=2, hatch='xx'))
    version = _get_patterns_version(patterns_file_path) if patterns_file_path else 'unknown'
    plt.title(f"Pairwise Spearman r Heatmap (Normalized Mutations, Age, Telomere) [{version}]", fontsize=14)
    plt.xticks(rotation=45, ha='right', fontsize=9)
    plt.yticks(fontsize=9)
    plt.tight_layout()
    output_dir = "spearman's plots"
    os.makedirs(output_dir, exist_ok=True)
    output_path = os.path.join(output_dir, "pairwise_mutation_r_heatmap.png")
    plt.savefig(output_path, dpi=200)
    plt.close()
    print(f"Pairwise mutation r heatmap saved as {output_path}")

def plot_pairwise_r_heatmap_main(patterns_file_path):
    csv_path = _default_csv_path_from_patterns(patterns_file_path)
    plot_pairwise_r_heatmap(csv_path, patterns_file_path=patterns_file_path)

In [None]:
def curve_fitting_analysis(csv_path, output_dir="curve_fitting_plots", patterns_file_path=None):
    """Curve fitting for telomere length vs age and mutation rate vs age."""
    def linear_func(x, a, b): return a * x + b
    def exponential_func(x, a, b, c): return a * np.exp(b * x) + c
    def logarithmic_func(x, a, b): return a * np.log(x + 1) + b
    def polynomial_func(x, a, b, c, d): return a * x**3 + b * x**2 + c * x + d
    def power_func(x, a, b, c): return a * (x + 1)**b + c
    df = pd.read_csv(csv_path).dropna(subset=['Age'])
    os.makedirs(output_dir, exist_ok=True)
    sns.set_theme(style="whitegrid", font_scale=1.1)
    results = []
    curve_types = [('Linear', linear_func, [1, 1]), ('Exponential', exponential_func, [1, 0.01, 1]),
                   ('Logarithmic', logarithmic_func, [1, 1]), ('Polynomial', polynomial_func, [0.01, 0.1, 1, 1]),
                   ('Power', power_func, [1, 0.5, 1])]
    version = _get_patterns_version(patterns_file_path) if patterns_file_path else 'unknown'
    if 'Telomere_Length' in df.columns:
        telomere_df = df.dropna(subset=['Telomere_Length'])
        if len(telomere_df) >= 4:
            x_data, y_data = telomere_df['Age'].values, telomere_df['Telomere_Length'].values
            best_fit, best_r_squared = None, -np.inf
            plt.figure(figsize=(14, 10))
            colors = ['red', 'green', 'blue', 'orange', 'purple']
            for i, (name, func, p0) in enumerate(curve_types):
                try:
                    popt, pcov = curve_fit(func, x_data, y_data, p0=p0, maxfev=5000)
                    y_pred = func(x_data, *popt)
                    r_squared = 1 - np.sum((y_data - y_pred)**2) / np.sum((y_data - np.mean(y_data))**2)
                    results.append({'Variable': 'Telomere_Length', 'Curve_Type': name, 'R_squared': r_squared, 'Parameters': popt.tolist(), 'Parameter_Errors': np.sqrt(np.diag(pcov)).tolist()})
                    x_smooth = np.linspace(x_data.min(), x_data.max(), 100)
                    plt.plot(x_smooth, func(x_smooth, *popt), color=colors[i], linestyle='--', label=f'{name} (R² = {r_squared:.3f})', linewidth=2)
                    if r_squared > best_r_squared:
                        best_r_squared, best_fit = r_squared, (name, func, popt)
                except Exception as e:
                    print(f"Could not fit {name} to Telomere_Length: {e}")
            plt.scatter(x_data, y_data, alpha=0.7, s=50, color='black', label='Data points')
            if best_fit:
                name, func, popt = best_fit
                x_smooth = np.linspace(x_data.min(), x_data.max(), 100)
                plt.plot(x_smooth, func(x_smooth, *popt), color='red', linewidth=3, label=f'Best fit: {name} (R² = {best_r_squared:.3f})')
                y_pred = func(x_data, *popt)
                residuals = y_data - y_pred
                outlier_threshold = 2 * np.std(residuals)
                outlier_indices = np.where(np.abs(residuals) > outlier_threshold)[0]
                if len(outlier_indices) > 0:
                    plt.scatter(x_data[outlier_indices], y_data[outlier_indices], alpha=0.9, s=80, color='orange', edgecolor='red', linewidth=2, zorder=5)
                    print(f"\n--- Telomere Length Outliers ---")
                    print(f"Outlier threshold: ±{outlier_threshold:.1f} bp")
                    for idx in outlier_indices:
                        sample_name = telomere_df.iloc[idx]['FileName'] if 'FileName' in telomere_df.columns else f'Sample_{idx}'
                        print(f"  {sample_name}: Age={x_data[idx]:.1f}, TL={y_data[idx]:.1f}bp, Residual={residuals[idx]:+.1f}bp")
                        plt.annotate(str(sample_name)[:12] + ('...' if len(str(sample_name)) > 15 else ''), (x_data[idx], y_data[idx]),
                                   xytext=(10, 10), textcoords='offset points', fontsize=8, ha='left', va='bottom', bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7), arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.1'))
            plt.xlabel('Age (years)'); plt.ylabel('Telomere Length (bp)')
            plt.title(f'Curve Fitting: Telomere Length vs Age [{version}]', fontsize=14, fontweight='bold')
            plt.legend(); plt.grid(True, alpha=0.3); plt.tight_layout()
            output_path = os.path.join(output_dir, 'telomere_length_vs_age_curve_fitting.png')
            plt.savefig(output_path, dpi=300, bbox_inches='tight')
            plt.close()
            print(f"Telomere length curve fitting plot saved: {output_path}")
    mutation_rate_cols = [c for c in df.columns if 'per_1k' in c and c != 'Telomere_Length']
    target_cols = [c for c in ['total_mutations_per_1k_reads', 'mutation_rate_normalized_by_length', 'composite_score',
                               'g_strand_mutations_sum_per_1k', 'c_strand_mutations_sum_per_1k'] if c in df.columns]
    if not target_cols:
        target_cols = mutation_rate_cols[:3]
    for col in target_cols:
        mutation_df = df.dropna(subset=[col])
        if len(mutation_df) < 4:
            continue
        x_data, y_data = mutation_df['Age'].values, mutation_df[col].values
        best_fit, best_r_squared = None, -np.inf
        plt.figure(figsize=(14, 10))
        colors = ['red', 'green', 'blue', 'orange', 'purple']
        for i, (name, func, p0) in enumerate(curve_types):
            try:
                popt, pcov = curve_fit(func, x_data, y_data, p0=p0, maxfev=5000)
                y_pred = func(x_data, *popt)
                r_squared = 1 - np.sum((y_data - y_pred)**2) / np.sum((y_data - np.mean(y_data))**2)
                results.append({'Variable': col, 'Curve_Type': name, 'R_squared': r_squared, 'Parameters': popt.tolist(), 'Parameter_Errors': np.sqrt(np.diag(pcov)).tolist()})
                x_smooth = np.linspace(x_data.min(), x_data.max(), 100)
                plt.plot(x_smooth, func(x_smooth, *popt), color=colors[i], linestyle='--', label=f'{name} (R² = {r_squared:.3f})', linewidth=2)
                if r_squared > best_r_squared:
                    best_r_squared, best_fit = r_squared, (name, func, popt)
            except Exception as e:
                print(f"Could not fit {name} to {col}: {e}")
        plt.scatter(x_data, y_data, alpha=0.7, s=50, color='black', label='Data points')
        if best_fit:
            name, func, popt = best_fit
            x_smooth = np.linspace(x_data.min(), x_data.max(), 100)
            plt.plot(x_smooth, func(x_smooth, *popt), color='red', linewidth=3, label=f'Best fit: {name} (R² = {best_r_squared:.3f})')
            y_pred = func(x_data, *popt)
            residuals = y_data - y_pred
            outlier_threshold = 2 * np.std(residuals)
            outlier_indices = np.where(np.abs(residuals) > outlier_threshold)[0]
            if len(outlier_indices) > 0:
                plt.scatter(x_data[outlier_indices], y_data[outlier_indices], alpha=0.9, s=80, color='orange', edgecolor='red', linewidth=2, zorder=5)
                print(f"\n--- {col.replace('_', ' ').title()} Outliers ---")
                print(f"Outlier threshold: ±{outlier_threshold:.3f}")
                for idx in outlier_indices:
                    sample_name = mutation_df.iloc[idx]['FileName'] if 'FileName' in mutation_df.columns else f'Sample_{idx}'
                    print(f"  {sample_name}: Age={x_data[idx]:.1f}, Value={y_data[idx]:.3f}, Residual={residuals[idx]:+.3f}")
                    plt.annotate(str(sample_name)[:12] + ('...' if len(str(sample_name)) > 15 else ''), (x_data[idx], y_data[idx]),
                               xytext=(10, 10), textcoords='offset points', fontsize=8, ha='left', va='bottom', bbox=dict(boxstyle='round,pad=0.3', facecolor='yellow', alpha=0.7), arrowprops=dict(arrowstyle='->', connectionstyle='arc3,rad=0.1'))
        plt.xlabel('Age (years)'); plt.ylabel(col.replace('_', ' ').title())
        plt.title(f'Curve Fitting: {col.replace("_", " ").title()} vs Age [{version}]', fontsize=14, fontweight='bold')
        plt.legend(); plt.grid(True, alpha=0.3); plt.tight_layout()
        safe_col = col.replace('/', '_').replace(' ', '_').replace('>', 'to').replace('<', 'lt').replace(':', '_')
        output_path = os.path.join(output_dir, f'{safe_col}_vs_age_curve_fitting.png')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()
        print(f"Mutation rate curve fitting plot saved: {output_path}")
    if results:
        results_df = pd.DataFrame(results)
        results_csv_path = os.path.join(output_dir, "curve_fitting_results.csv")
        results_df.to_csv(results_csv_path, index=False)
        print(f"Curve fitting results saved: {results_csv_path}")
        print("\n=== CURVE FITTING SUMMARY ===")
        for variable in results_df['Variable'].unique():
            var_results = results_df[results_df['Variable'] == variable]
            best_result = var_results.loc[var_results['R_squared'].idxmax()]
            print(f"{variable}:")
            print(f"  Best fit: {best_result['Curve_Type']} (R² = {best_result['R_squared']:.4f})")
            print(f"  Parameters: {best_result['Parameters']}")
            print()

def curve_fitting_analysis_main(patterns_file_path):
    csv_path = _default_csv_path_from_patterns(patterns_file_path)
    curve_fitting_analysis(csv_path, patterns_file_path=patterns_file_path)

## Run All Plots (equivalent to: main.py run / main.py plot)

Run the cell below to execute the full plotting pipeline: histograms, mutational signatures, Spearman correlations, pairwise heatmap, trendlines, and curve fitting.

In [None]:
# Run full plotting pipeline (equivalent to main.py run after generate, or main.py plot)
print("[2/2] Running plots ...")
os.makedirs(_HISTOGRAM_DIR, exist_ok=True)
plot_histograms_from_csv(csv_path, output_dir=_HISTOGRAM_DIR)
plot_mutational_signatures_main(PATTERNS_PATH)
plot_spearman_with_age_main(PATTERNS_PATH)
plot_pairwise_r_heatmap_main(PATTERNS_PATH)
# Trendlines
def _unique_output_path(directory, base_name, ext):
    os.makedirs(directory, exist_ok=True)
    timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
    return os.path.join(directory, f"{base_name}_{timestamp}.{ext}")
_TRENDLINE_VARIABLES = ["total_mutations_over_total_g_strand_per_1k", "g_strand_T>C_sum_per_1k", "g_strand_G>T_sum_per_1k", "g_strand_T>G_sum_per_1k"]
_TRENDLINE_TITLES = ["Total Mutations Normalized", "T > C Mutation Rate Normalized", "G > T Mutation Rate Normalized", "T > G Mutation Rate Normalized"]
trendline_output = _unique_output_path(_TRENDLINES_DIR, "trendline", "png")
spearman_output = _unique_output_path(_SPEARMAN_CORRELATIONS_DIR, "spearman_correlation", "png")
plot_trendlines_main(csv_path, trendline_output, spearman_output, _TRENDLINE_VARIABLES, _TRENDLINE_TITLES, PATTERNS_PATH)
# Curve fitting
curve_fitting_analysis_main(PATTERNS_PATH)
print("\nAll plotting complete.")