Statistical Analysis of SV Features

This notebook performs statistical analysis on the extracted genomic features
to assess their discriminative power for TP vs FP classification.

Input:
- CSV file of computed features produced by the feature-extraction notebook (`../data/processed/SV_Features_Dataset.csv`)

Output:
- Statistical test results (CSV)
- Feature distribution plots and violin plots of the top-ranked features
- Feature ranking table for paper

Analysis:
- Mann-Whitney U and two-sample Kolmogorov–Smirnov tests for TP vs FP differences, run per dataset
- Effect size calculations (absolute Cohen's d)
- Feature ranking by discriminative power (mean effect size across datasets)

In [None]:
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ks_2samp, mannwhitneyu
# Silence every warning for the rest of the session so cell output stays compact.
# NOTE(review): this also hides warnings emitted by scipy/pandas during the tests.
warnings.filterwarnings('ignore')

# Plotting defaults: stock matplotlib style, 100 dpi on screen, 300 dpi when saved.
plt.style.use('default')
plt.rcParams.update({'figure.dpi': 100, 'savefig.dpi': 300})

In [None]:
# Read the per-variant feature table; it must provide 'dataset' and 'label' columns.
df = pd.read_csv('../data/processed/SV_Features_Dataset.csv')

n_datasets = df['dataset'].nunique()
n_tp = int((df['label'] == 'TP').sum())
n_fp = int((df['label'] == 'FP').sum())
print(f"Loaded {len(df):,} variants from {n_datasets} datasets")
print(f"TP: {n_tp:,}, FP: {n_fp:,}")

# Features grouped by the kind of evidence they describe.
FEATURE_CATEGORIES = {
    'Size & Copy Number': ['log_svlen', 'depth_ratio', 'depth_mad', 'ab', 'cn_slop'],
    'Read Quality & Mapping': ['mq_drop', 'clip_frac', 'split_reads', 'read_len_med', 'strand_bias'],
    'Sequence Context': ['gc_frac', 'homopolymer_max', 'lcr_mask'],
    'Caller-Specific': ['support_read', 'svtype_DEL']
}

# Flat list of every feature, in category order (dicts preserve insertion order).
FEATURES = [
    feature
    for category_features in FEATURE_CATEGORIES.values()
    for feature in category_features
]

In [None]:
# Report, for each expected feature, how many rows carry a non-null value.
total = len(df)
print("Feature availability:")
for feature in FEATURES:
    if feature not in df.columns:
        print(f"{feature:<18} {'MISSING':<20}")
        continue

    available = df[feature].notna().sum()
    pct = available / total * 100
    print(f"{feature:<18} {available:>8,}/{total:<8,} ({pct:>5.1f}%)")

In [None]:
def calculate_effect_size(tp_data, fp_data):
    """Return the absolute Cohen's d between two samples.

    The pooled standard deviation is built from the unbiased (ddof=1) sample
    variances, so the result is the same whether the inputs are pandas Series,
    NumPy arrays or plain sequences. Missing values (NaN) are dropped before
    anything is counted or averaged.

    Parameters
    ----------
    tp_data, fp_data : array-like of numbers
        The two groups to compare.

    Returns
    -------
    float
        ``|mean(tp) - mean(fp)| / pooled_std``. The sign is discarded, so the
        value says how far apart the groups are but not which one is larger.
        ``nan`` is returned when either group has fewer than two non-missing
        values or when the pooled standard deviation is not positive.
    """
    tp_values = np.asarray(tp_data, dtype=float).ravel()
    fp_values = np.asarray(fp_data, dtype=float).ravel()
    tp_values = tp_values[~np.isnan(tp_values)]
    fp_values = fp_values[~np.isnan(fp_values)]

    n_tp, n_fp = tp_values.size, fp_values.size
    if n_tp < 2 or n_fp < 2:
        return np.nan

    # ddof=1 is spelled out because ndarray.var/std default to ddof=0, which
    # would not match the (n - 1) weights used in the pooled formula below.
    tp_var = tp_values.var(ddof=1)
    fp_var = fp_values.var(ddof=1)
    pooled_var = ((n_tp - 1) * tp_var + (n_fp - 1) * fp_var) / (n_tp + n_fp - 2)

    # `not > 0` also rejects a NaN pooled variance, not just zero.
    if not pooled_var > 0:
        return np.nan

    return float(abs(tp_values.mean() - fp_values.mean()) / np.sqrt(pooled_var))

def analyze_feature_by_dataset(df, feature, dataset, min_samples=50):
    """Compare TP and FP values of one feature within one dataset.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with 'dataset' and 'label' columns plus the ``feature`` column.
    feature : str
        Name of the column to test.
    dataset : object
        Value of the 'dataset' column selecting the rows to analyse.
    min_samples : int, default 50
        Minimum number of non-null values required in *each* of the TP and FP
        groups; smaller groups are not tested.

    Returns
    -------
    dict or None
        Group sizes and means, the effect size from ``calculate_effect_size``,
        and the two-sided Mann-Whitney U and two-sample Kolmogorov-Smirnov
        p-values. ``None`` when either group is smaller than ``min_samples``
        or when a test raises; in the latter case the reason is printed.
    """
    in_dataset = df[df['dataset'] == dataset]

    tp_data = in_dataset.loc[in_dataset['label'] == 'TP', feature].dropna()
    fp_data = in_dataset.loc[in_dataset['label'] == 'FP', feature].dropna()

    if len(tp_data) < min_samples or len(fp_data) < min_samples:
        return None

    try:
        # Statistical tests
        _, mw_p = mannwhitneyu(tp_data, fp_data, alternative='two-sided')
        _, ks_p = ks_2samp(tp_data, fp_data)
        effect_size = calculate_effect_size(tp_data, fp_data)
        tp_mean = tp_data.mean()
        fp_mean = fp_data.mean()
    except Exception as exc:
        # Stay best-effort (one bad feature must not abort the whole run) but
        # report the failure; printed because warnings may be filtered out.
        print(f"  Skipping {feature} in {dataset}: {type(exc).__name__}: {exc}")
        return None

    return {
        'dataset': dataset,
        'feature': feature,
        'tp_count': len(tp_data),
        'fp_count': len(fp_data),
        'tp_mean': tp_mean,
        'fp_mean': fp_mean,
        'effect_size': effect_size,
        'mann_whitney_p': mw_p,
        'ks_p_value': ks_p
    }

In [None]:
def _significance_stars(p_value):
    """Return '***', '**', '*' or '' for p below 0.001, 0.01, 0.05 or none of them."""
    for cutoff, stars in ((0.001, "***"), (0.01, "**"), (0.05, "*")):
        if p_value < cutoff:
            return stars
    return ""


# Test every available feature within every dataset and collect the results.
results = []
for dataset in df['dataset'].unique():
    print(f"\nAnalyzing {dataset}:")
    for feature in FEATURES:
        if feature not in df.columns:
            continue

        result = analyze_feature_by_dataset(df, feature, dataset)
        if not result:
            continue

        results.append(result)
        sig = _significance_stars(result['mann_whitney_p'])
        print(f"  {feature:<18} p={result['mann_whitney_p']:.2e}, d={result['effect_size']:.3f} {sig}")

stats_df = pd.DataFrame(results)
print(f"\nTotal statistical tests: {len(stats_df)}")

In [None]:
if len(stats_df) > 0:
    # Summarize by feature across all datasets: mean effect size, smallest
    # Mann-Whitney p-value, and the number of datasets in which it was tested.
    feature_summary = stats_df.groupby('feature').agg(
        effect_size=('effect_size', 'mean'),
        mann_whitney_p=('mann_whitney_p', 'min'),
        n_datasets=('dataset', 'count'),
    )

    # Number of datasets with p < 0.001 per feature. The comparison has to be
    # made on the column *before* grouping: a GroupBy object cannot be compared
    # with a scalar.
    feature_summary['highly_significant'] = (
        stats_df['mann_whitney_p'].lt(0.001).groupby(stats_df['feature']).sum()
    )
    feature_summary = feature_summary.sort_values('effect_size', ascending=False)

    print("\nTop discriminative features:")
    print(f"{'Rank':<4} {'Feature':<18} {'Avg Effect':<12} {'Min P-value':<12} {'Datasets':<8}")
    print("-" * 60)

    # itertuples keeps each column's dtype, so the dataset count prints as an
    # integer (iterrows would upcast the whole row to float).
    for rank, row in enumerate(feature_summary.head(10).itertuples(), 1):
        print(f"{rank:<4} {row.Index:<18} {row.effect_size:<12.3f} {row.mann_whitney_p:<12.2e} {row.n_datasets:<8}")

    # Save results
    stats_df.to_csv('../data/processed/feature_statistics.csv', index=False)
    feature_summary.to_csv('../data/processed/feature_ranking.csv')

else:
    print("No statistical results generated")

In [None]:
# Create distribution plots for all features.
# TP and FP values are pooled over every dataset here, so the p-value and
# effect size in each panel title describe the pooled data, not one dataset.
n_features = len(FEATURES)
n_cols = 3
n_rows = int(np.ceil(n_features / n_cols))

# One colour per class, shared by bars, histograms and mean lines.
TP_COLOR, FP_COLOR = 'C0', 'C1'

# squeeze=False keeps `axes` a 2-D array even when the grid is a single cell,
# so flatten() always works.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, 5 * n_rows), squeeze=False)
axes = axes.flatten()

for i, feature in enumerate(FEATURES):
    ax = axes[i]

    if feature not in df.columns:
        ax.text(0.5, 0.5, f'{feature}\nMISSING', ha='center', va='center', transform=ax.transAxes)
        continue

    tp_data = df[df['label'] == 'TP'][feature].dropna()
    fp_data = df[df['label'] == 'FP'][feature].dropna()

    if len(tp_data) == 0 or len(fp_data) == 0:
        ax.text(0.5, 0.5, f'{feature}\nNO DATA', ha='center', va='center', transform=ax.transAxes)
        continue

    # Known flag features, or anything with at most five distinct TP values,
    # is shown as grouped bars instead of a histogram.
    if feature in ['lcr_mask', 'svtype_DEL'] or tp_data.nunique() <= 5:
        tp_counts = tp_data.value_counts(normalize=True)
        fp_counts = fp_data.value_counts(normalize=True)

        all_values = sorted(set(tp_counts.index) | set(fp_counts.index))
        tp_props = [tp_counts.get(v, 0) for v in all_values]
        fp_props = [fp_counts.get(v, 0) for v in all_values]

        x = np.arange(len(all_values))
        width = 0.35

        ax.bar(x - width/2, tp_props, width, label='TP', alpha=0.7, color=TP_COLOR)
        ax.bar(x + width/2, fp_props, width, label='FP', alpha=0.7, color=FP_COLOR)
        ax.set_xticks(x)
        ax.set_xticklabels(all_values)
        ax.set_ylabel('Proportion')
    else:
        # Histogram for continuous features; bin count is the square root of
        # the smaller group, clamped to [10, 50].
        bins = min(50, max(10, int(np.sqrt(min(len(tp_data), len(fp_data))))))

        ax.hist(tp_data, bins=bins, alpha=0.6, label=f'TP (n={len(tp_data):,})',
                density=True, color=TP_COLOR)
        ax.hist(fp_data, bins=bins, alpha=0.6, label=f'FP (n={len(fp_data):,})',
                density=True, color=FP_COLOR)

        # Mean lines drawn in the same colour as the histogram they belong to
        ax.axvline(tp_data.mean(), color=TP_COLOR, linestyle='--', alpha=0.8)
        ax.axvline(fp_data.mean(), color=FP_COLOR, linestyle='--', alpha=0.8)

        ax.set_ylabel('Density')

    # Add statistical info to the title when both groups have more than 10 values
    title = feature
    if len(tp_data) > 10 and len(fp_data) > 10:
        try:
            _, p_val = mannwhitneyu(tp_data, fp_data, alternative='two-sided')
            effect_size = calculate_effect_size(tp_data, fp_data)

            sig_stars = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
            title = f'{feature}\np={p_val:.2e} {sig_stars}, d={effect_size:.3f}'
        except Exception:
            # Best effort: a failed test must not lose the panel, so fall back
            # to the plain feature name.
            title = feature

    ax.set_title(title, fontsize=10)
    ax.legend(fontsize=8)
    ax.grid(True, alpha=0.3)

# Remove empty subplots
for i in range(n_features, len(axes)):
    fig.delaxes(axes[i])

plt.tight_layout()
# Make sure the output directory exists so savefig works on a fresh checkout.
Path('../figures').mkdir(parents=True, exist_ok=True)
plt.savefig('../figures/feature_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
if len(stats_df) > 0:
    # Get top 8 features by mean effect size
    top_features = feature_summary.head(8).index.tolist()

    # Decide what can be drawn *before* creating axes, so that skipped
    # features do not leave blank panels in the grid.
    panels = []
    for feature in top_features:
        if feature not in df.columns:
            continue

        tp_data = df[df['label'] == 'TP'][feature].dropna()
        fp_data = df[df['label'] == 'FP'][feature].dropna()

        if len(tp_data) == 0 or len(fp_data) == 0:
            continue

        # Skip binary / low-cardinality features: a violin is not informative there
        if tp_data.nunique() <= 5:
            continue

        panels.append((feature, tp_data, fp_data))

    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    axes = axes.flatten()

    for ax, (feature, tp_data, fp_data) in zip(axes, panels):
        # Create combined long-form data for seaborn
        combined_data = pd.DataFrame({
            'value': pd.concat([tp_data, fp_data]),
            'label': ['TP'] * len(tp_data) + ['FP'] * len(fp_data)
        })

        # Fix the category order so TP is at x=0 and FP at x=1, which is where
        # the median markers below are placed.
        sns.violinplot(data=combined_data, x='label', y='value', order=['TP', 'FP'], ax=ax)

        # Add median points
        ax.plot(0, tp_data.median(), 'wo', markersize=8, markeredgecolor='black')
        ax.plot(1, fp_data.median(), 'wo', markersize=8, markeredgecolor='black')

        ax.set_title(feature, fontsize=12)
        ax.grid(True, alpha=0.3)

    # Drop the axes that received no feature
    for ax in axes[len(panels):]:
        fig.delaxes(ax)

    plt.suptitle('Top Discriminative Features: Violin Plots', fontsize=16)
    plt.tight_layout()
    # Make sure the output directory exists so savefig works on a fresh checkout.
    Path('../figures').mkdir(parents=True, exist_ok=True)
    plt.savefig('../figures/top_features_violin.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Outputs written by this notebook, in the order they are reported at the end.
# The distribution figure is always written; everything else depends on at
# least one statistical test having been run.
saved_outputs = ['../figures/feature_distributions.png']

if len(stats_df) > 0:
    # Create summary table
    print("Statistical significance of differences in linear features per TP/FP by dataset")
    print("\nFeature significance across datasets:")

    # A feature/dataset pair counts as significant when Mann-Whitney p < 0.01.
    is_significant = stats_df['mann_whitney_p'] < 0.01

    for feature in FEATURES:
        feature_rows = stats_df['feature'] == feature
        if not feature_rows.any():
            continue

        sig_count = int((feature_rows & is_significant).sum())
        total_datasets = int(feature_rows.sum())
        avg_effect = stats_df.loc[feature_rows, 'effect_size'].mean()

        print(f"{feature:<18} {sig_count}/{total_datasets} datasets significant, avg effect: {avg_effect:.3f}")

    # Save table: one row per feature, one column per (statistic, dataset) pair
    paper_table = stats_df.pivot_table(
        index='feature',
        columns='dataset',
        values=['mann_whitney_p', 'effect_size'],
        aggfunc='first'
    )
    paper_table.to_csv('../data/processed/paper_summary_table.csv')

    n_significant = int(is_significant.sum())
    print(f"\nSummary: {n_significant} statistically significant feature-dataset combinations")
    print(f"Out of {len(stats_df)} total tests ({n_significant/len(stats_df)*100:.1f}% significant)")

    saved_outputs = [
        '../data/processed/feature_statistics.csv',
        '../data/processed/feature_ranking.csv',
        '../data/processed/paper_summary_table.csv',
        '../figures/feature_distributions.png',
        '../figures/top_features_violin.png',
    ]

print("\nAnalysis complete.")
print("Results saved to:")
for path in saved_outputs:
    print(f"  - {path}")