# Evaluation and Analysis

This notebook evaluates the cancer driver gene prioritization results.

## Goals:
- Compare with known cancer drivers (COSMIC, IntOGen)
- Calculate precision, recall, and F1-score
- Visualize results
- Generate a comprehensive evaluation report

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Add your evaluation code here

In [None]:
# Create summary report: one row per method, using its metrics at the top-50 cutoff.
summary = []

for name, eval_df in evaluation_results.items():
    # A method with fewer predictions than every cutoff produces an empty frame
    # (possibly without any columns), so guard before indexing by column name.
    if eval_df.empty or 'top_k' not in eval_df.columns:
        continue

    # Get metrics at top-50
    top_50_metrics = eval_df[eval_df['top_k'] == 50]
    if top_50_metrics.empty:
        continue

    # Read each column separately so integer counts keep their integer dtype.
    summary.append({
        'method': name,
        'precision@50': top_50_metrics['precision'].iloc[0],
        'recall@50': top_50_metrics['recall'].iloc[0],
        'f1@50': top_50_metrics['f1_score'].iloc[0],
        'true_positives@50': top_50_metrics['true_positives'].iloc[0]
    })

if summary:
    # Best-performing method (by F1 at top-50) first.
    summary_df = pd.DataFrame(summary).sort_values('f1@50', ascending=False)

    print("\n" + "="*60)
    print("EVALUATION SUMMARY (Top-50 Predictions)")
    print("="*60)
    print(summary_df.to_string(index=False))

    # Save summary
    summary_file = results_dir / 'evaluation_summary.csv'
    summary_df.to_csv(summary_file, index=False)
    print(f"\n✅ Saved evaluation summary to: {summary_file.name}")

print("\n" + "="*60)
print("✅ Evaluation completed!")
print("="*60)

## Summary and Export

Generate a comprehensive evaluation report.

In [None]:
# Analyze top predictions: for each result table, list its first (up to) 20 rows
# and flag which genes are already in the known-driver reference set.
# NOTE(review): this relies on each table already being sorted best-first - confirm.
for name, df in results_dict.items():
    # Any column whose label mentions "gene" or "symbol" qualifies; str() keeps
    # the check safe when a file was parsed with non-string column labels.
    gene_cols = [c for c in df.columns if 'gene' in str(c).lower() or 'symbol' in str(c).lower()]
    if not gene_cols:
        continue
    gene_col = gene_cols[0]

    print(f"\n{'='*60}")
    print(f"Top 20 Predictions: {name}")
    print(f"{'='*60}")

    top_20 = df.head(20).copy()
    n_shown = len(top_20)  # can be < 20 for short result files
    if n_shown == 0:
        print("(no predictions in this file)")
        continue

    top_20['is_known_driver'] = top_20[gene_col].isin(known_drivers)
    top_20['status'] = top_20['is_known_driver'].map({True: '✅ Known', False: '🆕 Novel'})

    print(top_20[[gene_col, 'status']].to_string(index=False))

    # Use the number of rows actually shown as the denominator so the percentage
    # stays correct when fewer than 20 predictions are available.
    known_count = top_20['is_known_driver'].sum()
    print(f"\nKnown drivers in top {n_shown}: {known_count}/{n_shown} ({known_count/n_shown*100:.1f}%)")

## Detailed Analysis: Top Predicted Drivers

In [None]:
# Plot precision, recall and F1 against the top-k cutoff, one panel per metric
# and one line per evaluated method, then save the figure next to the results.
if len(evaluation_results) > 0:
    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # (metric column, axis label, marker) for the three panels, left to right.
    panels = [
        ('precision', 'Precision', 'o'),
        ('recall', 'Recall', 's'),
        ('f1_score', 'F1 Score', '^'),
    ]

    for ax, (metric, label, marker) in zip(axes, panels):
        for name, eval_df in evaluation_results.items():
            # An empty metrics frame has nothing to draw and may lack columns.
            if eval_df.empty:
                continue
            ax.plot(eval_df['top_k'], eval_df[metric], marker=marker, label=name)

        ax.set_xlabel('Top K Predictions')
        ax.set_ylabel(label)
        ax.set_title(f'{label} at Different K')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plot_file = results_dir / 'evaluation_metrics.png'
    plt.savefig(plot_file, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\n✅ Saved evaluation plot to: {plot_file}")
else:
    print("No results to visualize yet.")

## Visualize Results

In [None]:
def evaluate_prioritization(predicted_genes, known_drivers, top_k_list=(10, 20, 50, 100, 200)):
    """
    Evaluate gene prioritization against known drivers

    For every cutoff k in ``top_k_list`` the first k entries of
    ``predicted_genes`` are compared, as a set, with ``known_drivers``.
    Cutoffs larger than the number of predictions are skipped.

    Args:
        predicted_genes: Sequence of predicted driver genes (ranked, best first)
        known_drivers: Collection of known driver genes (any iterable; it is
            converted to a set internally)
        top_k_list: Iterable of top-k values to evaluate

    Returns:
        DataFrame with one row per evaluated cutoff and the columns
        ``top_k``, ``true_positives``, ``precision``, ``recall`` and
        ``f1_score``. The columns are present even when no cutoff could be
        evaluated, so callers can always index by column name.
    """
    metric_columns = ['top_k', 'true_positives', 'precision', 'recall', 'f1_score']

    # Accept lists/Series as well as sets: the set operators below need real sets.
    known_drivers = set(known_drivers)
    predicted_genes = list(predicted_genes)

    results = []

    for top_k in top_k_list:
        if top_k > len(predicted_genes):
            continue

        top_predicted = set(predicted_genes[:top_k])

        # Calculate metrics
        true_positives = len(top_predicted & known_drivers)

        # TP + FP is exactly the number of distinct genes in the top-k slice.
        precision = true_positives / len(top_predicted) if top_predicted else 0
        recall = true_positives / len(known_drivers) if known_drivers else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        results.append({
            'top_k': top_k,
            'true_positives': true_positives,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        })

    # Passing explicit columns keeps the schema stable for an empty result.
    return pd.DataFrame(results, columns=metric_columns)

# Evaluate each result file: compute top-k metrics per loaded table and keep
# them in `evaluation_results`, keyed by the same name used in `results_dict`.
evaluation_results = {}

for name, df in results_dict.items():
    # Get gene column (flexible naming): first candidate present in the table wins.
    gene_col = next(
        (col for col in ['gene', 'Gene', 'Hugo_Symbol', 'symbol'] if col in df.columns),
        None,
    )

    if gene_col is None:
        print(f"⚠️  Skipping {name}: No gene column found")
        continue

    # Row order of the table is taken as the ranking (best prediction first).
    predicted_genes = df[gene_col].tolist()
    eval_df = evaluate_prioritization(predicted_genes, known_drivers)
    evaluation_results[name] = eval_df

    print(f"\n{'='*60}")
    print(f"Evaluation: {name}")
    print(f"{'='*60}")
    print(eval_df.to_string(index=False))

## Calculate Evaluation Metrics

Calculate precision, recall, and F1-score at several top-k cutoffs (by default the top 10, 20, 50, 100, and 200 predictions).

In [None]:
# Find all result files
# A plain "*.ext" pattern with Path.glob only matches direct children of
# results_dir, so files in subdirectories are never picked up. Sorting makes
# the load order (and everything derived from it) deterministic.
result_files = sorted(list(results_dir.glob("*.tsv")) + list(results_dir.glob("*.txt")))

# Loaded tables keyed by file stem (file name without extension).
results_dict = {}

if len(result_files) == 0:
    print(f"⚠️  No result files found in {results_dir}")
    print("\nPlease run analysis first (notebooks 04 or 05)")
else:
    print(f"Found {len(result_files)} result file(s):")

    for result_file in result_files:
        try:
            # A real tab character: a two-character backslash-t would be
            # treated as a regex separator and push pandas onto the slower
            # python engine with a ParserWarning.
            df = pd.read_csv(result_file, sep="\t")
            results_dict[result_file.stem] = df
            print(f"  ✅ {result_file.name}: {len(df)} genes")
        except Exception as e:
            # Best-effort loading: report the unreadable file and keep going.
            print(f"  ❌ {result_file.name}: Error - {e}")

## Load Prioritization Results

Load results from Endeavour and/or nCop analysis

In [None]:
# Load known drivers: the reference file has no header and one gene symbol per line.
known_driver_column = pd.read_csv(reference_file, header=None)[0]

# Drop blank entries and surrounding whitespace so exact-match lookups against
# predicted gene symbols are not defeated by formatting noise.
known_drivers = set(known_driver_column.dropna().astype(str).str.strip())
print(f"\nLoaded {len(known_drivers)} known cancer driver genes")

In [None]:
# Example: Create a sample known drivers list (replace with actual COSMIC/IntOGen data)
known_drivers_example = [
    'TP53', 'KRAS', 'EGFR', 'PIK3CA', 'BRAF', 'PTEN', 'APC', 'RB1',
    'CDKN2A', 'SMAD4', 'FBXW7', 'NRAS', 'ARID1A', 'CTNNB1', 'FAT1',
    'GATA3', 'MAP3K1', 'MYC', 'NOTCH1', 'PPP2R1A', 'ATM', 'BRCA1',
    'BRCA2', 'CDH1', 'ERBB2', 'FGFR3', 'IDH1', 'NF1', 'SETD2', 'VHL'
]

# Save example reference file: one gene symbol per line, no header and no index,
# which is the layout the known-driver loading cell reads back.
reference_file = reference_dir / 'known_cancer_drivers.txt'
pd.Series(known_drivers_example).to_csv(reference_file, index=False, header=False)

print(f"✅ Saved {len(known_drivers_example)} known driver genes to: {reference_file.name}")
print("\n⚠️  Note: Replace this with actual COSMIC/IntOGen data for real evaluation!")
print("\nDownload from:")
print("  - COSMIC: https://cancer.sanger.ac.uk/census")
print("  - IntOGen: https://www.intogen.org/")

## Load Known Cancer Driver Genes

For evaluation, we need reference sets of known cancer drivers from:
- **COSMIC Cancer Gene Census**
- **IntOGen**
- **Bailey et al. 2018** (comprehensive driver study)

In [None]:
# `Path` is not imported anywhere else in the notebook; without this import the
# cell raises NameError on a fresh kernel.
from pathlib import Path

# Define paths (relative to the notebook's working directory)
results_dir = Path("../data/evaluation")
reference_dir = results_dir / "reference"

# Create directories: parents=True also creates results_dir when it is missing.
reference_dir.mkdir(parents=True, exist_ok=True)

print(f"Results directory: {results_dir}")
print(f"Reference data directory: {reference_dir}")

## Setup Paths and Load Data