# Endeavour to nCop Weights Conversion

This notebook converts Endeavour prioritization results to nCop-compatible weight format.

## Goal:
Transform Endeavour gene rankings into weighted scores for network analysis

In [None]:
import pandas as pd
import numpy as np

# Add your Endeavour to nCop weights conversion code here

In [None]:
# Visualize weight distributions
#
# For up to three converted weight files, draw:
#   - left panel:  histogram of the 'weight' column
#   - right panel: 'weight' as a function of 'rank'
#
# NOTE(review): `plt` is not imported in the visible import cell — presumably
# `matplotlib.pyplot`; confirm it is imported before this cell runs.
weight_files = list(weights_dir.glob("*.txt")) + list(weights_dir.glob("*.tsv"))

if len(weight_files) > 0:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    for weight_file in weight_files[:3]:  # Plot first 3 files
        df = pd.read_csv(weight_file, sep="\t")

        # The directory may contain files that are not converted weight tables;
        # skip them instead of aborting the whole figure with a KeyError.
        missing = {'rank', 'weight'} - set(df.columns)
        if missing:
            print(f"Skipping {weight_file.name}: missing column(s) {sorted(missing)}")
            continue

        # Sort by rank so the line plot is drawn left-to-right rather than
        # zig-zagging in file order.
        df = df.sort_values('rank', kind="stable")

        # Plot weight distribution (NaN weights would make the histogram fail)
        axes[0].hist(df['weight'].dropna(), bins=50, alpha=0.6, label=weight_file.stem)
        axes[1].plot(df['rank'], df['weight'], alpha=0.6, label=weight_file.stem)

    axes[0].set_xlabel('Weight')
    axes[0].set_ylabel('Frequency')
    axes[0].set_title('Weight Distribution')
    axes[0].grid(True, alpha=0.3)

    axes[1].set_xlabel('Rank')
    axes[1].set_ylabel('Weight')
    axes[1].set_title('Weight vs Rank')
    axes[1].grid(True, alpha=0.3)

    # Only add legends when at least one file was actually plotted.
    for ax in axes:
        handles, _ = ax.get_legend_handles_labels()
        if handles:
            ax.legend()

    plt.tight_layout()
    plt.show()
else:
    print("No weight files available for visualization yet.")

## Visualize Weight Distribution

In [None]:
# Convert each Endeavour file to weights


def _standardize_columns(df):
    """Return a copy of ``df`` with lower-cased column names including 'gene' and 'rank'.

    Column detection, in order of preference:
      * gene: a column named 'gene' (case-insensitive); otherwise the first
        column whose name contains 'gene' or 'symbol'; otherwise the first column.
      * rank: a column named 'rank' (case-insensitive); otherwise the first
        column whose name contains 'rank'; otherwise the first column whose
        name contains 'score' (a warning is printed, because its values are
        then used as ranks unchanged); otherwise sequential 0-based ranks in
        file order.

    Raises:
        IndexError: if ``df`` has no columns at all.
    """
    df = df.copy()

    if 'gene' not in df.columns.str.lower():
        gene_col = [c for c in df.columns if 'gene' in c.lower() or 'symbol' in c.lower()]
        if gene_col:
            df = df.rename(columns={gene_col[0]: 'gene'})
        else:
            print(f"  ⚠️  Warning: No 'gene' column found. Using first column.")
            df = df.rename(columns={df.columns[0]: 'gene'})

    if 'rank' not in df.columns.str.lower():
        # Prefer a genuine rank column over a score column, regardless of
        # which one happens to come first in the file.
        rank_col = [c for c in df.columns if 'rank' in c.lower()]
        score_col = [c for c in df.columns if 'score' in c.lower()]
        if rank_col:
            df = df.rename(columns={rank_col[0]: 'rank'})
        elif score_col:
            print(f"  ⚠️  Warning: No 'rank' column found. Using '{score_col[0]}' values as ranks.")
            df = df.rename(columns={score_col[0]: 'rank'})
        else:
            print(f"  ⚠️  Warning: No 'rank' column found. Assigning sequential ranks.")
            df['rank'] = range(len(df))

    # Standardize column names
    df.columns = df.columns.str.lower()
    return df


def _add_weights(df):
    """Return a copy of ``df`` with 'weight' and 'weight_normalized' columns added.

    weight            = 1 / (rank + 1)
    weight_normalized = min-max rescaling of weight to [0, 1]

    When every weight is identical (e.g. a single-gene file) the min-max range
    is zero; in that case each non-missing weight is normalized to 1.0 instead
    of dividing by zero and producing NaN.
    """
    weighted = df.copy()
    weighted['weight'] = 1 / (weighted['rank'] + 1)

    w_min = weighted['weight'].min()
    w_range = weighted['weight'].max() - w_min
    if w_range > 0:
        weighted['weight_normalized'] = (weighted['weight'] - w_min) / w_range
    else:
        # Keep NaN where the weight itself is missing, 1.0 everywhere else.
        weighted['weight_normalized'] = weighted['weight'].where(weighted['weight'].isna(), 1.0)
    return weighted


for endeavour_file in endeavour_files:
    print(f"\n{'='*60}")
    print(f"Processing: {endeavour_file.name}")
    print(f"{'='*60}")

    try:
        # Read Endeavour results
        df = pd.read_csv(endeavour_file, sep="\t")
        print(f"  Total genes: {len(df)}")

        df = _standardize_columns(df)
        df = _add_weights(df)

        # Select output columns
        df_out = df[['gene', 'rank', 'weight', 'weight_normalized']].copy()

        # Save weighted results (same file name, in the weights directory)
        output_file = weights_dir / endeavour_file.name
        df_out.to_csv(output_file, sep="\t", index=False)
        print(f"  ✅ Saved to: {output_file.name}")

        # Display statistics
        print(f"\n  Weight statistics:")
        print(f"    Min weight: {df['weight'].min():.6f}")
        print(f"    Max weight: {df['weight'].max():.6f}")
        print(f"    Mean weight: {df['weight'].mean():.6f}")

        # Display top 10 genes: sort by rank first so these really are the
        # best-ranked genes even when the input file is not ordered.
        print(f"\n  Top 10 prioritized genes:")
        top_genes = df_out.sort_values('rank', kind="stable").head(10)
        for idx, row in top_genes.iterrows():
            print(f"    {int(row['rank'])+1}. {row['gene']} (weight: {row['weight']:.4f})")

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"  ❌ Error processing {endeavour_file.name}: {e}")

print(f"\n{'='*60}")
print("✅ Conversion to nCop weights completed!")
print(f"{'='*60}")

In [None]:
# Collect the Endeavour result files from the results directory:
# every *.txt match first, followed by every *.tsv match.
endeavour_files = [*endeavour_dir.glob("*.txt"), *endeavour_dir.glob("*.tsv")]

if endeavour_files:
    # At least one result file is present: list what was found.
    print(f"Found {len(endeavour_files)} Endeavour result file(s):")
    for f in endeavour_files:
        print(f"  - {f.name}")
else:
    # Nothing to convert yet: explain how to obtain and place the results.
    print(f"⚠️  No Endeavour result files found in {endeavour_dir}")
    print("\n" + "="*60)
    print("Instructions:")
    print("="*60)
    print("1. Upload gene lists (from notebook 03) to Endeavour")
    print("2. Download ranked results")
    print(f"3. Place results in: {endeavour_dir.absolute()}")
    print("\nExpected format: Tab-separated with 'gene' and 'rank' columns")

## Convert Endeavour Rankings to Weights

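Endeavour provides gene rankings. We convert each rank to a weight using:
- Weight = 1 / (rank + 1)
- Normalized weight = (weight - min) / (max - min), which rescales the weights of each file to [0, 1]
- Better-ranked genes (smaller rank values) get higher weights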

In [None]:
# Define directory paths
# `Path` is not imported by the notebook's import cell, so import it here;
# without it this cell fails with NameError on a fresh kernel.
from pathlib import Path

# Input: ranked results downloaded from Endeavour.
endeavour_dir = Path("../data/endeavour/results")
# Output: converted weight tables.
weights_dir = Path("../data/endeavour/ncop_weights")

# Create directories (no error if they already exist)
endeavour_dir.mkdir(parents=True, exist_ok=True)
weights_dir.mkdir(parents=True, exist_ok=True)

print(f"Input directory (Endeavour results): {endeavour_dir}")
print(f"Output directory (nCop weights): {weights_dir}")

## Setup Paths