# Gene List Generation

This notebook generates gene lists for Endeavour analysis from nCop-formatted MAF data.

## Goal:
Extract unique gene lists for use as training data in Endeavour

In [None]:
import pandas as pd
import numpy as np

# Add your gene list generation code here

In [None]:
# List all generated gene lists.
# Sorted so the report order is the same on every run: Path.glob yields
# entries in whatever order the filesystem returns them.
gene_lists = sorted(out_dir.glob("*.txt"))
print(f"Generated gene lists ({len(gene_lists)}):")
for f in gene_lists:
    # Count genes in file: the lists hold one gene per line, so only
    # non-blank lines are counted (a stray empty line is not a gene).
    # Iterating the handle avoids loading the whole file into memory.
    with open(f, 'r', encoding="utf-8") as file:
        gene_count = sum(1 for line in file if line.strip())
    print(f"  - {f.name}: {gene_count} genes")

# Static checklist of the follow-up steps for the Endeavour workflow.
print("\n" + "="*60)
print("Next Steps:")
print("="*60)
print("1. Download functional interaction networks (HPRD/STRING/BioGRID)")
print("2. Upload gene lists to Endeavour for prioritization")
print("3. Use MAF gene lists as training genes")
print("4. Use network gene lists as candidate genes")
print("5. Download ranked results for further analysis")

## Summary and Next Steps

In [None]:
# Generate gene lists: for every file in `maf_files`, write the sorted unique
# values of its "Hugo_Symbol" column to "<maf stem>_genes.txt" inside `out_dir`.
# Both `maf_files` and `out_dir` must already be defined by the setup cells.
for maf_file in maf_files:
    print(f"\n{'='*60}")
    print(f"Processing: {maf_file.name}")
    print(f"{'='*60}")

    # Read nCop MAF (tab-separated)
    df = pd.read_csv(maf_file, sep="\t")
    print(f"  Total patient-gene pairs: {len(df)}")

    # Fail with the offending file name instead of a bare KeyError('Hugo_Symbol').
    if "Hugo_Symbol" not in df.columns:
        raise KeyError(f"'Hugo_Symbol' column not found in {maf_file.name}")

    # Get unique genes and sort.
    # Missing symbols are dropped first: pandas represents them as float NaN,
    # and sorting a mix of NaN and str raises TypeError. astype(str) keeps the
    # sort well-defined even if pandas inferred a non-string dtype.
    genes = sorted(df["Hugo_Symbol"].dropna().astype(str).unique())
    print(f"  Unique genes: {len(genes)}")

    # Save as text file (one gene per line, no header, no index column)
    output_file = out_dir / f"{maf_file.stem}_genes.txt"
    pd.Series(genes).to_csv(output_file, index=False, header=False)
    print(f"  ✅ Saved to: {output_file.name}")

    # Display preview
    print("\n  First 10 genes:")
    for i, gene in enumerate(genes[:10], 1):
        print(f"    {i}. {gene}")

    if len(genes) > 10:
        print(f"    ... and {len(genes) - 10} more")

print(f"\n{'='*60}")
print("✅ Gene list generation completed!")
print(f"{'='*60}")

In [None]:
# Find all nCop MAF files (*.maf directly inside `ncop_dir`).
# Sorted so the files are listed and later processed in the same order on
# every run; Path.glob itself yields them in filesystem order.
maf_files = sorted(ncop_dir.glob("*.maf"))

if not maf_files:
    # Nothing to process: point the user at the notebook that produces the inputs.
    print(f"⚠️  No nCop MAF files found in {ncop_dir}")
    print("\nPlease run notebook 02_maf_to_ncop.ipynb first!")
else:
    print(f"Found {len(maf_files)} nCop MAF file(s):")
    for f in maf_files:
        print(f"  - {f.name}")

## Generate Gene Lists

Extract the unique gene symbols (`Hugo_Symbol` column) from each nCop MAF file and save them as `<maf file name>_genes.txt` text files (one gene per line).

In [None]:
# Path is imported here so this cell runs on a fresh kernel; without it the
# Path(...) calls below raise NameError. Re-importing is harmless if Path is
# already in scope.
from pathlib import Path

# Define directory paths (relative to the notebook's working directory)
ncop_dir = Path("../data/mafs/ncop")
out_dir = Path("../data/endeavour/MAF_lists")

# Create output directory (including parents); no error if it already exists
out_dir.mkdir(parents=True, exist_ok=True)

print(f"Input directory (nCop MAFs): {ncop_dir}")
print(f"Output directory (gene lists): {out_dir}")

## Setup Paths