# MAF to nCop Format Conversion

This notebook converts cleaned MAF files to nCop format.

## Goal:
Convert MAF files to the gene × patient format expected by nCop.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd

# Add your MAF to NCOP conversion code here

In [None]:
# List all nCop files written to `ncop_dir` (defined in the path-setup cell).
# glob() does not guarantee any ordering, so sort the paths to keep the
# listing stable between runs and platforms.
ncop_files = sorted(ncop_dir.glob("*.maf"))
print(f"nCop format files ({len(ncop_files)}):")
for f in ncop_files:
    # Report the on-disk size in kilobytes as a quick sanity check
    size_kb = f.stat().st_size / 1024
    print(f"  - {f.name} ({size_kb:.1f} KB)")

## Verify nCop Files

In [None]:
# Convert each cleaned MAF into the two-column patient/gene table for nCop.
# Relies on `maf_files` (paths of cleaned MAFs) and `ncop_dir` (output
# directory) being defined by the file-discovery and path-setup cells.
ncop_columns = ["Tumor_Sample_Barcode", "Hugo_Symbol"]

for maf_file in maf_files:
    print(f"\n{'='*60}")
    print(f"Processing: {maf_file.name}")
    print(f"{'='*60}")

    # Read cleaned MAF. The identifier columns are forced to str so pandas
    # does not infer a numeric dtype for them; otherwise an all-numeric
    # barcode such as "00123" would be written back out as 123.
    df = pd.read_csv(maf_file, sep="\t", dtype={col: str for col in ncop_columns})
    print(f"  Total mutations: {len(df)}")

    # Select columns for nCop (patient and gene)
    ncop_df = df[ncop_columns].copy()

    # A row without a patient or without a gene is not a patient-gene pair;
    # leave it out of the output and report how many rows were affected.
    n_incomplete = int(ncop_df.isna().any(axis=1).sum())
    if n_incomplete:
        print(f"  ⚠️  Skipped {n_incomplete} row(s) with missing patient or gene")

    # Remove duplicates (same gene mutated multiple times in same patient)
    ncop_df_unique = ncop_df.dropna().drop_duplicates()
    print(f"  Unique patient-gene pairs: {len(ncop_df_unique)}")

    # Save in nCop format: tab-separated, same file name as the input MAF
    output_file = ncop_dir / maf_file.name
    ncop_df_unique.to_csv(output_file, sep="\t", index=False)
    print(f"  ✅ Saved to: {output_file.name}")

    # Display preview
    print("\n  Preview of nCop format:")
    print(ncop_df_unique.head(10).to_string(index=False))

    # Summary statistics
    print("\n  Summary:")
    print(f"    Unique patients: {ncop_df_unique['Tumor_Sample_Barcode'].nunique()}")
    print(f"    Unique genes: {ncop_df_unique['Hugo_Symbol'].nunique()}")

print(f"\n{'='*60}")
if maf_files:
    print("✅ Conversion to nCop format completed!")
else:
    # Do not claim success when the loop above had nothing to process
    print("⚠️  No MAF files to convert - nothing was written.")
print(f"{'='*60}")

In [None]:
# Find all cleaned MAF files in `clean_dir` (defined in the path-setup cell).
# glob() does not guarantee any ordering, so sort the paths to keep the
# processing order and the printed output reproducible.
maf_files = sorted(clean_dir.glob("*.maf"))

if not maf_files:
    # Nothing to convert: point the user at the notebook that produces the inputs
    print(f"⚠️  No cleaned MAF files found in {clean_dir}")
    print("\nPlease run notebook 01_maf_cleaning.ipynb first!")
else:
    print(f"Found {len(maf_files)} cleaned MAF file(s):")
    for f in maf_files:
        print(f"  - {f.name}")

## Convert MAF to nCop Format

nCop expects a gene × patient format, written here as a tab-separated table with one row per unique patient–gene pair and the following columns:
- Tumor_Sample_Barcode (patient ID)
- Hugo_Symbol (gene name)

In [None]:
# Locations of the cleaned input MAFs and of the nCop-formatted output,
# both given relative to the current working directory.
clean_dir, ncop_dir = Path("../data/mafs/clean"), Path("../data/mafs/ncop")

# Ensure the output directory (and any missing parents) exists;
# this is a no-op when it is already present.
ncop_dir.mkdir(exist_ok=True, parents=True)

# Echo both locations so the reader can confirm them before converting
print(
    f"Input directory (cleaned MAFs): {clean_dir}",
    f"Output directory (nCop format): {ncop_dir}",
    sep="\n",
)

## Setup Paths