# MAF Cleaning

This notebook handles the cleaning and preprocessing of MAF (Mutation Annotation Format) files.

## Goals:
- Remove silent mutations
- Keep only protein-altering variants
- Standardize gene names
- Output cleaned MAF files

In [None]:
import pandas as pd
import numpy as np

# Add your MAF cleaning code here

In [None]:
# List all cleaned files
# Path.glob gives no ordering guarantee, so sort the matches to make the
# listing identical from run to run and across filesystems.
cleaned_files = sorted(clean_dir.glob("*.maf"))
print(f"Cleaned MAF files ({len(cleaned_files)}):")
for f in cleaned_files:
    # st_size is in bytes; report kilobytes with one decimal place
    size_kb = f.stat().st_size / 1024
    print(f"  - {f.name} ({size_kb:.1f} KB)")

## Verify Cleaned Files

In [None]:
# Process each MAF file
#
# For every path in maf_files: read the tab-separated table, keep the rows
# whose Variant_Classification is listed in protein_altering, reduce the frame
# to the columns in required_columns, drop rows with any missing value, and
# write the result under the same file name in clean_dir.

# Variant classes retained by the filter; rows with any other
# Variant_Classification value are dropped. Defined once, outside the loop,
# because the list does not change between files.
protein_altering = [
    "Missense_Mutation",
    "Nonsense_Mutation",
    "Frame_Shift_Del",
    "Frame_Shift_Ins",
    "Splice_Site"
]

# Columns kept in the cleaned output, in output order.
required_columns = ["Hugo_Symbol", "Variant_Classification", "Tumor_Sample_Barcode"]

for maf_file in maf_files:
    print(f"\n{'='*60}")
    print(f"Processing: {maf_file.name}")
    print(f"{'='*60}")

    # Read MAF file (skip comment lines starting with #)
    # NOTE(review): comment="#" also stops parsing a data line at any '#'
    # that appears inside a field -- confirm the input MAFs never contain one.
    df = pd.read_csv(maf_file, sep="\t", comment="#", low_memory=False)
    print(f"  Original mutations: {len(df)}")

    # Fail early, naming the offending file, when an expected column is absent
    # (the column lookups below would otherwise raise a bare KeyError).
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(
            f"{maf_file.name} is missing required column(s): {missing_columns}"
        )

    # Filter for protein-altering variants only
    df_filtered = df[df["Variant_Classification"].isin(protein_altering)]
    print(f"  Protein-altering mutations: {len(df_filtered)}")

    # Select relevant columns; .copy() detaches the result from df_filtered
    df_clean = df_filtered[required_columns].copy()

    # Remove any rows with missing values
    df_clean = df_clean.dropna()
    print(f"  After removing NaNs: {len(df_clean)}")

    # Save cleaned MAF (tab-separated, same file name, no index column)
    output_file = clean_dir / maf_file.name
    df_clean.to_csv(output_file, sep="\t", index=False)
    print(f"  ✅ Saved to: {output_file.name}")

    # Display summary statistics
    print("\n  Summary:")
    print(f"    Unique genes: {df_clean['Hugo_Symbol'].nunique()}")
    print(f"    Unique samples: {df_clean['Tumor_Sample_Barcode'].nunique()}")
    print("\n  Variant distribution:")
    print(df_clean['Variant_Classification'].value_counts().to_string())

print(f"\n{'='*60}")
# Only report success when at least one file was actually processed
if maf_files:
    print("✅ All MAFs cleaned successfully!")
else:
    print("⚠️  No MAF files were processed.")
print(f"{'='*60}")

In [None]:
# Find all MAF files
# Path.glob gives no ordering guarantee, so sort the matches: files are then
# listed (and later iterated) in the same order on every run.
maf_files = sorted(raw_dir.glob("*.maf"))

if not maf_files:
    # Nothing matched *.maf -- tell the user where the inputs are expected
    print(f"⚠️  No MAF files found in {raw_dir}")
    print("\nPlease download MAF files from cBioPortal and place them in:")
    print(f"  {raw_dir.absolute()}")
else:
    print(f"Found {len(maf_files)} MAF file(s):")
    for f in maf_files:
        print(f"  - {f.name}")

## Clean MAF Files

Keep only the mutations whose `Variant_Classification` is one of the following protein-altering classes:
- Missense_Mutation
- Nonsense_Mutation
- Frame_Shift_Del
- Frame_Shift_Ins
- Splice_Site

In [None]:
# Define directory paths
# Path is imported here because the notebook's import cell only brings in
# pandas and numpy; without this line the cell raises NameError on a fresh
# kernel.
from pathlib import Path

# Both locations are relative paths, resolved against the kernel's current
# working directory.
raw_dir = Path("../data/mafs/raw")
clean_dir = Path("../data/mafs/clean")

# Create clean directory if it doesn't exist
# (parents=True also creates missing parent folders; exist_ok=True makes
# re-running this cell a no-op)
clean_dir.mkdir(parents=True, exist_ok=True)

print(f"Raw MAF directory: {raw_dir}")
print(f"Clean MAF directory: {clean_dir}")
print(f"\nLooking for MAF files in: {raw_dir.absolute()}")

## Setup Paths