In [1]:
#!/usr/bin/env python3
import pandas as pd
import subprocess
import tempfile
import os
import sys

# File paths
# CPG_PATH and PROMOTERS_PATH are the two BED inputs handed to
# `bedtools intersect` (as -a and -b respectively) by main().
# NOTE(review): the file names suggest GRCm39 mouse annotation (CpG islands and
# 2000 bp upstream-of-TSS promoter windows) - confirm against the data owner.
CPG_PATH = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/mouse_genome_annotation/Genes_Promoters_CpG_islands_for_Tae/allCpG_islands_GRCm39.bed"
PROMOTERS_PATH = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/mouse_genome_annotation/Genes_Promoters_CpG_islands_for_Tae/knownGenes_canonTr_2000bp_upstream_TSS_feature_GRCm39_GENCODE.VM36.bed"
# CSV read by add_gene_column_to_csv(); it must contain an 'id' column whose
# values are compared against the "<chr>_<start>" keys of the CpG-gene mapping.
OXID_CPG_PATH = "../data_normalized/cpg_Normalized.csv"
# Destination CSV: the input rows plus a 'gene' column. Both relative paths are
# resolved against the current working directory of the process.
OUTPUT_PATH = "../data_normalized/cpg_Normalized_with_genes.csv"

def run_bedtools_intersect(
    cpg_path,
    promoters_path,
    bedtools_path='/cluster/software/stacks/2024-06/spack/opt/spack/linux-ubuntu22.04-x86_64_v3/gcc-12.2.0/bedtools2-2.31.0-a4obbslkxntgdx2criopqpwx662gcftq/bin/bedtools',
):
    """
    Run ``bedtools intersect -wa -wb`` to find overlaps between CpG islands and promoters.

    Args:
        cpg_path: Path of the BED file passed to bedtools as ``-a``.
        promoters_path: Path of the BED file passed to bedtools as ``-b``.
        bedtools_path: bedtools executable to invoke. Defaults to the cluster
            installation this script was written for.

    Returns:
        A pandas DataFrame with one row per overlap, or None when bedtools
        cannot be started, exits with a non-zero status, or produces fewer
        than seven columns. When bedtools succeeds but reports no overlaps,
        an empty DataFrame carrying the expected column names is returned.

    The column names assume the ``-a`` file contributes exactly three columns
    (chr, start, end) and the ``-b`` file contributes chr, start, end, gene
    and optionally two more - TODO confirm against the actual BED files.
    """
    print("Running bedtools intersect...")

    # 'idk' is presumably the BED score field of the promoter file (TODO confirm);
    # the name is kept so existing consumers of the returned frame are unaffected.
    column_names = [
        'cpg_chr', 'cpg_start', 'cpg_end',
        'prom_chr', 'prom_start', 'prom_end', 'gene', 'idk', 'strand',
    ]
    # Everything up to and including 'gene' is required by the callers.
    min_columns = 7

    cmd = [
        bedtools_path, 'intersect',
        '-a', cpg_path,
        '-b', promoters_path,
        '-wa', '-wb',
    ]

    # A uniquely named temporary file avoids clobbering (or leaving behind) a
    # fixed file in the working directory and is safe for concurrent runs.
    temp_file = tempfile.NamedTemporaryFile(
        mode='w', prefix='bedtools_intersect_', suffix='.bed', delete=False
    )
    temp_path = temp_file.name

    try:
        with temp_file:
            try:
                result = subprocess.run(
                    cmd, stdout=temp_file, stderr=subprocess.PIPE, text=True
                )
            except OSError as err:
                # Missing or non-executable binary: report through the same
                # None-returning path as a failed bedtools run.
                print(f"Error running bedtools: {err}")
                return None

        if result.returncode != 0:
            print(f"Error running bedtools: {result.stderr}")
            return None

        # pandas raises EmptyDataError on a zero-byte file, so handle the
        # legitimate "no overlaps" outcome explicitly.
        if os.path.getsize(temp_path) == 0:
            print("Found 0 CpG-promoter intersections")
            return pd.DataFrame(columns=column_names)

        # Read the intersection results
        df = pd.read_csv(temp_path, sep='\t', header=None)

        n_columns = df.shape[1]
        if n_columns < min_columns:
            print(f"Unexpected number of columns in intersection result: {n_columns}")
            return None

        # Name exactly as many columns as were read: truncate the known names
        # for 7-8 columns, append generic names for anything beyond the ninth.
        extra_names = [
            f"extra_{i}" for i in range(1, n_columns - len(column_names) + 1)
        ]
        df.columns = (column_names + extra_names)[:n_columns]

        print(f"Found {len(df)} CpG-promoter intersections")
        return df

    finally:
        # Always remove the scratch file, whichever path was taken above.
        if os.path.exists(temp_path):
            os.remove(temp_path)

def create_cpg_gene_mapping(intersection_df):
    """
    Build a lookup from CpG island key to the genes found on its intersection rows.

    Rows are grouped by ('cpg_chr', 'cpg_start'); each group becomes one entry
    keyed "<cpg_chr>_<cpg_start>" whose value is the list of distinct values
    in the 'gene' column, in order of first appearance. A summary (and up to
    five multi-gene examples) is printed.

    Returns an empty dict when the input is None or has no rows.
    """
    if intersection_df is None or len(intersection_df) == 0:
        return {}

    # One entry per (chromosome, start) pair; the end coordinate is not part of the key.
    lookup = {
        f"{chrom}_{position}": rows['gene'].unique().tolist()
        for (chrom, position), rows in intersection_df.groupby(['cpg_chr', 'cpg_start'])
    }

    # Summary statistics
    gene_tallies = [len(gene_list) for gene_list in lookup.values()]
    singles = gene_tallies.count(1)
    multiples = sum(tally > 1 for tally in gene_tallies)

    print(f"CpG islands with single gene: {singles}")
    print(f"CpG islands with multiple genes: {multiples}")

    if multiples > 0:
        print("Examples of CpG islands with multiple genes:")
        shown = 0
        for key, gene_list in lookup.items():
            if len(gene_list) <= 1:
                continue
            print(f"  {key}: {', '.join(gene_list)}")
            shown += 1
            if shown == 5:
                break

    return lookup

def add_gene_column_to_csv(oxid_csv_path, cpg_gene_map, output_path):
    """
    Add a 'gene' column to the oxidation CpG CSV file and write the result.

    Each input row is looked up by ``str(id)`` in ``cpg_gene_map``. A row whose
    id maps to several genes is repeated once per gene (in mapping order); a
    row with no mapping, or with an empty gene list, is kept once with the
    gene 'NO_GENE_FOUND'. All other columns keep their values and dtypes.

    Args:
        oxid_csv_path: CSV to read; must contain an 'id' column.
        cpg_gene_map: dict mapping id strings to lists of gene names.
        output_path: Where the annotated CSV is written (without the index).

    Returns:
        True once the output file has been written.

    Raises:
        KeyError: if the input CSV has no 'id' column.
    """
    no_gene_label = 'NO_GENE_FOUND'

    print(f"Reading oxidation data from {oxid_csv_path}...")
    df = pd.read_csv(oxid_csv_path)

    print(f"Original data shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")

    # Show first few IDs to confirm the format
    print(f"Sample IDs: {df['id'].head().tolist()}")

    print("Processing CpG islands and creating rows with gene assignments...")

    # Resolve the gene list for every row up front; only the id column is
    # walked in Python, the row duplication itself is done by explode().
    gene_lists = []
    no_gene_count = 0
    for cpg_id in df['id'].astype(str):
        genes = cpg_gene_map.get(cpg_id)
        if genes is None or len(genes) == 0:
            gene_lists.append([no_gene_label])
            no_gene_count += 1
        else:
            gene_lists.append(list(genes))

    final_df = df.copy()
    # An explicit object Series keeps each list as a single cell, and also
    # guarantees the 'gene' column exists when the input has zero rows.
    final_df['gene'] = pd.Series(gene_lists, index=df.index, dtype=object)
    final_df = final_df.explode('gene').reset_index(drop=True)

    genes_added = len(final_df) - no_gene_count
    duplicated_count = len(final_df) - len(df)

    print(f"Original CpG islands: {len(df)}")
    print(f"Final rows (with duplicates): {len(final_df)}")
    print(f"CpG islands with genes assigned: {len(df) - no_gene_count}")
    print(f"CpG islands with no gene found: {no_gene_count}")
    print(f"Additional rows created due to multiple genes: {duplicated_count}")

    # Save the result
    final_df.to_csv(output_path, index=False)
    print(f"Results saved to: {output_path}")

    # Print summary of gene assignments
    gene_counts = final_df['gene'].value_counts()
    print("\nTop 10 most frequent gene assignments:")
    print(gene_counts.head(10))

    # Show some examples of successful matches
    if genes_added > 0:
        matched_examples = final_df[final_df['gene'] != no_gene_label].head(10)
        print("\nFirst few successful matches:")
        for _, row in matched_examples.iterrows():
            print(f"  ID: {row['id']} -> Gene: {row['gene']}")

    # Show examples of duplicated rows (same CpG, different genes)
    duplicated_cpgs = final_df[final_df.duplicated(subset=['id'], keep=False)]
    if len(duplicated_cpgs) > 0:
        print("\nExamples of CpG islands with multiple genes (showing duplicated rows):")
        sample_id = duplicated_cpgs['id'].iloc[0]
        sample_rows = final_df[final_df['id'] == sample_id]
        for _, row in sample_rows.iterrows():
            print(f"  ID: {row['id']} -> Gene: {row['gene']}")

    return True

def main():
    """
    Run the full annotation pipeline and report whether it succeeded.

    Steps: verify the three input files exist, intersect CpG islands with
    promoters via bedtools, turn the overlaps into a CpG -> genes lookup, and
    write the oxidation CSV with an added gene column.

    Returns False if an input file is missing or the intersection step fails;
    otherwise returns whatever add_gene_column_to_csv() returns.
    """
    print("Starting CpG island gene annotation...")

    # Check if input files exist (reported in this order; first missing one aborts)
    required_inputs = {
        "CpG islands": CPG_PATH,
        "Promoters": PROMOTERS_PATH,
        "Oxidation data": OXID_CPG_PATH,
    }
    for label, location in required_inputs.items():
        if os.path.exists(location):
            continue
        print(f"Error: {label} file not found: {location}")
        return False

    # Step 1: Run bedtools intersect
    overlaps = run_bedtools_intersect(CPG_PATH, PROMOTERS_PATH)
    if overlaps is None:
        print("Failed to run bedtools intersect")
        return False

    # Debug: Show some intersection examples
    preview_columns = ['cpg_chr', 'cpg_start', 'cpg_end', 'gene']
    print("\nFirst few intersection results:")
    print(overlaps[preview_columns].head())

    # Step 2: Create CpG to gene mapping
    gene_lookup = create_cpg_gene_mapping(overlaps)
    print(f"Created mapping for {len(gene_lookup)} CpG islands")

    # Step 3: Add gene column to oxidation CSV
    succeeded = add_gene_column_to_csv(OXID_CPG_PATH, gene_lookup, OUTPUT_PATH)

    if not succeeded:
        print("Analysis completed with warnings.")
        return succeeded

    print("Analysis completed successfully!")
    print("Note: Rows have been duplicated when CpG islands map to multiple genes.")
    return succeeded

if __name__ == "__main__":
    # main() reports failure by returning a falsy value; turn that into a
    # non-zero exit status so shells and job schedulers can detect it.
    # Nothing is raised on success, so a successful run ends as before.
    if not main():
        sys.exit(1)

Starting CpG island gene annotation...
Running bedtools intersect...
Found 16367 CpG-promoter intersections
h

First few intersection results:
       cpg_chr  cpg_start   cpg_end                gene
0  NC_000067.7   91225617  91226516  ENSMUSG00000026307
1  NC_000067.7   38665874  38667165  ENSMUSG00000037138
2  NC_000067.7   85721048  85722212  ENSMUSG00000036707
3  NC_000067.7   88997698  88998967  ENSMUSG00000120921
4  NC_000067.7   88997698  88998967  ENSMUSG00000036206
CpG islands with single gene: 6919
CpG islands with multiple genes: 4384
Examples of CpG islands with multiple genes:
  NC_000067.7_4567170: ENSMUSG00000025902, ENSMUSG00000104238
  NC_000067.7_4855599: ENSMUSG00000033845, ENSMUSG00000120403
  NC_000067.7_4877782: ENSMUSG00000136002, ENSMUSG00000025903, ENSMUSG00000104217
  NC_000067.7_4927688: ENSMUSG00000137875, ENSMUSG00000033813
  NC_000067.7_6284654: ENSMUSG00000090031, ENSMUSG00000025907
Created mapping for 11303 CpG islands
Reading oxidation data from ../data