In [3]:
import os
import pandas as pd
import numpy as np
import glob
import subprocess
from collections import defaultdict

In [16]:
# Path with files outputted from bedtools intersect
PATH = "/cluster/scratch/taekim/data_oxidation/promoters_intersect" 
OUTPUT_FILE = "../data_normalized/promoters_Normalized.csv"
GENOME_PATH = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta"
# Original path with all CpG coordinates
ORG_PATH = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/mouse_genome_annotation/Genes_Promoters_CpG_islands_for_Tae/knownGenes_canonTr_2000bp_upstream_TSS_feature_GRCm39_GENCODE.VM36.bed" 

# Define column names for the input files
INPUT_COLUMNS = ["Chr1", "Start1", "End1", "Value", "MAPQ", "Chr2", "Start2", "End2", "Gene", "Transcript", "ignore"]

# Define columns for the output file
OUTPUT_COLUMNS = ["id", "sample", "gene", "strand", "GC_count", "damage", 
                  "GC_normalized_damage", "median", "median_normalized_damage"]

# Define the median values for each sample
# Median value for each sample from 100kb bins
median_values = {
    "Sample_14_CRS_evening_S14_": 2.546314,
    "Sample_15_Ctrl_evening_S15_": 3.570246,
    "Sample_05_Ctrl_morning_S5_": 6.184096,
    "Sample_01_Ctrl_morning_S1_": 6.921409,
    "Sample_16_CRS_evening_S16_": 2.879618,
    "Sample_11_Ctrl_evening_S11_": 2.485161,
    "Sample_13_Ctrl_evening_S13_": 2.403964,
    "Sample_08_CRS_morning_S8_": 5.246539,
    "Sample_20_CRS_evening_S20_": 10.633666,
    "Sample_18_CRS_evening_S18_": 4.602043,
    "Sample_17_Ctrl_evening_S17_": 2.929046,
    "Sample_04_CRS_morning_S4_": 6.227455,
    "Sample_19_Ctrl_evening_S19_": 2.545900,
    "Sample_03_Ctrl_morning_S3_": 9.529751,
    "Sample_02_CRS_morning_S2_": 7.151634,
    "Sample_09_Ctrl_morning_S9_": 7.923174,
    "Sample_10_CRS_morning_S10_": 8.815860,
    "Sample_06_CRS_morning_S6_": 7.930738,
    "Sample_12_CRS_evening_S12_": 2.849694,
    "Sample_07_Ctrl_morning_S7_": 4.924262
}

In [17]:
def run_bedtools_getfasta(bed_file, output_file, genome_path):
    """Run bedtools getfasta command to get sequence data for a BED file"""
    command = f"/cluster/software/stacks/2024-06/spack/opt/spack/linux-ubuntu22.04-x86_64_v3/gcc-12.2.0/bedtools2-2.31.0-a4obbslkxntgdx2criopqpwx662gcftq/bin/bedtools getfasta -fi {genome_path} -bed {bed_file} -bedOut > {output_file}"
    print(f"Running command: {command}")
    
    try:
        result = subprocess.run(command, shell=True, check=True, 
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               text=True)
        print("Command completed successfully")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error running bedtools: {e}")
        print(f"STDERR: {e.stderr}")
        return False

SEQ_COLUMNS = ["Chr", "Start", "End", "Gene", "idk", "ignore", "Seq"]
seq_output_file = os.path.join(PATH, "promoters.SEQ.bed")
# Run getfasta with original CpG coordinate file
run_bedtools_getfasta(ORG_PATH, seq_output_file, GENOME_PATH)
seq_df = pd.read_csv(seq_output_file, sep="\t", header=None, names=SEQ_COLUMNS)

seq_counts_plus = seq_df.copy()
seq_counts_plus['id'] = seq_counts_plus['Gene'] + '+'  # Use Gene for consistency with previous code
seq_counts_plus['GC_count'] = seq_counts_plus['Seq'].str.upper().str.count('G')
seq_counts_plus = seq_counts_plus.drop_duplicates('id')

seq_counts_minus = seq_df.copy()
seq_counts_minus['id'] = seq_counts_minus['Gene'] + '-'  # Use Gene for consistency with previous code
seq_counts_minus['GC_count'] = seq_counts_minus['Seq'].str.upper().str.count('C')
seq_counts_minus = seq_counts_minus.drop_duplicates('id')

# Combine both strand data
seq_counts = pd.concat([seq_counts_plus, seq_counts_minus])

Running command: /cluster/software/stacks/2024-06/spack/opt/spack/linux-ubuntu22.04-x86_64_v3/gcc-12.2.0/bedtools2-2.31.0-a4obbslkxntgdx2criopqpwx662gcftq/bin/bedtools getfasta -fi /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta -bed /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/mouse_genome_annotation/Genes_Promoters_CpG_islands_for_Tae/knownGenes_canonTr_2000bp_upstream_TSS_feature_GRCm39_GENCODE.VM36.bed -bedOut > /cluster/scratch/taekim/data_oxidation/promoters_intersect/promoters.SEQ.bed
Command completed successfully


In [18]:
# Collect all files for each sample
bed_files = glob.glob(os.path.join(PATH, "*strand.bed"))

all_unique_ids = set()  # To store all unique IDs across all samples
# First pass: Collect all unique IDs across all samples
print("First pass: Collecting all unique IDs...")
for file_path in sorted(bed_files):
    file_name = os.path.basename(file_path)
    try:
        # Read the file
        df = pd.read_csv(file_path, sep="\t", header=None, names=INPUT_COLUMNS,  on_bad_lines='skip')


        if "plus_strand" in file_name:
            strand = "+"
        elif "minus_strand" in file_name:
            strand = "-"        
        # Create the identifier
        ids = df['Gene'] + strand
        
        # Add to the set of all unique IDs
        all_unique_ids.update(ids.unique())
        
    except Exception as e:
        print(f"  Error collecting IDs from {file_name}: {str(e)}")

print(f"Collected {len(all_unique_ids)} unique IDs across all samples")


# Create a dictionary to store chromosome info for each ID
id_to_chrom = {}
for file_path in sorted(bed_files):
    file_name = os.path.basename(file_path)
    try:
        df = pd.read_csv(file_path, sep="\t", header=None, names=INPUT_COLUMNS)
        # Map each ID to its chromosome
        id_df = df.copy()
        
        if "plus_strand" in file_name:
            strand = "+"
        elif "minus_strand" in file_name:
            strand = "-"   
            
        id_df['id'] = id_df['Gene'] + strand
        
        for _, row in id_df.drop_duplicates('id').iterrows():
            id_to_chrom[row['id']] = row['Chr1']
    except Exception as e:
        print(f"  Error mapping chromosomes: {str(e)}")


First pass: Collecting all unique IDs...
Collected 156398 unique IDs across all samples


In [19]:
# Create an empty DataFrame for the final output
all_data = pd.DataFrame(columns=OUTPUT_COLUMNS)

print("Starting processing of samples")

# Process each file
sample_data_dict = {}  # Store data for each sample

print("starting")
for file_path in sorted(bed_files):
    file_name = os.path.basename(file_path)
    print(f"Processing file: {file_name}")
    
    # Extract sample name and determine strand
    sample_name = file_name.split('.')[0]
    
    if "plus_strand" in file_name:
        strand = "+"
        nucleotide = 'G'  # Count G on plus strand
    elif "minus_strand" in file_name:
        strand = "-"
        nucleotide = 'C'  # Count C on minus strand
    else:
        print(f"  Warning: Could not determine strand for {file_name}")
        continue
    
    try:
        # Read and process the file
        # dataframe of intersection from oxidation sites per sample and cpg coordinates
        df = pd.read_csv(file_path, sep="\t", header=None, names=INPUT_COLUMNS)
        print(f"  Records: {len(df)}")
        
        # Create IDs and calculate damage per genomic position
        df['id'] = df['Gene'] + strand 
        
        # Create damage_df with both summed damage and first gene in one operation
        damage_df = df.groupby('id').agg({
            'Value': 'sum',
            'Gene': 'first',
            'Chr1': 'first'
        }).reset_index()
        
        # Rename Value column to damage
        damage_df = damage_df.rename(columns={'Value': 'damage'})
        damage_df = damage_df.rename(columns={'Gene': 'gene'})
        damage_df = damage_df.rename(columns={'Chr1': 'chromosome'})
        
        # Create base result dataframe
        file_data = damage_df.copy()
        file_data['sample'] = sample_name
        file_data['strand'] = strand
        
        # 4. Merge the counts with our result dataframe
        file_data = file_data.merge(
            seq_counts[['id', 'GC_count']], 
            on='id', 
            how='left'
        )
        
        # Calculate normalized damage
        file_data['GC_normalized_damage'] = file_data['damage'] / file_data['GC_count'].replace(0, np.nan)
        
        # Add median normalization if available
        if sample_name in median_values:
            file_data['median'] = median_values[sample_name]
            file_data['median_normalized_damage'] = 1000 * file_data['GC_normalized_damage'] / file_data['median']
        else:
            print(f"  Warning: No median value found for sample {sample_name}")
            file_data['median'] = np.nan
            file_data['median_normalized_damage'] = np.nan
        
        # Store processed data
        # file_data saved per sample
        sample_data_dict[f"{sample_name}_{strand}"] = file_data
        print(f"File data: {len(file_data)}")
        print (file_data.head(1))
        
    except Exception as e:
        print(f"  Error processing {file_name}: {str(e)}")

Starting processing of samples
starting
Processing file: Sample_01_Ctrl_morning_S1_.GRCh38.p13_G_minus_strand.bed
  Records: 1777689
File data: 65298
                    id  damage                gene   chromosome  \
0  ENSMUSG00000000001-       1  ENSMUSG00000000001  NC_000069.7   

                       sample strand  GC_count  GC_normalized_damage  \
0  Sample_01_Ctrl_morning_S1_      -       484              0.002066   

     median  median_normalized_damage  
0  6.921409                  0.298511  
Processing file: Sample_01_Ctrl_morning_S1_.GRCh38.p13_G_plus_strand.bed
  Records: 1048587
File data: 65396
                    id  damage                gene   chromosome  \
0  ENSMUSG00000000001+       4  ENSMUSG00000000001  NC_000069.7   

                       sample strand  GC_count  GC_normalized_damage  \
0  Sample_01_Ctrl_morning_S1_      +       415              0.009639   

     median  median_normalized_damage  
0  6.921409                  1.392571  
Processing file: Samp

In [20]:
# Fill in missing IDs for each sample
print("Adding missing IDs to each sample...")
for sample_key, file_data in sample_data_dict.items():
    sample_name = sample_key.rsplit('_', 1)[0]

    print (sample_key)
    
    # Identify missing IDs for this sample
    existing_ids = set(file_data['id'])
    print (f"There are {len(existing_ids)} unique ids")
    missing_ids = all_unique_ids - existing_ids
    
    if missing_ids:
        print(f"  Adding {len(missing_ids)} missing IDs to sample {sample_name} (strand {strand})")
        
        # Create rows for missing IDs
        missing_rows = []
        for missing_id in missing_ids:
            # Get the chromosome for this ID
            
            # Create a row with zeros for all numerical values
            missing_row = {
                'id': missing_id,
                'sample': sample_name,
                'gene': missing_id[:-1],
                'strand': missing_id[-1],
                'GC_count': 0,
                'damage': 0,
                'GC_normalized_damage': 0,
                'chromosome': id_to_chrom[missing_id]
            }

            
            # Add median value if available
            if sample_name in median_values:
                missing_row['median'] = median_values[sample_name]
                missing_row['median_normalized_damage'] = 0
            else:
                missing_row['median'] = np.nan
                missing_row['median_normalized_damage'] = np.nan
                
            missing_rows.append(missing_row)
        
        # Add the missing rows to the sample's data
        if missing_rows:
            missing_df = pd.DataFrame(missing_rows)
            file_data = pd.concat([file_data, missing_df], ignore_index=True)
            sample_data_dict[sample_key] = file_data
       
        print (file_data.head(1))

Adding missing IDs to each sample...
Sample_01_Ctrl_morning_S1__-
There are 65298 unique ids
  Adding 91100 missing IDs to sample Sample_01_Ctrl_morning_S1_ (strand +)
                    id  damage                gene   chromosome  \
0  ENSMUSG00000000001-       1  ENSMUSG00000000001  NC_000069.7   

                       sample strand  GC_count  GC_normalized_damage  \
0  Sample_01_Ctrl_morning_S1_      -       484              0.002066   

     median  median_normalized_damage  
0  6.921409                  0.298511  
Sample_01_Ctrl_morning_S1__+
There are 65396 unique ids
  Adding 91002 missing IDs to sample Sample_01_Ctrl_morning_S1_ (strand +)
                    id  damage                gene   chromosome  \
0  ENSMUSG00000000001+       4  ENSMUSG00000000001  NC_000069.7   

                       sample strand  GC_count  GC_normalized_damage  \
0  Sample_01_Ctrl_morning_S1_      +       415              0.009639   

     median  median_normalized_damage  
0  6.921409          

In [21]:
# Combine all samples into the final DataFrame
for sample_key, file_data in sample_data_dict.items():
    all_data = pd.concat([all_data, file_data], ignore_index=True)

# Save the combined data to a CSV file
all_data.to_csv(OUTPUT_FILE, index=False)
print(f"Processing complete! Combined data saved to {OUTPUT_FILE}")
print(f"Total records: {len(all_data)}")

  all_data = pd.concat([all_data, file_data], ignore_index=True)


Processing complete! Combined data saved to ../data_normalized/promoters_Normalized.csv
Total records: 6255920
