In [1]:
!module load stack/2024-06 gcc/12.2.0 bedtools2/2.31.0

Many modules are hidden in this stack. Use "module --show_hidden spider SOFTWARE" if you are not able to find the required software

Inactive Modules:
  1) python/3.11.6_cuda

Due to MODULEPATH changes, the following have been reloaded:
  1) hdf5/1.14.3     2) r/4.3.2

The following have been reloaded with a version change:
  1) cuda/12.2.1 => cuda/12.8.0     3) stack/2024-05 => stack/2024-06
  2) gcc/13.2.0 => gcc/12.2.0



In [2]:
!pwd

/cluster/home/taekim/stressed_mice/jupyter_notebooks


                        Sample     Median
0   Sample_14_CRS_evening_S14_   2.546314
0  Sample_15_Ctrl_evening_S15_   3.570246
0   Sample_05_Ctrl_morning_S5_   6.184096
0   Sample_01_Ctrl_morning_S1_   6.921409
0   Sample_16_CRS_evening_S16_   2.879618
0  Sample_11_Ctrl_evening_S11_   2.485161
0  Sample_13_Ctrl_evening_S13_   2.403964
0    Sample_08_CRS_morning_S8_   5.246539
0   Sample_20_CRS_evening_S20_  10.633666
0   Sample_18_CRS_evening_S18_   4.602043
0  Sample_17_Ctrl_evening_S17_   2.929046
0    Sample_04_CRS_morning_S4_   6.227455
0  Sample_19_Ctrl_evening_S19_   2.545900
0   Sample_03_Ctrl_morning_S3_   9.529751
0    Sample_02_CRS_morning_S2_   7.151634
0   Sample_09_Ctrl_morning_S9_   7.923174
0   Sample_10_CRS_morning_S10_   8.815860
0    Sample_06_CRS_morning_S6_   7.930738
0   Sample_12_CRS_evening_S12_   2.849694
0   Sample_07_Ctrl_morning_S7_   4.924262

In [8]:
import os
import pandas as pd
import numpy as np
import glob
import subprocess
from collections import defaultdict

In [9]:
# Path with files outputted from bedtools intersect
PATH = "/cluster/scratch/taekim/data_oxidation/cpg_intersect" 
OUTPUT_FILE = "../data_normalized/cpg_Normalized.csv"
GENOME_PATH = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta"
# Original path with all CpG coordinates
ORG_PATH = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/mouse_genome_annotation/Genes_Promoters_CpG_islands_for_Tae/allCpG_islands_GRCm39.bed" 

# Define column names for the input files
INPUT_COLUMNS = ["Chr1", "Start1", "End1", "Value", "MAPQ", "Chr2", "Start2", "End2"]

# Define columns for the output file
OUTPUT_COLUMNS = ["id", "sample", "chromosome", "strand", "GC_count", "damage", 
                  "GC_normalized_damage", "median", "median_normalized_damage"]

# Define the median values for each sample
# Median value for each sample from 100kb bins
median_values = {
    "Sample_14_CRS_evening_S14_": 2.546314,
    "Sample_15_Ctrl_evening_S15_": 3.570246,
    "Sample_05_Ctrl_morning_S5_": 6.184096,
    "Sample_01_Ctrl_morning_S1_": 6.921409,
    "Sample_16_CRS_evening_S16_": 2.879618,
    "Sample_11_Ctrl_evening_S11_": 2.485161,
    "Sample_13_Ctrl_evening_S13_": 2.403964,
    "Sample_08_CRS_morning_S8_": 5.246539,
    "Sample_20_CRS_evening_S20_": 10.633666,
    "Sample_18_CRS_evening_S18_": 4.602043,
    "Sample_17_Ctrl_evening_S17_": 2.929046,
    "Sample_04_CRS_morning_S4_": 6.227455,
    "Sample_19_Ctrl_evening_S19_": 2.545900,
    "Sample_03_Ctrl_morning_S3_": 9.529751,
    "Sample_02_CRS_morning_S2_": 7.151634,
    "Sample_09_Ctrl_morning_S9_": 7.923174,
    "Sample_10_CRS_morning_S10_": 8.815860,
    "Sample_06_CRS_morning_S6_": 7.930738,
    "Sample_12_CRS_evening_S12_": 2.849694,
    "Sample_07_Ctrl_morning_S7_": 4.924262
}

In [10]:
def run_bedtools_getfasta(bed_file, output_file, genome_path):
    """Run bedtools getfasta command to get sequence data for a BED file"""
    command = f"/cluster/software/stacks/2024-06/spack/opt/spack/linux-ubuntu22.04-x86_64_v3/gcc-12.2.0/bedtools2-2.31.0-a4obbslkxntgdx2criopqpwx662gcftq/bin/bedtools getfasta -fi {genome_path} -bed {bed_file} -bedOut > {output_file}"
    print(f"Running command: {command}")
    
    try:
        result = subprocess.run(command, shell=True, check=True, 
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               text=True)
        print("Command completed successfully")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error running bedtools: {e}")
        print(f"STDERR: {e.stderr}")
        return False

SEQ_COLUMNS = ["Chr", "Start", "End", "Seq"]
seq_output_file = os.path.join(PATH, "allCpG_islands.SEQ.bed")

# Run getfasta with original CpG coordinate file
run_bedtools_getfasta(ORG_PATH, seq_output_file, GENOME_PATH)

seq_df = pd.read_csv(seq_output_file, sep="\t", header=None, names=SEQ_COLUMNS)

seq_counts_plus = seq_df.copy()
seq_counts_plus['id'] = seq_counts_plus['Chr'] + '+' + seq_counts_plus['Start'].astype(str)
seq_counts_plus['GC_count'] = seq_counts_plus['Seq'].str.upper().str.count('G')
seq_counts_plus = seq_counts_plus.drop_duplicates('id')

seq_counts_minus = seq_df.copy()
seq_counts_minus['id'] = seq_counts_minus['Chr'] + '-' + seq_counts_minus['Start'].astype(str)
seq_counts_minus['GC_count'] = seq_counts_minus['Seq'].str.upper().str.count('C')
seq_counts_minus = seq_counts_minus.drop_duplicates('id')

# Combine both strand data
seq_counts = pd.concat([seq_counts_plus, seq_counts_minus])

Running command: /cluster/software/stacks/2024-06/spack/opt/spack/linux-ubuntu22.04-x86_64_v3/gcc-12.2.0/bedtools2-2.31.0-a4obbslkxntgdx2criopqpwx662gcftq/bin/bedtools getfasta -fi /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta -bed /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/mouse_genome_annotation/Genes_Promoters_CpG_islands_for_Tae/allCpG_islands_GRCm39.bed -bedOut > /cluster/scratch/taekim/data_oxidation/cpg_intersect/allCpG_islands.SEQ.bed
Command completed successfully


In [11]:
# Collect all files for each sample
bed_files = glob.glob(os.path.join(PATH, "*strand.bed"))

all_unique_ids = set()  # To store all unique IDs across all samples

# First pass: Collect all unique IDs across all samples
print("First pass: Collecting all unique IDs...")
for file_path in sorted(bed_files):
    file_name = os.path.basename(file_path)
    try:
        # Read the file
        df = pd.read_csv(file_path, sep="\t", header=None, names=INPUT_COLUMNS)

        if "plus_strand" in file_name:
            strand = "+"
        elif "minus_strand" in file_name:
            strand = "-"        
            
        # Create the identifier
        ids = df['Chr1'] + strand + df['Start2'].astype(str)
        
        # Add to the set of all unique IDs
        all_unique_ids.update(ids.unique())
        
    except Exception as e:
        print(f"  Error collecting IDs from {file_name}: {str(e)}")

print(f"Collected {len(all_unique_ids)} unique IDs across all samples")

# Create a dictionary to store chromosome info for each ID
id_to_chrom = {}
for file_path in sorted(bed_files):
    file_name = os.path.basename(file_path)
    try:
        df = pd.read_csv(file_path, sep="\t", header=None, names=INPUT_COLUMNS)
        
        # Map each ID to its chromosome
        id_df = df.copy()
        
        if "plus_strand" in file_name:
            strand = "+"
        elif "minus_strand" in file_name:
            strand = "-"   
            
        id_df['id'] = id_df['Chr1'] + strand + id_df['Start2'].astype(str)
        
        for _, row in id_df.drop_duplicates('id').iterrows():
            id_to_chrom[row['id']] = row['Chr1']
            
    except Exception as e:
        print(f"  Error mapping chromosomes: {str(e)}")

First pass: Collecting all unique IDs...
Collected 31500 unique IDs across all samples


In [15]:
# Create an empty DataFrame for the final output
all_data = pd.DataFrame(columns=OUTPUT_COLUMNS)

print("Starting processing of samples")

# Process each file
sample_data_dict = {}  # Store data for each sample


for file_path in sorted(bed_files):
    file_name = os.path.basename(file_path)
    print(f"Processing file: {file_name}")
    
    # Extract sample name and determine strand
    sample_name = file_name.split('.')[0]
    
    if "plus_strand" in file_name:
        strand = "+"
        nucleotide = 'G'  # Count G on plus strand
    elif "minus_strand" in file_name:
        strand = "-"
        nucleotide = 'C'  # Count C on minus strand
    else:
        print(f"  Warning: Could not determine strand for {file_name}")
        continue
    
    try:
        # Read and process the file
        # dataframe of intersection from oxidation sites per sample and cpg coordinates
        df = pd.read_csv(file_path, sep="\t", header=None, names=INPUT_COLUMNS)
        print(f"  Records: {len(df)}")
        
        # Create IDs and calculate damage per genomic position
        df['id'] = df['Chr1'] + strand + df['Start2'].astype(str)
        damage_df = df.groupby('id').agg({
            'Value': 'sum',
            'Chr1': 'first'
        }).reset_index()
        damage_df = damage_df.rename(columns={'Value': 'damage'})
        damage_df = damage_df.rename(columns={'Chr1': 'chromosome'})
        
        # Create base result dataframe
        # We need one row per ID + strand + sample combination
        file_data = damage_df.copy()
        file_data['sample'] = sample_name
        file_data['strand'] = strand

        # 4. Merge the counts with our result dataframe
        file_data = file_data.merge(
            seq_counts[['id', 'GC_count']], 
            on='id', 
            how='left'
        )
        
        # Calculate normalized damage
        file_data['GC_normalized_damage'] = file_data['damage'] / file_data['GC_count'].replace(0, np.nan)
        
        # Add median normalization if available
        if sample_name in median_values:
            file_data['median'] = median_values[sample_name]
            file_data['median_normalized_damage'] = 1000 * file_data['GC_normalized_damage'] / file_data['median']
        else:
            print(f"  Warning: No median value found for sample {sample_name}")
            file_data['median'] = np.nan
            file_data['median_normalized_damage'] = np.nan
        
        # Store processed data
        # file_data saved per sample
        sample_data_dict[f"{sample_name}_{strand}"] = file_data
        print(f"File data: {len(file_data)}")
        
        print (file_data.head(1))
        
    except Exception as e:
        print(f"  Error processing {file_name}: {str(e)}")




Starting processing of samples
Processing file: Sample_01_Ctrl_morning_S1_.GRCh38.p13_G_minus_strand.bed
  Records: 12285
File data: 5618
                     id  damage   chromosome                      sample  \
0  NC_000067.7-10037343       4  NC_000067.7  Sample_01_Ctrl_morning_S1_   

  strand  GC_count  GC_normalized_damage    median  median_normalized_damage  
0      -       441               0.00907  6.921409                  1.310469  
Processing file: Sample_01_Ctrl_morning_S1_.GRCh38.p13_G_plus_strand.bed
  Records: 12493
File data: 5618
                     id  damage   chromosome                      sample  \
0  NC_000067.7+10037343       4  NC_000067.7  Sample_01_Ctrl_morning_S1_   

  strand  GC_count  GC_normalized_damage    median  median_normalized_damage  
0      +       429              0.009324  6.921409                  1.347126  
Processing file: Sample_02_CRS_morning_S2_.GRCh38.p13_G_minus_strand.bed
  Records: 14585
File data: 6146
                     id  dam

In [16]:
# Fill in missing IDs for each sample
print("Adding missing IDs to each sample...")
for sample_key, file_data in sample_data_dict.items():
    sample_name = sample_key.rsplit('_', 1)[0]
    strand = sample_key.split('_')[-1]
    
    # Identify missing IDs for this sample
    existing_ids = set(file_data['id'])
    print (f"There are {len(existing_ids)} unique ids")
    missing_ids = all_unique_ids - existing_ids
    
    if missing_ids:
        print(f"  Adding {len(missing_ids)} missing IDs to sample {sample_name} (strand {strand})")
        
        # Create rows for missing IDs
        missing_rows = []
        for missing_id in missing_ids:
            # Get the chromosome for this ID
            chromosome = id_to_chrom.get(missing_id, "unknown")
            
            # Create a row with zeros for all numerical values
            missing_row = {
                'id': missing_id,
                'sample': sample_name,
                'chromosome': chromosome,
                'strand': strand,
                'GC_count': 0,
                'damage': 0,
                'GC_normalized_damage': 0
            }
            
            # Add median value if available
            if sample_name in median_values:
                missing_row['median'] = median_values[sample_name]
                missing_row['median_normalized_damage'] = 0
            else:
                missing_row['median'] = np.nan
                missing_row['median_normalized_damage'] = np.nan
                
            missing_rows.append(missing_row)
        
        # Add the missing rows to the sample's data
        if missing_rows:
            missing_df = pd.DataFrame(missing_rows)
            file_data = pd.concat([file_data, missing_df], ignore_index=True)
            sample_data_dict[sample_key] = file_data

        print(file_data.head(1))


Adding missing IDs to each sample...
There are 5618 unique ids
  Adding 25882 missing IDs to sample Sample_01_Ctrl_morning_S1_ (strand -)
                     id  damage   chromosome                      sample  \
0  NC_000067.7-10037343       4  NC_000067.7  Sample_01_Ctrl_morning_S1_   

  strand  GC_count  GC_normalized_damage    median  median_normalized_damage  
0      -       441               0.00907  6.921409                  1.310469  
There are 5618 unique ids
  Adding 25882 missing IDs to sample Sample_01_Ctrl_morning_S1_ (strand +)
                     id  damage   chromosome                      sample  \
0  NC_000067.7+10037343       4  NC_000067.7  Sample_01_Ctrl_morning_S1_   

  strand  GC_count  GC_normalized_damage    median  median_normalized_damage  
0      +       429              0.009324  6.921409                  1.347126  
There are 6146 unique ids
  Adding 25354 missing IDs to sample Sample_02_CRS_morning_S2_ (strand -)
                     id  damage   chrom

In [17]:
# Combine all samples into the final DataFrame
for sample_key, file_data in sample_data_dict.items():
    all_data = pd.concat([all_data, file_data], ignore_index=True)

# Save the combined data to a CSV file
all_data.to_csv(OUTPUT_FILE, index=False)
print(f"Processing complete! Combined data saved to {OUTPUT_FILE}")
print(f"Total records: {len(all_data)}")

  all_data = pd.concat([all_data, file_data], ignore_index=True)


Processing complete! Combined data saved to ../data_normalized/cpg_Normalized.csv
Total records: 1260000
