In [8]:
!module load stack/2024-06 gcc/12.2.0 bedtools2/2.31.0

Many modules are hidden in this stack. Use "module --show_hidden spider SOFTWARE" if you are not able to find the required software

Inactive Modules:
  1) python/3.11.6_cuda

Due to MODULEPATH changes, the following have been reloaded:
  1) hdf5/1.14.3     2) r/4.3.2

The following have been reloaded with a version change:
  1) cuda/12.2.1 => cuda/12.8.0     3) stack/2024-05 => stack/2024-06
  2) gcc/13.2.0 => gcc/12.2.0



In [10]:
!pwd

/cluster/home/taekim/stressed_mice/jupyter_notebooks


                        Sample     Median
0   Sample_14_CRS_evening_S14_   2.546314
0  Sample_15_Ctrl_evening_S15_   3.570246
0   Sample_05_Ctrl_morning_S5_   6.184096
0   Sample_01_Ctrl_morning_S1_   6.921409
0   Sample_16_CRS_evening_S16_   2.879618
0  Sample_11_Ctrl_evening_S11_   2.485161
0  Sample_13_Ctrl_evening_S13_   2.403964
0    Sample_08_CRS_morning_S8_   5.246539
0   Sample_20_CRS_evening_S20_  10.633666
0   Sample_18_CRS_evening_S18_   4.602043
0  Sample_17_Ctrl_evening_S17_   2.929046
0    Sample_04_CRS_morning_S4_   6.227455
0  Sample_19_Ctrl_evening_S19_   2.545900
0   Sample_03_Ctrl_morning_S3_   9.529751
0    Sample_02_CRS_morning_S2_   7.151634
0   Sample_09_Ctrl_morning_S9_   7.923174
0   Sample_10_CRS_morning_S10_   8.815860
0    Sample_06_CRS_morning_S6_   7.930738
0   Sample_12_CRS_evening_S12_   2.849694
0   Sample_07_Ctrl_morning_S7_   4.924262

In [18]:
import os
import pandas as pd
import numpy as np
import glob
import subprocess
from collections import defaultdict

# Path with files outputted from bedtools intersect
PATH = "/cluster/scratch/taekim/data_oxidation/cpg_intersect" 
OUTPUT_FILE = "all_samples_combined_data.csv"
GENOME_PATH = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta"
# Original path with all CpG coordinates
ORG_PATH = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/mouse_genome_annotation/Genes_Promoters_CpG_islands_for_Tae/allCpG_islands_GRCm39.bed" 

# Define column names for the input files
INPUT_COLUMNS = ["Chr1", "Start1", "End1", "Value", "MAPQ", "Chr2", "Start2", "End2"]

# Define columns for the output file
OUTPUT_COLUMNS = ["id", "sample", "chromosome", "strand", "GC_count", "damage", 
                  "GC_normalized_damage", "median", "median_normalized_damage"]

# Define the median values for each sample
# Median value for each sample from 100kb bins
median_values = {
    "Sample_14_CRS_evening_S14_": 2.546314,
    "Sample_15_Ctrl_evening_S15_": 3.570246,
    "Sample_05_Ctrl_morning_S5_": 6.184096,
    "Sample_01_Ctrl_morning_S1_": 6.921409,
    "Sample_16_CRS_evening_S16_": 2.879618,
    "Sample_11_Ctrl_evening_S11_": 2.485161,
    "Sample_13_Ctrl_evening_S13_": 2.403964,
    "Sample_08_CRS_morning_S8_": 5.246539,
    "Sample_20_CRS_evening_S20_": 10.633666,
    "Sample_18_CRS_evening_S18_": 4.602043,
    "Sample_17_Ctrl_evening_S17_": 2.929046,
    "Sample_04_CRS_morning_S4_": 6.227455,
    "Sample_19_Ctrl_evening_S19_": 2.545900,
    "Sample_03_Ctrl_morning_S3_": 9.529751,
    "Sample_02_CRS_morning_S2_": 7.151634,
    "Sample_09_Ctrl_morning_S9_": 7.923174,
    "Sample_10_CRS_morning_S10_": 8.815860,
    "Sample_06_CRS_morning_S6_": 7.930738,
    "Sample_12_CRS_evening_S12_": 2.849694,
    "Sample_07_Ctrl_morning_S7_": 4.924262
}

def run_bedtools_getfasta(bed_file, output_file, genome_path):
    """Run bedtools getfasta command to get sequence data for a BED file"""
    command = f"/cluster/software/stacks/2024-06/spack/opt/spack/linux-ubuntu22.04-x86_64_v3/gcc-12.2.0/bedtools2-2.31.0-a4obbslkxntgdx2criopqpwx662gcftq/bin/bedtools getfasta -fi {genome_path} -bed {bed_file} -bedOut > {output_file}"
    print(f"Running command: {command}")
    
    try:
        result = subprocess.run(command, shell=True, check=True, 
                               stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                               text=True)
        print("Command completed successfully")
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error running bedtools: {e}")
        print(f"STDERR: {e.stderr}")
        return False

SEQ_COLUMNS = ["Chr", "Start", "End", "Seq"]
seq_output_file = os.path.join(PATH, "allCpG_islands.SEQ.bed")
# Run getfasta with original CpG coordinate file
run_bedtools_getfasta(ORG_PATH, seq_output_file, GENOME_PATH)
seq_df = pd.read_csv(seq_output_file, sep="\t", header=None, names=SEQ_COLUMNS)

# Create an empty DataFrame for the final output
all_data = pd.DataFrame(columns=OUTPUT_COLUMNS)

print("Starting processing of samples")

# Collect all files for each sample
bed_files = glob.glob(os.path.join(PATH, "*.bed"))

# Process each file
for file_path in sorted(bed_files):
    file_name = os.path.basename(file_path)
    print(f"Processing file: {file_name}")
    
    # Extract sample name and strand information from the filename
    # Example: Sample_01_Ctrl_morning_S1_.GRCh38.p13_G_plus_strand.bed
    parts = file_name.split('.')
    sample_name = parts[0]  # e.g., Sample_01_Ctrl_morning_S1_
    
    if "plus_strand" in file_name:
        strand = "+"
    elif "minus_strand" in file_name:
        strand = "-"
    else:
        print(f"  Warning: Could not determine strand for {file_name}")
        continue
    
    try:
        # Define the sequence output file path
        
        # Read the file
        df = pd.read_csv(file_path, sep="\t", header=None, names=INPUT_COLUMNS)
        print(f"  Records: {len(df)}")
        
        # SANITY CHECK 1: Check if Chr1 equals Chr2 for all rows
        chr_mismatch = df[df['Chr1'] != df['Chr2']]
        if not chr_mismatch.empty:
            print(f"  WARNING: Found {len(chr_mismatch)} rows where Chr1 does not match Chr2")
            print(f"  First few mismatches: {chr_mismatch[['Chr1', 'Chr2']].head().to_string(index=False)}")
    
        # SANITY CHECK 2: Check if there are multiple End2 values for the same Chr1_Start2 combination
        # Create the identifier
        df['identifier'] = df['Chr1'] + '_' + df['Start2'].astype(str)
    
        # Group by the identifier and count unique End2 values
        end2_counts = df.groupby('identifier')['End2'].nunique()
    
        # Find identifiers with multiple End2 values
        multiple_ends = end2_counts[end2_counts > 1].index.tolist()
    
        if multiple_ends:
            print(f"  WARNING: Found {len(multiple_ends)} Chr1_Start2 combinations with multiple End2 values")
            print(f"  First few examples:")
            for idx in multiple_ends[:3]:  # Show first 3 examples
                example = df[df['identifier'] == idx][['Chr1', 'Start2', 'End2']]
                print(f"    {idx}: End2 values: {sorted(example['End2'].unique())}")
    
        # Create a DataFrame for this file's processed data
        file_data = pd.DataFrame()
  
        # Fill in the known columns
        file_data['id'] = df['Chr1'] + '_' + df['Start2'].astype(str)
        file_data['sample'] = sample_name
        file_data['chromosome'] = df['Chr1']
        file_data['strand'] = strand

        # Create a unique identifier using Chr_Start format for sequence data
        seq_df['region_id'] = seq_df['Chr'] + '_' + seq_df['Start'].astype(str)
        # Create a lookup dictionary from seq_df
        seq_dict = dict(zip(seq_df['region_id'], seq_df['Seq']))
        # Determine which nucleotide to count based on strand
        nucleotide = 'G' if strand == '+' else 'C'
        # Count the appropriate nucleotides for matching regions
        file_data['GC_count'] = file_data['id'].map(lambda region: str(seq_dict.get(region, '')).upper().count(nucleotide) if region in seq_dict else np.nan)

        # Calculate the damage by summing up all Value values for each id
        damage_dict = df.groupby(df['Chr1'] + '_' + df['Start2'].astype(str))['Value'].sum().to_dict()
        file_data['damage'] = file_data['id'].map(damage_dict)
    
        # Calculate GC normalized damage if GC_count is not zero or NaN
        file_data['GC_normalized_damage'] = file_data.apply(lambda row: row['damage'] / row['GC_count'] if row['GC_count'] > 0 else np.nan, axis=1)

        if sample_name in median_values:
            file_data['median'] = median_values[sample_name]
        
            # Calculate the median normalized damage
            file_data['median_normalized_damage'] = file_data['GC_normalized_damage'] / file_data['median']
        else:
            print(f"  Warning: No median value found for sample {sample_name}")
            file_data['median'] = np.nan
            file_data['median_normalized_damage'] = np.nan
    
        # Append this file's data to the all_data DataFrame
        all_data = pd.concat([all_data, file_data], ignore_index=True)
        
    except Exception as e:
        print(f"  Error processing {file_name}: {str(e)}")
        print(f"  Exception details: {type(e).__name__}: {str(e)}")

# Save the combined data to a CSV file
all_data.to_csv(OUTPUT_FILE, index=False)
print(f"Processing complete! Combined data saved to {OUTPUT_FILE}")
print(f"Total records: {len(all_data)}")


Running command: /cluster/software/stacks/2024-06/spack/opt/spack/linux-ubuntu22.04-x86_64_v3/gcc-12.2.0/bedtools2-2.31.0-a4obbslkxntgdx2criopqpwx662gcftq/bin/bedtools getfasta -fi /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta -bed /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/mouse_genome_annotation/Genes_Promoters_CpG_islands_for_Tae/allCpG_islands_GRCm39.bed -bedOut > /cluster/scratch/taekim/data_oxidation/cpg_intersect/allCpG_islands.SEQ.bed
Command completed successfully
Starting processing of samples
Processing file: Sample_01_Ctrl_morning_S1_.GRCh38.p13_G_minus_strand.bed
  Records: 12285


  all_data = pd.concat([all_data, file_data], ignore_index=True)


Processing file: Sample_01_Ctrl_morning_S1_.GRCh38.p13_G_plus_strand.bed
  Records: 12493
Processing file: Sample_02_CRS_morning_S2_.GRCh38.p13_G_minus_strand.bed
  Records: 14585
Processing file: Sample_02_CRS_morning_S2_.GRCh38.p13_G_plus_strand.bed
  Records: 14884
Processing file: Sample_03_Ctrl_morning_S3_.GRCh38.p13_G_minus_strand.bed
  Records: 15610
Processing file: Sample_03_Ctrl_morning_S3_.GRCh38.p13_G_plus_strand.bed
  Records: 15956
Processing file: Sample_04_CRS_morning_S4_.GRCh38.p13_G_minus_strand.bed
  Records: 11335
Processing file: Sample_04_CRS_morning_S4_.GRCh38.p13_G_plus_strand.bed
  Records: 10942
Processing file: Sample_05_Ctrl_morning_S5_.GRCh38.p13_G_minus_strand.bed
  Records: 9503
Processing file: Sample_05_Ctrl_morning_S5_.GRCh38.p13_G_plus_strand.bed
  Records: 9478
Processing file: Sample_06_CRS_morning_S6_.GRCh38.p13_G_minus_strand.bed
  Records: 14900
Processing file: Sample_06_CRS_morning_S6_.GRCh38.p13_G_plus_strand.bed
  Records: 15030
Processing fi