## Binning genome-wide data

<div style="text-align: right">
    02.04.2025
    <br>
    Tae Kim, MSc
</div>

In [30]:
#importing necessary modules
import os
os.environ['LC_ALL'] = 'en_US.UTF-8'
os.environ['LANG'] = 'en_US.UTF-8'
os.environ["MPLBACKEND"] = "TkAgg"

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import sys
print(sys.version)

3.11.6 (main, Jun  7 2024, 07:09:59) [GCC 13.2.0]


In [32]:
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

matplotlib.rcParams['font.sans-serif'] = "Arial"
matplotlib.rcParams['font.family'] = "sans-serif"
matplotlib.rcParams['mathtext.default'] = "regular"

matplotlib.rcParams['font.size'] = 16

In [33]:
print("numpy", np.__version__)
print("pandas", pd.__version__)
print("matplotlib", matplotlib.__version__)
print("seaborn", sns.__version__)
print("scipy", scipy.__version__)

numpy 1.26.4
pandas 2.2.2
matplotlib 3.9.0
seaborn 0.13.2
scipy 1.13.1


In [34]:
# Create a list of chromosome names in standard format
# Generate chromosomes 1-19 plus sex chromosomes X and Y

In [35]:
chromosomes = ['chr' + str(i) for i in np.arange(1, 20, 1)] + ["chrX", "chrY"]
chromosomes

['chr1',
 'chr2',
 'chr3',
 'chr4',
 'chr5',
 'chr6',
 'chr7',
 'chr8',
 'chr9',
 'chr10',
 'chr11',
 'chr12',
 'chr13',
 'chr14',
 'chr15',
 'chr16',
 'chr17',
 'chr18',
 'chr19',
 'chrX',
 'chrY']

In [36]:
chromosomes_name_table = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/Chromosome_naming_table_17.09.2024_Vakil.tsv"
chromosomes_name_table = pd.read_csv(chromosomes_name_table, sep = "\t", header = 0)
chromosomes_name_table = chromosomes_name_table[chromosomes_name_table["UCSC style name"].isin(chromosomes)]
chromosomes_name_table = chromosomes_name_table.loc[:, ["RefSeq seq accession", "UCSC style name"]]
chromosomes_name_table

Unnamed: 0,RefSeq seq accession,UCSC style name
0,NC_000067.7,chr1
1,NC_000068.8,chr2
2,NC_000069.7,chr3
3,NC_000070.7,chr4
4,NC_000071.7,chr5
5,NC_000072.7,chr6
6,NC_000073.7,chr7
7,NC_000074.7,chr8
8,NC_000075.7,chr9
9,NC_000076.7,chr10


In [37]:
chr_sizesGRCh38 = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta.fai"
DF_chrsizes = pd.read_csv(chr_sizesGRCh38, sep = "\t", header = None)
print("Number of chromsomes/contigs in the genome assembly:", DF_chrsizes[0].nunique())
DF_chrsizes = DF_chrsizes[DF_chrsizes[0].isin(chromosomes_name_table["RefSeq seq accession"].values)]
print("Number of chromsomes/contigs in the genome assembly:", DF_chrsizes[0].nunique())


Number of chromsomes/contigs in the genome assembly: 61
Number of chromsomes/contigs in the genome assembly: 21


In [38]:
DF_chrsizes

Unnamed: 0,0,1,2,3,4
0,NC_000067.7,195154279,63,60,61
7,NC_000068.8,181755017,199241831,60,61
8,NC_000069.7,159745316,384026162,60,61
9,NC_000070.7,156860686,546433963,60,61
11,NC_000071.7,151758149,705911172,60,61
17,NC_000072.7,149588044,861966554,60,61
18,NC_000073.7,144995196,1014047796,60,61
20,NC_000074.7,130127694,1161638654,60,61
21,NC_000075.7,124359700,1293935206,60,61
22,NC_000076.7,130530862,1420367632,60,61


In [39]:
DF_chrsizes.shape

(21, 5)

In [40]:
!ls -lht /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/Mouse_brain_Sept2024/mouse_brain_Sep2024_processed/Sample_01_Ctrl_morning_S1_/bed_and_bedgraph/

total 379M
-rwxrwx--- 1 vtakhaveev vtakhaveev-group 198M Sep 16  2024 Sample_01_Ctrl_morning_S1_.GRCh38.p13_G_minus_strand.bedgraph
-rwxrwx--- 1 vtakhaveev vtakhaveev-group 179M Sep 16  2024 Sample_01_Ctrl_morning_S1_.GRCh38.p13_G_plus_strand.bedgraph
-rwxrwx--- 1 vtakhaveev vtakhaveev-group 2.0M Sep 16  2024 Sample_01_Ctrl_morning_S1_.GRCh38.p13.dedupl.filtered.forw.bam.bai
-rwxrwx--- 1 vtakhaveev vtakhaveev-group  72K Sep 16  2024 Sample_01_Ctrl_morning_S1_.GRCh38.p13.dedupl.filtered.forw.STATS.txt
-rwxrwx--- 1 vtakhaveev vtakhaveev-group  95K Sep 16  2024 Sample_01_Ctrl_morning_S1_.GRCh38.p13.dedupl.filtered.STATS.txt


In [21]:
###
###
###Tae, please change the paths.
###Keep strands.
###
###



#'''
#Enough to run once

PATH = "/nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/Mouse_brain_Sept2024/mouse_brain_Sep2024_processed/"
prefix = ""
suffix = ".GRCh38.p13_G_"

OUTPATH = "/cluster/home/taekim/stressed_mice/data_binning_normalization/"

BINSIZEs = [float(10**3), float(10**4), float(10**5)]#run on 17.09.2024
print("test")

for BINSIZE in BINSIZEs:
    DF_damage_binned = pd.DataFrame({})

    for sample in os.listdir(PATH):
                #S = sample.split("_")[1].replace("-Sample", "")
            
                df1 = os.path.join(PATH, sample, "bed_and_bedgraph", prefix + sample + suffix + "plus" + "_strand.bedgraph")
                df1 = pd.read_csv(df1, sep = "\t", header = None, names = ["Chr", "Start", "End", "Value", "MAPQ"])
                df2 = os.path.join(PATH, sample, "bed_and_bedgraph", prefix + sample + suffix + "minus" + "_strand.bedgraph")
                df2 = pd.read_csv(df2, sep = "\t", header = None, names = ["Chr", "Start", "End", "Value", "MAPQ"])

                for chromosome in DF_chrsizes[0].values:#chromosomes:                    
                    chr_length = float(DF_chrsizes[DF_chrsizes[0] == chromosome][1].iloc[0])
                    bin_borders = list(np.arange(0, chr_length, BINSIZE))
                    if bin_borders[-1] != chr_length - 1:
                        bin_borders += [chr_length - 1]
                    bin_borders = np.array(bin_borders)
                    sizes_array = np.diff(bin_borders)
                    sizes_array = [sizes_array[0] + 1] + list(sizes_array[1:])#we made the left boundary of the first bin inclusive, therefore, the length is 1 bp bigger
                    template = pd.DataFrame({"Bin" : bin_borders[:-1], "Bin_size" : sizes_array})
                    

                    df1_ch = df1[df1["Chr"] == chromosome].copy().reset_index(drop = True)
                    df2_ch = df2[df2["Chr"] == chromosome].copy().reset_index(drop = True)

                    df1_ch.loc[:, 'Bin'] = pd.cut(df1_ch["Start"], bins = bin_borders, labels = bin_borders[:-1], 
                                                  include_lowest = True, right = True)
                    if df1_ch.shape[0] - df1_ch.dropna().shape[0] > 0:
                        print("Warning")
                    df2_ch.loc[:, 'Bin'] = pd.cut(df2_ch["Start"], bins = bin_borders, labels = bin_borders[:-1], 
                                                  include_lowest = True, right = True)
                    if df2_ch.shape[0] - df2_ch.dropna().shape[0] > 0:
                        print("Warning")
                    df1_ch.loc[:, 'Bin'] = df1_ch['Bin'].astype(float)
                    df2_ch.loc[:, 'Bin'] = df2_ch['Bin'].astype(float)
                    df1_ch = pd.merge(template, df1_ch, on = "Bin", how = "left").loc[:, ["Value", "Bin", "Bin_size"]].astype(float).fillna(0)
                    df2_ch = pd.merge(template, df2_ch, on = "Bin", how = "left").loc[:, ["Value", "Bin", "Bin_size"]].astype(float).fillna(0)
                    df1_ch = df1_ch.groupby(by = ["Bin", "Bin_size"]).sum().reset_index()
                    df2_ch = df2_ch.groupby(by = ["Bin", "Bin_size"]).sum().reset_index()

                    df1_ch.loc[:, "Strand"] = "+" 
                    df2_ch.loc[:, "Strand"] = "-" 
                    df1_ch.loc[:, "Damage"] = df1_ch["Value"]
                    df2_ch.loc[:, "Damage"] = df2_ch["Value"]
                    df1_ch.loc[:, "Chromosome"] = chromosome
                    df2_ch.loc[:, "Chromosome"] = chromosome
                    df1_ch.loc[:, "Sample"] = sample
                    df2_ch.loc[:, "Sample"] = sample

                    df1_ch = df1_ch.loc[:, ["Bin", "Bin_size", "Damage", "Chromosome", "Sample", "Strand"]]
                    df2_ch = df2_ch.loc[:, ["Bin", "Bin_size", "Damage", "Chromosome", "Sample", "Strand"]]
            
                    df_ch = pd.concat([df1_ch, df2_ch])
                    df_ch = df_ch.sort_values(by = ["Bin", "Strand"], ascending = [True, True])
                    
                    DF_damage_binned = pd.concat([DF_damage_binned, df_ch])

                print(sample)

    DF_damage_binned = DF_damage_binned.reset_index(drop = True)

    DF_damage_binned.to_csv(OUTPATH + "Binned_damage_GENOMEWIDE_strand" + str(int(BINSIZE)) + "_CCS.20_mice_Sept2024.csv")
    print ("Finished")

#'''

test
Sample_14_CRS_evening_S14_
Sample_15_Ctrl_evening_S15_
Sample_05_Ctrl_morning_S5_
Sample_01_Ctrl_morning_S1_
Sample_16_CRS_evening_S16_
Sample_11_Ctrl_evening_S11_
Sample_13_Ctrl_evening_S13_
Sample_08_CRS_morning_S8_
Sample_20_CRS_evening_S20_
Sample_18_CRS_evening_S18_
Sample_17_Ctrl_evening_S17_
Sample_04_CRS_morning_S4_
Sample_19_Ctrl_evening_S19_
Sample_03_Ctrl_morning_S3_
Sample_02_CRS_morning_S2_
Sample_09_Ctrl_morning_S9_
Sample_10_CRS_morning_S10_
Sample_06_CRS_morning_S6_
Sample_12_CRS_evening_S12_
Sample_07_Ctrl_morning_S7_
Finished
Sample_14_CRS_evening_S14_
Sample_15_Ctrl_evening_S15_
Sample_05_Ctrl_morning_S5_
Sample_01_Ctrl_morning_S1_
Sample_16_CRS_evening_S16_
Sample_11_Ctrl_evening_S11_
Sample_13_Ctrl_evening_S13_
Sample_08_CRS_morning_S8_
Sample_20_CRS_evening_S20_
Sample_18_CRS_evening_S18_
Sample_17_Ctrl_evening_S17_
Sample_04_CRS_morning_S4_
Sample_19_Ctrl_evening_S19_
Sample_03_Ctrl_morning_S3_
Sample_02_CRS_morning_S2_
Sample_09_Ctrl_morning_S9_
Sample_10_C

Some work in the terminal:

<code>cd /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/Mouse_brain_Sept2024/Binning_normalization/
module load stack/2024-06 gcc/12.2.0 bedtools2/2.31.0
bedtools getfasta -fi /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta -bed Genome_bins_100000.mouse.bed -bedOut > Genome_bins_100000.mouse.SEQ.bed
bedtools getfasta -fi /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta -bed Genome_bins_10000.mouse.bed -bedOut > Genome_bins_10000.mouse.SEQ.bed
bedtools getfasta -fi /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Navnit/genomes/mouse/GRCm39_NCBI_Bowtie2.fasta -bed Genome_bins_1000.mouse.bed -bedOut > Genome_bins_1000.mouse.SEQ.bed
</code>

In [54]:
###
###
###Tae, please change the paths.
###In my code, I count G and C together. (C is G on the opposite strand - complementarity principle). Since your analysis is strand specific, 
###you should count G and C separately. Number of G is the number of G on the + strand. Nubmer of C is the number C on the - strand.
###


#'''
BINSIZEs = [float(10**3), float(10**4), float(10**5)]#run on 17.09.2024

for BINSIZE in BINSIZEs:
    PATH = "/cluster/home/taekim/stressed_mice/data_binning_normalization/"
    df = pd.read_csv(PATH + "Genome_bins_" + str(int(BINSIZE)) + ".mouse.SEQ.bed", header = None, index_col = None, sep = "\t")
    print(df.shape[0])

    df.loc[:, "Seq"] = df[4].str.upper()
    df.loc[:, "G_count"] = df["Seq"].str.count("G")  # For plus strand
    df.loc[:, "C_count"] = df["Seq"].str.count("C")  # For minus strand
    df = df.loc[:, [0, 3, "G_count", "C_count"]]
    df.to_csv(PATH + "Genome_bins_" + str(int(BINSIZE)) + ".G_counts.csv")
    df
#'''

2723425
272352
27243


In [54]:
###
###
###Tae, please change the paths.
###In my code, I count G and C together. (C is G on the opposite strand - complementarity principle). Since your analysis is strand specific, 
###you should count G and C separately. Number of G is the number of G on the + strand. Nubmer of C is the number C on the - strand.
###


#'''
BINSIZEs = [float(10**3), float(10**4), float(10**5)]#run on 17.09.2024

for BINSIZE in BINSIZEs:
    PATH = "/cluster/home/taekim/stressed_mice/data_binning_normalization/"
    df = pd.read_csv(PATH + "Genome_bins_" + str(int(BINSIZE)) + ".mouse.SEQ.bed", header = None, index_col = None, sep = "\t")
    print(df.shape[0])

    df.loc[:, "Seq"] = df[4].str.upper()
    df.loc[:, "G_count"] = df["Seq"].str.count("G")  # For plus strand
    df.loc[:, "C_count"] = df["Seq"].str.count("C")  # For minus strand
    df = df.loc[:, [0, 3, "G_count", "C_count"]]
    df.to_csv(PATH + "Genome_bins_" + str(int(BINSIZE)) + ".G_counts.csv")
    df
#'''

2723425
272352
27243


In [16]:
###
###
###Tae, please change the paths.
###The idea of this code is to create coordinates of bins so that the code below can fetch reference sequences of the bins and then we could count G 
###
###

#'''
OUTPATH = "/cluster/home/taekim/stressed_mice/data_binning_normalization/"

BINSIZEs = [float(10**3), float(10**4), float(10**5)]#run on 17.09.2024

for BINSIZE in BINSIZEs:
    DF_bins = pd.DataFrame({})

    for chromosome in DF_chrsizes[0].values:#chromosomes:                    
        chr_length = float(DF_chrsizes[DF_chrsizes[0] == chromosome][1].iloc[0])
        bin_borders = list(np.arange(0, chr_length, BINSIZE))

        if bin_borders[-1] != chr_length - 1:
            bin_borders += [chr_length - 1]
        bin_borders = np.array(bin_borders, dtype = int)

        #aligning bed's [start, end) with pd.cut's (start, end] + include_lowest = Ture
        starts = [0] + list(bin_borders[1:-1] + 1)
        ends = list(bin_borders[1:] + 1)

        template = pd.DataFrame({"Chromosome" : [chromosome]*len(starts), "Bin_start" : starts, "Bin_end" : ends, "Bin" : bin_borders[:-1]})

        DF_bins = pd.concat([DF_bins, template])

    DF_bins = DF_bins.reset_index(drop = True)
    DF_bins.to_csv(OUTPATH + "Genome_bins_" + str(int(BINSIZE)) + ".mouse.bed", index = False, header = False, sep = "\t")
    print("Finished")
    
    DF_bins 
    
#'''    

Finished
Finished
Finished


   NC_000067.7     0  1001   0.1  \
0  NC_000067.7  1001  2001  1000   
1  NC_000067.7  2001  3001  2000   
2  NC_000067.7  3001  4001  3000   
3  NC_000067.7  4001  5001  4000   
4  NC_000067.7  5001  6001  5000   

  NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

In [56]:
###
###
###Tae, please change the paths.
###This code is to calculate normalization factors. 
###You need to incorporate strands.
###
###

PATH = "/cluster/home/taekim/stressed_mice/data_binning_normalization/"
BINSIZEs = [float(10**5)]
for BINSIZE in BINSIZEs:
    print(BINSIZE)
    ###G counts per bin according to reference genome
    DF_Gs = pd.read_csv(PATH + "Genome_bins_" + str(int(BINSIZE)) + ".G_counts.csv", index_col = 0)
    DF_Gs = DF_Gs.rename(columns={"0" : "Chromosome", "3" : "Bin"})
    print(DF_Gs.shape[0])
    
    ###Binned damage data
    tmp = pd.read_csv(PATH + "Binned_damage_GENOMEWIDE_strand" + str(int(BINSIZE)) + "_CCS.20_mice_Sept2024.csv")
    print(tmp.shape[0], tmp.shape[0]/len(tmp["Sample"].unique()))

    # Create separate dataframes for plus and minus strands
    tmp_plus = tmp[tmp["Strand"] == "+"].copy()
    tmp_minus = tmp[tmp["Strand"] == "-"].copy()

    ###Normalizing the binned damage data for G count
    tmp_plus = pd.merge(tmp_plus, DF_Gs[DF_Gs["G_count"] > 0], on = ("Chromosome", "Bin"), how = "inner")
    print(tmp_plus.shape[0], tmp_plus.shape[0]/len(tmp_plus["Sample"].unique()))
    
    tmp_plus.loc[:, "Damage"] = (10**3)*tmp_plus["Damage"]/tmp_plus["G_count"]

    ###Normalizing the binned damage data for C count
    tmp_minus = pd.merge(tmp_minus, DF_Gs[DF_Gs["C_count"] > 0], on = ("Chromosome", "Bin"), how = "inner")
    print(tmp_minus.shape[0], tmp_minus.shape[0]/len(tmp_minus["Sample"].unique()))
    
    tmp_minus.loc[:, "Damage"] = (10**3)*tmp_minus["Damage"]/tmp_minus["C_count"]

    tmp_normalized = pd.concat([tmp_plus, tmp_minus])

    NF_df = pd.DataFrame({})
    for sample in list(tmp_normalized["Sample"].unique()): #one median per sample
        tmp_sample = tmp_normalized[tmp_normalized["Sample"] == sample]
        M = np.median(tmp_sample["Damage"].values)
        NF_df = pd.concat([NF_df, pd.DataFrame({"Sample": [sample], "Median": [M]})])

    NF_df.to_csv(PATH + "NFstrand" + str(int(BINSIZE)) + ".csv")
    print(NF_df)

100000.0
27243
1089720 54486.0
531700 26585.0
531660 26583.0
                        Sample     Median
0   Sample_14_CRS_evening_S14_   2.546314
0  Sample_15_Ctrl_evening_S15_   3.570246
0   Sample_05_Ctrl_morning_S5_   6.184096
0   Sample_01_Ctrl_morning_S1_   6.921409
0   Sample_16_CRS_evening_S16_   2.879618
0  Sample_11_Ctrl_evening_S11_   2.485161
0  Sample_13_Ctrl_evening_S13_   2.403964
0    Sample_08_CRS_morning_S8_   5.246539
0   Sample_20_CRS_evening_S20_  10.633666
0   Sample_18_CRS_evening_S18_   4.602043
0  Sample_17_Ctrl_evening_S17_   2.929046
0    Sample_04_CRS_morning_S4_   6.227455
0  Sample_19_Ctrl_evening_S19_   2.545900
0   Sample_03_Ctrl_morning_S3_   9.529751
0    Sample_02_CRS_morning_S2_   7.151634
0   Sample_09_Ctrl_morning_S9_   7.923174
0   Sample_10_CRS_morning_S10_   8.815860
0    Sample_06_CRS_morning_S6_   7.930738
0   Sample_12_CRS_evening_S12_   2.849694
0   Sample_07_Ctrl_morning_S7_   4.924262


In [61]:
# Define paths and bin sizes
import pandas as pd
import numpy as np

BINSIZEs = [float(10**3), float(10**4), float(10**5)]
PATH = "/cluster/home/taekim/stressed_mice/data_binning_normalization/"

for BINSIZE in BINSIZEs:
    binsize_str = str(int(BINSIZE))
    print(f"Processing bin size: {binsize_str}")
    
    # Load the damage data with bins
    damage_file = PATH + "Binned_damage_GENOMEWIDE_strand" + binsize_str + "_CCS.20_mice_Sept2024.csv"
    damage_df = pd.read_csv(damage_file, index_col=0)
    
    # Load the nucleotide count reference
    nuc_file = PATH + "Genome_bins_" + binsize_str + ".G_counts.csv"
    nuc_df = pd.read_csv(nuc_file, index_col=0)
    
    # Load the pre-calculated median normalization factors
    median_factors_file = PATH + "NFstrand100000" + ".csv" #only use median with this bin size
    median_factors = pd.read_csv(median_factors_file)
    
    # Rename and prepare the reference dataframe columns
    nuc_df.rename(columns={"0": "Chromosome", "3": "Bin"}, inplace=True)
    
    # Ensure Bin column is in the same data type
    damage_df["Bin"] = damage_df["Bin"].astype(float)
    nuc_df["Bin"] = nuc_df["Bin"].astype(float)
    
    # Merge the datasets on Chromosome and Bin
    merged_df = pd.merge(
        damage_df, 
        nuc_df, 
        on=["Chromosome", "Bin"], 
        how="left"
    )
    
    # Create a new column for normalized damage
    # For '+' strand, normalize by G_count; for '-' strand, normalize by C_count
    merged_df["GC_count"] = np.where(
        merged_df["Strand"] == "+",
        merged_df["G_count"],
        merged_df["C_count"]
    )
    
    # Avoid division by zero
    merged_df["GC_count"] = merged_df["GC_count"].replace(0, np.nan)
    
    # Calculate normalized damage by G or C count
    # scale like the code above. Calculated for each bin size 
    merged_df["GC_Normalized_Damage"] = (10**3) * merged_df["Damage"] / merged_df["GC_count"]
    
    # Merge with median normalization factors
    merged_df = pd.merge(merged_df, median_factors, on="Sample", how="left")
    
    # Apply median normalization
    merged_df["Median_Normalized_Damage"] = merged_df["GC_Normalized_Damage"] / merged_df["Median"]
    
    # Save the normalized data
    output_file = PATH + "Normalized_" + binsize_str + ".csv"
    merged_df.to_csv(output_file)
    
    print(f"Normalization completed for bin size {binsize_str}")

Processing bin size: 1000
Normalization completed for bin size 1000
Processing bin size: 10000
Normalization completed for bin size 10000
Processing bin size: 100000
Normalization completed for bin size 100000
