In [2]:
# Print the notebook's working directory; the relative "../data_anova/..." paths used in the cells below are resolved against it.
! pwd

/cluster/home/taekim/stressed_mice/jupyter_notebooks


In [17]:
import csv
import os
import re

# Convert the significant ANOVA bins listed in a CSV file into a BED file.
# For binned data whose bin_id looks like "3460000.0_-NC_000074.7",
# i.e. "<start position>_<strand><RefSeq accession>".

# Path to your CSV file
input_csv = "../data_anova/bin100000/anova_100000_all_significant_bins.csv"
output_bed = "../data_anova/bin100000/important_bins.bed"
bin_size = 100000 #CHANGE

# RefSeq chromosome accession (NC_*) that directly follows a strand sign.
REFSEQ_AFTER_STRAND = re.compile(r'[+-](NC_\d+\.\d+)')


def parse_position_first_bin_id(bin_id):
    """Split a bin id such as "3460000.0_-NC_000074.7" into its parts.

    Args:
        bin_id: String of the form "<position>_<strand><RefSeq accession>".

    Returns:
        A tuple (refseq_id, position, strand) with position as an int, or
        None when the id does not follow the expected layout (no underscore,
        non-numeric or non-finite position, missing strand sign, or no NC_*
        accession after the strand).
    """
    if not bin_id:
        return None

    # Only the first underscore separates position from strand+accession;
    # the accession itself contains an underscore ("NC_...").
    position_part, separator, strand_chr_part = bin_id.partition('_')
    if not separator:
        return None

    try:
        # Positions are stored as floats ("3460000.0"); drop the decimal part.
        position = int(float(position_part))
    except (ValueError, OverflowError):
        # OverflowError covers "inf", which float() accepts but int() rejects.
        return None

    # Slicing (not indexing) so an id ending in "_" is rejected, not a crash.
    strand = strand_chr_part[:1]
    if strand not in ('+', '-'):
        return None

    refseq_match = REFSEQ_AFTER_STRAND.search(strand_chr_part)
    if refseq_match is None:
        return None

    return refseq_match.group(1), position, strand


def write_bins_bed(csv_path, bed_path, width):
    """Write one BED line per retained CSV row.

    Rows where only factor2 is significant (factor1 and interaction both not
    significant) are left out. Rows whose bin_id cannot be parsed are left out
    as well and counted separately.

    Args:
        csv_path: CSV with the columns bin_id, factor1_significant,
            factor2_significant, interaction_significant, significant_factors.
        bed_path: Output file; overwritten if it exists.
        width: Bin width in bp; each interval is [position, position + width).

    Returns:
        A tuple (written, skipped_by_factor_filter, unparseable).

    Raises:
        KeyError: If the bin_id or significant_factors column is absent.
        OSError: If either file cannot be opened.
    """
    def flag(row, column):
        # DictReader fills cells missing from a short row with None.
        return (row.get(column) or '').lower() == 'true'

    written = filtered_out = unparseable = 0

    # newline='' is what the csv module expects for files handed to a reader.
    with open(csv_path, 'r', newline='') as csv_file, open(bed_path, 'w') as bed_file:
        # Name the real input in the header instead of a hard-coded file name.
        source_name = os.path.basename(csv_path)
        bed_file.write(f"# BED file generated from {source_name} using RefSeq accessions\n")

        for row in csv.DictReader(csv_file):
            bin_id = row['bin_id']

            factor2_only = (
                flag(row, 'factor2_significant')
                and not flag(row, 'factor1_significant')
                and not flag(row, 'interaction_significant')
            )
            if factor2_only:
                filtered_out += 1
                continue

            parsed = parse_position_first_bin_id(bin_id)
            if parsed is None:
                unparseable += 1
                continue
            refseq_id, start, strand = parsed

            # The bin starts at the parsed position and spans `width` bp.
            end = start + width
            sig_factors = row['significant_factors']

            # NOTE(review): standard BED puts score in column 5 and strand in
            # column 6; here strand is column 5 and the factor list column 6.
            # Layout kept as-is because downstream files may rely on it.
            bed_file.write(f"{refseq_id}\t{start}\t{end}\t{bin_id}\t{strand}\t{sig_factors}\n")
            written += 1

    return written, filtered_out, unparseable


# In a notebook kernel or script run __name__ is "__main__", so the conversion
# executes as before; the guard only keeps an import from touching data files.
if __name__ == "__main__":
    processed_rows, skipped_factor_criteria, unparseable_bin_ids = write_bins_bed(
        input_csv, output_bed, bin_size
    )
    print(f"BED file created: {output_bed} with {processed_rows} bins")
    print(f"Skipped bins: {skipped_factor_criteria} with factor1=False, factor2=True, interaction=False")
    print(f"Unparseable bin ids skipped: {unparseable_bin_ids}")

BED file created: ../data_anova/bin100000/important_bins.bed with 1 bins
Skipped bins: 3733 with factor1=False, factor2=True, interaction=False


In [25]:
# Print the notebook's working directory; the relative "../data_anova/..." paths used in the cells below are resolved against it.
!pwd

/cluster/home/taekim/stressed_mice/jupyter_notebooks


In [1]:
import csv
import os
import re

# Convert the significant ANOVA promoter bins listed in a CSV file into a BED file.
# bin_id carries the RefSeq accession, the start position and the strand.
# Two layouts are accepted:
#   "<accession>_<position><strand>"   e.g. "NC_000074.7_3460000+"
#   "<accession><strand><position>"    e.g. "NC_000074.7+3460000"

# Path to your CSV file
input_csv = "../data_anova/promoters/anova_all_significant_bins.csv"
output_bed = "../data_anova/promoters/important_bins.bed"
bin_size = 3000 #CHANGE

# Layout the original parser accepted: strand sign at the very end.
STRAND_LAST_BIN_ID = re.compile(r'^(.+?)_(\d+)([+-])$')
# Layout the notebook comments describe: strand sign between accession and position.
STRAND_INFIX_BIN_ID = re.compile(r'^(.+?)([+-])(\d+)$')


def parse_promoter_bin_id(bin_id):
    """Split a promoter bin id into accession, position and strand.

    Args:
        bin_id: Either "<accession>_<position><strand>" or
            "<accession><strand><position>".

    Returns:
        A tuple (refseq_id, position, strand) with position as an int, or
        None when the id matches neither layout.
    """
    if not bin_id:
        return None

    # The two layouts are mutually exclusive: one ends in a sign, the other
    # in a digit, so the order of the attempts does not matter.
    strand_last = STRAND_LAST_BIN_ID.match(bin_id)
    if strand_last is not None:
        refseq_id, digits, strand = strand_last.groups()
        return refseq_id, int(digits), strand

    strand_infix = STRAND_INFIX_BIN_ID.match(bin_id)
    if strand_infix is not None:
        refseq_id, strand, digits = strand_infix.groups()
        return refseq_id, int(digits), strand

    return None


def write_promoter_bed(csv_path, bed_path, width):
    """Write one BED line per retained CSV row.

    Rows where only factor2 is significant (factor1 and interaction both not
    significant) are left out. Rows whose bin_id cannot be parsed are left out
    as well and counted separately, so an id-format mismatch is visible in the
    summary instead of silently producing an empty BED file.

    Args:
        csv_path: CSV with the columns bin_id, factor1_significant,
            factor2_significant, interaction_significant, significant_factors.
        bed_path: Output file; overwritten if it exists.
        width: Bin width in bp; each interval is [position, position + width).

    Returns:
        A tuple (written, skipped_by_factor_filter, unparseable).

    Raises:
        KeyError: If the bin_id or significant_factors column is absent.
        OSError: If either file cannot be opened.
    """
    def flag(row, column):
        # DictReader fills cells missing from a short row with None.
        return (row.get(column) or '').lower() == 'true'

    written = filtered_out = unparseable = 0

    # newline='' is what the csv module expects for files handed to a reader.
    with open(csv_path, 'r', newline='') as csv_file, open(bed_path, 'w') as bed_file:
        source_name = os.path.basename(csv_path)
        bed_file.write(f"# BED file generated from {source_name} using RefSeq accessions\n")

        for row in csv.DictReader(csv_file):
            bin_id = row['bin_id']

            factor2_only = (
                flag(row, 'factor2_significant')
                and not flag(row, 'factor1_significant')
                and not flag(row, 'interaction_significant')
            )
            if factor2_only:
                filtered_out += 1
                continue

            parsed = parse_promoter_bin_id(bin_id)
            if parsed is None:
                unparseable += 1
                continue
            refseq_id, start, strand = parsed

            # The bin starts at the parsed position and spans `width` bp.
            end = start + width
            sig_factors = row['significant_factors']

            # NOTE(review): standard BED puts score in column 5 and strand in
            # column 6; here strand is column 5 and the factor list column 6.
            # Layout kept as-is because downstream files may rely on it.
            bed_file.write(f"{refseq_id}\t{start}\t{end}\t{bin_id}\t{strand}\t{sig_factors}\n")
            written += 1

    return written, filtered_out, unparseable


# In a notebook kernel or script run __name__ is "__main__", so the conversion
# executes as before; the guard only keeps an import from touching data files.
if __name__ == "__main__":
    processed_rows, skipped_factor_criteria, unparseable_bin_ids = write_promoter_bed(
        input_csv, output_bed, bin_size
    )
    print(f"BED file created: {output_bed} with {processed_rows} bins")
    print(f"Skipped bins: {skipped_factor_criteria} with factor1=False, factor2=True, interaction=False")
    print(f"Unparseable bin ids skipped: {unparseable_bin_ids}")

BED file created: ../data_anova/promoters/important_bins.bed with 0 bins
Skipped bins: 4 with factor1=False, factor2=True, interaction=False


In [27]:
# Intersect the produced BED file of significant bins (-a) with the known-gene coordinates (-b) using bedtools intersect.
# -wa / -wb make bedtools write the original A entry and the original B entry for each overlap; the result is redirected to cpg_overlap_genes_result.bed.
# NOTE(review): -a reads ../data_anova/cpg/important_bins.bed, but the conversion cells visible in this notebook write to bin100000/ and promoters/ — confirm the cpg file exists and is the intended input.
# NOTE(review): each `!` line appears to run in its own shell, so this `module load` presumably does not carry over to the next line (which would explain the absolute bedtools path) — confirm.
!module load stack/2024-06 gcc/12.2.0 bedtools2/2.31.0
!/cluster/software/stacks/2024-06/spack/opt/spack/linux-ubuntu22.04-x86_64_v3/gcc-12.2.0/bedtools2-2.31.0-a4obbslkxntgdx2criopqpwx662gcftq/bin/bedtools intersect -a ../data_anova/cpg/important_bins.bed -b /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/Mouse_brain_Sept2024/gene_annotation/knownGenes_canononical.bed -wa -wb > ../data_anova/cpg/cpg_overlap_genes_result.bed

Many modules are hidden in this stack. Use "module --show_hidden spider SOFTWARE" if you are not able to find the required software

Inactive Modules:
  1) python/3.11.6_cuda

Due to MODULEPATH changes, the following have been reloaded:
  1) hdf5/1.14.3     2) r/4.3.2

The following have been reloaded with a version change:
  1) cuda/12.2.1 => cuda/12.8.0     3) stack/2024-05 => stack/2024-06
  2) gcc/13.2.0 => gcc/12.2.0



In [28]:
# Intersect the significant-bin BED file (-a) with a second annotation (-b) — judging by its file name, 2000 bp upstream-of-TSS features, i.e. promoters; confirm.
# -wa / -wb make bedtools write the original A entry and the original B entry for each overlap; the result is redirected to overlap_promoters_result.bed.
!/cluster/software/stacks/2024-06/spack/opt/spack/linux-ubuntu22.04-x86_64_v3/gcc-12.2.0/bedtools2-2.31.0-a4obbslkxntgdx2criopqpwx662gcftq/bin/bedtools intersect -a ../data_anova/cpg/important_bins.bed -b /nfs/nas12.ethz.ch/fs1201/green_groups_let_public/Euler/Vakil/Mouse_brain_Sept2024/gene_annotation/knownGenes_canonTr_2000bp_upstream_TSS_feature.bed -wa -wb > ../data_anova/cpg/overlap_promoters_result.bed
