In [2]:
# Show the kernel's working directory: the "../data_anova/..." paths used by
# the cells below are relative and resolve against it.
! pwd

/cluster/home/taekim/stressed_mice/jupyter_notebooks


In [4]:
import csv
import re

# Path to your CSV file
input_csv = "../data_anova/anova_1000_all_significant_bins.csv"
output_bed = "../data_anova/significant_bins.bed"

# Half-width (bp) of the window written around each bin position (±500bp -> 1kb).
HALF_WINDOW_BP = 500
# The BED score column is defined on the range 0..1000.
MAX_BED_SCORE = 1000

# Strand sign immediately followed by an NC_ RefSeq accession, e.g. "-NC_000074.7".
# Used with .match() so the strand and the accession are always read from the
# same place: the very start of the text after the first underscore.
STRAND_REFSEQ_RE = re.compile(r'([+-])(NC_\d+\.\d+)')


def parse_bin_id(bin_id):
    """Split a bin_id such as "3460000.0_-NC_000074.7" into its parts.

    The position is everything before the FIRST underscore (the accession
    itself contains another underscore, so a plain split('_') is not usable).

    Args:
        bin_id: identifier string from the ``bin_id`` CSV column.

    Returns:
        ``(position, strand, refseq_id)`` with ``position`` as an int (decimal
        part dropped), ``strand`` being ``'+'`` or ``'-'``; or ``None`` when
        the identifier does not follow the expected layout.
    """
    position_part, separator, strand_chr_part = bin_id.partition('_')
    if not separator:
        return None

    try:
        pos = int(float(position_part))
    except (ValueError, OverflowError):
        # Non-numeric, NaN (ValueError) or infinite (OverflowError) position.
        return None

    # An empty remainder (bin_id ending in "_") simply fails to match here
    # instead of raising IndexError on strand_chr_part[0].
    match = STRAND_REFSEQ_RE.match(strand_chr_part)
    if match is None:
        return None

    return pos, match.group(1), match.group(2)


def bed_line_for_row(row):
    """Build one tab-separated BED line (with trailing newline) for a CSV row.

    Args:
        row: mapping with the keys ``bin_id``, ``max_f_value``,
            ``most_significant_factor`` and ``significant_factors``.

    Returns:
        The BED line, or ``None`` when ``bin_id`` cannot be parsed.

    Raises:
        ValueError: if ``max_f_value`` is not a number.
    """
    bin_id = row['bin_id']
    parsed = parse_bin_id(bin_id)
    if parsed is None:
        return None
    pos, strand, refseq_id = parsed

    # Create 1kb bin centered on position (±500bp); never start below 0.
    start = max(0, pos - HALF_WINDOW_BP)
    end = pos + HALF_WINDOW_BP

    # Clamp BEFORE converting to int: int(float('inf')) would raise, and a
    # negative value would fall outside the BED score range. With this argument
    # order a NaN also collapses to 0 because every comparison with NaN is False.
    max_f_value = float(row['max_f_value'])
    score = int(min(float(MAX_BED_SCORE), max(0.0, max_f_value)))

    fields = (
        refseq_id,                        # RefSeq ID used as chromosome
        start,
        end,
        bin_id,                           # name
        score,
        strand,
        row['most_significant_factor'],   # additional fields
        row['significant_factors'],
    )
    return "\t".join(str(field) for field in fields) + "\n"


# Notebook cells execute with __name__ == "__main__", so this still runs in the
# notebook; the guard only keeps the helpers above importable without touching
# the filesystem.
if __name__ == "__main__":
    processed_rows = 0
    skipped_bin_ids = []

    # newline='' is what the csv module asks for when opening its input file.
    with open(input_csv, 'r', newline='') as csv_file, open(output_bed, 'w') as bed_file:
        # Write header comment
        bed_file.write("# BED file generated from anova_1000_all_significant_bins.csv using RefSeq accessions\n")

        for row in csv.DictReader(csv_file):
            bed_line = bed_line_for_row(row)
            if bed_line is None:
                # Remember what was dropped so it can be reported below.
                skipped_bin_ids.append(row['bin_id'])
                continue
            bed_file.write(bed_line)
            processed_rows += 1

    print(f"BED file created: {output_bed} with {processed_rows} bins")
    if skipped_bin_ids:
        print(f"Skipped {len(skipped_bin_ids)} rows with an unparseable bin_id, e.g. {skipped_bin_ids[:5]}")

BED file created: ../data_anova/significant_bins.bed with 128 bins


In [19]:
from collections import defaultdict

# Input file path - change this to your file path if different
input_file_gene = "../data_anova/data_intersect_bed/overlap_genes_result.bed"
input_file_promoter = "../data_anova/data_intersect_bed/overlap_promoters_result.bed"

# 0-based column positions in the tab-delimited overlap files.
FACTORS_COLUMN = 7    # significant factors, separated by semicolons
GENE_ID_COLUMN = 11   # ENSMUSG ID

RULE = "-" * 40


def iter_gene_factors(lines):
    """Yield ``(gene_id, factors)`` for every usable line of an overlap file.

    Lines with fewer than 12 tab-separated fields are ignored. ``factors`` is
    the list of semicolon-separated factor names with surrounding whitespace
    removed and empty entries dropped.

    Args:
        lines: iterable of text lines (an open file works).
    """
    for line in lines:
        fields = line.strip().split('\t')
        if len(fields) <= GENE_ID_COLUMN:
            continue
        factors = [token.strip() for token in fields[FACTORS_COLUMN].split(';')]
        yield fields[GENE_ID_COLUMN], [token for token in factors if token]


def classify_gene(factors):
    """Name the group a gene belongs to, given all factors seen for it.

    Returns:
        ``"multiple_factors"`` when more than one distinct factor is present,
        ``"timepoint_only"`` / ``"treatment_only"`` when the single factor is
        exactly ``timepoint`` / ``treatment``, otherwise ``None``.
    """
    distinct = set(factors)
    if len(distinct) > 1:
        return "multiple_factors"
    if distinct == {"timepoint"}:
        return "timepoint_only"
    if distinct == {"treatment"}:
        return "treatment_only"
    # NOTE(review): a gene whose only factor is some other term lands here.
    # The earlier comment mentioned "a factor involving multiple components";
    # if such terms (e.g. an interaction) should count as multiple factors,
    # add that rule here once the term's exact spelling is confirmed.
    return None


def print_gene_group(title, genes):
    """Print a ruled heading followed by the sorted gene IDs, one per line."""
    print(RULE)
    print(title)
    print(RULE)
    for gene in sorted(genes):
        print(gene)


# Initialize data structures
genes_by_factor = defaultdict(set)  # Will hold unique genes for each factor
gene_to_factors = defaultdict(set)  # Will map each gene to all its factors

# Create the three groups
all_genes = set()
timepoint_only = set()
treatment_only = set()
multiple_factors = set()

# Notebook cells execute with __name__ == "__main__", so this still runs in the
# notebook; the guard only keeps the helpers above importable without touching
# the filesystem.
if __name__ == "__main__":
    # Gene-body and promoter overlaps share one layout, so one loop reads both.
    for overlap_path in (input_file_gene, input_file_promoter):
        with open(overlap_path, 'r') as handle:
            for gene_id, factors in iter_gene_factors(handle):
                all_genes.add(gene_id)
                for factor in factors:
                    genes_by_factor[factor].add(gene_id)
                    gene_to_factors[gene_id].add(factor)

    # Classify each gene (.get avoids creating empty defaultdict entries)
    groups = {
        "timepoint_only": timepoint_only,
        "treatment_only": treatment_only,
        "multiple_factors": multiple_factors,
    }
    for gene in all_genes:
        group = classify_gene(gene_to_factors.get(gene, ()))
        if group is not None:
            groups[group].add(gene)

    # Check if all genes are in one of the three groups
    categorized_genes = timepoint_only.union(treatment_only).union(multiple_factors)
    missing_genes = all_genes - categorized_genes

    # Print summary
    print(RULE)
    print(f"Total unique genes in file: {len(all_genes)}")
    print(f"Timepoint only: {len(timepoint_only)} genes")
    print(f"Treatment only: {len(treatment_only)} genes")
    print(f"Multiple factors: {len(multiple_factors)} genes")
    print(f"Total categorized: {len(categorized_genes)} genes")

    # Print results
    print_gene_group("Timepoint only", timepoint_only)
    print_gene_group("Treatment only", treatment_only)
    print_gene_group("Multiple factors", multiple_factors)

    # Genes that fit none of the groups used to disappear without a trace;
    # list them together with the factors that kept them out.
    if missing_genes:
        print(RULE)
        print("Uncategorized (not timepoint-only, treatment-only or multiple)")
        print(RULE)
        for gene in sorted(missing_genes):
            print(f"{gene}\t{';'.join(sorted(gene_to_factors.get(gene, ())))}")

    print("\n")

----------------------------------------
Total unique genes in file: 65
Timepoint only: 21 genes
Treatment only: 29 genes
Multiple factors: 13 genes
Total categorized: 63 genes
----------------------------------------
Timepoint only
----------------------------------------
ENSMUSG00000019990
ENSMUSG00000020564
ENSMUSG00000021700
ENSMUSG00000028036
ENSMUSG00000031772
ENSMUSG00000039488
ENSMUSG00000040037
ENSMUSG00000040270
ENSMUSG00000043668
ENSMUSG00000043673
ENSMUSG00000045103
ENSMUSG00000049176
ENSMUSG00000052331
ENSMUSG00000060843
ENSMUSG00000064115
ENSMUSG00000078899
ENSMUSG00000102642
ENSMUSG00000103181
ENSMUSG00000106018
ENSMUSG00000111399
ENSMUSG00000112904
----------------------------------------
Treatment only
----------------------------------------
ENSMUSG00000003746
ENSMUSG00000007617
ENSMUSG00000010797
ENSMUSG00000018160
ENSMUSG00000018654
ENSMUSG00000021044
ENSMUSG00000021420
ENSMUSG00000022577
ENSMUSG00000029352
ENSMUSG00000031841
ENSMUSG00000042156
ENSMUSG00000042308
EN

[GO enrichment result for group timepoint only](https://biit.cs.ut.ee/gprofiler/gost?organism=mmusculus&query=ENSMUSG00000019990%0AENSMUSG00000020564%0AENSMUSG00000021700%0AENSMUSG00000028036%0AENSMUSG00000031772%0AENSMUSG00000039488%0AENSMUSG00000040037%0AENSMUSG00000040270%0AENSMUSG00000043668%0AENSMUSG00000043673%0AENSMUSG00000045103%0AENSMUSG00000049176%0AENSMUSG00000052331%0AENSMUSG00000060843%0AENSMUSG00000064115%0AENSMUSG00000078899%0AENSMUSG00000102642%0AENSMUSG00000103181%0AENSMUSG00000106018%0AENSMUSG00000111399%0AENSMUSG00000112904&ordered=false&all_results=false&no_iea=false&combined=false&measure_underrepresentation=false&domain_scope=annotated&significance_threshold_method=g_SCS&user_threshold=0.05&numeric_namespace=ENTREZGENE_ACC&sources=GO:MF,GO:CC,GO:BP,KEGG,TF,REAC,MIRNA,HPA,CORUM,HP,WP&background=&highlight=true&no_evidences=false)

[GO enrichment result for group treatment only](https://biit.cs.ut.ee/gprofiler/gost?organism=mmusculus&query=ENSMUSG00000003746%0AENSMUSG00000007617%0AENSMUSG00000010797%0AENSMUSG00000018160%0AENSMUSG00000018654%0AENSMUSG00000021044%0AENSMUSG00000021420%0AENSMUSG00000022577%0AENSMUSG00000029352%0AENSMUSG00000031841%0AENSMUSG00000042156%0AENSMUSG00000042308%0AENSMUSG00000043008%0AENSMUSG00000045007%0AENSMUSG00000051000%0AENSMUSG00000055540%0AENSMUSG00000056536%0AENSMUSG00000060534%0AENSMUSG00000061603%0AENSMUSG00000062687%0AENSMUSG00000073145%0AENSMUSG00000075220%0AENSMUSG00000086607%0AENSMUSG00000099970%0AENSMUSG00000102805%0AENSMUSG00000108815%0AENSMUSG00000113434%0AENSMUSG00000115718%0AENSMUSG00000120152&ordered=false&all_results=false&no_iea=false&combined=false&measure_underrepresentation=false&domain_scope=annotated&significance_threshold_method=g_SCS&user_threshold=0.05&numeric_namespace=ENTREZGENE_ACC&sources=GO:MF,GO:CC,GO:BP,KEGG,TF,REAC,MIRNA,HPA,CORUM,HP,WP&background=&highlight=true&no_evidences=false)

[GO enrichment result for group multiple factors](https://biit.cs.ut.ee/gprofiler/gost?organism=mmusculus&query=ENSMUSG00000004568%0AENSMUSG00000028245%0AENSMUSG00000033769%0AENSMUSG00000034833%0AENSMUSG00000040690%0AENSMUSG00000045967%0AENSMUSG00000055065%0AENSMUSG00000058589%0AENSMUSG00000060371%0AENSMUSG00000082617%0AENSMUSG00000097207%0AENSMUSG00000101574%0AENSMUSG00000106379&ordered=false&all_results=false&no_iea=false&combined=false&measure_underrepresentation=false&domain_scope=annotated&significance_threshold_method=g_SCS&user_threshold=0.05&numeric_namespace=ENTREZGENE_ACC&sources=GO:MF,GO:CC,GO:BP,KEGG,TF,REAC,MIRNA,HPA,CORUM,HP,WP&background=&highlight=true&no_evidences=false)