In [2]:
# Print the kernel's working directory. The data paths used in this notebook
# are relative ("../data_anova/..."), so they are resolved against this directory.
! pwd

/cluster/home/taekim/stressed_mice/jupyter_notebooks


In [1]:
import csv
import re

# Input table of significant ANOVA bins and the BED file derived from it.
input_csv = "../data_anova/anova_1000_all_significant_bins.csv"
output_bed = "../data_anova/significant_bins.bed"

# Half-width (bp) of the interval written around each bin coordinate.
# NOTE(review): this treats the coordinate stored in bin_id as the bin centre.
# If it is actually the bin start, the interval should be [pos, pos + 1000)
# instead - confirm against the code that produced the CSV.
HALF_BIN_WIDTH = 500

# Range allowed for the score column of a BED line.
BED_SCORE_MIN, BED_SCORE_MAX = 0, 1000

# Layout of bin_id: "<position>_<strand><RefSeq accession>",
# e.g. "3460000.0_-NC_000074.7". Only the FIRST underscore separates the
# position from the rest; the accession contains an underscore of its own.
_STRAND_ACCESSION_RE = re.compile(r'([+-])(NC_\d+\.\d+)')


def parse_bin_id(bin_id):
    """Split a bin_id into its components.

    Args:
        bin_id: string such as "3460000.0_-NC_000074.7" (may be None/empty).

    Returns:
        A ``(pos, strand, refseq_id)`` tuple with ``pos`` as a non-negative
        int, ``strand`` as "+" or "-", and ``refseq_id`` as the NC_ accession;
        or ``None`` when the id does not follow the expected layout.
    """
    if not bin_id:
        return None

    position_part, separator, strand_chr_part = bin_id.partition('_')
    if not separator:
        return None

    # The position is stored as a float string ("3460000.0"); drop the decimals.
    try:
        pos = int(float(position_part))
    except (ValueError, OverflowError):  # non-numeric, NaN or infinite
        return None
    if pos < 0:
        return None

    # Anchored match: the strand character and the accession are taken from
    # the same place, and an empty remainder ("123_") is rejected instead of
    # raising IndexError.
    match = _STRAND_ACCESSION_RE.match(strand_chr_part)
    if match is None:
        return None

    return pos, match.group(1), match.group(2)


def bed_score(raw_value):
    """Convert an F statistic to an integer BED score clamped to 0-1000.

    Args:
        raw_value: the value of the ``max_f_value`` column (string or None).

    Returns:
        The truncated value clamped to ``[BED_SCORE_MIN, BED_SCORE_MAX]``,
        or ``None`` when the value is missing, non-numeric or NaN.
    """
    try:
        value = float(raw_value)
    except (TypeError, ValueError):
        return None
    if value != value:  # NaN is the only value that differs from itself
        return None
    # Compare before truncating so that +/-inf is clamped instead of making
    # int() raise OverflowError.
    if value >= BED_SCORE_MAX:
        return BED_SCORE_MAX
    if value <= BED_SCORE_MIN:
        return BED_SCORE_MIN
    return int(value)


processed_rows = 0  # rows written to the BED file
skipped_rows = 0    # rows dropped because they could not be parsed

# newline='' is what the csv module expects for files handed to a reader.
with open(input_csv, 'r', newline='') as csv_file, open(output_bed, 'w') as bed_file:
    # Write header comment
    bed_file.write("# BED file generated from anova_1000_all_significant_bins.csv using RefSeq accessions\n")

    for row in csv.DictReader(csv_file):
        bin_id = row['bin_id']
        parsed = parse_bin_id(bin_id)
        score = bed_score(row['max_f_value'])
        if parsed is None or score is None:
            skipped_rows += 1
            continue

        pos, strand, refseq_id = parsed

        # Interval around the bin coordinate, clipped at the chromosome start.
        start = max(0, pos - HALF_BIN_WIDTH)
        end = pos + HALF_BIN_WIDTH

        # Additional (non-standard) columns carried along for downstream use.
        most_sig_factor = row['most_significant_factor']
        sig_factors = row['significant_factors']

        # Tab-separated BED line using the RefSeq accession as chromosome
        # and the original bin_id as the feature name.
        bed_file.write(
            f"{refseq_id}\t{start}\t{end}\t{bin_id}\t{score}\t{strand}\t{most_sig_factor}\t{sig_factors}\n"
        )
        processed_rows += 1

print(f"BED file created: {output_bed} with {processed_rows} bins")
if skipped_rows:
    # Only reported when something was dropped, so rows never vanish silently.
    print(f"Skipped {skipped_rows} row(s) with a malformed bin_id or non-numeric max_f_value")

BED file created: ../data_anova/significant_bins.bed with 128 bins


In [4]:
from collections import defaultdict, Counter

# Input file path - change this to your file path if different
input_file_gene = "../data_anova/data_intersect_bed/overlap_genes_result.bed"
input_file_promoter = "../data_anova/data_intersect_bed/overlap_promoters_result.bed"

# 0-based column layout of the intersection files.
# NOTE(review): columns 0-7 presumably repeat the 8-column bin BED (chrom,
# start, end, name, score, strand, most significant factor, significant
# factors) and column 11 is presumably the ENSMUSG gene id of the overlapping
# annotation record - verify against the command that produced these files.
BIN_CHROM_COL, BIN_START_COL, BIN_END_COL = 0, 1, 2
FACTORS_COL = 7
GENE_ID_COL = 11
MIN_FIELDS = GENE_ID_COL + 1

# Initialize data structures (rebuilt from scratch every time the cell runs)
genes_by_factor = defaultdict(set)  # Will hold unique genes for each factor
gene_to_factors = defaultdict(set)  # Will map each gene to all its factors
bin_to_genes = defaultdict(set)     # Will map each bin to the genes it intersects with
gene_to_bins = defaultdict(set)     # Will map each gene to the bins it intersects with

# Create the three groups
all_genes = set()
all_bins = set()
timepoint_only = set()
treatment_only = set()
rest = set()


def _add_intersections(bed_path):
    """Merge one bin/annotation intersection file into the lookup tables.

    Updates the module-level ``all_genes``, ``all_bins``, ``bin_to_genes``,
    ``gene_to_bins``, ``genes_by_factor`` and ``gene_to_factors`` in place, so
    calling it for several files accumulates their union.

    Args:
        bed_path: path to a tab-separated intersection file.

    Returns:
        Number of non-blank lines skipped for having fewer than
        ``MIN_FIELDS`` columns.
    """
    skipped = 0
    with open(bed_path, 'r') as handle:
        for line in handle:
            fields = line.strip().split('\t')

            # Check if line has all needed fields
            if len(fields) < MIN_FIELDS:
                if line.strip():  # blank lines are not worth reporting
                    skipped += 1
                continue

            # A bin is identified by its coordinates, not by its name column.
            bin_id = f"{fields[BIN_CHROM_COL]}:{fields[BIN_START_COL]}-{fields[BIN_END_COL]}"
            gene_id = fields[GENE_ID_COL]

            all_genes.add(gene_id)
            all_bins.add(bin_id)

            # Map bin to gene and gene to bin
            bin_to_genes[bin_id].add(gene_id)
            gene_to_bins[gene_id].add(bin_id)

            # Looking the gene up first guarantees it has an entry (and is
            # therefore classified later) even if no usable factor follows.
            factors_of_gene = gene_to_factors[gene_id]

            # Factors are separated by semicolons. Tokens are stripped and
            # empty ones (e.g. from a trailing ';') ignored, so that they are
            # not counted as an extra factor and do not push a single-factor
            # gene into the "rest" group.
            for factor in fields[FACTORS_COL].split(';'):
                factor = factor.strip()
                if not factor:
                    continue
                genes_by_factor[factor].add(gene_id)
                factors_of_gene.add(factor)
    return skipped


# Gene-body and promoter overlaps are parsed identically and pooled.
for label, path in (("gene", input_file_gene), ("promoter", input_file_promoter)):
    print(f"Parsing {label} intersections...")
    skipped_lines = _add_intersections(path)
    if skipped_lines:
        print(f"Warning: skipped {skipped_lines} line(s) with fewer than {MIN_FIELDS} fields in {path}")

# Sort every gene into exactly one group based on its set of factors:
#   - its only factor is "timepoint"  -> timepoint_only
#   - its only factor is "treatment"  -> treatment_only
#   - anything else (several factors, or a different single factor) -> rest
_single_factor_groups = {"timepoint": timepoint_only, "treatment": treatment_only}

for gene_id, factor_set in gene_to_factors.items():
    # None never appears as a key, so multi-factor genes fall through to `rest`.
    sole_factor = next(iter(factor_set)) if len(factor_set) == 1 else None
    _single_factor_groups.get(sole_factor, rest).add(gene_id)

# ---- Summary of how bins and genes map onto each other ----
print("\n" + "="*50)
print("MAPPING ANALYSIS")
print("="*50)

# Histogram: number of genes overlapping a bin -> how many bins have that number
genes_per_bin = Counter(map(len, bin_to_genes.values()))
print("\nDistribution of genes per bin:")
for count, frequency in sorted(genes_per_bin.items()):
    print(f"{count} gene(s): {frequency} bin(s) ({frequency/len(bin_to_genes)*100:.2f}%)")

# Histogram: number of bins overlapping a gene -> how many genes have that number
bins_per_gene = Counter(map(len, gene_to_bins.values()))
print("\nDistribution of bins per gene:")
for count, frequency in sorted(bins_per_gene.items()):
    print(f"{count} bin(s): {frequency} gene(s) ({frequency/len(gene_to_bins)*100:.2f}%)")

# One-to-one: every bin has exactly one gene AND every gene has exactly one bin
one_to_one = (
    all(len(genes) == 1 for genes in bin_to_genes.values())
    and all(len(bins) == 1 for bins in gene_to_bins.values())
)

# Injective = the largest fan-out is at most one (vacuously true when empty)
injective_bin_to_gene = max(map(len, bin_to_genes.values()), default=0) <= 1
injective_gene_to_bin = max(map(len, gene_to_bins.values()), default=0) <= 1

# Surjective = the smallest fan-in is at least one (vacuously true when empty).
# NOTE(review): as far as this cell shows, both dictionaries only ever gain a
# key together with a member, so these two flags cannot come out False here.
# A meaningful check would compare against the complete lists of bins and
# genes that went into the intersection.
surjective_bin_to_gene = min(map(len, gene_to_bins.values()), default=1) >= 1
surjective_gene_to_bin = min(map(len, bin_to_genes.values()), default=1) >= 1

# Report the verdicts; the per-direction details are only printed when the
# mapping is not one-to-one.
print("\nMapping characteristics:")
if not one_to_one:
    print("The mapping is many-to-many (not bijective)")

    print(
        "The bin-to-gene mapping is injective (each bin maps to at most one gene)"
        if injective_bin_to_gene else
        "The bin-to-gene mapping is not injective (some bins map to multiple genes)"
    )

    print(
        "The gene-to-bin mapping is injective (each gene maps to at most one bin)"
        if injective_gene_to_bin else
        "The gene-to-bin mapping is not injective (some genes map to multiple bins)"
    )

    print(
        "The bin-to-gene mapping is surjective (every gene is mapped to by at least one bin)"
        if surjective_bin_to_gene else
        "The bin-to-gene mapping is not surjective (some genes are not mapped to by any bin)"
    )

    print(
        "The gene-to-bin mapping is surjective (every bin is mapped to by at least one gene)"
        if surjective_gene_to_bin else
        "The gene-to-bin mapping is not surjective (some bins are not mapped to by any gene)"
    )
else:
    print("The mapping is bijective (one-to-one correspondence between bins and genes)")

# Calculate detailed statistics
print("\nDetailed mapping statistics:")
print(f"Total unique bins: {len(all_bins)}")
print(f"Total unique genes: {len(all_genes)}")

# If nothing was parsed (empty input files, or no line with enough columns)
# both totals are 0; report 0% in that case instead of raising
# ZeroDivisionError.
bins_with_gene_pct = len(bin_to_genes)/len(all_bins)*100 if all_bins else 0.0
genes_with_bin_pct = len(gene_to_bins)/len(all_genes)*100 if all_genes else 0.0
print(f"Bins with at least one gene: {len(bin_to_genes)} ({bins_with_gene_pct:.2f}% of all bins)")
print(f"Genes with at least one bin: {len(gene_to_bins)} ({genes_with_bin_pct:.2f}% of all genes)")

# Find bins with the most genes (ties keep their insertion order because
# sorted() is stable)
bins_with_most_genes = sorted(bin_to_genes.items(), key=lambda x: len(x[1]), reverse=True)[:10]
print("\nTop 10 bins with the most genes:")
for bin_id, genes in bins_with_most_genes:
    print(f"Bin {bin_id}: {len(genes)} genes")

# Find genes with the most bins
genes_with_most_bins = sorted(gene_to_bins.items(), key=lambda x: len(x[1]), reverse=True)[:10]
print("\nTop 10 genes with the most bins:")
for gene, bins in genes_with_most_bins:
    print(f"Gene {gene}: {len(bins)} bins")

def _bin_count_histogram(gene_group):
    """Count, for a group of genes, how many genes overlap 1, 2, ... bins.

    Genes without an entry in ``gene_to_bins`` are ignored.
    Returns a Counter mapping number-of-bins -> number-of-genes.
    """
    histogram = Counter()
    for gene_id in gene_group:
        if gene_id in gene_to_bins:
            histogram[len(gene_to_bins[gene_id])] += 1
    return histogram


# One histogram per gene category
timepoint_only_bin_counts = _bin_count_histogram(timepoint_only)
treatment_only_bin_counts = _bin_count_histogram(treatment_only)
rest_bin_counts = _bin_count_histogram(rest)

# Percentages below are relative to the size of the respective category.
print("\nBin counts by gene category:")

print("\nTimepoint-only genes:")
for count, frequency in sorted(timepoint_only_bin_counts.items()):
    print(f"{count} bin(s): {frequency} gene(s) ({frequency/len(timepoint_only)*100:.2f}% of timepoint-only genes)")

print("\nTreatment-only genes:")
for count, frequency in sorted(treatment_only_bin_counts.items()):
    print(f"{count} bin(s): {frequency} gene(s) ({frequency/len(treatment_only)*100:.2f}% of treatment-only genes)")

print("\nRest genes:")
for count, frequency in sorted(rest_bin_counts.items()):
    print(f"{count} bin(s): {frequency} gene(s) ({frequency/len(rest)*100:.2f}% of rest genes)")

# TODO: exporting the detailed mappings to files is not implemented in this
# cell; add it here (or in a following cell) if those files are needed.

Parsing gene intersections...
Parsing promoter intersections...

MAPPING ANALYSIS

Distribution of genes per bin:
1 gene(s): 50 bin(s) (86.21%)
2 gene(s): 8 bin(s) (13.79%)

Distribution of bins per gene:
1 bin(s): 64 gene(s) (98.46%)
2 bin(s): 1 gene(s) (1.54%)

Mapping characteristics:
The mapping is many-to-many (not bijective)
The bin-to-gene mapping is not injective (some bins map to multiple genes)
The gene-to-bin mapping is not injective (some genes map to multiple bins)
The bin-to-gene mapping is surjective (every gene is mapped to by at least one bin)
The gene-to-bin mapping is surjective (every bin is mapped to by at least one gene)

Detailed mapping statistics:
Total unique bins: 58
Total unique genes: 65
Bins with at least one gene: 58 (100.00% of all bins)
Genes with at least one bin: 65 (100.00% of all genes)

Top 10 bins with the most genes:
Bin NC_000071.7:22987500-22988500: 2 genes
Bin NC_000077.7:11647500-11648500: 2 genes
Bin NC_000067.7:54957500-54958500: 2 genes
Bi

[GO enrichment result for group timepoint only](https://biit.cs.ut.ee/gprofiler/gost?organism=mmusculus&query=ENSMUSG00000019990%0AENSMUSG00000020564%0AENSMUSG00000021700%0AENSMUSG00000028036%0AENSMUSG00000031772%0AENSMUSG00000039488%0AENSMUSG00000040037%0AENSMUSG00000040270%0AENSMUSG00000043668%0AENSMUSG00000043673%0AENSMUSG00000045103%0AENSMUSG00000049176%0AENSMUSG00000052331%0AENSMUSG00000060843%0AENSMUSG00000064115%0AENSMUSG00000078899%0AENSMUSG00000102642%0AENSMUSG00000103181%0AENSMUSG00000106018%0AENSMUSG00000111399%0AENSMUSG00000112904&ordered=false&all_results=false&no_iea=false&combined=false&measure_underrepresentation=false&domain_scope=annotated&significance_threshold_method=g_SCS&user_threshold=0.05&numeric_namespace=ENTREZGENE_ACC&sources=GO:MF,GO:CC,GO:BP,KEGG,TF,REAC,MIRNA,HPA,CORUM,HP,WP&background=&highlight=true&no_evidences=false)

[GO enrichment result for group treatment only](https://biit.cs.ut.ee/gprofiler/gost?organism=mmusculus&query=ENSMUSG00000003746%0AENSMUSG00000007617%0AENSMUSG00000010797%0AENSMUSG00000018160%0AENSMUSG00000018654%0AENSMUSG00000021044%0AENSMUSG00000021420%0AENSMUSG00000022577%0AENSMUSG00000029352%0AENSMUSG00000031841%0AENSMUSG00000042156%0AENSMUSG00000042308%0AENSMUSG00000043008%0AENSMUSG00000045007%0AENSMUSG00000051000%0AENSMUSG00000055540%0AENSMUSG00000056536%0AENSMUSG00000060534%0AENSMUSG00000061603%0AENSMUSG00000062687%0AENSMUSG00000073145%0AENSMUSG00000075220%0AENSMUSG00000086607%0AENSMUSG00000099970%0AENSMUSG00000102805%0AENSMUSG00000108815%0AENSMUSG00000113434%0AENSMUSG00000115718%0AENSMUSG00000120152&ordered=false&all_results=false&no_iea=false&combined=false&measure_underrepresentation=false&domain_scope=annotated&significance_threshold_method=g_SCS&user_threshold=0.05&numeric_namespace=ENTREZGENE_ACC&sources=GO:MF,GO:CC,GO:BP,KEGG,TF,REAC,MIRNA,HPA,CORUM,HP,WP&background=&highlight=true&no_evidences=false)

[GO enrichment result for group multiple factors](https://biit.cs.ut.ee/gprofiler/gost?organism=mmusculus&query=ENSMUSG00000004568%0AENSMUSG00000028245%0AENSMUSG00000033769%0AENSMUSG00000034833%0AENSMUSG00000040690%0AENSMUSG00000045967%0AENSMUSG00000055065%0AENSMUSG00000058589%0AENSMUSG00000060371%0AENSMUSG00000082617%0AENSMUSG00000097207%0AENSMUSG00000101574%0AENSMUSG00000106379&ordered=false&all_results=false&no_iea=false&combined=false&measure_underrepresentation=false&domain_scope=annotated&significance_threshold_method=g_SCS&user_threshold=0.05&numeric_namespace=ENTREZGENE_ACC&sources=GO:MF,GO:CC,GO:BP,KEGG,TF,REAC,MIRNA,HPA,CORUM,HP,WP&background=&highlight=true&no_evidences=false)