In [2]:
import os
import sys
import glob
import numpy as np
import pandas as pd
import seaborn as sns
import math
import random
import config
# Switch the working directory to config.LOOP_CATALOG_DIR; the input and output
# paths defined further down are relative ("results/...") and resolve against it.
os.chdir(config.LOOP_CATALOG_DIR)

#### Input Parameters

In [51]:
# Label substituted for the "{ct}" placeholder in the input/output paths below.
cell_type = "non-immune"

In [52]:
# Path templates, relative to the working directory; "{ct}" is replaced by `cell_type`.
path_templates = {
    "foreground": "results/motif_analysis/conserved_anchors_11_06_2024/sea/{ct}/no_mergefilt_01.27.2025/input_fasta.fa.2.7.7.80.10.50.500.mask",
    "background": "results/motif_analysis/conserved_anchors_11_06_2024/conserved_anchor_results/{ct}/no_mergefilt_01.27.2025/bkgd_nonconserved_all.fa.2.7.7.80.10.50.500.mask",
    "output": "results/motif_analysis/conserved_anchors_11_06_2024/sea/{ct}/no_mergefilt_01.27.2025/bkgd_nonconserved_biased_downsampled_fasta.fa.2.7.7.80.10.50.500.mask",
}

# alternative background file (disabled):
#background_seqs_file = "results/motif_analysis/conserved_anchors_11_06_2024/conserved_anchor_results/all/mergefilt/bkgd_nonconserved_all.fa"
foreground_seqs_file = path_templates["foreground"].format(ct=cell_type)
background_seqs_file = path_templates["background"].format(ct=cell_type)
output_file = path_templates["output"].format(ct=cell_type)

# Width of each GC-content bucket, as a percentage of the background
# sequences (0.5 -> int(100 / 0.5) = 200 buckets).
quantile_percent = 0.5

#### Load Sequences

In [53]:
def read_fasta_sequences(fasta_path):
    """Read every sequence from a FASTA file, in file order.

    Lines starting with ">" are treated as record headers and are not kept.
    All other lines are stripped of surrounding whitespace and concatenated
    into one string per record, so multi-line records become a single string.

    Blank lines and records without any sequence characters are skipped, so
    the returned list never contains an empty string (an empty file yields
    an empty list).

    Parameters
    ----------
    fasta_path : str
        Path of the FASTA file to read.

    Returns
    -------
    list of str
        One concatenated sequence per non-empty record.
    """
    sequences = []
    curr_seq = []
    with open(fasta_path) as handle:
        for line in handle:
            if line.startswith(">"):
                # a new header closes the record collected so far
                if curr_seq:
                    sequences.append("".join(curr_seq))
                    curr_seq = []
            else:
                chunk = line.strip()
                if chunk:
                    curr_seq.append(chunk)
    # the last record has no following header to close it
    if curr_seq:
        sequences.append("".join(curr_seq))
    return sequences


# foreground
foreground_seqs = read_fasta_sequences(foreground_seqs_file)

# background
background_seqs = read_fasta_sequences(background_seqs_file)

In [54]:
# report how many sequences were loaded from each FASTA file
for label, seqs in (
    ("Number of Foreground Sequences:", foreground_seqs),
    ("Number of Background Sequences:", background_seqs),
):
    print(label, len(seqs))

Number of Foreground Sequences: 1493
Number of Background Sequences: 199136


#### Pre-Process Background Sequences

In [55]:
# function to get the GC content of an input DNA string
def get_gc_content(seq):
    """Return the fraction of characters in `seq` that are G or C.

    Counting is case-insensitive, so lowercase "g"/"c" are counted like
    their uppercase forms. The denominator is the full length of `seq`,
    so every other character (A, T, N, ...) lowers the fraction.

    Parameters
    ----------
    seq : str
        DNA sequence.

    Returns
    -------
    float
        Value in [0, 1]; 0.0 for an empty sequence (instead of dividing by zero).
    """
    if not seq:
        return 0.0
    seq = seq.upper()
    return (seq.count("G") + seq.count("C")) / len(seq)

In [56]:
# compute the GC content of each background sequence
# store in a structured array with the fields (sequence, gc_content)
#
# The width of the string field is taken from the longest sequence: a fixed
# 'U5000' field silently truncates anything longer, and a truncated string
# would no longer equal the full sequence read back from the FASTA file.
max_seq_len = max((len(seq) for seq in background_seqs), default=1)
max_seq_len = max(max_seq_len, 1)  # keep the field at least one character wide
gc_record_dtype = np.dtype([('sequence', f'U{max_seq_len}'), ('gc_content', 'f8')])
background_gc_content = np.array(
    [(seq, get_gc_content(seq)) for seq in background_seqs],
    dtype=gc_record_dtype,
)

In [57]:
# order the background records by GC content, lowest first
gc_column = background_gc_content["gc_content"]
sorted_indices = gc_column.argsort()
sorted_background_gc_content = background_gc_content[sorted_indices]

In [58]:
# number of GC buckets implied by the bucket width (given in percent)
num_quantiles = int(100 / quantile_percent)

# GC value at every bucket boundary, including the two end points (0 and 1)
boundary_fractions = np.linspace(0, 1, num_quantiles + 1)
sorted_gc_values = sorted_background_gc_content["gc_content"]
quantile_values = np.quantile(sorted_gc_values, boundary_fractions)

# bucket index of every background sequence; only the interior boundaries are
# used as edges, so the indices range from 0 to num_quantiles - 1
interior_edges = quantile_values[1:-1]
quantile_bins = np.digitize(sorted_gc_values, interior_edges)

In [59]:
# show the bin index assigned to each background sequence (ascending GC order)
quantile_bins

array([  0,   0,   0, ..., 199, 199, 199])

In [60]:
# group the background sequences by quantile bin:
# {bin index -> list of the sequences that fall in that bin}
background_database = {bin_id: [] for bin_id in np.unique(quantile_bins)}
for bin_id, sequence in zip(quantile_bins, sorted_background_gc_content["sequence"]):
    background_database[bin_id].append(sequence)

In [61]:
# (bin indices, number of background sequences in each bin)
bin_ids_and_counts = np.unique(quantile_bins, return_counts = True)

print("Quantiles:", quantile_values)
print()
print("Number of Sequences Per Quantile:", bin_ids_and_counts)

Quantiles: [0.      0.2996  0.3114  0.3176  0.322   0.3256  0.3286  0.331   0.3332
 0.3352  0.337   0.3388  0.3406  0.342   0.3436  0.345   0.3462  0.3474
 0.3486  0.3498  0.351   0.352   0.3532  0.3542  0.3552  0.3562  0.3572
 0.3582  0.3592  0.36    0.361   0.3618  0.3628  0.3636  0.3646  0.3654
 0.3662  0.367   0.3678  0.3686  0.3694  0.3702  0.371   0.3718  0.3726
 0.3734  0.3742  0.3748  0.3756  0.3764  0.377   0.3778  0.3784  0.3792
 0.3798  0.3806  0.3812  0.382   0.3828  0.3834  0.3842  0.3848  0.3854
 0.3862  0.3868  0.3874  0.3882  0.3888  0.3896  0.3902  0.3908  0.3916
 0.3922  0.3928  0.3936  0.3942  0.39486 0.3956  0.3962  0.397   0.3976
 0.3984  0.399   0.3996  0.4004  0.401   0.4016  0.4024  0.403   0.4036
 0.4044  0.405   0.4058  0.4064  0.407   0.4078  0.4084  0.4092  0.4098
 0.4106  0.4112  0.412   0.4126  0.4134  0.4142  0.4148  0.4156  0.4164
 0.4172  0.4178  0.4186  0.4194  0.4202  0.4208  0.4216  0.4224  0.4232
 0.424   0.4248  0.4256  0.4264  0.4272  0.428   0.42

#### Perform Paired Sampling of Background

In [62]:
# function to perform paired sampling
def paired_sample(seq):
    """Pick a random background sequence whose GC content matches `seq`.

    The GC content of `seq` is placed into a bin using the interior values of
    the module-level `quantile_values`, and one sequence is drawn uniformly at
    random from `background_database[bin]`.

    If that bin is missing from `background_database` or holds no sequences,
    the draw falls back to the populated bin with the closest index (the
    lower index wins a tie) instead of failing.

    Parameters
    ----------
    seq : str
        Query DNA sequence.

    Returns
    -------
    str
        A sequence taken from `background_database`.

    Raises
    ------
    KeyError
        If `background_database` contains no sequences at all.
    """
    # get the background seq quantile that the input seq falls in
    quantile = int(np.digitize(get_gc_content(seq), quantile_values[1:-1]))

    candidates = background_database.get(quantile)
    if not candidates:
        # bins only exist for indices observed in the background, so a query
        # can land in a bin that has no sequences to draw from
        populated = [b for b, members in background_database.items() if len(members) > 0]
        if not populated:
            raise KeyError("background_database contains no sequences to sample from")
        nearest = min(populated, key=lambda b: (abs(int(b) - quantile), int(b)))
        candidates = background_database[nearest]

    # generate a random index within range of the number of seqs in that quantile
    random_index = random.randint(0, len(candidates) - 1)
    return candidates[random_index]  # return corresponding background seq

In [63]:
# seed the sampler so that re-running the notebook draws the same background set
sampling_seed = 0
random.seed(sampling_seed)

# draw one GC-matched background sequence per foreground sequence; the draws
# are independent, so the same background sequence can be picked more than once
biased_background_seqs = [paired_sample(seq) for seq in foreground_seqs]

#### Get Nucleotide Frequency Statistics

In [64]:
def calc_freqs(seqs):
    """Print the relative frequency of each nucleotide symbol across `seqs`.

    Sequences are uppercased before counting. One line "<symbol> <frequency>"
    is printed for each of A, C, G, T and N, in that order, where the
    frequency is the symbol's count divided by the total number of characters.
    Any other symbol that occurs (e.g. an ambiguity code) is counted as well
    and printed after those five, in order of first appearance.

    When `seqs` contains no characters, every frequency is printed as 0.0.

    Parameters
    ----------
    seqs : iterable of str
        Sequences to summarise.

    Returns
    -------
    None
    """
    freqs = {"A" : 0, "C" : 0, "G" : 0, "T" : 0, "N" : 0}
    total_nucs = 0
    for seq in seqs:
        seq = seq.upper()
        for char in seq:
            # .get keeps unexpected symbols from raising KeyError
            freqs[char] = freqs.get(char, 0) + 1
        total_nucs += len(seq)
    for symbol, count in freqs.items():
        # avoid dividing by zero when there is nothing to count
        print(symbol, count / total_nucs if total_nucs else 0.0)

In [65]:
# nucleotide frequencies for each sequence set, separated by a blank line
report_groups = [
    ("Foreground:", foreground_seqs),
    ("Full Background:", background_seqs),
    ("Sampled Background:", biased_background_seqs),
]
for position, (label, group) in enumerate(report_groups):
    if position > 0:
        print()
    print(label, len(group), "sequences")
    calc_freqs(group)

Foreground: 1493 sequences
A 0.24459772270596114
C 0.24410743469524449
G 0.24493945077026122
T 0.244967180174146
N 0.02138821165438714

Full Background: 199136 sequences
A 0.2781438002169372
C 0.21005601297605656
G 0.21022038004178048
T 0.27860511610155875
N 0.02297469066366704

Sampled Background: 1493 sequences
A 0.24654722036168789
C 0.24465733422638983
G 0.2444826523777629
T 0.24604447421299397
N 0.01826831882116544


#### Produce FASTA File

In [66]:
def _write_selected_record(handle, header, chunks, selected):
    """Write one FASTA record to `handle` if its sequence is in `selected`.

    `chunks` holds the stripped sequence lines of the record. Nothing is
    written when `chunks` is empty or the joined sequence is not selected.
    The sequence is written on a single line after its header line.
    """
    sequence = "".join(chunks)
    if len(chunks) != 0 and sequence in selected:
        handle.write(f"{header}\n")
        handle.write(f"{sequence}\n")


# Selection is by sequence content: the set collapses background sequences that
# were sampled more than once, and every background record whose sequence
# equals a sampled one is written together with its original header.
sequences_to_keep = set(biased_background_seqs)

with open(output_file, "w") as out, open(background_seqs_file) as bs:
    curr_header = None
    curr_seq = []
    for line in bs:
        if not line.startswith(">"):
            curr_seq.append(line.strip())
            continue
        # a new header ends the previous record; emit it before starting over
        _write_selected_record(out, curr_header, curr_seq, sequences_to_keep)
        curr_header = line.strip()
        curr_seq = []

    # the final record is not followed by another header
    _write_selected_record(out, curr_header, curr_seq, sequences_to_keep)