# Compute length and trinucleotide composition of mappable bins

For each set of bins, with bin sizes ranging from 1 Mbp down to 10 kbp, compute the total length covered and the sequence composition (trinucleotide content).

In [1]:
from collections import defaultdict
from collections import Counter
import gzip
import json

from bgreference import hg38
import pandas as pd
from tqdm import tqdm

In [2]:
main_dir = ''

In [3]:
def rev_comp(seq):
    """Return the reverse complement of a DNA sequence.

    Only the uppercase bases A, C, G and T are supported; any other
    character raises ``KeyError``.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # Walk the sequence back to front, swapping each base for its partner
    return ''.join(pairing[base] for base in reversed(seq))

In [4]:
# Chromosome names chr1 ... chr22
autosomes = ['chr' + str(number) for number in range(1, 23)]

In [5]:
# Reference trinucleotides with a pyrimidine (C or T) in the central position.
# Ordered by central base first, then by 5' base, then by 3' base.
sorted_trinuc = [
    five_prime + centre + three_prime
    for centre in 'CT'
    for five_prime in 'ACGT'
    for three_prime in 'ACGT'
]

## Compute length 

Check that the bins of every bin size span the same total length of the genome

In [6]:
# Bin sizes in base pairs, from 1 Mbp down to 10 kbp
bins = [1_000_000, 500_000, 250_000, 100_000, 50_000, 25_000, 10_000]

In [7]:
# For each bin size, add up the sequence length returned by hg38 for every
# autosomal interval in the mappable-positions file and print the total.
for binsize in bins:
    bin_name = f'{int(binsize/1000)}kb'
    print(f'Computing {bin_name}')

    bins_file = f'{main_dir}/data/hg38_{bin_name}_bin.nodrivers.filtered.mappable_positions.autosomes.bed.gz'
    length = 0
    with gzip.open(bins_file, 'rt') as fd:
        next(fd)     # the first line is a header
        for line in fd:
            chrom, bed_start, bed_end, binid = line.strip().split('\t')
            if chrom not in autosomes:
                continue
            # BED start is 0-based: shift by one to get the first base to fetch
            start = int(bed_start) + 1
            end = int(bed_end)
            size = end - start + 1
            fetched = hg38(chrom, start, size=size)
            length += len(fetched)
    print(f'Length = {length}')

Computing 1000kb
Length = 2012091302
Computing 500kb
Length = 2012091302
Computing 250kb
Length = 2012091302
Computing 100kb
Length = 2012091302
Computing 50kb
Length = 2012091302
Computing 25kb
Length = 2012091302
Computing 10kb
Length = 2012091302


## Compute trinucleotide composition 

Calculate trinucleotide content for each bin across bin sizes

In [9]:
# Bin sizes in base pairs, listed from the largest (1000 kb) to the smallest (10 kb)
bins = [size_kb * 1000 for size_kb in (1000, 500, 250, 100, 50, 25, 10)]

In [8]:
def trinucleotide_counts_bin(bins_file):
    """Count trinucleotides per bin from a gzipped file of bin intervals.

    The file must have one header line followed by whitespace-separated rows
    of ``chrom, start, end, binid``. Each interval is widened by one base on
    both sides so that every position of the interval contributes the
    trinucleotide centred on it. Counts from all intervals sharing the same
    ``binid`` are added together.

    Args:
        bins_file: path to the gzip-compressed intervals file.

    Returns:
        A ``defaultdict`` mapping each ``binid`` to a ``Counter`` of
        trinucleotide string -> number of occurrences.
    """
    trinucleotides_per_bin = defaultdict(Counter)

    with gzip.open(bins_file, 'rt') as fd:
        next(fd)    # skip header
        for line in tqdm(fd):
            chrom, start, end, binid = line.strip().split()
            # Undoing the BED format gives start + 1; taking one extra base
            # upstream for the trinucleotide context brings it back to start.
            # Clamp to 1 so an interval beginning at BED position 0 never
            # requests position 0.
            start = max(int(start), 1)
            end = int(end) + 1    # one extra base downstream
            size = end - start + 1
            nucleotide_sequence = hg38(chrom, start, size=size)
            # Accumulate in place rather than rebuilding the bin's Counter
            # for every line of the file
            trinucleotides_per_bin[binid].update(
                nucleotide_sequence[i:i + 3]
                for i in range(len(nucleotide_sequence) - 2)
            )

    return trinucleotides_per_bin


In [10]:
# For every bin size:
#   1. count trinucleotides per bin over the mappable positions,
#   2. add the bins listed in the all-positions file that are absent from
#      the mappable-positions counts,
#   3. collapse each trinucleotide with its reverse complement onto the
#      pyrimidine-centred reference trinucleotides,
#   4. write the counts to two JSON files (bins as keys / trinucleotides as keys).
for binsize in bins:

    bin_name = f'{int(binsize/1000)}kb'
    print(f'Computing {bin_name}')

    bins_file = f'{main_dir}/data/hg38_{bin_name}_bin.nodrivers.filtered.mappable_positions.autosomes.bed.gz'
    all_bins_f = f'{main_dir}/data/hg38_{bin_name}_bin.nodrivers.filtered.all_positions.autosomes.bed.gz'

    output_f1 = f'{main_dir}/data/hg38_{bin_name}_bin.nodrivers.filtered.mappable_positions.autosomes.trinuc_per_bin.json'
    output_f2 = f'{main_dir}/data/hg38_{bin_name}_bin.nodrivers.filtered.mappable_positions.autosomes.trinuc_merged_bins.json'

    # Get trinucleotides per bin
    trinucleotides_per_bin = trinucleotide_counts_bin(bins_file)
    print(f'\tbins with sequence: {len(trinucleotides_per_bin)}')

    # Add missing bins
    all_bins_df = pd.read_csv(all_bins_f, sep='\t', header=0)
    total_bins = all_bins_df['BINID'].unique()
    # Keep the missing bins in their order of appearance in the all-positions
    # file. Iterating a set of strings would give an order that changes from
    # run to run, and that order ends up in both JSON outputs.
    missing_bins = [binid for binid in total_bins if binid not in trinucleotides_per_bin]
    print(f'\tbins missing sequence: {len(missing_bins)}')

    # NOTE(review): a missing bin is given the composition of the whole span
    # parsed from its id ('chrom:start-end') - confirm this is intended.
    for binid in missing_bins:
        chrom, start_end = binid.split(':')
        start, end = start_end.split('-')
        start = max(int(start), 1)    # never request a position before 1
        end = int(end) + 1            # one extra base downstream
        size = end - start + 1
        nucleotide_sequence = hg38(chrom, start, size=size)
        trinucleotides_per_bin[binid] = Counter(
            nucleotide_sequence[i:i + 3]
            for i in range(len(nucleotide_sequence) - 2)
        )

    # Collapse trinucleotide counts per bin
    trinucleotides_per_bin_v2 = {}
    # For each trinucleotide, save its counts across all bins in a list
    # (same bin order as the keys of trinucleotides_per_bin_v2)
    counts_per_trinucleotide = defaultdict(list)
    for binid, counter in trinucleotides_per_bin.items():
        collapsed = {}
        for trinucleotide in sorted_trinuc:
            # add up complementary trinucleotides
            counts = counter[trinucleotide] + counter[rev_comp(trinucleotide)]
            collapsed[trinucleotide] = counts
            counts_per_trinucleotide[trinucleotide].append(counts)
        trinucleotides_per_bin_v2[binid] = collapsed

    # Plain-dict copy of the per-trinucleotide lists for serialisation
    counts_per_trinucleotide_v2 = dict(counts_per_trinucleotide)

    # Trinucleotide counts per bin (bins as keys)
    with open(output_f1, 'w') as ofd:
        json.dump(trinucleotides_per_bin_v2, ofd)

    # Trinucleotide counts across bins (trinucleotide as keys)
    with open(output_f2, 'w') as ofd:
        json.dump(counts_per_trinucleotide_v2, ofd)

Computing 1000kb


11678127it [16:05, 12099.61it/s]


	bins with sequence: 2196
	bins missing sequence: 0


1090it [00:00, 10898.66it/s]

Computing 500kb


11680122it [17:29, 11129.73it/s]


	bins with sequence: 4392
	bins missing sequence: 0


252it [00:00, 2518.95it/s]

Computing 250kb


11684130it [18:51, 10323.90it/s]


	bins with sequence: 8784
	bins missing sequence: 0


913it [00:00, 9122.20it/s]

Computing 100kb


11696068it [18:20, 10625.89it/s]


	bins with sequence: 21940
	bins missing sequence: 20


0it [00:00, ?it/s]

Computing 50kb


11716057it [18:32, 10528.20it/s]


	bins with sequence: 43785
	bins missing sequence: 135


0it [00:00, ?it/s]

Computing 25kb


11756066it [17:09, 11421.86it/s]


	bins with sequence: 87367
	bins missing sequence: 473


0it [00:00, ?it/s]

Computing 10kb


11876173it [17:43, 11167.27it/s]


	bins with sequence: 217968
	bins missing sequence: 1632
