# Compute length and trinucleotide composition of the mappable genome

This notebook computes the total length and the trinucleotide counts for two sets of genomic regions:
1. Mappable genome regions including drivers
2. Mappable genome regions excluding drivers

In [1]:
from collections import defaultdict
from collections import Counter
import gzip
import json

from bgreference import hg38

In [2]:
# Root directory of the project; every data path below is built as
# f'{main_dir}/data/...'.
# NOTE(review): left empty here, so those paths resolve to '/data/...' at the
# filesystem root — presumably meant to be filled in before running.
main_dir = ''

### 1) Compute length and trinucleotide composition of the mappable genome including drivers

In [3]:
# Regions of the mappable genome (drivers included). Gzipped, tab-separated;
# read below as one header line followed by chrom/start/end rows.
genome_file = f'{main_dir}/data/hg38_mappable_genome.tsv.gz'

#### Check length

In [4]:
# Add up the number of bases hg38 returns for every region in `genome_file`
length = 0
with gzip.open(genome_file, 'rt') as regions:
    next(regions)     # the first line is the header
    for record in regions:
        chrom, first, last = record.strip().split('\t')
        first, last = int(first), int(last)
        # Both ends of the interval are counted, hence the +1
        length += len(hg38(chrom, first, size=last - first + 1))

In [5]:
# Total number of bases in the mappable genome (drivers included)
length

2531297102

#### Compute trinucleotide composition

In [6]:
# Destination JSON file for the trinucleotide counts (drivers included)
output_f = f'{main_dir}/data/hg38_mappable_genome.trinucleotide_counts.json'

In [7]:
def rev_comp(seq):
    """Return the reverse complement of a DNA sequence.

    Only the uppercase bases A, C, G and T are supported; any other
    character (for instance 'N') raises ``KeyError``.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # Walk the sequence backwards, swapping each base for its partner
    return ''.join(pairing[base] for base in reversed(seq))

In [8]:
# Reference trinucleotide contexts: the 32 triplets whose central base is a
# pyrimidine (C or T), ordered by central base, then left base, then right base
sorted_trinuc = [
    left + centre + right
    for centre in 'CT'
    for left in 'ACGT'
    for right in 'ACGT'
]

In [9]:
# Trinucleotide counts
# Tally every trinucleotide whose central base lies inside a region of
# `genome_file`. Starting from an empty Counter (rather than a plain dict)
# means missing contexts read as 0 even if the file holds no regions.
trinucleotides_genome = Counter()
with gzip.open(genome_file, 'rt') as fd:
    next(fd)    # skip header
    for line in fd:
        chrom, start, end = line.strip().split('\t')
        # Account for trinucleotide sequence (subtract 1 position from start; add 1 position to end)
        # so that the first and last bases of the region get a full context.
        # NOTE(review): a region starting at position 1 would request position 0 —
        # confirm hg38() copes with that.
        start = int(start) - 1
        end = int(end) + 1
        size = end - start + 1
        nucleotide_sequence = hg38(chrom, start, size=size)
        # Sliding window of width 3; accumulate in place instead of building a
        # brand-new Counter for every region.
        trinucleotides_genome.update(
            nucleotide_sequence[i:i + 3] for i in range(len(nucleotide_sequence) - 2)
        )

In [10]:
# Fold each trinucleotide together with its reverse complement, so that counts
# are keyed by the C/T-centred reference context.
# Contexts that are not looked up here (those containing unknown nucleotides
# such as 'N') are left out of the result.
# The result is a plain dict holding only the 32 reference keys.
trinucleotides_genome_merged = {
    context: trinucleotides_genome[context] + trinucleotides_genome[rev_comp(context)]
    for context in sorted_trinuc
}

In [11]:
# Total number of counted trinucleotides. Contexts that are not made up of
# A/C/G/T only are absent from the merged dict, so this can be lower than
# `length`.
sum(trinucleotides_genome_merged.values())

2531296367

In [12]:
# Save the merged counts as a JSON object keyed by reference trinucleotide
with open(output_f, 'w') as json_out:
    json.dump(trinucleotides_genome_merged, json_out)

### 2) Compute length and trinucleotide composition of the mappable genome excluding drivers

In [14]:
# Regions of the mappable genome with drivers excluded. This rebinds
# `genome_file`, so the cells below read this file instead of the one used in
# section 1.
genome_file = f'{main_dir}/data/hg38_mappable_genome.nodrivers.tsv.gz'

#### Check length

In [15]:
# Add up the number of bases hg38 returns for every region in `genome_file`
length = 0
with gzip.open(genome_file, 'rt') as regions:
    next(regions)     # the first line is the header
    for record in regions:
        chrom, first, last = record.strip().split('\t')
        first, last = int(first), int(last)
        # Both ends of the interval are counted, hence the +1
        length += len(hg38(chrom, first, size=last - first + 1))

In [24]:
# Total number of bases in the mappable genome once drivers are excluded
length

2439219900

In [16]:
# Cancer driver regions. Plain (uncompressed) tab-separated file; read below
# as one header line followed by chrom/start/end rows.
drivers_file = f'{main_dir}/data/cancerdrivers_regions.tsv'

In [17]:
# Add up the number of bases hg38 returns for every region in `drivers_file`
drivers_length = 0
with open(drivers_file, 'r') as regions:
    next(regions)     # the first line is the header
    for record in regions:
        chrom, first, last = record.strip().split('\t')
        first, last = int(first), int(last)
        # Both ends of the interval are counted, hence the +1
        drivers_length += len(hg38(chrom, first, size=last - first + 1))

In [20]:
# Total number of bases spanned by the cancer driver regions.
# NOTE(review): this is larger than the difference between the two mappable
# lengths reported above (2531297102 - 2439219900 = 92077202); presumably some
# driver positions are not mappable, or the driver regions overlap — TODO confirm.
drivers_length

99105646

#### Compute trinucleotide composition

In [18]:
# Destination JSON file for the trinucleotide counts (drivers excluded).
# This rebinds `output_f` from section 1.
output_f = f'{main_dir}/data/hg38_mappable_genome.nodrivers.trinucleotide_counts.json'

In [19]:
def rev_comp(seq):
    """Return the reverse complement of a DNA sequence.

    Only the uppercase bases A, C, G and T are supported; any other
    character (for instance 'N') raises ``KeyError``.
    """
    pairing = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    # Walk the sequence backwards, swapping each base for its partner
    return ''.join(pairing[base] for base in reversed(seq))

In [20]:
# Reference trinucleotide contexts: the 32 triplets whose central base is a
# pyrimidine (C or T), ordered by central base, then left base, then right base
sorted_trinuc = [
    left + centre + right
    for centre in 'CT'
    for left in 'ACGT'
    for right in 'ACGT'
]

In [21]:
# Trinucleotide counts
# Tally every trinucleotide whose central base lies inside a region of
# `genome_file`. Starting from an empty Counter (rather than a plain dict)
# means missing contexts read as 0 even if the file holds no regions.
trinucleotides_genome = Counter()
with gzip.open(genome_file, 'rt') as fd:
    next(fd)    # skip header
    for line in fd:
        chrom, start, end = line.strip().split('\t')
        # Account for trinucleotide sequence (subtract 1 position from start; add 1 position to end)
        # so that the first and last bases of the region get a full context.
        # NOTE(review): a region starting at position 1 would request position 0 —
        # confirm hg38() copes with that.
        start = int(start) - 1
        end = int(end) + 1
        size = end - start + 1
        nucleotide_sequence = hg38(chrom, start, size=size)
        # Sliding window of width 3; accumulate in place instead of building a
        # brand-new Counter for every region.
        trinucleotides_genome.update(
            nucleotide_sequence[i:i + 3] for i in range(len(nucleotide_sequence) - 2)
        )

In [22]:
# Fold each trinucleotide together with its reverse complement, so that counts
# are keyed by the C/T-centred reference context.
# Contexts that are not looked up here (those containing unknown nucleotides
# such as 'N') are left out of the result.
# The result is a plain dict holding only the 32 reference keys.
trinucleotides_genome_merged = {
    context: trinucleotides_genome[context] + trinucleotides_genome[rev_comp(context)]
    for context in sorted_trinuc
}

In [32]:
# Total number of counted trinucleotides. Contexts that are not made up of
# A/C/G/T only are absent from the merged dict, so this can be lower than
# `length`.
sum(trinucleotides_genome_merged.values())

2439219170

In [23]:
# Save the merged counts as a JSON object keyed by reference trinucleotide
with open(output_f, 'w') as json_out:
    json.dump(trinucleotides_genome_merged, json_out)