# Goal

Compare gapped and ungapped lastz output to see whether the methods yield different amounts of gametolog sequence.

In [36]:
import csv
from tqdm import tqdm

# Paths to the two lastz outputs being compared: gapped vs. ungapped alignment.
# Both are read below as tab-delimited tables with a header row.
gap_file = '../../data/alignment-lastz/lastz-align-10k-gapped.bed'
ungap_file = '../../data/alignment-lastz/lastz-align-10k-ungapped.bed'

In [15]:
# Parse each tab-delimited lastz table into a list with one dict per alignment,
# keyed by the column names in the file's header row.
with open(gap_file, 'r', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    gapped = list(reader)

with open(ungap_file, 'r', newline='') as f:
    reader = csv.DictReader(f, delimiter='\t')
    ungapped = list(reader)

In [23]:
# Inspect the first gapped record to see which lastz fields are available.
gapped[0]

{'#score': '977241',
 'covPct': '3.4%',
 'coverage': '11909/345555',
 'end1': '11708',
 'end2': '11909',
 'idPct': '96.7%',
 'identity': '10969/11345',
 'name1': 'chromosome_6',
 'name2': 'mtMinus',
 'size1': '528439',
 'size2': '345555',
 'strand1': '+',
 'strand2': '+',
 'zstart1': '0',
 'zstart2': '0'}

In [44]:
def populate_intervals(lastz_file, allele = None):
    """Return the set of positions covered by lastz alignments on one allele.

    Parameters
    ----------
    lastz_file : iterable of dict
        Parsed lastz rows. Each row must provide 'zstart1' and 'end1'
        (read when allele is 'plus') or 'zstart2' and 'end2' (read when
        allele is 'minus') as values accepted by int().
    allele : {'plus', 'minus'}
        Selects which pair of coordinate columns is used.

    Returns
    -------
    set of int
        Every position p with start <= p < end for at least one row.
        Overlapping alignments contribute each position only once.

    Raises
    ------
    ValueError
        If allele is neither 'plus' nor 'minus'.
    """
    columns = {'plus': ('zstart1', 'end1'),
               'minus': ('zstart2', 'end2')}
    if allele not in columns:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `intervals`; fail early with a clear message.
        raise ValueError(
            "allele must be 'plus' or 'minus', got {!r}".format(allele))
    start_key, end_key = columns[allele]
    intervals = [[int(line[start_key]), int(line[end_key])]
                 for line in lastz_file]
    # Accumulate straight into a set so overlapping alignments do not
    # inflate an intermediate list of duplicated positions.
    bases_covered = set()
    for start, end in tqdm(intervals):
        bases_covered.update(range(start, end))
    return bases_covered
        
# Positions on the plus allele covered by each alignment mode.
gapped_plus = populate_intervals(gapped, allele='plus')
ungapped_plus = populate_intervals(ungapped, allele='plus')

print(len(gapped_plus), len(ungapped_plus))

# Positions covered by the gapped alignments but not by the ungapped ones.
difference_plus = gapped_plus - ungapped_plus
print(len(difference_plus))

100%|██████████| 260/260 [00:00<00:00, 10393.32it/s]
100%|██████████| 1003/1003 [00:00<00:00, 46371.19it/s]

358673 312300
46373





In [47]:
sorted(list(difference_plus))[:20]

[209,
 210,
 211,
 212,
 213,
 214,
 215,
 216,
 217,
 218,
 219,
 220,
 221,
 222,
 223,
 224,
 225,
 226,
 227,
 228]

In [49]:
# How do the two reference sequences compare over positions 209-282?
# That stretch is covered by the gapped alignment but missing from the ungapped one.
from Bio import SeqIO

plus_fasta = SeqIO.read('../../data/references/mt_plus.fasta', 'fasta')
minus_fasta = SeqIO.read('../../data/references/mt_minus.fasta', 'fasta')

# NOTE(review): the same [209:282] slice is applied to both references;
# presumably these are plus-allele coordinates, so the minus window may not be
# the exact aligned region -- confirm against the alignment if that matters.
for f in (plus_fasta, minus_fasta):
    print(str(f.seq[209:282]))

CATCACAGGTAGCAAAGCAGCTCAAGCAAGGCCCGGCCAGGAGCCGGTCATTGTCAAACCCCGCACAGGCAAT
GTCACAGGTAGTAAAGCAACTCAAGCAAGGCCGGCCAGGGGCCGGTCATTGTCAAACCCCGCACAGGCAATCC


In [51]:
# What share of the plus reference do we lose with ungapped alignment?
# The value printed is a proportion (0-1), not a percentage: the number of
# positions covered only by the gapped alignments divided by the length of
# the plus FASTA sequence.

print(len(difference_plus) / len(plus_fasta.seq))

0.0877546888098721
