In [None]:
!/d0/home/adamk/pysccnv/venv/bin/pip install pysam

In [6]:
import pysam
import json
import pprint
pprint = pprint.PrettyPrinter().pprint
import re
import collections
import time

# Detect polyA reads

In [3]:
def polyadenylation_reader(bam_filename, limit = -1):
    polyadenylation_reads = dict()
    
    required_soft_clip_length = 5
    required_T_content = 4

    with pysam.AlignmentFile(bam_filename, "rb") as f:
        for i, line in enumerate(f):
            if i == limit:
                break
            if not line.cigartuples:
                continue
            if not line.cigartuples[0]:
                continue
            if line.cigartuples[0][0] != 4: # 4 = softclip
                continue
            if line.cigartuples[0][1] < required_soft_clip_length:
                continue
            three_prime_sequence = line.seq[:required_soft_clip_length]
            if collections.Counter(three_prime_sequence)["T"] < required_T_content:
                continue
            tags = {i:j for i, j in line.get_tags()}
            try:
                reads_by_gene = polyadenylation_reads.setdefault(tags["GN"], {})            
                reads_by_reference_start = reads_by_gene.setdefault(line.reference_start, [])
                reads_by_reference_start.append([tags["CB"], tags["UB"], line.seq])            
            except KeyError:
                pass
    return polyadenylation_reads

In [4]:
def serialise(o, filename):
    with open(filename, "w") as f:
        json.dump(o, f)

In [5]:
limit = 10**7
start = time.time()
polya_reads = polyadenylation_reader("5k_pbmc_protein_v3_possorted_genome_bam.bam", limit = limit)
stop = time.time()

In [6]:
runtime = stop - start

In [7]:
estimated_runtime = runtime/limit * 245409397

In [8]:
estimated_runtime

489.04225002009036

In [9]:
polya_reads["NADK"]

{1751231: [['AAAGTCCGTGTGTCGC-1',
   'GGCATCAGCTTT',
   'TTTTTTTTTTTTTATTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGACATTTAATACATTTTCAAG'],
  ['ACCCAAACATGCGTGC-1',
   'TTCAGCCTGGAG',
   'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGAC'],
  ['AGAAGTACAGTAGAAT-1',
   'TCTGCGCGCTAC',
   'TTTTTTTTTTTTTTTTTTTTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGACATTTAATACATTTT'],
  ['AGATCCAGTGAGGATC-1',
   'TATACATAGTTA',
   'TTTTTTTTTTTTTTTTTTTTTTTTTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGACATTTAATAC'],
  ['AGCCAATCACAAACGG-1',
   'GATTAAAGCTTA',
   'TTTTTTTTTTTTTATTTTTTTTATTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAACGTAATAAGGCTTTCAATGACATTTAATAC'],
  ['AGCTCAAAGTGGACGT-1',
   'TCGTGGATCAGA',
   'TTTTTTTTTTTTTTTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGACATTTAATACATTTTCAAGAAATT'],
  ['AGTAGCTAGCTAATGA-1',
   'TTGTCTGATTAT',
   'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTT

In [None]:
start = time.time()
polya_reads = polyadenylation_reader("5k_pbmc_protein_v3_possorted_genome_bam.bam")
stop = time.time()

In [None]:
stop - start

In [None]:
serialise(polya_reads, "polyadenylation_reads.json")

# Load up the polya_reads

In [26]:
def deserialise(filename):
    with open(filename, "r") as f:
        o = json.load(f)
    return o

In [27]:
polya_reads = deserialise("polyadenylation_reads.json")

In [28]:
len([key for key in polya_reads])

6949

In [29]:
counts = {}
for gene, gene_data in polya_reads.items():
    for location, sequence_data in gene_data.items():
        UMI_per_location = collections.Counter()
        for CB, UMI, seq in sequence_data:
            UMI_per_location[(CB, UMI)] += 1
        counts[gene, location] = len(UMI_per_location)

In [31]:
sorted_by_abundance = sorted([(count, gene_location) for gene_location, count in counts.items() if count > 100], reverse=True)

In [33]:
abundant_sites = {}
for abundance, (gene, location) in sorted_by_abundance:
    assert abundance > 100
    abundant_sites.setdefault(gene, []).append(location)

In [41]:
{i: sites for i, sites in abundant_sites.items() if len(sites) > 1}

{'RPL30': ['98041720', '98041718'],
 'RPL32': ['12836046', '12836048', '12836025'],
 'TPT1': ['45337176', '45336869'],
 'RPL18': ['48615330', '48615327', '48615332'],
 'RPS16': ['39433206', '39433223', '39433216'],
 'RPS14': ['150444237', '150444228'],
 'RPS4X': ['72272602', '72272611', '72272607'],
 'S100A4': ['153543620', '153543621', '153543618'],
 'RPS29': ['49583576', '49583578', '49583571', '49583577'],
 'FAU': ['65120629', '65120633'],
 'CYBA': ['88643288', '88643290'],
 'HLA-DRB1': ['32579078', '32578774', '32578768'],
 'H3F3B': ['75778249', '75777458', '75778019'],
 'RPL35': ['124857882', '124857884'],
 'JUN': ['58781485', '58781319', '58781489', '58781382', '58780787'],
 'HLA-DPA1': ['33065015', '33065013'],
 'RPL12': ['127447675', '127447677', '127447673'],
 'RACK1': ['181236936', '181236928', '181236941'],
 'RPL24': ['101681095', '101681090'],
 'ARPC3': ['110434889', '110434905'],
 'GMFG': ['39328364', '39328358']}

In [36]:
discover_polya = {}
for gene, gene_data in polya_reads.items():
    count_by_location = discover_polya.setdefault(gene, {})
    for location, sequence_data in gene_data.items():
        UMI_per_location = collections.Counter()
        for CB, UMI, seq in sequence_data:
            UMI_per_location[(CB, UMI)] += 1
        count_by_location[location] = len(UMI_per_location)

In [43]:
discover_polya["TPT1"]

{'45335960': 1,
 '45336678': 1,
 '45336781': 1,
 '45336865': 88,
 '45336866': 1,
 '45336867': 2,
 '45336869': 188,
 '45336870': 3,
 '45336871': 2,
 '45336872': 0,
 '45336873': 6,
 '45336874': 3,
 '45336875': 5,
 '45336887': 2,
 '45336925': 1,
 '45336966': 4,
 '45336972': 1,
 '45336983': 1,
 '45337022': 1,
 '45337026': 1,
 '45337075': 0,
 '45337080': 1,
 '45337132': 1,
 '45337139': 1,
 '45337163': 1,
 '45337166': 43,
 '45337171': 1,
 '45337172': 1,
 '45337174': 1,
 '45337175': 21,
 '45337176': 1536,
 '45337177': 51,
 '45337178': 33,
 '45337179': 13,
 '45337180': 6,
 '45337181': 3,
 '45337184': 1,
 '45337185': 1,
 '45337186': 2,
 '45337190': 40,
 '45337191': 5,
 '45337203': 1,
 '45337208': 1,
 '45337219': 5,
 '45337227': 3,
 '45337229': 1,
 '45337254': 1,
 '45337273': 1,
 '45337275': 0,
 '45337282': 1,
 '45337283': 1,
 '45337291': 1,
 '45337298': 1,
 '45337306': 1,
 '45337313': 1,
 '45337314': 1,
 '45337317': 1,
 '45337335': 1,
 '45337338': 1,
 '45337348': 1,
 '45337358': 1,
 '45337363':

In [46]:
polya_reads["TPT1"]["45337190"]

[['AACAACCGTGCACATT-1',
  'TCCCCGCTGACA',
  'TTTTTTTTTTTTTTTTTTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAATCACAGTCAAAATAAA'],
 ['ACATGCACACTCTAGA-1',
  'TGGCGACCTCAC',
  'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAATCACAGTC'],
 ['ACATGCACACTCTAGA-1',
  'TGGCGACCTCAC',
  'TTTTTTTTTTTTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAATCACAGTCAAAATAAATGAAGA'],
 ['ACCATTTCATAATGAG-1',
  'AGTCACAGGTGG',
  'TTTTTTTTTTTTTTTATTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAATCACAGTCAAAATAAA'],
 ['ACCATTTCATAATGAG-1',
  'AGTCACAGGTGG',
  'TTTTTTTTTTTTTTTATTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAATCACAGTCAAAATAAA'],
 ['ACTACGATCAGGAGAC-1',
  'CTTTTTTACCAC',
  'TTTTTTTTTTTTTTTTTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAATCACAGTCAAAATAAAT'],
 ['AGCGCTGGTACCGTGC-1',
  'TCTTGGAGTCTT',
  'TTTTTTTTTTTTTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAATCACAGTCAAAATAAATGAAG'],
 ['AGGGTGAGTTGTAGCT-

In [45]:
polya_reads["TPT1"]["45337176"]

[['AAACCCAAGGCCTAGA-1',
  'ATCGACACCGGG',
  'TTTTTTTTTTTTTTTTTTGAGTTTAAATGCATTTTATTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAA'],
 ['AAACCCACATCGGCCA-1',
  'GTCTACGTACGG',
  'TTTTTTTTTTTTTTTGAGTTTAAATGCATTTTATTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAA'],
 ['AAAGAACGTAGTTCCA-1',
  'CTCCGTCGGTTA',
  'TTTTTTTTTTTTTTGAGTTTAAATGCATTTTATTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAA'],
 ['AAAGAACGTGGATCAG-1',
  'TCCCACTCCTGC',
  'TTTTTTTTTTGAGTTTAAATGCATTTTATTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAATCAC'],
 ['AAAGAACGTGTGTCCG-1',
  'AACCGCAACGGA',
  'TTTTTTTTTTTTGAGTTTAAATGCATTTTATTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAATC'],
 ['AAAGGATAGTAGCTCT-1',
  'TATGTCATCGAC',
  'TTTTTTTTTTTGAGTTTAAATGCATTTTATTTTTAGACAACCTACATGACATGTTTTTCTTAAAAAACAATGCCTCCACTCCAAATAAATC'],
 ['AAAGGATCACACTTAG-1',
  'CTTACCTATGGA',
  'TTTTTTTTTTTTTGAGTTTAAATGCATTTTATTTTTAGACAACCTACATGACATGTTTTTCTTAAAAACAATGCCTCCACTCCAAATAAAT'],
 ['AAAGGATGTGATTAGA-

In [38]:
discover_polya["JUN"]

{'58780787': 115,
 '58780790': 1,
 '58780791': 1,
 '58780822': 1,
 '58780843': 1,
 '58780869': 1,
 '58780902': 1,
 '58780926': 1,
 '58780941': 0,
 '58780942': 1,
 '58780972': 3,
 '58780978': 1,
 '58780997': 1,
 '58781007': 1,
 '58781023': 3,
 '58781029': 2,
 '58781091': 1,
 '58781097': 1,
 '58781109': 3,
 '58781119': 4,
 '58781122': 1,
 '58781123': 1,
 '58781132': 1,
 '58781133': 1,
 '58781136': 1,
 '58781137': 1,
 '58781138': 1,
 '58781143': 1,
 '58781153': 1,
 '58781174': 1,
 '58781185': 1,
 '58781191': 1,
 '58781202': 3,
 '58781219': 1,
 '58781225': 1,
 '58781228': 1,
 '58781232': 1,
 '58781238': 2,
 '58781260': 1,
 '58781296': 1,
 '58781297': 1,
 '58781310': 66,
 '58781311': 1,
 '58781312': 1,
 '58781314': 2,
 '58781315': 1,
 '58781316': 1,
 '58781318': 4,
 '58781319': 281,
 '58781320': 3,
 '58781321': 10,
 '58781322': 8,
 '58781328': 2,
 '58781329': 1,
 '58781332': 2,
 '58781355': 1,
 '58781356': 1,
 '58781365': 2,
 '58781377': 1,
 '58781380': 1,
 '58781381': 3,
 '58781382': 174,


In [40]:
polya_reads["JUN"]['58780787']

[['AAGCCATAGCGGATCA-1',
  'AACAGCATACGG',
  'TTTTTTTATTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTTTATCTAGGAATTGT'],
 ['AAGGAATGTCGCAGTC-1',
  'GAGGTTTCGAAA',
  'TTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAG'],
 ['AAGGTAATCTATCGCC-1',
  'ATTGTTTAATCC',
  'TTTTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTTTATCTA'],
 ['AATCACGTCGAGCACC-1',
  'ATGCCATTTCAA',
  'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGC'],
 ['AATGACCCAGCTGTAT-1',
  'TAGAAATTTTTG',
  'TTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCCCCATGTACTCTGCGTT'],
 ['ACAAGCTGTTGCCTAA-1',
  'AGTCGCCCGTGT',
  'TTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTTTATCTAGG'],
 ['ACAGAAACACGGTGTC-1',
  'TCAGCTGATTGC',
  'TTTTTTTTTTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTT'],
 ['ACAGAAATCACGTAGT-

In [23]:
polya_reads["HLA-DRB1"]['32578768']

[['AAAGGATAGAGGCCAT-1',
  'AACTAATTTGCG',
  'TTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAGGAGGCCAT'],
 ['AAAGGATGTGATTAGA-1',
  'CGTTGCTTACGC',
  'TTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAGGAGGCCATAGGGTT'],
 ['AAAGGATGTGATTAGA-1',
  'CTAAACTTAGCC',
  'ATTTTTTTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAG'],
 ['AAAGTGAAGTGCAGCA-1',
  'CTATATTTATCG',
  'TTTTTTTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAGG'],
 ['AACAGGGGTCAACATC-1',
  'TTAGATACTAAA',
  'TTTTTTTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTATAGATGCACAGG'],
 ['AAGACTCCAACCGACC-1',
  'AACTAATTTGTT',
  'TTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAGGAGGCC'],
 ['AAGACTCCAACCGACC-1',
  'AACTAATTTGTT',
  'TTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAGGAGGCC'],
 ['AAGCGAGCATACAGCT-

In [25]:
polya_reads["HLA-DRB1"]

{'32578768': [['AAAGGATAGAGGCCAT-1',
   'AACTAATTTGCG',
   'TTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAGGAGGCCAT'],
  ['AAAGGATGTGATTAGA-1',
   'CGTTGCTTACGC',
   'TTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAGGAGGCCATAGGGTT'],
  ['AAAGGATGTGATTAGA-1',
   'CTAAACTTAGCC',
   'ATTTTTTTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAG'],
  ['AAAGTGAAGTGCAGCA-1',
   'CTATATTTATCG',
   'TTTTTTTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAGG'],
  ['AACAGGGGTCAACATC-1',
   'TTAGATACTAAA',
   'TTTTTTTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTATAGATGCACAGG'],
  ['AAGACTCCAACCGACC-1',
   'AACTAATTTGTT',
   'TTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCACAGGAGGCC'],
  ['AAGACTCCAACCGACC-1',
   'AACTAATTTGTT',
   'TTTTTTTTTTTTTTTTTAACTCCATCTTTGAGAAACATTTAATAATGTAATGTGTTTGTGGTACAGGGTGAGTACAGATGCA

In [15]:
discover_polya["JUN"]

{'58780787': 115,
 '58780790': 1,
 '58780791': 1,
 '58780822': 1,
 '58780843': 1,
 '58780869': 1,
 '58780902': 1,
 '58780926': 1,
 '58780941': 0,
 '58780942': 1,
 '58780972': 3,
 '58780978': 1,
 '58780997': 1,
 '58781007': 1,
 '58781023': 3,
 '58781029': 2,
 '58781091': 1,
 '58781097': 1,
 '58781109': 3,
 '58781119': 4,
 '58781122': 1,
 '58781123': 1,
 '58781132': 1,
 '58781133': 1,
 '58781136': 1,
 '58781137': 1,
 '58781138': 1,
 '58781143': 1,
 '58781153': 1,
 '58781174': 1,
 '58781185': 1,
 '58781191': 1,
 '58781202': 3,
 '58781219': 1,
 '58781225': 1,
 '58781228': 1,
 '58781232': 1,
 '58781238': 2,
 '58781260': 1,
 '58781296': 1,
 '58781297': 1,
 '58781310': 66,
 '58781311': 1,
 '58781312': 1,
 '58781314': 2,
 '58781315': 1,
 '58781316': 1,
 '58781318': 4,
 '58781319': 281,
 '58781320': 3,
 '58781321': 10,
 '58781322': 8,
 '58781328': 2,
 '58781329': 1,
 '58781332': 2,
 '58781355': 1,
 '58781356': 1,
 '58781365': 2,
 '58781377': 1,
 '58781380': 1,
 '58781381': 3,
 '58781382': 174,


In [17]:
polya_reads["JUN"]["58781319"]

[['AAAGGATGTCCTCAGG-1',
  'TGCTTGTTCAGA',
  'TTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAAAA'],
 ['AAAGTCCCAAATTAGG-1',
  'GTTGTCTTACAT',
  'TTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAAAA'],
 ['AACCTGAGTCGAGCTC-1',
  'GTGTGTATTGAG',
  'TTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAAAAAAT'],
 ['AAGAACACAACGGCCT-1',
  'TCCTTTGACATC',
  'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAAC'],
 ['AAGCCATCAGTCACGC-1',
  'CTCGGTCCAAAC',
  'TTTTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAA'],
 ['AAGGTAAAGGAAGAAC-1',
  'TATGATGGCTGA',
  'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAA'],
 ['AAGTACCCAGCATGCC-1',
  'TCAATTTCTGGC',
  'TTTTTTTTTTATTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAAAAAAT'],
 ['AAGTACCCAGCATGCC-

In [16]:
polya_reads["JUN"]["58780787"]

[['AAGCCATAGCGGATCA-1',
  'AACAGCATACGG',
  'TTTTTTTATTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTTTATCTAGGAATTGT'],
 ['AAGGAATGTCGCAGTC-1',
  'GAGGTTTCGAAA',
  'TTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAG'],
 ['AAGGTAATCTATCGCC-1',
  'ATTGTTTAATCC',
  'TTTTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTTTATCTA'],
 ['AATCACGTCGAGCACC-1',
  'ATGCCATTTCAA',
  'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGC'],
 ['AATGACCCAGCTGTAT-1',
  'TAGAAATTTTTG',
  'TTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCCCCATGTACTCTGCGTT'],
 ['ACAAGCTGTTGCCTAA-1',
  'AGTCGCCCGTGT',
  'TTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTTTATCTAGG'],
 ['ACAGAAACACGGTGTC-1',
  'TCAGCTGATTGC',
  'TTTTTTTTTTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTT'],
 ['ACAGAAATCACGTAGT-

# ATP1B1

Here: [ATP1B1 paper](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0076290)

# Reference

Match -- 0

Insertion -- 1

Deletion -- 2

Splice -- 3

Softclip -- 4

Hardclip (not good, rerun the alignment) -- 5

```
[(798512, (0,)),
 (143947, (4, 0)),
 (46502, (0, 4)),
 (3855, (4, 0, 4)),
 (3518, (0, 1, 0)),
 (1662, (0, 2, 0)),
 (1380, (0, 3, 0)),
 (133, (4, 0, 1, 0)),
 (123, (0, 1, 0, 4)),
 (109, (4, 0, 3, 0)),
 (101, (0, 3, 0, 4)),
 (62, (0, 2, 0, 4)),
 (46, (4, 0, 2, 0)),
 (11, (4, 0, 3, 0, 4)),
 (9, (0, 3, 0, 3, 0)),
 (8, (0, 3, 0, 1, 0)),
 (6, (4, 0, 1, 0, 4)),
 (4, (0, 1, 0, 1, 0)),
 (3, (4, 0, 2, 0, 4)),
 (3, (0, 1, 0, 2, 0)),
 (2, (0, 2, 0, 1, 0)),
 (1, (4, 0, 1, 0, 2, 0)),
 (1, (0, 2, 0, 2, 0)),
 (1, (0, 1, 0, 3, 0)),
 (1, (0, 1, 0, 1, 0, 4))]
```