In [None]:
!/d0/home/adamk/pysccnv/venv/bin/pip install pysam

In [3]:
import pysam
import json
import pprint
pprint = pprint.PrettyPrinter().pprint
import re
import collections
import time

# Detect polyA reads

In [3]:
def polyadenylation_reader(bam_filename, limit = -1):
    polyadenylation_reads = dict()
    
    required_soft_clip_length = 5
    required_T_content = 4

    with pysam.AlignmentFile(bam_filename, "rb") as f:
        for i, line in enumerate(f):
            if i == limit:
                break
            if not line.cigartuples:
                continue
            if not line.cigartuples[0]:
                continue
            if line.cigartuples[0][0] != 4: # 4 = softclip
                continue
            if line.cigartuples[0][1] < required_soft_clip_length:
                continue
            three_prime_sequence = line.seq[:required_soft_clip_length]
            if collections.Counter(three_prime_sequence)["T"] < required_T_content:
                continue
            tags = {i:j for i, j in line.get_tags()}
            try:
                reads_by_gene = polyadenylation_reads.setdefault(tags["GN"], {})            
                reads_by_reference_start = reads_by_gene.setdefault(line.reference_start, [])
                reads_by_reference_start.append([tags["CB"], tags["UB"], line.seq])            
            except KeyError:
                pass
    return polyadenylation_reads

In [4]:
def serialise(o, filename):
    with open(filename, "w") as f:
        json.dump(o, f)

In [5]:
limit = 10**7
start = time.time()
polya_reads = polyadenylation_reader("5k_pbmc_protein_v3_possorted_genome_bam.bam", limit = limit)
stop = time.time()

In [6]:
runtime = stop - start

In [7]:
estimated_runtime = runtime/limit * 245409397

In [8]:
estimated_runtime

489.04225002009036

In [9]:
polya_reads["NADK"]

{1751231: [['AAAGTCCGTGTGTCGC-1',
   'GGCATCAGCTTT',
   'TTTTTTTTTTTTTATTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGACATTTAATACATTTTCAAG'],
  ['ACCCAAACATGCGTGC-1',
   'TTCAGCCTGGAG',
   'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGAC'],
  ['AGAAGTACAGTAGAAT-1',
   'TCTGCGCGCTAC',
   'TTTTTTTTTTTTTTTTTTTTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGACATTTAATACATTTT'],
  ['AGATCCAGTGAGGATC-1',
   'TATACATAGTTA',
   'TTTTTTTTTTTTTTTTTTTTTTTTTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGACATTTAATAC'],
  ['AGCCAATCACAAACGG-1',
   'GATTAAAGCTTA',
   'TTTTTTTTTTTTTATTTTTTTTATTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAACGTAATAAGGCTTTCAATGACATTTAATAC'],
  ['AGCTCAAAGTGGACGT-1',
   'TCGTGGATCAGA',
   'TTTTTTTTTTTTTTTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTTTCAATGACATTTAATACATTTTCAAGAAATT'],
  ['AGTAGCTAGCTAATGA-1',
   'TTGTCTGATTAT',
   'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTATTTTCTACAAGTCTTGTTTATTGAAAGGATCTGAAAAGCGTAATAAGGCTT

In [None]:
start = time.time()
polya_reads = polyadenylation_reader("5k_pbmc_protein_v3_possorted_genome_bam.bam")
stop = time.time()

In [None]:
stop - start

In [None]:
serialise(polya_reads, "polyadenylation_reads.json")

# Load up the polya_reads

In [4]:
def deserialise(filename):
    with open(filename, "r") as f:
        o = json.load(f)
    return o

In [5]:
polya_reads = deserialise("polyadenylation_reads.json")

In [6]:
len([key for key in polya_reads])

6949

In [7]:
counts = {}
for gene, gene_data in polya_reads.items():
    for location, sequence_data in gene_data.items():
        UMI_per_location = collections.Counter()
        for CB, UMI, seq in sequence_data:
            UMI_per_location[(CB, UMI)] += 1
        counts[gene, location] = len(UMI_per_location)

In [8]:
counts

{('AL627309.1', '89543'): 2,
 ('LINC00115', '826205'): 3,
 ('LINC00115', '826714'): 0,
 ('NOC2L', '944211'): 2,
 ('NOC2L', '944213'): 3,
 ('NOC2L', '944214'): 2,
 ('NOC2L', '944215'): 1,
 ('NOC2L', '944216'): 1,
 ('NOC2L', '958260'): 1,
 ('NOC2L', '958266'): 1,
 ('AL645608.8', '995998'): 1,
 ('AL645608.8', '996460'): 1,
 ('AL645608.8', '996508'): 1,
 ('AL645608.8', '996526'): 1,
 ('AL645608.8', '996592'): 1,
 ('HES4', '998961'): 10,
 ('HES4', '998963'): 4,
 ('HES4', '999156'): 0,
 ('TNFRSF18', '1203507'): 3,
 ('TNFRSF4', '1211325'): 1,
 ('TNFRSF4', '1211339'): 1,
 ('TNFRSF4', '1211344'): 10,
 ('SDF4', '1216907'): 10,
 ('SDF4', '1216913'): 6,
 ('SDF4', '1216918'): 1,
 ('SDF4', '1216920'): 1,
 ('SDF4', '1216928'): 1,
 ('SDF4', '1216930'): 11,
 ('SDF4', '1216932'): 2,
 ('SDF4', '1216934'): 3,
 ('SDF4', '1217070'): 1,
 ('SDF4', '1228580'): 1,
 ('C1QTNF12', '1242452'): 2,
 ('UBE2J2', '1253908'): 1,
 ('UBE2J2', '1253911'): 4,
 ('UBE2J2', '1253916'): 1,
 ('UBE2J2', '1253919'): 1,
 ('UBE2J2', 

In [9]:
sorted_by_abundance = sorted([(count, gene_location) for gene_location, count in counts.items() if count > 100], reverse=True)

In [10]:
sorted_by_abundance[::-1]

[(101, ('RPL9', '39454123')),
 (102, ('CTSZ', '58995186')),
 (104, ('CSTB', '43773949')),
 (104, ('GMFG', '39328358')),
 (104, ('TMEM258', '61789129')),
 (104, ('YPEL3', '30092313')),
 (105, ('CD48', '160678745')),
 (106, ('EEF1D', '143579727')),
 (108, ('ARHGDIB', '14942030')),
 (108, ('NDUFB7', '14566077')),
 (109, ('LDHB', '21635341')),
 (111, ('PYCARD', '31201485')),
 (111, ('ZFP36L2', '43222405')),
 (112, ('ANXA5', '121667996')),
 (112, ('KLF6', '3776552')),
 (112, ('NOSIP', '49555710')),
 (115, ('JUN', '58780787')),
 (116, ('EIF4G2', '10797053')),
 (116, ('RPL18', '48615332')),
 (118, ('ENO1', '8861001')),
 (118, ('GMFG', '39328364')),
 (119, ('LCP1', '46125922')),
 (119, ('RPL12', '127447673')),
 (120, ('CFL1', '65854810')),
 (120, ('CHCHD2', '56101572')),
 (121, ('AES', '3053158')),
 (121, ('SPI1', '47354859')),
 (122, ('ICAM3', '10333775')),
 (122, ('TKT', '53225647')),
 (124, ('RPL24', '101681090')),
 (124, ('RPL37', '40832457')),
 (125, ('ATP5F1E', '59028684')),
 (125, ('TRI

In [11]:
abundant_sites = {}
for abundance, (gene, location) in sorted_by_abundance:
    assert abundance > 100
    abundant_sites.setdefault(gene, []).append(location)

In [12]:
abundant_sites

{'HLA-B': ['31353871'],
 'RPL30': ['98041720', '98041718'],
 'RPS6': ['19376254'],
 'RPL32': ['12836046', '12836048', '12836025'],
 'TPT1': ['45337176', '45336869'],
 'UBC': ['124911645'],
 'DUSP1': ['172768095'],
 'S100A8': ['153390031'],
 'JUND': ['18279759'],
 'RPL18': ['48615330', '48615327', '48615332'],
 'HLA-C': ['31268748'],
 'S100A6': ['153534598'],
 'RPL8': ['144789768'],
 'RPS16': ['39433206', '39433223', '39433216'],
 'RPL22': ['6186614'],
 'RPS14': ['150444237', '150444228'],
 'RPS4X': ['72272602', '72272611', '72272607'],
 'CD74': ['150401638'],
 'DDX5': ['64499619'],
 'S100A4': ['153543620', '153543621', '153543618'],
 'RPS29': ['49583576', '49583578', '49583571', '49583577'],
 'NFKBIA': ['35401512'],
 'FAU': ['65120629', '65120633'],
 'CST3': ['23633656'],
 'CYBA': ['88643288', '88643290'],
 'ACTG1': ['81509970'],
 'HLA-DRB1': ['32579078', '32578774', '32578768'],
 'TYROBP': ['35904402'],
 'SAMHD1': ['36890396'],
 'H3F3B': ['75778249', '75777458', '75778019'],
 'FCN1': 

In [13]:
{i: sites for i, sites in abundant_sites.items() if len(sites) > 2}

{'RPL32': ['12836046', '12836048', '12836025'],
 'RPL18': ['48615330', '48615327', '48615332'],
 'RPS16': ['39433206', '39433223', '39433216'],
 'RPS4X': ['72272602', '72272611', '72272607'],
 'S100A4': ['153543620', '153543621', '153543618'],
 'RPS29': ['49583576', '49583578', '49583571', '49583577'],
 'HLA-DRB1': ['32579078', '32578774', '32578768'],
 'H3F3B': ['75778249', '75777458', '75778019'],
 'JUN': ['58781485', '58781319', '58781489', '58781382', '58780787'],
 'RPL12': ['127447675', '127447677', '127447673'],
 'RACK1': ['181236936', '181236928', '181236941']}

In [14]:
discover_polya = {}
for gene, gene_data in polya_reads.items():
    count_by_location = discover_polya.setdefault(gene, {})
    for location, sequence_data in gene_data.items():
        UMI_per_location = collections.Counter()
        for CB, UMI, seq in sequence_data:
            UMI_per_location[(CB, UMI)] += 1
        count_by_location[location] = len(UMI_per_location)

In [15]:
discover_polya["JUN"]

{'58780787': 115,
 '58780790': 1,
 '58780791': 1,
 '58780822': 1,
 '58780843': 1,
 '58780869': 1,
 '58780902': 1,
 '58780926': 1,
 '58780941': 0,
 '58780942': 1,
 '58780972': 3,
 '58780978': 1,
 '58780997': 1,
 '58781007': 1,
 '58781023': 3,
 '58781029': 2,
 '58781091': 1,
 '58781097': 1,
 '58781109': 3,
 '58781119': 4,
 '58781122': 1,
 '58781123': 1,
 '58781132': 1,
 '58781133': 1,
 '58781136': 1,
 '58781137': 1,
 '58781138': 1,
 '58781143': 1,
 '58781153': 1,
 '58781174': 1,
 '58781185': 1,
 '58781191': 1,
 '58781202': 3,
 '58781219': 1,
 '58781225': 1,
 '58781228': 1,
 '58781232': 1,
 '58781238': 2,
 '58781260': 1,
 '58781296': 1,
 '58781297': 1,
 '58781310': 66,
 '58781311': 1,
 '58781312': 1,
 '58781314': 2,
 '58781315': 1,
 '58781316': 1,
 '58781318': 4,
 '58781319': 281,
 '58781320': 3,
 '58781321': 10,
 '58781322': 8,
 '58781328': 2,
 '58781329': 1,
 '58781332': 2,
 '58781355': 1,
 '58781356': 1,
 '58781365': 2,
 '58781377': 1,
 '58781380': 1,
 '58781381': 3,
 '58781382': 174,


In [17]:
polya_reads["JUN"]["58781319"]

[['AAAGGATGTCCTCAGG-1',
  'TGCTTGTTCAGA',
  'TTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAAAA'],
 ['AAAGTCCCAAATTAGG-1',
  'GTTGTCTTACAT',
  'TTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAAAA'],
 ['AACCTGAGTCGAGCTC-1',
  'GTGTGTATTGAG',
  'TTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAAAAAAT'],
 ['AAGAACACAACGGCCT-1',
  'TCCTTTGACATC',
  'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAAC'],
 ['AAGCCATCAGTCACGC-1',
  'CTCGGTCCAAAC',
  'TTTTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAA'],
 ['AAGGTAAAGGAAGAAC-1',
  'TATGATGGCTGA',
  'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAA'],
 ['AAGTACCCAGCATGCC-1',
  'TCAATTTCTGGC',
  'TTTTTTTTTTATTTTTTTTTACTTTCCAATAGTTTATTGTATTTTCTTAAATATCCTTTCTGGAATTTTCAGAAACAAAACATAAAAAAAT'],
 ['AAGTACCCAGCATGCC-

In [16]:
polya_reads["JUN"]["58780787"]

[['AAGCCATAGCGGATCA-1',
  'AACAGCATACGG',
  'TTTTTTTATTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTTTATCTAGGAATTGT'],
 ['AAGGAATGTCGCAGTC-1',
  'GAGGTTTCGAAA',
  'TTTTTTTTTTTTTTTTTTTATTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAG'],
 ['AAGGTAATCTATCGCC-1',
  'ATTGTTTAATCC',
  'TTTTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTTTATCTA'],
 ['AATCACGTCGAGCACC-1',
  'ATGCCATTTCAA',
  'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGC'],
 ['AATGACCCAGCTGTAT-1',
  'TAGAAATTTTTG',
  'TTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCCCCATGTACTCTGCGTT'],
 ['ACAAGCTGTTGCCTAA-1',
  'AGTCGCCCGTGT',
  'TTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTTTATCTAGG'],
 ['ACAGAAACACGGTGTC-1',
  'TCAGCTGATTGC',
  'TTTTTTTTTTTTTTTTTTTTTTTTGGTATTTGAATACATTTATTGTGACAAGAATGCTGTTATAAATATTCATAAGCAAAGGCCATCTTTT'],
 ['ACAGAAATCACGTAGT-

# ATP1B1

Here: [ATP1B1 paper](https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0076290)

# Reference

Match -- 0

Insertion -- 1

Deletion -- 2

Splice -- 3

Softclip -- 4

Hardclip (not good, rerun the alignment) -- 5

In [None]:
sorted([(j, i) for i, j in c.items()], reverse=True)

```
[(798512, (0,)),
 (143947, (4, 0)),
 (46502, (0, 4)),
 (3855, (4, 0, 4)),
 (3518, (0, 1, 0)),
 (1662, (0, 2, 0)),
 (1380, (0, 3, 0)),
 (133, (4, 0, 1, 0)),
 (123, (0, 1, 0, 4)),
 (109, (4, 0, 3, 0)),
 (101, (0, 3, 0, 4)),
 (62, (0, 2, 0, 4)),
 (46, (4, 0, 2, 0)),
 (11, (4, 0, 3, 0, 4)),
 (9, (0, 3, 0, 3, 0)),
 (8, (0, 3, 0, 1, 0)),
 (6, (4, 0, 1, 0, 4)),
 (4, (0, 1, 0, 1, 0)),
 (3, (4, 0, 2, 0, 4)),
 (3, (0, 1, 0, 2, 0)),
 (2, (0, 2, 0, 1, 0)),
 (1, (4, 0, 1, 0, 2, 0)),
 (1, (0, 2, 0, 2, 0)),
 (1, (0, 1, 0, 3, 0)),
 (1, (0, 1, 0, 1, 0, 4))]
```