## Why is ATAC paired data performing so poorly?

In [1]:
import pickle

In [112]:
from pprint import pprint

In [28]:
from collections import Counter

In [3]:
graphs = pickle.load(open('input/graphs.pickle','rb'))

In [4]:
len(graphs)

10

In [57]:
atac_graphs = {k:g for g,k in graphs if 'atac_region' in k}

In [58]:
atac_graphs

{('tad',
  'overlaps',
  'atac_region'): <networkx.classes.digraph.DiGraph at 0x7f9bab821d30>,
 ('atac_region',
  'overlaps',
  'gene'): <networkx.classes.digraph.DiGraph at 0x7f9b927d9d00>,
 ('enhancer',
  'overlaps',
  'atac_region'): <networkx.classes.digraph.DiGraph at 0x7f9bab88a160>,
 ('atac_region',
  'neighbors',
  'gene'): <networkx.classes.digraph.DiGraph at 0x7f9bab88a460>}

In [125]:
in_edges = {}
for relation, graph in atac_graphs.items():
    graph = graph.to_undirected()
    in_edges[relation] = Counter()
    for node in list(graph.nodes()):
        # t=node type, k=node key
        t, k = node
        if t == 'atac_region':
            ie = graph.edges(node)
            if ie:
                n = len(list(ie))
                in_edges[relation][n]+=1
            else:
                in_edges[relation][0]+=1
    print(relation)
    pprint(sorted(in_edges[relation].items()))

('tad', 'overlaps', 'atac_region')
[(1, 116232)]
('atac_region', 'overlaps', 'gene')
[(0, 8615), (1, 14628), (2, 617), (3, 19), (4, 2), (5, 2), (14, 1)]
('enhancer', 'overlaps', 'atac_region')
[(1, 9739),
 (2, 2528),
 (3, 1234),
 (4, 619),
 (5, 370),
 (6, 240),
 (7, 164),
 (8, 111),
 (9, 92),
 (10, 58),
 (11, 48),
 (12, 45),
 (13, 20),
 (14, 16),
 (15, 20),
 (16, 11),
 (17, 21),
 (18, 11),
 (19, 10),
 (20, 9),
 (21, 6),
 (22, 6),
 (23, 4),
 (24, 5),
 (25, 3),
 (27, 5),
 (29, 3),
 (30, 6),
 (35, 1),
 (36, 3),
 (37, 1),
 (38, 1),
 (39, 1),
 (43, 2)]
('atac_region', 'neighbors', 'gene')
[(0, 32416), (1, 52243), (2, 27877), (3, 3929)]


In [64]:
import networkx as nx

In [65]:
combined = nx.Graph()

in_edges = {}
for relation, graph in atac_graphs.items():
    graph = graph.to_undirected()
    combined = nx.compose(combined, graph)

In [66]:
combined.number_of_nodes()

168006

In [67]:
combined.number_of_edges()

268938

In [None]:
combined_in_edges = Counter()
for node in list(combined.nodes()):
    # t=node type, k=node key
    t, k = node
    if t == 'atac_region':
        ie = combined.edges(node)
        if ie:
            n = len(list(ie))
            combined_in_edges[n]+=1
        

In [120]:
c=0
for num_edges, count in combined_in_edges.most_common():
    c += count
    print(f'{num_edges:3d} {count:6d} {c}')

  2  48641 48641
  3  30126 78767
  1  26886 105653
  4   7165 112818
  5   1586 114404
  6    654 115058
  7    417 115475
  8    249 115724
  9    162 115886
 10    118 116004
 11     83 116087
 12     55 116142
 13     50 116192
 14     39 116231
 19     23 116254
 15     21 116275
 16     20 116295
 17     15 116310
 18     13 116323
 20     11 116334
 21      9 116343
 22      8 116351
 24      8 116359
 27      6 116365
 23      5 116370
 25      5 116375
 31      3 116378
 39      3 116381
 32      3 116384
 30      3 116387
 29      2 116389
 41      2 116391
 33      2 116393
 37      2 116395
 46      1 116396
 28      1 116397
 44      1 116398
 26      1 116399


In [128]:
sum(combined_in_edges.values())

116399

## Why are there ATAC regions with zero neighboring genes? 

KDTree should always return k results for nearest neighbor search

In [132]:
relation = ('atac_region', 'neighbors', 'gene')
g = atac_graphs[relation]
c = 0
for node in g.nodes():
    t,k = node
    if t == 'atac_region':
        n = len(combined.edges(node))
        if n==0:
            print(node, n)
            c+=1
            if c==100:
                break

('atac_region', 'chr12-9843-10760') 0
('atac_region', 'chr12-12504-13430') 0
('atac_region', 'chr12-21492-22404') 0
('atac_region', 'chr21-6369504-6370420') 0
('atac_region', 'chr21-6371511-6372344') 0
('atac_region', 'chr21-7775334-7776260') 0
('atac_region', 'chr21-8212819-8213728') 0
('atac_region', 'chr21-8219517-8220424') 0
('atac_region', 'chr21-8234038-8234950') 0
('atac_region', 'chr1-9776-10668') 0
('atac_region', 'chr20-63025546-63026401') 0
('atac_region', 'chr20-63027866-63028775') 0
('atac_region', 'chr20-63032920-63033837') 0
('atac_region', 'chr20-63049603-63050403') 0
('atac_region', 'chr20-63056176-63057074') 0
('atac_region', 'chr20-63071550-63072399') 0
('atac_region', 'chr20-63073124-63073951') 0
('atac_region', 'chr20-63082120-63082947') 0
('atac_region', 'chr20-63135249-63135879') 0
('atac_region', 'chr20-63178907-63179770') 0
('atac_region', 'chr20-63183028-63183941') 0
('atac_region', 'chr20-63441816-63442623') 0
('atac_region', 'chr20-63446211-63446917') 0
('at