# DNA mismatch graph
## NCATS Hackathon 2019-09-17



Trying to explain "How do mutations in MSH2 and MSH6 prevent DNA mismatch repair?"

<img src="img/dna_mismatch.png" width="1200">

**IMPORTANT**: BioThings Explorer (BTE) currently limits every single query to 100 results, so the graphs below may not show all of the results available from each source.

In [67]:
# Single-edge query dispatcher (imported here; not used in the cells visible below)
from biothings_explorer.user_query_dispatcher import SingleEdgeQueryDispatcher
# Hint: suggests candidate nodes for a free-text search term
from biothings_explorer.hint import Hint
# Registry: passed to FindConnection via its `registry` argument below
from biothings_explorer.registry import Registry
reg = Registry()
ht = Hint()
# Connect class (imported only; nothing is initialized here)
from biothings_explorer.user_query_dispatcher import Connect

# FindConnection: used below to connect a gene node to the DNA repair node
from biothings_explorer.user_query_dispatcher import FindConnection

from collections import Counter
import networkx as nx


## Find MSH genes and DNA repair nodes

In [13]:
def first_hint(term, semantic_type):
    """Return the first Hint suggestion of ``semantic_type`` for ``term``.

    The hint module groups its suggestions by semantic type; this picks the
    first entry of the requested group.

    Args:
        term: free-text search string, e.g. ``'MSH2'``.
        semantic_type: key of the group to read, e.g. ``'Gene'``.

    Returns:
        The first suggestion in the requested group.

    Raises:
        LookupError: if the group is missing or empty. ``LookupError`` is the
            parent of the ``KeyError``/``IndexError`` the bare indexing raised.
    """
    suggestions = ht.query(term)
    try:
        return suggestions[semantic_type][0]
    except (KeyError, IndexError):
        raise LookupError(
            "no {} suggestion found for {!r}".format(semantic_type, term)
        )


# use the hint module to let BioThings Explorer suggest the input nodes
node_msh2 = first_hint('MSH2', 'Gene')
print(node_msh2)

node_msh6 = first_hint('MSH6', 'Gene')
print(node_msh6)

node_dnarepair = first_hint('dna repair', 'BiologicalProcess')
print(node_dnarepair)

{'entrez': '4436', 'name': 'mutS homolog 2', 'symbol': 'MSH2', 'taxonomy': 9606, 'umls': 'C0879290', 'display': 'entrez(4436) name(mutS homolog 2) symbol(MSH2) taxonomy(9606) umls(C0879290) ', 'type': 'Gene', 'primary': {'identifier': 'entrez', 'cls': 'Gene', 'value': '4436'}}
{'entrez': '2956', 'name': 'mutS homolog 6', 'symbol': 'MSH6', 'taxonomy': 9606, 'umls': 'C0879393', 'display': 'entrez(2956) name(mutS homolog 6) symbol(MSH6) taxonomy(9606) umls(C0879393) ', 'type': 'Gene', 'primary': {'identifier': 'entrez', 'cls': 'Gene', 'value': '2956'}}
{'name': 'DNA Repair', 'umls': 'C0012899', 'display': 'name(DNA Repair) umls(C0012899) ', 'type': 'BiologicalProcess', 'primary': {'identifier': 'umls', 'cls': 'BiologicalProcess', 'value': 'C0012899'}}


## find paths between MSH2 and DNA repair

In [27]:
# Set up the connection search between the MSH2 gene node and the DNA repair
# node, sharing the registry created in the setup cell, then run it.
# The resulting graph is read from fc.G in the cells that follow.
fc = FindConnection(node_msh2, node_dnarepair, registry=reg)
fc.connect()

start to query from entrez:4436
1st query completed
start to query from umls:C0012899
2nd query completed
completed!


In [34]:
# number of nodes in the MSH2 <-> DNA repair graph
len(fc.G.nodes())


1207

## find paths between MSH6 and DNA repair

In [35]:
# Same search as for MSH2, but starting from the MSH6 gene node; the DNA repair
# node and the registry are reused. The resulting graph is read from fc2.G below.
fc2 = FindConnection(node_msh6, node_dnarepair, registry=reg)
fc2.connect()

start to query from entrez:2956
1st query completed
start to query from umls:C0012899
2nd query completed
completed!


In [36]:
# number of nodes in the MSH6 <-> DNA repair graph
len(fc2.G.nodes())


1138

## find repeated intermediate nodes

In [48]:
# Concatenate the node IDs of both connection graphs (MSH2 first, then MSH6).
# The list holds every node of each graph, not only the intermediate ones.
intermediate_node_list = list(fc.G.nodes()) + list(fc2.G.nodes())
len(intermediate_node_list)

2345

In [81]:
# Tally how often each node ID occurs in the combined node list.
# NOTE(review): presumably each graph lists a node ID only once, so a count
# of 2 means the ID is present in both graphs — confirm.
intermediate_node_counts = Counter(intermediate_node_list)
intermediate_node_counts

Counter({'4436': 2,
         'UBERON:0001630': 2,
         'UBERON:0004819': 1,
         '0000015': 1,
         'UBERON:0000467': 2,
         'UBERON:0014892': 1,
         'UBERON:0001231': 1,
         'UBERON:0001728': 1,
         'UBERON:0001831': 1,
         'UBERON:0001911': 1,
         'UBERON:0002108': 1,
         'UBERON:0002384': 1,
         'UBERON:0002385': 1,
         'UBERON:0002450': 1,
         'UBERON:0000966': 1,
         'UBERON:0000995': 1,
         'UBERON:0000006': 1,
         'UBERON:0000074': 1,
         'UBERON:0000977': 1,
         'UBERON:0007023': 1,
         '0002336': 1,
         '246319': 1,
         'MONDO:0004975': 1,
         'MONDO:0023113': 1,
         'MONDO:0018630': 2,
         'MONDO:0018618': 2,
         'MONDO:0010159': 2,
         'MONDO:0005835': 2,
         'MONDO:0007356': 2,
         'MONDO:0008018': 2,
         'HP:0000505': 2,
         'HP:0001371': 2,
         'HP:0000716': 2,
         'HP:0001250': 2,
         'HP:0000708': 2,
         '

In [89]:
# number of distinct node IDs across both graphs
len(intermediate_node_counts)

1349

In [101]:
# Group the node IDs that were counted twice (once per connection graph) by
# their semantic type: {semantic type: [node IDs]}.
# The loop variable is deliberately not called `id`: at notebook-global scope
# that name would replace the built-in id() for the rest of the kernel session.
shared_nodes = {}
for node_id, occurrences in intermediate_node_counts.items():
    if occurrences == 2:
        # a node counted twice is also in fc.G, so its 'type' can be read there
        node_type = fc.G.nodes[node_id]['type']
        shared_nodes.setdefault(node_type, []).append(node_id)
shared_nodes

{'Gene': ['4436',
  '2348',
  '30858',
  '360',
  '143',
  '2468',
  '6601',
  '6947',
  '6944',
  '25737',
  '23168',
  '11111',
  '882',
  '952',
  '795',
  '8729',
  '29079',
  '8125',
  '8031',
  '17642',
  '28569',
  '1058',
  '1101',
  '1100',
  '16627',
  '14064',
  '24156',
  '24624',
  '12572',
  '12816',
  '12814',
  '12831',
  '3437',
  '3433',
  '3439',
  '3436',
  '12791',
  '12799',
  '9823',
  '9820',
  '9816',
  '9817',
  '9811',
  '7230',
  '7128',
  '7127',
  '11998',
  '11992',
  '9949',
  '7326',
  '9970',
  '23845',
  '7652',
  '7527',
  '7553',
  '21484',
  '18398',
  '9122',
  '9121',
  '9177',
  '9175',
  '9173',
  '9413',
  '7059',
  '28209',
  '29814',
  '20465',
  '1925',
  '1867',
  '29452',
  '16696',
  '3650',
  '16909',
  '3078',
  '7327',
  'C0920274',
  'C2681922',
  'C1705982',
  'C0022457',
  'C1708838',
  'C0031678',
  'C1421361',
  'C1366628',
  'C1705556',
  'C2827446',
  'C0017337',
  'C0879392',
  'C3811713',
  'C0879389',
  'C0879393',
  'C13333

How many shared nodes are there of each semantic type?

In [108]:
# one line per semantic type: "<type>: <number of shared node IDs>"
for semantic_type, members in shared_nodes.items():
    member_count = len(members)
    print(semantic_type + ": " + str(member_count))

Gene: 279
AnatomicalEntity: 2
DiseaseOrPhenotypicFeature: 280
PhenotypicFeature: 79
MolecularActivity: 17
BiologicalProcess: 182
CellularComponent: 35
Pathway: 9
ChemicalSubstance: 112
Cell: 1
