# Disease-[affects]->Anatomy
MeSH -[affects]-> MeSH
- Make an anatomy tree of MeSH IDs and get the anatomy names (Use a downloaded MeSH Tree)
- Make a disease tree of MeSH IDs and get the disease names (performed in disease2disease)
- 'Text mine' PubMed MeSH metadata (see CaseOLAP download and parsing)
- Calculate relationships between Disease MeSH and Anatomy MeSH

In [None]:
# Some inspiration and source code from https://github.com/hetio/medline/blob/main/tissues.ipynb and https://github.com/CaseOLAP/caseolap 

In [4]:
import pandas as pd
import json
import csv

In [None]:
def make_lists(d):
    """Convert every value of ``d`` to a list, in place.

    Args:
        d: Dictionary whose values are iterables (e.g. sets of PMIDs).

    Returns:
        The same dictionary object, with each value replaced by ``list(value)``.
        Useful before ``json.dump`` because sets are not JSON serializable.
    """
    # Snapshot the keys so the loop does not iterate the dict it is rewriting.
    for key in list(d):
        d[key] = list(d[key])
    return d

def make_sets(d):
    """Convert every value of ``d`` to a set, in place.

    Args:
        d: Dictionary whose values are iterables of hashable items
           (e.g. lists of PMIDs loaded from JSON).

    Returns:
        The same dictionary object, with each value replaced by ``set(value)``.
    """
    # Snapshot the keys so the loop does not iterate the dict it is rewriting.
    for key in list(d):
        d[key] = set(d[key])
    return d

In [5]:
# Load the MeSH term name -> MeSH ID lookups written under
# output/anatomy2anatomy and output/disease2disease.
# `with` guarantees the file handles are closed (the bare open() leaked them).
with open('output/anatomy2anatomy/meshterm-IS-meshid.json') as fin:
    anatomy_name2id = json.load(fin)
with open('output/disease2disease/meshterm-IS-meshid.json') as fin:
    disease_name2id = json.load(fin)

# MeSH term names (the JSON keys) of each vocabulary
disease_mesh = list(disease_name2id.keys())
anatomy_mesh = list(anatomy_name2id.keys())

## Anatomy & Disease MeSH -[found in]-> PMID 
Finds the MeSH terms reported in a PubMed article's MeSH Term metadata section. The data was already downloaded from PubMed and parsed. See CaseOLAP download, parsing, and indexing steps.

In [6]:
disease_mesh2pmid, anatomy_mesh2pmid = dict(), dict()
anatomy_pmids, disease_pmids = set(), set()
anatomy_pmidcount, disease_pmidcount = dict(), dict()

# Set views of the term lists: membership is tested once per input line, and
# a list lookup would scan the whole vocabulary every time.
disease_terms = set(disease_mesh)
anatomy_terms = set(anatomy_mesh)

# Load MeSH to PMID mapping.
# Each line of the file is parsed as its own JSON object; only its first
# (term, PMIDs) entry is read.
with open('../../caseolap/data/mesh2pmid.json') as fin:
    for i, raw_line in enumerate(fin):
        record = json.loads(raw_line.strip())
        meshterm, term_pmids = next(iter(record.items()))
        print(i, end='\r')

        is_disease = meshterm in disease_terms
        is_anatomy = meshterm in anatomy_terms
        if not (is_disease or is_anatomy):
            continue

        pmids = set(term_pmids)
        if len(pmids) == 0:
            print(meshterm, 'has 0 PMIDs')
            continue

        # Disease MeSH Term -[studied in]-> PMID
        if is_disease:
            disease_mesh2pmid[meshterm] = pmids
            # Get PMIDs studying disease
            disease_pmids.update(pmids)

        # Anatomy MeSH Term -[studied in]-> PMID
        if is_anatomy:
            anatomy_mesh2pmid[meshterm] = pmids
            # Get PMIDs studying anatomy
            anatomy_pmids.update(pmids)


# Sort dictionaries (alphabetically by MeSH term)
anatomy_mesh2pmid = dict(sorted(anatomy_mesh2pmid.items()))
disease_mesh2pmid = dict(sorted(disease_mesh2pmid.items()))

29998

In [7]:
# Export
with open('output/disease2anatomy/anatomy_mesh2pmid.json','w') as fout:
    json.dump(make_lists(anatomy_mesh2pmid), fout)
    
with open('output/disease2anatomy/disease_mesh2pmid.json','w') as fout:
    json.dump(make_lists(disease_mesh2pmid), fout)
    
print(len(disease_pmids),'Disease PMIDs')
print(len(anatomy_pmids),'Anatomy PMIDs')

16872705 Disease PMIDs
11476702 Anatomy PMIDs


In [4]:
# Import
# Reload the exported term -> PMIDs maps and turn the JSON lists back into sets.
# `with` closes the file handles that the bare open() calls left dangling.
# NOTE(review): this restores only the two maps; anatomy_pmids / disease_pmids
# used further down still come from the loading cell -- confirm on a fresh kernel.
with open('output/disease2anatomy/anatomy_mesh2pmid.json') as fin:
    anatomy_mesh2pmid = make_sets(json.load(fin))
with open('output/disease2anatomy/disease_mesh2pmid.json') as fin:
    disease_mesh2pmid = make_sets(json.load(fin))

## Anatomy-Disease Co-occurrences

In [8]:
# Totals of distinct PMIDs tagged with any anatomy / any disease MeSH term
num_anatomy_pmids, num_disease_pmids = len(anatomy_pmids), len(disease_pmids)
print(num_anatomy_pmids, num_disease_pmids)

11476702 16872705


In [10]:
# Shallow snapshots so the filtering cell below can be re-run from the
# unfiltered maps (the per-term PMID collections themselves are shared).
disease_mesh2pmid_backup = dict(disease_mesh2pmid)
anatomy_mesh2pmid_backup = dict(anatomy_mesh2pmid)

#### Remove disease-PMIDs without an anatomy MeSH term

In [13]:
# Start from the unfiltered snapshots so this cell is safe to re-run
disease_mesh2pmid = disease_mesh2pmid_backup.copy()
anatomy_mesh2pmid = anatomy_mesh2pmid_backup.copy()

# If any disease-studying PMID does not study an anatomy, remove the PMID (to make proportion calculation more faithful)
print('#Disease PMIDs - #Removed PMIDs = #DiseaseAnatomy PMIDs')

# For each disease (iterate over a snapshot because entries may be popped)
for disease, d_pmids in list(disease_mesh2pmid.items()):

    # Keep only the PMIDs that also carry an anatomy term. anatomy_pmids is
    # passed as-is: copying it with set(...) for every disease is loop-invariant
    # work on a very large collection.
    kept_pmids = set(d_pmids).intersection(anatomy_pmids)
    disease_mesh2pmid[disease] = kept_pmids

    # If any PMIDs were removed, display the counts
    diff = len(d_pmids) - len(kept_pmids)
    if diff != 0:
        remain_per = round(1 - diff / len(d_pmids), 2) * 100
        # Fixed-point formatting avoids artifacts such as 28.999999999999996%
        print(len(d_pmids), '-', diff, '=', len(kept_pmids), '=', f'{remain_per:.1f}%')

    # If the disease has no co-occurring anatomy MeSH terms, remove the disease.
    # Checked unconditionally so an initially empty entry is dropped too and
    # cannot cause a division by zero in the proportion calculation later.
    if len(kept_pmids) == 0:
        disease_mesh2pmid.pop(disease)

#Disease PMIDs - #Removed PMIDs = #DiseaseAnatomy PMIDs
227 - 107 = 120 = 53.0%
739 - 157 = 582 = 79.0%
62 - 21 = 41 = 66.0%
986 - 751 = 235 = 24.0%
592 - 352 = 240 = 41.0%
38 - 24 = 14 = 37.0%
3907 - 1913 = 1994 = 51.0%
814 - 452 = 362 = 44.0%
2041 - 1159 = 882 = 43.0%
21346 - 16533 = 4813 = 23.0%
9755 - 5277 = 4478 = 46.0%
3171 - 1815 = 1356 = 43.0%
15159 - 5953 = 9206 = 61.0%
9623 - 5144 = 4479 = 47.0%
23170 - 13506 = 9664 = 42.0%
1313 - 611 = 702 = 53.0%
125 - 38 = 87 = 70.0%
282 - 99 = 183 = 65.0%
599 - 273 = 326 = 54.0%
578 - 204 = 374 = 65.0%
14717 - 7562 = 7155 = 49.0%
43338 - 10840 = 32498 = 75.0%
862 - 426 = 436 = 51.0%
1564 - 950 = 614 = 39.0%
7562 - 4316 = 3246 = 43.0%
735 - 547 = 188 = 26.0%
1111 - 665 = 446 = 40.0%
1309 - 1033 = 276 = 21.0%
21363 - 14612 = 6751 = 32.0%
2310 - 1635 = 675 = 28.999999999999996%
4701 - 2865 = 1836 = 39.0%
2381 - 1555 = 826 = 35.0%
28814 - 12809 = 16005 = 56.00000000000001%
370 - 246 = 124 = 34.0%
1213 - 683 = 530 = 44.0%
21 - 18 = 3 = 14.0000

2186 - 2005 = 181 = 8.0%
22408 - 7762 = 14646 = 65.0%
21489 - 13463 = 8026 = 37.0%
14514 - 7228 = 7286 = 50.0%
2276 - 984 = 1292 = 56.99999999999999%
9702 - 3783 = 5919 = 61.0%
17973 - 4220 = 13753 = 77.0%
10111 - 4223 = 5888 = 57.99999999999999%
795 - 377 = 418 = 53.0%
319 - 168 = 151 = 47.0%
131 - 49 = 82 = 63.0%
15614 - 5767 = 9847 = 63.0%
186 - 87 = 99 = 53.0%
30427 - 11680 = 18747 = 62.0%
14 - 2 = 12 = 86.0%
1750 - 746 = 1004 = 56.99999999999999%
344 - 177 = 167 = 49.0%
571 - 383 = 188 = 33.0%
1009 - 427 = 582 = 57.99999999999999%
1942 - 1019 = 923 = 48.0%
10523 - 7176 = 3347 = 32.0%
1655 - 1063 = 592 = 36.0%
80 - 35 = 45 = 56.00000000000001%
998 - 504 = 494 = 49.0%
899 - 548 = 351 = 39.0%
355 - 220 = 135 = 38.0%
7554 - 4918 = 2636 = 35.0%
3511 - 2202 = 1309 = 37.0%
20559 - 14316 = 6243 = 30.0%
225 - 100 = 125 = 56.00000000000001%
3054 - 1641 = 1413 = 46.0%
445 - 268 = 177 = 40.0%
528 - 283 = 245 = 46.0%
2388 - 909 = 1479 = 62.0%
1577 - 520 = 1057 = 67.0%
2467 - 1976 = 491 = 20.0%

3747 - 1851 = 1896 = 51.0%
2622 - 1477 = 1145 = 44.0%
3517 - 1457 = 2060 = 59.0%
848 - 465 = 383 = 45.0%
223 - 127 = 96 = 43.0%
5757 - 5202 = 555 = 10.0%
2714 - 2468 = 246 = 9.0%
9647 - 5241 = 4406 = 46.0%
118 - 40 = 78 = 66.0%
62 - 13 = 49 = 79.0%
1492 - 1126 = 366 = 25.0%
1249 - 874 = 375 = 30.0%
10618 - 4377 = 6241 = 59.0%
1098 - 781 = 317 = 28.999999999999996%
15567 - 15430 = 137 = 1.0%
47486 - 29085 = 18401 = 39.0%
6646 - 2739 = 3907 = 59.0%
2453 - 1157 = 1296 = 53.0%
1139 - 765 = 374 = 33.0%
3753 - 935 = 2818 = 75.0%
680 - 544 = 136 = 20.0%
135 - 87 = 48 = 36.0%
670 - 519 = 151 = 23.0%
936 - 416 = 520 = 56.00000000000001%
321 - 162 = 159 = 50.0%
174672 - 154637 = 20035 = 11.0%
375 - 184 = 191 = 51.0%
5909 - 3429 = 2480 = 42.0%
44631 - 11040 = 33591 = 75.0%
1779 - 880 = 899 = 51.0%
512 - 321 = 191 = 37.0%
39602 - 15452 = 24150 = 61.0%
1273 - 742 = 531 = 42.0%
1570 - 946 = 624 = 40.0%
6193 - 1755 = 4438 = 72.0%
4465 - 2748 = 1717 = 38.0%
664 - 294 = 370 = 56.00000000000001%
75 - 31

289 - 232 = 57 = 20.0%
21 - 5 = 16 = 76.0%
8863 - 6005 = 2858 = 32.0%
3029 - 1748 = 1281 = 42.0%
3183 - 2293 = 890 = 28.000000000000004%
8146 - 4198 = 3948 = 48.0%
861 - 254 = 607 = 70.0%
909 - 432 = 477 = 52.0%
101 - 46 = 55 = 54.0%
247 - 147 = 100 = 40.0%
66149 - 47210 = 18939 = 28.999999999999996%
30900 - 20231 = 10669 = 35.0%
14157 - 10496 = 3661 = 26.0%
62 - 37 = 25 = 40.0%
4538 - 3062 = 1476 = 33.0%
22565 - 9150 = 13415 = 59.0%
350 - 193 = 157 = 45.0%
1098 - 616 = 482 = 44.0%
230 - 138 = 92 = 40.0%
428 - 296 = 132 = 31.0%
38787 - 23526 = 15261 = 39.0%
148 - 49 = 99 = 67.0%
6229 - 3813 = 2416 = 39.0%
26 - 9 = 17 = 65.0%
868 - 498 = 370 = 43.0%
260 - 89 = 171 = 66.0%
2369 - 316 = 2053 = 87.0%
16379 - 9311 = 7068 = 43.0%
3642 - 2411 = 1231 = 34.0%
76141 - 37180 = 38961 = 51.0%
9538 - 6732 = 2806 = 28.999999999999996%
744 - 552 = 192 = 26.0%
4181 - 2620 = 1561 = 37.0%
118 - 97 = 21 = 18.0%
103703 - 69892 = 33811 = 33.0%
4943 - 4310 = 633 = 13.0%
12944 - 9418 = 3526 = 27.0%
126 - 79 =

69200 - 21649 = 47551 = 69.0%
4 - 4 = 0 = 0.0%
384706 - 104749 = 279957 = 73.0%
186130 - 102502 = 83628 = 45.0%
10377 - 7789 = 2588 = 25.0%
37929 - 21555 = 16374 = 43.0%
14008 - 9575 = 4433 = 32.0%
557 - 336 = 221 = 40.0%
4046 - 3517 = 529 = 13.0%
9483 - 4183 = 5300 = 56.00000000000001%
2661 - 2286 = 375 = 14.000000000000002%
11514 - 7919 = 3595 = 31.0%
4038 - 3586 = 452 = 11.0%
840 - 793 = 47 = 6.0%
381 - 130 = 251 = 66.0%
1628 - 1014 = 614 = 38.0%
143 - 111 = 32 = 22.0%
243 - 159 = 84 = 35.0%
3412 - 1805 = 1607 = 47.0%
3830 - 2417 = 1413 = 37.0%
692 - 443 = 249 = 36.0%
88 - 41 = 47 = 53.0%
3 - 2 = 1 = 33.0%
9489 - 3318 = 6171 = 65.0%
2769 - 1687 = 1082 = 39.0%
1855 - 1006 = 849 = 46.0%
466 - 222 = 244 = 52.0%
6156 - 4430 = 1726 = 28.000000000000004%
64209 - 40639 = 23570 = 37.0%
76 - 62 = 14 = 18.0%
832 - 341 = 491 = 59.0%
57 - 51 = 6 = 11.0%
25820 - 15169 = 10651 = 41.0%
905 - 828 = 77 = 9.0%
3648 - 2844 = 804 = 22.0%
13934 - 10996 = 2938 = 21.0%
24775 - 19746 = 5029 = 20.0%
847 - 6

6756 - 1518 = 5238 = 78.0%
2906 - 1054 = 1852 = 64.0%
38291 - 20396 = 17895 = 47.0%
1730 - 705 = 1025 = 59.0%
4604 - 2122 = 2482 = 54.0%
1008 - 544 = 464 = 46.0%
1187 - 797 = 390 = 33.0%
6037 - 3282 = 2755 = 46.0%
3106 - 1840 = 1266 = 41.0%
1625 - 884 = 741 = 46.0%
1847 - 864 = 983 = 53.0%
12901 - 6817 = 6084 = 47.0%
2748 - 1136 = 1612 = 59.0%
3146 - 1385 = 1761 = 56.00000000000001%
12379 - 5789 = 6590 = 53.0%
778 - 465 = 313 = 40.0%
7311 - 3072 = 4239 = 57.99999999999999%
4801 - 2570 = 2231 = 46.0%
3958 - 2154 = 1804 = 46.0%
4306 - 819 = 3487 = 81.0%
7592 - 4816 = 2776 = 37.0%
979 - 348 = 631 = 64.0%
6741 - 2737 = 4004 = 59.0%
8471 - 4000 = 4471 = 53.0%
1675 - 565 = 1110 = 66.0%
2114 - 325 = 1789 = 85.0%
1280 - 608 = 672 = 53.0%
6927 - 3524 = 3403 = 49.0%
12948 - 4848 = 8100 = 63.0%
3180 - 1427 = 1753 = 55.00000000000001%
1475 - 1229 = 246 = 17.0%
888 - 676 = 212 = 24.0%
1120 - 872 = 248 = 22.0%
531 - 428 = 103 = 19.0%
806 - 621 = 185 = 23.0%
459 - 370 = 89 = 19.0%
707 - 579 = 128 = 1

382 - 210 = 172 = 45.0%
190 - 97 = 93 = 49.0%
198 - 71 = 127 = 64.0%
682 - 329 = 353 = 52.0%
53 - 25 = 28 = 53.0%
101 - 42 = 59 = 57.99999999999999%
15 - 6 = 9 = 60.0%
4696 - 2998 = 1698 = 36.0%
443 - 270 = 173 = 39.0%
134 - 91 = 43 = 32.0%
6012 - 3811 = 2201 = 37.0%
12032 - 7789 = 4243 = 35.0%
2538 - 1953 = 585 = 23.0%
4169 - 2588 = 1581 = 38.0%
959 - 628 = 331 = 35.0%
921 - 356 = 565 = 61.0%
198 - 112 = 86 = 43.0%
1799 - 543 = 1256 = 70.0%
148 - 57 = 91 = 61.0%
911 - 431 = 480 = 53.0%
273 - 87 = 186 = 68.0%
325 - 153 = 172 = 53.0%
14983 - 12554 = 2429 = 16.0%
12240 - 8614 = 3626 = 30.0%
10853 - 3912 = 6941 = 64.0%
26074 - 12880 = 13194 = 51.0%
13993 - 10565 = 3428 = 24.0%
11934 - 8672 = 3262 = 27.0%
1573 - 861 = 712 = 45.0%
23292 - 9617 = 13675 = 59.0%
987 - 563 = 424 = 43.0%
595 - 475 = 120 = 20.0%
2267 - 896 = 1371 = 60.0%
2106 - 1014 = 1092 = 52.0%
228 - 100 = 128 = 56.00000000000001%
497 - 290 = 207 = 42.0%
2549 - 1310 = 1239 = 49.0%
1268 - 646 = 622 = 49.0%
417 - 142 = 275 = 66.

139 - 23 = 116 = 83.0%
10960 - 3971 = 6989 = 64.0%
689 - 359 = 330 = 48.0%
302 - 197 = 105 = 35.0%
2454 - 1403 = 1051 = 43.0%
1354 - 796 = 558 = 41.0%
1107 - 651 = 456 = 41.0%
164 - 95 = 69 = 42.0%
85 - 46 = 39 = 46.0%
653 - 433 = 220 = 34.0%
8464 - 5503 = 2961 = 35.0%
13383 - 4132 = 9251 = 69.0%
1862 - 1141 = 721 = 39.0%
236 - 168 = 68 = 28.999999999999996%
2298 - 1560 = 738 = 32.0%
204 - 101 = 103 = 50.0%
4852 - 3076 = 1776 = 37.0%
967 - 507 = 460 = 48.0%
1166 - 958 = 208 = 18.0%
13435 - 9101 = 4334 = 32.0%
920 - 618 = 302 = 33.0%
9064 - 4093 = 4971 = 55.00000000000001%
104 - 26 = 78 = 75.0%
26852 - 18813 = 8039 = 30.0%
48 - 33 = 15 = 31.0%
1733 - 1453 = 280 = 16.0%
6711 - 1209 = 5502 = 82.0%
1937 - 1019 = 918 = 47.0%
927 - 329 = 598 = 65.0%
66 - 57 = 9 = 14.000000000000002%
3603 - 1816 = 1787 = 50.0%
30732 - 18347 = 12385 = 40.0%
795 - 698 = 97 = 12.0%
239 - 152 = 87 = 36.0%
3319 - 1428 = 1891 = 56.99999999999999%
4679 - 3060 = 1619 = 35.0%
8916 - 4814 = 4102 = 46.0%
6374 - 4673 = 1

2360 - 1457 = 903 = 38.0%
199 - 109 = 90 = 45.0%
55 - 28 = 27 = 49.0%
7244 - 3881 = 3363 = 46.0%
2404 - 1684 = 720 = 30.0%
993 - 620 = 373 = 38.0%
1342 - 898 = 444 = 33.0%
2513 - 1636 = 877 = 35.0%
20534 - 13077 = 7457 = 36.0%
903 - 367 = 536 = 59.0%
87905 - 47022 = 40883 = 47.0%
6051 - 2837 = 3214 = 53.0%
99186 - 72559 = 26627 = 27.0%
76385 - 44817 = 31568 = 41.0%
1010 - 538 = 472 = 47.0%
2399 - 1010 = 1389 = 57.99999999999999%
64 - 30 = 34 = 53.0%
23 - 23 = 0 = 0.0%
948 - 280 = 668 = 70.0%
9855 - 7524 = 2331 = 24.0%
239 - 186 = 53 = 22.0%
4034 - 1907 = 2127 = 53.0%
952 - 288 = 664 = 70.0%
1271 - 642 = 629 = 49.0%
111 - 55 = 56 = 50.0%
692 - 154 = 538 = 78.0%
19265 - 3709 = 15556 = 81.0%
541 - 374 = 167 = 31.0%
165 - 142 = 23 = 14.000000000000002%
473 - 322 = 151 = 32.0%
596 - 383 = 213 = 36.0%
2624 - 1796 = 828 = 32.0%
105 - 95 = 10 = 10.0%
7450 - 2257 = 5193 = 70.0%
215 - 143 = 72 = 33.0%
1313 - 1204 = 109 = 8.0%
1137 - 978 = 159 = 14.000000000000002%
4916 - 1242 = 3674 = 75.0%
699 

1002 - 487 = 515 = 51.0%
770 - 385 = 385 = 50.0%
659 - 528 = 131 = 20.0%
177 - 56 = 121 = 68.0%
17149 - 8386 = 8763 = 51.0%
8420 - 3922 = 4498 = 53.0%
4410 - 2599 = 1811 = 41.0%
8112 - 3855 = 4257 = 52.0%
990 - 428 = 562 = 56.99999999999999%
49293 - 39535 = 9758 = 20.0%
803 - 591 = 212 = 26.0%
2179 - 1167 = 1012 = 46.0%
19866 - 14884 = 4982 = 25.0%
5043 - 4228 = 815 = 16.0%
1864 - 1340 = 524 = 28.000000000000004%
1576 - 553 = 1023 = 65.0%
212 - 77 = 135 = 64.0%
80 - 15 = 65 = 81.0%
5 - 3 = 2 = 40.0%
141 - 73 = 68 = 48.0%
1812 - 1207 = 605 = 33.0%
425 - 276 = 149 = 35.0%
3793 - 2370 = 1423 = 38.0%
660 - 474 = 186 = 28.000000000000004%
17527 - 14588 = 2939 = 17.0%
24346 - 10926 = 13420 = 55.00000000000001%
1558 - 371 = 1187 = 76.0%
6156 - 1878 = 4278 = 69.0%
3964 - 1038 = 2926 = 74.0%
107 - 67 = 40 = 37.0%
4933 - 1869 = 3064 = 62.0%
18923 - 7150 = 11773 = 62.0%
7834 - 3653 = 4181 = 53.0%
6966 - 2825 = 4141 = 59.0%
1156 - 328 = 828 = 72.0%
9865 - 4953 = 4912 = 50.0%
24 - 2 = 22 = 92.0%
17

2240 - 1117 = 1123 = 50.0%
16319 - 9222 = 7097 = 43.0%
245 - 115 = 130 = 53.0%
440 - 233 = 207 = 47.0%
710 - 446 = 264 = 37.0%
5387 - 2929 = 2458 = 46.0%
1259 - 628 = 631 = 50.0%
1906 - 834 = 1072 = 56.00000000000001%
325 - 194 = 131 = 40.0%
318 - 181 = 137 = 43.0%
1472 - 882 = 590 = 40.0%
198 - 84 = 114 = 57.99999999999999%
1541 - 849 = 692 = 45.0%
574 - 162 = 412 = 72.0%
790 - 254 = 536 = 68.0%
200 - 67 = 133 = 67.0%
34 - 17 = 17 = 50.0%
18641 - 7903 = 10738 = 57.99999999999999%
1582 - 559 = 1023 = 65.0%
326 - 137 = 189 = 57.99999999999999%
9242 - 4830 = 4412 = 48.0%
2088 - 958 = 1130 = 54.0%
1259 - 468 = 791 = 63.0%
1218 - 590 = 628 = 52.0%
1014 - 540 = 474 = 47.0%
294 - 151 = 143 = 49.0%
5568 - 2972 = 2596 = 47.0%
51 - 37 = 14 = 27.0%
110 - 32 = 78 = 71.0%
3802 - 2483 = 1319 = 35.0%
7222 - 2366 = 4856 = 67.0%
249 - 198 = 51 = 20.0%
429 - 190 = 239 = 56.00000000000001%
5762 - 4217 = 1545 = 27.0%
524 - 233 = 291 = 56.00000000000001%
850 - 468 = 382 = 45.0%
2098 - 1094 = 1004 = 48.0%


1648 - 806 = 842 = 51.0%
41591 - 12536 = 29055 = 70.0%
9471 - 4729 = 4742 = 50.0%
25118 - 9635 = 15483 = 62.0%
200 - 44 = 156 = 78.0%
709 - 411 = 298 = 42.0%
1904 - 1129 = 775 = 41.0%
656 - 195 = 461 = 70.0%
5778 - 1148 = 4630 = 80.0%
1733 - 189 = 1544 = 89.0%
5609 - 2562 = 3047 = 54.0%
1738 - 349 = 1389 = 80.0%
169 - 37 = 132 = 78.0%
838 - 219 = 619 = 74.0%
5040 - 2739 = 2301 = 46.0%
6570 - 1875 = 4695 = 71.0%
869 - 308 = 561 = 65.0%
3476 - 1180 = 2296 = 66.0%
2931 - 973 = 1958 = 67.0%
4962 - 2796 = 2166 = 44.0%
21693 - 10287 = 11406 = 53.0%
7657 - 2780 = 4877 = 64.0%
2753 - 1179 = 1574 = 56.99999999999999%
1006 - 107 = 899 = 89.0%
460 - 258 = 202 = 44.0%
48689 - 27484 = 21205 = 44.0%
13932 - 8603 = 5329 = 38.0%
7320 - 5078 = 2242 = 31.0%
2002 - 670 = 1332 = 67.0%
25010 - 9645 = 15365 = 61.0%
123 - 47 = 76 = 62.0%
2565 - 1091 = 1474 = 56.99999999999999%
952 - 493 = 459 = 48.0%
1960 - 1320 = 640 = 33.0%
2612 - 1627 = 985 = 38.0%
17965 - 10159 = 7806 = 43.0%
5931 - 3163 = 2768 = 47.0%
2

6784 - 4965 = 1819 = 27.0%
639 - 430 = 209 = 33.0%
52198 - 35472 = 16726 = 32.0%
58 - 40 = 18 = 31.0%
494 - 279 = 215 = 44.0%
5733 - 3479 = 2254 = 39.0%
69 - 40 = 29 = 42.0%
11672 - 8722 = 2950 = 25.0%
614 - 394 = 220 = 36.0%
4348 - 3341 = 1007 = 23.0%
104 - 79 = 25 = 24.0%
5426 - 3978 = 1448 = 27.0%
9071 - 6267 = 2804 = 31.0%
391 - 205 = 186 = 48.0%
80 - 65 = 15 = 19.0%
2047 - 1483 = 564 = 28.000000000000004%
4061 - 3455 = 606 = 15.0%
45663 - 39228 = 6435 = 14.000000000000002%
957 - 637 = 320 = 33.0%
3989 - 1830 = 2159 = 54.0%
100 - 62 = 38 = 38.0%
17954 - 11945 = 6009 = 33.0%
185 - 77 = 108 = 57.99999999999999%
22804 - 19334 = 3470 = 15.0%
518 - 201 = 317 = 61.0%
19688 - 15976 = 3712 = 19.0%
908 - 700 = 208 = 23.0%
6191 - 3611 = 2580 = 42.0%
1502 - 908 = 594 = 40.0%
6096 - 2899 = 3197 = 52.0%
3928 - 2027 = 1901 = 48.0%
663 - 299 = 364 = 55.00000000000001%
16885 - 10935 = 5950 = 35.0%
6002 - 3536 = 2466 = 41.0%
6443 - 3797 = 2646 = 41.0%
1594 - 371 = 1223 = 77.0%
252 - 181 = 71 = 28.0

238 - 69 = 169 = 71.0%
711 - 411 = 300 = 42.0%
2439 - 1220 = 1219 = 50.0%
30774 - 5483 = 25291 = 82.0%
632 - 343 = 289 = 46.0%
9713 - 6725 = 2988 = 31.0%
1466 - 703 = 763 = 52.0%
103 - 47 = 56 = 54.0%
23327 - 15608 = 7719 = 33.0%
13626 - 8678 = 4948 = 36.0%
9922 - 5968 = 3954 = 40.0%
34904 - 25357 = 9547 = 27.0%
1997 - 723 = 1274 = 64.0%
9771 - 6920 = 2851 = 28.999999999999996%
8159 - 5299 = 2860 = 35.0%
1295 - 214 = 1081 = 83.0%
23028 - 16785 = 6243 = 27.0%
1068 - 491 = 577 = 54.0%
41683 - 30477 = 11206 = 27.0%
2145 - 1528 = 617 = 28.999999999999996%
2259 - 1295 = 964 = 43.0%
4158 - 3371 = 787 = 19.0%
112 - 38 = 74 = 66.0%
72 - 38 = 34 = 47.0%
43 - 15 = 28 = 65.0%
2479 - 1105 = 1374 = 55.00000000000001%
10722 - 2573 = 8149 = 76.0%
20593 - 8548 = 12045 = 57.99999999999999%
22863 - 6533 = 16330 = 71.0%
1219 - 389 = 830 = 68.0%
194 - 64 = 130 = 67.0%
735 - 395 = 340 = 46.0%
5539 - 2044 = 3495 = 63.0%
692 - 382 = 310 = 45.0%
4109 - 2042 = 2067 = 50.0%
3323 - 866 = 2457 = 74.0%
5622 - 2241

193 - 153 = 40 = 21.0%
657 - 597 = 60 = 9.0%
506 - 209 = 297 = 59.0%
40 - 24 = 16 = 40.0%
759 - 466 = 293 = 39.0%
13 - 13 = 0 = 0.0%
5224 - 3611 = 1613 = 31.0%
266 - 118 = 148 = 56.00000000000001%
6032 - 5522 = 510 = 8.0%
1517 - 133 = 1384 = 91.0%
591 - 344 = 247 = 42.0%
228 - 83 = 145 = 64.0%
1348 - 881 = 467 = 35.0%
139 - 105 = 34 = 24.0%
466 - 279 = 187 = 40.0%
5252 - 4291 = 961 = 18.0%
201 - 118 = 83 = 41.0%
971 - 458 = 513 = 53.0%
4526 - 2884 = 1642 = 36.0%
121 - 102 = 19 = 16.0%
4086 - 2927 = 1159 = 28.000000000000004%
6223 - 1624 = 4599 = 74.0%
20228 - 9760 = 10468 = 52.0%
314 - 226 = 88 = 28.000000000000004%
1213 - 705 = 508 = 42.0%
219 - 94 = 125 = 56.99999999999999%
4549 - 3154 = 1395 = 31.0%
9559 - 8814 = 745 = 8.0%
1283 - 502 = 781 = 61.0%
326 - 213 = 113 = 35.0%
665 - 586 = 79 = 12.0%
151 - 104 = 47 = 31.0%
206 - 103 = 103 = 50.0%
504 - 334 = 170 = 34.0%
7254 - 2660 = 4594 = 63.0%
4086 - 2647 = 1439 = 35.0%
1648 - 866 = 782 = 47.0%
66 - 64 = 2 = 3.0%
70 - 65 = 5 = 7.000000

13184 - 4334 = 8850 = 67.0%
12077 - 10800 = 1277 = 11.0%
629 - 360 = 269 = 43.0%
247 - 131 = 116 = 47.0%
4028 - 1892 = 2136 = 53.0%
10689 - 5172 = 5517 = 52.0%
268 - 125 = 143 = 53.0%
201 - 141 = 60 = 30.0%
191 - 102 = 89 = 47.0%
2857 - 1548 = 1309 = 46.0%
7245 - 4501 = 2744 = 38.0%
5903 - 2056 = 3847 = 65.0%
2716 - 1029 = 1687 = 62.0%
436 - 41 = 395 = 91.0%
789 - 300 = 489 = 62.0%
2472 - 648 = 1824 = 74.0%
2462 - 366 = 2096 = 85.0%
3277 - 1718 = 1559 = 48.0%
6795 - 4002 = 2793 = 41.0%
3041 - 1148 = 1893 = 62.0%
1544 - 187 = 1357 = 88.0%
6319 - 1348 = 4971 = 79.0%
3137 - 1476 = 1661 = 53.0%
4292 - 2807 = 1485 = 35.0%
988 - 266 = 722 = 73.0%
2636 - 1219 = 1417 = 54.0%
1005 - 206 = 799 = 80.0%
1082 - 476 = 606 = 56.00000000000001%
7391 - 937 = 6454 = 87.0%
2251 - 694 = 1557 = 69.0%
2374 - 579 = 1795 = 76.0%
2883 - 1965 = 918 = 32.0%
53 - 20 = 33 = 62.0%
2659 - 2021 = 638 = 24.0%
6399 - 2497 = 3902 = 61.0%
3878 - 1812 = 2066 = 53.0%
4674 - 3442 = 1232 = 26.0%
36 - 21 = 15 = 42.0%
2443 - 1

24805 - 18470 = 6335 = 26.0%
231 - 206 = 25 = 11.0%
69 - 67 = 2 = 3.0%
3554 - 2054 = 1500 = 42.0%
704 - 447 = 257 = 37.0%
8683 - 5335 = 3348 = 39.0%
59 - 32 = 27 = 46.0%
770 - 464 = 306 = 40.0%
513 - 384 = 129 = 25.0%
1529 - 1162 = 367 = 24.0%
110 - 41 = 69 = 63.0%
896 - 511 = 385 = 43.0%
5666 - 3611 = 2055 = 36.0%
161 - 78 = 83 = 52.0%
1902 - 77 = 1825 = 96.0%
230 - 144 = 86 = 37.0%
537 - 417 = 120 = 22.0%
204 - 46 = 158 = 77.0%
4954 - 3576 = 1378 = 28.000000000000004%
520 - 295 = 225 = 43.0%
1429 - 913 = 516 = 36.0%
1302 - 972 = 330 = 25.0%
5574 - 3727 = 1847 = 33.0%
228 - 204 = 24 = 11.0%
487 - 340 = 147 = 30.0%
7 - 7 = 0 = 0.0%
34303 - 21983 = 12320 = 36.0%
41602 - 31524 = 10078 = 24.0%
1266 - 1044 = 222 = 18.0%
47 - 16 = 31 = 66.0%
1446 - 705 = 741 = 51.0%
1850 - 1142 = 708 = 38.0%
4465 - 3487 = 978 = 22.0%
2616 - 1374 = 1242 = 47.0%
576 - 478 = 98 = 17.0%
3374 - 1901 = 1473 = 44.0%
2102 - 930 = 1172 = 56.00000000000001%
534 - 485 = 49 = 9.0%
214 - 87 = 127 = 59.0%
4 - 1 = 3 = 75.

#### Calculate observed proportion of disease documents studying anatomy

In [20]:
# JSON cannot encode sets, so serialize a list-valued copy of the filtered map
disease_mesh2pmid_l = {k: list(v) for k, v in disease_mesh2pmid.items()}

# NOTE(review): this is the same path the export cell writes, so the file on
# disk now holds the filtered map -- confirm that overwriting is intended.
# `with` ensures the data is flushed and the handle closed.
with open('output/disease2anatomy/disease_mesh2pmid.json', 'w') as fout:
    json.dump(disease_mesh2pmid_l, fout)

In [21]:
obs_disease_anatomy_prop = dict()
tot = len(disease_mesh2pmid)

# Build the anatomy PMID sets once. The original rebuilt set(aj_pmids) for
# every (disease, anatomy) pair; values that are already sets are reused.
anatomy_pmid_sets = {
    aj: (aj_pmids if isinstance(aj_pmids, (set, frozenset)) else set(aj_pmids))
    for aj, aj_pmids in anatomy_mesh2pmid.items()
}

# For each disease
for i, (di, di_pmids) in enumerate(disease_mesh2pmid.items()):

    print(i,'/',tot, end='\r')

    # Convert the disease PMIDs once per disease instead of once per anatomy
    di_pmid_set = di_pmids if isinstance(di_pmids, (set, frozenset)) else set(di_pmids)
    n_di_pmids = len(di_pmids)

    # Observed proportion for each anatomy_j:
    #   |PMIDs(disease_i) & PMIDs(anatomy_j)| / |PMIDs(disease_i)|
    # i.e. the share of disease_i's PMIDs that also study anatomy_j
    # (preferred here over Jaccard similarity).
    obs_disease_anatomy_prop[di] = {
        aj: len(aj_pmid_set.intersection(di_pmid_set)) / n_di_pmids
        for aj, aj_pmid_set in anatomy_pmid_sets.items()
    }

with open('output/disease2anatomy/obs_disease_anatomy_prop.json','w') as fout:
    json.dump(obs_disease_anatomy_prop, fout)

5010 / 5011

In [22]:
# Reload the observed proportions; `with` closes the handle the bare open() leaked
with open('output/disease2anatomy/obs_disease_anatomy_prop.json') as fin:
    da_prop = json.load(fin)

# Minimum observed proportion for a disease-anatomy pair to be kept (this could change)
min_prop = 0.01

# Sort the Disease-Anatomy proportion relationships: for every disease, order
# its anatomies by descending proportion and keep only those above min_prop.
# A disease with no anatomy above the threshold keeps an empty dict.
sort_da_prop = {
    d: {
        k: v
        for k, v in sorted(props.items(), key=lambda x: x[1], reverse=True)
        if v > min_prop
    }
    for d, props in da_prop.items()
}

In [23]:
# Reload the MeSH term name -> MeSH ID lookups needed to write the edge file.
# `with` closes the file handles that the bare open() calls left dangling.
with open('output/anatomy2anatomy/meshterm-IS-meshid.json') as fin:
    anatomy_name2id = json.load(fin)
with open('output/disease2disease/meshterm-IS-meshid.json') as fin:
    disease_name2id = json.load(fin)

In [24]:
# Output to edges file
with open('output/disease2anatomy/edges_disease2anatomy.csv','w') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Disease (MeSH)','Anatomy (MeSH)','Relationship','Weight'])
    
    for disease, anatomies in sort_da_prop.items():
        try: disease_id = disease_name2id[disease][0]
        except: print('bad disease', disease); continue
            
        for anatomy,weight in anatomies.items():
            try: anatomy_id = anatomy_name2id[anatomy][0]
            except: print('bad anatomy', anatomy); continue
            
            writer.writerow(['MeSH_Disease:'+disease_id, 'MeSH_Anatomy:'+anatomy_id, '-affects->', weight])
            
df = pd.read_csv('output/disease2anatomy/edges_disease2anatomy.csv').drop_duplicates()
df.to_csv('output/edges/edges_disease2anatomy.csv', index = False)
df.to_csv('output/edges to use/Disease_(MeSH)_2_Anatomy_(MeSH).csv', index = False)
df.tail()

Unnamed: 0,Disease (MeSH),Anatomy (MeSH),Relationship,Weight
144475,MeSH_Disease:D014842,MeSH_Anatomy:D008533,-affects->,0.013034
144476,MeSH_Disease:D014842,MeSH_Anatomy:D012730,-affects->,0.013034
144477,MeSH_Disease:D014842,MeSH_Anatomy:D004912,-affects->,0.01231
144478,MeSH_Disease:D014842,MeSH_Anatomy:D007962,-affects->,0.010862
144479,MeSH_Disease:D014842,MeSH_Anatomy:D002462,-affects->,0.010138


#### Calculate expected proportion of documents studying disease-anatomy pairs

In [18]:
# For each disease: (#PMIDs currently mapped to the disease) / (total #disease PMIDs).
# NOTE(review): num_disease_pmids comes from the counting cell, which runs
# before the disease PMIDs are filtered to those with an anatomy term, while
# the numerator uses the current disease_mesh2pmid -- confirm this is intended.
disease_anatomy_exp = {
    disease: len(pmids) / num_disease_pmids
    for disease, pmids in disease_mesh2pmid.items()
}

with open('output/disease2anatomy/disease_anatomy_exp.json','w') as fout:
    json.dump(disease_anatomy_exp, fout)

In [47]:
# Reload the expected proportions; `with` closes the handle the bare open() leaked
with open('output/disease2anatomy/disease_anatomy_exp.json','r') as fin:
    da_exp_prop = json.load(fin)