# Gene to Disease
Entrez Gene ID to MeSH Disease ID

Note: The relationships are only 'association' relationships, e.g., there is no relationship between if a gene is up- or down-regulated in the disease. Additionally, we vary the filter of the relationship strength based on the strength of their evidence.

## DisGeNET (Gene-[associated]->Disease)
NOTE: This gets a lot of associations

In [8]:
import csv
import json
import pandas as pd
import numpy as np
from biomed_apis import *
from biomedkg_utils import *
import urllib.parse
import urllib.request
import requests as req
import ast

In [3]:
#! wget -P input/ https://www.disgenet.org/static/disgenet_ap1/files/downloads/all_gene_disease_associations.tsv.gz
#! gunzip input/all_gene_disease_associations.tsv.gz
df = pd.read_table('input/all_gene_disease_associations.tsv')
df = df[df['score']>=0.06]
df.tail()

Unnamed: 0,geneId,geneSymbol,DSI,DPI,diseaseId,diseaseName,diseaseType,diseaseClass,diseaseSemanticType,score,EI,YearInitial,YearFinal,NofPmids,NofSnps,source
1134825,115482723,H3P40,0.506,0.846,C0036341,Schizophrenia,disease,F03,Mental or Behavioral Dysfunction,0.1,0.952,1991.0,2020.0,63,0,BEFREE
1134869,115482723,H3P40,0.506,0.846,C0349204,Nonorganic psychosis,disease,F03,Mental or Behavioral Dysfunction,0.1,1.0,2005.0,2020.0,10,0,BEFREE
1134895,115482723,H3P40,0.506,0.846,C1269955,Tumor Cell Invasion,phenotype,,Neoplastic Process,0.07,1.0,2008.0,2018.0,7,0,BEFREE
1134898,115482723,H3P40,0.506,0.846,C1306459,Primary malignant neoplasm,group,C04,Neoplastic Process,0.08,1.0,2003.0,2018.0,8,0,BEFREE
1134937,115804232,CEROX1,,,C0005890,Body Height,phenotype,,Organism Attribute,0.1,1.0,2019.0,2019.0,1,0,GWASCAT


In [76]:
#dict(df['diseaseSemanticType'].value_counts())

In [9]:
# Entrez Gene ID -associated_with-> MeSH Disease
umls2mesh = json.load(open('output/otherMappings/umls2mesh.json'))
entrez_gene2mesh_disease_fromdisgenet, mesh_disease2entrez_gene_fromdisgenet = dict(), dict()

evidenceCutoff = 0.5
score_thresh = 0.06 # https://think-lab.github.io/d/105/
ev, noev = 0, 0
with open('output/gene2disease/edges_gene-ASSOCIATED-WITH->disease_disgenet.csv','w',newline='') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Gene (Entrez)', 'Disease (MeSH)', 'Relationship','Weight'])
    
    with open('input/all_gene_disease_associations.tsv') as fin:
        for i, line in enumerate(fin):
            if i == 0:
                continue
            line = line.strip().split('\t')
            
            # Disease
            umls_dis = line[4]
            try: mesh_dises = umls2mesh[umls_dis]
            except: continue
            
            # Gene
            entrez_id = line[0]
            
            # Evidence
            score = float(line[9])
            
            # Check evidence, write row
            try:
                if score >= score_thresh:
                    for mesh_dis in mesh_dises:
                        writer.writerow(['Entrez:'+str(entrez_id), 
                                         'MeSH_Disease:'+mesh_dis, 
                                         '-associated_with-',
                                         score])
                    ev += 1
            except:
                noev += 1
                
print(ev, 'Gene-Disease Associations with Evidence')
print(noev,'without Evidence')

df = pd.read_csv('output/gene2disease/edges_gene-ASSOCIATED-WITH->disease_disgenet.csv')
df.to_csv('output/edges/edges_gene-ASSOCIATED-WITH->disease_disgenet.csv', index=False)
df.tail()

239193 Gene-Disease Associations with Evidence
0 without Evidence


Unnamed: 0,Gene (Entrez),Disease (MeSH),Relationship,Weight
239190,Entrez:115482723,MeSH_Disease:D009369,-associated_with-,0.09
239191,Entrez:115482723,MeSH_Disease:D009369,-associated_with-,0.1
239192,Entrez:115482723,MeSH_Disease:D011618,-associated_with-,0.1
239193,Entrez:115482723,MeSH_Disease:D012559,-associated_with-,0.1
239194,Entrez:115804232,MeSH_Disease:D001827,-associated_with-,0.1


## PharmGKB (Gene-[associated]->Disease)


In [10]:
pgkb_df = pd.read_table('input/relationships.tsv')
pgkb_df = pgkb_df.loc[(pgkb_df['Entity1_type']=='Gene') \
                     &(pgkb_df['Entity2_type']=='Disease')]

In [7]:
pgkb_df

Unnamed: 0,Entity1_id,Entity1_name,Entity1_type,Entity2_id,Entity2_name,Entity2_type,Evidence,Association,PK,PD,PMIDs
0,PA142672624,ANKFN1,Gene,PA447288,Essential hypertension,Disease,"ClinicalAnnotation,VariantAnnotation",associated,,PD,
2,PA164722583,MIR2053,Gene,PA166122058,mucositis,Disease,VariantAnnotation,not associated,,,28628559
3,PA164722583,MIR2053,Gene,PA443937,Drug Toxicity,Disease,VariantAnnotation,not associated,,,
4,PA164722583,MIR2053,Gene,PA446155,Precursor Cell Lymphoblastic Leukemia-Lymphoma,Disease,VariantAnnotation,not associated,,,28628559
20,PA31744,NQO1,Gene,PA151958383,Gastrointestinal Stromal Tumors,Disease,"ClinicalAnnotation,VariantAnnotation",associated,,PD,30237583
...,...,...,...,...,...,...,...,...,...,...,...
117329,PA161,F12,Gene,PA443842,Death,Disease,VariantAnnotation,associated,,,23280790
117330,PA161,F12,Gene,PA447054,Stroke,Disease,"ClinicalAnnotation,VariantAnnotation",associated,,PD,23280790
117401,PA134907262,KIAA2026,Gene,PA166123366,event-free survival,Disease,VariantAnnotation,associated,,,28570300
117402,PA134907262,KIAA2026,Gene,PA444760,"Leukemia, Myeloid, Acute",Disease,VariantAnnotation,associated,,,28570300


#### Gene Name --> Gene ID

In [11]:
gene_names = list(pgkb_df['Entity1_name'])
job_id = submit_id_mapping_UniProtAPI(
                  from_db = 'Gene_Name',
                  to_db = 'GeneID', 
                  ids = gene_names)

# This checks on the job until it is finished
if check_id_mapping_results_ready_UniProtAPI(job_id):
    link = get_id_mapping_results_link_UniProtAPI(job_id)
    results = get_id_mapping_results_search_UniProtAPI(link)
    
gene_to_uniprot = get_to_uniprot_id_from_gene_name_mapping_dict_UniProtAPI(results, [9606])

### All PID->GID
prot_ids_to_gene_ids = json.load(open('output/protein2gene/all_uniprot2entrez.json','r'))

# Map gene Name to gene id
gene_name_to_gene_id = dict()
for gene_name, prot_ids in gene_to_uniprot.items():
    try:
        for prot_id in prot_ids:
            gene_ids = prot_ids_to_gene_ids[prot_id]
            for gene_id in gene_ids:
                gene_name_to_gene_id.setdefault(gene_name,set()).add(gene_id)
    except:
        continue
        
# Change the values from set into a list
gene_name_to_gene_id = switch_dictset_to_dictlist(gene_name_to_gene_id)
 
# Check if any Gene names correspond to multiple genes (not good / makes things difficult)
for gene_ids in gene_name_to_gene_id.values():
    if len(gene_ids) != 1:
        print(gene_ids)

HTTPError: 400 Client Error:  for url: https://rest.uniprot.org/idmapping/run

#### Disease Name --> Disease ID (UMLS)

In [108]:
# Name --> UMLS ID
name2umls = dict()
with open('input/MRCONSO.RRF') as fin:
    for i,line in enumerate(fin):
        line = line.strip().split('|')
        if 'ENG' not in line:
            continue
        umls = line[0]
        name = line[14]
        name2umls.setdefault(name.lower(), set()).add(umls)
        
        
# All PharmGKB-provided disease names
disease_names = [dis.lower() for dis in pgkb_df['Entity2_name']]

# Removing non-disease PharmGKB-provided 'disease names'
not_diseases = ['Elderly Adult','Pregnancy','Recurrence','cessation','retreatment failure', 'time above therapeutic range',
 'time below therapeutic range', 'time in therapeutic range','time to achieve stable dose', 'time to delivery',
 'time to relapse', 'time to therapeutic inr', 'tolerance']
not_diseases = [not_dis.lower() for not_dis in not_diseases]

for notdis in not_diseases:
    disease_names.remove(notdis)
    
    
# DiseaseName -> Disease UMLS ID
dis_name_to_mesh = dict()
for dis_name in disease_names:
    try:
        dis_umlses = name2umls[dis_name]
        dis_meshes = list()
        for dis_umls in dis_umlses:
            dis_meshes += umls2mesh[dis_umls]
        for dis_mesh in dis_meshes:
            dis_name_to_mesh.setdefault(dis_name, set()).add(dis_mesh)
    except:
        continue
print(len(dis_name_to_mesh), 'diseases mapped from name to MeSH')

dis_name_to_mesh_str = dict()
for k,v in dis_name_to_mesh.items():
    dis_name_to_mesh_str[k] = str(v)

418 diseases mapped from name to MeSH


In [148]:
gene_name_to_gene_id = dict()
for name, IDs in gene_name_to_gene_id.items():
    gene_name_to_gene_id[name] = int(IDs[0])

In [171]:
# Gene ID --> Disease ID

# Filter by / preprocess disease
pgkb_df['Entity2_name'] = pgkb_df['Entity2_name'].str.lower()
pgkb_df['Entity2_name'].replace(dis_name_to_mesh_str, inplace=True)
pgkb_df = pgkb_df.replace(np.nan, 'not a num')
pgkb_df = pgkb_df.drop(pgkb_df[pgkb_df['Entity2_name'].str.contains('}') == False].index)
pgkb_df['Entity2_name'] = pgkb_df['Entity2_name'].str.upper()

# Filter by / preprocess gene
pgkb_df['Entity1_name'].replace(gene_name_to_gene_id, inplace=True)

pgkb_df = pgkb_df[['Entity1_name','Entity2_name','Association']]
pgkb_df.columns = ['Gene', 'Disease', 'Relationship']

# Turn into dictionaries
pgkb_pos = pgkb_df[pgkb_df['Relationship'] == 'associated'].set_index('Gene')['Disease'].to_dict()
pgkb_neg = pgkb_df[pgkb_df['Relationship'] == 'not associated'].set_index('Gene')['Disease'].to_dict()
pgkb_pos_temp = dict()
for gene, diseases in pgkb_pos.items():
    diseases = ast.literal_eval(diseases)
    for disease in diseases:
        pgkb_pos_temp.setdefault(gene, list()).append(disease)
pgkb_neg_temp = dict()
for gene, diseases in pgkb_neg.items():
    diseases = ast.literal_eval(diseases)
    for disease in diseases:
        pgkb_neg_temp.setdefault(gene, list()).append(disease)

In [231]:
with open('output/gene2disease/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv','w') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Gene (Entrez)','Disease (MeSH)','Relationship'])
    
    for gene, diseases in pgkb_pos_temp.items():
        for disease in diseases:
            writer.writerow(['Entrez:'+str(gene),'MeSH_Disease:'+disease,'-associated_with-'])
! cp 'output/gene2disease/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv' 'output/edges/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv'
df1 = pd.read_csv(open('output/gene2disease/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv'))
df1.tail()

Unnamed: 0,Gene (Entrez),Disease (MeSH),Relationship
1205,Entrez:6564,MeSH_Disease:D009369,-associated_with-
1206,Entrez:2567,MeSH_Disease:D012559,-associated_with-
1207,Entrez:3083,MeSH_Disease:D006471,-associated_with-
1208,Entrez:1066,MeSH_Disease:D015179,-associated_with-
1209,Entrez:158358,MeSH_Disease:D015470,-associated_with-


In [221]:
with open('output/gene2disease/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv','w') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Gene (Entrez)','Disease (MeSH)','Relationship'])
    
    for gene, diseases in pgkb_pos_temp.items():
        for disease in diseases:
            writer.writerow(['Entrez:'+str(gene),'MeSH_Disease:'+disease,'-associated_with-'])

#! cp 'output/gene2disease/edges_gene-NOT-ASSOCIATED_WITH-disease_pharmgkb.csv' 'output/edges/edges_gene-NOT-ASSOCIATED_WITH->disease_pharmgkb.csv'
df1 = pd.read_csv(open('output/gene2disease/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv'))
#df1.tail()

In [4]:
df1 = pd.read_csv(open('output/gene2disease/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv'))
df1

Unnamed: 0,Gene (Entrez),Disease (MeSH),Relationship
0,Entrez:162282,MeSH_Disease:D000075222,-associated_with-
1,Entrez:1728,MeSH_Disease:D009369,-associated_with-
2,Entrez:5972,MeSH_Disease:D000075222,-associated_with-
3,Entrez:51141,MeSH_Disease:D024821,-associated_with-
4,Entrez:978,MeSH_Disease:D016399,-associated_with-
...,...,...,...
1205,Entrez:6564,MeSH_Disease:D009369,-associated_with-
1206,Entrez:2567,MeSH_Disease:D012559,-associated_with-
1207,Entrez:3083,MeSH_Disease:D006471,-associated_with-
1208,Entrez:1066,MeSH_Disease:D015179,-associated_with-


In [222]:
with open('output/gene2disease/edges_gene-NOT-ASSOCIATED_WITH-disease_pharmgkb.csv','w') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Gene (Entrez)','Disease (MeSH)','Relationship'])
    
    for gene, diseases in pgkb_neg_temp.items():
        for disease in diseases:
            writer.writerow(['Entrez:'+str(gene),'MeSH_Disease:'+disease,'-not_associated_with-'])
! cp 'output/gene2disease/edges_gene-NOT-ASSOCIATED_WITH-disease_pharmgkb.csv' 'output/edges/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv'
df1 = pd.read_csv(open('output/gene2disease/edges_gene-NOT-ASSOCIATED_WITH->disease_pharmgkb.csv'))
#df1.tail()

In [5]:
df1 = pd.read_csv(open('output/gene2disease/edges_gene-NOT-ASSOCIATED_WITH->disease_pharmgkb.csv'))
df1

Unnamed: 0,Gene (Entrez),Disease (MeSH),Relationship
0,Entrez:162282,MeSH_Disease:D000075222,-not_associated_with-
1,Entrez:1728,MeSH_Disease:D009369,-not_associated_with-
2,Entrez:5972,MeSH_Disease:D000075222,-not_associated_with-
3,Entrez:51141,MeSH_Disease:D024821,-not_associated_with-
4,Entrez:978,MeSH_Disease:D016399,-not_associated_with-
...,...,...,...
1205,Entrez:6564,MeSH_Disease:D009369,-not_associated_with-
1206,Entrez:2567,MeSH_Disease:D012559,-not_associated_with-
1207,Entrez:3083,MeSH_Disease:D006471,-not_associated_with-
1208,Entrez:1066,MeSH_Disease:D015179,-not_associated_with-


------------------------------------------------------------------------------------------------------

## ClinVar
- Gene - OMIM Disease
- Gene - MeSH Disease

In [47]:
os.system('wget -N -P input/ https://ftp.ncbi.nlm.nih.gov/pub/clinvar/gene_condition_source_id')

--2023-03-15 17:31:30--  https://ftp.ncbi.nlm.nih.gov/pub/clinvar/gene_condition_source_id
Resolving ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)... 165.112.9.228, 165.112.9.229, 2607:f220:41e:250::13, ...
Connecting to ftp.ncbi.nlm.nih.gov (ftp.ncbi.nlm.nih.gov)|165.112.9.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1142467 (1.1M)
Saving to: ‘input/gene_condition_source_id’

     0K .......... .......... .......... .......... ..........  4%  359K 3s
    50K .......... .......... .......... .......... ..........  8%  358K 3s
   100K .......... .......... .......... .......... .......... 13%  712K 2s
   150K .......... .......... .......... .......... .......... 17%  718K 2s
   200K .......... .......... .......... .......... .......... 22%  716K 2s
   250K .......... .......... .......... .......... .......... 26%  714K 2s
   300K .......... .......... .......... .......... .......... 31%  713K 1s
   350K .......... .......... .......... .......... .....

0

In [237]:
# MONDO-is-OMIM-is-MeSH 
mondo2omim = json.load(open('output/disease2disease/mondo2omim.json'))
omim2mesh = json.load(open('output/disease2disease/omim2mesh.json'))
for omim,mesh in omim2mesh.copy().items():
    del omim2mesh[omim]
    omim = omim.replace('OMIM_','')
    omim2mesh[omim] = mesh


entrez_gene2omim_disease, omim_disease2entrez_gene = dict(), dict()

for i, line in enumerate(open('input/gene_condition_source_id')):
    line = line.split('\t')
    if i == 0:
        continue

        
    '''Entrez Gene'''
    entrez_gene_id = line[0]
    # If no gene ID, skip this line
    if entrez_gene_id == '':
        continue
    
    
    '''OMIM Disease ID'''
    disease_omim_ids = line[7]
    
    # If no OMIM
    if disease_omim_ids == '':
        # Try to get OMIM via MONDO2OMIM
        disease_other_id_type = line[5]        
        disease_other_ids = line[6]
        
        if disease_other_id_type == 'MONDO':
            mondo_id = disease_other_ids.replace(':','_')
            if mondo_id in mondo2omim:
                disease_omim_ids = mondo2omim[mondo_id]
            # Otherwise, skip this line
            else:
                continue


    ''' Entrez Gene - OMIM Disease '''
    if type(disease_omim_ids) != list:
        disease_omim_ids = [disease_omim_ids]
    
    for disease_omim_id in disease_omim_ids:
        entrez_gene2omim_disease.setdefault(entrez_gene_id, set()).add(disease_omim_id)
        omim_disease2entrez_gene.setdefault(disease_omim_id, set()).add(entrez_gene_id)

print(len(omim_disease2entrez_gene), 'OMIM Diseases to Entrez Genes')
print(len(entrez_gene2omim_disease), 'Entrez Genes to OMIM Diseases')


mesh_disease2entrez_gene, entrez_gene2mesh_disease = dict(), dict()

for omim, entrez_genes in omim_disease2entrez_gene.items():
    if omim in omim2mesh:
        
        # MeSH Disease
        mesh_diseases = omim2mesh[omim]
        for mesh_disease in mesh_diseases:
            
            # Gene
            for entrez_gene in entrez_genes:
            
                # MeSH Disease - Gene
                mesh_disease2entrez_gene.setdefault(mesh_disease, set()).add(entrez_gene)
                entrez_gene2mesh_disease.setdefault(entrez_gene, set()).add(mesh_disease)

print(len(mesh_disease2entrez_gene), 'MeSH Diseases to Entrez Genes')
print(len(entrez_gene2mesh_disease), 'Entrez Genes to MeSH Diseases')

json.dump(switch_dictset_to_dictlist(entrez_gene2mesh_disease), open('output/gene2disease/entrez_gene2mesh_disease_fromclinvar.json','w'))
json.dump(switch_dictset_to_dictlist(mesh_disease2entrez_gene), open('output/gene2disease/mesh_disease2entrez_gene_fromclinvar.json','w'))
json.dump(switch_dictset_to_dictlist(entrez_gene2omim_disease), open('output/gene2disease/entrez_gene2omim_disease_fromclinvar.json','w'))
json.dump(switch_dictset_to_dictlist(omim_disease2entrez_gene), open('output/gene2disease/omim_disease2entrez_gene_fromclinvar.json','w'))

output_edgefile_onerel_noweight('output/gene2disease/edges_gene-ASSOCIATED-WITH->disease_clinvar.csv',
                               ['Gene (Entrez)', 'Disease (MeSH)', 'Relationship'],
                                entrez_gene2mesh_disease,
                                '-associated_with-',
                               'Entrez:',
                               'MeSH_Disease:')


gene2protein = json.load(open('output/protein2gene/all_entrez2uniprot.json'))

protein2mesh_disease = dict()

for gene, mesh_diseases in entrez_gene2mesh_disease.items():
    try: 
        proteins = gene2protein[gene]
        for protein in proteins:
            for mesh_disease in mesh_diseases:
                protein2mesh_disease.setdefault(protein, set()).add(mesh_disease)
    except:
        continue

6430 OMIM Diseases to Entrez Genes
4854 Entrez Genes to OMIM Diseases
1634 MeSH Diseases to Entrez Genes
1472 Entrez Genes to MeSH Diseases


In [56]:
len(protein2mesh_disease)

1508

In [57]:
len(entrez_gene2mesh_disease)

1472

## ClinGen
Code from Alexander Pelletier

In [22]:
os.system('wget -N -P input/ https://search.clinicalgenome.org/kb/gene-validity/download')

--2023-05-24 17:53:10--  https://search.clinicalgenome.org/kb/gene-validity/download
Resolving search.clinicalgenome.org (search.clinicalgenome.org)... 35.243.222.62
Connecting to search.clinicalgenome.org (search.clinicalgenome.org)|35.243.222.62|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 582465 (569K) [text/plain]
Saving to: ‘input/download’

     0K .......... .......... .......... .......... ..........  8%  303K 2s
    50K .......... .......... .......... .......... .......... 17%  612K 1s
   100K .......... .......... .......... .......... .......... 26%  608K 1s
   150K .......... .......... .......... .......... .......... 35% 7.75M 1s
   200K .......... .......... .......... .......... .......... 43% 4.42M 0s
   250K .......... .......... .......... .......... .......... 52%  770K 0s
   300K .......... .......... .......... .......... .......... 61% 10.6M 0s
   350K .......... .......... .......... .......... .......... 70% 12.5M 0s
   400K ......

0

In [16]:
clingen_df = pd.read_csv('input/download',header=4)
clingen_df.drop(clingen_df.index[0], inplace=True)
clingen_df = clingen_df.reset_index()
clingen_df.tail()

Unnamed: 0,index,GENE SYMBOL,GENE ID (HGNC),DISEASE LABEL,DISEASE ID (MONDO),MOI,SOP,CLASSIFICATION,ONLINE REPORT,CLASSIFICATION DATE,GCEP
2052,2053,ZNF423,HGNC:16762,nephronophthisis,MONDO:0019005,AR,SOP8,Limited,https://search.clinicalgenome.org/kb/gene-vali...,2022-03-23T16:00:00.000Z,Kidney Cystic and Ciliopathy Disorders
2053,2054,ZNF462,HGNC:21684,weiss-kruszka syndrome,MONDO:0032836,AD,SOP7,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2020-08-25T13:10:51.689Z,Syndromic Disorders
2054,2055,ZNF674,HGNC:17625,X-linked intellectual disability,MONDO:0100284,XL,SOP8,Disputed,https://search.clinicalgenome.org/kb/gene-vali...,2021-05-04T17:00:00.000Z,Intellectual Disability and Autism
2055,2056,ZNF711,HGNC:13128,X-linked complex neurodevelopmental disorder,MONDO:0100148,XL,SOP9,Definitive,https://search.clinicalgenome.org/kb/gene-vali...,2022-09-22T06:00:00.000Z,Intellectual Disability and Autism
2056,2057,ZNF81,HGNC:13156,X-linked intellectual disability,MONDO:0100284,XL,SOP8,Disputed,https://search.clinicalgenome.org/kb/gene-vali...,2021-01-26T17:00:00.000Z,Intellectual Disability and Autism


In [17]:
def extract_mapping_table(results):
    mapping_table = {}
    for entry in results['results']:
        from_ = entry['from']
        to_ = entry['to']['primaryAccession']
        if from_ not in mapping_table:
            mapping_table[from_] = []
        mapping_table[from_] += [to_]
    return mapping_table


def uniprot_convert_hgnc_to_uniprot_gene(genes):
    # send job
    job_id = submit_id_mapping_UniProtAPI(
                      from_db = 'HGNC',
#                       to_db = 'Gene_Name', 
                      to_db = 'UniProtKB', 
                      ids = genes)

    # check job until it is finished
    if check_id_mapping_results_ready_UniProtAPI(job_id):
        link = get_id_mapping_results_link_UniProtAPI(job_id)
        results = get_id_mapping_results_search_UniProtAPI(link)
    
    mapping_table = extract_mapping_table(results)
    
    print("%d results. %d out of %d sucessfully mapped (%d failed)"%(len(results['results']),
                                                        len(mapping_table),
                                                        len(genes),
                                                        len(results['failedIds'])))
    return mapping_table


# need to prepare clingen to have h,r,t encoding UniProt gene -[ClinGen_Classification]-> MeSH Term
def prepare_clingen(hgnc_to_uniprot, mondo_to_mesh, clingen_df, debug=False):
    mapped_clingen_dict = {x:[] for x in ["head","relation","tail","weight","edge_type"]}
    
    for hgnc_gene, classification, mondo_disease in zip(clingen_df['GENE ID (HGNC)'],
                                                        clingen_df['CLASSIFICATION'],
                                                        clingen_df['DISEASE ID (MONDO)']):
        if hgnc_gene in hgnc_to_uniprot and mondo_disease in mondo_to_mesh:
            uniprot_list = hgnc_to_uniprot[hgnc_gene]
            mesh_terms = mondo_to_mesh[mondo_disease]
                        
            for uniprot in uniprot_list:
                for disease in mesh_terms:
                
                    h = uniprot
                    r = "ClinGen_Classification:%s"%(classification)
                    t = disease
                    mapped_clingen_dict['head'] += ['UniProt:'+h]
                    mapped_clingen_dict['relation'] += [r]
                    mapped_clingen_dict['tail'] += ["MeSH_Disease:"+t]
                    mapped_clingen_dict['weight'] += [1]
                    mapped_clingen_dict['edge_type'] += ["ClinGen"]
    
    mapped_clingen_df = pd.DataFrame(mapped_clingen_dict)
    
    if debug:
        # print statistics
        hgnc_genes = set(clingen_df['GENE ID (HGNC)'])
        mondo_diseases = set(clingen_df['DISEASE ID (MONDO)'])
        uniprot_proteins = set(mapped_clingen_df['head'])
        mesh_terms = set(mapped_clingen_df['tail'])
        print("Originally %d HGNC genes and %d MONDO diseases"%(len(hgnc_genes), len(mondo_diseases)))
        print("%d edges between %d UniProt proteins and %d MeSH terms"%(mapped_clingen_df.shape[0],
                                                                        len(uniprot_proteins),
                                                                        len(mesh_terms)))

    return mapped_clingen_df

mondo_to_mesh_file = r"output/disease2disease/mondo2mesh.json"
mondo_to_mesh = json.load(open(mondo_to_mesh_file,"r"))
hgnc_to_uniprot = uniprot_convert_hgnc_to_uniprot_gene(clingen_df['GENE ID (HGNC)'])
mapped_clingen_df = prepare_clingen(hgnc_to_uniprot, mondo_to_mesh, clingen_df, debug=True)
mapped_clingen_df.head()

Job still running. Retrying in 3s
11881 results. 1696 out of 2057 sucessfully mapped (192 failed)
Originally 1704 HGNC genes and 1125 MONDO diseases
7736 edges between 4980 UniProt proteins and 530 MeSH terms


Unnamed: 0,head,relation,tail,weight,edge_type
0,UniProt:A8K2U0,ClinGen_Classification:Disputed,MeSH_Disease:D009634,1,ClinGen
1,UniProt:F5GXP1,ClinGen_Classification:Disputed,MeSH_Disease:D009634,1,ClinGen
2,UniProt:F5GYG7,ClinGen_Classification:Disputed,MeSH_Disease:D009634,1,ClinGen
3,UniProt:H0YGG5,ClinGen_Classification:Disputed,MeSH_Disease:D009634,1,ClinGen
4,UniProt:H0YH14,ClinGen_Classification:Disputed,MeSH_Disease:D009634,1,ClinGen


In [61]:
mapped_clingen_df['relation'].value_counts()

ClinGen_Classification:Definitive                       4982
ClinGen_Classification:Limited                           947
ClinGen_Classification:Moderate                          774
ClinGen_Classification:Disputed                          657
ClinGen_Classification:No Known Disease Relationship     147
ClinGen_Classification:Refuted                            82
ClinGen_Classification:Strong                             72
Name: relation, dtype: int64

In [18]:
l = list(np.where(mapped_clingen_df['relation']=='ClinGen_Classification:Disputed')[0])

In [19]:
bad_indices = list()
bad_confidences = ['ClinGen_Classification:Disputed','ClinGen_Classification:No Known Disease Relationship',\
                   'ClinGen_Classification:Refuted','ClinGen_Classification:Limited']

for bad_confidence in bad_confidences:
    bad_indices += list(np.where(mapped_clingen_df['relation']==bad_confidence)[0])
bad_indices = sorted(bad_indices)

In [20]:
confident_clingen_df = mapped_clingen_df.drop(bad_indices)
confident_clingen_df.tail()

Unnamed: 0,head,relation,tail,weight,edge_type
7731,UniProt:U3KQ51,ClinGen_Classification:Definitive,MeSH_Disease:C536990,1,ClinGen
7732,UniProt:O75844,ClinGen_Classification:Definitive,MeSH_Disease:C535706,1,ClinGen
7733,UniProt:A0A6Q8PF67,ClinGen_Classification:Definitive,MeSH_Disease:C535706,1,ClinGen
7734,UniProt:A0A6Q8PH40,ClinGen_Classification:Definitive,MeSH_Disease:C535706,1,ClinGen
7735,UniProt:A0A6Q8PHG9,ClinGen_Classification:Definitive,MeSH_Disease:C535706,1,ClinGen


In [23]:
prot_ids_to_gene_ids = json.load(open('output/protein2gene/all_uniprot2entrez.json','r'))

In [26]:
with open('output/gene2disease/gene_to_disease_clingen.csv','w') as fout:
    writer = csv.writer(fout)
    writer.writerow(['Gene (Entrez)','Disease (MeSH)','Relationship'])
    for i in range(0,len(confident_clingen_df)):
        protein = confident_clingen_df['head'].iloc[i].split('UniProt:')[1]
        try: 
            genes = prot_ids_to_gene_ids[protein]
            disease = confident_clingen_df['tail'].iloc[i]
            for gene in genes:
                writer.writerow(['Entrez:'+str(gene),
                                 disease,
                                 '-associated_with-'])
        except:
            continue

## KEGG

In [193]:
omim2mesh = json.load(open('output/disease2disease/omim2mesh.json'))

'''KEGG Disease -is- MeSH Disease'''
kegg_disease2mesh_disease = dict()

for i, line in enumerate(open('input/KEGG/kegg_disease_to_mesh_and_omim.csv')):
    if i == 0: continue
    line = line.split(',')
    kegg_disease = line[0]
    mesh_diseases = line[1].split('; ')
    omim_disease = line[2]
    
    if mesh_diseases == ['']:
        try:
            mesh_diseases = omim2mesh[omim_disease]
        except:
            continue
    
    for mesh_disease in mesh_diseases:
        kegg_disease2mesh_disease.setdefault(kegg_disease, set()).add(mesh_disease)

In [194]:
! curl https://rest.kegg.jp/link/hsa/disease > input/KEGG/kegg_gene_to_disease.tsv

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  141k    0  141k    0     0  91448      0 --:--:--  0:00:01 --:--:-- 91429


In [240]:
'''Entrez Gene - MeSH Disease'''
mesh_disease2entrez_gene_kegg = dict()
gene_to_disease_kegg = dict()
for line in open('input/KEGG/kegg_gene_to_disease.tsv'):
    try:
        line = line.split('\t')

        # Gene
        kegg_gene = line[1].strip()
        entrez_gene = kegg_gene.split(':')[1]

        # Disease 
        kegg_disease = line[0].strip()
        mesh_diseases = kegg_disease2mesh_disease[kegg_disease]

        # Gene - Disease
        for mesh_disease in mesh_diseases:
            mesh_disease2entrez_gene_kegg.setdefault(mesh_disease, set()).add(entrez_gene)
            gene_to_disease_kegg.setdefault(entrez_gene, set()).add(mesh_disease)
    except:
        continue
        
json.dump(switch_dictset_to_dictlist(mesh_disease2entrez_gene_kegg), open('output/gene2disease/mesh_disease2entrez_gene_kegg_from_kegg.json','w'))
output_edgefile_onerel_noweight('output/gene2disease/edges_gene-ASSOCIATED-WITH->disease_kegg.csv',
                               ['Gene (Entrez)','Disease (MeSH)','Relationship'],
                                gene_to_disease_kegg,
                                '-associated_with-',
                                'Entrez:',
                                'MeSH_Disease:')

## Merge and export all

In [314]:
disgenet_df = pd.read_csv('output/edges/edges_gene-ASSOCIATED-WITH->disease_disgenet.csv').drop_duplicates()
clingen_df = pd.read_csv('output/gene2disease/gene_to_disease_clingen.csv').drop_duplicates()
pharmgkb_df = pd.read_csv('output/gene2disease/edges_gene-ASSOCIATED_WITH->disease_pharmgkb.csv').append(pd.read_csv('output/gene2disease/edges_gene-NOT-ASSOCIATED_WITH->disease_pharmgkb.csv')).drop_duplicates()
clinvar_df = pd.read_csv('output/gene2disease/edges_gene-ASSOCIATED-WITH->disease_clinvar.csv').drop_duplicates()
kegg_df = pd.read_csv('output/gene2disease/edges_gene-ASSOCIATED-WITH->disease_kegg.csv').drop_duplicates()

#display(disgenet_df.head())
#display(clingen_df.head())
#display(pharmgkb_df.head())
#display(clinvar_df.head())
#display(kegg_df.head())

'''Merge and output'''
# DisGeNET
disgenet_df = disgenet_df.groupby(['Gene (Entrez)', 'Disease (MeSH)', 'Relationship'])\
                          .agg({'Weight': 'mean'}).reset_index()
print(len(disgenet_df), 'rows in DisGeNET') # after you map UMLS to MeSH, there are a few duplicate rows (different weights though)

# Merged
first_three_cols = ['Gene (Entrez)', 'Disease (MeSH)', 'Relationship']
df = pd.concat([disgenet_df, clingen_df, clinvar_df, pharmgkb_df, kegg_df]).drop_duplicates()
print(len(df), 'concatenated rows (some duplicates with DisGeNET\'s weighted edges)')
df['Weight'].fillna(0, inplace=True) 
df = df.groupby(first_three_cols).agg({'Weight': 'sum'}).reset_index()
df['Weight'].replace(0.00, np.mean(df['Weight']), inplace=True) # Note: Used average weight as imputed value
print(len(df), 'concatenated rows')
df.reset_index()


file = 'Disease_(MeSH)_2_Gene_(Entrez).csv'
#df = pd.read_csv('output/gene2disease/disease_merged-ASSOCIATED-WITH->edges_gene.csv')
df.to_csv(os.path.join('output/edges',file), index=False)
df.to_csv(os.path.join('output/edges to use/',file), index=False)

df2 = pd.read_csv(os.path.join('output/edges to use/',file))
df2

191375 rows in DisGeNET
207316 concatenated rows (some duplicates with DisGeNET's weighted edges)
201336 concatenated rows


Unnamed: 0,Gene (Entrez),Disease (MeSH),Relationship,Weight
0,Entrez:1,MeSH_Disease:D006529,-associated_with-,0.300000
1,Entrez:1,MeSH_Disease:D012559,-associated_with-,0.300000
2,Entrez:10,MeSH_Disease:D000077277,-associated_with-,0.070000
3,Entrez:10,MeSH_Disease:D000086002,-associated_with-,0.340000
4,Entrez:10,MeSH_Disease:D000236,-associated_with-,0.080000
...,...,...,...,...
201331,Entrez:ZFP91-CNTF,MeSH_Disease:D012559,-not_associated_with-,0.154237
201332,Entrez:ZNRD1-AS1,MeSH_Disease:D004816,-associated_with-,0.154237
201333,Entrez:ZNRD1-AS1,MeSH_Disease:D004816,-not_associated_with-,0.154237
201334,Entrez:ZNRD1-AS1,MeSH_Disease:D013262,-associated_with-,0.154237
