In [1]:
import pandas as pd
import requests 
from tqdm import tqdm
import os 
import numpy as np 
import torch 

# Seed PyTorch and NumPy's global RNG with one shared, fixed value.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# CTD: protein–disease associations

Source: https://ctdbase.org/downloads/#cd

NOTE: use of CTD data is subject to the following terms:

```
# Use is subject to the terms set forth at http://ctdbase.org/about/legal.jsp
# These terms include:
#
#   1. All forms of publication (e.g., web sites, research papers, databases,
#      software applications, etc.) that use or rely on CTD data must cite CTD.
#      Citation guidelines: http://ctdbase.org/about/publications/#citing
#
#   2. All electronic or online applications must include hyperlinks from
#      contexts that use CTD data to the applicable CTD data pages.
#      Linking instructions: http://ctdbase.org/help/linking.jsp
#
#   3. You must notify CTD, and describe your use of our data:
#      http://ctdbase.org/help/contact.go
#
#   4. For quality control purposes, you must provide CTD with periodic
#      access to your publication of our data.
#
# More information: http://ctdbase.org/downloads/
```

In [2]:
# Load the curated CTD gene-disease table; lines starting with '#' are skipped.
# NOTE(review): with comment='#' pandas uses the first non-comment line as the
# header, and the labels are then overwritten below. If the raw file carries
# its field names only inside '#' lines, the first data record is being
# consumed as the header -- confirm against the raw file.
protdis_raw = pd.read_csv('../../data/tkg_raw/CTD_curated_genes_diseases.csv', comment='#')
protdis_raw.columns = ['GeneSymbol', 'GeneID', 'DiseaseName', 'DiseaseID', 'OmimIDs', 'PubMedIDs']

# Disease identifiers to keep. ndmin=1 guarantees a 1-D array even when the
# file holds a single entry (a 0-d array is not accepted by Series.isin).
disease_space = np.loadtxt('../../extdata/meta/disease_space.txt', dtype=str, ndmin=1)

# Keep only associations whose DiseaseID appears in the disease space; the
# unfiltered frame stays available as `protdis_raw`.
protdis = protdis_raw[protdis_raw['DiseaseID'].isin(disease_space)]

In [3]:
# Report how many distinct diseases and genes remain after filtering.
n_unique_diseases = protdis['DiseaseID'].nunique()
n_unique_genes = protdis['GeneID'].nunique()
print(f'number of unique diseases: {n_unique_diseases}')
print(f'number of unique genes: {n_unique_genes}')

number of unique diseases: 4060
number of unique genes: 8534


In [4]:
# Preview the first five rows of the filtered association table.
protdis.head(n=5)

Unnamed: 0,GeneSymbol,GeneID,DiseaseName,DiseaseID,OmimIDs,PubMedIDs
0,A,50518,Diabetes Mellitus,MESH:D003920,,1473152
1,A,50518,"Diabetes Mellitus, Type 2",MESH:D003924,,8146154
2,A,50518,Diabetic Nephropathies,MESH:D003928,,37769864
3,A,50518,Edema,MESH:D004487,,32937126
4,A,50518,Failure to Thrive,MESH:D005183,,32937126


In [5]:
# Write the filtered association table to disk, omitting the row index.
protdis.to_csv(
    path_or_buf='../../extdata/meta/CTD__genes_diseases.csv',
    index=False,
)

In [8]:
def _relation_edges(pairs, src_col, dst_col, src_type, dst_type, relation):
    """Build a de-duplicated edge table from two columns of `pairs`.

    Parameters
    ----------
    pairs : pd.DataFrame
        Table containing at least `src_col` and `dst_col`.
    src_col, dst_col : str
        Columns that become the `src` and `dst` columns of the result.
    src_type, dst_type, relation : str
        Constant values written to the `src_type`, `dst_type` and
        `relation` columns of every row.

    Returns
    -------
    pd.DataFrame
        Columns `src, dst, src_type, dst_type, relation`, one row per unique
        (src, dst) pair; the index of each pair's first occurrence is kept.
    """
    return (
        pairs[[src_col, dst_col]]
        .drop_duplicates()
        .rename(columns={src_col: 'src', dst_col: 'dst'})
        .assign(src_type=src_type, dst_type=dst_type, relation=relation)
    )


# Make sure the output directory exists so to_csv does not fail on a fresh checkout.
os.makedirs('../../extdata/relations', exist_ok=True)

# Forward edges: gene -> disease.
protdis_fwd = _relation_edges(protdis, 'GeneSymbol', 'DiseaseID', 'gene', 'disease', 'associates_fwd')
protdis_fwd.to_csv('../../extdata/relations/ctd_genes_diseases_fwd.csv', index=False)

# Reverse edges: disease -> gene (same pairs, opposite direction).
protdis_rev = _relation_edges(protdis, 'DiseaseID', 'GeneSymbol', 'disease', 'gene', 'associates_rev')
protdis_rev.to_csv('../../extdata/relations/ctd_genes_diseases_rev.csv', index=False)


In [9]:
# Preview the first five forward (gene -> disease) edges.
protdis_fwd.head(n=5)

Unnamed: 0,src,dst,src_type,dst_type,relation
0,A,MESH:D003920,gene,disease,associates_fwd
1,A,MESH:D003924,gene,disease,associates_fwd
2,A,MESH:D003928,gene,disease,associates_fwd
3,A,MESH:D004487,gene,disease,associates_fwd
4,A,MESH:D005183,gene,disease,associates_fwd
