Mapping and Merging Gene and Disease Data from HPO and Orphanet

In [31]:
import pandas as pd

Disease Mapping

In [None]:
# Gene–disease input tables: one read per preprocessed CSV.
df_gene_disease_orpha = pd.read_csv(
    filepath_or_buffer=r"...path_to\RDsqr-KG\Preprocessed_datasets\disease_gene_orphanet.csv"
)
df_gene_disease_orpha_hpo = pd.read_csv(
    filepath_or_buffer=r"...path_to\RDsqr-KG\Preprocessed_datasets\genes_to_disease_orpha_hpo.csv"
)
df_gene_disease_omim_hpo = pd.read_csv(
    filepath_or_buffer=r"...path_to\RDsqr-KG\Preprocessed_datasets\genes_to_disease_omim_hpo.csv"
)

In [None]:
# Cross-reference tables used to link disease identifiers.
df_disease_map = pd.read_csv(
    filepath_or_buffer=r"...path_to\RDsqr-KG\Preprocessed_datasets\disease_references.csv"
)
df_omim_map = pd.read_csv(
    filepath_or_buffer=r"...path_to\RDsqr-KG\Preprocessed_datasets\genes_to_phenotype_omim_hpo.csv"
)

In [34]:
# Cast every column of each loaded table to str (object dtype).
# NOTE(review): with object-dtype strings, astype(str) renders missing values as
# the literal text 'nan', which isna()/dropna() will not treat as missing —
# confirm the inputs have no gaps in the columns used downstream.
(
    df_gene_disease_orpha,
    df_gene_disease_orpha_hpo,
    df_gene_disease_omim_hpo,
    df_disease_map,
    df_omim_map,
) = [
    frame.astype(str)
    for frame in (
        df_gene_disease_orpha,
        df_gene_disease_orpha_hpo,
        df_gene_disease_omim_hpo,
        df_disease_map,
        df_omim_map,
    )
]

In [35]:
# Keep only the disease id, disease name and Ensembl gene id columns.
df_gene_disease_orpha = df_gene_disease_orpha.loc[:, ['OrphaCode', 'DisorderName', 'Ensembl']]

In [36]:
# Left-join the HPO gene–disease rows to the Orphanet table on the Orphanet disease id,
# then keep the disease id/name (from the right table) and the NCBI gene id.
df_hpo_orpha = df_gene_disease_orpha_hpo.merge(
    df_gene_disease_orpha,
    how='left',
    left_on='ORPHA_disease_id',
    right_on='OrphaCode',
)[['OrphaCode', 'DisorderName', 'ncbi_gene_id']]

In [37]:
# Preview the first row of the OMIM phenotype table.
df_omim_map.iloc[:1]

Unnamed: 0,ncbi_gene_id,gene_symbol,hpo_id,hpo_name,OMIM_disease_id
0,10,NAT2,7,Autosomal recessive inheritance,243400


In [38]:
# Preview the first row of the OMIM gene–disease table.
df_gene_disease_omim_hpo.iloc[:1]

Unnamed: 0,ncbi_gene_id,gene_symbol,OMIM_disease_id
0,64170,CARD9,212050


In [39]:
# Left-join gene–disease rows with the phenotype table on the shared OMIM id.
# Both inputs carry an 'ncbi_gene_id' column, so the left-hand one comes back as 'ncbi_gene_id_x'.
df_hpo_omim = df_gene_disease_omim_hpo.merge(
    df_omim_map,
    how='left',
    left_on='OMIM_disease_id',
    right_on='OMIM_disease_id',
)[['ncbi_gene_id_x', 'OMIM_disease_id', 'hpo_name']]

In [40]:
# df_disease_map was cast with astype(str) earlier, so OMIM ids that pandas parsed as
# floats presumably read like '212050.0'. Remove only a trailing '.0' (anchored at the
# end of the string) so a '.0' occurring elsewhere in a value is never stripped.
df_disease_map['OMIM'] = df_disease_map['OMIM'].str.replace(r'\.0$', '', regex=True)

In [41]:
# Attach the disease cross-references by matching OMIM ids; unmatched rows keep NaN
# in the columns coming from df_disease_map.
df_hpo_omim = df_hpo_omim.merge(
    df_disease_map,
    how='left',
    left_on='OMIM_disease_id',
    right_on='OMIM',
)

In [42]:
# Narrow to the gene id, OMIM id, phenotype name and the mapped disease id/name.
df_hpo_omim = df_hpo_omim.loc[
    :, ['ncbi_gene_id_x', 'OMIM', 'hpo_name', 'OrphaCode', 'DisorderName']
]

In [43]:
# Preview the first five merged rows.
df_hpo_omim.iloc[:5]

Unnamed: 0,ncbi_gene_id_x,OMIM,hpo_name,OrphaCode,DisorderName
0,64170,212050,Chronic tinea infection,457088,Predisposition to invasive fungal disease due ...
1,64170,212050,Meningitis,457088,Predisposition to invasive fungal disease due ...
2,64170,212050,Abnormal B cell count,457088,Predisposition to invasive fungal disease due ...
3,64170,212050,Autosomal recessive inheritance,457088,Predisposition to invasive fungal disease due ...
4,64170,212050,Deep dermatophytosis,457088,Predisposition to invasive fungal disease due ...


In [44]:
# Per-column count of missing values after the left joins.
df_hpo_omim.isnull().sum()

ncbi_gene_id_x        0
OMIM              88578
hpo_name            449
OrphaCode         88578
DisorderName      88578
dtype: int64

In [45]:
# Only the gene id and the mapped disease id/name are needed from here on.
df_hpo_omim = df_hpo_omim.loc[:, ['ncbi_gene_id_x', 'OrphaCode', 'DisorderName']]

Gene Mapping

In [None]:
# Gene identifier cross-reference table.
df_gene_map = pd.read_csv(
    filepath_or_buffer=r"...path_to\RDsqr-KG\Datasets\Vocabulary_links\gene_links.csv"
)

In [47]:
# Cast the NCBI id column to str so it can be joined against the string gene ids.
df_gene_map = df_gene_map.astype({'ncbi_id': str})

In [48]:
def _attach_gene_info(frame, frame_key, gene_map_key):
    """Left-join `frame` to df_gene_map and return the gene id/name plus disease id/name columns.

    `frame_key` is the join column in `frame`; `gene_map_key` is the matching
    column in df_gene_map. Rows without a match keep NaN in 'ncbi_id'/'gene_name'.
    """
    joined = frame.merge(df_gene_map, left_on=frame_key, right_on=gene_map_key, how='left')
    return joined[['ncbi_id', 'gene_name', 'OrphaCode', 'DisorderName']]


# Orphanet rows are keyed by Ensembl id; the two HPO-derived tables by NCBI gene id.
df_gdi1 = _attach_gene_info(df_gene_disease_orpha, 'Ensembl', 'Ensembl_id')
df_gdi2 = _attach_gene_info(df_hpo_orpha, 'ncbi_gene_id', 'ncbi_id')
df_gdi3 = _attach_gene_info(df_hpo_omim, 'ncbi_gene_id_x', 'ncbi_id')

In [49]:
# Stack the three gene–disease tables into one frame with a fresh 0..n-1 index.
df_gdi = pd.concat(
    objs=[df_gdi1, df_gdi2, df_gdi3],
    ignore_index=True,
)

In [51]:
# 1) Strip a leading 'NON RARE IN EUROPE:' tag (and surrounding whitespace) from disorder names.
# 2) Drop rows whose disorder name starts with 'OBSOLETE'; missing names are kept at this step.
df_gdi = (
    df_gdi
    .assign(
        DisorderName=lambda t: t['DisorderName']
        .str.replace(r'^NON RARE IN EUROPE:\s*', '', regex=True)
        .str.strip()
    )
    .loc[lambda t: ~t['DisorderName'].fillna('').str.startswith('OBSOLETE')]
)

In [52]:
# Missing values per column before incomplete rows are dropped.
df_gdi.isnull().sum()

ncbi_id          1803
gene_name        1803
OrphaCode       88578
DisorderName    88578
dtype: int64

In [53]:
# Drop incomplete gene–disease rows by reassignment rather than inplace=True:
# df_gdi comes from a boolean-mask selection in the previous cell, and an in-place
# dropna on such a frame can emit SettingWithCopyWarning.
# Cells equal to the literal string 'nan' are masked to missing first, because the
# earlier astype(str) casts turn missing source values into 'nan' text that
# dropna() alone would not remove.
df_gdi = df_gdi.mask(df_gdi.eq('nan')).dropna()

In [54]:
# Sanity check: every column should now report zero missing values.
df_gdi.isnull().sum()

ncbi_id         0
gene_name       0
OrphaCode       0
DisorderName    0
dtype: int64

In [55]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df_gdi = df_gdi.drop_duplicates()

In [56]:
# (rows, columns) of the de-duplicated gene–disease table.
df_gdi.shape

(9237, 4)

In [57]:
# Rename to head/tail columns: the gene becomes the head, the disease the tail.
df_gdi = df_gdi.rename(
    columns=dict(
        ncbi_id='head_id',
        gene_name='head_name',
        OrphaCode='tail_id',
        DisorderName='tail_name',
    )
)

In [58]:
# Constant metadata columns: node types, id namespaces and the edge predicate.
df_gdi = df_gdi.assign(
    head_type='gene',
    head_ref='NCBI',
    tail_type='disease',
    tail_ref='Orphanet',
    predicate='gdi',
)

In [59]:
# Final column order: head fields, then tail fields, then the predicate.
df_gdi = df_gdi.loc[
    :,
    [
        'head_id', 'head_name', 'head_type', 'head_ref',
        'tail_id', 'tail_name', 'tail_type', 'tail_ref',
        'predicate',
    ],
]

In [60]:
# (rows, columns) of the final edge table; the row count should match the earlier shape check.
df_gdi.shape

(9237, 9)

In [61]:
# Preview the first edge row.
df_gdi.iloc[:1]

Unnamed: 0,head_id,head_name,head_type,head_ref,tail_id,tail_name,tail_type,tail_ref,predicate
0,374654,kinesin family member 7,gene,NCBI,166024,"Multiple epiphyseal dysplasia, Al-Gazali type",disease,Orphanet,gdi


In [None]:
# Write the gene–disease edge table to CSV without the index column.
df_gdi.to_csv(
    path_or_buf=r"...path_to\RDsqr-KG\KG_datasets\gdi.csv",
    index=False,
)