In [1]:
import pandas as pd

In [2]:
all_diseases = pd.read_csv("all_diseases.tsv", sep="\t", header=None)
all_diseases

Unnamed: 0,0,1
0,"AARSKOG SYNDROME, AUTOSOMAL DOMINANT",OMIM:100050
1,"AORTIC ANEURYSM, FAMILIAL ABDOMINAL, 1",OMIM:100070
2,PRUNE BELLY SYNDROME,OMIM:100100
3,ABDUCENS PALSY,OMIM:100200
4,ADAMS-OLIVER SYNDROME 1,OMIM:100300
...,...,...
8822,,OMIM:618367
8823,,OMIM:618369
8824,,OMIM:618371
8825,,OMIM:618372


In [3]:
gene_diseases = pd.read_csv("genes_diseases.tsv", sep="\t", header=None)
gene_diseases

Unnamed: 0,0,1
0,3119,OMIM:612595
1,3265,OMIM:218040
2,9992,OMIM:608988
3,84570,OMIM:609384
4,6638,OMIM:615091
...,...,...
11760,9439,OMIM:614346
11761,10000,OMIM:615937
11762,1630,OMIM:617542
11763,1316,OMIM:608658


In [4]:
all_genes = pd.read_csv("gene_id_data_index.tsv", sep="\t", header=None)
all_genes_ids = all_genes.loc[1:, 0].tolist()
all_genes_ids = [int(item) for item in all_genes_ids]
len(all_genes_ids)

17247

In [5]:
## https://www.informatics.jax.org/downloads/reports/MGI_DO.rpt
gene2diseases_mouse = pd.read_csv("MGI_DO.tsv", sep="\t")
gene2diseases_mouse["EntrezGene ID"] = gene2diseases_mouse["EntrezGene ID"].fillna(-1)
gene2diseases_mouse["EntrezGene ID"] = gene2diseases_mouse["EntrezGene ID"].astype(int)
gene2diseases_mouse

Unnamed: 0,DO Disease ID,DO Disease Name,OMIM IDs,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID
0,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,human,9606,HSD17B3,3293,
1,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,"mouse, laboratory",10090,Hsd17b3,15487,MGI:107177
2,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,human,9606,DHTKD1,55526,
3,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,"mouse, laboratory",10090,Dhtkd1,209692,MGI:2445096
4,DOID:0050573,2-hydroxyglutaric aciduria,,human,9606,L2HGDH,79944,
...,...,...,...,...,...,...,...,...
18631,DOID:905,Zellweger syndrome,,human,9606,PHYH,5264,
18632,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex1,71382,MGI:1918632
18633,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex11b,18632,MGI:1338882
18634,DOID:0060478,Zika fever,,human,9606,STAT2,6773,


In [6]:
#gene2diseases_mouse_only_mouse = gene2diseases_mouse[gene2diseases_mouse["Common Organism Name"]]

gene2diseases_mouse["HumanOrthologEntrezID"] = gene2diseases_mouse["EntrezGene ID"]
gene2diseases_mouse

Unnamed: 0,DO Disease ID,DO Disease Name,OMIM IDs,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID,HumanOrthologEntrezID
0,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,human,9606,HSD17B3,3293,,3293
1,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,"mouse, laboratory",10090,Hsd17b3,15487,MGI:107177,15487
2,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,human,9606,DHTKD1,55526,,55526
3,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,"mouse, laboratory",10090,Dhtkd1,209692,MGI:2445096,209692
4,DOID:0050573,2-hydroxyglutaric aciduria,,human,9606,L2HGDH,79944,,79944
...,...,...,...,...,...,...,...,...,...
18631,DOID:905,Zellweger syndrome,,human,9606,PHYH,5264,,5264
18632,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex1,71382,MGI:1918632,71382
18633,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex11b,18632,MGI:1338882,18632
18634,DOID:0060478,Zika fever,,human,9606,STAT2,6773,,6773


In [7]:
df_mouse_human_gene_ids_list = pd.read_csv("df_mouse_human_gene_ids_list.tsv", sep=",")
df_mouse_human_gene_ids_list

Unnamed: 0,MouseGene,HumanGene,HumanOrthologEntrezID
0,Sgsm1,SGSM1,129049
1,Slc22a17,SLC22A17,51310
2,Mafg,MAFG,4097
3,Ccdc47,CCDC47,57003
4,Obscn,OBSCN,84033
...,...,...,...
15887,Ccdc27,CCDC27,148870
15888,Egln3,EGLN3,112399
15889,Gria1,GRIA1,2890
15890,S100a8,S100A8,6279


In [8]:
list_ortho = list()
for i, row in gene2diseases_mouse.iterrows():
    mouse_row = row["Common Organism Name"] == "mouse, laboratory"
    if mouse_row:
        selected_ortho_gene = df_mouse_human_gene_ids_list[df_mouse_human_gene_ids_list["MouseGene"] == row["Symbol"]]
        row["HumanOrthologEntrezID"] = selected_ortho_gene["HumanOrthologEntrezID"]
        if len(selected_ortho_gene["HumanOrthologEntrezID"].values) == 0:
            list_ortho.append(-1)
        else:
            list_ortho.append(selected_ortho_gene["HumanOrthologEntrezID"].values[0])
    else:
        list_ortho.append(row["EntrezGene ID"])
gene2diseases_mouse["HumanOrthologEntrezID"] = list_ortho
gene2diseases_mouse

Unnamed: 0,DO Disease ID,DO Disease Name,OMIM IDs,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID,HumanOrthologEntrezID
0,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,human,9606,HSD17B3,3293,,3293
1,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,"mouse, laboratory",10090,Hsd17b3,15487,MGI:107177,3293
2,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,human,9606,DHTKD1,55526,,55526
3,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,"mouse, laboratory",10090,Dhtkd1,209692,MGI:2445096,55526
4,DOID:0050573,2-hydroxyglutaric aciduria,,human,9606,L2HGDH,79944,,79944
...,...,...,...,...,...,...,...,...,...
18631,DOID:905,Zellweger syndrome,,human,9606,PHYH,5264,,5264
18632,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex1,71382,MGI:1918632,5189
18633,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex11b,18632,MGI:1338882,8799
18634,DOID:0060478,Zika fever,,human,9606,STAT2,6773,,6773


In [9]:
gene2diseases_mouse.to_csv("gene2diseases_mouse_human_ortholog_entrez.tsv", index=None)

In [10]:
#mouse_human_gene_ids = list(set(gene2diseases_mouse["EntrezGene ID"].tolist()))
mouse_human_gene_ids = list(set(gene2diseases_mouse["HumanOrthologEntrezID"].tolist()))
mouse_human_gene_ids = [int(item) for item in mouse_human_gene_ids]
len(mouse_human_gene_ids)

5546

In [11]:
human_genes = all_genes_ids #list(set(gene_diseases.loc[:, 0].tolist()))
len(human_genes)

17247

In [12]:
gene_ids_not_present = list(set(mouse_human_gene_ids).difference(set(human_genes)))
len(gene_ids_not_present)

203

In [13]:
all_disease_names = all_diseases.loc[:, 1].tolist()
all_disease_OMIM = list(set([int(item.split(":")[1]) for item in all_disease_names]))
len(all_disease_OMIM), all_disease_names[:5], all_disease_OMIM[:5]

(8827,
 ['OMIM:100050', 'OMIM:100070', 'OMIM:100100', 'OMIM:100200', 'OMIM:100300'],
 [163850, 229400, 131100, 262190, 163950])

In [14]:
#gene2diseases_mouse_filtered = gene2diseases_mouse[~gene2diseases_mouse["EntrezGene ID"].isin(gene_ids_not_present)]
gene2diseases_mouse_filtered = gene2diseases_mouse[~gene2diseases_mouse["HumanOrthologEntrezID"].isin(gene_ids_not_present)]
gene2diseases_mouse_filtered

Unnamed: 0,DO Disease ID,DO Disease Name,OMIM IDs,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID,HumanOrthologEntrezID
0,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,human,9606,HSD17B3,3293,,3293
1,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,"mouse, laboratory",10090,Hsd17b3,15487,MGI:107177,3293
2,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,human,9606,DHTKD1,55526,,55526
3,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,"mouse, laboratory",10090,Dhtkd1,209692,MGI:2445096,55526
4,DOID:0050573,2-hydroxyglutaric aciduria,,human,9606,L2HGDH,79944,,79944
...,...,...,...,...,...,...,...,...,...
18631,DOID:905,Zellweger syndrome,,human,9606,PHYH,5264,,5264
18632,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex1,71382,MGI:1918632,5189
18633,DOID:905,Zellweger syndrome,,"mouse, laboratory",10090,Pex11b,18632,MGI:1338882,8799
18634,DOID:0060478,Zika fever,,human,9606,STAT2,6773,,6773


In [15]:
gene2diseases_mouse_filtered["OMIM IDs"] = gene2diseases_mouse_filtered["OMIM IDs"].fillna(-1)
#gene2diseases_mouse_filtered["OMIM IDs"] = gene2diseases_mouse_filtered["OMIM IDs"].astype(int)
gene2diseases_mouse_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene2diseases_mouse_filtered["OMIM IDs"] = gene2diseases_mouse_filtered["OMIM IDs"].fillna(-1)


Unnamed: 0,DO Disease ID,DO Disease Name,OMIM IDs,Common Organism Name,NCBI Taxon ID,Symbol,EntrezGene ID,Mouse MGI ID,HumanOrthologEntrezID
0,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,human,9606,HSD17B3,3293,,3293
1,DOID:0112248,17-beta hydroxysteroid dehydrogenase 3 deficiency,OMIM:264300,"mouse, laboratory",10090,Hsd17b3,15487,MGI:107177,3293
2,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,human,9606,DHTKD1,55526,,55526
3,DOID:0111453,2-aminoadipic 2-oxoadipic aciduria,OMIM:204750,"mouse, laboratory",10090,Dhtkd1,209692,MGI:2445096,55526
4,DOID:0050573,2-hydroxyglutaric aciduria,-1,human,9606,L2HGDH,79944,,79944
...,...,...,...,...,...,...,...,...,...
18631,DOID:905,Zellweger syndrome,-1,human,9606,PHYH,5264,,5264
18632,DOID:905,Zellweger syndrome,-1,"mouse, laboratory",10090,Pex1,71382,MGI:1918632,5189
18633,DOID:905,Zellweger syndrome,-1,"mouse, laboratory",10090,Pex11b,18632,MGI:1338882,8799
18634,DOID:0060478,Zika fever,-1,human,9606,STAT2,6773,,6773


In [16]:
#gene2diseases_mouse_filtered = gene2diseases_mouse_filtered[["EntrezGene ID", "OMIM IDs"]]
gene2diseases_mouse_filtered = gene2diseases_mouse_filtered[["HumanOrthologEntrezID", "OMIM IDs"]]
gene2diseases_mouse_filtered

Unnamed: 0,HumanOrthologEntrezID,OMIM IDs
0,3293,OMIM:264300
1,3293,OMIM:264300
2,55526,OMIM:204750
3,55526,OMIM:204750
4,79944,-1
...,...,...
18631,5264,-1
18632,5189,-1
18633,8799,-1
18634,6773,-1


In [17]:
#gene2diseases_zebrafish_genes_OMIMs["OMIM_ID"] =  'OMIM:' + gene2diseases_zebrafish_genes_OMIMs['OMIM_ID'].astype(str)
#gene2diseases_zebrafish_genes_OMIMs
gene2diseases_mouse_filtered = gene2diseases_mouse_filtered[~(gene2diseases_mouse_filtered["OMIM IDs"] == -1)]
gene2diseases_mouse_filtered["HumanOrthologEntrezID"] = [int(item) for item in \
                                                         gene2diseases_mouse_filtered["HumanOrthologEntrezID"].tolist()]
gene2diseases_mouse_filtered["OMIM IDs"] = [str(item) for item in \
                                                         gene2diseases_mouse_filtered["OMIM IDs"].tolist()]
gene2diseases_mouse_filtered = gene2diseases_mouse_filtered.drop_duplicates()
gene2diseases_mouse_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene2diseases_mouse_filtered["HumanOrthologEntrezID"] = [int(item) for item in \
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gene2diseases_mouse_filtered["OMIM IDs"] = [str(item) for item in \


Unnamed: 0,HumanOrthologEntrezID,OMIM IDs
0,3293,OMIM:264300
2,55526,OMIM:204750
6,5648,OMIM:257920
7,78989,OMIM:265050
8,10584,OMIM:248340
...,...,...
18622,90665,OMIM:400047
18623,8287,OMIM:415000
18624,55753,OMIM:619701
18625,9896,OMIM:216340


In [18]:
import numpy as np
mouse_OMIM_ID = list(set(gene2diseases_mouse_filtered["OMIM IDs"].tolist()))
#mouse_OMIM_ID = [int() for item in mouse_OMIM_ID]
len(mouse_OMIM_ID), mouse_OMIM_ID[:5]

(4318,
 ['OMIM:614749', 'OMIM:617276', 'OMIM:193230', 'OMIM:125851', 'OMIM:618955'])

In [19]:
disease_ids_not_present = list(set(mouse_OMIM_ID).difference(set(all_disease_names)))
len(disease_ids_not_present), disease_ids_not_present[:5]

(537,
 ['OMIM:114580|OMIM:212050|OMIM:607644|OMIM:613108|OMIM:613956|OMIM:614162|OMIM:615527|OMIM:616445',
  'OMIM:618955',
  'OMIM:620452',
  'OMIM:153700|OMIM:153840|OMIM:608161|OMIM:616151|OMIM:616152',
  'OMIM:248200|OMIM:600110|OMIM:603786'])

In [20]:
len(all_disease_OMIM), all_disease_OMIM[:10], len(all_disease_names), all_disease_names[:10]

(8827,
 [163850,
  229400,
  131100,
  262190,
  163950,
  229500,
  131200,
  262300,
  164000,
  229600],
 8827,
 ['OMIM:100050',
  'OMIM:100070',
  'OMIM:100100',
  'OMIM:100200',
  'OMIM:100300',
  'OMIM:100600',
  'OMIM:100675',
  'OMIM:100700',
  'OMIM:100800',
  'OMIM:100820'])

In [21]:
gene2diseases_mouse_filtered = gene2diseases_mouse_filtered[~gene2diseases_mouse_filtered["OMIM IDs"].isin(disease_ids_not_present)]
gene2diseases_mouse_filtered

Unnamed: 0,HumanOrthologEntrezID,OMIM IDs
0,3293,OMIM:264300
2,55526,OMIM:204750
6,5648,OMIM:257920
7,78989,OMIM:265050
8,10584,OMIM:248340
...,...,...
18618,2623,OMIM:314050
18619,2158,OMIM:300807
18620,7547,OMIM:314390
18623,8287,OMIM:415000


In [22]:
gene2diseases_mouse_filtered.columns = range(gene2diseases_mouse_filtered.shape[1])
gene2diseases_mouse_filtered
#gene2diseases_zebrafish_genes_OMIMs_nh.to_csv("gene2diseases_zebrafish_genes_OMIMs.tsv", index=None)

Unnamed: 0,0,1
0,3293,OMIM:264300
2,55526,OMIM:204750
6,5648,OMIM:257920
7,78989,OMIM:265050
8,10584,OMIM:248340
...,...,...
18618,2623,OMIM:314050
18619,2158,OMIM:300807
18620,7547,OMIM:314390
18623,8287,OMIM:415000


In [23]:
unique_gene2diseases_mouse_genes_OMIMs = gene2diseases_mouse_filtered.drop_duplicates()
unique_gene2diseases_mouse_genes_OMIMs

Unnamed: 0,0,1
0,3293,OMIM:264300
2,55526,OMIM:204750
6,5648,OMIM:257920
7,78989,OMIM:265050
8,10584,OMIM:248340
...,...,...
18618,2623,OMIM:314050
18619,2158,OMIM:300807
18620,7547,OMIM:314390
18623,8287,OMIM:415000


In [24]:
mouse_gene_ids = list()
mouse_omim_ids = list()

for i, row in unique_gene2diseases_mouse_genes_OMIMs.iterrows():
    gene_id = row[0]
    omimids = row[1]
    s_omimids = omimids.split("|")
    if len(s_omimids) > 1:
        #print(gene_id, omimids)
        for oids in s_omimids:
            mouse_gene_ids.append(gene_id)
            mouse_omim_ids.append(oids)
    else:
        mouse_gene_ids.append(gene_id)
        mouse_omim_ids.append(omimids)
inflated_mouse_gene_omimid = pd.DataFrame(zip(mouse_gene_ids, mouse_omim_ids))
inflated_mouse_gene_omimid

Unnamed: 0,0,1
0,3293,OMIM:264300
1,55526,OMIM:204750
2,5648,OMIM:257920
3,78989,OMIM:265050
4,10584,OMIM:248340
...,...,...
7079,2623,OMIM:314050
7080,2158,OMIM:300807
7081,7547,OMIM:314390
7082,8287,OMIM:415000


In [25]:
unique_inflated_mouse_gene_omimid = inflated_mouse_gene_omimid.drop_duplicates()
unique_inflated_mouse_gene_omimid

Unnamed: 0,0,1
0,3293,OMIM:264300
1,55526,OMIM:204750
2,5648,OMIM:257920
3,78989,OMIM:265050
4,10584,OMIM:248340
...,...,...
7079,2623,OMIM:314050
7080,2158,OMIM:300807
7081,7547,OMIM:314390
7082,8287,OMIM:415000


In [26]:
unique_human_gene_diseases = gene_diseases.drop_duplicates()
unique_human_gene_diseases

Unnamed: 0,0,1
0,3119,OMIM:612595
1,3265,OMIM:218040
2,9992,OMIM:608988
3,84570,OMIM:609384
4,6638,OMIM:615091
...,...,...
11760,9439,OMIM:614346
11761,10000,OMIM:615937
11762,1630,OMIM:617542
11763,1316,OMIM:608658


In [27]:
unique_inflated_mouse_gene_omimid.loc[:, 0] = unique_inflated_mouse_gene_omimid.loc[:, 0].astype(int)
unique_inflated_mouse_gene_omimid.loc[:, 1] = unique_inflated_mouse_gene_omimid.loc[:, 1].astype(str)

unique_inflated_mouse_gene_omimid

Unnamed: 0,0,1
0,3293,OMIM:264300
1,55526,OMIM:204750
2,5648,OMIM:257920
3,78989,OMIM:265050
4,10584,OMIM:248340
...,...,...
7079,2623,OMIM:314050
7080,2158,OMIM:300807
7081,7547,OMIM:314390
7082,8287,OMIM:415000


In [28]:
unique_human_gene_diseases.loc[:, 0] = unique_human_gene_diseases.loc[:, 0].astype(int)
unique_human_gene_diseases.loc[:, 1] = unique_human_gene_diseases.loc[:, 1].astype(str)
unique_human_gene_diseases

Unnamed: 0,0,1
0,3119,OMIM:612595
1,3265,OMIM:218040
2,9992,OMIM:608988
3,84570,OMIM:609384
4,6638,OMIM:615091
...,...,...
11760,9439,OMIM:614346
11761,10000,OMIM:615937
11762,1630,OMIM:617542
11763,1316,OMIM:608658


In [29]:
concat_pd = pd.concat([unique_inflated_mouse_gene_omimid, unique_human_gene_diseases], axis=0)
concat_pd

Unnamed: 0,0,1
0,3293,OMIM:264300
1,55526,OMIM:204750
2,5648,OMIM:257920
3,78989,OMIM:265050
4,10584,OMIM:248340
...,...,...
11760,9439,OMIM:614346
11761,10000,OMIM:615937
11762,1630,OMIM:617542
11763,1316,OMIM:608658


In [30]:
only_in_mouse_gene_id = list()
only_in_mouse_omim_id = list()

for i, row in unique_inflated_mouse_gene_omimid.iterrows():
    gene_id = row[0]
    omim_id = row[1]
    common_mouse_human = unique_human_gene_diseases[(unique_human_gene_diseases.loc[:, 0] == gene_id) & (unique_human_gene_diseases.loc[:, 1] == omim_id)]
    if len(common_mouse_human.index) > 0:
        continue
    else:
        only_in_mouse_gene_id.append(gene_id)
        only_in_mouse_omim_id.append(omim_id)
        
df_only_in_mouse = pd.DataFrame(zip(only_in_mouse_gene_id, only_in_mouse_omim_id))
df_only_in_mouse

Unnamed: 0,0,1
0,57647,OMIM:607080
1,3952,OMIM:605552
2,7433,OMIM:200400
3,3119,OMIM:200400
4,8086,OMIM:200400
...,...,...
3125,657,OMIM:194200
3126,2067,OMIM:610965
3127,5295,OMIM:300755
3128,50814,OMIM:302950


In [31]:
df_only_in_mouse.to_csv("df_only_in_mouse.tsv", sep="\t", index=None)

In [32]:
df_only_in_mouse[df_only_in_mouse.loc[:, 0] == 5295]

Unnamed: 0,0,1
1125,5295,OMIM:608089
1651,5295,OMIM:182280
3127,5295,OMIM:300755


In [33]:
unique_human_gene_diseases[unique_human_gene_diseases.loc[:, 0] == 5295]

Unnamed: 0,0,1
2202,5295,OMIM:612692
2609,5295,OMIM:615214
5025,5295,OMIM:616005
5090,5295,OMIM:269880
5488,5295,OMIM:613502
6362,5295,OMIM:613501
8571,5295,OMIM:601495
9838,5295,OMIM:613500
