In [1]:
import pandas as pd

In [2]:
# Source of the human-mouse homology report read below:
#   https://www.informatics.jax.org/homology.shtml
#   https://www.informatics.jax.org/downloads/reports/HMD_HumanPhenotype.rpt
# The local copy is parsed as tab-separated with no header row, so the
# columns receive integer labels (0, 1, 2, ...).
mouse_human_orthologs = pd.read_csv(
    "HMD_HumanPhenotype.rpt.txt",
    sep="\t",
    header=None,
)
mouse_human_orthologs

Unnamed: 0,0,1,2,3,4,5
0,A1BG,1,A1bg,MGI:2152878,,
1,A1CF,29974,A1cf,MGI:1917115,"MP:0005367, MP:0005369, MP:0005370, MP:0005376...",
2,A2M,2,A2m,MGI:2449119,,
3,A3GALT2,127550,A3galt2,MGI:2685279,,
4,A4GALT,53947,A4galt,MGI:3512453,"MP:0005376, MP:0005386, MP:0010768",
...,...,...,...,...,...,...
29594,ZYG11A,440590,Zyg11a,MGI:2446208,,
29595,ZYG11B,79699,Zyg11b,MGI:2685277,"MP:0005386, MP:0010768",
29596,ZYX,7791,Zyx,MGI:103072,MP:0005384,
29597,ZZEF1,23140,Zzef1,MGI:2444286,"MP:0003631, MP:0005367, MP:0005376, MP:0005378...",


In [3]:
# Keep only the first three columns. Judging by the preview and by how they
# are named later, these are: 0 = human gene symbol, 1 = human Entrez gene ID,
# 2 = mouse gene symbol.
mouse_human_orthologs = mouse_human_orthologs.loc[:, [0, 1, 2]]
mouse_human_orthologs

Unnamed: 0,0,1,2
0,A1BG,1,A1bg
1,A1CF,29974,A1cf
2,A2M,2,A2m
3,A3GALT2,127550,A3galt2
4,A4GALT,53947,A4galt
...,...,...,...
29594,ZYG11A,440590,Zyg11a
29595,ZYG11B,79699,Zyg11b
29596,ZYX,7791,Zyx
29597,ZZEF1,23140,Zzef1


In [4]:
# Pull each retained column out as its own Series; the columns are labelled
# by their integer position because the file was read without a header.
human_genes = mouse_human_orthologs[0]
human_gene_ids = mouse_human_orthologs[1]
mouse_genes = mouse_human_orthologs[2]

In [5]:
# Reassemble the three Series row by row into a frame with descriptive column
# names, putting the mouse gene symbol first.
df_mouse_human_gene_ids_list = pd.DataFrame(
    list(zip(mouse_genes, human_genes, human_gene_ids)),
    columns=["MouseGene", "HumanGene", "HumanOrthologEntrezID"],
)
df_mouse_human_gene_ids_list

Unnamed: 0,MouseGene,HumanGene,HumanOrthologEntrezID
0,A1bg,A1BG,1
1,A1cf,A1CF,29974
2,A2m,A2M,2
3,A3galt2,A3GALT2,127550
4,A4galt,A4GALT,53947
...,...,...,...
29594,Zyg11a,ZYG11A,440590
29595,Zyg11b,ZYG11B,79699
29596,Zyx,ZYX,7791
29597,Zzef1,ZZEF1,23140


In [7]:
# Persist the ortholog lookup table to disk without the row index.
# NOTE(review): the file carries a .tsv extension but is written
# comma-separated -- confirm that downstream readers expect commas.
df_mouse_human_gene_ids_list.to_csv(
    "df_mouse_human_gene_ids_list.tsv",
    sep=",",
    index=False,
)
df_mouse_human_gene_ids_list

Unnamed: 0,MouseGene,HumanGene,HumanOrthologEntrezID
0,A1bg,A1BG,1
1,A1cf,A1CF,29974
2,A2m,A2M,2
3,A3galt2,A3GALT2,127550
4,A4galt,A4GALT,53947
...,...,...,...
29594,Zyg11a,ZYG11A,440590
29595,Zyg11b,ZYG11B,79699
29596,Zyx,ZYX,7791
29597,Zzef1,ZZEF1,23140


In [10]:
# Load the mouse gene interaction pairs (tab-separated, first line is the
# header). Each row holds a protein ID, gene ID and gene name for both
# interaction partners, as shown in the preview below.
mouse_gene_interactions = pd.read_csv(
    "mouse_gene_interactions_STRING_v12.tsv",
    sep="\t",
)
mouse_gene_interactions

Unnamed: 0,Prot1ID,Gene1ID,Gene1Name,Prot2ID,Gene2ID,Gene2Name
0,ENSMUSP00000000001,14679.0,Gnai3,ENSMUSP00000027991,19736.0,Rgs4
1,ENSMUSP00000000001,14679.0,Gnai3,ENSMUSP00000075170,13489.0,Drd2
2,ENSMUSP00000000001,14679.0,Gnai3,ENSMUSP00000121127,14696.0,Gnb4
3,ENSMUSP00000000001,14679.0,Gnai3,ENSMUSP00000081569,50780.0,Rgs3
4,ENSMUSP00000000001,14679.0,Gnai3,ENSMUSP00000025541,14682.0,Gnaq
...,...,...,...,...,...,...
396697,ENSMUSP00000159241,100042165.0,Thoc2l,ENSMUSP00000080242,56009.0,Alyref2
396698,ENSMUSP00000159241,100042165.0,Thoc2l,ENSMUSP00000124205,60532.0,Wtap
396699,ENSMUSP00000159241,100042165.0,Thoc2l,ENSMUSP00000038137,386612.0,Thoc6
396700,ENSMUSP00000159241,100042165.0,Thoc2l,ENSMUSP00000065819,66231.0,Thoc7


In [12]:
# Translate every mouse interaction pair into human ortholog Entrez IDs,
# keeping only the pairs where BOTH gene symbols have an ortholog entry.
#
# The ortholog table is turned into a dict once, so each interaction row costs
# two O(1) lookups instead of two full scans of the ortholog frame.

# Mouse symbol -> human Entrez ID. `setdefault` keeps the FIRST occurrence of
# a duplicated symbol, which is what taking `.values[0]` of a filtered frame
# yields. Missing symbols are skipped so NaN can never become a key (an `==`
# comparison against NaN never matches either).
human_id_by_mouse_gene = {}
for mouse_symbol, human_id in zip(
    df_mouse_human_gene_ids_list["MouseGene"].values,
    df_mouse_human_gene_ids_list["HumanOrthologEntrezID"].values,
):
    if pd.notna(mouse_symbol):
        human_id_by_mouse_gene.setdefault(mouse_symbol, human_id)

in_gene_ID = list()
out_gene_ID = list()
in_gene_name = list()
out_gene_name = list()

# Columns are addressed by position: 2 is the first partner's gene name and
# 5 is the second partner's gene name.
gene_name_pairs = zip(
    mouse_gene_interactions.iloc[:, 2],
    mouse_gene_interactions.iloc[:, 5],
)

# `i` is the row position; it equals the index label for the default
# 0..n-1 index that read_csv produces.
for i, (gene_in, gene_out) in enumerate(gene_name_pairs):
    if gene_in in human_id_by_mouse_gene and gene_out in human_id_by_mouse_gene:
        in_gene_ID.append(human_id_by_mouse_gene[gene_in])
        out_gene_ID.append(human_id_by_mouse_gene[gene_out])
        in_gene_name.append(gene_in)
        out_gene_name.append(gene_out)
    if i % 100000 == 0 and i > 0:
        print("{} rows processed".format(i))

df_mouse_gene_interactions = pd.DataFrame(
    zip(in_gene_name, in_gene_ID, out_gene_name, out_gene_ID),
    columns=["GeneInName", "GeneInID", "GeneOutName", "GeneOutID"],
)
df_mouse_gene_interactions

100000 rows processed
200000 rows processed
300000 rows processed


Unnamed: 0,GeneInName,GeneInID,GeneOutName,GeneOutID
0,Gnai3,2773,Rgs4,5999
1,Gnai3,2773,Drd2,1813
2,Gnai3,2773,Gnb4,59345
3,Gnai3,2773,Rgs3,5998
4,Gnai3,2773,Gnaq,2776
...,...,...,...,...
371427,Thoc2l,57187,Alyref2,10189
371428,Thoc2l,57187,Wtap,9589
371429,Thoc2l,57187,Thoc6,79228
371430,Thoc2l,57187,Thoc7,80145


In [13]:
# Write the translated interaction pairs to disk without the row index.
# No separator is passed, so pandas' default (a comma) is used even though
# the file name ends in .tsv.
df_mouse_gene_interactions.to_csv(
    "mouse_gene_interactions_human_entrez_id.tsv",
    index=False,
)

In [8]:
# Load the mouse expression table. Despite the .tsv extension it is parsed as
# comma-separated. Per the preview, each row carries a protein ID, a gene name,
# a BTO term ID, a tissue / cell-type name and a numeric score.
mouse_gene_expr_only_ensemble = pd.read_csv(
    "mouse_gene_expr_only_ensemble.tsv",
    sep=",",
)
mouse_gene_expr_only_ensemble

Unnamed: 0,0,1,2,3,4
0,ENSMUSP00000000001,Gnai3,BTO:0000000,"tissues, cell types and enzyme sources",3.678
1,ENSMUSP00000000001,Gnai3,BTO:0000042,Animalic,3.677
2,ENSMUSP00000000001,Gnai3,BTO:0001489,Whole body,3.677
3,ENSMUSP00000000001,Gnai3,BTO:0000522,Gland,3.510
4,ENSMUSP00000000001,Gnai3,BTO:0001488,Endocrine gland,3.442
...,...,...,...,...,...
8614755,ENSMUSP00000159257,Carlr,BTO:0000122,Bile duct,0.507
8614756,ENSMUSP00000159257,Carlr,BTO:0000138,Midbrain,0.507
8614757,ENSMUSP00000159257,Carlr,BTO:0000164,Burkitt lymphoma cell,0.507
8614758,ENSMUSP00000159257,Carlr,BTO:0000901,Myocardium,0.506


In [9]:
#mouse_gene_expr_only_ensemble[mouse_gene_expr_only_ensemble.iloc[:, 1] == "Actr2"]

In [9]:
# Display the mouse -> human ortholog lookup table again for reference; the
# cell has no other effect.
df_mouse_human_gene_ids_list

Unnamed: 0,MouseGene,HumanGene,HumanOrthologEntrezID
0,A1bg,A1BG,1
1,A1cf,A1CF,29974
2,A2m,A2M,2
3,A3galt2,A3GALT2,127550
4,A4galt,A4GALT,53947
...,...,...,...
29594,Zyg11a,ZYG11A,440590
29595,Zyg11b,ZYG11B,79699
29596,Zyx,ZYX,7791
29597,Zzef1,ZZEF1,23140


In [10]:
#df_mouse_human_gene_ids_list[df_mouse_human_gene_ids_list["MouseGene"] == "Actr2"]["HumanOrthologEntrezID"].values[0]

In [10]:
# Re-key the mouse expression table by human ortholog Entrez ID, dropping
# every row whose mouse gene symbol has no entry in the ortholog table.
#
# The ortholog table is turned into a dict once, so each of the millions of
# expression rows costs one O(1) lookup instead of a full scan of the
# ortholog frame.

# Mouse symbol -> human Entrez ID. `setdefault` keeps the FIRST occurrence of
# a duplicated symbol, which is what taking `.values[0]` of a filtered frame
# yields. Missing symbols are skipped so NaN can never become a key (an `==`
# comparison against NaN never matches either).
human_id_by_mouse_gene = {}
for mouse_symbol, human_id in zip(
    df_mouse_human_gene_ids_list["MouseGene"].values,
    df_mouse_human_gene_ids_list["HumanOrthologEntrezID"].values,
):
    if pd.notna(mouse_symbol):
        human_id_by_mouse_gene.setdefault(mouse_symbol, human_id)

mouse_gene_ids = list()
mouse_gene_names = list()
mouse_organ_names = list()
mouse_gene_expression = list()

# Columns are addressed by position: 1 = gene name, 3 = tissue / cell-type
# name, 4 = expression score.
expression_rows = zip(
    mouse_gene_expr_only_ensemble.iloc[:, 1],
    mouse_gene_expr_only_ensemble.iloc[:, 3],
    mouse_gene_expr_only_ensemble.iloc[:, 4],
)

# `i` is the row position; it equals the index label for the default
# 0..n-1 index that read_csv produces.
for i, (gene_name, organ_name, expression) in enumerate(expression_rows):
    if gene_name in human_id_by_mouse_gene:
        mouse_gene_ids.append(human_id_by_mouse_gene[gene_name])
        mouse_gene_names.append(gene_name)
        mouse_organ_names.append(organ_name)
        mouse_gene_expression.append(expression)
    if i % 500000 == 0 and i > 0:
        print("{} rows processed".format(i))

df_mouse_gene_expression_human_entrez_ids = pd.DataFrame(
    zip(mouse_gene_names, mouse_gene_ids, mouse_organ_names, mouse_gene_expression),
    columns=["GeneName", "GeneId", "OrganName", "Expression"],
)
df_mouse_gene_expression_human_entrez_ids

500000 rows processed
1000000 rows processed
1500000 rows processed
2000000 rows processed
2500000 rows processed
3000000 rows processed
3500000 rows processed
4000000 rows processed
4500000 rows processed
5000000 rows processed
5500000 rows processed
6000000 rows processed
6500000 rows processed
7000000 rows processed
7500000 rows processed
8000000 rows processed
8500000 rows processed


Unnamed: 0,GeneName,GeneId,OrganName,Expression
0,Gnai3,2773,"tissues, cell types and enzyme sources",3.678
1,Gnai3,2773,Animalic,3.677
2,Gnai3,2773,Whole body,3.677
3,Gnai3,2773,Gland,3.510
4,Gnai3,2773,Endocrine gland,3.442
...,...,...,...,...
8193331,Thoc2l,57187,Lacrimal gland acinar cell,0.087
8193332,Thoc2l,57187,Hypophysis,0.083
8193333,Thoc2l,57187,Osteoclast,0.032
8193334,Thoc2l,57187,Diencephalon,0.031


In [11]:
#df_mouse_human_gene_ids_list[df_mouse_human_gene_ids_list["MouseGene"] == "Klf6"]
# Order the rows by GeneId (the human ortholog Entrez ID) and write the result
# to disk without the row index. No separator is passed, so pandas' default
# (a comma) is used even though the file name ends in .tsv.
df_mouse_gene_expression_human_entrez_ids = (
    df_mouse_gene_expression_human_entrez_ids.sort_values(by="GeneId")
)
df_mouse_gene_expression_human_entrez_ids.to_csv(
    "mouse_gene_expression_human_entrez_ids.tsv",
    index=False,
)