In [2]:
import requests
import pandas as pd

# raw data from: https://tissues.jensenlab.org/Downloads
mouse_gene_expr = pd.read_csv("mouse_tissue_integrated_full.tsv", sep="\t", header=None)
mouse_gene_expr

Unnamed: 0,0,1,2,3,4
0,0610012D04Rik,0610012D04Rik,BTO:0000000,"tissues, cell types and enzyme sources",0.515
1,0610012D04Rik,0610012D04Rik,BTO:0000042,Animalic,0.515
2,0610012D04Rik,0610012D04Rik,BTO:0000887,Muscle,0.515
3,0610012D04Rik,0610012D04Rik,BTO:0001103,Skeletal muscle,0.515
4,0610012D04Rik,0610012D04Rik,BTO:0001369,Vertebrate muscular system,0.515
...,...,...,...,...,...
9040935,n-R5-8s1,n-R5-8s1,BTO:0003096,Internal male genital organ,0.652
9040936,n-R5-8s1,n-R5-8s1,BTO:0000570,Hematopoietic system,0.621
9040937,n-R5-8s1,n-R5-8s1,BTO:0000753,Lymphoid tissue,0.621
9040938,n-R5-8s1,n-R5-8s1,BTO:0001281,Spleen,0.621


In [3]:
# take only the ENSMUS genes
mouse_gene_expr_ensemble = mouse_gene_expr[mouse_gene_expr.index >= 62786]
mouse_gene_expr_ensemble

Unnamed: 0,0,1,2,3,4
62786,ENSMUSP00000000001,Gnai3,BTO:0000000,"tissues, cell types and enzyme sources",3.678
62787,ENSMUSP00000000001,Gnai3,BTO:0000042,Animalic,3.677
62788,ENSMUSP00000000001,Gnai3,BTO:0001489,Whole body,3.677
62789,ENSMUSP00000000001,Gnai3,BTO:0000522,Gland,3.510
62790,ENSMUSP00000000001,Gnai3,BTO:0001488,Endocrine gland,3.442
...,...,...,...,...,...
9040935,n-R5-8s1,n-R5-8s1,BTO:0003096,Internal male genital organ,0.652
9040936,n-R5-8s1,n-R5-8s1,BTO:0000570,Hematopoietic system,0.621
9040937,n-R5-8s1,n-R5-8s1,BTO:0000753,Lymphoid tissue,0.621
9040938,n-R5-8s1,n-R5-8s1,BTO:0001281,Spleen,0.621


In [4]:
mouse_gene_expr_ensemble.iloc[:, 0]

62786      ENSMUSP00000000001
62787      ENSMUSP00000000001
62788      ENSMUSP00000000001
62789      ENSMUSP00000000001
62790      ENSMUSP00000000001
                  ...        
9040935              n-R5-8s1
9040936              n-R5-8s1
9040937              n-R5-8s1
9040938              n-R5-8s1
9040939              n-R5-8s1
Name: 0, Length: 8978154, dtype: object

In [5]:
#mouse_gene_expr_ensemble["ENSMUSP" in mouse_gene_expr_ensemble.iloc[:, 0].values]
mouse_gene_expr_only_ensemble = mouse_gene_expr_ensemble[mouse_gene_expr_ensemble.iloc[:, 0].str.contains("ENSMUSP")]
mouse_gene_expr_only_ensemble

Unnamed: 0,0,1,2,3,4
62786,ENSMUSP00000000001,Gnai3,BTO:0000000,"tissues, cell types and enzyme sources",3.678
62787,ENSMUSP00000000001,Gnai3,BTO:0000042,Animalic,3.677
62788,ENSMUSP00000000001,Gnai3,BTO:0001489,Whole body,3.677
62789,ENSMUSP00000000001,Gnai3,BTO:0000522,Gland,3.510
62790,ENSMUSP00000000001,Gnai3,BTO:0001488,Endocrine gland,3.442
...,...,...,...,...,...
8677541,ENSMUSP00000159257,Carlr,BTO:0000122,Bile duct,0.507
8677542,ENSMUSP00000159257,Carlr,BTO:0000138,Midbrain,0.507
8677543,ENSMUSP00000159257,Carlr,BTO:0000164,Burkitt lymphoma cell,0.507
8677544,ENSMUSP00000159257,Carlr,BTO:0000901,Myocardium,0.506


In [6]:
mouse_gene_expr_only_ensemble = mouse_gene_expr_only_ensemble.reset_index(drop=True)
mouse_gene_expr_only_ensemble

Unnamed: 0,0,1,2,3,4
0,ENSMUSP00000000001,Gnai3,BTO:0000000,"tissues, cell types and enzyme sources",3.678
1,ENSMUSP00000000001,Gnai3,BTO:0000042,Animalic,3.677
2,ENSMUSP00000000001,Gnai3,BTO:0001489,Whole body,3.677
3,ENSMUSP00000000001,Gnai3,BTO:0000522,Gland,3.510
4,ENSMUSP00000000001,Gnai3,BTO:0001488,Endocrine gland,3.442
...,...,...,...,...,...
8614755,ENSMUSP00000159257,Carlr,BTO:0000122,Bile duct,0.507
8614756,ENSMUSP00000159257,Carlr,BTO:0000138,Midbrain,0.507
8614757,ENSMUSP00000159257,Carlr,BTO:0000164,Burkitt lymphoma cell,0.507
8614758,ENSMUSP00000159257,Carlr,BTO:0000901,Myocardium,0.506


In [7]:
#mouse_gene_expr_only_ensemble.to_csv("mouse_ensemble_gene_expression_TISSUES.tsv", sep="\t", index=None)

In [8]:
gene_names = mouse_gene_expr_only_ensemble.iloc[:, 1].tolist()
unique_mouse_genes = list(set(gene_names))
len(unique_mouse_genes)

20880

In [9]:
unique_mouse_genes[:10]

['Fbxo41',
 'Sap30l',
 'Npy5r',
 'Ppp5c',
 'Mgrn1',
 'Col7a1',
 'Olfr1246',
 'Rbm4b',
 'Dicer1',
 'Sypl2']

In [10]:
human_genes = pd.read_csv("df_human_genes_data_NCBI.tsv", sep="\t")
human_genes

Unnamed: 0,Symbol,GeneID
0,APOE,348
1,ACE,1636
2,HLA-B,3106
3,CYP19A1,1588
4,UGT1A1,54658
...,...,...
193499,LOC128966549,128966549
193500,TRN-GTT3-2,116652394
193501,TRP-AGG3-1,100189361
193502,TRW-CCA6-1,100189309


In [11]:
mouse_gene_names = list()
mouse_human_gene_names = list()
human_ortholog_entrez_gene_ids = list()
mouse_genes_not_present = list()

print("Filtering {} mouse genes".format(len(unique_mouse_genes)))

for mouse_gene in unique_mouse_genes:
    u_gene = mouse_gene.upper()
    m_human_ortho_gene_id = human_genes[human_genes["Symbol"] == u_gene]
    if len(m_human_ortho_gene_id.index) > 0:
        mouse_gene_names.append(mouse_gene)
        mouse_human_gene_names.append(u_gene)
        human_ortholog_entrez_gene_ids.append(int(m_human_ortho_gene_id["GeneID"].values[0]))
    else:
        mouse_genes_not_present.append(mouse_gene)
    
df_mouse_human_gene_ids_list = pd.DataFrame(zip(mouse_gene_names, mouse_human_gene_names, human_ortholog_entrez_gene_ids), \
                                            columns=["MouseGene", "HumanGene", "HumanOrthologEntrezID"])

Filtering 20880 mouse genes


In [18]:
df_mouse_human_gene_ids_list

Unnamed: 0,MouseGene,HumanGene,HumanOrthologEntrezID
0,Fbxo41,FBXO41,150726
1,Sap30l,SAP30L,79685
2,Npy5r,NPY5R,4889
3,Ppp5c,PPP5C,5536
4,Mgrn1,MGRN1,23295
...,...,...,...
15887,Bag1,BAG1,573
15888,Slc6a5,SLC6A5,9152
15889,Foxe1,FOXE1,2304
15890,Krt20,KRT20,54474


In [13]:
df_mouse_human_gene_ids_list.to_csv("df_mouse_human_gene_ids_list.tsv", index=None)

In [14]:
len(mouse_genes_not_present)

4988

In [15]:
mouse_gene_expr_only_ensemble

Unnamed: 0,0,1,2,3,4
0,ENSMUSP00000000001,Gnai3,BTO:0000000,"tissues, cell types and enzyme sources",3.678
1,ENSMUSP00000000001,Gnai3,BTO:0000042,Animalic,3.677
2,ENSMUSP00000000001,Gnai3,BTO:0001489,Whole body,3.677
3,ENSMUSP00000000001,Gnai3,BTO:0000522,Gland,3.510
4,ENSMUSP00000000001,Gnai3,BTO:0001488,Endocrine gland,3.442
...,...,...,...,...,...
8614755,ENSMUSP00000159257,Carlr,BTO:0000122,Bile duct,0.507
8614756,ENSMUSP00000159257,Carlr,BTO:0000138,Midbrain,0.507
8614757,ENSMUSP00000159257,Carlr,BTO:0000164,Burkitt lymphoma cell,0.507
8614758,ENSMUSP00000159257,Carlr,BTO:0000901,Myocardium,0.506


In [19]:
mouse_gene_expr_only_ensemble.to_csv("mouse_gene_expr_only_ensemble.tsv", index=None)

ValueError: Length of values (1001) does not match length of index (8614760)