In [1]:
import os, glob
import numpy as np
import pandas as pd
from pybiomart import Server
# Connection settings for the BioMart service queried by this notebook.
ENSEMBL_HOST = 'http://www.ensembl.org'
ENSEMBL_MART = 'ENSEMBL_MART_ENSEMBL'

# `mart` is the handle the later cells use to look up per-species datasets.
server = Server(host = ENSEMBL_HOST)
mart = server.marts[ENSEMBL_MART]

In [2]:
# BioMart dataset to query for each species.
dataset_names = {
    'mmusculus' : 'mmusculus_gene_ensembl',
    'hsapiens' : 'hsapiens_gene_ensembl'
    }

# For each species: BioMart attribute name -> column label used in this notebook.
# The first entry is the species' own gene name, the second the homolog's gene
# name in the other species.
attr_dict = {
    'mmusculus' : {
        'external_gene_name'                    : 'mmusculus',
        'hsapiens_homolog_associated_gene_name' : 'hsapiens'
        },
    'hsapiens' : {
        'external_gene_name'                     : 'hsapiens',
        'mmusculus_homolog_associated_gene_name' : 'mmusculus'
        }
    }

# species -> DataFrame with one row per unique (own gene name, homolog gene name)
# pair. Built in a fresh dict so that `dataset_names` keeps holding plain strings
# and `mart_dict` only ever holds DataFrames.
mart_dict = {}
for species, dataset_name in dataset_names.items():
    column_labels = attr_dict[species]
    # use_attr_names = True makes the result columns carry the attribute names,
    # so they can be relabelled by name instead of by position; a positional
    # assignment would silently swap the species columns if the order differed.
    homologs = mart.datasets[dataset_name].query(
        attributes = list(column_labels),
        use_attr_names = True,
        )
    homologs = homologs.rename(columns = column_labels)
    homologs = homologs[list(column_labels.values())]
    # Drop rows with a missing name on either side, then exact duplicate pairs.
    homologs = homologs.dropna().drop_duplicates()
    mart_dict[species] = homologs
    print(species, homologs)

mmusculus       mmusculus hsapiens
5        mt-Nd1   MT-ND1
9        mt-Nd2   MT-ND2
15       mt-Co1   MT-CO1
18       mt-Co2   MT-CO2
20      mt-Atp8  MT-ATP8
...         ...      ...
60521      Adnp     ADNP
60523      Dpm1     DPM1
60524     Mocs3    MOCS3
60526     Kcng1    KCNG1
60531     Or1q1    OR1Q1

[24419 rows x 2 columns]
hsapiens       hsapiens mmusculus
5       MT-ND1    mt-Nd1
9       MT-ND2    mt-Nd2
15      MT-CO1    mt-Co1
18      MT-CO2    mt-Co2
20     MT-ATP8   mt-Atp8
...        ...       ...
48217       U6   Gm22845
48221     AGRN      Agrn
48222    ASTN1     Astn1
48223   MIR488    Mir488
48227    CLCN6     Clcn6

[24419 rows x 2 columns]


In [6]:
# Dataset summary table; its index is used as the dataset key and its 'species'
# column selects which homolog table in `mart_dict` to use.
summary_fn = os.path.join('..', '..', 'data', 'summary.csv')
summary_df = pd.read_csv(summary_fn, index_col = 0)
species_dict = summary_df['species'].to_dict()

# One signature file per results sub-directory; the file stem is the dataset key.
signature_fn = sorted(glob.glob(os.path.join('..', 'results', '*', '*.txt')))

# dataset key -> homolog table restricted to that dataset's signature genes.
signature_dict = {}
for fn in signature_fn:
    # splitext removes only the trailing extension, whereas
    # str.replace('.txt', '') would also strip '.txt' from inside the name.
    key = os.path.splitext(os.path.basename(fn))[0]
    # ndmin = 1: a file holding a single gene would otherwise load as a 0-d
    # array, which Series.isin rejects as not list-like.
    signature = np.loadtxt(fn, dtype = str, ndmin = 1)
    species = species_dict[key]
    homologs = mart_dict[species]
    # Boolean-mask selection returns a new frame, so `mart_dict` is not modified.
    df = homologs.loc[homologs[species].isin(signature)].reset_index(drop = True)
    # Write the per-dataset table next to the signature file it was built from.
    outdir = os.path.dirname(fn)
    df.to_csv(os.path.join(outdir, 'biomart.csv'), index = False)
    signature_dict[key] = df
    print(key, df)

GSE136689    mmusculus hsapiens
0      Map1b    MAP1B
1       Irx1     IRX1
2        Id2      ID2
3     Homer2   HOMER2
4        Clu      CLU
..       ...      ...
67     Aldoa    ALDOA
68       Mmd      MMD
69      Peg3     PEG3
70   Slc16a3  SLC16A3
71    Mpped2   MPPED2

[72 rows x 2 columns]
GSE162534     mmusculus  hsapiens
0        Cdh2      CDH2
1       Greb1     GREB1
2        Sqle      SQLE
3    Tnfrsf19  TNFRSF19
4       Rspo3     RSPO3
..        ...       ...
201      Cst3      CST3
202    Mogat2    MOGAT2
203  Serpinh1  SERPINH1
204       Sp5       SP5
205      Peg3      PEG3

[206 rows x 2 columns]
GSE201257     mmusculus hsapiens
0     Phactr1  PHACTR1
1       Vcam1    VCAM1
2         Fos      FOS
3     Ppp1r1a  PPP1R1A
4     Spata13  SPATA13
..        ...      ...
148     Celf2    CELF2
149    Arpc1b   ARPC1B
150     Gdpd5    GDPD5
151    Tom1l1   TOM1L1
152     Mfap4    MFAP4

[153 rows x 2 columns]
GSE229103     mmusculus hsapiens
0        Fbn2     FBN2
1       Greb1  

In [12]:
# Stack every table held in `signature_dict`, order the rows by the 'mmusculus'
# column, and drop exact duplicate rows before renumbering the index.
stacked = pd.concat(signature_dict.values())
ordered = stacked.sort_values('mmusculus')
df = ordered.drop_duplicates().reset_index(drop = True)

# Write the combined table at the top of the results directory.
combined_fn = os.path.join('..', 'results', 'biomart.csv')
df.to_csv(combined_fn, index = False)