In [1]:
import os, glob
import numpy as np
import pandas as pd
from pybiomart import Server
# Connection settings for the Ensembl BioMart service queried in this notebook.
ENSEMBL_HOST = 'http://www.ensembl.org'
ENSEMBL_MART_NAME = 'ENSEMBL_MART_ENSEMBL'

# `server` is the BioMart host handle; `mart` is the mart whose datasets
# are looked up and queried by the cells below.
server = Server(host = ENSEMBL_HOST)
mart = server.marts[ENSEMBL_MART_NAME]

In [2]:
# BioMart dataset name to look up in `mart.datasets` for each species.
dataset_names = {
    'mmusculus' : 'mmusculus_gene_ensembl',
    'hsapiens' : 'hsapiens_gene_ensembl'
    }

# For each species: BioMart attribute to query -> column label used in the
# resulting table (the species' own gene name first, then the homolog's name).
attr_dict = {
    'mmusculus' : {
        'external_gene_name'                    : 'mmusculus',
        'hsapiens_homolog_associated_gene_name' : 'hsapiens'
        },
    'hsapiens' : {
        'external_gene_name'                     : 'hsapiens',
        'mmusculus_homolog_associated_gene_name' : 'mmusculus'
        }
    }

# species -> DataFrame with one row per distinct (gene name, homolog name) pair.
# The dict is rebuilt from `dataset_names` on every run, so this cell can be
# re-executed safely: the dataset names are never overwritten by query results.
mart_dict = {}
for species, dataset_name in dataset_names.items():
    homologs = mart.datasets[dataset_name].query(attributes = list(attr_dict[species].keys()))
    # Drop rows with a missing value in either column, then exact duplicate rows.
    homologs = homologs.dropna().drop_duplicates()
    # Relabel the columns with the species keys, in the same order as the queried attributes.
    homologs.columns = list(attr_dict[species].values())
    mart_dict[species] = homologs
    print(species, homologs)

mmusculus       mmusculus hsapiens
5        mt-Nd1   MT-ND1
9        mt-Nd2   MT-ND2
15       mt-Co1   MT-CO1
18       mt-Co2   MT-CO2
20      mt-Atp8  MT-ATP8
...         ...      ...
60521      Adnp     ADNP
60523      Dpm1     DPM1
60524     Mocs3    MOCS3
60526     Kcng1    KCNG1
60531     Or1q1    OR1Q1

[24419 rows x 2 columns]
hsapiens       hsapiens mmusculus
5       MT-ND1    mt-Nd1
9       MT-ND2    mt-Nd2
15      MT-CO1    mt-Co1
18      MT-CO2    mt-Co2
20     MT-ATP8   mt-Atp8
...        ...       ...
48217       U6   Gm22845
48221     AGRN      Agrn
48222    ASTN1     Astn1
48223   MIR488    Mir488
48227    CLCN6     Clcn6

[24419 rows x 2 columns]


In [6]:
# Dataset summary table; its first column is the index and its 'species' column
# gives the key into `mart_dict`. The index values are looked up below by the
# name of each subdirectory of results/.
dset_summary = pd.read_csv('../data/summary.csv', index_col = 0)
species_dict = dset_summary['species'].to_dict()

# Signature files sit exactly one directory below results/; the biomart/ output
# written below is one level deeper, so this pattern does not pick it up again.
signature_fn = sorted(glob.glob('results/*/*.txt'))
for fn in signature_fn:
    # ndmin = 1 keeps a file with a single gene as a 1-element array; without it
    # np.loadtxt returns a 0-d array, which Series.isin rejects as non list-like.
    signature = np.loadtxt(fn, dtype = str, ndmin = 1)
    # Split with os.path instead of '/' so the separator returned by glob does not matter.
    dset_dir, fn_base = os.path.split(fn)
    species = species_dict[os.path.basename(dset_dir)]
    # Keep only the homolog rows whose own-species gene name is in the signature.
    homologs = mart_dict[species]
    df = homologs[homologs[species].isin(signature)].reset_index(drop = True)
    # Write the table to <dataset dir>/biomart/<same file name as the signature>.
    bmart_dir_fn = os.path.join(dset_dir, 'biomart')
    os.makedirs(bmart_dir_fn, exist_ok = True)
    # Path built outside an f-string: reusing the same quote character inside an
    # f-string expression is a SyntaxError on Python versions before 3.12.
    df.to_csv(os.path.join(bmart_dir_fn, fn_base), index = False)
    print(fn, df)

results/GSE136689/early.txt    mmusculus hsapiens
0       Dab2     DAB2
1      Map1b    MAP1B
2       Irx1     IRX1
3      Rbm24    RBM24
4      Sox11    SOX11
..       ...      ...
65     Nop58    NOP58
66     Aldoa    ALDOA
67       Mmd      MMD
68   Slc16a3  SLC16A3
69    Mpped2   MPPED2

[70 rows x 2 columns]
results/GSE136689/late.txt    mmusculus hsapiens
0        Id2      ID2
1     Homer2   HOMER2
2       Manf     MANF
3       Bex2     BEX1
4       Bex2     BEX2
5       Bex1     BEX1
6       Bex1     BEX2
7      Nr2f2    NR2F2
8       Wnt2     WNT2
9       Alx1     ALX1
10     Csrp2    CSRP2
11     Sfrp1    SFRP1
12      Krt8     KRT8
13     Krt18    KRT18
14    Phlda1   PHLDA1
15      Osr1     OSR1
16      Mycn     MYCN
17      Rbp1     RBP1
18     Gata6    GATA6
19     Sfrp5    SFRP5
20     Rras2    RRAS2
21     Pmp22    PMP22
22     Cox17    COX17
23    Popdc2   POPDC2
24       Ddt     DDTL
25       Ddt      DDT
26   Smarcd3  SMARCD3
27      Ly6e     LY6E
28      Nrp1     NRP