In [1]:
from pybiomart import Server

In [3]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 4000)

In [4]:
server = Server(host='http://www.ensembl.org')
server.list_marts().head()
mart = server['ENSEMBL_MART_ENSEMBL']
mart.list_datasets().head()

Unnamed: 0,name,display_name
0,abrachyrhynchus_gene_ensembl,Pink-footed goose genes (ASM259213v1)
1,pkingsleyae_gene_ensembl,Paramormyrops kingsleyae genes (PKINGS_0.1)
2,nscutatus_gene_ensembl,Mainland tiger snake genes (TS10Xv2-PRI)
3,clumpus_gene_ensembl,Lumpfish genes (fCycLum1.pri)
4,llaticaudata_gene_ensembl,Blue-ringed sea krait genes (latLat_1.0)


In [5]:
dataset = mart['hsapiens_gene_ensembl']
ds_attri = dataset.list_attributes()
ds_attri_species = ds_attri[260:3020]
ds_attri_species_gene = ds_attri_species[ds_attri_species['name'].str.contains('associated_gene_name')]

In [6]:
species_list = ds_attri_species_gene.name.to_list()

In [7]:
species_list.remove('bmusculus_homolog_associated_gene_name')

In [8]:
dataset.query(attributes = species_list[0:6]).head()

Unnamed: 0,Abingdon island giant tortoise gene name,African ostrich gene name,Algerian mouse gene name,Alpaca gene name,Alpine marmot gene name,Amazon molly gene name
0,,,,,,
1,,MT-ND1,,ND1,,
2,,MT-ND2,,ND2,,
3,,MT-CO1,,COX1,,
4,,MT-CO2,,COX2,,


In [9]:
df = dataset.query(attributes = [species_list[0]])
df.columns[0].replace(' gene name','')

'Abingdon island giant tortoise'

In [10]:
def get_list(species):
    df = dataset.query(attributes = [species])
    genes = df.iloc[:,0].str.upper().to_list()
    return genes

In [11]:
def get_specie_name(species):
    df = dataset.query(attributes = [species])
    name = df.columns[0].replace(' gene name','')
    return name

In [13]:
gene_list = [get_list(species) for species in species_list]

BiomartException: African ostrich gene name
MT-ND1
MT-ND2
MT-CO1
MT-CO2
MT-ATP6
MT-CO3
MT-ND3
MT-ND4
MT-ND5
MT-ND6
MT-CYB
Query ERROR: caught BioMart::Exception::Database: Could not connect to mysql database ensembl_mart_108: DBI connect('database=ensembl_mart_108;host=127.0.0.1;port=5316','ensro',...) failed: Lost connection to MySQL server at 'reading initial communication packet', system error: 2 at /nfs/public/ro/ensweb/live/mart/www_108/biomart-perl/lib/BioMart/Configuration/DBLocation.pm line 98.


In [14]:
species_name = [get_specie_name(species) for species in species_list]

BiomartException: Channel catfish gene name
ND1
ND2
COX1
COX2
ATP6
COX3
ND3
ND4
ND5
ND6
CYTB
prodhb
dgcr6
tekt4
Query ERROR: caught BioMart::Exception::Database: Could not connect to mysql database ensembl_mart_108: DBI connect('database=ensembl_mart_108;host=127.0.0.1;port=5316','ensro',...) failed: Lost connection to MySQL server at 'reading initial communication packet', system error: 2 at /nfs/public/ro/ensweb/live/mart/www_108/biomart-perl/lib/BioMart/Configuration/DBLocation.pm line 98.


In [None]:
gene_dict = dict(zip(species_name, gene_list))

In [None]:
gene_series = pd.Series(gene_dict)

In [None]:
gene_series.head()

In [None]:
# Find union of all genes
genes_union = list(set().union(*gene_list))
genes_union = np.array([x for x in genes_union if str(x) != 'nan'])

In [None]:
def get_binary_array(union_list, small_list ):
    """
    Description
    -----------
    Get binary numpy array of union_list by check if union_list is in small_list
    Parameters
    ----------
        union_list : numpy array
        small_list : numpy array
    Returns
    -------
        retrun binary numpy array
    """
    binary_array = np.isin(union_list, small_list).astype(int)
    return binary_array

In [None]:
binary_mat = np.array([get_binary_array(genes_union, np.array(gene_small_list)) for gene_small_list in gene_list])

In [None]:
binary_mat_T = np.transpose(binary_mat)

In [None]:
binary_mat_T.shape

In [None]:
binary_mat_T

In [None]:
binary_all_species = pd.DataFrame(binary_mat_T, columns=species_name, index=genes_union)
binary_all_species.head()