# Download the ENSEMBL Mammal Genes Database

The purpose of this Jupyter Notebook is to retrieve information about the genes expressed by all mammalian species, to be used later in the 02C_Conservation part of the DEG analysis.

In [1]:
from pybiomart import Server

In [2]:
import pandas as pd
import numpy as np
# Raise pandas' 'display.max_rows' option to 4000 so long tables are shown in full.
pd.set_option('display.max_rows', 4000)

In [3]:
# Connect to the BioMart service hosted at www.ensembl.org.
server = Server(host='http://www.ensembl.org')
# Not the last expression of the cell, so this preview of the marts is not displayed.
server.list_marts().head()
# Select the 'ENSEMBL_MART_ENSEMBL' mart and preview the datasets it offers.
mart = server['ENSEMBL_MART_ENSEMBL']
mart.list_datasets().head()

Unnamed: 0,name,display_name
0,lchalumnae_gene_ensembl,Coelacanth genes (LatCha1)
1,pcatodon_gene_ensembl,Sperm whale genes (ASM283717v2)
2,smerianae_gene_ensembl,Argentine black and white tegu genes (HLtupMer3)
3,odegus_gene_ensembl,Degu genes (OctDeg1.0)
4,ccapucinus_gene_ensembl,Capuchin genes (Cebus_imitator-1.0)


In [4]:
# Pick the 'hsapiens_gene_ensembl' dataset and list every attribute it can return.
dataset = mart['hsapiens_gene_ensembl']
ds_attri = dataset.list_attributes()
# NOTE(review): rows 260..3019 are selected purely by position — presumably the
# homolog attributes of the Ensembl release used at the time; confirm the bounds
# whenever the release changes.
ds_attri_species = ds_attri.iloc[260:3020]
# Keep only the attributes whose name contains 'associated_gene_name'.
ds_attri_species_gene = ds_attri_species.loc[
    ds_attri_species['name'].str.contains('associated_gene_name')
]

In [5]:
# Names of the selected gene-name attributes, as a plain Python list.
species_list = ds_attri_species_gene['name'].tolist()

In [6]:
# Exclude the 'bmusculus_homolog_associated_gene_name' attribute from the queries.
# NOTE(review): the reason for excluding it is not recorded in this notebook — TODO document.
# Filtering (instead of list.remove) keeps the cell idempotent: re-running it, or
# running it when the attribute is absent, no longer raises ValueError.
species_list = [
    species for species in species_list
    if species != 'bmusculus_homolog_associated_gene_name'
]

In [7]:
# Preview the first rows returned when the first six attributes are queried together.
dataset.query(attributes=species_list[:6]).head()

Unnamed: 0,Abingdon island giant tortoise gene name,African ostrich gene name,Algerian mouse gene name,Alpaca gene name,Alpine marmot gene name,Amazon molly gene name
0,,,,,,
1,,MT-ND1,,ND1,,
2,,MT-ND2,,ND2,,
3,,MT-CO1,,COX1,,
4,,MT-CO2,,COX2,,


In [8]:
# Query only the first attribute; the resulting frame has a single column.
df = dataset.query(attributes = [species_list[0]])
# The column header minus the ' gene name' text is used as the species label.
df.columns[0].replace(' gene name','')

'Abingdon island giant tortoise'

In [9]:
def get_list(species, retries=3, wait=5):
    """
    Description
    -----------
    Query one attribute from the module-level `dataset` and return the values
    of the first result column as upper-cased strings.
    A failed query is retried, because a single transient BioMart error
    (e.g. a lost database connection) would otherwise abort the whole download.
    Parameters
    ----------
        species : str
            attribute name passed to dataset.query
        retries : int
            maximum number of query attempts (values below 1 are treated as 1)
        wait : int or float
            base pause in seconds; the pause grows linearly with the attempt number
    Returns
    -------
        list with one entry per result row; rows without a value stay NaN
    Raises
    ------
        BiomartException
            if the query still fails on the last attempt
    """
    import time
    from pybiomart.base import BiomartException

    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            df = dataset.query(attributes=[species])
            break
        except BiomartException:
            # Out of attempts: surface the original error to the caller.
            if attempt == attempts:
                raise
            time.sleep(wait * attempt)
    genes = df.iloc[:, 0].str.upper().to_list()
    return genes

In [10]:
def get_specie_name(species, retries=3, wait=5):
    """
    Description
    -----------
    Query one attribute from the module-level `dataset` and derive a species
    label from the header of the first result column by removing the text
    ' gene name'.
    A failed query is retried, because a single transient BioMart error
    would otherwise abort the whole run.
    Parameters
    ----------
        species : str
            attribute name passed to dataset.query
        retries : int
            maximum number of query attempts (values below 1 are treated as 1)
        wait : int or float
            base pause in seconds; the pause grows linearly with the attempt number
    Returns
    -------
        str, the column header without ' gene name'
    Raises
    ------
        BiomartException
            if the query still fails on the last attempt
    """
    import time
    from pybiomart.base import BiomartException

    # NOTE(review): the full column is downloaded although only its header is
    # used; the label may be obtainable from the attribute metadata — TODO confirm.
    attempts = max(1, retries)
    for attempt in range(1, attempts + 1):
        try:
            df = dataset.query(attributes=[species])
            break
        except BiomartException:
            # Out of attempts: surface the original error to the caller.
            if attempt == attempts:
                raise
            time.sleep(wait * attempt)
    name = df.columns[0].replace(' gene name', '')
    return name

In [11]:
# One query per attribute, in species_list order; each entry is the list returned by get_list.
gene_list = list(map(get_list, species_list))

BiomartException: Olive baboon gene name
ND1
ND2
COX1
COX2
ATP8
ATP6
COX3
ND3
ND4L
ND4
ND5
ND6
CYTB
Query ERROR: caught BioMart::Exception::Database: Could not connect to mysql database ensembl_mart_108: DBI connect('database=ensembl_mart_108;host=127.0.0.1;port=5316','ensro',...) failed: Lost connection to MySQL server at 'reading initial communication packet', system error: 2 at /nfs/public/ro/ensweb/live/mart/www_108/biomart-perl/lib/BioMart/Configuration/DBLocation.pm line 98.


In [None]:
# Species labels in the same order as species_list (and therefore as gene_list).
species_name = list(map(get_specie_name, species_list))

In [None]:
# Map each species label to its gene list (the two lists are paired by position).
gene_dict = {name: genes for name, genes in zip(species_name, gene_list)}

In [None]:
# Series indexed by species label; each value is that species' list of gene names.
gene_series = pd.Series(gene_dict)

In [None]:
# Show the first entries as a quick sanity check.
gene_series.head()

In [None]:
# Find the union of all gene names across species, skipping the NaN entries
# that get_list leaves for rows without a gene name.
# The result is sorted so the row order of the final matrix is reproducible:
# iteration order of a set of strings changes between kernel sessions.
genes_union = np.array(sorted(
    {gene for genes in gene_list for gene in genes if str(gene) != 'nan'}
))

In [None]:
def get_binary_array(union_list, small_list):
    """
    Description
    -----------
    Flag, element by element, which entries of union_list occur in small_list.
    Parameters
    ----------
        union_list : numpy array
            values to test
        small_list : numpy array
            values to look up against
    Returns
    -------
        numpy integer array shaped like union_list: 1 where the entry is
        present in small_list, 0 otherwise
    """
    membership = np.isin(union_list, small_list)
    return membership.astype(int)

In [None]:
# One 0/1 row per species: which genes of the union appear in that species' list.
binary_mat = np.array([
    get_binary_array(genes_union, np.asarray(species_genes))
    for species_genes in gene_list
])

In [None]:
# Swap the axes so that genes become rows and species become columns.
binary_mat_T = binary_mat.T

In [None]:
# Dimensions of the transposed matrix.
binary_mat_T.shape

In [None]:
# Display the transposed 0/1 matrix.
binary_mat_T

In [None]:
# Label the matrix: rows are the genes of the union, columns are the species labels.
binary_all_species = pd.DataFrame(
    data=binary_mat_T,
    index=genes_union,
    columns=species_name,
)
binary_all_species.head()

In [None]:
# Write the labelled matrix to a CSV file in the current working directory.
binary_all_species.to_csv("binary_mat_all_species.csv")