In [2]:
import scanpy as sc
import anndata
import numpy as np
import gc
import pandas as pd 
pd.set_option('display.max_columns', None)
from biomart import BiomartServer
from datetime import date
import os
gc.isenabled()

True

In [3]:
server = BiomartServer("http://www.ensembl.org/biomart")

In [4]:
server.show_datasets()

{'abrachyrhynchus_gene_ensembl': Pink-footed goose genes (ASM259213v1),
 'abrachyrhynchus_genomic_sequence': Pink-footed goose sequences (ASM259213v1),
 'acalliptera_gene_ensembl': Eastern happy genes (fAstCal1.2),
 'acalliptera_genomic_sequence': Eastern happy sequences (fAstCal1.2),
 'acarolinensis_gene_ensembl': Green anole genes (AnoCar2.0v2),
 'acarolinensis_genomic_sequence': Green anole sequences (AnoCar2.0v2),
 'acchrysaetos_gene_ensembl': Golden eagle genes (bAquChr1.2),
 'acchrysaetos_genomic_sequence': Golden eagle sequences (bAquChr1.2),
 'acitrinellus_gene_ensembl': Midas cichlid genes (Midas_v5),
 'acitrinellus_genomic_sequence': Midas cichlid sequences (Midas_v5),
 'amelanoleuca_gene_ensembl': Giant panda genes (ASM200744v2),
 'amelanoleuca_genomic_sequence': Giant panda sequences (ASM200744v2),
 'amexicanus_gene_ensembl': Mexican tetra genes (Astyanax_mexicanus-2.0),
 'amexicanus_genomic_sequence': Mexican tetra sequences (Astyanax_mexicanus-2.0),
 'anancymaae_gene_ense

## Human query

In [5]:
ensembl = server.datasets['hsapiens_gene_ensembl']

In [None]:
ensembl.show_attributes()

In [19]:
columns = ['ensembl_gene_id','external_gene_name', 'external_gene_source','entrezgene_accession','entrezgene_id']

In [10]:
'definition_1006',
'description',
'embl',
'entrezgene_accession',
'entrezgene_id',
'entrezgene_trans_name',
'external_gene_name',
'external_transcript_name',
'go_id',
'name_1006',
'protein_id',
'uniprot_gn_id',
'uniprot_gn_symbol',
'variation_name',
'wikigene_id',
'wikigene_name']

In [20]:
response = ensembl.search({'attributes' : columns,
                           'mart_instance' : 'ensembl'})

In [21]:
mapping_df = pd.read_csv(response.url, sep='\t', header=None, names=columns)

In [22]:
mapping_df

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrezgene_id
0,ENSG00000210049,MT-TF,HGNC Symbol,,
1,ENSG00000211459,MT-RNR1,HGNC Symbol,,
2,ENSG00000210077,MT-TV,HGNC Symbol,,
3,ENSG00000210082,MT-RNR2,HGNC Symbol,,
4,ENSG00000209082,MT-TL1,HGNC Symbol,,
...,...,...,...,...,...
75714,ENSG00000236500,CD24P1,HGNC Symbol,,
75715,ENSG00000197312,DDI2,HGNC Symbol,DDI2,84301.0
75716,ENSG00000215695,RSC1A1,HGNC Symbol,RSC1A1,6248.0
75717,ENSG00000271742,,,,


## Load human data

In [122]:
human = sc.read_h5ad(os.path.join(os.getcwd(),'Vignettes/frog_zebrafish_embryogenesis/data/human.h5ad'))

In [123]:
human.var

Unnamed: 0,gene,chromosome,entrez_id,gene_name,mouse_homologenes,n_cells
1,3.8-1.3,6,353008,HLA complex group 26 (non-protein coding) pseu...,,1470
9,A2M-AS1,12,144571,A2M antisense RNA 1 (head to head),,197
11,A2MP1,12,3,alpha-2-macroglobulin pseudogene 1,,1727
12,A3GALT2,1,127550,"alpha 1,3-galactosyltransferase 2",A3galt2,158
14,A4GNT,3,51146,"alpha-1,4-N-acetylglucosaminyltransferase",A4gnt,17
...,...,...,...,...,...,...
50270,ZXDA,X,7789,"zinc finger, X-linked, duplicated A",,11839
50271,ZXDB,X,158586,"zinc finger, X-linked, duplicated B",Zxdb,64061
50275,ZYG11B,1,79699,"zyg-11 family member B, cell cycle regulator",Zyg11b,35581
50276,ZYX,7,7791,zyxin,Zyx,1950


In [27]:
ncbi_ids = mapping_df[mapping_df.entrezgene_id.isin(human.var['entrez_id'])]

In [28]:
ncbi_ids

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrezgene_id
5,ENSG00000198888,MT-ND1,HGNC Symbol,ND1,4535.0
9,ENSG00000198763,MT-ND2,HGNC Symbol,ND2,4536.0
15,ENSG00000198804,MT-CO1,HGNC Symbol,COX1,4512.0
20,ENSG00000228253,MT-ATP8,HGNC Symbol,ATP8,4509.0
22,ENSG00000198938,MT-CO3,HGNC Symbol,COX3,4514.0
...,...,...,...,...,...
75704,ENSG00000142615,CELA2A,HGNC Symbol,CELA2A,63036.0
75705,ENSG00000215704,CELA2B,HGNC Symbol,CELA2B,51032.0
75706,ENSG00000132906,CASP9,HGNC Symbol,CASP9,842.0
75716,ENSG00000215695,RSC1A1,HGNC Symbol,RSC1A1,6248.0


## Remove duplicates

In [29]:
ncbi_ids[ncbi_ids.ensembl_gene_id.duplicated()]

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrezgene_id
944,ENSG00000282566,PRAMEF25,HGNC Symbol,PRAMEF26,645359.0
1039,ENSG00000114374,USP9Y,HGNC Symbol,TTTY15,64595.0
1197,ENSG00000228240,TTTY17A,HGNC Symbol,TTTY17C,474152.0
1621,ENSG00000227439,TTTY17B,HGNC Symbol,TTTY17C,474152.0
2291,ENSG00000291031,BCORP1,NCBI gene (formerly Entrezgene),LOC105377223,105377223.0
...,...,...,...,...,...
75022,ENSG00000143702,CEP170,HGNC Symbol,CEP170P1,645455.0
75188,ENSG00000290825,DDX11L2,NCBI gene (formerly Entrezgene),DDX11L16,727856.0
75189,ENSG00000290825,DDX11L2,NCBI gene (formerly Entrezgene),DDX11L1,100287102.0
75190,ENSG00000290825,DDX11L2,NCBI gene (formerly Entrezgene),DDX11L5,100287596.0


In [30]:
ncbi_ids[ncbi_ids.entrezgene_id.duplicated()]

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrezgene_id
949,ENSG00000281987,PRAMEF26,HGNC Symbol,PRAMEF26,645359.0
1621,ENSG00000227439,TTTY17B,HGNC Symbol,TTTY17C,474152.0
1671,ENSG00000238074,TSPY9,HGNC Symbol,TSPY9,728132.0
2305,ENSG00000233803,TSPY4,HGNC Symbol,TSPY4,728395.0
2318,ENSG00000225560,FAM197Y8,HGNC Symbol,FAM197Y8,105379268.0
...,...,...,...,...,...
75244,ENSG00000291217,LINC01347,NCBI gene (formerly Entrezgene),LINC01347,731275.0
75285,ENSG00000238142,,,LOC105376805,105376805.0
75376,ENSG00000160767,ENTREP3,HGNC Symbol,ENTREP3,10712.0
75479,ENSG00000290842,CROCCP3,NCBI gene (formerly Entrezgene),CROCCP3,114819.0


In [31]:
ncbi_ids.external_gene_source.unique()

array(['HGNC Symbol', nan, 'NCBI gene (formerly Entrezgene)'],
      dtype=object)

In [39]:
ncbi_ids = ncbi_ids[ncbi_ids.external_gene_source.isin(['HGNC Symbol'])]

In [40]:
ncbi_ids.ensembl_gene_id.duplicated().sum()

90

In [42]:
ncbi_ids.entrezgene_id.duplicated().sum()

2170

In [44]:
duplicates = ncbi_ids[ncbi_ids.ensembl_gene_id.isin(ncbi_ids[ncbi_ids.ensembl_gene_id.duplicated()].ensembl_gene_id)]

In [45]:
duplicates

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrezgene_id
943,ENSG00000282566,PRAMEF25,HGNC Symbol,PRAMEF25,441873.0
944,ENSG00000282566,PRAMEF25,HGNC Symbol,PRAMEF26,645359.0
1038,ENSG00000114374,USP9Y,HGNC Symbol,USP9Y,8287.0
1039,ENSG00000114374,USP9Y,HGNC Symbol,TTTY15,64595.0
1196,ENSG00000228240,TTTY17A,HGNC Symbol,TTTY17A,252949.0
...,...,...,...,...,...
74592,ENSG00000229571,PRAMEF25,HGNC Symbol,PRAMEF26,645359.0
75021,ENSG00000143702,CEP170,HGNC Symbol,CEP170,9859.0
75022,ENSG00000143702,CEP170,HGNC Symbol,CEP170P1,645455.0
75269,ENSG00000158747,NBL1,HGNC Symbol,NBL1,4681.0


In [46]:
ensembl_duplicates = {}
for index, row in duplicates.iterrows():
    if row['ensembl_gene_id'] not in ensembl_duplicates.keys():
        ensembl_duplicates[row['ensembl_gene_id']] = []
    ensembl_duplicates[row['ensembl_gene_id']].append(row.entrezgene_id)

In [47]:
len(ensembl_duplicates.keys())

90

In [48]:
ensembl_duplicates

{'ENSG00000282566': [441873.0, 645359.0],
 'ENSG00000114374': [8287.0, 64595.0],
 'ENSG00000228240': [252949.0, 474152.0],
 'ENSG00000227439': [474151.0, 474152.0],
 'ENSG00000222018': [101928147.0, 102723451.0],
 'ENSG00000159216': [861.0, 100506403.0],
 'ENSG00000205670': [54065.0, 102723553.0],
 'ENSG00000285432': [245910.0, 503614.0],
 'ENSG00000285832': [441317.0, 441327.0],
 'ENSG00000284979': [245910.0, 503614.0],
 'ENSG00000285015': [245908.0, 504180.0],
 'ENSG00000168746': [140834.0, 101927242.0],
 'ENSG00000288273': [255313.0, 728062.0],
 'ENSG00000263247': [5554.0, 11272.0],
 'ENSG00000282673': [5542.0, 653247.0],
 'ENSG00000282269': [5554.0, 11272.0],
 'ENSG00000275517': [5542.0, 653247.0],
 'ENSG00000278632': [414059.0, 101060321.0],
 'ENSG00000274166': [643707.0, 647042.0],
 'ENSG00000282773': [727909.0, 728047.0],
 'ENSG00000230417': [414243.0, 100132987.0],
 'ENSG00000226725': [101928345.0, 101928380.0],
 'ENSG00000160223': [23308.0, 102723996.0],
 'ENSG00000275153': [1

### Keep the one in the human adata object if it is

In [50]:
ids = ensembl_duplicates['ENSG00000282566']

In [52]:
ensembl_duplicates['ENSG00000282566'][1]

645359.0

In [83]:
genes_removed = []
for gene in ensembl_duplicates.keys():
    ids = ensembl_duplicates[gene]
    if ids[1] in human.var.entrez_id:
        genes_removed.append(ensembl_duplicates[gene][2])
    else :
        genes_removed.append(ensembl_duplicates[gene][1])                  

In [84]:
len(genes_removed) #Same length as the duplicates in human_ncbi_id df

90

In [85]:
genes_removed

[645359.0,
 64595.0,
 474152.0,
 474152.0,
 102723451.0,
 100506403.0,
 102723553.0,
 503614.0,
 441327.0,
 503614.0,
 504180.0,
 101927242.0,
 728062.0,
 11272.0,
 653247.0,
 11272.0,
 653247.0,
 101060321.0,
 647042.0,
 728047.0,
 100132987.0,
 101928380.0,
 102723996.0,
 102723859.0,
 101060376.0,
 101060321.0,
 100528032.0,
 645359.0,
 103157000.0,
 105373378.0,
 645455.0,
 100037417.0,
 645359.0,
 105375707.0,
 6607.0,
 728340.0,
 202658.0,
 728340.0,
 202658.0,
 6607.0,
 202658.0,
 100158262.0,
 202658.0,
 504180.0,
 441327.0,
 503614.0,
 266740.0,
 202658.0,
 202658.0,
 105371828.0,
 105374836.0,
 728062.0,
 100037417.0,
 100529261.0,
 729447.0,
 266740.0,
 93655.0,
 647042.0,
 202658.0,
 104797536.0,
 728047.0,
 677885.0,
 105376159.0,
 504180.0,
 503614.0,
 503614.0,
 441327.0,
 100874185.0,
 728724.0,
 100507257.0,
 63914.0,
 283551.0,
 728340.0,
 101060376.0,
 574406.0,
 101928283.0,
 102723728.0,
 6607.0,
 100271927.0,
 102723859.0,
 11272.0,
 101060321.0,
 653247.0,
 10192

In [86]:
ncbi_ids[~ncbi_ids.entrezgene_id.isin(genes_removed)].duplicated().any()

False

In [87]:
human_genes = ncbi_ids[~ncbi_ids.entrezgene_id.isin(genes_removed)]

In [88]:
human_genes.ensembl_gene_id.duplicated().any()

False

In [89]:
human_genes.entrezgene_id.duplicated().any()

True

In [90]:
human_genes

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrezgene_id
5,ENSG00000198888,MT-ND1,HGNC Symbol,ND1,4535.0
9,ENSG00000198763,MT-ND2,HGNC Symbol,ND2,4536.0
15,ENSG00000198804,MT-CO1,HGNC Symbol,COX1,4512.0
20,ENSG00000228253,MT-ATP8,HGNC Symbol,ATP8,4509.0
22,ENSG00000198938,MT-CO3,HGNC Symbol,COX3,4514.0
...,...,...,...,...,...
75704,ENSG00000142615,CELA2A,HGNC Symbol,CELA2A,63036.0
75705,ENSG00000215704,CELA2B,HGNC Symbol,CELA2B,51032.0
75706,ENSG00000132906,CASP9,HGNC Symbol,CASP9,842.0
75716,ENSG00000215695,RSC1A1,HGNC Symbol,RSC1A1,6248.0


## Check entrez_id doublets

In [91]:
human_genes[human_genes.entrezgene_id.duplicated()]

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrezgene_id
1671,ENSG00000238074,TSPY9,HGNC Symbol,TSPY9,728132.0
2305,ENSG00000233803,TSPY4,HGNC Symbol,TSPY4,728395.0
2318,ENSG00000225560,FAM197Y8,HGNC Symbol,FAM197Y8,105379268.0
2319,ENSG00000228383,FAM197Y7,HGNC Symbol,FAM197Y7,105379269.0
2326,ENSG00000229549,TSPY8,HGNC Symbol,TSPY8,728403.0
...,...,...,...,...,...
74969,ENSG00000143554,SLC27A3,HGNC Symbol,SLC27A3,11000.0
75021,ENSG00000143702,CEP170,HGNC Symbol,CEP170,9859.0
75194,ENSG00000080947,CROCCP3,HGNC Symbol,CROCCP3,114819.0
75376,ENSG00000160767,ENTREP3,HGNC Symbol,ENTREP3,10712.0


In [92]:
entrez_dup = human_genes[human_genes.entrezgene_id.isin(human_genes[human_genes.entrezgene_id.duplicated()].entrezgene_id)]

In [93]:
entrez_dup

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrezgene_id
934,ENSG00000282584,PRAMEF4,HGNC Symbol,PRAMEF4,400735.0
935,ENSG00000282663,PRAMEF10,HGNC Symbol,PRAMEF10,343071.0
936,ENSG00000279195,PRAMEF7,HGNC Symbol,PRAMEF7,441871.0
939,ENSG00000282119,PRAMEF6,HGNC Symbol,PRAMEF6,440561.0
941,ENSG00000282437,PRAMEF27,HGNC Symbol,PRAMEF27,101929983.0
...,...,...,...,...,...
74969,ENSG00000143554,SLC27A3,HGNC Symbol,SLC27A3,11000.0
75021,ENSG00000143702,CEP170,HGNC Symbol,CEP170,9859.0
75194,ENSG00000080947,CROCCP3,HGNC Symbol,CROCCP3,114819.0
75376,ENSG00000160767,ENTREP3,HGNC Symbol,ENTREP3,10712.0


In [94]:
entrezid = {}
for index, row in entrez_dup.iterrows():
    if row['entrezgene_id'] not in entrezid.keys():
        entrezid[row['entrezgene_id']] = []
    if row.ensembl_gene_id not in entrezid[row['entrezgene_id']]:
        entrezid[row['entrezgene_id']].append(row.ensembl_gene_id)

In [95]:
len(entrezid.keys())

1077

### Keep only the first

In [96]:
entrezgene_removed = []
for entrez in entrezid.keys():
    for gene in range(1,len(entrezid[entrez])):
        entrezgene_removed.append(entrezid[entrez][gene])

In [98]:
len(entrezgene_removed) #Same length as the duplicates in human_ncbi_id df

2117

In [106]:
human_genes[~human_genes.ensembl_gene_id.isin(entrezgene_removed)].entrezgene_id.duplicated().any()

False

In [156]:
final_df = human_genes[~human_genes.ensembl_gene_id.isin(entrezgene_removed)]

In [157]:
final_df.ensembl_gene_id.duplicated().any()

False

In [158]:
final_df.entrezgene_id.duplicated().any()

False

In [159]:
final_df

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrezgene_id
5,ENSG00000198888,MT-ND1,HGNC Symbol,ND1,4535.0
9,ENSG00000198763,MT-ND2,HGNC Symbol,ND2,4536.0
15,ENSG00000198804,MT-CO1,HGNC Symbol,COX1,4512.0
20,ENSG00000228253,MT-ATP8,HGNC Symbol,ATP8,4509.0
22,ENSG00000198938,MT-CO3,HGNC Symbol,COX3,4514.0
...,...,...,...,...,...
75704,ENSG00000142615,CELA2A,HGNC Symbol,CELA2A,63036.0
75705,ENSG00000215704,CELA2B,HGNC Symbol,CELA2B,51032.0
75706,ENSG00000132906,CASP9,HGNC Symbol,CASP9,842.0
75716,ENSG00000215695,RSC1A1,HGNC Symbol,RSC1A1,6248.0


## Create new var df

In [160]:
human

AnnData object with n_obs × n_vars = 76533 × 31551
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type', 'species', 'n_genes', 'cell_type'
    var: 'gene', 'chromosome', 'entrez_id

In [162]:
human[:,human.var.entrez_id.isin(final_df.entrezgene_id)]

View of AnnData object with n_obs × n_vars = 76533 × 15309
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type', 'species', 'n_genes', 'cell_type'
    var: 'gene', 'chromosome', 'e

In [163]:
human_reduce = human[:,human.var.entrez_id.isin(final_df.entrezgene_id)]

In [164]:
human_reduce.var = human_reduce.var.set_index('entrez_id')
human_reduce.var['entrez_id'] = human_reduce.var.index

AnnData expects .var.index to contain strings, but got values like:
    [144571, 127550, 51146, 729522, 13]

    Inferred to be: integer

  value_idx = self._prep_dim_index(value.index, attr)


In [165]:
human_reduce.var

Unnamed: 0_level_0,gene,chromosome,gene_name,mouse_homologenes,n_cells,entrez_id
entrez_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
144571,A2M-AS1,12,A2M antisense RNA 1 (head to head),,197,144571
127550,A3GALT2,1,"alpha 1,3-galactosyltransferase 2",A3galt2,158,127550
51146,A4GNT,3,"alpha-1,4-N-acetylglucosaminyltransferase",A4gnt,17,51146
729522,AACSP1,5,acetoacetyl-CoA synthetase pseudogene 1,,10489,729522
13,AADAC,3,arylacetamide deacetylase,Aadac,545,13
...,...,...,...,...,...,...
7789,ZXDA,X,"zinc finger, X-linked, duplicated A",,11839,7789
158586,ZXDB,X,"zinc finger, X-linked, duplicated B",Zxdb,64061,158586
79699,ZYG11B,1,"zyg-11 family member B, cell cycle regulator",Zyg11b,35581,79699
7791,ZYX,7,zyxin,Zyx,1950,7791


In [166]:
final_df = final_df.set_index('entrezgene_id')
final_df['entrez_id'] = final_df.index

In [167]:
final_df.index

Index([  4535.0,   4536.0,   4512.0,   4509.0,   4514.0,   4539.0,   4538.0,
         4540.0,   4541.0, 400735.0,
       ...
         5362.0,   6262.0,   4595.0,   1441.0,   4610.0,  63036.0,  51032.0,
          842.0,   6248.0,  23207.0],
      dtype='float64', name='entrezgene_id', length=15309)

In [178]:
new_df = pd.concat([human_reduce.var, final_df], axis=1)

In [179]:
new_df

Unnamed: 0,gene,chromosome,gene_name,mouse_homologenes,n_cells,entrez_id,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrez_id.1
144571.0,A2M-AS1,12,A2M antisense RNA 1 (head to head),,197,144571,ENSG00000245105,A2M-AS1,HGNC Symbol,A2M-AS1,144571.0
127550.0,A3GALT2,1,"alpha 1,3-galactosyltransferase 2",A3galt2,158,127550,ENSG00000184389,A3GALT2,HGNC Symbol,A3GALT2,127550.0
51146.0,A4GNT,3,"alpha-1,4-N-acetylglucosaminyltransferase",A4gnt,17,51146,ENSG00000118017,A4GNT,HGNC Symbol,A4GNT,51146.0
729522.0,AACSP1,5,acetoacetyl-CoA synthetase pseudogene 1,,10489,729522,ENSG00000291019,AACSP1,HGNC Symbol,AACSP1,729522.0
13.0,AADAC,3,arylacetamide deacetylase,Aadac,545,13,ENSG00000114771,AADAC,HGNC Symbol,AADAC,13.0
...,...,...,...,...,...,...,...,...,...,...,...
7789.0,ZXDA,X,"zinc finger, X-linked, duplicated A",,11839,7789,ENSG00000198205,ZXDA,HGNC Symbol,ZXDA,7789.0
158586.0,ZXDB,X,"zinc finger, X-linked, duplicated B",Zxdb,64061,158586,ENSG00000198455,ZXDB,HGNC Symbol,ZXDB,158586.0
79699.0,ZYG11B,1,"zyg-11 family member B, cell cycle regulator",Zyg11b,35581,79699,ENSG00000162378,ZYG11B,HGNC Symbol,ZYG11B,79699.0
7791.0,ZYX,7,zyxin,Zyx,1950,7791,ENSG00000285443,ZYX,HGNC Symbol,ZYX,7791.0


In [180]:
new_df.reindex(human_reduce.var.index)

Unnamed: 0_level_0,gene,chromosome,gene_name,mouse_homologenes,n_cells,entrez_id,ensembl_gene_id,external_gene_name,external_gene_source,entrezgene_accession,entrez_id
entrez_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
144571,A2M-AS1,12,A2M antisense RNA 1 (head to head),,197,144571,ENSG00000245105,A2M-AS1,HGNC Symbol,A2M-AS1,144571.0
127550,A3GALT2,1,"alpha 1,3-galactosyltransferase 2",A3galt2,158,127550,ENSG00000184389,A3GALT2,HGNC Symbol,A3GALT2,127550.0
51146,A4GNT,3,"alpha-1,4-N-acetylglucosaminyltransferase",A4gnt,17,51146,ENSG00000118017,A4GNT,HGNC Symbol,A4GNT,51146.0
729522,AACSP1,5,acetoacetyl-CoA synthetase pseudogene 1,,10489,729522,ENSG00000291019,AACSP1,HGNC Symbol,AACSP1,729522.0
13,AADAC,3,arylacetamide deacetylase,Aadac,545,13,ENSG00000114771,AADAC,HGNC Symbol,AADAC,13.0
...,...,...,...,...,...,...,...,...,...,...,...
7789,ZXDA,X,"zinc finger, X-linked, duplicated A",,11839,7789,ENSG00000198205,ZXDA,HGNC Symbol,ZXDA,7789.0
158586,ZXDB,X,"zinc finger, X-linked, duplicated B",Zxdb,64061,158586,ENSG00000198455,ZXDB,HGNC Symbol,ZXDB,158586.0
79699,ZYG11B,1,"zyg-11 family member B, cell cycle regulator",Zyg11b,35581,79699,ENSG00000162378,ZYG11B,HGNC Symbol,ZYG11B,79699.0
7791,ZYX,7,zyxin,Zyx,1950,7791,ENSG00000285443,ZYX,HGNC Symbol,ZYX,7791.0


In [182]:
sum(human_reduce.var.index == new_df.index)

15309

In [183]:
human_reduce.var = new_df

AnnData expects .var.index to contain strings, but got values like:
    [144571.0, 127550.0, 51146.0, 729522.0, 13.0]

    Inferred to be: floating

  value_idx = self._prep_dim_index(value.index, attr)


# Fly query

In [6]:
fly_query = server.datasets['dmelanogaster_gene_ensembl']

In [None]:
fly_query.show_attributes()

In [8]:
columns = ['ensembl_gene_id','external_gene_name', 'external_gene_source']

In [11]:
response = fly_query.search({'attributes' : columns,
                           'mart_instance' : 'ensembl'})

In [12]:
mapping_fly = pd.read_csv(response.url, sep='\t', header=None, names=columns)

In [13]:
mapping_fly

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source
0,FBgn0029843,Nep1,FlyBase gene name
1,FBgn0266278,lncRNA:CR44963,FlyBase gene name
2,FBgn0026238,gus,FlyBase gene name
3,FBgn0035235,CG7879,FlyBase gene name
4,FBti0018960,flea{}771,Ensembl Metazoa
...,...,...,...
24273,FBgn0038358,Ttc26,FlyBase gene name
24274,FBtr0445469_df_nrg,,
24275,FBgn0030989,Inx5,FlyBase gene name
24276,FBgn0259851,Su(Ste):CR42420,FlyBase gene name


In [16]:
mapping_fly[mapping_fly.ensembl_gene_id == 'FBgn0267508']

Unnamed: 0,ensembl_gene_id,external_gene_name,external_gene_source
3500,FBgn0267508,28SrRNA-Psi:CR45848,FlyBase gene name
