# Import packages and define paths

In [467]:
import scanpy as sc
import anndata
import numpy as np
import gc
import pandas as pd 
import os
from datetime import date
pd.set_option('display.max_columns', None)
from biomart import BiomartServer
gc.isenabled()

True

In [41]:
os.path.realpath('one2many_human_mouse.ipynb')

'/nfs/research/irene/anaelle/Scripts/one2many_human_mouse.ipynb'

In [None]:
path_project = '/nfs/research/irene/anaelle'
path_scripts = os.path.join(path_project, 'Scripts','human_mouse_scripts')
print(path_scripts)
path_data = os.path.join(path_project, 'data')
print(path_data)

# Load Biomart output

In [9]:
human_mouse_homolog_genes = pd.read_csv(os.path.join(path_data, 'homolog_human_mouse_ensembl_gene.csv'))

In [11]:
human_mouse_homolog_genes = human_mouse_homolog_genes.drop('Unnamed: 0', axis=1)

In [12]:
human_mouse_homolog_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
0,ENSG00000198888,4535.0,MT-ND1,ENSMUSG00000064341,ortholog_one2one,mt-Nd1
1,ENSG00000198763,4536.0,MT-ND2,ENSMUSG00000064345,ortholog_one2one,mt-Nd2
2,ENSG00000198804,4512.0,MT-CO1,ENSMUSG00000064351,ortholog_one2one,mt-Co1
3,ENSG00000198712,4513.0,MT-CO2,ENSMUSG00000064354,ortholog_one2one,mt-Co2
4,ENSG00000228253,4509.0,MT-ATP8,ENSMUSG00000064356,ortholog_one2one,mt-Atp8
...,...,...,...,...,...,...
21936,ENSG00000187017,83715.0,ESPN,ENSMUSG00000028943,ortholog_one2one,Espn
21937,ENSG00000198216,777.0,CACNA1E,ENSMUSG00000004110,ortholog_one2one,Cacna1e
21938,ENSG00000179930,127665.0,ZNF648,ENSMUSG00000066797,ortholog_one2one,Zfp648
21939,ENSG00000162437,55225.0,RAVER2,ENSMUSG00000035275,ortholog_one2one,Raver2


# Import human and mouse data

In [13]:
mouse_data = sc.read_h5ad(os.path.join(path_data,'mouse/mouse_raw_counts_from_cellxgene.h5ad'))

In [14]:
mouse_data

AnnData object with n_obs × n_vars = 159738 × 30639
    obs: 'Unnamed: 0', 'nUMI', 'nGene', 'QC', 'cluster', 'Allen.cluster_id', 'Allen.cluster_label', 'Allen.class_label', 'Allen.subclass_label', 'comb.QC', 'row', 'BICCN_cluster_id', 'BICCN_cluster_label', 'BICCN_class_label', 'BICCN_subclass_label', 'size', 'gene.counts', 'umi.counts', 'Broad.QC.doublet', 'Broad.QC.Mito', 'Broad.passQC', 'MALE', 'Comb.QC', 'cl', 'temp_class_label', 'BICCN_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'

In [15]:
human_data = sc.read_h5ad(os.path.join(path_data, 'human/hdata2023-06-05.h5ad'))



In [16]:
human_data

AnnData object with n_obs × n_vars = 76533 × 50281
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type'
    var: 'gene', 'chromosome', 'entrez_id', 'gene_name', 'mouse_homologenes'

# Get only one2many label

In [18]:
human_mouse_homolog_genes.orthology_type.unique()

array(['ortholog_one2one', 'ortholog_one2many', 'ortholog_many2many'],
      dtype=object)

In [19]:
human_mouse_homolog_genes = human_mouse_homolog_genes.loc[human_mouse_homolog_genes.orthology_type == 'ortholog_one2many']

In [20]:
human_mouse_homolog_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
13,ENSG00000277196,5625.0,PRODH,ENSMUSG00000003526,ortholog_one2many,Prodh
15,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000057561,ortholog_one2many,Eif1a
16,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000113201,ortholog_one2many,Eif1ad13
17,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079031,ortholog_one2many,Eif1ad2
18,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079029,ortholog_one2many,Eif1ad7
...,...,...,...,...,...,...
21930,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062878,ortholog_one2many,Or14a257
21931,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055610,ortholog_one2many,Or14a260
21932,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062042,ortholog_one2many,Or14a256
21933,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055571,ortholog_one2many,Or14a259


## Check for duplicates

In [24]:
human_mouse_homolog_genes.duplicated().any()

False

In [25]:
human_mouse_homolog_genes.human_ensembl_gene_id.duplicated().any()

True

In [26]:
human_mouse_homolog_genes.human_entrezgene_id.duplicated().any()

True

In [27]:
human_mouse_homolog_genes.mouse_homolog_ensembl_gene.duplicated().any()

True

### 1 entrez id ~ many ensembl

In [29]:
doublets = {}
for index, row in human_mouse_homolog_genes.iterrows():
    if row['human_entrezgene_id'] not in doublets.keys():
        doublets[row['human_entrezgene_id']] = []
    if row.human_ensembl_gene_id not in doublets[row['human_entrezgene_id']]:
        doublets[row['human_entrezgene_id']].append(row.human_ensembl_gene_id)

In [33]:
len(doublets)

826

In [31]:
real_duplicates = {}
for gene in doublets.keys():
    if len(doublets[gene])>1:
        real_duplicates[gene] = doublets[gene]

In [32]:
real_duplicates

{5625.0: ['ENSG00000277196', 'ENSG00000100033'],
 6606.0: ['ENSG00000205571', 'ENSG00000172062'],
 29057.0: ['ENSG00000268350', 'ENSG00000179304'],
 1159.0: ['ENSG00000237289', 'ENSG00000223572'],
 552900.0: ['ENSG00000169627', 'ENSG00000183336'],
 79008.0: ['ENSG00000132207', 'ENSG00000181625'],
 3963.0: ['ENSG00000178934', 'ENSG00000205076'],
 55894.0: ['ENSG00000177243', 'ENSG00000176797'],
 245910.0: ['ENSG00000198129', 'ENSG00000186572'],
 10156.0: ['ENSG00000105808', 'ENSG00000170667'],
 51326.0: ['ENSG00000185829', 'ENSG00000228696']}

In [34]:
to_be_removed = []
for gene in real_duplicates.keys():
    to_be_removed.append(real_duplicates[gene][1])

In [35]:
to_be_removed

['ENSG00000100033',
 'ENSG00000172062',
 'ENSG00000179304',
 'ENSG00000223572',
 'ENSG00000183336',
 'ENSG00000181625',
 'ENSG00000205076',
 'ENSG00000176797',
 'ENSG00000186572',
 'ENSG00000170667',
 'ENSG00000228696']

In [37]:
human_mouse_homolog_genes[~human_mouse_homolog_genes.human_ensembl_gene_id.isin(to_be_removed)]

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
13,ENSG00000277196,5625.0,PRODH,ENSMUSG00000003526,ortholog_one2many,Prodh
15,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000057561,ortholog_one2many,Eif1a
16,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000113201,ortholog_one2many,Eif1ad13
17,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079031,ortholog_one2many,Eif1ad2
18,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079029,ortholog_one2many,Eif1ad7
...,...,...,...,...,...,...
21930,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062878,ortholog_one2many,Or14a257
21931,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055610,ortholog_one2many,Or14a260
21932,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062042,ortholog_one2many,Or14a256
21933,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055571,ortholog_one2many,Or14a259


In [38]:
human_mouse_homolog_genes = human_mouse_homolog_genes[~human_mouse_homolog_genes.human_ensembl_gene_id.isin(to_be_removed)]

In [39]:
human_mouse_homolog_genes.duplicated().any()

False

### 1 ensembl ~ many entrez id

In [42]:
doublets = {}
for index, row in human_mouse_homolog_genes.iterrows():
    if row['human_ensembl_gene_id'] not in doublets.keys():
        doublets[row['human_ensembl_gene_id']] = []
    if row.human_entrezgene_id not in doublets[row['human_ensembl_gene_id']]:
        doublets[row['human_ensembl_gene_id']].append(row.human_entrezgene_id)

In [43]:
len(doublets)

826

In [44]:
real_duplicates = {}
for gene in doublets.keys():
    if len(doublets[gene])>1:
        real_duplicates[gene] = doublets[gene]

In [45]:
real_duplicates

{}

## Get one2many

In [47]:
human_mouse_homolog_genes.human_ensembl_gene_id.duplicated().sum()

990

In [48]:
human_mouse_homolog_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
13,ENSG00000277196,5625.0,PRODH,ENSMUSG00000003526,ortholog_one2many,Prodh
15,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000057561,ortholog_one2many,Eif1a
16,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000113201,ortholog_one2many,Eif1ad13
17,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079031,ortholog_one2many,Eif1ad2
18,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079029,ortholog_one2many,Eif1ad7
...,...,...,...,...,...,...
21930,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062878,ortholog_one2many,Or14a257
21931,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055610,ortholog_one2many,Or14a260
21932,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062042,ortholog_one2many,Or14a256
21933,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055571,ortholog_one2many,Or14a259


In [56]:
human_mouse_homolog_genes.loc[human_mouse_homolog_genes.human_ensembl_gene_id == 'ENSG00000198692'].shape[0]

17

In [60]:
one2many_ids = [gene for gene in human_mouse_homolog_genes.human_ensembl_gene_id if (human_mouse_homolog_genes.loc[human_mouse_homolog_genes.human_ensembl_gene_id == gene].shape[0] > 1)]

In [61]:
len(one2many_ids)

1346

In [62]:
one2many_genes = human_mouse_homolog_genes.loc[human_mouse_homolog_genes.human_ensembl_gene_id.isin(one2many_ids)]

In [63]:
one2many_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
15,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000057561,ortholog_one2many,Eif1a
16,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000113201,ortholog_one2many,Eif1ad13
17,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079031,ortholog_one2many,Eif1ad2
18,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079029,ortholog_one2many,Eif1ad7
19,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000093847,ortholog_one2many,Eif1ad15
...,...,...,...,...,...,...
21930,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062878,ortholog_one2many,Or14a257
21931,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055610,ortholog_one2many,Or14a260
21932,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062042,ortholog_one2many,Or14a256
21933,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055571,ortholog_one2many,Or14a259


In [67]:
one2many_genes.mouse_homolog_ensembl_gene.duplicated().any()

False

# Create new anndata objects

## Human

In [95]:
homolog_human = human_data[:, human_data.var.entrez_id.isin(human_mouse_homolog_genes['human_entrezgene_id'])]

In [96]:
homolog_human

View of AnnData object with n_obs × n_vars = 76533 × 826
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type'
    var: 'gene', 'chromosome', 'entrez_id', 'gene_name', 'mouse_homolo

## Mouse

In [97]:
homolog_mouse = mouse_data[:, mouse_data.var_names.isin(human_mouse_homolog_genes['mouse_homolog_ensembl_gene'])]

In [98]:
homolog_mouse

View of AnnData object with n_obs × n_vars = 159738 × 1558
    obs: 'Unnamed: 0', 'nUMI', 'nGene', 'QC', 'cluster', 'Allen.cluster_id', 'Allen.cluster_label', 'Allen.class_label', 'Allen.subclass_label', 'comb.QC', 'row', 'BICCN_cluster_id', 'BICCN_cluster_label', 'BICCN_class_label', 'BICCN_subclass_label', 'size', 'gene.counts', 'umi.counts', 'Broad.QC.doublet', 'Broad.QC.Mito', 'Broad.passQC', 'MALE', 'Comb.QC', 'cl', 'temp_class_label', 'BICCN_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'

## Format obs df

In [405]:
homolog_human.obs['homolog_class_label'] = homolog_human.obs['class_label']
homolog_human.obs['homolog_subclass_label'] = homolog_human.obs['subclass_label']

  homolog_human.obs['homolog_class_label'] = homolog_human.obs['class_label']


In [406]:
homolog_human.obs

Unnamed: 0,sample_name,exp_component_name,cluster_label,cluster_color,cluster_order,class_label,class_color,class_order,subclass_label,subclass_color,subclass_order,donor_sex_label,donor_sex_color,donor_sex_order,region_label,region_color,region_order,cortical_layer_label,cortical_layer_color,cortical_layer_order,cell_type_accession_label,cell_type_accession_color,cell_type_accession_order,cell_type_alias_label,cell_type_alias_color,cell_type_alias_order,cell_type_alt_alias_label,cell_type_alt_alias_color,cell_type_alt_alias_order,cell_type_designation_label,cell_type_designation_color,cell_type_designation_order,external_donor_name_label,external_donor_name_color,external_donor_name_order,specimen_type,full_genotype_label,outlier_call,outlier_type,homolog_class_label,homolog_subclass_label
0,AAACCCAAGGATTTCC-LKTX_190129_01_A01,AAACCCAAGGATTTCC-21L8TX_180927_001_A01,Inh L1-2 SST CCNJL,#fb8d00,50,GABAergic,#FF7373,1,Sst,#FF9900,5,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131050,#fb8d00,50,Inh L1-2 SST CCNJL,#fb8d00,50,,#fb8d00,50,Neuron 50,#fb8d00,50,H18.30.001,#FF7373,1,nucleus,,False,,GABAergic,Sst
1,AAACCCAAGTATGGCG-LKTX_190129_01_A01,AAACCCAAGTATGGCG-21L8TX_180927_001_A01,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116,Glutamatergic,#3DCC3D,2,L5/6 NP,#3E9E64,12,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131116,#2c815f,116,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116,,#2c815f,116,Neuron 116,#2c815f,116,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L5/6 NP
2,AAACCCACAAAGTGTA-LKTX_190129_01_A01,AAACCCACAAAGTGTA-21L8TX_180927_001_A01,Exc L3-5 RORB LINC01202,#547d7a,87,Glutamatergic,#3DCC3D,2,L5 IT,#50B2AD,8,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131087,#547d7a,87,Exc L3-5 RORB LINC01202,#547d7a,87,,#547d7a,87,Neuron 87,#547d7a,87,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L5 IT
3,AAACCCACACTACTTT-LKTX_190129_01_A01,AAACCCACACTACTTT-21L8TX_180927_001_A01,Exc L2 LINC00507 GLRA3,#cecd32,75,Glutamatergic,#3DCC3D,2,L2/3 IT,#C4EC04,7,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131075,#cecd32,75,Exc L2 LINC00507 GLRA3,#cecd32,75,,#cecd32,75,Neuron 75,#cecd32,75,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L2/3 IT
4,AAACCCACAGTGAGCA-LKTX_190129_01_A01,AAACCCACAGTGAGCA-21L8TX_180927_001_A01,Oligo L2-6 OPALIN FTH1P3,#003a28,120,Non-Neuronal,#171799,3,Oligo,#2E3E39,17,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131120,#003a28,120,Oligo L2-6 OPALIN FTH1P3,#003a28,120,,#003a28,120,Non-neuron 3,#003a28,120,H18.30.001,#FF7373,1,nucleus,,False,,Non-Neuronal,Oligo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76528,TTTGTTGAGATGGCGT-LKTX_190130_01_H01,TTTGTTGAGATGGCGT-35L8TX_181108_001_D01,Oligo L2-6 OPALIN FTH1P3,#003a28,120,Non-Neuronal,#171799,3,Oligo,#2E3E39,17,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131120,#003a28,120,Oligo L2-6 OPALIN FTH1P3,#003a28,120,,#003a28,120,Non-neuron 3,#003a28,120,H18.30.001,#FF7373,1,nucleus,,False,,Non-Neuronal,Oligo
76529,TTTGTTGCACAGCCAC-LKTX_190130_01_H01,TTTGTTGCACAGCCAC-35L8TX_181108_001_D01,Exc L3-5 RORB LNX2,#01d9d5,90,Glutamatergic,#3DCC3D,2,L5 IT,#50B2AD,8,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131090,#01d9d5,90,Exc L3-5 RORB LNX2,#01d9d5,90,,#01d9d5,90,Neuron 90,#01d9d5,90,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L5 IT
76530,TTTGTTGCAGAGACTG-LKTX_190130_01_H01,TTTGTTGCAGAGACTG-35L8TX_181108_001_D01,Exc L2-3 RORB PTPN3,#b7ce00,81,Glutamatergic,#3DCC3D,2,L2/3 IT,#C4EC04,7,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131081,#b7ce00,81,Exc L2-3 RORB PTPN3,#b7ce00,81,,#b7ce00,81,Neuron 81,#b7ce00,81,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L2/3 IT
76531,TTTGTTGCATAATGAG-LKTX_190130_01_H01,TTTGTTGCATAATGAG-35L8TX_181108_001_D01,Oligo L2-6 OPALIN FTH1P3,#003a28,120,Non-Neuronal,#171799,3,Oligo,#2E3E39,17,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131120,#003a28,120,Oligo L2-6 OPALIN FTH1P3,#003a28,120,,#003a28,120,Non-neuron 3,#003a28,120,H18.30.001,#FF7373,1,nucleus,,False,,Non-Neuronal,Oligo


In [407]:
homolog_mouse.obs['sample_name'] = homolog_mouse.obs.index

  homolog_mouse.obs['sample_name'] = homolog_mouse.obs.index


In [408]:
homolog_mouse.obs['homolog_class_label'] = homolog_mouse.obs['Allen.class_label']
homolog_mouse.obs['homolog_subclass_label'] = homolog_mouse.obs['Allen.subclass_label']

In [411]:
homolog_mouse.obs = homolog_mouse.obs.drop('Unnamed: 0', axis = 1)

In [412]:
homolog_mouse.obs

Unnamed: 0,nUMI,nGene,QC,cluster,Allen.cluster_id,Allen.cluster_label,Allen.class_label,Allen.subclass_label,comb.QC,row,BICCN_cluster_id,BICCN_cluster_label,BICCN_class_label,BICCN_subclass_label,size,gene.counts,umi.counts,Broad.QC.doublet,Broad.QC.Mito,Broad.passQC,MALE,Comb.QC,cl,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sex_ontology_term_id,is_primary_data,organism_ontology_term_id,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage,sample_name,homolog_class_label,homolog_subclass_label
pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC,4499,2094,PassQC,Astro_14,91.0,Astro Aqp4_1,Non-Neuronal,Astro,PassQC,4,91.0,Astro Aqp4_1,Non-Neuronal,Astro,16143.0,2601.086105,5846.112804,0.03,0.0,0.97,0.448615,,113.0,Non-NeuronalAstro,ILX:0770141,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000127,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,astrocyte,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC,Non-Neuronal,Astro
pBICCNsMMrMOpRMiF007d190314_AAAGGTAGTGGCTGAA,11900,4182,PassQC,Ex2_9,48.0,L2/3 IT_1,Glutamatergic,L2/3 IT,PassQC,5,48.0,L2/3 IT_1,Glutamatergic,L2/3 IT,23971.0,5300.903467,19543.896210,0.11,0.0,0.89,0.402945,,64.0,GlutamatergicL2/3 IT,ILX:0770156,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAAGGTAGTGGCTGAA,Glutamatergic,L2/3 IT
pBICCNsMMrMOpRMiF007d190314_AAAGTCCCAAGAGCTG,4166,2025,PassQC,Astro_0,91.0,Astro Aqp4_1,Non-Neuronal,Astro,PassQC,6,91.0,Astro Aqp4_1,Non-Neuronal,Astro,16143.0,2601.086105,5846.112804,0.03,0.0,0.97,0.448615,,113.0,Non-NeuronalAstro,ILX:0770141,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000127,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,astrocyte,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAAGTCCCAAGAGCTG,Non-Neuronal,Astro
pBICCNsMMrMOpRMiF007d190314_AAATGGAAGATCACCT,11834,4090,PassQC,Ex3_0,66.0,L6 CT Cpa6,Glutamatergic,L6 CT,PassQC,7,66.0,L6 CT Cpa6,Glutamatergic,L6 CT,16829.0,4862.752035,15796.942180,0.00,0.0,0.99,0.471210,,104.0,GlutamatergicL6 CT,ILX:0770162,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAATGGAAGATCACCT,Glutamatergic,L6 CT
pBICCNsMMrMOpRMiF007d190314_AACAAAGCACGCGCAT,1957,1362,PassQC,Micro_14,113.0,Macrophage_2,Non-Neuronal,Macrophage,PassQC,8,113.0,Macrophage_2,Non-Neuronal,Macrophage,7801.0,1583.783361,2507.487117,0.00,0.0,1.00,0.482374,,123.0,Non-NeuronalMacrophage,Macrophage,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000235,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,macrophage,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AACAAAGCACGCGCAT,Non-Neuronal,Macrophage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,20193,5695,PassQC,Ex1_5,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,PassQC,215818,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,3978.0,5218.066616,18064.753140,0.01,0.0,0.99,0.471594,,85.0,GlutamatergicL6 IT,ILX:0770158,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,Glutamatergic,L6 IT
pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,2858,1602,PassQC,Oligo_19,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,PassQC,215820,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,16566.0,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150.0,Non-NeuronalOligo,ILX:0770140,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000128,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,oligodendrocyte,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,Non-Neuronal,Oligo
pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,35854,7344,PassQC,Ex1_3,51.0,L5 IT S100b,Glutamatergic,L5 IT,PassQC,215821,51.0,L5 IT S100b,Glutamatergic,L5 IT,8684.0,5464.036043,20579.022920,0.01,0.0,0.99,0.385306,,70.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,Glutamatergic,L5 IT
pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,23493,6146,PassQC,Ex1_3,52.0,L5 IT Pld5,Glutamatergic,L5 IT,PassQC,215822,52.0,L5 IT Pld5,Glutamatergic,L5 IT,3621.0,5680.692074,22718.304610,0.01,0.0,0.99,0.446838,,83.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,Glutamatergic,L5 IT


# Create o2m object

## Create the matrix and var df

In [173]:
one2many_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
15,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000057561,ortholog_one2many,Eif1a
16,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000113201,ortholog_one2many,Eif1ad13
17,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079031,ortholog_one2many,Eif1ad2
18,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079029,ortholog_one2many,Eif1ad7
19,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000093847,ortholog_one2many,Eif1ad15
...,...,...,...,...,...,...
21930,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062878,ortholog_one2many,Or14a257
21931,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055610,ortholog_one2many,Or14a260
21932,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062042,ortholog_one2many,Or14a256
21933,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055571,ortholog_one2many,Or14a259


In [135]:
homolog_human.obs_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '76523', '76524', '76525', '76526', '76527', '76528', '76529', '76530',
       '76531', '76532'],
      dtype='object', length=76533)

In [100]:
homolog_human.var_names

Index(['23', '60', '155', '159', '173', '207', '302', '309', '347', '408',
       ...
       '50100', '50101', '50119', '50138', '50154', '50155', '50171', '50187',
       '50196', '50216'],
      dtype='object', length=826)

In [101]:
homolog_human.var

Unnamed: 0,gene,chromosome,entrez_id,gene_name,mouse_homologenes
23,AADACL4,1,343066,arylacetamide deacetylase-like 4,Gm13177
60,ABCB1,7,5243,"ATP-binding cassette, sub-family B (MDR/TAP), ...",Abcb1a
155,ACAA1,3,30,acetyl-CoA acyltransferase 1,Acaa1a
159,ACAD10,12,80724,"acyl-CoA dehydrogenase family, member 10",Acad10
173,ACAT2,6,39,acetyl-CoA acetyltransferase 2,Acat2;Acat3
...,...,...,...,...,...
50155,ZNF845,19,91664,zinc finger protein 845,
50171,ZNF878,19,729747,zinc finger protein 878,Zfp617
50187,ZNF92,7,168374,zinc finger protein 92,
50196,ZNF98,19,148198,zinc finger protein 98,


In [107]:
homolog_human.var_names

Index(['23', '60', '155', '159', '173', '207', '302', '309', '347', '408',
       ...
       '50100', '50101', '50119', '50138', '50154', '50155', '50171', '50187',
       '50196', '50216'],
      dtype='object', length=826)

In [152]:
homolog_mouse.var_names

Index(['ENSMUSG00000090129', 'ENSMUSG00000020029', 'ENSMUSG00000024292',
       'ENSMUSG00000022586', 'ENSMUSG00000056197', 'ENSMUSG00000054850',
       'ENSMUSG00000041144', 'ENSMUSG00000096436', 'ENSMUSG00000033765',
       'ENSMUSG00000075470',
       ...
       'ENSMUSG00000103362', 'ENSMUSG00000070810', 'ENSMUSG00000094328',
       'ENSMUSG00000056367', 'ENSMUSG00000007440', 'ENSMUSG00000021404',
       'ENSMUSG00000001666', 'ENSMUSG00000095620', 'ENSMUSG00000045883',
       'ENSMUSG00000096679'],
      dtype='object', name='feature_id', length=1558)

In [172]:
homolog_mouse.var

Unnamed: 0_level_0,feature_is_filtered,feature_name,feature_reference,feature_biotype
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUSG00000090129,False,Olfr287,NCBITaxon:10090,gene
ENSMUSG00000020029,False,Nudt4,NCBITaxon:10090,gene
ENSMUSG00000024292,False,Cyp4f14,NCBITaxon:10090,gene
ENSMUSG00000022586,False,Ly6i,NCBITaxon:10090,gene
ENSMUSG00000056197,False,4931417E11Rik,NCBITaxon:10090,gene
...,...,...,...,...
ENSMUSG00000021404,False,Serpinb9c,NCBITaxon:10090,gene
ENSMUSG00000001666,False,Ddt,NCBITaxon:10090,gene
ENSMUSG00000095620,False,Csta2,NCBITaxon:10090,gene
ENSMUSG00000045883,False,Olfr1461,NCBITaxon:10090,gene


In [154]:
homolog_mouse.obs.shape[0]

159738

In [215]:
np.asarray(homolog_mouse.X.todense()[:,0].flatten())[0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [223]:
np.asarray(homolog_mouse.X.todense()[:,[0,1,3]]).T

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 1., 4., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [196]:
mouse_matrix = homolog_mouse.X.todense()[:,0].flatten()

In [208]:
array_mouse = np.asarray(mouse_matrix)[0]

In [211]:
array_mouse

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [169]:
homolog_human.X

ArrayView([[0., 1., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.],
           ...,
           [0., 0., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.],
           [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [114]:
homolog_human.X[:,0] #Gives all the value for a gene

ArrayView([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [214]:
np.concatenate([homolog_human.X[:,0], array_mouse],axis=0)

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [122]:
homolog_human.var.loc[homolog_human.var.entrez_id == 80724].index[0]

'159'

In [131]:
np.where(homolog_human.var_names == homolog_human.var.loc[homolog_human.var.entrez_id == 80724].index[0])[0][0]

3

In [127]:
homolog_human.X[:,np.where(homolog_human.var_names == homolog_human.var.loc[homolog_human.var.entrez_id == 80724].index[0])[0][0]]

ArrayView([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [134]:
sum(homolog_human.X[:,3] == homolog_human.X[:,np.where(homolog_human.var_names == homolog_human.var.loc[homolog_human.var.entrez_id == 80724].index[0])[0][0]])

76533

In [147]:
one2many_genes.loc[one2many_genes.human_entrezgene_id == 5243].human_entrezgene_id.values[0]

5243.0

In [150]:
homolog_human.var.loc[homolog_human.var.entrez_id == one2many_genes.loc[one2many_genes.human_entrezgene_id == 5243].human_entrezgene_id.values[0]].chromosome.values[0]

'7'

In [175]:
homolog_mouse.var.loc[homolog_mouse.var.index == 'ENSMUSG00000113201'].index[0]

'ENSMUSG00000113201'

In [302]:
bb = [1,2,3,10]
aa = [[4,5,6,11],[7,8,9,12]]

In [303]:
np.mean(aa,axis=0)

array([ 5.5,  6.5,  7.5, 11.5])

In [320]:
aa.T

array([[ 4,  7,  1,  1],
       [ 5,  8,  2,  2],
       [ 6,  9,  3,  3],
       [11, 12, 10, 10]])

In [319]:
c = [1,2]
c.append(3)
c

[1, 2, 3]

In [313]:
aa = np.concatenate([aa,[bb]],axis=0)

In [178]:
 b = np.array([1,2])

In [310]:
a = np.array([[1, 2], [3, 4]])
a

array([[1, 2],
       [3, 4]])

In [177]:
np.mean(a, axis=0)

array([2., 3.])

In [322]:
d = np.append(a,[b], axis=0)
d.T

array([[1, 3, 1],
       [2, 4, 2]])

In [226]:
trym = np.asarray(homolog_mouse.X.todense()[:,[0,1,3]]).T
trymm = np.mean(trym, axis=0)
mat = np.concatenate([homolog_human.X[:,0], trymm])

In [228]:
len(mat)

236271

In [234]:
resexp = np.append([mat], [mat], axis = 0)
resexp

array([[0.        , 0.        , 0.        , ..., 0.33333334, 1.3333334 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.33333334, 1.3333334 ,
        0.        ]], dtype=float32)

In [238]:
np.shape(resexp)

(2, 236271)

In [245]:
resexp.T

array([[0.        , 0.        ],
       [0.        , 0.        ],
       [0.        , 0.        ],
       ...,
       [0.33333334, 0.33333334],
       [1.3333334 , 1.3333334 ],
       [0.        , 0.        ]], dtype=float32)

In [258]:
one2many_genes.mouse_homolog_ensembl_gene.iloc[[1,4,7]]

16    ENSMUSG00000113201
19    ENSMUSG00000093847
22    ENSMUSG00000113805
Name: mouse_homolog_ensembl_gene, dtype: object

In [290]:
names = one2many_genes.mouse_homolog_ensembl_gene.iloc[[1,4,7]]

In [291]:
names = np.asarray(names)

In [295]:
names.sort()

In [293]:
names = [str(i) for i in names]

In [296]:
names

['ENSMUSG00000093847', 'ENSMUSG00000113201', 'ENSMUSG00000113805']

In [690]:
def construct_o2m_matrixanddf():
    # def the variables to return 
    matrix = []
    already_done = {}
    var_df = []
    mouse_ensembl_ids = []
    
    # start the loop throught the o2m dataframe
    for index, row in one2many_genes.iterrows():
        #get the human id
        human_id = row.human_ensembl_gene_id
        
        # check if it has already been done, if not we get into the if
        if human_id not in already_done.keys():
            # starts new lines for dataframes
            var_line = {}
            
            # get all the human info
            var_line['human_ensembl_id'] = human_id
            var_line['human_entrez_id'] = row.human_entrezgene_id
            var_line['human_gene_name'] = row.human_external_gene_name
            var_line['orthology_type'] = row.orthology_type
            var_line['human_chromosome'] = homolog_human.var.loc[homolog_human.var.entrez_id == row.human_entrezgene_id].chromosome.iloc[0]
            var_line['human_long_gene_name'] = homolog_human.var.loc[homolog_human.var.entrez_id == row.human_entrezgene_id].gene_name.iloc[0]
            
            already_done[human_id] = []
            
            # get all the mouse homolog genes for the human gene
            current_df = one2many_genes.loc[one2many_genes.human_ensembl_gene_id == human_id]
            
            # def mouse arrays for stocking info
            mouse_counts = []
            mouse_names = []
            mouse_ids = []
            index = []
            
            # pass throught all the mouse genes
            for index2, row2 in current_df.iterrows():
                # create arrays for the df
                mouse_names.append(row2.mouse_homolog_gene_name)
                mouse_ids.append(row2.mouse_homolog_ensembl_gene)
                # append the array to check afterwards
                mouse_ensembl_ids.append(row2.mouse_homolog_ensembl_gene)
                # fill the dictionnary
                already_done[human_id].append(row2.mouse_homolog_ensembl_gene)
                # get the index of the gene in the matrix
                index.append(np.where(homolog_mouse.var_names == homolog_mouse.var.loc[homolog_mouse.var.index == row2.mouse_homolog_ensembl_gene].index[0])[0][0])
            
            # get the cells counts for each gene
            homolog_matrix = np.asarray(homolog_mouse.X.todense()[:, index].T)
            # get the mean for each cell
            mouse_counts = np.mean(homolog_matrix, axis=0)        
            
            # add the newline to the count matrix
            if matrix == []:
                matrix = np.array([np.concatenate([homolog_human.X[:,np.where(homolog_human.var_names == homolog_human.var.loc[homolog_human.var.entrez_id == row.human_entrezgene_id].index[0])[0][0]],
                                          mouse_counts])])
            else : 
                matrix = np.append(matrix,
                                   [np.concatenate([homolog_human.X[:,np.where(homolog_human.var_names == homolog_human.var.loc[homolog_human.var.entrez_id == row.human_entrezgene_id].index[0])[0][0]],
                                              mouse_counts])],
                                   axis = 0)
            
            # To find the genes afterward if necessary
            already_done[human_id] = mouse_ids
            
            # add the information for the mouse's genes to the df
            var_line['mouse_homologs_ids'] = mouse_ids
            var_line['mouse_homologs_names'] = mouse_names
            mouse_ids.sort()
            var_line['mouse_ensembl_id'] = mouse_ids[0]
            var_line['mouse_gene_name'] = one2many_genes.loc[one2many_genes.mouse_homolog_ensembl_gene == mouse_ids[0]].mouse_homolog_gene_name.iloc[0]
            
            # add the newline to the rows of the df
            var_df.append(var_line)
                
    return matrix.T, var_df, already_done, mouse_ensembl_ids

In [691]:
o2m_matrix, o2m_var_df, human_genes_done, mouse_ids = construct_o2m_matrixanddf()

  if matrix == []:


In [692]:
o2m_matrix.shape

(236271, 356)

In [693]:
type(o2m_matrix)

numpy.ndarray

In [694]:
o2m_matrix

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.11764706, 0.5       , 0.        , ..., 1.5       , 2.5       ,
        0.        ],
       [0.05882353, 0.        , 0.        , ..., 1.        , 5.5       ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.5       , 1.        ,
        0.        ]], dtype=float32)

In [695]:
np.shape(homolog_human.X)

(76533, 826)

In [696]:
o2m_matrix[76533:,:]

array([[0.        , 0.        , 0.        , ..., 1.5       , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.11764706, 0.5       , 0.        , ..., 1.5       , 2.5       ,
        0.        ],
       [0.05882353, 0.        , 0.        , ..., 1.        , 5.5       ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.5       , 1.        ,
        0.        ]], dtype=float32)

In [697]:
o2m_var = pd.DataFrame(o2m_var_df)

In [698]:
o2m_var

Unnamed: 0,human_ensembl_id,human_entrez_id,human_gene_name,orthology_type,human_chromosome,human_long_gene_name,mouse_homologs_ids,mouse_homologs_names,mouse_ensembl_id,mouse_gene_name
0,ENSG00000198692,9086.0,EIF1AY,ortholog_one2many,Y,"eukaryotic translation initiation factor 1A, Y...","[ENSMUSG00000057561, ENSMUSG00000072905, ENSMU...","[Eif1a, Eif1ad13, Eif1ad2, Eif1ad7, Eif1ad15, ...",ENSMUSG00000057561,Eif1a
1,ENSG00000198252,6815.0,STYX,ortholog_one2many,14,serine/threonine/tyrosine interacting protein,"[ENSMUSG00000053205, ENSMUSG00000071748]","[Styx-ps, Styx]",ENSMUSG00000053205,Styx
2,ENSG00000101435,128821.0,CST9L,ortholog_one2many,20,cystatin 9-like,"[ENSMUSG00000027445, ENSMUSG00000027446]","[Cstdc2, Cst9]",ENSMUSG00000027445,Cst9
3,ENSG00000100612,51635.0,DHRS7,ortholog_one2many,14,dehydrogenase/reductase (SDR family) member 7,"[ENSMUSG00000021094, ENSMUSG00000109482]","[Dhrs7, Gm4756]",ENSMUSG00000021094,Dhrs7
4,ENSG00000050130,51528.0,JKAMP,ortholog_one2many,14,JNK1/MAPK8-associated membrane protein,"[ENSMUSG00000005078, ENSMUSG00000056197]","[Jkamp, Jkampl]",ENSMUSG00000005078,Jkamp
...,...,...,...,...,...,...,...,...,...,...
351,ENSG00000143416,8991.0,SELENBP1,ortholog_one2many,1,selenium binding protein 1,"[ENSMUSG00000068874, ENSMUSG00000068877]","[Selenbp1, Selenbp2]",ENSMUSG00000068874,Selenbp1
352,ENSG00000134184,2944.0,GSTM1,ortholog_one2many,1,glutathione S-transferase mu 1,"[ENSMUSG00000004038, ENSMUSG00000040562, ENSMU...","[Gstm3, Gstm2, Gstm6, Gstm1]",ENSMUSG00000004038,Gstm3
353,ENSG00000074800,2023.0,ENO1,ortholog_one2many,1,"enolase 1, (alpha)","[ENSMUSG00000059040, ENSMUSG00000063524]","[Eno1b, Eno1]",ENSMUSG00000059040,Eno1b
354,ENSG00000143549,7170.0,TPM3,ortholog_one2many,1,tropomyosin 3,"[ENSMUSG00000027940, ENSMUSG00000058126]","[Tpm3, Tpm3-rs7]",ENSMUSG00000027940,Tpm3


In [699]:
len(human_genes_done.keys())

356

In [700]:
len(mouse_ids)

1346

In [701]:
len(one2many_genes.human_ensembl_gene_id.unique())

356

In [702]:
len(one2many_genes.mouse_homolog_ensembl_gene.unique())

1346

## Create anndata object with the results

In [703]:
o2m_homolog = anndata.AnnData(X = o2m_matrix, obs = pd.concat([homolog_human.obs, homolog_mouse.obs], axis = 0),var = o2m_var)



In [704]:
o2m_homolog

AnnData object with n_obs × n_vars = 236271 × 356
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type', 'homolog_class_label', 'homolog_subclass_label', 'nUMI', 'nGene', 'QC', 'clu

In [705]:
o2m_homolog.X

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.11764706, 0.5       , 0.        , ..., 1.5       , 2.5       ,
        0.        ],
       [0.05882353, 0.        , 0.        , ..., 1.        , 5.5       ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.5       , 1.        ,
        0.        ]], dtype=float32)

In [706]:
o2m_homolog.var_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '346', '347', '348', '349', '350', '351', '352', '353', '354', '355'],
      dtype='object', length=356)

In [707]:
o2m_homolog.obs_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       'pBICCNsMMrMOpRMiM006d190320_TTTCCTCTCGAGCCTG',
       'pBICCNsMMrMOpRMiM006d190320_TTTCCTCTCTAACACG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGATCAGTTACGTC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGAGAGTTGTAAG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTAGGATTTCC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCATGCTAG'],
      dtype='object', length=236271)

In [708]:
o2m_homolog.var = o2m_homolog.var.set_index('human_entrez_id')
o2m_homolog.var['human_entrez_id'] = o2m_homolog.var.index

AnnData expects .var.index to contain strings, but got values like:
    [9086.0, 6815.0, 128821.0, 51635.0, 51528.0]

    Inferred to be: floating

  value_idx = self._prep_dim_index(value.index, attr)


In [709]:
o2m_homolog.var

Unnamed: 0_level_0,human_ensembl_id,human_gene_name,orthology_type,human_chromosome,human_long_gene_name,mouse_homologs_ids,mouse_homologs_names,mouse_ensembl_id,mouse_gene_name,human_entrez_id
human_entrez_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9086.0,ENSG00000198692,EIF1AY,ortholog_one2many,Y,"eukaryotic translation initiation factor 1A, Y...","[ENSMUSG00000057561, ENSMUSG00000072905, ENSMU...","[Eif1a, Eif1ad13, Eif1ad2, Eif1ad7, Eif1ad15, ...",ENSMUSG00000057561,Eif1a,9086.0
6815.0,ENSG00000198252,STYX,ortholog_one2many,14,serine/threonine/tyrosine interacting protein,"[ENSMUSG00000053205, ENSMUSG00000071748]","[Styx-ps, Styx]",ENSMUSG00000053205,Styx,6815.0
128821.0,ENSG00000101435,CST9L,ortholog_one2many,20,cystatin 9-like,"[ENSMUSG00000027445, ENSMUSG00000027446]","[Cstdc2, Cst9]",ENSMUSG00000027445,Cst9,128821.0
51635.0,ENSG00000100612,DHRS7,ortholog_one2many,14,dehydrogenase/reductase (SDR family) member 7,"[ENSMUSG00000021094, ENSMUSG00000109482]","[Dhrs7, Gm4756]",ENSMUSG00000021094,Dhrs7,51635.0
51528.0,ENSG00000050130,JKAMP,ortholog_one2many,14,JNK1/MAPK8-associated membrane protein,"[ENSMUSG00000005078, ENSMUSG00000056197]","[Jkamp, Jkampl]",ENSMUSG00000005078,Jkamp,51528.0
...,...,...,...,...,...,...,...,...,...,...
8991.0,ENSG00000143416,SELENBP1,ortholog_one2many,1,selenium binding protein 1,"[ENSMUSG00000068874, ENSMUSG00000068877]","[Selenbp1, Selenbp2]",ENSMUSG00000068874,Selenbp1,8991.0
2944.0,ENSG00000134184,GSTM1,ortholog_one2many,1,glutathione S-transferase mu 1,"[ENSMUSG00000004038, ENSMUSG00000040562, ENSMU...","[Gstm3, Gstm2, Gstm6, Gstm1]",ENSMUSG00000004038,Gstm3,2944.0
2023.0,ENSG00000074800,ENO1,ortholog_one2many,1,"enolase 1, (alpha)","[ENSMUSG00000059040, ENSMUSG00000063524]","[Eno1b, Eno1]",ENSMUSG00000059040,Eno1b,2023.0
7170.0,ENSG00000143549,TPM3,ortholog_one2many,1,tropomyosin 3,"[ENSMUSG00000027940, ENSMUSG00000058126]","[Tpm3, Tpm3-rs7]",ENSMUSG00000027940,Tpm3,7170.0


In [710]:
o2m_homolog.obs

Unnamed: 0,sample_name,exp_component_name,cluster_label,cluster_color,cluster_order,class_label,class_color,class_order,subclass_label,subclass_color,subclass_order,donor_sex_label,donor_sex_color,donor_sex_order,region_label,region_color,region_order,cortical_layer_label,cortical_layer_color,cortical_layer_order,cell_type_accession_label,cell_type_accession_color,cell_type_accession_order,cell_type_alias_label,cell_type_alias_color,cell_type_alias_order,cell_type_alt_alias_label,cell_type_alt_alias_color,cell_type_alt_alias_order,cell_type_designation_label,cell_type_designation_color,cell_type_designation_order,external_donor_name_label,external_donor_name_color,external_donor_name_order,specimen_type,full_genotype_label,outlier_call,outlier_type,homolog_class_label,homolog_subclass_label,nUMI,nGene,QC,cluster,Allen.cluster_id,Allen.cluster_label,Allen.class_label,Allen.subclass_label,comb.QC,row,BICCN_cluster_id,BICCN_cluster_label,BICCN_class_label,BICCN_subclass_label,size,gene.counts,umi.counts,Broad.QC.doublet,Broad.QC.Mito,Broad.passQC,MALE,Comb.QC,cl,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sex_ontology_term_id,is_primary_data,organism_ontology_term_id,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage
0,AAACCCAAGGATTTCC-LKTX_190129_01_A01,AAACCCAAGGATTTCC-21L8TX_180927_001_A01,Inh L1-2 SST CCNJL,#fb8d00,50.0,GABAergic,#FF7373,1.0,Sst,#FF9900,5.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131050,#fb8d00,50.0,Inh L1-2 SST CCNJL,#fb8d00,50.0,,#fb8d00,50.0,Neuron 50,#fb8d00,50.0,H18.30.001,#FF7373,1.0,nucleus,,False,,GABAergic,Sst,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AAACCCAAGTATGGCG-LKTX_190129_01_A01,AAACCCAAGTATGGCG-21L8TX_180927_001_A01,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116.0,Glutamatergic,#3DCC3D,2.0,L5/6 NP,#3E9E64,12.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131116,#2c815f,116.0,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116.0,,#2c815f,116.0,Neuron 116,#2c815f,116.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Glutamatergic,L5/6 NP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,AAACCCACAAAGTGTA-LKTX_190129_01_A01,AAACCCACAAAGTGTA-21L8TX_180927_001_A01,Exc L3-5 RORB LINC01202,#547d7a,87.0,Glutamatergic,#3DCC3D,2.0,L5 IT,#50B2AD,8.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131087,#547d7a,87.0,Exc L3-5 RORB LINC01202,#547d7a,87.0,,#547d7a,87.0,Neuron 87,#547d7a,87.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Glutamatergic,L5 IT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,AAACCCACACTACTTT-LKTX_190129_01_A01,AAACCCACACTACTTT-21L8TX_180927_001_A01,Exc L2 LINC00507 GLRA3,#cecd32,75.0,Glutamatergic,#3DCC3D,2.0,L2/3 IT,#C4EC04,7.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131075,#cecd32,75.0,Exc L2 LINC00507 GLRA3,#cecd32,75.0,,#cecd32,75.0,Neuron 75,#cecd32,75.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Glutamatergic,L2/3 IT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,AAACCCACAGTGAGCA-LKTX_190129_01_A01,AAACCCACAGTGAGCA-21L8TX_180927_001_A01,Oligo L2-6 OPALIN FTH1P3,#003a28,120.0,Non-Neuronal,#171799,3.0,Oligo,#2E3E39,17.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131120,#003a28,120.0,Oligo L2-6 OPALIN FTH1P3,#003a28,120.0,,#003a28,120.0,Non-neuron 3,#003a28,120.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Non-Neuronal,Oligo,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L6 IT,20193.0,5695.0,PassQC,Ex1_5,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,PassQC,215818.0,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,3978.0,5218.066616,18064.753140,0.01,0.0,0.99,0.471594,,85.0,GlutamatergicL6 IT,ILX:0770158,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Non-Neuronal,Oligo,2858.0,1602.0,PassQC,Oligo_19,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,PassQC,215820.0,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,16566.0,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150.0,Non-NeuronalOligo,ILX:0770140,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000128,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,oligodendrocyte,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L5 IT,35854.0,7344.0,PassQC,Ex1_3,51.0,L5 IT S100b,Glutamatergic,L5 IT,PassQC,215821.0,51.0,L5 IT S100b,Glutamatergic,L5 IT,8684.0,5464.036043,20579.022920,0.01,0.0,0.99,0.385306,,70.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L5 IT,23493.0,6146.0,PassQC,Ex1_3,52.0,L5 IT Pld5,Glutamatergic,L5 IT,PassQC,215822.0,52.0,L5 IT Pld5,Glutamatergic,L5 IT,3621.0,5680.692074,22718.304610,0.01,0.0,0.99,0.446838,,83.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage


## Check if the human matrix is still the same

In [355]:
o2m_human = homolog_human[:,homolog_human.var.entrez_id.isin(one2many_genes.human_entrezgene_id)]

In [356]:
o2m_human

View of AnnData object with n_obs × n_vars = 76533 × 356
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type'
    var: 'gene', 'chromosome', 'entrez_id', 'gene_name', 'mouse_homolo

In [433]:
o2m_human.obs_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '76523', '76524', '76525', '76526', '76527', '76528', '76529', '76530',
       '76531', '76532'],
      dtype='object', length=76533)

In [435]:
o2m_human.var.index = o2m_human.var.entrez_id
o2m_human.var['human_entrezgene_id'] = o2m_human.var.index

  o2m_human.var['human_entrezgene_id'] = o2m_human.var.index


In [436]:
o2m_human.var_names

Index(['343066', '5243', '30', '80724', '39', '23597', '8748', '113179',
       '11047', '185',
       ...
       '4013', '91833', '387032', '353355', '10782', '80778', '285971',
       '284323', '91664', '729747'],
      dtype='object', name='entrez_id', length=356)

In [437]:
o2m_human.var

Unnamed: 0_level_0,gene,chromosome,entrez_id,gene_name,mouse_homologenes,human_entrezgene_id
entrez_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
343066,AADACL4,1,343066,arylacetamide deacetylase-like 4,Gm13177,343066
5243,ABCB1,7,5243,"ATP-binding cassette, sub-family B (MDR/TAP), ...",Abcb1a,5243
30,ACAA1,3,30,acetyl-CoA acyltransferase 1,Acaa1a,30
80724,ACAD10,12,80724,"acyl-CoA dehydrogenase family, member 10",Acad10,80724
39,ACAT2,6,39,acetyl-CoA acetyltransferase 2,Acat2;Acat3,39
...,...,...,...,...,...,...
80778,ZNF34,8,80778,zinc finger protein 34,,80778
285971,ZNF775,7,285971,zinc finger protein 775,Zfp775,285971
284323,ZNF780A,19,284323,zinc finger protein 780A,C030039L03Rik;Zfp607,284323
91664,ZNF845,19,91664,zinc finger protein 845,,91664


In [462]:
def check_cell_counts():
    correct = 0
    incorrect = {}
    for cell in o2m_human.obs_names:
        for gene in o2m_human.var.entrez_id:
            human_gene_index = np.where(o2m_human.var_names == str(gene))[0][0]
            homolog_human_gene_index = np.where(o2m_homolog.var_names == gene)[0][0]
            if o2m_human.X[int(cell),int(human_gene_index)] == o2m_homolog.X[int(cell),int(homolog_human_gene_index)]:
                correct = correct+1
            else :
                incorrect[gene] = [cell,human_gene_index,homolog_human_gene_index]
    return correct, incorrect

In [503]:
correct, incorrect = check_cell_counts()

In [504]:
print(correct, len(incorrect.keys()))

27245748 0


## Write the results

In [711]:
o2m_homolog.obs = o2m_homolog.obs.drop(['outlier_call','is_primary_data'], axis = 1)

In [714]:
o2m_homolog.var.mouse_homologs_ids

human_entrez_id
9086.0      [ENSMUSG00000057561, ENSMUSG00000072905, ENSMU...
6815.0               [ENSMUSG00000053205, ENSMUSG00000071748]
128821.0             [ENSMUSG00000027445, ENSMUSG00000027446]
51635.0              [ENSMUSG00000021094, ENSMUSG00000109482]
51528.0              [ENSMUSG00000005078, ENSMUSG00000056197]
                                  ...                        
8991.0               [ENSMUSG00000068874, ENSMUSG00000068877]
2944.0      [ENSMUSG00000004038, ENSMUSG00000040562, ENSMU...
2023.0               [ENSMUSG00000059040, ENSMUSG00000063524]
7170.0               [ENSMUSG00000027940, ENSMUSG00000058126]
388761.0    [ENSMUSG00000055571, ENSMUSG00000055610, ENSMU...
Name: mouse_homologs_ids, Length: 356, dtype: object

In [715]:
o2m_homolog.var.human_chromosome

human_entrez_id
9086.0       Y
6815.0      14
128821.0    20
51635.0     14
51528.0     14
            ..
8991.0       1
2944.0       1
2023.0       1
7170.0       1
388761.0     1
Name: human_chromosome, Length: 356, dtype: object

In [716]:
o2m_homolog_write = o2m_homolog

In [717]:
o2m_homolog_write

AnnData object with n_obs × n_vars = 236271 × 356
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_type', 'homolog_class_label', 'homolog_subclass_label', 'nUMI', 'nGene', 'QC', 'cluster', 'Allen.cl

In [718]:
o2m_homolog_write.var

Unnamed: 0_level_0,human_ensembl_id,human_gene_name,orthology_type,human_chromosome,human_long_gene_name,mouse_homologs_ids,mouse_homologs_names,mouse_ensembl_id,mouse_gene_name,human_entrez_id
human_entrez_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
9086.0,ENSG00000198692,EIF1AY,ortholog_one2many,Y,"eukaryotic translation initiation factor 1A, Y...","[ENSMUSG00000057561, ENSMUSG00000072905, ENSMU...","[Eif1a, Eif1ad13, Eif1ad2, Eif1ad7, Eif1ad15, ...",ENSMUSG00000057561,Eif1a,9086.0
6815.0,ENSG00000198252,STYX,ortholog_one2many,14,serine/threonine/tyrosine interacting protein,"[ENSMUSG00000053205, ENSMUSG00000071748]","[Styx-ps, Styx]",ENSMUSG00000053205,Styx,6815.0
128821.0,ENSG00000101435,CST9L,ortholog_one2many,20,cystatin 9-like,"[ENSMUSG00000027445, ENSMUSG00000027446]","[Cstdc2, Cst9]",ENSMUSG00000027445,Cst9,128821.0
51635.0,ENSG00000100612,DHRS7,ortholog_one2many,14,dehydrogenase/reductase (SDR family) member 7,"[ENSMUSG00000021094, ENSMUSG00000109482]","[Dhrs7, Gm4756]",ENSMUSG00000021094,Dhrs7,51635.0
51528.0,ENSG00000050130,JKAMP,ortholog_one2many,14,JNK1/MAPK8-associated membrane protein,"[ENSMUSG00000005078, ENSMUSG00000056197]","[Jkamp, Jkampl]",ENSMUSG00000005078,Jkamp,51528.0
...,...,...,...,...,...,...,...,...,...,...
8991.0,ENSG00000143416,SELENBP1,ortholog_one2many,1,selenium binding protein 1,"[ENSMUSG00000068874, ENSMUSG00000068877]","[Selenbp1, Selenbp2]",ENSMUSG00000068874,Selenbp1,8991.0
2944.0,ENSG00000134184,GSTM1,ortholog_one2many,1,glutathione S-transferase mu 1,"[ENSMUSG00000004038, ENSMUSG00000040562, ENSMU...","[Gstm3, Gstm2, Gstm6, Gstm1]",ENSMUSG00000004038,Gstm3,2944.0
2023.0,ENSG00000074800,ENO1,ortholog_one2many,1,"enolase 1, (alpha)","[ENSMUSG00000059040, ENSMUSG00000063524]","[Eno1b, Eno1]",ENSMUSG00000059040,Eno1b,2023.0
7170.0,ENSG00000143549,TPM3,ortholog_one2many,1,tropomyosin 3,"[ENSMUSG00000027940, ENSMUSG00000058126]","[Tpm3, Tpm3-rs7]",ENSMUSG00000027940,Tpm3,7170.0


In [673]:
## Add to convert the array in str to write the result in a file

In [720]:
type(o2m_homolog_write.var.mouse_homologs_ids.iloc[0])

list

In [722]:
type(o2m_homolog_write.var.human_ensembl_id.iloc[0])

str

In [724]:
str(o2m_homolog_write.var.mouse_homologs_ids.iloc[0])

"['ENSMUSG00000057561', 'ENSMUSG00000072905', 'ENSMUSG00000079029', 'ENSMUSG00000079031', 'ENSMUSG00000079034', 'ENSMUSG00000092019', 'ENSMUSG00000093847', 'ENSMUSG00000095717', 'ENSMUSG00000095724', 'ENSMUSG00000095799', 'ENSMUSG00000096049', 'ENSMUSG00000096619', 'ENSMUSG00000096803', 'ENSMUSG00000113201', 'ENSMUSG00000113805', 'ENSMUSG00000113971', 'ENSMUSG00000114075']"

In [725]:
test = str(o2m_homolog_write.var.mouse_homologs_ids.iloc[0]).replace(']','').replace('[',"").replace("'","").split(', ')
test

['ENSMUSG00000057561',
 'ENSMUSG00000072905',
 'ENSMUSG00000079029',
 'ENSMUSG00000079031',
 'ENSMUSG00000079034',
 'ENSMUSG00000092019',
 'ENSMUSG00000093847',
 'ENSMUSG00000095717',
 'ENSMUSG00000095724',
 'ENSMUSG00000095799',
 'ENSMUSG00000096049',
 'ENSMUSG00000096619',
 'ENSMUSG00000096803',
 'ENSMUSG00000113201',
 'ENSMUSG00000113805',
 'ENSMUSG00000113971',
 'ENSMUSG00000114075']

In [726]:
test[0]

'ENSMUSG00000057561'

In [727]:
type(test)

list

In [729]:
o2m_homolog_write.var[['mouse_homologs_ids', 'mouse_homologs_names']] = o2m_homolog_write.var[['mouse_homologs_ids', 'mouse_homologs_names']].astype(str)

In [730]:
type(o2m_homolog_write.var.mouse_homologs_ids.iloc[0])

str

In [731]:
o2m_homolog_write.var.mouse_homologs_ids.iloc[0]

"['ENSMUSG00000057561', 'ENSMUSG00000072905', 'ENSMUSG00000079029', 'ENSMUSG00000079031', 'ENSMUSG00000079034', 'ENSMUSG00000092019', 'ENSMUSG00000093847', 'ENSMUSG00000095717', 'ENSMUSG00000095724', 'ENSMUSG00000095799', 'ENSMUSG00000096049', 'ENSMUSG00000096619', 'ENSMUSG00000096803', 'ENSMUSG00000113201', 'ENSMUSG00000113805', 'ENSMUSG00000113971', 'ENSMUSG00000114075']"

In [732]:
o2m_homolog_write.write_h5ad(os.path.join(path_data,'o2m_homolog_human_mouse_'+str(date.today())+'.h5ad'),compression='gzip')