# Import packages and define paths

In [1]:
import scanpy as sc
import anndata
import numpy as np
import gc
import pandas as pd 
import os
from datetime import date
pd.set_option('display.max_columns', None)
from biomart import BiomartServer
gc.isenabled()

True

In [2]:
os.path.realpath('one2many_human_mouse.ipynb')

'/nfs/research/irene/anaelle/Scripts/human_mouse/one2many_human_mouse.ipynb'

In [None]:
path_project = '/nfs/research/irene/anaelle'
path_scripts = os.path.join(path_project, 'Scripts','human_mouse_scripts')
print(path_scripts)
path_data = os.path.join(path_project, 'data')
print(path_data)

# Load Biomart output

In [4]:
human_mouse_homolog_genes = pd.read_csv(os.path.join(path_data, 'homolog_human_mouse_ensembl_gene.csv'))

In [5]:
human_mouse_homolog_genes = human_mouse_homolog_genes.drop('Unnamed: 0', axis=1)

In [6]:
human_mouse_homolog_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
0,ENSG00000198888,4535.0,MT-ND1,ENSMUSG00000064341,ortholog_one2one,mt-Nd1
1,ENSG00000198763,4536.0,MT-ND2,ENSMUSG00000064345,ortholog_one2one,mt-Nd2
2,ENSG00000198804,4512.0,MT-CO1,ENSMUSG00000064351,ortholog_one2one,mt-Co1
3,ENSG00000198712,4513.0,MT-CO2,ENSMUSG00000064354,ortholog_one2one,mt-Co2
4,ENSG00000228253,4509.0,MT-ATP8,ENSMUSG00000064356,ortholog_one2one,mt-Atp8
...,...,...,...,...,...,...
21936,ENSG00000187017,83715.0,ESPN,ENSMUSG00000028943,ortholog_one2one,Espn
21937,ENSG00000198216,777.0,CACNA1E,ENSMUSG00000004110,ortholog_one2one,Cacna1e
21938,ENSG00000179930,127665.0,ZNF648,ENSMUSG00000066797,ortholog_one2one,Zfp648
21939,ENSG00000162437,55225.0,RAVER2,ENSMUSG00000035275,ortholog_one2one,Raver2


# Import human and mouse data

In [7]:
mouse_data = sc.read_h5ad(os.path.join(path_data,'mouse/mouse_raw_counts_from_cellxgene.h5ad'))

In [8]:
mouse_data

AnnData object with n_obs × n_vars = 159738 × 30639
    obs: 'Unnamed: 0', 'nUMI', 'nGene', 'QC', 'cluster', 'Allen.cluster_id', 'Allen.cluster_label', 'Allen.class_label', 'Allen.subclass_label', 'comb.QC', 'row', 'BICCN_cluster_id', 'BICCN_cluster_label', 'BICCN_class_label', 'BICCN_subclass_label', 'size', 'gene.counts', 'umi.counts', 'Broad.QC.doublet', 'Broad.QC.Mito', 'Broad.passQC', 'MALE', 'Comb.QC', 'cl', 'temp_class_label', 'BICCN_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'

In [9]:
human_data = sc.read_h5ad(os.path.join(path_data, 'human/hdata2023-06-05.h5ad'))



In [10]:
human_data

AnnData object with n_obs × n_vars = 76533 × 50281
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type'
    var: 'gene', 'chromosome', 'entrez_id', 'gene_name', 'mouse_homologenes'

# Get only one2many label

In [11]:
human_mouse_homolog_genes.orthology_type.unique()

array(['ortholog_one2one', 'ortholog_one2many', 'ortholog_many2many'],
      dtype=object)

In [12]:
human_mouse_homolog_genes = human_mouse_homolog_genes.loc[human_mouse_homolog_genes.orthology_type == 'ortholog_one2many']

In [13]:
human_mouse_homolog_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
13,ENSG00000277196,5625.0,PRODH,ENSMUSG00000003526,ortholog_one2many,Prodh
15,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000057561,ortholog_one2many,Eif1a
16,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000113201,ortholog_one2many,Eif1ad13
17,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079031,ortholog_one2many,Eif1ad2
18,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079029,ortholog_one2many,Eif1ad7
...,...,...,...,...,...,...
21930,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062878,ortholog_one2many,Or14a257
21931,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055610,ortholog_one2many,Or14a260
21932,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062042,ortholog_one2many,Or14a256
21933,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055571,ortholog_one2many,Or14a259


## Check for duplicates

In [14]:
human_mouse_homolog_genes.duplicated().any()

False

In [15]:
human_mouse_homolog_genes.human_ensembl_gene_id.duplicated().any()

True

In [16]:
human_mouse_homolog_genes.human_entrezgene_id.duplicated().any()

True

In [17]:
human_mouse_homolog_genes.mouse_homolog_ensembl_gene.duplicated().any()

True

### 1 entrez id ~ many ensembl

In [18]:
doublets = {}
for index, row in human_mouse_homolog_genes.iterrows():
    if row['human_entrezgene_id'] not in doublets.keys():
        doublets[row['human_entrezgene_id']] = []
    if row.human_ensembl_gene_id not in doublets[row['human_entrezgene_id']]:
        doublets[row['human_entrezgene_id']].append(row.human_ensembl_gene_id)

In [19]:
len(doublets)

826

In [20]:
real_duplicates = {}
for gene in doublets.keys():
    if len(doublets[gene])>1:
        real_duplicates[gene] = doublets[gene]

In [21]:
real_duplicates

{5625.0: ['ENSG00000277196', 'ENSG00000100033'],
 6606.0: ['ENSG00000205571', 'ENSG00000172062'],
 29057.0: ['ENSG00000268350', 'ENSG00000179304'],
 1159.0: ['ENSG00000237289', 'ENSG00000223572'],
 552900.0: ['ENSG00000169627', 'ENSG00000183336'],
 79008.0: ['ENSG00000132207', 'ENSG00000181625'],
 3963.0: ['ENSG00000178934', 'ENSG00000205076'],
 55894.0: ['ENSG00000177243', 'ENSG00000176797'],
 245910.0: ['ENSG00000198129', 'ENSG00000186572'],
 10156.0: ['ENSG00000105808', 'ENSG00000170667'],
 51326.0: ['ENSG00000185829', 'ENSG00000228696']}

In [22]:
to_be_removed = []
for gene in real_duplicates.keys():
    to_be_removed.append(real_duplicates[gene][1])

In [23]:
to_be_removed

['ENSG00000100033',
 'ENSG00000172062',
 'ENSG00000179304',
 'ENSG00000223572',
 'ENSG00000183336',
 'ENSG00000181625',
 'ENSG00000205076',
 'ENSG00000176797',
 'ENSG00000186572',
 'ENSG00000170667',
 'ENSG00000228696']

In [24]:
human_mouse_homolog_genes[~human_mouse_homolog_genes.human_ensembl_gene_id.isin(to_be_removed)]

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
13,ENSG00000277196,5625.0,PRODH,ENSMUSG00000003526,ortholog_one2many,Prodh
15,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000057561,ortholog_one2many,Eif1a
16,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000113201,ortholog_one2many,Eif1ad13
17,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079031,ortholog_one2many,Eif1ad2
18,ENSG00000198692,9086.0,EIF1AY,ENSMUSG00000079029,ortholog_one2many,Eif1ad7
...,...,...,...,...,...,...
21930,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062878,ortholog_one2many,Or14a257
21931,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055610,ortholog_one2many,Or14a260
21932,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000062042,ortholog_one2many,Or14a256
21933,ENSG00000241128,388761.0,OR14A2,ENSMUSG00000055571,ortholog_one2many,Or14a259


In [25]:
human_mouse_homolog_genes = human_mouse_homolog_genes[~human_mouse_homolog_genes.human_ensembl_gene_id.isin(to_be_removed)]

In [26]:
human_mouse_homolog_genes.duplicated().any()

False

### 1 ensembl ~ many entrez id

In [27]:
doublets = {}
for index, row in human_mouse_homolog_genes.iterrows():
    if row['human_ensembl_gene_id'] not in doublets.keys():
        doublets[row['human_ensembl_gene_id']] = []
    if row.human_entrezgene_id not in doublets[row['human_ensembl_gene_id']]:
        doublets[row['human_ensembl_gene_id']].append(row.human_entrezgene_id)

In [28]:
len(doublets)

826

In [29]:
real_duplicates = {}
for gene in doublets.keys():
    if len(doublets[gene])>1:
        real_duplicates[gene] = doublets[gene]

In [30]:
real_duplicates

{}

## Get many2one

In [31]:
many2one_ids = [gene for gene in human_mouse_homolog_genes.human_ensembl_gene_id if (human_mouse_homolog_genes.loc[human_mouse_homolog_genes.human_ensembl_gene_id == gene].shape[0] == 1)]

In [32]:
len(many2one_ids)

470

In [33]:
470+1346

1816

In [34]:
many2one_genes = human_mouse_homolog_genes.loc[human_mouse_homolog_genes.human_ensembl_gene_id.isin(many2one_ids)]

In [35]:
many2one_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
13,ENSG00000277196,5625.0,PRODH,ENSMUSG00000003526,ortholog_one2many,Prodh
42,ENSG00000169953,159119.0,HSFY2,ENSMUSG00000045336,ortholog_one2many,Hsfy2
45,ENSG00000172468,86614.0,HSFY1,ENSMUSG00000045336,ortholog_one2many,Hsfy2
62,ENSG00000205916,57135.0,DAZ4,ENSMUSG00000010592,ortholog_one2many,Dazl
67,ENSG00000067646,7544.0,ZFY,ENSMUSG00000079509,ortholog_one2many,Zfx
...,...,...,...,...,...,...
21675,ENSG00000244414,3078.0,CFHR1,ENSMUSG00000057037,ortholog_one2many,Cfhr1
21677,ENSG00000203747,2214.0,FCGR3A,ENSMUSG00000059089,ortholog_one2many,Fcgr4
21678,ENSG00000162747,2215.0,FCGR3B,ENSMUSG00000059089,ortholog_one2many,Fcgr4
21805,ENSG00000080910,3080.0,CFHR2,ENSMUSG00000057037,ortholog_one2many,Cfhr1


In [36]:
many2one_genes.human_ensembl_gene_id.duplicated().any()

False

# Create new anndata objects

## Human

In [38]:
m2o_human = human_data[:, human_data.var.entrez_id.isin(many2one_genes['human_entrezgene_id'])]

In [40]:
m2o_human

View of AnnData object with n_obs × n_vars = 76533 × 470
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type'
    var: 'gene', 'chromosome', 'entrez_id', 'gene_name', 'mouse_homolo

## Mouse

In [43]:
m2o_mouse = mouse_data[:, mouse_data.var_names.isin(many2one_genes['mouse_homolog_ensembl_gene'])]

In [44]:
m2o_mouse

View of AnnData object with n_obs × n_vars = 159738 × 212
    obs: 'Unnamed: 0', 'nUMI', 'nGene', 'QC', 'cluster', 'Allen.cluster_id', 'Allen.cluster_label', 'Allen.class_label', 'Allen.subclass_label', 'comb.QC', 'row', 'BICCN_cluster_id', 'BICCN_cluster_label', 'BICCN_class_label', 'BICCN_subclass_label', 'size', 'gene.counts', 'umi.counts', 'Broad.QC.doublet', 'Broad.QC.Mito', 'Broad.passQC', 'MALE', 'Comb.QC', 'cl', 'temp_class_label', 'BICCN_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'

## Format obs df

In [45]:
m2o_human.obs['homolog_class_label'] = m2o_human.obs['class_label']
m2o_human.obs['homolog_subclass_label'] = m2o_human.obs['subclass_label']

  m2o_human.obs['homolog_class_label'] = m2o_human.obs['class_label']


In [46]:
m2o_human.obs

Unnamed: 0,sample_name,exp_component_name,cluster_label,cluster_color,cluster_order,class_label,class_color,class_order,subclass_label,subclass_color,subclass_order,donor_sex_label,donor_sex_color,donor_sex_order,region_label,region_color,region_order,cortical_layer_label,cortical_layer_color,cortical_layer_order,cell_type_accession_label,cell_type_accession_color,cell_type_accession_order,cell_type_alias_label,cell_type_alias_color,cell_type_alias_order,cell_type_alt_alias_label,cell_type_alt_alias_color,cell_type_alt_alias_order,cell_type_designation_label,cell_type_designation_color,cell_type_designation_order,external_donor_name_label,external_donor_name_color,external_donor_name_order,specimen_type,full_genotype_label,outlier_call,outlier_type,homolog_class_label,homolog_subclass_label
0,AAACCCAAGGATTTCC-LKTX_190129_01_A01,AAACCCAAGGATTTCC-21L8TX_180927_001_A01,Inh L1-2 SST CCNJL,#fb8d00,50,GABAergic,#FF7373,1,Sst,#FF9900,5,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131050,#fb8d00,50,Inh L1-2 SST CCNJL,#fb8d00,50,,#fb8d00,50,Neuron 50,#fb8d00,50,H18.30.001,#FF7373,1,nucleus,,False,,GABAergic,Sst
1,AAACCCAAGTATGGCG-LKTX_190129_01_A01,AAACCCAAGTATGGCG-21L8TX_180927_001_A01,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116,Glutamatergic,#3DCC3D,2,L5/6 NP,#3E9E64,12,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131116,#2c815f,116,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116,,#2c815f,116,Neuron 116,#2c815f,116,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L5/6 NP
2,AAACCCACAAAGTGTA-LKTX_190129_01_A01,AAACCCACAAAGTGTA-21L8TX_180927_001_A01,Exc L3-5 RORB LINC01202,#547d7a,87,Glutamatergic,#3DCC3D,2,L5 IT,#50B2AD,8,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131087,#547d7a,87,Exc L3-5 RORB LINC01202,#547d7a,87,,#547d7a,87,Neuron 87,#547d7a,87,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L5 IT
3,AAACCCACACTACTTT-LKTX_190129_01_A01,AAACCCACACTACTTT-21L8TX_180927_001_A01,Exc L2 LINC00507 GLRA3,#cecd32,75,Glutamatergic,#3DCC3D,2,L2/3 IT,#C4EC04,7,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131075,#cecd32,75,Exc L2 LINC00507 GLRA3,#cecd32,75,,#cecd32,75,Neuron 75,#cecd32,75,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L2/3 IT
4,AAACCCACAGTGAGCA-LKTX_190129_01_A01,AAACCCACAGTGAGCA-21L8TX_180927_001_A01,Oligo L2-6 OPALIN FTH1P3,#003a28,120,Non-Neuronal,#171799,3,Oligo,#2E3E39,17,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131120,#003a28,120,Oligo L2-6 OPALIN FTH1P3,#003a28,120,,#003a28,120,Non-neuron 3,#003a28,120,H18.30.001,#FF7373,1,nucleus,,False,,Non-Neuronal,Oligo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76528,TTTGTTGAGATGGCGT-LKTX_190130_01_H01,TTTGTTGAGATGGCGT-35L8TX_181108_001_D01,Oligo L2-6 OPALIN FTH1P3,#003a28,120,Non-Neuronal,#171799,3,Oligo,#2E3E39,17,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131120,#003a28,120,Oligo L2-6 OPALIN FTH1P3,#003a28,120,,#003a28,120,Non-neuron 3,#003a28,120,H18.30.001,#FF7373,1,nucleus,,False,,Non-Neuronal,Oligo
76529,TTTGTTGCACAGCCAC-LKTX_190130_01_H01,TTTGTTGCACAGCCAC-35L8TX_181108_001_D01,Exc L3-5 RORB LNX2,#01d9d5,90,Glutamatergic,#3DCC3D,2,L5 IT,#50B2AD,8,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131090,#01d9d5,90,Exc L3-5 RORB LNX2,#01d9d5,90,,#01d9d5,90,Neuron 90,#01d9d5,90,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L5 IT
76530,TTTGTTGCAGAGACTG-LKTX_190130_01_H01,TTTGTTGCAGAGACTG-35L8TX_181108_001_D01,Exc L2-3 RORB PTPN3,#b7ce00,81,Glutamatergic,#3DCC3D,2,L2/3 IT,#C4EC04,7,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131081,#b7ce00,81,Exc L2-3 RORB PTPN3,#b7ce00,81,,#b7ce00,81,Neuron 81,#b7ce00,81,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L2/3 IT
76531,TTTGTTGCATAATGAG-LKTX_190130_01_H01,TTTGTTGCATAATGAG-35L8TX_181108_001_D01,Oligo L2-6 OPALIN FTH1P3,#003a28,120,Non-Neuronal,#171799,3,Oligo,#2E3E39,17,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131120,#003a28,120,Oligo L2-6 OPALIN FTH1P3,#003a28,120,,#003a28,120,Non-neuron 3,#003a28,120,H18.30.001,#FF7373,1,nucleus,,False,,Non-Neuronal,Oligo


In [47]:
m2o_mouse.obs['sample_name'] = m2o_mouse.obs.index

  m2o_mouse.obs['sample_name'] = m2o_mouse.obs.index


In [48]:
m2o_mouse.obs['homolog_class_label'] = m2o_mouse.obs['Allen.class_label']
m2o_mouse.obs['homolog_subclass_label'] = m2o_mouse.obs['Allen.subclass_label']

In [49]:
m2o_mouse.obs = m2o_mouse.obs.drop('Unnamed: 0', axis = 1)

In [50]:
m2o_mouse.obs

Unnamed: 0,nUMI,nGene,QC,cluster,Allen.cluster_id,Allen.cluster_label,Allen.class_label,Allen.subclass_label,comb.QC,row,BICCN_cluster_id,BICCN_cluster_label,BICCN_class_label,BICCN_subclass_label,size,gene.counts,umi.counts,Broad.QC.doublet,Broad.QC.Mito,Broad.passQC,MALE,Comb.QC,cl,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sex_ontology_term_id,is_primary_data,organism_ontology_term_id,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage,sample_name,homolog_class_label,homolog_subclass_label
pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC,4499,2094,PassQC,Astro_14,91.0,Astro Aqp4_1,Non-Neuronal,Astro,PassQC,4,91.0,Astro Aqp4_1,Non-Neuronal,Astro,16143.0,2601.086105,5846.112804,0.03,0.0,0.97,0.448615,,113.0,Non-NeuronalAstro,ILX:0770141,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000127,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,astrocyte,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC,Non-Neuronal,Astro
pBICCNsMMrMOpRMiF007d190314_AAAGGTAGTGGCTGAA,11900,4182,PassQC,Ex2_9,48.0,L2/3 IT_1,Glutamatergic,L2/3 IT,PassQC,5,48.0,L2/3 IT_1,Glutamatergic,L2/3 IT,23971.0,5300.903467,19543.896210,0.11,0.0,0.89,0.402945,,64.0,GlutamatergicL2/3 IT,ILX:0770156,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAAGGTAGTGGCTGAA,Glutamatergic,L2/3 IT
pBICCNsMMrMOpRMiF007d190314_AAAGTCCCAAGAGCTG,4166,2025,PassQC,Astro_0,91.0,Astro Aqp4_1,Non-Neuronal,Astro,PassQC,6,91.0,Astro Aqp4_1,Non-Neuronal,Astro,16143.0,2601.086105,5846.112804,0.03,0.0,0.97,0.448615,,113.0,Non-NeuronalAstro,ILX:0770141,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000127,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,astrocyte,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAAGTCCCAAGAGCTG,Non-Neuronal,Astro
pBICCNsMMrMOpRMiF007d190314_AAATGGAAGATCACCT,11834,4090,PassQC,Ex3_0,66.0,L6 CT Cpa6,Glutamatergic,L6 CT,PassQC,7,66.0,L6 CT Cpa6,Glutamatergic,L6 CT,16829.0,4862.752035,15796.942180,0.00,0.0,0.99,0.471210,,104.0,GlutamatergicL6 CT,ILX:0770162,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAATGGAAGATCACCT,Glutamatergic,L6 CT
pBICCNsMMrMOpRMiF007d190314_AACAAAGCACGCGCAT,1957,1362,PassQC,Micro_14,113.0,Macrophage_2,Non-Neuronal,Macrophage,PassQC,8,113.0,Macrophage_2,Non-Neuronal,Macrophage,7801.0,1583.783361,2507.487117,0.00,0.0,1.00,0.482374,,123.0,Non-NeuronalMacrophage,Macrophage,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000235,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,macrophage,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AACAAAGCACGCGCAT,Non-Neuronal,Macrophage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,20193,5695,PassQC,Ex1_5,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,PassQC,215818,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,3978.0,5218.066616,18064.753140,0.01,0.0,0.99,0.471594,,85.0,GlutamatergicL6 IT,ILX:0770158,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,Glutamatergic,L6 IT
pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,2858,1602,PassQC,Oligo_19,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,PassQC,215820,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,16566.0,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150.0,Non-NeuronalOligo,ILX:0770140,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000128,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,oligodendrocyte,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,Non-Neuronal,Oligo
pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,35854,7344,PassQC,Ex1_3,51.0,L5 IT S100b,Glutamatergic,L5 IT,PassQC,215821,51.0,L5 IT S100b,Glutamatergic,L5 IT,8684.0,5464.036043,20579.022920,0.01,0.0,0.99,0.385306,,70.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,Glutamatergic,L5 IT
pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,23493,6146,PassQC,Ex1_3,52.0,L5 IT Pld5,Glutamatergic,L5 IT,PassQC,215822,52.0,L5 IT Pld5,Glutamatergic,L5 IT,3621.0,5680.692074,22718.304610,0.01,0.0,0.99,0.446838,,83.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,Glutamatergic,L5 IT


# Create m2o object

## Create matrix and var df

In [51]:
many2one_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
13,ENSG00000277196,5625.0,PRODH,ENSMUSG00000003526,ortholog_one2many,Prodh
42,ENSG00000169953,159119.0,HSFY2,ENSMUSG00000045336,ortholog_one2many,Hsfy2
45,ENSG00000172468,86614.0,HSFY1,ENSMUSG00000045336,ortholog_one2many,Hsfy2
62,ENSG00000205916,57135.0,DAZ4,ENSMUSG00000010592,ortholog_one2many,Dazl
67,ENSG00000067646,7544.0,ZFY,ENSMUSG00000079509,ortholog_one2many,Zfx
...,...,...,...,...,...,...
21675,ENSG00000244414,3078.0,CFHR1,ENSMUSG00000057037,ortholog_one2many,Cfhr1
21677,ENSG00000203747,2214.0,FCGR3A,ENSMUSG00000059089,ortholog_one2many,Fcgr4
21678,ENSG00000162747,2215.0,FCGR3B,ENSMUSG00000059089,ortholog_one2many,Fcgr4
21805,ENSG00000080910,3080.0,CFHR2,ENSMUSG00000057037,ortholog_one2many,Cfhr1


In [52]:
np.where(m2o_mouse.var_names == m2o_mouse.var.loc[m2o_mouse.var.index == 'ENSMUSG00000045336'].index[0])[0][0]

209

In [53]:
m2o_human.var

Unnamed: 0,gene,chromosome,entrez_id,gene_name,mouse_homologenes
302,ACTR3B,7,57180,ARP3 actin-related protein 3 homolog B (yeast),Actr3b
309,ACTR3C,7,653857,ARP3 actin-related protein 3 homolog C (yeast),
466,ADH1A,4,124,"alcohol dehydrogenase 1A (class I), alpha poly...",
467,ADH1B,4,125,"alcohol dehydrogenase 1B (class I), beta polyp...",
468,ADH1C,4,126,"alcohol dehydrogenase 1C (class I), gamma poly...",Adh1
...,...,...,...,...,...
50138,ZNF823,19,55552,zinc finger protein 823,
50154,ZNF844,19,284391,zinc finger protein 844,
50187,ZNF92,7,168374,zinc finger protein 92,
50196,ZNF98,19,148198,zinc finger protein 98,


In [54]:
m2o_human.var.loc[m2o_human.var.entrez_id == 159119.0]

Unnamed: 0,gene,chromosome,entrez_id,gene_name,mouse_homologenes
10654,HSFY2,Y,159119,"heat shock transcription factor, Y linked 2",Hsfy2


In [56]:
m2o_human.X[:, [0,2]].T

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [57]:
np.mean(m2o_human.X[:, [0,2]].T,axis=0)

array([0. , 0. , 0.5, ..., 0. , 0. , 0. ], dtype=float32)

In [58]:
np.asarray(m2o_mouse.X[:,np.where(m2o_mouse.var_names == m2o_mouse.var.loc[m2o_mouse.var.index == 'ENSMUSG00000003526'].index[0])[0][0]].todense().T)[0]

array([0., 2., 3., ..., 0., 1., 1.], dtype=float32)

In [59]:
np.concatenate([np.asarray(m2o_mouse.X[:,np.where(m2o_mouse.var_names == m2o_mouse.var.loc[m2o_mouse.var.index == 'ENSMUSG00000003526'].index[0])[0][0]].todense().T)[0],
                np.mean(m2o_human.X[:, [0,2]].T,axis=0)])

array([0., 2., 3., ..., 0., 0., 0.], dtype=float32)

In [136]:
def construct_m2o_matrixanddf():
    # def the variables to return 
    matrix = []
    already_done = {}
    var_df = []
    human_gene_ids = []
    
    # start the loop throught the m2o dataframe
    for index, row in many2one_genes.iterrows():
        #get the human id
        mouse_id = row.mouse_homolog_ensembl_gene
        
        # check if it has already been done, if not we get into the if
        if mouse_id not in already_done.keys():
            # starts new lines for dataframes
            var_line = {}
            
            # get all the mouse info
            var_line['mouse_ensembl_id'] = mouse_id
            var_line['mouse_gene_name'] = row.mouse_homolog_gene_name
            var_line['orthology_type'] = row.orthology_type
            
            # get all the human homolog genes for the mouse gene
            current_df = many2one_genes.loc[many2one_genes.mouse_homolog_ensembl_gene == mouse_id]
            
            # def human arrays for stocking info
            human_counts = []
            human_names = []
            human_ensembl_ids = []
            human_entrez_ids = []
            index = []
            
            # pass throught all the mouse genes
            for index2, row2 in current_df.iterrows():
                # create arrays for the df
                human_names.append(row2.human_external_gene_name)
                human_ensembl_ids.append(row2.human_ensembl_gene_id)
                human_entrez_ids.append(row2.human_entrezgene_id)
                
                # append the array to check afterwards
                human_gene_ids.append(row2.human_entrezgene_id)
                
                # get the index of the gene in the matrix
                index.append(np.where(m2o_human.var_names == m2o_human.var.loc[m2o_human.var.entrez_id == row2.human_entrezgene_id].index[0])[0][0])
            
            # get the cells counts for each gene
            homolog_matrix = m2o_human.X[:, index].T
            # get the mean for each cell
            human_counts = np.mean(homolog_matrix, axis=0)        
            
            # add the newline to the count matrix
            if matrix == []:
                matrix = np.array([np.concatenate([human_counts,
                                                   np.asarray(m2o_mouse.X[:,np.where(m2o_mouse.var_names == m2o_mouse.var.loc[m2o_mouse.var.index == mouse_id].index[0])[0][0]].todense().T)[0]])])
            else : 
                matrix = np.append(matrix,
                                   [np.concatenate([human_counts, 
                                                    np.asarray(m2o_mouse.X[:,np.where(m2o_mouse.var_names == m2o_mouse.var.loc[m2o_mouse.var.index == mouse_id].index[0])[0][0]].todense().T)[0]])],
                                   axis = 0)
            
            # To find the genes afterward if necessary
            already_done[mouse_id] = [human_ensembl_ids, human_entrez_ids]
            
            # add the information for the mouse's genes to the df
            var_line['human_homologs_ensembl_ids'] = human_ensembl_ids
            var_line['human_homologs_entrez_ids'] = human_entrez_ids
            var_line['human_homologs_names'] = human_names
            human_ensembl_ids.sort()
            var_line['human_ensembl_id'] = human_ensembl_ids[0]
            var_line['human_entrez_id'] = many2one_genes.loc[many2one_genes.human_ensembl_gene_id == human_ensembl_ids[0]].human_entrezgene_id.iloc[0]
            var_line['human_gene_name'] = many2one_genes.loc[many2one_genes.human_ensembl_gene_id == human_ensembl_ids[0]].human_external_gene_name.iloc[0]
            var_line['human_long_gene_name'] = m2o_human.var.loc[m2o_human.var.entrez_id == var_line['human_entrez_id']].gene_name.iloc[0]
            var_line['human_chromosome'] = m2o_human.var.loc[m2o_human.var.entrez_id == var_line['human_entrez_id']].chromosome.iloc[0]
            
            
            # add the newline to the rows of the df
            var_df.append(var_line)
                
    return matrix.T, var_df, already_done, human_gene_ids

In [137]:
m2o_matrix, m2o_var_df, m_done, human_genes_ids_done = construct_m2o_matrixanddf()

  if matrix == []:


In [138]:
m2o_matrix.shape

(236271, 212)

In [139]:
m2o_matrix

array([[0. , 0. , 1.2, ..., 0. , 0. , 0. ],
       [0. , 0. , 1.2, ..., 0.5, 0. , 2. ],
       [0. , 0. , 0.4, ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [1. , 0. , 1. , ..., 0. , 0. , 0. ],
       [1. , 0. , 0. , ..., 0. , 0. , 0. ]], dtype=float32)

In [140]:
m2o_matrix[:76533,:]

array([[0. , 0. , 1.2, ..., 0. , 0. , 0. ],
       [0. , 0. , 1.2, ..., 0.5, 0. , 2. ],
       [0. , 0. , 0.4, ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0.2, ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 2.2, ..., 0. , 0. , 1. ]], dtype=float32)

In [141]:
m2o_matrix[76533:,:]

array([[0., 0., 0., ..., 0., 0., 0.],
       [2., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [143]:
many2one_genes.mouse_homolog_ensembl_gene.unique().size

212

In [144]:
many2one_genes.human_ensembl_gene_id.duplicated().any()

False

In [145]:
len(m_done.keys())

212

In [146]:
len(human_genes_ids_done) # same length as the df

470

In [147]:
m2o_var = pd.DataFrame(m2o_var_df)

In [148]:
m2o_var

Unnamed: 0,mouse_ensembl_id,mouse_gene_name,orthology_type,human_homologs_ensembl_ids,human_homologs_entrez_ids,human_homologs_names,human_ensembl_id,human_entrez_id,human_gene_name,human_long_gene_name,human_chromosome
0,ENSMUSG00000003526,Prodh,ortholog_one2many,[ENSG00000277196],[5625.0],[PRODH],ENSG00000277196,5625.0,PRODH,proline dehydrogenase (oxidase) 1,22
1,ENSMUSG00000045336,Hsfy2,ortholog_one2many,"[ENSG00000169953, ENSG00000172468]","[159119.0, 86614.0]","[HSFY2, HSFY1]",ENSG00000169953,159119.0,HSFY2,"heat shock transcription factor, Y linked 2",Y
2,ENSMUSG00000010592,Dazl,ortholog_one2many,"[ENSG00000092345, ENSG00000187191, ENSG0000018...","[57135.0, 57054.0, 1617.0, 57055.0, 1618.0]","[DAZ4, DAZ3, DAZ1, DAZ2, DAZL]",ENSG00000092345,1618.0,DAZL,deleted in azoospermia-like,3
3,ENSMUSG00000079509,Zfx,ortholog_one2many,"[ENSG00000005889, ENSG00000067646]","[7544.0, 7543.0]","[ZFY, ZFX]",ENSG00000005889,7543.0,ZFX,"zinc finger protein, X-linked",X
4,ENSMUSG00000025246,Tbl1x,ortholog_one2many,"[ENSG00000092377, ENSG00000101849]","[90665.0, 6907.0]","[TBL1Y, TBL1X]",ENSG00000092377,90665.0,TBL1Y,"transducin (beta)-like 1, Y-linked",Y
...,...,...,...,...,...,...,...,...,...,...,...
207,ENSMUSG00000056895,H2bc27,ortholog_one2many,[ENSG00000196890],[128312.0],[H2BC26],ENSG00000196890,128312.0,H2BC26,"histone cluster 3, H2bb",1
208,ENSMUSG00000062421,Arf2,ortholog_one2many,[ENSG00000185829],[51326.0],[ARL17A],ENSG00000185829,51326.0,ARL17A,ADP-ribosylation factor-like 17A,17
209,ENSMUSG00000057037,Cfhr1,ortholog_one2many,"[ENSG00000080910, ENSG00000244414]","[3078.0, 3080.0]","[CFHR1, CFHR2]",ENSG00000080910,3080.0,CFHR2,complement factor H-related 2,1
210,ENSMUSG00000059089,Fcgr4,ortholog_one2many,"[ENSG00000162747, ENSG00000203747]","[2214.0, 2215.0]","[FCGR3A, FCGR3B]",ENSG00000162747,2215.0,FCGR3B,"Fc fragment of IgG, low affinity IIIb, recepto...",1


In [150]:
m2oaso2o = 0
for tab in m2o_var.human_homologs_ensembl_ids:
    if len(tab) == 1:
        m2oaso2o = m2oaso2o+1
m2oaso2o

46

## Create anndata object

In [151]:
m2o_homolog = anndata.AnnData(X = m2o_matrix, obs = pd.concat([m2o_human.obs,m2o_mouse.obs], axis = 0),var = m2o_var)



In [152]:
m2o_homolog

AnnData object with n_obs × n_vars = 236271 × 212
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type', 'homolog_class_label', 'homolog_subclass_label', 'nUMI', 'nGene', 'QC', 'clu

In [153]:
m2o_homolog.var

Unnamed: 0,mouse_ensembl_id,mouse_gene_name,orthology_type,human_homologs_ensembl_ids,human_homologs_entrez_ids,human_homologs_names,human_ensembl_id,human_entrez_id,human_gene_name,human_long_gene_name,human_chromosome
0,ENSMUSG00000003526,Prodh,ortholog_one2many,[ENSG00000277196],[5625.0],[PRODH],ENSG00000277196,5625.0,PRODH,proline dehydrogenase (oxidase) 1,22
1,ENSMUSG00000045336,Hsfy2,ortholog_one2many,"[ENSG00000169953, ENSG00000172468]","[159119.0, 86614.0]","[HSFY2, HSFY1]",ENSG00000169953,159119.0,HSFY2,"heat shock transcription factor, Y linked 2",Y
2,ENSMUSG00000010592,Dazl,ortholog_one2many,"[ENSG00000092345, ENSG00000187191, ENSG0000018...","[57135.0, 57054.0, 1617.0, 57055.0, 1618.0]","[DAZ4, DAZ3, DAZ1, DAZ2, DAZL]",ENSG00000092345,1618.0,DAZL,deleted in azoospermia-like,3
3,ENSMUSG00000079509,Zfx,ortholog_one2many,"[ENSG00000005889, ENSG00000067646]","[7544.0, 7543.0]","[ZFY, ZFX]",ENSG00000005889,7543.0,ZFX,"zinc finger protein, X-linked",X
4,ENSMUSG00000025246,Tbl1x,ortholog_one2many,"[ENSG00000092377, ENSG00000101849]","[90665.0, 6907.0]","[TBL1Y, TBL1X]",ENSG00000092377,90665.0,TBL1Y,"transducin (beta)-like 1, Y-linked",Y
...,...,...,...,...,...,...,...,...,...,...,...
207,ENSMUSG00000056895,H2bc27,ortholog_one2many,[ENSG00000196890],[128312.0],[H2BC26],ENSG00000196890,128312.0,H2BC26,"histone cluster 3, H2bb",1
208,ENSMUSG00000062421,Arf2,ortholog_one2many,[ENSG00000185829],[51326.0],[ARL17A],ENSG00000185829,51326.0,ARL17A,ADP-ribosylation factor-like 17A,17
209,ENSMUSG00000057037,Cfhr1,ortholog_one2many,"[ENSG00000080910, ENSG00000244414]","[3078.0, 3080.0]","[CFHR1, CFHR2]",ENSG00000080910,3080.0,CFHR2,complement factor H-related 2,1
210,ENSMUSG00000059089,Fcgr4,ortholog_one2many,"[ENSG00000162747, ENSG00000203747]","[2214.0, 2215.0]","[FCGR3A, FCGR3B]",ENSG00000162747,2215.0,FCGR3B,"Fc fragment of IgG, low affinity IIIb, recepto...",1


In [154]:
m2o_homolog.var.human_homologs_ensembl_ids

0                                      [ENSG00000277196]
1                     [ENSG00000169953, ENSG00000172468]
2      [ENSG00000092345, ENSG00000187191, ENSG0000018...
3                     [ENSG00000005889, ENSG00000067646]
4                     [ENSG00000092377, ENSG00000101849]
                             ...                        
207                                    [ENSG00000196890]
208                                    [ENSG00000185829]
209                   [ENSG00000080910, ENSG00000244414]
210                   [ENSG00000162747, ENSG00000203747]
211                                    [ENSG00000158473]
Name: human_homologs_ensembl_ids, Length: 212, dtype: object

In [155]:
m2o_homolog.obs

Unnamed: 0,sample_name,exp_component_name,cluster_label,cluster_color,cluster_order,class_label,class_color,class_order,subclass_label,subclass_color,subclass_order,donor_sex_label,donor_sex_color,donor_sex_order,region_label,region_color,region_order,cortical_layer_label,cortical_layer_color,cortical_layer_order,cell_type_accession_label,cell_type_accession_color,cell_type_accession_order,cell_type_alias_label,cell_type_alias_color,cell_type_alias_order,cell_type_alt_alias_label,cell_type_alt_alias_color,cell_type_alt_alias_order,cell_type_designation_label,cell_type_designation_color,cell_type_designation_order,external_donor_name_label,external_donor_name_color,external_donor_name_order,specimen_type,full_genotype_label,outlier_call,outlier_type,homolog_class_label,homolog_subclass_label,nUMI,nGene,QC,cluster,Allen.cluster_id,Allen.cluster_label,Allen.class_label,Allen.subclass_label,comb.QC,row,BICCN_cluster_id,BICCN_cluster_label,BICCN_class_label,BICCN_subclass_label,size,gene.counts,umi.counts,Broad.QC.doublet,Broad.QC.Mito,Broad.passQC,MALE,Comb.QC,cl,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sex_ontology_term_id,is_primary_data,organism_ontology_term_id,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage
0,AAACCCAAGGATTTCC-LKTX_190129_01_A01,AAACCCAAGGATTTCC-21L8TX_180927_001_A01,Inh L1-2 SST CCNJL,#fb8d00,50.0,GABAergic,#FF7373,1.0,Sst,#FF9900,5.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131050,#fb8d00,50.0,Inh L1-2 SST CCNJL,#fb8d00,50.0,,#fb8d00,50.0,Neuron 50,#fb8d00,50.0,H18.30.001,#FF7373,1.0,nucleus,,False,,GABAergic,Sst,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AAACCCAAGTATGGCG-LKTX_190129_01_A01,AAACCCAAGTATGGCG-21L8TX_180927_001_A01,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116.0,Glutamatergic,#3DCC3D,2.0,L5/6 NP,#3E9E64,12.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131116,#2c815f,116.0,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116.0,,#2c815f,116.0,Neuron 116,#2c815f,116.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Glutamatergic,L5/6 NP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,AAACCCACAAAGTGTA-LKTX_190129_01_A01,AAACCCACAAAGTGTA-21L8TX_180927_001_A01,Exc L3-5 RORB LINC01202,#547d7a,87.0,Glutamatergic,#3DCC3D,2.0,L5 IT,#50B2AD,8.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131087,#547d7a,87.0,Exc L3-5 RORB LINC01202,#547d7a,87.0,,#547d7a,87.0,Neuron 87,#547d7a,87.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Glutamatergic,L5 IT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,AAACCCACACTACTTT-LKTX_190129_01_A01,AAACCCACACTACTTT-21L8TX_180927_001_A01,Exc L2 LINC00507 GLRA3,#cecd32,75.0,Glutamatergic,#3DCC3D,2.0,L2/3 IT,#C4EC04,7.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131075,#cecd32,75.0,Exc L2 LINC00507 GLRA3,#cecd32,75.0,,#cecd32,75.0,Neuron 75,#cecd32,75.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Glutamatergic,L2/3 IT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,AAACCCACAGTGAGCA-LKTX_190129_01_A01,AAACCCACAGTGAGCA-21L8TX_180927_001_A01,Oligo L2-6 OPALIN FTH1P3,#003a28,120.0,Non-Neuronal,#171799,3.0,Oligo,#2E3E39,17.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131120,#003a28,120.0,Oligo L2-6 OPALIN FTH1P3,#003a28,120.0,,#003a28,120.0,Non-neuron 3,#003a28,120.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Non-Neuronal,Oligo,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L6 IT,20193.0,5695.0,PassQC,Ex1_5,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,PassQC,215818.0,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,3978.0,5218.066616,18064.753140,0.01,0.0,0.99,0.471594,,85.0,GlutamatergicL6 IT,ILX:0770158,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Non-Neuronal,Oligo,2858.0,1602.0,PassQC,Oligo_19,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,PassQC,215820.0,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,16566.0,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150.0,Non-NeuronalOligo,ILX:0770140,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000128,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,oligodendrocyte,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L5 IT,35854.0,7344.0,PassQC,Ex1_3,51.0,L5 IT S100b,Glutamatergic,L5 IT,PassQC,215821.0,51.0,L5 IT S100b,Glutamatergic,L5 IT,8684.0,5464.036043,20579.022920,0.01,0.0,0.99,0.385306,,70.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L5 IT,23493.0,6146.0,PassQC,Ex1_3,52.0,L5 IT Pld5,Glutamatergic,L5 IT,PassQC,215822.0,52.0,L5 IT Pld5,Glutamatergic,L5 IT,3621.0,5680.692074,22718.304610,0.01,0.0,0.99,0.446838,,83.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage


## Check the mouse counts

In [128]:
m2o_mouse.var

Unnamed: 0_level_0,feature_is_filtered,feature_name,feature_reference,feature_biotype
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUSG00000020029,False,Nudt4,NCBITaxon:10090,gene
ENSMUSG00000054850,False,Smim10l2a,NCBITaxon:10090,gene
ENSMUSG00000075470,False,Alg10b,NCBITaxon:10090,gene
ENSMUSG00000019920,False,Lims1,NCBITaxon:10090,gene
ENSMUSG00000055692,False,Tmem191c,NCBITaxon:10090,gene
...,...,...,...,...
ENSMUSG00000073894,False,Rbmxl2,NCBITaxon:10090,gene
ENSMUSG00000053044,False,Cd8b1,NCBITaxon:10090,gene
ENSMUSG00000045336,False,Hsfy2,NCBITaxon:10090,gene
ENSMUSG00000056367,False,Actr3b,NCBITaxon:10090,gene


In [129]:
m2o_mouse.obs_names

Index(['pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC',
       'pBICCNsMMrMOpRMiF007d190314_AAAGGTAGTGGCTGAA',
       'pBICCNsMMrMOpRMiF007d190314_AAAGTCCCAAGAGCTG',
       'pBICCNsMMrMOpRMiF007d190314_AAATGGAAGATCACCT',
       'pBICCNsMMrMOpRMiF007d190314_AACAAAGCACGCGCAT',
       'pBICCNsMMrMOpRMiF007d190314_AACAAGAAGATCGCTT',
       'pBICCNsMMrMOpRMiF007d190314_AACAAGAAGCAGGTCA',
       'pBICCNsMMrMOpRMiF007d190314_AACACACAGACTCAAA',
       'pBICCNsMMrMOpRMiF007d190314_AACACACCAAATACAG',
       'pBICCNsMMrMOpRMiF007d190314_AACACACTCGCTTGCT',
       ...
       'pBICCNsMMrMOpRMiM006d190320_TTTCCTCTCGAGCCTG',
       'pBICCNsMMrMOpRMiM006d190320_TTTCCTCTCTAACACG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGATCAGTTACGTC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGAGAGTTGTAAG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTAGGATTTCC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG',

In [82]:
m2o_mouse.var_names

Index(['ENSMUSG00000020029', 'ENSMUSG00000054850', 'ENSMUSG00000075470',
       'ENSMUSG00000019920', 'ENSMUSG00000055692', 'ENSMUSG00000060639',
       'ENSMUSG00000083012', 'ENSMUSG00000032966', 'ENSMUSG00000073421',
       'ENSMUSG00000026124',
       ...
       'ENSMUSG00000045306', 'ENSMUSG00000024104', 'ENSMUSG00000059463',
       'ENSMUSG00000028427', 'ENSMUSG00000024101', 'ENSMUSG00000073894',
       'ENSMUSG00000053044', 'ENSMUSG00000045336', 'ENSMUSG00000056367',
       'ENSMUSG00000001666'],
      dtype='object', name='feature_id', length=212)

In [83]:
np.where(m2o_mouse.obs_names == 'pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC')[0][0]

0

In [84]:
m2o_homolog.obs_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       'pBICCNsMMrMOpRMiM006d190320_TTTCCTCTCGAGCCTG',
       'pBICCNsMMrMOpRMiM006d190320_TTTCCTCTCTAACACG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGATCAGTTACGTC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGAGAGTTGTAAG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTAGGATTTCC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCATGCTAG'],
      dtype='object', length=236271)

In [85]:
np.where(m2o_homolog.obs_names == 'pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC')[0][0]

76533

In [86]:
m2o_mouse.X[0,1]

0.0

In [87]:
np.where(m2o_mouse.var_names == 'ENSMUSG00000020029')

(array([0]),)

In [88]:
m2o_homolog.var = m2o_homolog.var.set_index('mouse_homolog_ensembl_gene')
m2o_homolog.var['mouse_homolog_ensembl_gene'] = m2o_homolog.var.index

In [89]:
m2o_homolog.var_names

Index(['ENSMUSG00000003526', 'ENSMUSG00000045336', 'ENSMUSG00000010592',
       'ENSMUSG00000079509', 'ENSMUSG00000025246', 'ENSMUSG00000034755',
       'ENSMUSG00000006345', 'ENSMUSG00000075573', 'ENSMUSG00000021000',
       'ENSMUSG00000021222',
       ...
       'ENSMUSG00000019920', 'ENSMUSG00000115958', 'ENSMUSG00000042784',
       'ENSMUSG00000000982', 'ENSMUSG00000018930', 'ENSMUSG00000056895',
       'ENSMUSG00000062421', 'ENSMUSG00000057037', 'ENSMUSG00000059089',
       'ENSMUSG00000028076'],
      dtype='object', name='mouse_homolog_ensembl_gene', length=212)

In [92]:
len(m2o_mouse.obs_names)

159738

In [99]:
def check_cell_counts():
    correct = 0
    incorrect = {}
    cpt = 0
    for cell in m2o_mouse.obs_names:
        mouse_cell_index = np.where(m2o_mouse.obs_names == str(cell))[0][0]
        homolog_mouse_cell_index = np.where(m2o_homolog.obs_names == str(cell))[0][0]
        for gene in m2o_mouse.var_names:
            mouse_gene_index = np.where(m2o_mouse.var_names == str(gene))[0][0]
            homolog_mouse_gene_index = np.where(m2o_homolog.var_names == str(gene))[0][0]
            if m2o_mouse.X[int(mouse_cell_index),int(mouse_gene_index)] == m2o_homolog.X[int(homolog_mouse_cell_index),int(homolog_mouse_gene_index)]:
                correct = correct+1
            else :
                incorrect[gene] = [cell,mouse_gene_index,homolog_mouse_gene_index]
    return correct, incorrect

In [None]:
correctm, incorrectm = check_cell_counts()

In [97]:
print(correctm, len(incorrectm.keys()))

33864456 0


In [98]:
correctm == len(m2o_mouse.obs_names) * len(m2o_mouse.var_names)

True

## Write the results 

In [156]:
m2o_homolog_write = m2o_homolog

In [157]:
m2o_homolog_write.obs = m2o_homolog_write.obs.drop(['outlier_call','is_primary_data'], axis = 1)

In [158]:
m2o_homolog_write.var

Unnamed: 0,mouse_ensembl_id,mouse_gene_name,orthology_type,human_homologs_ensembl_ids,human_homologs_entrez_ids,human_homologs_names,human_ensembl_id,human_entrez_id,human_gene_name,human_long_gene_name,human_chromosome
0,ENSMUSG00000003526,Prodh,ortholog_one2many,[ENSG00000277196],[5625.0],[PRODH],ENSG00000277196,5625.0,PRODH,proline dehydrogenase (oxidase) 1,22
1,ENSMUSG00000045336,Hsfy2,ortholog_one2many,"[ENSG00000169953, ENSG00000172468]","[159119.0, 86614.0]","[HSFY2, HSFY1]",ENSG00000169953,159119.0,HSFY2,"heat shock transcription factor, Y linked 2",Y
2,ENSMUSG00000010592,Dazl,ortholog_one2many,"[ENSG00000092345, ENSG00000187191, ENSG0000018...","[57135.0, 57054.0, 1617.0, 57055.0, 1618.0]","[DAZ4, DAZ3, DAZ1, DAZ2, DAZL]",ENSG00000092345,1618.0,DAZL,deleted in azoospermia-like,3
3,ENSMUSG00000079509,Zfx,ortholog_one2many,"[ENSG00000005889, ENSG00000067646]","[7544.0, 7543.0]","[ZFY, ZFX]",ENSG00000005889,7543.0,ZFX,"zinc finger protein, X-linked",X
4,ENSMUSG00000025246,Tbl1x,ortholog_one2many,"[ENSG00000092377, ENSG00000101849]","[90665.0, 6907.0]","[TBL1Y, TBL1X]",ENSG00000092377,90665.0,TBL1Y,"transducin (beta)-like 1, Y-linked",Y
...,...,...,...,...,...,...,...,...,...,...,...
207,ENSMUSG00000056895,H2bc27,ortholog_one2many,[ENSG00000196890],[128312.0],[H2BC26],ENSG00000196890,128312.0,H2BC26,"histone cluster 3, H2bb",1
208,ENSMUSG00000062421,Arf2,ortholog_one2many,[ENSG00000185829],[51326.0],[ARL17A],ENSG00000185829,51326.0,ARL17A,ADP-ribosylation factor-like 17A,17
209,ENSMUSG00000057037,Cfhr1,ortholog_one2many,"[ENSG00000080910, ENSG00000244414]","[3078.0, 3080.0]","[CFHR1, CFHR2]",ENSG00000080910,3080.0,CFHR2,complement factor H-related 2,1
210,ENSMUSG00000059089,Fcgr4,ortholog_one2many,"[ENSG00000162747, ENSG00000203747]","[2214.0, 2215.0]","[FCGR3A, FCGR3B]",ENSG00000162747,2215.0,FCGR3B,"Fc fragment of IgG, low affinity IIIb, recepto...",1


In [159]:
m2o_homolog_write.var[['human_homologs_ensembl_ids', 'human_homologs_entrez_ids', 'human_homologs_names']] = m2o_homolog_write.var[['human_homologs_ensembl_ids', 'human_homologs_entrez_ids', 'human_homologs_names']].astype(str)

In [160]:
m2o_homolog_write.write_h5ad(os.path.join(path_data,'m2o_homolog_human_mouse_'+str(date.today())+'.h5ad'),compression='gzip')