# Import packages and paths

In [1]:
import scanpy as sc
import anndata
import numpy as np
import gc
import pandas as pd 
import os
from datetime import date
pd.set_option('display.max_columns', None)
from biomart import BiomartServer
gc.isenabled()

True

In [145]:
os.path.realpath('many2many_human_mouse.ipynb')

'/nfs/research/irene/anaelle/Scripts/many2many_human_mouse.ipynb'

In [3]:
path_project = '/nfs/research/irene/anaelle'
print(path_project)
path_scripts = os.path.join(path_project, 'Scripts','human_mouse_scripts')
print(path_scripts)
path_data = os.path.join(path_project, 'data')
print(path_data)

/nfs/research/irene/anaelle
/nfs/research/irene/anaelle/Scripts/human_mouse
/nfs/research/irene/anaelle/data


# Load Biomart output

In [4]:
human_mouse_homolog_genes = pd.read_csv(os.path.join(path_data, 'homolog_human_mouse_ensembl_gene.csv'))

In [5]:
human_mouse_homolog_genes = human_mouse_homolog_genes.drop('Unnamed: 0', axis=1)

In [6]:
human_mouse_homolog_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
0,ENSG00000198888,4535.0,MT-ND1,ENSMUSG00000064341,ortholog_one2one,mt-Nd1
1,ENSG00000198763,4536.0,MT-ND2,ENSMUSG00000064345,ortholog_one2one,mt-Nd2
2,ENSG00000198804,4512.0,MT-CO1,ENSMUSG00000064351,ortholog_one2one,mt-Co1
3,ENSG00000198712,4513.0,MT-CO2,ENSMUSG00000064354,ortholog_one2one,mt-Co2
4,ENSG00000228253,4509.0,MT-ATP8,ENSMUSG00000064356,ortholog_one2one,mt-Atp8
...,...,...,...,...,...,...
21936,ENSG00000187017,83715.0,ESPN,ENSMUSG00000028943,ortholog_one2one,Espn
21937,ENSG00000198216,777.0,CACNA1E,ENSMUSG00000004110,ortholog_one2one,Cacna1e
21938,ENSG00000179930,127665.0,ZNF648,ENSMUSG00000066797,ortholog_one2one,Zfp648
21939,ENSG00000162437,55225.0,RAVER2,ENSMUSG00000035275,ortholog_one2one,Raver2


# Import human and mouse data

In [7]:
mouse_data = sc.read_h5ad(os.path.join(path_data,'mouse/mouse_raw_counts_from_cellxgene.h5ad'))

In [8]:
mouse_data

AnnData object with n_obs × n_vars = 159738 × 30639
    obs: 'Unnamed: 0', 'nUMI', 'nGene', 'QC', 'cluster', 'Allen.cluster_id', 'Allen.cluster_label', 'Allen.class_label', 'Allen.subclass_label', 'comb.QC', 'row', 'BICCN_cluster_id', 'BICCN_cluster_label', 'BICCN_class_label', 'BICCN_subclass_label', 'size', 'gene.counts', 'umi.counts', 'Broad.QC.doublet', 'Broad.QC.Mito', 'Broad.passQC', 'MALE', 'Comb.QC', 'cl', 'temp_class_label', 'BICCN_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'

In [9]:
human_data = sc.read_h5ad(os.path.join(path_data, 'human/hdata2023-06-05.h5ad'))



In [10]:
human_data

AnnData object with n_obs × n_vars = 76533 × 50281
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type'
    var: 'gene', 'chromosome', 'entrez_id', 'gene_name', 'mouse_homologenes'

# Get only many2many label

In [13]:
human_mouse_homolog_genes.orthology_type.unique()

array(['ortholog_one2one', 'ortholog_one2many', 'ortholog_many2many'],
      dtype=object)

In [16]:
many2many_genes = human_mouse_homolog_genes[human_mouse_homolog_genes.orthology_type == 'ortholog_many2many']

In [48]:
many2many_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
32,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000093987,ortholog_many2many,Gm21704
33,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000095852,ortholog_many2many,Gm10256
34,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000101667,ortholog_many2many,Gm29289
35,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000096520,ortholog_many2many,Gm3376
36,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000102053,ortholog_many2many,Gm4064
...,...,...,...,...,...,...
21794,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000095954,ortholog_many2many,Gm3183
21795,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000070617,ortholog_many2many,Pramel23
21796,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000095409,ortholog_many2many,Gm13043
21797,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000072821,ortholog_many2many,Gm6351


## Check for duplicates

### 1 entrez id ~ many ensembl

In [18]:
doublets = {}
for index, row in many2many_genes.iterrows():
    if row['human_entrezgene_id'] not in doublets.keys():
        doublets[row['human_entrezgene_id']] = []
    if row.human_ensembl_gene_id not in doublets[row['human_entrezgene_id']]:
        doublets[row['human_entrezgene_id']].append(row.human_ensembl_gene_id)

In [19]:
len(doublets)

447

In [20]:
real_duplicates = {}
for gene in doublets.keys():
    if len(doublets[gene])>1:
        real_duplicates[gene] = doublets[gene]

In [21]:
real_duplicates

{1673.0: ['ENSG00000177257', 'ENSG00000171711'],
 245908.0: ['ENSG00000186599', 'ENSG00000186562'],
 643862.0: ['ENSG00000262461',
  'ENSG00000275976',
  'ENSG00000186645',
  'ENSG00000274570',
  'ENSG00000273520'],
 641776.0: ['ENSG00000286038', 'ENSG00000286014', 'ENSG00000286137']}

In [25]:
to_be_removed = []
for gene in real_duplicates.keys():
    for dublet in real_duplicates[gene][1:]:
        to_be_removed.append(dublet)
print(len(to_be_removed))
to_be_removed

8


['ENSG00000171711',
 'ENSG00000186562',
 'ENSG00000275976',
 'ENSG00000186645',
 'ENSG00000274570',
 'ENSG00000273520',
 'ENSG00000286014',
 'ENSG00000286137']

In [26]:
many2many_genes[~many2many_genes.human_ensembl_gene_id.isin(to_be_removed)]

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
32,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000093987,ortholog_many2many,Gm21704
33,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000095852,ortholog_many2many,Gm10256
34,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000101667,ortholog_many2many,Gm29289
35,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000096520,ortholog_many2many,Gm3376
36,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000102053,ortholog_many2many,Gm4064
...,...,...,...,...,...,...
21794,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000095954,ortholog_many2many,Gm3183
21795,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000070617,ortholog_many2many,Pramel23
21796,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000095409,ortholog_many2many,Gm13043
21797,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000072821,ortholog_many2many,Gm6351


In [27]:
many2many_genes = many2many_genes[~many2many_genes.human_ensembl_gene_id.isin(to_be_removed)]

In [28]:
many2many_genes.duplicated().any()

False

### 1 ensembl ~ many entrez id

In [29]:
doublets = {}
for index, row in many2many_genes.iterrows():
    if row['human_ensembl_gene_id'] not in doublets.keys():
        doublets[row['human_ensembl_gene_id']] = []
    if row.human_entrezgene_id not in doublets[row['human_ensembl_gene_id']]:
        doublets[row['human_ensembl_gene_id']].append(row.human_entrezgene_id)

In [30]:
len(doublets)

447

In [31]:
real_duplicates = {}
for gene in doublets.keys():
    if len(doublets[gene])>1:
        real_duplicates[gene] = doublets[gene]

In [32]:
real_duplicates

{}

# Create new anndata objects

## Human

In [33]:
m2m_human = human_data[:, human_data.var.entrez_id.isin(many2many_genes['human_entrezgene_id'])]

In [34]:
m2m_human

View of AnnData object with n_obs × n_vars = 76533 × 447
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type'
    var: 'gene', 'chromosome', 'entrez_id', 'gene_name', 'mouse_homolo

In [36]:
len(many2many_genes.human_entrezgene_id.unique())

447

## Mouse

In [37]:
m2m_mouse = mouse_data[:, mouse_data.var_names.isin(many2many_genes['mouse_homolog_ensembl_gene'])]

In [38]:
m2m_mouse

View of AnnData object with n_obs × n_vars = 159738 × 658
    obs: 'Unnamed: 0', 'nUMI', 'nGene', 'QC', 'cluster', 'Allen.cluster_id', 'Allen.cluster_label', 'Allen.class_label', 'Allen.subclass_label', 'comb.QC', 'row', 'BICCN_cluster_id', 'BICCN_cluster_label', 'BICCN_class_label', 'BICCN_subclass_label', 'size', 'gene.counts', 'umi.counts', 'Broad.QC.doublet', 'Broad.QC.Mito', 'Broad.passQC', 'MALE', 'Comb.QC', 'cl', 'temp_class_label', 'BICCN_ontology_term_id', 'assay_ontology_term_id', 'disease_ontology_term_id', 'tissue_ontology_term_id', 'cell_type_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'development_stage_ontology_term_id', 'sex_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'donor_id', 'suspension_type', 'cell_type', 'assay', 'disease', 'organism', 'sex', 'tissue', 'self_reported_ethnicity', 'development_stage'
    var: 'feature_is_filtered', 'feature_name', 'feature_reference', 'feature_biotype'

In [39]:
len(many2many_genes.mouse_homolog_ensembl_gene.unique())

658

## Format obs df

In [40]:
m2m_human.obs['homolog_class_label'] = m2m_human.obs['class_label']
m2m_human.obs['homolog_subclass_label'] = m2m_human.obs['subclass_label']

  m2m_human.obs['homolog_class_label'] = m2m_human.obs['class_label']


In [41]:
m2m_human.obs

Unnamed: 0,sample_name,exp_component_name,cluster_label,cluster_color,cluster_order,class_label,class_color,class_order,subclass_label,subclass_color,subclass_order,donor_sex_label,donor_sex_color,donor_sex_order,region_label,region_color,region_order,cortical_layer_label,cortical_layer_color,cortical_layer_order,cell_type_accession_label,cell_type_accession_color,cell_type_accession_order,cell_type_alias_label,cell_type_alias_color,cell_type_alias_order,cell_type_alt_alias_label,cell_type_alt_alias_color,cell_type_alt_alias_order,cell_type_designation_label,cell_type_designation_color,cell_type_designation_order,external_donor_name_label,external_donor_name_color,external_donor_name_order,specimen_type,full_genotype_label,outlier_call,outlier_type,homolog_class_label,homolog_subclass_label
0,AAACCCAAGGATTTCC-LKTX_190129_01_A01,AAACCCAAGGATTTCC-21L8TX_180927_001_A01,Inh L1-2 SST CCNJL,#fb8d00,50,GABAergic,#FF7373,1,Sst,#FF9900,5,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131050,#fb8d00,50,Inh L1-2 SST CCNJL,#fb8d00,50,,#fb8d00,50,Neuron 50,#fb8d00,50,H18.30.001,#FF7373,1,nucleus,,False,,GABAergic,Sst
1,AAACCCAAGTATGGCG-LKTX_190129_01_A01,AAACCCAAGTATGGCG-21L8TX_180927_001_A01,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116,Glutamatergic,#3DCC3D,2,L5/6 NP,#3E9E64,12,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131116,#2c815f,116,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116,,#2c815f,116,Neuron 116,#2c815f,116,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L5/6 NP
2,AAACCCACAAAGTGTA-LKTX_190129_01_A01,AAACCCACAAAGTGTA-21L8TX_180927_001_A01,Exc L3-5 RORB LINC01202,#547d7a,87,Glutamatergic,#3DCC3D,2,L5 IT,#50B2AD,8,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131087,#547d7a,87,Exc L3-5 RORB LINC01202,#547d7a,87,,#547d7a,87,Neuron 87,#547d7a,87,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L5 IT
3,AAACCCACACTACTTT-LKTX_190129_01_A01,AAACCCACACTACTTT-21L8TX_180927_001_A01,Exc L2 LINC00507 GLRA3,#cecd32,75,Glutamatergic,#3DCC3D,2,L2/3 IT,#C4EC04,7,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131075,#cecd32,75,Exc L2 LINC00507 GLRA3,#cecd32,75,,#cecd32,75,Neuron 75,#cecd32,75,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L2/3 IT
4,AAACCCACAGTGAGCA-LKTX_190129_01_A01,AAACCCACAGTGAGCA-21L8TX_180927_001_A01,Oligo L2-6 OPALIN FTH1P3,#003a28,120,Non-Neuronal,#171799,3,Oligo,#2E3E39,17,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131120,#003a28,120,Oligo L2-6 OPALIN FTH1P3,#003a28,120,,#003a28,120,Non-neuron 3,#003a28,120,H18.30.001,#FF7373,1,nucleus,,False,,Non-Neuronal,Oligo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76528,TTTGTTGAGATGGCGT-LKTX_190130_01_H01,TTTGTTGAGATGGCGT-35L8TX_181108_001_D01,Oligo L2-6 OPALIN FTH1P3,#003a28,120,Non-Neuronal,#171799,3,Oligo,#2E3E39,17,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131120,#003a28,120,Oligo L2-6 OPALIN FTH1P3,#003a28,120,,#003a28,120,Non-neuron 3,#003a28,120,H18.30.001,#FF7373,1,nucleus,,False,,Non-Neuronal,Oligo
76529,TTTGTTGCACAGCCAC-LKTX_190130_01_H01,TTTGTTGCACAGCCAC-35L8TX_181108_001_D01,Exc L3-5 RORB LNX2,#01d9d5,90,Glutamatergic,#3DCC3D,2,L5 IT,#50B2AD,8,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131090,#01d9d5,90,Exc L3-5 RORB LNX2,#01d9d5,90,,#01d9d5,90,Neuron 90,#01d9d5,90,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L5 IT
76530,TTTGTTGCAGAGACTG-LKTX_190130_01_H01,TTTGTTGCAGAGACTG-35L8TX_181108_001_D01,Exc L2-3 RORB PTPN3,#b7ce00,81,Glutamatergic,#3DCC3D,2,L2/3 IT,#C4EC04,7,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131081,#b7ce00,81,Exc L2-3 RORB PTPN3,#b7ce00,81,,#b7ce00,81,Neuron 81,#b7ce00,81,H18.30.001,#FF7373,1,nucleus,,False,,Glutamatergic,L2/3 IT
76531,TTTGTTGCATAATGAG-LKTX_190130_01_H01,TTTGTTGCATAATGAG-35L8TX_181108_001_D01,Oligo L2-6 OPALIN FTH1P3,#003a28,120,Non-Neuronal,#171799,3,Oligo,#2E3E39,17,F,#FF7373,1,M1,#FF7373,1,all,#FF7373,1,CS1912131120,#003a28,120,Oligo L2-6 OPALIN FTH1P3,#003a28,120,,#003a28,120,Non-neuron 3,#003a28,120,H18.30.001,#FF7373,1,nucleus,,False,,Non-Neuronal,Oligo


In [42]:
m2m_mouse.obs['sample_name'] = m2m_mouse.obs.index

  m2m_mouse.obs['sample_name'] = m2m_mouse.obs.index


In [43]:
m2m_mouse.obs['homolog_class_label'] = m2m_mouse.obs['Allen.class_label']
m2m_mouse.obs['homolog_subclass_label'] = m2m_mouse.obs['Allen.subclass_label']

In [44]:
m2m_mouse.obs = m2m_mouse.obs.drop('Unnamed: 0', axis = 1)

In [45]:
m2m_mouse.obs

Unnamed: 0,nUMI,nGene,QC,cluster,Allen.cluster_id,Allen.cluster_label,Allen.class_label,Allen.subclass_label,comb.QC,row,BICCN_cluster_id,BICCN_cluster_label,BICCN_class_label,BICCN_subclass_label,size,gene.counts,umi.counts,Broad.QC.doublet,Broad.QC.Mito,Broad.passQC,MALE,Comb.QC,cl,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sex_ontology_term_id,is_primary_data,organism_ontology_term_id,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage,sample_name,homolog_class_label,homolog_subclass_label
pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC,4499,2094,PassQC,Astro_14,91.0,Astro Aqp4_1,Non-Neuronal,Astro,PassQC,4,91.0,Astro Aqp4_1,Non-Neuronal,Astro,16143.0,2601.086105,5846.112804,0.03,0.0,0.97,0.448615,,113.0,Non-NeuronalAstro,ILX:0770141,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000127,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,astrocyte,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAAGGGCAGGGCTAAC,Non-Neuronal,Astro
pBICCNsMMrMOpRMiF007d190314_AAAGGTAGTGGCTGAA,11900,4182,PassQC,Ex2_9,48.0,L2/3 IT_1,Glutamatergic,L2/3 IT,PassQC,5,48.0,L2/3 IT_1,Glutamatergic,L2/3 IT,23971.0,5300.903467,19543.896210,0.11,0.0,0.89,0.402945,,64.0,GlutamatergicL2/3 IT,ILX:0770156,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAAGGTAGTGGCTGAA,Glutamatergic,L2/3 IT
pBICCNsMMrMOpRMiF007d190314_AAAGTCCCAAGAGCTG,4166,2025,PassQC,Astro_0,91.0,Astro Aqp4_1,Non-Neuronal,Astro,PassQC,6,91.0,Astro Aqp4_1,Non-Neuronal,Astro,16143.0,2601.086105,5846.112804,0.03,0.0,0.97,0.448615,,113.0,Non-NeuronalAstro,ILX:0770141,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000127,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,astrocyte,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAAGTCCCAAGAGCTG,Non-Neuronal,Astro
pBICCNsMMrMOpRMiF007d190314_AAATGGAAGATCACCT,11834,4090,PassQC,Ex3_0,66.0,L6 CT Cpa6,Glutamatergic,L6 CT,PassQC,7,66.0,L6 CT Cpa6,Glutamatergic,L6 CT,16829.0,4862.752035,15796.942180,0.00,0.0,0.99,0.471210,,104.0,GlutamatergicL6 CT,ILX:0770162,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AAATGGAAGATCACCT,Glutamatergic,L6 CT
pBICCNsMMrMOpRMiF007d190314_AACAAAGCACGCGCAT,1957,1362,PassQC,Micro_14,113.0,Macrophage_2,Non-Neuronal,Macrophage,PassQC,8,113.0,Macrophage_2,Non-Neuronal,Macrophage,7801.0,1583.783361,2507.487117,0.00,0.0,1.00,0.482374,,123.0,Non-NeuronalMacrophage,Macrophage,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000235,na,MmusDv:0000061,PATO:0000383,False,NCBITaxon:10090,F007,nucleus,macrophage,10x 3' v3,normal,Mus musculus,female,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiF007d190314_AACAAAGCACGCGCAT,Non-Neuronal,Macrophage
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,20193,5695,PassQC,Ex1_5,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,PassQC,215818,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,3978.0,5218.066616,18064.753140,0.01,0.0,0.99,0.471594,,85.0,GlutamatergicL6 IT,ILX:0770158,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,Glutamatergic,L6 IT
pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,2858,1602,PassQC,Oligo_19,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,PassQC,215820,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,16566.0,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150.0,Non-NeuronalOligo,ILX:0770140,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000128,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,oligodendrocyte,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,Non-Neuronal,Oligo
pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,35854,7344,PassQC,Ex1_3,51.0,L5 IT S100b,Glutamatergic,L5 IT,PassQC,215821,51.0,L5 IT S100b,Glutamatergic,L5 IT,8684.0,5464.036043,20579.022920,0.01,0.0,0.99,0.385306,,70.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,Glutamatergic,L5 IT
pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,23493,6146,PassQC,Ex1_3,52.0,L5 IT Pld5,Glutamatergic,L5 IT,PassQC,215822,52.0,L5 IT Pld5,Glutamatergic,L5 IT,3621.0,5680.692074,22718.304610,0.01,0.0,0.99,0.446838,,83.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage,pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,Glutamatergic,L5 IT


# Create m2m object

## Create matrix and var df

In [47]:
many2many_genes

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
32,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000093987,ortholog_many2many,Gm21704
33,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000095852,ortholog_many2many,Gm10256
34,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000101667,ortholog_many2many,Gm29289
35,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000096520,ortholog_many2many,Gm3376
36,ENSG00000244395,378949.0,RBMY1D,ENSMUSG00000102053,ortholog_many2many,Gm4064
...,...,...,...,...,...,...
21794,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000095954,ortholog_many2many,Gm3183
21795,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000070617,ortholog_many2many,Pramel23
21796,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000095409,ortholog_many2many,Gm13043
21797,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000072821,ortholog_many2many,Gm6351


In [52]:
np.where(m2m_mouse.var_names == m2m_mouse.var.loc[m2m_mouse.var.index == 'ENSMUSG00000095852'].index[0])[0][0]

209

In [57]:
m2m_human.var

Unnamed: 0,gene,chromosome,entrez_id,gene_name,mouse_homologenes
198,ACOT1,14,641371,acyl-CoA thioesterase 1,Acot3
202,ACOT2,14,10965,acyl-CoA thioesterase 2,Acot2
232,ACSM2A,16,123876,acyl-CoA synthetase medium-chain family member 2A,Acsm2
233,ACSM2B,16,348158,acyl-CoA synthetase medium-chain family member 2B,
753,ALDH9A1,1,223,"aldehyde dehydrogenase 9 family, member A1",Aldh9a1
...,...,...,...,...,...
50148,ZNF837,19,116412,zinc finger protein 837,
50181,ZNF891,12,101060200,zinc finger protein 891,
50249,ZSCAN5A,19,79149,zinc finger and SCAN domain containing 5A,
50250,ZSCAN5B,19,342933,zinc finger and SCAN domain containing 5B,Zscan5b


In [58]:
m2m_human.var.loc[m2m_human.var.entrez_id == 641371]

Unnamed: 0,gene,chromosome,entrez_id,gene_name,mouse_homologenes
198,ACOT1,14,641371,acyl-CoA thioesterase 1,Acot3


In [59]:
m2m_human.X[:, [0,2]].T

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [60]:
np.mean(m2m_human.X[:, [0,2]].T,axis=0)

array([0.5, 0. , 0. , ..., 0. , 0. , 0. ], dtype=float32)

In [58]:
np.asarray(m2o_mouse.X[:,np.where(m2o_mouse.var_names == m2o_mouse.var.loc[m2o_mouse.var.index == 'ENSMUSG00000003526'].index[0])[0][0]].todense().T)[0]

array([0., 2., 3., ..., 0., 1., 1.], dtype=float32)

In [59]:
np.concatenate([np.asarray(m2o_mouse.X[:,np.where(m2o_mouse.var_names == m2o_mouse.var.loc[m2o_mouse.var.index == 'ENSMUSG00000003526'].index[0])[0][0]].todense().T)[0],
                np.mean(m2o_human.X[:, [0,2]].T,axis=0)])

array([0., 2., 3., ..., 0., 0., 0.], dtype=float32)

In [204]:
hid = 'ENSG00000088782'

In [205]:
mouse_df = many2many_genes.loc[many2many_genes.human_ensembl_gene_id == hid]
mouse_df

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
224,ENSG00000088782,140850.0,DEFB127,ENSMUSG00000027468,ortholog_many2many,Defb22


In [206]:
new_df = many2many_genes.loc[many2many_genes.mouse_homolog_ensembl_gene.isin(mouse_df.mouse_homolog_ensembl_gene)]

In [207]:
new_df.shape

(7, 6)

In [219]:
new_df2 = many2many_genes.loc[many2many_genes.human_ensembl_gene_id.isin(new_df.human_ensembl_gene_id)]

In [220]:
new_df2.shape

(9, 6)

In [221]:
new_df2

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
224,ENSG00000088782,140850.0,DEFB127,ENSMUSG00000027468,ortholog_many2many,Defb22
3136,ENSG00000186458,400830.0,DEFB132,ENSMUSG00000048500,ortholog_many2many,Defb15
3137,ENSG00000186458,400830.0,DEFB132,ENSMUSG00000052554,ortholog_many2many,Defb34
3138,ENSG00000186458,400830.0,DEFB132,ENSMUSG00000027468,ortholog_many2many,Defb22
7076,ENSG00000184276,245911.0,DEFB108B,ENSMUSG00000027468,ortholog_many2many,Defb22
8853,ENSG00000204548,245934.0,DEFB121,ENSMUSG00000027468,ortholog_many2many,Defb22
10815,ENSG00000205882,613211.0,DEFB134,ENSMUSG00000027468,ortholog_many2many,Defb22
11910,ENSG00000177023,503618.0,DEFB104B,ENSMUSG00000027468,ortholog_many2many,Defb22
11930,ENSG00000176782,140596.0,DEFB104A,ENSMUSG00000027468,ortholog_many2many,Defb22


In [232]:
many2many_genes.loc[many2many_genes.mouse_homolog_ensembl_gene.isin(new_df2.mouse_homolog_ensembl_gene)]

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
224,ENSG00000088782,140850.0,DEFB127,ENSMUSG00000027468,ortholog_many2many,Defb22
3136,ENSG00000186458,400830.0,DEFB132,ENSMUSG00000048500,ortholog_many2many,Defb15
3137,ENSG00000186458,400830.0,DEFB132,ENSMUSG00000052554,ortholog_many2many,Defb34
3138,ENSG00000186458,400830.0,DEFB132,ENSMUSG00000027468,ortholog_many2many,Defb22
7076,ENSG00000184276,245911.0,DEFB108B,ENSMUSG00000027468,ortholog_many2many,Defb22
8853,ENSG00000204548,245934.0,DEFB121,ENSMUSG00000027468,ortholog_many2many,Defb22
10815,ENSG00000205882,613211.0,DEFB134,ENSMUSG00000027468,ortholog_many2many,Defb22
11910,ENSG00000177023,503618.0,DEFB104B,ENSMUSG00000027468,ortholog_many2many,Defb22
11930,ENSG00000176782,140596.0,DEFB104A,ENSMUSG00000027468,ortholog_many2many,Defb22


In [236]:
len(new_df2.index) == len(many2many_genes.loc[many2many_genes.mouse_homolog_ensembl_gene.isin(new_df2.mouse_homolog_ensembl_gene)].index)

True

In [None]:
current_df = 

In [225]:
del current_df

In [77]:
many2many_genes[~many2many_genes.mouse_homolog_ensembl_gene.isin(current_df.mouse_homolog_ensembl_gene)]

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
47,ENSG00000176679,90655.0,TGIF2LY,ENSMUSG00000100133,ortholog_many2many,Tgif2lx1
48,ENSG00000176679,90655.0,TGIF2LY,ENSMUSG00000100194,ortholog_many2many,Tgif2lx2
82,ENSG00000089012,55423.0,SIRPG,ENSMUSG00000095028,ortholog_many2many,Sirpb1b
83,ENSG00000089012,55423.0,SIRPG,ENSMUSG00000074677,ortholog_many2many,Sirpb1c
84,ENSG00000089012,55423.0,SIRPG,ENSMUSG00000078780,ortholog_many2many,Gm5150
...,...,...,...,...,...,...
21794,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000095954,ortholog_many2many,Gm3183
21795,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000070617,ortholog_many2many,Pramel23
21796,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000095409,ortholog_many2many,Gm13043
21797,ENSG00000279804,391003.0,PRAMEF18,ENSMUSG00000072821,ortholog_many2many,Gm6351


In [96]:
'ENSG00000176679' in many2many_genes.human_ensembl_gene_id.values

True

In [180]:
len(many2many_genes.index)

3937

In [283]:
def get_gene_list(human_ensembl_id):
    cpt = 0
    flag = False
    # Get all the mouse ids connected to the human id
    mouse_ids = many2many_genes.loc[many2many_genes.human_ensembl_gene_id == human_ensembl_id]
    # Get all the human ids connected to the mouse ids
    human_ids = many2many_genes.loc[many2many_genes.human_ensembl_gene_id.isin(mouse_ids.human_ensembl_gene_id)]
    # Repeat until they are equal
    while flag == False:
        cpt = cpt+1
        mouse_ids = many2many_genes.loc[many2many_genes.mouse_homolog_ensembl_gene.isin(human_ids.mouse_homolog_ensembl_gene)]
        human_ids = many2many_genes.loc[many2many_genes.human_ensembl_gene_id.isin(mouse_ids.human_ensembl_gene_id)]
        if len(mouse_ids.index) == len(human_ids.index):
            flag = True
    return human_ids

In [245]:
get_gene_list('ENSG00000088782')

Index([224], dtype='int64')
Index([224, 3138, 7076, 8853, 10815, 11910, 11930], dtype='int64')
Index([224, 3136, 3137, 3138, 7076, 8853, 10815, 11910, 11930], dtype='int64')
1 False
Index([224, 3136, 3137, 3138, 7076, 8853, 10815, 11910, 11930], dtype='int64')
Index([224, 3136, 3137, 3138, 7076, 8853, 10815, 11910, 11930], dtype='int64')
2 True


Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name
224,ENSG00000088782,140850.0,DEFB127,ENSMUSG00000027468,ortholog_many2many,Defb22
3136,ENSG00000186458,400830.0,DEFB132,ENSMUSG00000048500,ortholog_many2many,Defb15
3137,ENSG00000186458,400830.0,DEFB132,ENSMUSG00000052554,ortholog_many2many,Defb34
3138,ENSG00000186458,400830.0,DEFB132,ENSMUSG00000027468,ortholog_many2many,Defb22
7076,ENSG00000184276,245911.0,DEFB108B,ENSMUSG00000027468,ortholog_many2many,Defb22
8853,ENSG00000204548,245934.0,DEFB121,ENSMUSG00000027468,ortholog_many2many,Defb22
10815,ENSG00000205882,613211.0,DEFB134,ENSMUSG00000027468,ortholog_many2many,Defb22
11910,ENSG00000177023,503618.0,DEFB104B,ENSMUSG00000027468,ortholog_many2many,Defb22
11930,ENSG00000176782,140596.0,DEFB104A,ENSMUSG00000027468,ortholog_many2many,Defb22


In [282]:
def construct_m2m_matrixanddf():
    # def the variables to return 
    matrix = []
    todo = many2many_genes
    var_df = []
    
    gp = 0
    cpt = 0
    lentodo = len(many2many_genes.index)
    
    # start the loop throught the m2o dataframe
    for index, row in many2many_genes.iterrows():
        #get the human id
        human_id = row.human_ensembl_gene_id
        
        # check if it has already been done, if not we get into the if
        if str(human_id) in todo.human_ensembl_gene_id.values:
            # starts new lines for dataframes
            var_line = {}
            var_line['orthology_type'] = row.orthology_type
            
            # get all the mouse and human homolog genes
            working_df = get_gene_list(human_id)
            
            ######### Get mouse info #########
            # def mouse arrays for stocking info
            mouse_names = []
            mouse_ensembl_ids = []
            mouse_index = []

            ######### Get human info #########
            # def human arrays for stocking info
            human_names = []
            human_ensembl_ids = []
            human_entrez_ids = []
            human_index = []
            
            # pass throught all the genes
            for index2, row2 in working_df.iterrows():
                if row2.human_ensembl_gene_id not in human_ensembl_ids :
                    human_names.append(row2.human_external_gene_name)
                    human_ensembl_ids.append(row2.human_ensembl_gene_id)
                    human_entrez_ids.append(row2.human_entrezgene_id)
                    
                    # get the index of the gene in the matrix
                    human_index.append(np.where(m2m_human.var_names == m2m_human.var.loc[m2m_human.var.entrez_id == row2.human_entrezgene_id].index[0])[0][0])
                
                if row2.mouse_homolog_ensembl_gene not in mouse_ensembl_ids :
                    # create arrays for the df
                    mouse_names.append(row2.mouse_homolog_gene_name)
                    mouse_ensembl_ids.append(row2.mouse_homolog_ensembl_gene)

                    # get the index of the gene in the matrix
                    mouse_index.append(np.where(m2m_mouse.var_names == m2m_mouse.var.loc[m2m_mouse.var.index == row2.mouse_homolog_ensembl_gene].index[0])[0][0])
            
            ######### Get mouse matrix #########
            # get the cells counts for each gene
            mouse_homolog_matrix = np.asarray(m2m_mouse.X.todense()[:, mouse_index].T)
            # get the mean for each cell
            mouse_counts = np.mean(mouse_homolog_matrix, axis=0)
            
            ######### Get human matrix #########
            # get the cells counts for each gene
            human_homolog_matrix = m2m_human.X[:, human_index].T
            # get the mean for each cell
            human_counts = np.mean(human_homolog_matrix, axis=0)        
            
            ######### Add new line to matrix #########
            # add the newline to the count matrix
            if matrix == []:
                matrix = np.array([np.concatenate([human_counts,mouse_counts])])
            else : 
                matrix = np.append(matrix, [np.concatenate([human_counts, mouse_counts])], axis = 0)
            
            # Remove all genes from todo
            todo = todo[~todo.mouse_homolog_ensembl_gene.isin(mouse_ensembl_ids)]
            todo = todo[~todo.human_ensembl_gene_id.isin(human_ensembl_ids)]
            
            # add the information for the genes to the df
            var_line['human_homologs_ensembl_ids'] = human_ensembl_ids
            var_line['human_homologs_entrez_ids'] = human_entrez_ids
            var_line['human_homologs_names'] = human_names
            human_ensembl_ids.sort()
            var_line['human_ensembl_id'] = human_ensembl_ids[0]
            var_line['human_entrez_id'] = working_df.loc[working_df.human_ensembl_gene_id == human_ensembl_ids[0]].human_entrezgene_id.iloc[0]
            var_line['human_gene_name'] = working_df.loc[working_df.human_ensembl_gene_id == human_ensembl_ids[0]].human_external_gene_name.iloc[0]
            var_line['human_long_gene_name'] = m2m_human.var.loc[m2m_human.var.entrez_id == var_line['human_entrez_id']].gene_name.iloc[0]
            var_line['human_chromosome'] = m2m_human.var.loc[m2m_human.var.entrez_id == var_line['human_entrez_id']].chromosome.iloc[0]
            
            var_line['mouse_homologs_ids'] = mouse_ensembl_ids
            var_line['mouse_homologs_names'] = mouse_names
            mouse_ensembl_ids.sort()
            var_line['mouse_ensembl_id'] = mouse_ensembl_ids[0]
            var_line['mouse_gene_name'] = working_df.loc[working_df.mouse_homolog_ensembl_gene == mouse_ensembl_ids[0]].mouse_homolog_gene_name.iloc[0]
            
            # add the newline to the rows of the df
            var_df.append(var_line)
            
            gp = gp+1
            
        cpt = cpt+1
        print('gp :', gp, 'tot :', cpt, '/', lentodo, end = '\r')
        
    return matrix.T, var_df, todo

In [254]:
m2m_matrix, m2m_var_df, todo = construct_m2m_matrixanddf()

gp : 1 tot : 10 / 3937

  if matrix == []:


gp : 129 tot : 3937 / 3937

In [255]:
todo ## Well empty so every genes have been done

Unnamed: 0,human_ensembl_gene_id,human_entrezgene_id,human_external_gene_name,mouse_homolog_ensembl_gene,orthology_type,mouse_homolog_gene_name


In [256]:
m2m_matrix.shape

(236271, 129)

In [257]:
m2m_matrix

array([[0.        , 0.        , 0.33333334, ..., 0.14285715, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.5       ],
       [0.        , 0.        , 0.33333334, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.33333334, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.8333333 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.5       , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [258]:
m2m_matrix[:76533,:]

array([[0.        , 0.        , 0.33333334, ..., 0.14285715, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.5       ],
       [0.        , 0.        , 0.33333334, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.        , 1.        , ..., 0.2857143 , 0.        ,
        0.        ]], dtype=float32)

In [259]:
m2m_matrix[76533:,:]

array([[0.        , 0.        , 0.16666667, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.5       , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.33333334, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.8333333 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.5       , ..., 0.        , 0.        ,
        0.        ]], dtype=float32)

In [260]:
m2m_matrix.max()

194.5

In [261]:
m2m_var = pd.DataFrame(m2m_var_df)

In [262]:
m2m_var

Unnamed: 0,orthology_type,human_homologs_ensembl_ids,human_homologs_entrez_ids,human_homologs_names,human_ensembl_id,human_entrez_id,human_gene_name,human_long_gene_name,human_chromosome,mouse_homologs_ids,mouse_homologs_names,mouse_ensembl_id,mouse_gene_name
0,ortholog_many2many,"[ENSG00000169800, ENSG00000226941, ENSG0000023...","[378949.0, 159163.0, 378950.0, 378951.0, 37894...","[RBMY1D, RBMY1F, RBMY1E, RBMY1J, RBMY1B, RBMY1A1]",ENSG00000169800,159163.0,RBMY1F,"RNA binding motif protein, Y-linked, family 1,...",Y,"[ENSMUSG00000091987, ENSMUSG00000093918, ENSMU...","[Gm21704, Gm10256, Gm29289, Gm3376, Gm4064, Gm...",ENSMUSG00000091987,Gm10352
1,ortholog_many2many,"[ENSG00000153779, ENSG00000176679]","[90655.0, 90316.0]","[TGIF2LY, TGIF2LX]",ENSG00000153779,90316.0,TGIF2LX,"TGFB-induced factor homeobox 2-like, X-linked",X,"[ENSMUSG00000100133, ENSMUSG00000100194]","[Tgif2lx1, Tgif2lx2]",ENSMUSG00000100133,Tgif2lx1
2,ortholog_many2many,"[ENSG00000089012, ENSG00000101307, ENSG0000019...","[55423.0, 10326.0, 140885.0]","[SIRPG, SIRPB1, SIRPA]",ENSG00000089012,55423.0,SIRPG,signal-regulatory protein gamma,20,"[ENSMUSG00000037902, ENSMUSG00000074677, ENSMU...","[Sirpb1b, Sirpb1c, Gm5150, Gm9733, Sirpa, Sirp...",ENSMUSG00000037902,Sirpa
3,ortholog_many2many,"[ENSG00000164816, ENSG00000164821, ENSG0000016...","[1669.0, 1671.0, 1667.0, 728358.0, 1668.0, 167...","[DEFA4, DEFA6, DEFA1, DEFA1B, DEFA3, DEFA5]",ENSG00000164816,1670.0,DEFA5,"defensin, alpha 5, Paneth cell-specific",8,"[ENSMUSG00000058618, ENSMUSG00000060070, ENSMU...","[Defa24, Defa5, Defa27, Defa22, Defa26, Defa17...",ENSMUSG00000058618,Defa39
4,ortholog_many2many,"[ENSG00000088782, ENSG00000176782, ENSG0000017...","[140850.0, 400830.0, 245911.0, 245934.0, 61321...","[DEFB127, DEFB132, DEFB108B, DEFB121, DEFB134,...",ENSG00000088782,140850.0,DEFB127,"defensin, beta 127",20,"[ENSMUSG00000027468, ENSMUSG00000048500, ENSMU...","[Defb22, Defb15, Defb34]",ENSMUSG00000027468,Defb22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,ortholog_many2many,[ENSG00000143149],[223.0],[ALDH9A1],ENSG00000143149,223.0,ALDH9A1,"aldehyde dehydrogenase 9 family, member A1",1,[ENSMUSG00000026687],[Aldh9a1],ENSMUSG00000026687,Aldh9a1
125,ortholog_many2many,"[ENSG00000274736, ENSG00000275718]","[6359.0, 6368.0]","[CCL15, CCL23]",ENSG00000274736,6368.0,CCL23,chemokine (C-C motif) ligand 23,17,"[ENSMUSG00000018927, ENSMUSG00000019122]","[Ccl9, Ccl6]",ENSMUSG00000018927,Ccl6
126,ortholog_many2many,"[ENSG00000212124, ENSG00000212126, ENSG0000022...","[259296.0, 259295.0, 259294.0, 259290.0, 25929...","[TAS2R50, TAS2R20, TAS2R19, TAS2R31, TAS2R46, ...",ENSG00000212124,259294.0,TAS2R19,"taste receptor, type 2, member 19",12,"[ENSMUSG00000053217, ENSMUSG00000059382]","[Tas2r120, Tas2r136]",ENSMUSG00000053217,Tas2r136
127,ortholog_many2many,"[ENSG00000072694, ENSG00000143226]","[2213.0, 2212.0]","[FCGR2B, FCGR2A]",ENSG00000072694,2213.0,FCGR2B,"Fc fragment of IgG, low affinity IIb, receptor...",1,"[ENSMUSG00000026656, ENSMUSG00000059498]","[Fcgr2b, Fcgr3]",ENSMUSG00000026656,Fcgr2b


In [264]:
m2m_var.mouse_ensembl_id.duplicated().any()

False

In [265]:
m2m_var.human_ensembl_id.duplicated().any()

False

## Create anndata object

In [266]:
m2m_homolog = anndata.AnnData(X = m2m_matrix, obs = pd.concat([m2m_human.obs,m2m_mouse.obs], axis = 0), var = m2m_var)



In [267]:
m2m_homolog

AnnData object with n_obs × n_vars = 236271 × 129
    obs: 'sample_name', 'exp_component_name', 'cluster_label', 'cluster_color', 'cluster_order', 'class_label', 'class_color', 'class_order', 'subclass_label', 'subclass_color', 'subclass_order', 'donor_sex_label', 'donor_sex_color', 'donor_sex_order', 'region_label', 'region_color', 'region_order', 'cortical_layer_label', 'cortical_layer_color', 'cortical_layer_order', 'cell_type_accession_label', 'cell_type_accession_color', 'cell_type_accession_order', 'cell_type_alias_label', 'cell_type_alias_color', 'cell_type_alias_order', 'cell_type_alt_alias_label', 'cell_type_alt_alias_color', 'cell_type_alt_alias_order', 'cell_type_designation_label', 'cell_type_designation_color', 'cell_type_designation_order', 'external_donor_name_label', 'external_donor_name_color', 'external_donor_name_order', 'specimen_type', 'full_genotype_label', 'outlier_call', 'outlier_type', 'homolog_class_label', 'homolog_subclass_label', 'nUMI', 'nGene', 'QC', 'clu

In [268]:
m2m_homolog.var_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       '119', '120', '121', '122', '123', '124', '125', '126', '127', '128'],
      dtype='object', length=129)

In [269]:
m2m_homolog.var

Unnamed: 0,orthology_type,human_homologs_ensembl_ids,human_homologs_entrez_ids,human_homologs_names,human_ensembl_id,human_entrez_id,human_gene_name,human_long_gene_name,human_chromosome,mouse_homologs_ids,mouse_homologs_names,mouse_ensembl_id,mouse_gene_name
0,ortholog_many2many,"[ENSG00000169800, ENSG00000226941, ENSG0000023...","[378949.0, 159163.0, 378950.0, 378951.0, 37894...","[RBMY1D, RBMY1F, RBMY1E, RBMY1J, RBMY1B, RBMY1A1]",ENSG00000169800,159163.0,RBMY1F,"RNA binding motif protein, Y-linked, family 1,...",Y,"[ENSMUSG00000091987, ENSMUSG00000093918, ENSMU...","[Gm21704, Gm10256, Gm29289, Gm3376, Gm4064, Gm...",ENSMUSG00000091987,Gm10352
1,ortholog_many2many,"[ENSG00000153779, ENSG00000176679]","[90655.0, 90316.0]","[TGIF2LY, TGIF2LX]",ENSG00000153779,90316.0,TGIF2LX,"TGFB-induced factor homeobox 2-like, X-linked",X,"[ENSMUSG00000100133, ENSMUSG00000100194]","[Tgif2lx1, Tgif2lx2]",ENSMUSG00000100133,Tgif2lx1
2,ortholog_many2many,"[ENSG00000089012, ENSG00000101307, ENSG0000019...","[55423.0, 10326.0, 140885.0]","[SIRPG, SIRPB1, SIRPA]",ENSG00000089012,55423.0,SIRPG,signal-regulatory protein gamma,20,"[ENSMUSG00000037902, ENSMUSG00000074677, ENSMU...","[Sirpb1b, Sirpb1c, Gm5150, Gm9733, Sirpa, Sirp...",ENSMUSG00000037902,Sirpa
3,ortholog_many2many,"[ENSG00000164816, ENSG00000164821, ENSG0000016...","[1669.0, 1671.0, 1667.0, 728358.0, 1668.0, 167...","[DEFA4, DEFA6, DEFA1, DEFA1B, DEFA3, DEFA5]",ENSG00000164816,1670.0,DEFA5,"defensin, alpha 5, Paneth cell-specific",8,"[ENSMUSG00000058618, ENSMUSG00000060070, ENSMU...","[Defa24, Defa5, Defa27, Defa22, Defa26, Defa17...",ENSMUSG00000058618,Defa39
4,ortholog_many2many,"[ENSG00000088782, ENSG00000176782, ENSG0000017...","[140850.0, 400830.0, 245911.0, 245934.0, 61321...","[DEFB127, DEFB132, DEFB108B, DEFB121, DEFB134,...",ENSG00000088782,140850.0,DEFB127,"defensin, beta 127",20,"[ENSMUSG00000027468, ENSMUSG00000048500, ENSMU...","[Defb22, Defb15, Defb34]",ENSMUSG00000027468,Defb22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,ortholog_many2many,[ENSG00000143149],[223.0],[ALDH9A1],ENSG00000143149,223.0,ALDH9A1,"aldehyde dehydrogenase 9 family, member A1",1,[ENSMUSG00000026687],[Aldh9a1],ENSMUSG00000026687,Aldh9a1
125,ortholog_many2many,"[ENSG00000274736, ENSG00000275718]","[6359.0, 6368.0]","[CCL15, CCL23]",ENSG00000274736,6368.0,CCL23,chemokine (C-C motif) ligand 23,17,"[ENSMUSG00000018927, ENSMUSG00000019122]","[Ccl9, Ccl6]",ENSMUSG00000018927,Ccl6
126,ortholog_many2many,"[ENSG00000212124, ENSG00000212126, ENSG0000022...","[259296.0, 259295.0, 259294.0, 259290.0, 25929...","[TAS2R50, TAS2R20, TAS2R19, TAS2R31, TAS2R46, ...",ENSG00000212124,259294.0,TAS2R19,"taste receptor, type 2, member 19",12,"[ENSMUSG00000053217, ENSMUSG00000059382]","[Tas2r120, Tas2r136]",ENSMUSG00000053217,Tas2r136
127,ortholog_many2many,"[ENSG00000072694, ENSG00000143226]","[2213.0, 2212.0]","[FCGR2B, FCGR2A]",ENSG00000072694,2213.0,FCGR2B,"Fc fragment of IgG, low affinity IIb, receptor...",1,"[ENSMUSG00000026656, ENSMUSG00000059498]","[Fcgr2b, Fcgr3]",ENSMUSG00000026656,Fcgr2b


In [270]:
m2m_homolog.obs_names

Index(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ...
       'pBICCNsMMrMOpRMiM006d190320_TTTCCTCTCGAGCCTG',
       'pBICCNsMMrMOpRMiM006d190320_TTTCCTCTCTAACACG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGATCAGTTACGTC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGAGAGTTGTAAG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTAGGATTTCC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA',
       'pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC',
       'pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCATGCTAG'],
      dtype='object', length=236271)

In [271]:
m2m_homolog.obs

Unnamed: 0,sample_name,exp_component_name,cluster_label,cluster_color,cluster_order,class_label,class_color,class_order,subclass_label,subclass_color,subclass_order,donor_sex_label,donor_sex_color,donor_sex_order,region_label,region_color,region_order,cortical_layer_label,cortical_layer_color,cortical_layer_order,cell_type_accession_label,cell_type_accession_color,cell_type_accession_order,cell_type_alias_label,cell_type_alias_color,cell_type_alias_order,cell_type_alt_alias_label,cell_type_alt_alias_color,cell_type_alt_alias_order,cell_type_designation_label,cell_type_designation_color,cell_type_designation_order,external_donor_name_label,external_donor_name_color,external_donor_name_order,specimen_type,full_genotype_label,outlier_call,outlier_type,homolog_class_label,homolog_subclass_label,nUMI,nGene,QC,cluster,Allen.cluster_id,Allen.cluster_label,Allen.class_label,Allen.subclass_label,comb.QC,row,BICCN_cluster_id,BICCN_cluster_label,BICCN_class_label,BICCN_subclass_label,size,gene.counts,umi.counts,Broad.QC.doublet,Broad.QC.Mito,Broad.passQC,MALE,Comb.QC,cl,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sex_ontology_term_id,is_primary_data,organism_ontology_term_id,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage
0,AAACCCAAGGATTTCC-LKTX_190129_01_A01,AAACCCAAGGATTTCC-21L8TX_180927_001_A01,Inh L1-2 SST CCNJL,#fb8d00,50.0,GABAergic,#FF7373,1.0,Sst,#FF9900,5.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131050,#fb8d00,50.0,Inh L1-2 SST CCNJL,#fb8d00,50.0,,#fb8d00,50.0,Neuron 50,#fb8d00,50.0,H18.30.001,#FF7373,1.0,nucleus,,False,,GABAergic,Sst,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AAACCCAAGTATGGCG-LKTX_190129_01_A01,AAACCCAAGTATGGCG-21L8TX_180927_001_A01,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116.0,Glutamatergic,#3DCC3D,2.0,L5/6 NP,#3E9E64,12.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131116,#2c815f,116.0,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116.0,,#2c815f,116.0,Neuron 116,#2c815f,116.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Glutamatergic,L5/6 NP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,AAACCCACAAAGTGTA-LKTX_190129_01_A01,AAACCCACAAAGTGTA-21L8TX_180927_001_A01,Exc L3-5 RORB LINC01202,#547d7a,87.0,Glutamatergic,#3DCC3D,2.0,L5 IT,#50B2AD,8.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131087,#547d7a,87.0,Exc L3-5 RORB LINC01202,#547d7a,87.0,,#547d7a,87.0,Neuron 87,#547d7a,87.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Glutamatergic,L5 IT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,AAACCCACACTACTTT-LKTX_190129_01_A01,AAACCCACACTACTTT-21L8TX_180927_001_A01,Exc L2 LINC00507 GLRA3,#cecd32,75.0,Glutamatergic,#3DCC3D,2.0,L2/3 IT,#C4EC04,7.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131075,#cecd32,75.0,Exc L2 LINC00507 GLRA3,#cecd32,75.0,,#cecd32,75.0,Neuron 75,#cecd32,75.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Glutamatergic,L2/3 IT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,AAACCCACAGTGAGCA-LKTX_190129_01_A01,AAACCCACAGTGAGCA-21L8TX_180927_001_A01,Oligo L2-6 OPALIN FTH1P3,#003a28,120.0,Non-Neuronal,#171799,3.0,Oligo,#2E3E39,17.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131120,#003a28,120.0,Oligo L2-6 OPALIN FTH1P3,#003a28,120.0,,#003a28,120.0,Non-neuron 3,#003a28,120.0,H18.30.001,#FF7373,1.0,nucleus,,False,,Non-Neuronal,Oligo,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L6 IT,20193.0,5695.0,PassQC,Ex1_5,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,PassQC,215818.0,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,3978.0,5218.066616,18064.753140,0.01,0.0,0.99,0.471594,,85.0,GlutamatergicL6 IT,ILX:0770158,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Non-Neuronal,Oligo,2858.0,1602.0,PassQC,Oligo_19,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,PassQC,215820.0,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,16566.0,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150.0,Non-NeuronalOligo,ILX:0770140,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000128,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,oligodendrocyte,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L5 IT,35854.0,7344.0,PassQC,Ex1_3,51.0,L5 IT S100b,Glutamatergic,L5 IT,PassQC,215821.0,51.0,L5 IT S100b,Glutamatergic,L5 IT,8684.0,5464.036043,20579.022920,0.01,0.0,0.99,0.385306,,70.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L5 IT,23493.0,6146.0,PassQC,Ex1_3,52.0,L5 IT Pld5,Glutamatergic,L5 IT,PassQC,215822.0,52.0,L5 IT Pld5,Glutamatergic,L5 IT,3621.0,5680.692074,22718.304610,0.01,0.0,0.99,0.446838,,83.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,False,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage


## Check duplicates

In [272]:
m2m_homolog.var.mouse_ensembl_id.duplicated().sum()

0

In [273]:
m2m_homolog.var.human_ensembl_id.duplicated().sum()

0

## Write the results

In [275]:
m2m_homolog_write = m2m_homolog

In [276]:
m2m_homolog_write.obs = m2m_homolog_write.obs.drop(['outlier_call', 'is_primary_data'], axis=1)

In [277]:
m2m_homolog_write.obs

Unnamed: 0,sample_name,exp_component_name,cluster_label,cluster_color,cluster_order,class_label,class_color,class_order,subclass_label,subclass_color,subclass_order,donor_sex_label,donor_sex_color,donor_sex_order,region_label,region_color,region_order,cortical_layer_label,cortical_layer_color,cortical_layer_order,cell_type_accession_label,cell_type_accession_color,cell_type_accession_order,cell_type_alias_label,cell_type_alias_color,cell_type_alias_order,cell_type_alt_alias_label,cell_type_alt_alias_color,cell_type_alt_alias_order,cell_type_designation_label,cell_type_designation_color,cell_type_designation_order,external_donor_name_label,external_donor_name_color,external_donor_name_order,specimen_type,full_genotype_label,outlier_type,homolog_class_label,homolog_subclass_label,nUMI,nGene,QC,cluster,Allen.cluster_id,Allen.cluster_label,Allen.class_label,Allen.subclass_label,comb.QC,row,BICCN_cluster_id,BICCN_cluster_label,BICCN_class_label,BICCN_subclass_label,size,gene.counts,umi.counts,Broad.QC.doublet,Broad.QC.Mito,Broad.passQC,MALE,Comb.QC,cl,temp_class_label,BICCN_ontology_term_id,assay_ontology_term_id,disease_ontology_term_id,tissue_ontology_term_id,cell_type_ontology_term_id,self_reported_ethnicity_ontology_term_id,development_stage_ontology_term_id,sex_ontology_term_id,organism_ontology_term_id,donor_id,suspension_type,cell_type,assay,disease,organism,sex,tissue,self_reported_ethnicity,development_stage
0,AAACCCAAGGATTTCC-LKTX_190129_01_A01,AAACCCAAGGATTTCC-21L8TX_180927_001_A01,Inh L1-2 SST CCNJL,#fb8d00,50.0,GABAergic,#FF7373,1.0,Sst,#FF9900,5.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131050,#fb8d00,50.0,Inh L1-2 SST CCNJL,#fb8d00,50.0,,#fb8d00,50.0,Neuron 50,#fb8d00,50.0,H18.30.001,#FF7373,1.0,nucleus,,,GABAergic,Sst,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,AAACCCAAGTATGGCG-LKTX_190129_01_A01,AAACCCAAGTATGGCG-21L8TX_180927_001_A01,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116.0,Glutamatergic,#3DCC3D,2.0,L5/6 NP,#3E9E64,12.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131116,#2c815f,116.0,Exc L5-6 FEZF2 IFNG-AS1,#2c815f,116.0,,#2c815f,116.0,Neuron 116,#2c815f,116.0,H18.30.001,#FF7373,1.0,nucleus,,,Glutamatergic,L5/6 NP,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,AAACCCACAAAGTGTA-LKTX_190129_01_A01,AAACCCACAAAGTGTA-21L8TX_180927_001_A01,Exc L3-5 RORB LINC01202,#547d7a,87.0,Glutamatergic,#3DCC3D,2.0,L5 IT,#50B2AD,8.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131087,#547d7a,87.0,Exc L3-5 RORB LINC01202,#547d7a,87.0,,#547d7a,87.0,Neuron 87,#547d7a,87.0,H18.30.001,#FF7373,1.0,nucleus,,,Glutamatergic,L5 IT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,AAACCCACACTACTTT-LKTX_190129_01_A01,AAACCCACACTACTTT-21L8TX_180927_001_A01,Exc L2 LINC00507 GLRA3,#cecd32,75.0,Glutamatergic,#3DCC3D,2.0,L2/3 IT,#C4EC04,7.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131075,#cecd32,75.0,Exc L2 LINC00507 GLRA3,#cecd32,75.0,,#cecd32,75.0,Neuron 75,#cecd32,75.0,H18.30.001,#FF7373,1.0,nucleus,,,Glutamatergic,L2/3 IT,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,AAACCCACAGTGAGCA-LKTX_190129_01_A01,AAACCCACAGTGAGCA-21L8TX_180927_001_A01,Oligo L2-6 OPALIN FTH1P3,#003a28,120.0,Non-Neuronal,#171799,3.0,Oligo,#2E3E39,17.0,F,#FF7373,1.0,M1,#FF7373,1.0,all,#FF7373,1.0,CS1912131120,#003a28,120.0,Oligo L2-6 OPALIN FTH1P3,#003a28,120.0,,#003a28,120.0,Non-neuron 3,#003a28,120.0,H18.30.001,#FF7373,1.0,nucleus,,,Non-Neuronal,Oligo,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,pBICCNsMMrMOpRMiM006d190320_TTTGGTTCATGAGTAA,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L6 IT,20193.0,5695.0,PassQC,Ex1_5,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,PassQC,215818.0,53.0,L6 IT Sulf1_1,Glutamatergic,L6 IT,3978.0,5218.066616,18064.753140,0.01,0.0,0.99,0.471594,,85.0,GlutamatergicL6 IT,ILX:0770158,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,pBICCNsMMrMOpRMiM006d190320_TTTGGTTTCGCAAGAG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Non-Neuronal,Oligo,2858.0,1602.0,PassQC,Oligo_19,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,PassQC,215820.0,90.0,Oligo Opalin_4,Non-Neuronal,Oligo,16566.0,1740.348364,3419.937704,0.06,0.0,0.94,0.487323,,150.0,Non-NeuronalOligo,ILX:0770140,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000128,na,MmusDv:0000061,PATO:0000384,NCBITaxon:10090,M006,nucleus,oligodendrocyte,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,pBICCNsMMrMOpRMiM006d190320_TTTGTTGAGACTCTTG,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L5 IT,35854.0,7344.0,PassQC,Ex1_3,51.0,L5 IT S100b,Glutamatergic,L5 IT,PassQC,215821.0,51.0,L5 IT S100b,Glutamatergic,L5 IT,8684.0,5464.036043,20579.022920,0.01,0.0,0.99,0.385306,,70.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage
pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,pBICCNsMMrMOpRMiM006d190320_TTTGTTGTCACCTTGC,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Glutamatergic,L5 IT,23493.0,6146.0,PassQC,Ex1_3,52.0,L5 IT Pld5,Glutamatergic,L5 IT,PassQC,215822.0,52.0,L5 IT Pld5,Glutamatergic,L5 IT,3621.0,5680.692074,22718.304610,0.01,0.0,0.99,0.446838,,83.0,GlutamatergicL5 IT,ILX:0770157,EFO:0009922,PATO:0000461,UBERON:0001384,CL:0000679,na,MmusDv:0000061,PATO:0000384,NCBITaxon:10090,M006,nucleus,glutamatergic neuron,10x 3' v3,normal,Mus musculus,male,primary motor cortex,na,early adult stage


In [278]:
m2m_homolog_write.var[['human_homologs_ensembl_ids', 'human_homologs_entrez_ids', 'human_homologs_names','mouse_homologs_ids','mouse_homologs_names']] = m2m_homolog_write.var[['human_homologs_ensembl_ids', 'human_homologs_entrez_ids', 'human_homologs_names','mouse_homologs_ids','mouse_homologs_names']].astype(str)

In [279]:
m2m_homolog_write.var

Unnamed: 0,orthology_type,human_homologs_ensembl_ids,human_homologs_entrez_ids,human_homologs_names,human_ensembl_id,human_entrez_id,human_gene_name,human_long_gene_name,human_chromosome,mouse_homologs_ids,mouse_homologs_names,mouse_ensembl_id,mouse_gene_name
0,ortholog_many2many,"['ENSG00000169800', 'ENSG00000226941', 'ENSG00...","[378949.0, 159163.0, 378950.0, 378951.0, 37894...","['RBMY1D', 'RBMY1F', 'RBMY1E', 'RBMY1J', 'RBMY...",ENSG00000169800,159163.0,RBMY1F,"RNA binding motif protein, Y-linked, family 1,...",Y,"['ENSMUSG00000091987', 'ENSMUSG00000093918', '...","['Gm21704', 'Gm10256', 'Gm29289', 'Gm3376', 'G...",ENSMUSG00000091987,Gm10352
1,ortholog_many2many,"['ENSG00000153779', 'ENSG00000176679']","[90655.0, 90316.0]","['TGIF2LY', 'TGIF2LX']",ENSG00000153779,90316.0,TGIF2LX,"TGFB-induced factor homeobox 2-like, X-linked",X,"['ENSMUSG00000100133', 'ENSMUSG00000100194']","['Tgif2lx1', 'Tgif2lx2']",ENSMUSG00000100133,Tgif2lx1
2,ortholog_many2many,"['ENSG00000089012', 'ENSG00000101307', 'ENSG00...","[55423.0, 10326.0, 140885.0]","['SIRPG', 'SIRPB1', 'SIRPA']",ENSG00000089012,55423.0,SIRPG,signal-regulatory protein gamma,20,"['ENSMUSG00000037902', 'ENSMUSG00000074677', '...","['Sirpb1b', 'Sirpb1c', 'Gm5150', 'Gm9733', 'Si...",ENSMUSG00000037902,Sirpa
3,ortholog_many2many,"['ENSG00000164816', 'ENSG00000164821', 'ENSG00...","[1669.0, 1671.0, 1667.0, 728358.0, 1668.0, 167...","['DEFA4', 'DEFA6', 'DEFA1', 'DEFA1B', 'DEFA3',...",ENSG00000164816,1670.0,DEFA5,"defensin, alpha 5, Paneth cell-specific",8,"['ENSMUSG00000058618', 'ENSMUSG00000060070', '...","['Defa24', 'Defa5', 'Defa27', 'Defa22', 'Defa2...",ENSMUSG00000058618,Defa39
4,ortholog_many2many,"['ENSG00000088782', 'ENSG00000176782', 'ENSG00...","[140850.0, 400830.0, 245911.0, 245934.0, 61321...","['DEFB127', 'DEFB132', 'DEFB108B', 'DEFB121', ...",ENSG00000088782,140850.0,DEFB127,"defensin, beta 127",20,"['ENSMUSG00000027468', 'ENSMUSG00000048500', '...","['Defb22', 'Defb15', 'Defb34']",ENSMUSG00000027468,Defb22
...,...,...,...,...,...,...,...,...,...,...,...,...,...
124,ortholog_many2many,['ENSG00000143149'],[223.0],['ALDH9A1'],ENSG00000143149,223.0,ALDH9A1,"aldehyde dehydrogenase 9 family, member A1",1,['ENSMUSG00000026687'],['Aldh9a1'],ENSMUSG00000026687,Aldh9a1
125,ortholog_many2many,"['ENSG00000274736', 'ENSG00000275718']","[6359.0, 6368.0]","['CCL15', 'CCL23']",ENSG00000274736,6368.0,CCL23,chemokine (C-C motif) ligand 23,17,"['ENSMUSG00000018927', 'ENSMUSG00000019122']","['Ccl9', 'Ccl6']",ENSMUSG00000018927,Ccl6
126,ortholog_many2many,"['ENSG00000212124', 'ENSG00000212126', 'ENSG00...","[259296.0, 259295.0, 259294.0, 259290.0, 25929...","['TAS2R50', 'TAS2R20', 'TAS2R19', 'TAS2R31', '...",ENSG00000212124,259294.0,TAS2R19,"taste receptor, type 2, member 19",12,"['ENSMUSG00000053217', 'ENSMUSG00000059382']","['Tas2r120', 'Tas2r136']",ENSMUSG00000053217,Tas2r136
127,ortholog_many2many,"['ENSG00000072694', 'ENSG00000143226']","[2213.0, 2212.0]","['FCGR2B', 'FCGR2A']",ENSG00000072694,2213.0,FCGR2B,"Fc fragment of IgG, low affinity IIb, receptor...",1,"['ENSMUSG00000026656', 'ENSMUSG00000059498']","['Fcgr2b', 'Fcgr3']",ENSMUSG00000026656,Fcgr2b


In [280]:
m2m_homolog_write.write_h5ad(os.path.join(path_data,'human_mouse','m2m_homolog_human_mouse_'+str(date.today())+'.h5ad'),compression='gzip')