# Accessing 10x RNA-seq gene expression data

In [114]:
import pandas as pd
from pathlib import Path
import numpy as np
import anndata

from abc_atlas_access.abc_atlas_cache.abc_project_cache import AbcProjectCache
from abc_atlas_access.abc_atlas_cache.anndata_utils import get_gene_data

We will interact with the data using the **AbcProjectCache**. 

**Change the download_base variable to where you have downloaded the data in your system.**

In [115]:
download_base = Path('../../data/abc_atlas')
abc_cache = AbcProjectCache.from_cache_dir(download_base)

abc_cache.current_manifest

type.compare_manifests('releases/20250531/manifest.json', 'releases/20251031/manifest.json')
To load another version of the dataset, run
type.load_manifest('releases/20251031/manifest.json')


'releases/20250531/manifest.json'

In [117]:
type.load_manifest('releases/20251031/manifest.json')


AttributeError: type object 'type' has no attribute 'load_manifest'

## Load the metadata

In [38]:
# Load the cell metadata.
cell = abc_cache.get_metadata_dataframe(
    directory='Zeng-Aging-Mouse-10Xv3',
    file_name='cell_metadata',
    dtype={'cell_label': str,
           'wmb_cluster_alias': 'Int64'}
)
cell.set_index('cell_label', inplace=True)

cell_colors = abc_cache.get_metadata_dataframe(
    directory='Zeng-Aging-Mouse-10Xv3',
    file_name='cell_annotation_colors'
).set_index('cell_label')

cluster_info = abc_cache.get_metadata_dataframe(
    directory='Zeng-Aging-Mouse-10Xv3',
    file_name='cluster'
).set_index('cluster_alias')

cell_cluster_mapping = abc_cache.get_metadata_dataframe(
    directory='Zeng-Aging-Mouse-WMB-taxonomy',
    file_name='cell_cluster_mapping_annotations'
).set_index('cell_label')
cell_cluster_mapping.head()

# Join on the cell_label index.
cell_extended = cell.join(cell_cluster_mapping, rsuffix='_cl_map')
cell_extended = cell_extended.join(cell_colors, rsuffix='_cl_colors')
# Join the cluster information in by joining on the Aging dataset's cluster_alias column.
cell_extended = cell_extended.join(cluster_info, on='cluster_alias', rsuffix='_cl_info')

# Quick run through to drop duplicated columns
drop_cols = []
for col in cell_extended.columns:
    if col.endswith(('_cl_map', '_cl_colors', '_cl_info')):
        drop_cols.append(col)
cell_extended.drop(columns=drop_cols, inplace=True)

# The dataset is sorted on cell_label by default, this causes some plotting weirdness
# due to all "adult" cells being first in the order. Below we scrabble the DataFrame
# to better reproduce plots from the paper.
cell_extended = cell_extended.sample(frac=1, random_state=12345)

# del cell
del cell_colors
del cell_cluster_mapping
del cluster_info

cell_extended.head()

Unnamed: 0_level_0,cell_barcode,gene_count,umi_count,doublet_score,x,y,cluster_alias,cell_in_wmb_study,wmb_cluster_alias,library_label,...,proportion_adult_cells,proportion_aged_cells,odds_ratio,log2_odds_ratio,cluster_age_bias,max_region_of_interest_color,cluster_age_bias_color,neurotransmitter_combined_label,neurotransmitter_label,neurotransmitter_color
cell_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCCTGTTGTGAATTAG-135_B01,GCCTGTTGTGAATTAG,6777,37834.0,0.2,-1.185296,1.881103,278,True,1079.0,L8TX_190716_01_D07,...,0.577949,0.422051,1.013716,0.019653,unassigned,#80CDF8,#DADEDF,GABA,GABA,#FF3358
TCCGAAAGTGAAGCGT-761_A04,TCCGAAAGTGAAGCGT,3396,9585.0,0.030303,14.007499,-0.220336,817,False,,L8TX_210805_01_H01,...,0.454494,0.545506,1.046448,0.0655,unassigned,#8599CC,#DADEDF,No-NT,No-NT,#666666
TTACCATGTCGTGGTC-327_A06,TTACCATGTCGTGGTC,4294,10527.0,0.02,6.550781,-1.378886,804,False,,L8TX_200813_01_H10,...,0.37057,0.62943,1.741208,0.800089,unassigned,#80C0E2,#DADEDF,No-NT,No-NT,#666666
ACGGGTCGTACGAGCA-385_D06,ACGGGTCGTACGAGCA,2943,6957.0,0.0,13.650183,0.289086,817,False,,L8TX_201008_01_A12,...,0.454494,0.545506,1.046448,0.0655,unassigned,#8599CC,#DADEDF,No-NT,No-NT,#666666
CTTAGGATCTGTCCCA-301_B04,CTTAGGATCTGTCCCA,7907,47406.0,0.037037,6.583878,-10.639808,152,False,,L8TX_200723_01_B10,...,0.403409,0.596591,0.587232,-0.767999,unassigned,#72D569,#DADEDF,Glut,Glut,#2B93DF


In [94]:
print('Column names in cell:', cell_extended.columns.to_list())
cell_extended['class_name'].value_counts()

Column names in cell: ['cell_barcode', 'gene_count', 'umi_count', 'doublet_score', 'x', 'y', 'cluster_alias', 'cell_in_wmb_study', 'wmb_cluster_alias', 'library_label', 'alignment_job_id', 'library_method', 'barcoded_cell_sample_label', 'enrichment_population', 'region_of_interest_label', 'anatomical_division_label', 'library_in_wmb_study', 'donor_label', 'population_sampling', 'donor_genotype', 'donor_sex', 'donor_age', 'donor_age_category', 'donor_in_wmb_study', 'feature_matrix_label', 'dataset_label', 'abc_sample_id', 'cluster_label', 'cluster_order', 'cluster_name', 'cluster_color', 'class_name', 'subclass_name', 'supertype_name', 'class_color', 'subclass_color', 'supertype_color', 'anatomical_division_color', 'anatomical_division_order', 'donor_age_category_color', 'donor_age_category_order', 'donor_sex_color', 'donor_sex_order', 'region_of_interest_color', 'region_of_interest_order', 'number_of_cells', 'max_region_of_interest_label', 'proportion_max_region_of_interest_label', 'nu

class_name
30 Astro-Epen        266372
31 OPC-Oligo         253468
01 IT-ET Glut        166299
33 Vascular           84721
34 Immune             72736
02 NP-CT-L6b Glut     58901
19 MB Glut            48744
09 CNU-LGE GABA       42802
20 MB GABA            23894
26 P GABA             23821
06 CTX-CGE GABA       19256
07 CTX-MGE GABA       19232
23 P Glut             17158
12 HY GABA            15960
14 HY Glut            14508
04 DG-IMN Glut        11984
21 MB Dopa             5544
11 CNU-HYa GABA        3735
16 HY MM Glut          3345
22 MB-HB Sero          2898
08 CNU-MGE GABA        2126
03 OB-CR Glut          1761
13 CNU-HYa Glut        1399
24 MY Glut             1077
05 OB-IMN GABA          824
Name: count, dtype: int64

In [113]:
cell_extended['region_of_interest_label'].value_counts()

region_of_interest_label
MB - PAG-RAmb                  143187
HPF - ENT                       89730
HY - HY                         82431
HPF - HIP                       78199
Iscortex  - ACA                 76283
P - Pmot/sat-post               70309
Isocortex  - AI                 70288
P - Pmot/sat-ant                70031
Isocortex - RSP                 69541
MB - VTA-SN                     69333
HPF - PAR-POST-PRE-SUB-ProS     69056
PAL - PAL                       66139
Isocortex - PL-ILA-ORB          64690
STR - STRv                      58034
STR - sAMY                      55600
STR - STRd                      29714
Name: count, dtype: int64

In [82]:
# get gene name and symbol information

# select only a few genes
gene_names = ['Aqp4', 'Gpc5', 'Dscaml1', 'Sox4', 'Myoc', 'Siah3', 'Adgrv1', 'Emx2', 'Brinp3', 'Ddn', 'Sfrp1']
gene = abc_cache.get_metadata_dataframe(directory='WMB-10X', file_name='gene').set_index('gene_identifier')
gene.query("gene_symbol in @gene_names")
# gene[gene['gene_symbol'].isin(gene_names)]
# gene.head()


Unnamed: 0_level_0,gene_symbol,name,mapped_ncbi_identifier,comment
gene_identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ENSMUSG00000035131,Brinp3,bone morphogenetic protein/retinoic acid induc...,NCBIGene:215378,
ENSMUSG00000026697,Myoc,myocilin,NCBIGene:17926,
ENSMUSG00000031548,Sfrp1,secreted frizzled-related protein 1,NCBIGene:20377,
ENSMUSG00000032087,Dscaml1,DS cell adhesion molecule like 1,NCBIGene:114873,
ENSMUSG00000076431,Sox4,SRY (sex determining region Y)-box 4,NCBIGene:20677,
ENSMUSG00000069170,Adgrv1,adhesion G protein-coupled receptor V1,NCBIGene:110789,
ENSMUSG00000091722,Siah3,siah E3 ubiquitin protein ligase family member 3,NCBIGene:380918,
ENSMUSG00000022112,Gpc5,glypican 5,NCBIGene:103978,
ENSMUSG00000059213,Ddn,dendrin,NCBIGene:13199,
ENSMUSG00000024411,Aqp4,aquaporin 4,NCBIGene:11829,


### Loading specific genes from the data


In [None]:
# define genes
gene_names = ['Aqp4', 'Gpc5', 'Dscaml1', 'Sox4', 'Myoc', 'Siah3', 'Adgrv1', 'Emx2', 'Brinp3', 'Ddn', 'Sfrp1']

# define cell subgroup
hippo_cells = cell_extended[cell_extended['region_of_interest_label'] == 'HPF - HIP']
cortex_cells = cell_extended.query("region_of_interest_label  == 'Iscortex  - ACA' & class_name ==  '30 Astro-Epen'")
example_cell = cell_extended[cell_extended['cell_barcode'] == 'AAACCCAAGAATAACC'] # one example cell

# read the expression matrix
gene_data = get_gene_data(
    abc_atlas_cache=abc_cache,
    all_cells=cortex_cells,
    all_genes=gene,
    selected_genes=gene_names
)
gene_data[pd.notna(gene_data[gene_data.columns[0]])]

loading file: Zeng-Aging-Mouse-10Xv3


In [106]:
?get_gene_data

[1;31mSignature:[0m
[0mget_gene_data[0m[1;33m([0m[1;33m
[0m    [0mabc_atlas_cache[0m[1;33m:[0m [0mabc_atlas_access[0m[1;33m.[0m[0mabc_atlas_cache[0m[1;33m.[0m[0mabc_project_cache[0m[1;33m.[0m[0mAbcProjectCache[0m[1;33m,[0m[1;33m
[0m    [0mall_cells[0m[1;33m:[0m [0mpandas[0m[1;33m.[0m[0mcore[0m[1;33m.[0m[0mframe[0m[1;33m.[0m[0mDataFrame[0m[1;33m,[0m[1;33m
[0m    [0mall_genes[0m[1;33m:[0m [0mpandas[0m[1;33m.[0m[0mcore[0m[1;33m.[0m[0mframe[0m[1;33m.[0m[0mDataFrame[0m[1;33m,[0m[1;33m
[0m    [0mselected_genes[0m[1;33m:[0m [0mList[0m[1;33m[[0m[0mstr[0m[1;33m][0m[1;33m,[0m[1;33m
[0m    [0mdata_type[0m[1;33m:[0m [0mstr[0m [1;33m=[0m [1;34m'log2'[0m[1;33m,[0m[1;33m
[0m    [0mchunk_size[0m[1;33m:[0m [0mint[0m [1;33m=[0m [1;36m8192[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Load expression matrix data from the ABC Atlas and extract data for


In [105]:
gene_data

gene_symbol,Brinp3,Myoc,Sfrp1,Dscaml1,Sox4,Adgrv1,Siah3,Gpc5,Ddn,Aqp4,Emx2
cell_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
GTACAGTGTATCGCTA-310_A05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.137723,0.0,8.44112,0.0
AGGTAGGGTCTCCTGT-316_C03,9.503889,0.0,0.0,6.517732,0.0,0.0,0.0,12.961513,0.0,7.509838,0.0
TGATGGTAGCTACTAC-393_C05,9.009558,0.0,0.0,0.0,0.0,0.0,0.0,12.984214,7.017931,9.330926,0.0
TTCGATTAGATGCTTC-316_A03,8.81147,0.0,0.0,0.0,0.0,0.0,0.0,11.978538,0.0,7.814678,6.821072
TTGTGGATCCTTCACG-314_B01,9.674965,0.0,0.0,0.0,7.0988,0.0,0.0,12.410429,0.0,7.0988,8.093528
...,...,...,...,...,...,...,...,...,...,...,...
AGATGCTTCTCGTTTA-310_A05,10.977281,0.0,0.0,0.0,6.81947,0.0,0.0,12.361503,0.0,8.809857,6.81947
TACTTGTAGCTACAAA-393_C05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.29075,7.837065,11.079336,7.837065
CAACCTCAGCTCGTGC-352_A04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.300072,0.0,9.132433,8.548756
AGATCCAAGACCTCCG-393_D05,8.950899,0.0,0.0,0.0,7.371756,0.0,0.0,13.021403,0.0,0.0,7.371756


The returned DataFrame is indexed by ``cell_label`` and can thus be joined with the ``cell`` DataFrame for further analysis.

In [107]:
cells_with_genes = cell_extended.join(gene_data)
cells_with_genes

Unnamed: 0_level_0,cell_barcode,gene_count,umi_count,doublet_score,x,y,cluster_alias,cell_in_wmb_study,wmb_cluster_alias,library_label,...,Myoc,Sfrp1,Dscaml1,Sox4,Adgrv1,Siah3,Gpc5,Ddn,Aqp4,Emx2
cell_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
GCCTGTTGTGAATTAG-135_B01,GCCTGTTGTGAATTAG,6777,37834.0,0.200000,-1.185296,1.881103,278,True,1079,L8TX_190716_01_D07,...,,,,,,,,,,
TCCGAAAGTGAAGCGT-761_A04,TCCGAAAGTGAAGCGT,3396,9585.0,0.030303,14.007499,-0.220336,817,False,,L8TX_210805_01_H01,...,,,,,,,,,,
TTACCATGTCGTGGTC-327_A06,TTACCATGTCGTGGTC,4294,10527.0,0.020000,6.550781,-1.378886,804,False,,L8TX_200813_01_H10,...,,,,,,,,,,
ACGGGTCGTACGAGCA-385_D06,ACGGGTCGTACGAGCA,2943,6957.0,0.000000,13.650183,0.289086,817,False,,L8TX_201008_01_A12,...,,,,,,,,,,
CTTAGGATCTGTCCCA-301_B04,CTTAGGATCTGTCCCA,7907,47406.0,0.037037,6.583878,-10.639808,152,False,,L8TX_200723_01_B10,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AAGACAAAGAGCCCAA-337_B06,AAGACAAAGAGCCCAA,1767,2909.0,0.000000,-2.582976,17.539478,833,False,,L8TX_200820_01_E01,...,,,,,,,,,,
TGTACAGTCCAGCCTT-314_B01,TGTACAGTCCAGCCTT,1640,2716.0,0.012658,-4.430760,17.868887,835,False,,L8TX_200810_01_A08,...,,,,,,,,,,
CACTTCGCACAGCTTA-324_D03,CACTTCGCACAGCTTA,7643,40484.0,0.130000,-11.713069,1.782393,242,False,,L8TX_200813_01_F08,...,,,,,,,,,,
TTCATTGCACTAACGT-696.1_B03,TTCATTGCACTAACGT,8523,46961.0,0.200000,-13.516253,8.201415,555,False,,L8TX_210624_01_H03,...,,,,,,,,,,
