In [1]:
import os
import numpy as np
import pandas as pd
import scanpy.api as sc

  from ._conv import register_converters as _register_converters


In [2]:
# Load the per-cell annotation table. The cell names are turned into bytes
# objects so they can be matched against the byte-string obs_names that
# read_Tm_10x builds.
anno = pd.read_csv( filepath_or_buffer="annotations_droplets.csv" )
anno["cell"] = [ bytes( name, "UTF8" ) for name in anno["cell"] ]
anno.head( n=5 )

Unnamed: 0,cell,tissue,cell_ontology_class,cell_ontology_term_iri,cell_ontology_id
0,b'10X_P4_3_AAAGTAGAGATGCCAG',Bladder,mesenchymal cell,http://purl.obolibrary.org/obo/CL_0008019,CL:0008019
1,b'10X_P4_3_AACCGCGTCCAACCAA',Bladder,mesenchymal cell,http://purl.obolibrary.org/obo/CL_0008019,CL:0008019
2,b'10X_P4_3_AACTCCCGTCGGGTCT',Bladder,mesenchymal cell,http://purl.obolibrary.org/obo/CL_0008019,CL:0008019
3,b'10X_P4_3_AACTCTTAGTTGCAGG',Bladder,bladder cell,http://purl.obolibrary.org/obo/CL_1001319,CL:1001319
4,b'10X_P4_3_AACTCTTTCATAACCG',Bladder,mesenchymal cell,http://purl.obolibrary.org/obo/CL_0008019,CL:0008019


In [3]:
# This function reads one subdirectory of the "droplet" data directory of the
# Tabula muris data, i.e., one "channel".

def read_Tm_10x( dirname, path = "droplet" ):
    """Read one 10x channel directory and attach the cell annotations to it.

    Parameters
    ----------
    dirname : str
        Name of the channel subdirectory, in the format "tissue-channel"
        (e.g. "Bladder-10X_P4_3").
    path : str
        Directory that contains the channel subdirectories.

    Returns
    -------
    The transposed object returned by `sc.read` for "matrix.mtx", restricted
    to the cells listed in the module-level `anno` table. Its `obs` table is
    indexed by the channel-prefixed barcode and carries "tissue", "channel"
    and the annotation columns of `anno` except "cell_ontology_term_iri".

    Raises
    ------
    AssertionError
        If `dirname` is not of the form "tissue-channel", or if the tissue in
        `dirname` disagrees with the tissue recorded in `anno`.
    """

    # The dirname has the format "tissue-channel". Check this before touching
    # any files, and split it only once.
    parts = dirname.split( "-" )
    assert len( parts ) == 2, f"expected a 'tissue-channel' name, got {dirname!r}"
    tissue, channel = parts

    channel_dir = os.path.join( path, dirname )

    # Read the sparse matrix:
    a = sc.read( os.path.join( channel_dir, "matrix.mtx" ) ).transpose()

    # Add the barcodes and gene symbols:
    a.obs_names = pd.read_table( os.path.join( channel_dir, "barcodes.tsv" ), header=None )[0]
    a.var_names = pd.read_table( os.path.join( channel_dir, "genes.tsv" ), header=None )[0]

    # Remove the trailing "-1" label from the cell barcodes and prefix the channel.
    # Only the suffix is stripped, so a "-1" elsewhere in a name is left alone.
    # The names are stored as byte strings because anno["cell"] holds bytes.
    # NOTE(review): 'S32' truncates names longer than 32 bytes without warning.
    a.obs_names = np.fromiter(
        ( channel + "_" + ( s[:-2] if s.endswith( "-1" ) else s ) for s in a.obs_names ),
        'S32' )

    # Look these up in the annotation table and keep only those found.
    # This is necessary because 4 of the dropseq directories contain superfluous barcodes.
    annotated_cells = set( anno["cell"].values )
    is_annotated = np.fromiter( ( s in annotated_cells for s in a.obs_names ), dtype=bool )
    a = a[ is_annotated, ]

    # Store the data from the dirname
    a.obs["tissue"] = tissue
    a.obs["channel"] = channel

    # Merge in the annotation data; the merge drops the index, so it is
    # re-created from the "cell" column of the annotation table.
    a.obs = a.obs.merge( anno, how="left", left_index=True, right_on="cell",
        suffixes=("",".y") ).set_index( "cell" )

    # The tissue from the directory name must agree with the annotated tissue;
    # after that check the duplicate column and the IRI column are dropped.
    assert all( a.obs["tissue"] == a.obs["tissue.y"] ), \
        f"tissue in directory name {dirname!r} disagrees with the annotation table"
    a.obs = a.obs.drop( columns = [ "cell_ontology_term_iri", "tissue.y" ] )

    return a

In [4]:
path10x = "droplet"
# One subdirectory per channel. os.listdir also returns plain files and hidden
# entries (e.g. ".DS_Store", ".ipynb_checkpoints"); those are skipped because
# read_Tm_10x expects "tissue-channel" directory names.
dirs10x = sorted(
    d for d in os.listdir( path10x )
    if os.path.isdir( os.path.join( path10x, d ) ) and not d.startswith( "." ) )
dirs10x

['Bladder-10X_P4_3',
 'Bladder-10X_P4_4',
 'Bladder-10X_P7_7',
 'Heart-10X_P7_4',
 'Kidney-10X_P4_5',
 'Kidney-10X_P4_6',
 'Kidney-10X_P7_5',
 'Liver-10X_P4_2',
 'Liver-10X_P7_0',
 'Liver-10X_P7_1',
 'Lung-10X_P7_8',
 'Lung-10X_P7_9',
 'Lung-10X_P8_12',
 'Lung-10X_P8_13',
 'Mammary-10X_P7_12',
 'Mammary-10X_P7_13',
 'Marrow-10X_P7_2',
 'Marrow-10X_P7_3',
 'Muscle-10X_P7_14',
 'Muscle-10X_P7_15',
 'Spleen-10X_P4_7',
 'Spleen-10X_P7_6',
 'Thymus-10X_P7_11',
 'Tongue-10X_P4_0',
 'Tongue-10X_P4_1',
 'Tongue-10X_P7_10',
 'Trachea-10X_P8_14',
 'Trachea-10X_P8_15']

In [6]:
# Read every channel and stack them into one object, checking on the way that
# all channels share the same gene names in the same order.
assert dirs10x, f"no channel directories found in {path10x!r}"

ac = None
for d in dirs10x:
    a2 = read_Tm_10x( d, path10x )
    if ac is None:
        # First channel: nothing to compare against or concatenate with yet.
        ac = a2
    else:
        # Index.equals returns False on a length mismatch, where an elementwise
        # == would raise instead of failing the assertion.
        assert ac.var_names.equals( a2.var_names ), \
            f"gene names of {d} differ from those of {dirs10x[0]}"
        ac = ac.concatenate( a2, index_unique=None )
    print( f"Read {d}.")
# a2 is always bound here because dirs10x is non-empty, even for one channel.
del a2

ac

Read Bladder-10X_P4_3.
Read Bladder-10X_P4_4.
Read Bladder-10X_P7_7.
Read Heart-10X_P7_4.
Read Kidney-10X_P4_5.
Read Kidney-10X_P4_6.
Read Kidney-10X_P7_5.
Read Liver-10X_P4_2.
Read Liver-10X_P7_0.
Read Liver-10X_P7_1.
Read Lung-10X_P7_8.
Read Lung-10X_P7_9.
Read Lung-10X_P8_12.
Read Lung-10X_P8_13.
Read Mammary-10X_P7_12.
Read Mammary-10X_P7_13.
Read Marrow-10X_P7_2.
Read Marrow-10X_P7_3.
Read Muscle-10X_P7_14.
Read Muscle-10X_P7_15.
Read Spleen-10X_P4_7.
Read Spleen-10X_P7_6.
Read Thymus-10X_P7_11.
Read Tongue-10X_P4_0.
Read Tongue-10X_P4_1.
Read Tongue-10X_P7_10.
Read Trachea-10X_P8_14.
Read Trachea-10X_P8_15.


AnnData object with n_obs × n_vars = 54837 × 23433 
    obs: 'tissue', 'channel', 'cell_ontology_class', 'cell_ontology_id', 'batch'

In [10]:
# Read the per-channel metadata table and preview its first rows
meta = pd.read_csv( filepath_or_buffer="metadata_droplet.csv" )
meta.head( n=5 )

Unnamed: 0,channel,mouse.id,tissue,subtissue,mouse.sex
0,10X_P4_0,3-M-8,Tongue,,M
1,10X_P4_1,3-M-9,Tongue,,M
2,10X_P4_2,3-M-8/9,Liver,hepatocytes,M
3,10X_P4_3,3-M-8,Bladder,,M
4,10X_P4_4,3-M-9,Bladder,,M


In [14]:
# Merge in channel metadata.
# Each channel must occur only once in meta, otherwise the left merge would
# duplicate cells.
assert meta["channel"].is_unique, "duplicate channels in channel metadata"

# Columns that exist in both tables (e.g. "tissue") keep their name on the obs
# side; the copy coming from meta gets the "_meta" suffix.
obs_with_meta = ac.obs.merge( meta, how="left", on="channel", suffixes=("", "_meta") )
assert len( obs_with_meta ) == len( ac.obs )

# A column-on-column merge discards the left index, so the cell names have to
# be restored; otherwise the obs index would become 0..n-1.
obs_with_meta.index = ac.obs.index
ac.obs = obs_with_meta

In [15]:
# Save the combined object to disk
ac.write( filename="Tabula_muris_10x.h5ad" )