This notebook maps the cell labels of each data set to coarser, higher-level cell types in the cell ontology and writes the relabelled data back to the original `.h5ad` files.

If you downloaded the data sets from file.biolab.si/tsne-embedding, you do **not** need to run this notebook.

In [1]:
from os import path

import openTSNE

import anndata
import numpy as np
import pandas as pd
import scanpy as sc
import string

import sys

# Make the shared `utils` helper module (expected in ../notebooks) importable.
# Only the containing directory belongs on sys.path: entries that point at a
# plain .py file are ignored by the import system, so the directory entry alone
# is what lets `import utils` succeed.
sys.path.append(path.join("..", "notebooks"))
import utils

import matplotlib.pyplot as plt
%matplotlib inline

# Hrvatin

In [2]:
# Location of the Hrvatin (2018) data set; it is read below and later
# overwritten in place with the relabelled annotations.
fname = path.join("..", "data", "hrvatin_2018.h5ad")

In [3]:
# Load the Hrvatin data set as an AnnData object.
adata = anndata.read_h5ad(fname)

In [4]:
# Cell ontology terms grouped by the coarse class they are collapsed into.
# Terms that are not listed here keep their original label.
coarse_label_groups = {
    "Neuron": [
        "neuron of cerebral cortex",
        "CNS interneuron",
        "hippocampal neuron",
    ],
    "Microglia": ["microglial cell"],
    "Oligodendrocyte": ["oligodendrocyte"],
    "OPC": ["oligodendrocyte precursor cell"],
    "Astrocyte": ["astrocyte"],
    "Endothelial cell": ["endothelial cell"],
    "Pericyte": ["pericyte cell"],
    "Macrophage": ["macrophage"],
    "Muscle cell": ["smooth muscle cell"],
}

# Flatten into the {ontology term: coarse class} lookup that `replace` expects.
term_to_coarse = {
    term: coarse
    for coarse, terms in coarse_label_groups.items()
    for term in terms
}

adata.obs["labels"] = adata.obs["labels"].replace(term_to_coarse)

# Show how many cells ended up in each coarse class.
adata.obs["labels"].value_counts()

Neuron              15223
Microglia           10158
Oligodendrocyte      8630
Astrocyte            7039
Endothelial cell     3450
OPC                  1826
Pericyte              782
Muscle cell           621
Macrophage            537
Name: labels, dtype: int64

In [5]:
# Display the AnnData summary (dimensions plus the obs/uns fields) as a sanity
# check before the file is written back.
adata

AnnData object with n_obs × n_vars = 48266 × 25186 
    obs: 'labels', 'batch_id', 'paper_cell_type', 'paper_cell_subtype'
    uns: 'name', 'organism', 'tissue', 'year'

In [6]:
# Persist the relabelled annotations. `fname` is the same path the data was
# read from, so this overwrites the original file.
adata.write_h5ad(fname)

... storing 'labels' as categorical


# Chen

In [7]:
# Location of the Chen (2017) data set; it is read below and later overwritten
# in place with the relabelled annotations.
fname = path.join("..", "data", "chen_2017.h5ad")

In [8]:
# Load the Chen data set as an AnnData object. It is kept in `new` so that
# `adata` (Hrvatin) stays available for the comparison further down.
new = anndata.read_h5ad(fname)

In [9]:
# Every neuronal subtype is collapsed into one "Neuron" class.
neuron_terms = [
    "neuron of cerebral cortex",
    "GABAergic neuron",
    "glutamatergic neuron",
    "histaminergic neuron",
    "CNS interneuron",
    "hippocampal neuron",
]

# {cell ontology term: coarse class}. Labels without an entry here are left
# unchanged by `replace`.
chen_label_map = {
    **dict.fromkeys(neuron_terms, "Neuron"),
    "microglial cell": "Microglia",
    "oligodendrocyte": "Oligodendrocyte",
    "oligodendrocyte precursor cell": "OPC",
    "astrocyte": "Astrocyte",
    "endothelial cell": "Endothelial cell",
    "pericyte cell": "Pericyte",
    "macrophage": "Macrophage",
    "smooth muscle cell": "Muscle cell",
}

new.obs["labels"] = new.obs["labels"].replace(chen_label_map)

# Show how many cells ended up under each label.
new.obs["labels"].value_counts()

Oligodendrocyte    3541
unknown            2531
Neuron             2315
OPC                1792
epithelial cell    1197
Astrocyte          1148
Microglia           724
tanycyte            609
ependymal cell      413
Macrophage          167
Name: labels, dtype: int64

In [10]:
# Display the AnnData summary (dimensions plus the obs/uns fields) as a sanity
# check before the file is written back.
new

AnnData object with n_obs × n_vars = 14437 × 23284 
    obs: 'labels', 'paper_celltype', 'batch_id'
    uns: 'name', 'organism', 'tissue', 'year'

In [11]:
# Persist the relabelled annotations. `fname` is the same path the data was
# read from, so this overwrites the original file.
new.write_h5ad(fname)

... storing 'labels' as categorical


In [12]:
# Compare per-label cell counts of Hrvatin (`adata`) and Chen (`new`) side by
# side; a missing value means the label does not occur in that data set.
# NOTE: relies on `adata` still holding the Hrvatin data -- it is reassigned in
# the Baron section below, so run the cells in order.
utils.cell_type_counts([adata, new])

Unnamed: 0,hrvatin_2018,chen_2017
Astrocyte,7039.0,1148.0
Endothelial cell,3450.0,
Macrophage,537.0,167.0
Microglia,10158.0,724.0
Muscle cell,621.0,
Neuron,15223.0,2315.0
OPC,1826.0,1792.0
Oligodendrocyte,8630.0,3541.0
Pericyte,782.0,
ependymal cell,,413.0


# Baron

In [13]:
# Location of the Baron (2016) data set; it is read below and later overwritten
# in place with the relabelled annotations.
fname = path.join("..", "data", "baron_2016h.h5ad")

In [14]:
# Load the Baron data set as an AnnData object. This rebinds `adata`, so the
# Hrvatin data loaded earlier is no longer reachable under that name.
adata = anndata.read_h5ad(fname)

In [15]:
# Cell ontology terms that get a dedicated coarse class. Epsilon cells are
# deliberately folded into "Other" rather than kept as "Epsilon cells".
named_classes = {
    "type B pancreatic cell": "Beta cells",
    "pancreatic A cell": "Alpha cells",
    "pancreatic D cell": "Delta cells",
    "pancreatic PP cell": "PP cells",
    "pancreatic epsilon cell": "Other",
    "pancreatic ductal cell": "Ductal cells",
    "pancreatic acinar cell": "Acinar cells",
    "pancreatic stellate cell": "PaSC",  # PaSC = pancreatic stellate cell
    "endothelial cell": "Endothelial cell",
}

# The remaining cell types are lumped together into a single "Other" class.
lumped_terms = ["macrophage", "mast cell", "Schwann cell", "T cell"]

pancreas_label_map = {**named_classes, **dict.fromkeys(lumped_terms, "Other")}

adata.obs["labels"] = adata.obs["labels"].replace(pancreas_label_map)

# Show how many cells ended up in each coarse class.
adata.obs["labels"].value_counts()

Beta cells          2525
Alpha cells         2326
Ductal cells        1077
Acinar cells         958
Delta cells          601
PaSC                 457
PP cells             255
Endothelial cell     252
Other                118
Name: labels, dtype: int64

In [16]:
# Display the AnnData summary (dimensions plus the obs/uns fields) as a sanity
# check before the file is written back.
adata

AnnData object with n_obs × n_vars = 8569 × 20125 
    obs: 'batch_id', 'labels'
    uns: 'name', 'organism', 'tissue', 'year'

In [17]:
# Persist the relabelled annotations. `fname` is the same path the data was
# read from, so this overwrites the original file.
adata.write_h5ad(fname)

... storing 'labels' as categorical


# Xin

In [18]:
# Location of the Xin (2016) data set; it is read below and later overwritten
# in place with the relabelled annotations.
fname = path.join("..", "data", "xin_2016.h5ad")

In [19]:
# Load the Xin data set as an AnnData object. It is kept in `new` so that
# `adata` (Baron) stays available for the comparison further down.
new = anndata.read_h5ad(fname)

In [20]:
# Cell ontology terms that get a dedicated coarse class. Epsilon cells are
# deliberately folded into "Other" rather than kept as "Epsilon cells".
named_classes = {
    "type B pancreatic cell": "Beta cells",
    "pancreatic A cell": "Alpha cells",
    "pancreatic D cell": "Delta cells",
    "pancreatic PP cell": "PP cells",
    "pancreatic epsilon cell": "Other",
    "pancreatic ductal cell": "Ductal cells",
    "pancreatic acinar cell": "Acinar cells",
    "pancreatic stellate cell": "PaSC",  # PaSC = pancreatic stellate cell
    "endothelial cell": "Endothelial cell",
}

# The remaining cell types are lumped together into a single "Other" class.
lumped_terms = ["macrophage", "mast cell", "Schwann cell", "T cell"]

pancreas_label_map = {**named_classes, **dict.fromkeys(lumped_terms, "Other")}

new.obs["labels"] = new.obs["labels"].replace(pancreas_label_map)

# Show how many cells ended up in each coarse class.
new.obs["labels"].value_counts()

Alpha cells    886
Beta cells     472
PP cells        85
Delta cells     49
Name: labels, dtype: int64

In [21]:
# Display the AnnData summary (dimensions plus the obs/uns fields) as a sanity
# check before the file is written back.
new

AnnData object with n_obs × n_vars = 1492 × 39851 
    obs: 'batch_id', 'disease', 'age', 'sex', 'labels'
    uns: 'name', 'organism', 'tissue', 'year'

In [22]:
# Persist the relabelled annotations. `fname` is the same path the data was
# read from, so this overwrites the original file.
new.write_h5ad(fname)

... storing 'labels' as categorical


In [23]:
# Compare per-label cell counts of Baron (`adata`) and Xin (`new`) side by
# side; a missing value means the label does not occur in that data set.
# NOTE: relies on `adata` still holding the Baron data -- it is reassigned in
# the Macosko section below, so run the cells in order.
utils.cell_type_counts([adata, new])

Unnamed: 0,baron_2016h,xin_2016
Acinar cells,958,
Alpha cells,2326,886.0
Beta cells,2525,472.0
Delta cells,601,49.0
Ductal cells,1077,
Endothelial cell,252,
Other,118,
PP cells,255,85.0
PaSC,457,


# Macosko

In [24]:
# Location of the Macosko (2015) data set; it is read below and later
# overwritten in place with the relabelled annotations.
fname = path.join("..", "data", "macosko_2015.h5ad")

In [25]:
# Load the Macosko data set as an AnnData object. This rebinds `adata`, so the
# Baron data loaded earlier is no longer reachable under that name.
adata = anndata.read_h5ad(fname)

In [26]:
# (cell ontology term, display label) pairs for the retinal cell types.
# Labels without an entry here are left unchanged by `replace`.
retina_label_pairs = [
    ("retinal rod cell", "Rods"),
    ("retinal bipolar neuron", "Bipolar cells"),
    ("amacrine cell", "Amacrine cells"),
    ("retinal cone cell", "Cones"),
    ("Mueller cell", "Muller glia"),
    ("retinal ganglion cell", "Retinal ganglion cells"),
    ("endothelial cell", "Vascular endothelium"),
    ("retina horizontal cell", "Horizontal cells"),
    ("fibroblast", "Fibroblasts"),
    ("microglial cell", "Microglia"),
    ("pericyte cell", "Pericytes"),
    ("astrocyte", "Astrocytes"),
]

retina_label_map = dict(retina_label_pairs)
adata.obs["labels"] = adata.obs["labels"].replace(retina_label_map)

# Show how many cells ended up under each label.
adata.obs["labels"].value_counts()

Rods                      29400
Bipolar cells              6285
Amacrine cells             4426
Cones                      1868
Muller glia                1624
Retinal ganglion cells      432
Vascular endothelium        252
Horizontal cells            252
Fibroblasts                  85
Microglia                    67
Pericytes                    63
Astrocytes                   54
Name: labels, dtype: int64

In [27]:
# Display the AnnData summary (dimensions plus the obs/uns fields) as a sanity
# check before the file is written back.
adata

AnnData object with n_obs × n_vars = 44808 × 24658 
    obs: 'batch_id', 'cluster_id', 'labels'
    uns: 'name', 'organism', 'tissue', 'year'

In [28]:
# Persist the relabelled annotations. `fname` is the same path the data was
# read from, so this overwrites the original file.
adata.write_h5ad(fname)

... storing 'labels' as categorical


# Shekhar

In [29]:
# Location of the Shekhar (2016) data set; it is read below and later
# overwritten in place with the relabelled annotations.
fname = path.join("..", "data", "shekhar_2016.h5ad")

In [30]:
# Load the Shekhar data set as an AnnData object. It is kept in `new` so that
# `adata` (Macosko) stays available for the comparison further down.
new = anndata.read_h5ad(fname)

In [31]:
# Work on plain strings so the substring match and assignment below do not
# depend on the dtype the column was stored with.
new.obs["labels"] = new.obs["labels"].astype(str)

# Fold every label that contains "bipolar cell" into the single
# "retinal bipolar neuron" term, so the mapping below handles them uniformly.
# The assignment goes through `.loc` on the obs frame itself: chained indexing
# (`new.obs["labels"][mask] = ...`) raises SettingWithCopyWarning and does not
# update the frame at all when pandas copy-on-write is active.
is_bipolar = new.obs["labels"].str.contains("bipolar cell", regex=False)
new.obs.loc[is_bipolar, "labels"] = "retinal bipolar neuron"

# {cell ontology term: display label}. Labels without an entry here are left
# unchanged by `replace`.
new.obs["labels"] = new.obs["labels"].replace({
    "retinal rod cell": "Rods",
    "retinal bipolar neuron": "Bipolar cells",
    "amacrine cell": "Amacrine cells",
    "retinal cone cell": "Cones",
    "Mueller cell": "Muller glia",
    "retinal ganglion cell": "Retinal ganglion cells",
    "endothelial cell": "Vascular endothelium",
    "retina horizontal cell": "Horizontal cells",
    "fibroblast": "Fibroblasts",
    "microglial cell": "Microglia",
    "pericyte cell": "Pericytes",
    "astrocyte": "Astrocytes",
})

# Show how many cells ended up under each label.
new.obs["labels"].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Bipolar cells     23494
Muller glia        2945
Amacrine cells      252
Rods                 91
Cones                48
Name: labels, dtype: int64

In [32]:
# Display the AnnData summary (dimensions plus the obs/uns fields) as a sanity
# check before the file is written back.
new

AnnData object with n_obs × n_vars = 26830 × 24903 
    obs: 'batch_id', 'cluster_id', 'labels', 'subclusters'
    uns: 'name', 'organism', 'tissue', 'year'

In [33]:
# Persist the relabelled annotations. `fname` is the same path the data was
# read from, so this overwrites the original file.
new.write_h5ad(fname)

... storing 'labels' as categorical


In [34]:
# Compare per-label cell counts of Macosko (`adata`) and Shekhar (`new`) side
# by side; a missing value means the label does not occur in that data set.
utils.cell_type_counts([adata, new])

Unnamed: 0,macosko_2015,shekhar_2016
Amacrine cells,4426,252.0
Astrocytes,54,
Bipolar cells,6285,23494.0
Cones,1868,48.0
Fibroblasts,85,
Horizontal cells,252,
Microglia,67,
Muller glia,1624,2945.0
Pericytes,63,
Retinal ganglion cells,432,
