In [34]:
import anndata
import scanpy
import pandas as pd

This loads the full body 10x stringently filtered portion of the Fly Cell Atlas. The source is here https://flycellatlas.org/scope. You'll notice the downloads on that page are loom files. In fact, they appear to be standards non-compliant loom files, which caused me much pain in reading them, and I have no idea how I did it. Somehow, I managed to strip a lot of the information and convert it to an h5ad file, which is what we operate on here.

In [2]:
adata_FCA = anndata.io.read_h5ad("/mnt/z/Caelen/snRNAseq_v2s_fca_biohub_all_wo_blood_10x_converted.h5ad")

In [10]:
adata_FCA

AnnData object with n_obs × n_vars = 506660 × 16366
    obs: 'R_annotation', 'R_annotation__ontology_id', 'R_annotation_broad', 'R_annotation_broad__ontology_id', 'age', 'sex', 'tissue', 'batch'
    var: 'gene_names'

This is to differentiate these cells from ours when we integrate them later.

In [19]:
adata_FCA.obs["study"] = "FCA"

Our samples are only females, so we subset the FCA to only females.

In [11]:
adata_FCA = adata_FCA[adata_FCA.obs["sex"] == "female"]
adata_FCA

View of AnnData object with n_obs × n_vars = 254580 × 16366
    obs: 'R_annotation', 'R_annotation__ontology_id', 'R_annotation_broad', 'R_annotation_broad__ontology_id', 'age', 'sex', 'tissue', 'batch'
    var: 'gene_names'

Here we remove low confidence annotations and male cell types that appear in females. I'm not sure whats up with those.

In [23]:
# low confidence
# maybe these should be combined with the high confidence versions?
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "adult hindgut*"]
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "adult midgut*"]
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "posterior midgut*"]
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "principal cell*"]
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "hemocyte*"]

# shouldn't be in females?
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "ejaculatory bulb"]
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "male germline differentiating cell"]
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "male accessory gland"]
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "testis"]

adata_FCA = adata_FCA[adata_FCA.obs["R_annotation_broad"] != "male germline cell"]
adata_FCA = adata_FCA[adata_FCA.obs["R_annotation_broad"] != "male reproductive system"]

adata_FCA

View of AnnData object with n_obs × n_vars = 253970 × 16366
    obs: 'R_annotation', 'R_annotation__ontology_id', 'R_annotation_broad', 'R_annotation_broad__ontology_id', 'age', 'sex', 'tissue', 'batch', 'DIY_annotation', 'study'
    var: 'gene_names'

Here we just look at the cell type distributions.

In [14]:
counts = adata_FCA.obs["R_annotation_broad"].value_counts()
percentages = adata_FCA.obs["R_annotation_broad"].value_counts(normalize=True) * 100

cell_type_summary = pd.DataFrame({
    'count': counts,
    'percentage': percentages
})

cell_type_summary

Unnamed: 0_level_0,count,percentage
R_annotation_broad,Unnamed: 1_level_1,Unnamed: 2_level_1
unannotated,71211,27.990645
epithelial cell,41815,16.436068
muscle cell,32388,12.730632
female reproductive system,19053,7.489092
neuron,17997,7.074014
sensory neuron,15924,6.259188
fat cell,13263,5.213238
artefact,9387,3.689713
glial cell,7135,2.804528
female germline cell,5652,2.221611


In [16]:
counts = adata_FCA.obs["R_annotation"].value_counts()
percentages = adata_FCA.obs["R_annotation"].value_counts(normalize=True) * 100

import pandas as pd
cell_type_summary = pd.DataFrame({
    'count': counts,
    'percentage': percentages
})

cell_type_summary

Unnamed: 0_level_0,count,percentage
R_annotation,Unnamed: 1_level_1,Unnamed: 2_level_1
unannotated,71211,27.990645
epithelial cell,26458,10.399748
muscle cell,25321,9.952832
adult fat body,13131,5.161354
artefact,9230,3.628002
...,...,...
nociceptive neuron,8,0.003145
adult midgut*,7,0.002751
antennal trichoid sensillum at4,7,0.002751
cardial cell,1,0.000393


In [17]:
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["R_annotation"]

  adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["R_annotation"]


We could remove unannotated and artefact cells, under the assumption that these may have been misannotated and since we are going to make our final labels by hand. I've tried both ways and I'm undecided.

In [None]:
#adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"] != "unannotated"]
#adata_FCA = adata_FCA[adata_FCA.obs["R_annotation_broad"] != "unannotated"]
#adata_FCA = adata_FCA[adata_FCA.obs["R_annotation_broad"] != "artefact"]

Here we create a custom cell type annotation. The FCA broad annotation is too broad, but the main annotation has so many rare/similar cell types that it makes type projection difficult. So we manually collapse many of the similar/rare types into broader categories. My choices here are somewhat arbitrary, and are based on my vague knowledge of biology and on which types appeared to cluster together in earlier attempts.

In [18]:
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("antimicrobial peptide-producing cell", "artefact")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult heart", "cardial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult ostium", "cardial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("cardiomyocyte, working adult heart muscle (non-ostia)", "cardial cell")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult enterocyte", "enterocyte")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult midgut enterocyte", "enterocyte")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("enterocyte-like", "enterocyte")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("enterocyte of anterior adult midgut epithelium", "enterocyte")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("enterocyte of posterior adult midgut epithelium", "enterocyte")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult differentiating enterocyte", "enterocyte")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("enteroendocrine cell", "epithelial cell")

# Could also be epidermal, no distinct cluster.
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("epidermal cell that specialized in antimicrobial response", "epithelial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("epidermal cell of the abdominal posterior compartment", "epithelial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("tendon cell", "epithelial cell") # I guess tendons are epidermal cells? FBbt_00007325

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("cardia (1)", "cardia")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("cardia (2)", "cardia")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("stalk follicle cell", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("polar follicle cell", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("stretch follicle cell", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("dorsal appendage forming follicle cell", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("follicle stem cell and prefollicle cell", "follicle cell")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("eo support cell", "tormogen or eo support cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("tormogen cell", "tormogen or eo support cell")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("midgut large flat cell", "midgut")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("copper cell", "midgut")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult abdominal pericardial cell", "Malpighian tubule")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult Malpighian tubule", "Malpighian tubule")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult Malpighian tubule bar-shaped cell of initial segment", "Malpighian tubule")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult Malpighian tubule principal cell", "Malpighian tubule")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult Malpighian tubule principal cell of initial segment", "Malpighian tubule")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult Malpighian tubule principal cell of lower segment", "Malpighian tubule")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult Malpighian tubule principal cell of lower ureter", "Malpighian tubule")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult Malpighian tubule principal cell subtype", "Malpighian tubule")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult Malpighian tubule stellate cell of main segment", "Malpighian tubule")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult Malpighian tubule Type I cell", "Malpighian tubule")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult fat body", "fat cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("pericerebral adult fat mass", "fat cell")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("16-cell germline cyst in germarium region 2a and 2b", "female germline cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("germ cell stage 4 and later", "female germline cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("germline cell", "female germline cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("germline cell, unknown stage", "female germline cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("post-mitotic germ cell early 16-cell cyst", "female germline cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("young germ cell", "female germline cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("post-mitotic endocycling nurse cell", "female germline cell")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("btl-GAL4 positive female cell, cluster 1, likely to be ovary cell", "ovary cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("btl-GAL4 positive female cell, cluster 2, likely to be ovary cell", "ovary cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("btl-GAL4 positive female cell, cluster 3, likely to be ovary cell", "ovary cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("btl-GAL4 positive female cell, likely to be ovary cell, sim+", "ovary cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("btl-GAL4 positive female cell, likely to be ovary cell, sim+, H15+", "ovary cell")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("choriogenic main body follicle cell and corpus luteum", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("choriogenic main body follicle cell St. 12", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("choriogenic main body follicle cell St. 14", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("central main body follicle cell ca. St. 6-8", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("follicle cell St. 9+", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("main body follicle cell ca. until St. 5", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("posterior terminal follicle cell ca. St. 5-8", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("prefollicle cell/stalk follicle cell", "follicle cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("escort cell", "follicle cell")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult antenna glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult brain cell body glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult brain perineurial glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult lamina epithelial/marginal glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult lobula astrocyte-like glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult optic chiasma glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult reticular neuropil associated glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("cell body glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("CNS surface associated glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("ensheathing glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("optic-lobe-associated cortex glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("perineurial glial sheath", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("pigment cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("subperineurial glial cell", "glial cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("peripheral glial cell", "glial cell")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("crystal cell", "hemocyte")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("ovarian sheath muscle", "visceral muscle")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult midgut muscle", "visceral muscle")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("visceral muscle of the midgut", "visceral muscle")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("visceral muscle of the crop", "visceral muscle")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult crop circular muscle", "visceral muscle")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult heart ventral longitudinal muscle", "visceral muscle")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("indirect flight muscle", "somatic muscle")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("skeletal muscle of head", "somatic muscle")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult alary muscle", "alary muscle")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult peripheral nervous system", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult peripheral neuron of the heart", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult ventral nervous system", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("alpha/beta Kenyon cell", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("alpha'/beta' Kenyon cell", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("antennal lobe projection neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("centrifugal neuron C2", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("centrifugal neuron C3", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("columnar neuron T1", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("distal medullary amacrine neuron Dm10", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("distal medullary amacrine neuron Dm11", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("distal medullary amacrine neuron Dm12", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("distal medullary amacrine neuron Dm3", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("distal medullary amacrine neuron Dm8", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("distal medullary amacrine neuron Dm9", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("dopaminergic neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("dopaminergic PAM neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("gamma Kenyon cell", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("Kenyon cell", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lamina intrinsic neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lamina monopolar neuron L1", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lamina monopolar neuron L2", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lamina monopolar neuron L3", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lamina monopolar neuron L4", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lamina monopolar neuron L5", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lamina intrinsic amacrine neuron Lai", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lamina wide-field 1 neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lamina wide-field 2 neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("leg muscle motor neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lobula columnar neuron LC10", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lobula columnar neuron LC12", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("lobula columnar neuron LC17", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("medullary intrinsic neuron Mi1", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("medullary intrinsic neuron Mi15", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("medullary intrinsic neuron Mi4", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("medullary intrinsic neuron Mi9", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("multidendritic neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("neuron of haltere", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("octopaminergic/tyraminergic neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("Poxn neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("proximal medullary amacrine neuron Pm2", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("proximal medullary amacrine neuron Pm4", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("sacculus/arista neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("T neuron T2", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("T neuron T2a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("T neuron T3", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("T neuron T4/T5", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("T neuron T4/T5a-b", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("T neuron T4/T5c-d", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary neuron Tm1", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary neuron Tm2", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary neuron Tm20", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary neuron Tm29", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary neuron Tm3a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary neuron Tm4", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary neuron Tm5c", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary neuron Tm9", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary Y neuron TmY14", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary Y neuron TmY4", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary Y neuron TmY5a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("transmedullary Y neuron TmY8", "neuron")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult oenocyte", "oenocyte")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron acid-sensing, Ir64a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron acid-sensing, Ir75a/b/c, Ir64a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Gr21a/63a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Ir56a+, Orco-", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Ir75d", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Ir84a, Ir31a, Ir76a, Ir76b, Ir8a, Or35a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or13a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or22a, Or42b, Or59b", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or47a, Or56a and likely other ORN types", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or47b", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or65", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or67a and likely other unknown ORN types", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or67d", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or83c and Or82a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or85a, Or43b", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or88a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron Or92a", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron unknown type, Orco-", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult olfactory receptor neuron unknown type, Orco+", "neuron")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("olfactory receptor neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("olfactory receptor neuron, coeloconics", "neuron")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("photoreceptor cell R7", "photoreceptor")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("photoreceptor cell R8", "photoreceptor")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("photoreceptor-like", "photoreceptor")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("ocellus retinula cell", "photoreceptor")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("eye photoreceptor cell", "photoreceptor")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("outer photoreceptor cell", "photoreceptor")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("antennal trichoid sensillum at4", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("arista and sacculus thermosensory and hygrosensory neuron Ir21a, Ir40a, Gr28b", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("auditory sensory neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("bitter-sensitive labellar taste bristle", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("dorsal rim area", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("gustatory receptor neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("gustatory receptor neuron of the labellum", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("Johnston organ neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("labral sense organ mechanosensory neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("leg taste bristle chemosensory neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("maxillary palp olfactory receptor neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("maxillary palpus", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("mechanosensory neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("mechanosensory neuron of haltere", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("mechanosensory neuron of leg chordotonal organ", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("nociceptive neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("pheromone-sensing neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("scolopidial neuron", "neuron")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("sensory neuron", "neuron")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult hindgut", "hindgut")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult pylorus", "hindgut")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult salivary gland", "salivary gland")

adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult renal stem cell", "stem cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("intestinal stem cell", "stem cell")
adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("enteroblast", "stem cell")

  adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("antimicrobial peptide-producing cell", "artefact")
  adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult heart", "cardial cell")
  adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult ostium", "cardial cell")
  adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("cardiomyocyte, working adult heart muscle (non-ostia)", "cardial cell")
  adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult enterocyte", "enterocyte")
  adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("adult midgut enterocyte", "enterocyte")
  adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("enterocyte-like", "enterocyte")
  adata_FCA.obs["DIY_annotation"] = adata_FCA.obs["DIY_annotation"].replace("enterocyte of anterior adult midgut epithelium", "enterocyte")
  adata_FCA.obs["DIY_annotation"

We examine the cell type distributions again after our changes.

In [24]:
counts = adata_FCA.obs["R_annotation"].value_counts()
percentages = adata_FCA.obs["R_annotation"].value_counts(normalize=True) * 100

cell_type_summary = pd.DataFrame({
    'count': counts,
    'percentage': percentages
})

cell_type_summary

Unnamed: 0_level_0,count,percentage
R_annotation,Unnamed: 1_level_1,Unnamed: 2_level_1
unannotated,71211,28.039138
epithelial cell,26458,10.417766
muscle cell,25321,9.970075
adult fat body,13131,5.170296
artefact,9230,3.634288
...,...,...
copper cell,9,0.003544
nociceptive neuron,8,0.003150
antennal trichoid sensillum at4,7,0.002756
cardial cell,1,0.000394


In [25]:
counts = adata_FCA.obs["R_annotation_broad"].value_counts()
percentages = adata_FCA.obs["R_annotation_broad"].value_counts(normalize=True) * 100

cell_type_summary = pd.DataFrame({
    'count': counts,
    'percentage': percentages
})

cell_type_summary

Unnamed: 0_level_0,count,percentage
R_annotation_broad,Unnamed: 1_level_1,Unnamed: 2_level_1
unannotated,71211,28.039138
epithelial cell,41520,16.348388
muscle cell,32388,12.752687
female reproductive system,19053,7.502067
neuron,17997,7.08627
sensory neuron,15924,6.270032
fat cell,13263,5.22227
artefact,9387,3.696106
glial cell,7135,2.809387
female germline cell,5652,2.22546


In [26]:
counts = adata_FCA.obs["DIY_annotation"].value_counts()
percentages = adata_FCA.obs["DIY_annotation"].value_counts(normalize=True) * 100

cell_type_summary = pd.DataFrame({
    'count': counts,
    'percentage': percentages
})

cell_type_summary

Unnamed: 0_level_0,count,percentage
DIY_annotation,Unnamed: 1_level_1,Unnamed: 2_level_1
unannotated,71211,28.039138
epithelial cell,27647,10.885931
neuron,26934,10.60519
muscle cell,25321,9.970075
follicle cell,15549,6.122377
fat cell,13263,5.22227
ovary cell,10450,4.114659
artefact,9387,3.696106
glial cell,7928,3.121629
female germline cell,5903,2.32429


This ensures there are no cell types with very few cells, about which scANVI complains. It may be better to group these with other similar cell types, but for now we just remove them.

In [27]:
counts = adata_FCA.obs["R_annotation"].value_counts()

adata_FCA = adata_FCA[adata_FCA.obs["R_annotation"].isin(counts[counts > 3].index)]

In [28]:
counts = adata_FCA.obs["R_annotation"].value_counts()
percentages = adata_FCA.obs["R_annotation"].value_counts(normalize=True) * 100

cell_type_summary = pd.DataFrame({
    'count': counts,
    'percentage': percentages
})

cell_type_summary

Unnamed: 0_level_0,count,percentage
R_annotation,Unnamed: 1_level_1,Unnamed: 2_level_1
unannotated,71211,28.039359
epithelial cell,26458,10.417848
muscle cell,25321,9.970154
adult fat body,13131,5.170336
artefact,9230,3.634316
...,...,...
adult olfactory receptor neuron Or13a,15,0.005906
adult heart,10,0.003938
copper cell,9,0.003544
nociceptive neuron,8,0.003150


In [29]:
adata_FCA

View of AnnData object with n_obs × n_vars = 253968 × 16366
    obs: 'R_annotation', 'R_annotation__ontology_id', 'R_annotation_broad', 'R_annotation_broad__ontology_id', 'age', 'sex', 'tissue', 'batch', 'DIY_annotation', 'study'
    var: 'gene_names'

We filter some of the very low count cells and genes, just to make sure there's nothing weird going on that scANVI would complain about later.

In [None]:
scanpy.pp.filter_cells(adata_FCA, min_genes=50)
scanpy.pp.filter_cells(adata_FCA, min_counts=100)

scanpy.pp.filter_genes(adata_FCA, min_cells=10)
scanpy.pp.filter_genes(adata_FCA, min_counts=10)

In [35]:
adata_FCA

AnnData object with n_obs × n_vars = 253968 × 15017
    obs: 'R_annotation', 'R_annotation__ontology_id', 'R_annotation_broad', 'R_annotation_broad__ontology_id', 'age', 'sex', 'tissue', 'batch', 'DIY_annotation', 'study', 'n_genes', 'n_counts'
    var: 'gene_names', 'n_cells', 'n_counts'

In [36]:
adata_FCA.write_h5ad("/mnt/z/Caelen/snRNAseq_v2/adata_FCA.h5ad")