### **Curating Human_adult_healthy_10x_data.h5ad**

Article: Developmental cell programs are co-opted in inflammatory skin disease

DOI: 10.1126/science.aba6500

Data Source : https://developmental.cellatlas.io/diseased-skin

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/inflammatory_skin_disease/Data/Human_adult_healthy_10x_data.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 195739 × 2219
    obs: 'sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex', 'full_clustering'
    var: 'gene_ids-SKN8090524', 'feature_types-SKN8090524', 'gene_ids-SKN8090525', 'feature_types-SKN8090525', 'gene_ids-SKN8090526', 'feature_types-SKN8090526', 'gene_ids-SKN8090527', 'feature_types-SKN8090527', 'gene_ids-SKN8090528', 'feature_types-SKN8090528', 'gene_ids-SKN8090529', 'feature_types-SKN8090529', 'gene_ids-SKN8090530', 'feature_types-SKN8090530', 'gene_ids-SKN8090531', 'feature_types-SKN8090531', 'gene_ids-SKN8090536', 'feature_types-SKN8090536', 'gene_ids-SKN8090537', 'feature_types-SKN8090537', 'gene_ids-SKN8090538', 'feature_types-SKN8090538', 'gene_ids-SKN8090539', 'feature_types-SKN8090539', 'gene_ids-SKN8090540', 'feature_types-SKN8090540', 'gene_ids-SKN8090541', 'feature_types-SKN8090541', 'gene_ids-SKN8090542', 'feature_types-SKN8090542', 'gene_ids-SKN8090543', 'feature_types-SKN8090543', 'gene_ids-SKN8090548', 'f

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<195739x2219 sparse matrix of type '<class 'numpy.float32'>'
	with 60047983 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 2218)	1.505772
  (0, 2217)	0.77436334
  (0, 2216)	1.0129876
  (0, 2215)	0.46033537
  (0, 2214)	1.505772
  (0, 2213)	1.8344115
  (0, 2212)	1.0129876
  (0, 2211)	1.3668635
  (0, 2209)	2.2175145
  (0, 2208)	2.444336
  (0, 2207)	1.505772
  (0, 2206)	1.505772
  (0, 2200)	2.3373425
  (0, 2194)	1.3668635
  (0, 2188)	1.0129876
  (0, 2187)	1.0129876
  (0, 2144)	0.77436334
  (0, 2137)	0.46033537
  (0, 2126)	1.2054981
  (0, 2124)	0.77436334
  (0, 2115)	2.8204465
  (0, 2114)	1.3668635
  (0, 2103)	0.77436334
  (0, 2102)	0.46033537
  (0, 2100)	2.5860116
  :	:
  (195738, 282)	1.7293646
  (195738, 281)	1.971684
  (195738, 267)	0.93440187
  (195738, 247)	0.93440187
  (195738, 245)	1.7293646
  (195738, 214)	1.4088825
  (195738, 211)	2.800745
  (195738, 209)	0.93440187
  (195738, 204)	0.93440187
  (195738, 203)	0.93440187
  (195738, 181)	0.93440187
  (195738, 153)	0.93440187
  (195738, 125)	1.7293646
  (195738, 122)	1.4088825
  (195738, 120)	0.93440187
  (195738, 75)	0.93440187
  (195738, 61)	1.408

##### **Raw counts matrix**

In [11]:
# If X has normalized counts, check for the raw counts matrix.

In [12]:
# check whether raw counts are present in adata.raw

In [13]:
adata.raw

<anndata._core.raw.Raw at 0x7fb232498670>

In [14]:
print(adata.raw.X)

  (0, 30)	1.0
  (0, 32)	1.0
  (0, 36)	1.0
  (0, 42)	1.0
  (0, 46)	1.0
  (0, 48)	1.0
  (0, 59)	1.0
  (0, 71)	1.0
  (0, 84)	1.0
  (0, 86)	2.0
  (0, 132)	37.0
  (0, 144)	1.0
  (0, 147)	4.0
  (0, 148)	1.0
  (0, 149)	2.0
  (0, 160)	1.0
  (0, 165)	1.0
  (0, 168)	4.0
  (0, 176)	3.0
  (0, 194)	1.0
  (0, 198)	2.0
  (0, 211)	1.0
  (0, 249)	1.0
  (0, 251)	1.0
  (0, 289)	1.0
  :	:
  (195738, 28139)	1.0
  (195738, 28150)	1.0
  (195738, 28176)	1.0
  (195738, 28340)	1.0
  (195738, 28341)	1.0
  (195738, 28362)	1.0
  (195738, 28435)	1.0
  (195738, 28473)	2.0
  (195738, 28476)	1.0
  (195738, 28516)	2.0
  (195738, 28545)	2.0
  (195738, 28575)	1.0
  (195738, 28647)	1.0
  (195738, 28648)	1.0
  (195738, 28660)	1.0
  (195738, 28694)	1.0
  (195738, 28698)	1.0
  (195738, 28699)	9.0
  (195738, 28700)	3.0
  (195738, 28702)	1.0
  (195738, 28703)	2.0
  (195738, 28704)	1.0
  (195738, 28706)	3.0
  (195738, 28707)	1.0
  (195738, 28709)	1.0


In [15]:
# Check whether adata and araw has same dimensions.

In [16]:
adata.raw.X

<195739x28728 sparse matrix of type '<class 'numpy.float32'>'
	with 346152176 stored elements in Compressed Sparse Row format>

In [17]:
araw = adata.raw.to_adata()

In [18]:
araw

AnnData object with n_obs × n_vars = 195739 × 28728
    obs: 'sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex', 'full_clustering'
    var: 'gene_ids-SKN8090524', 'feature_types-SKN8090524', 'gene_ids-SKN8090525', 'feature_types-SKN8090525', 'gene_ids-SKN8090526', 'feature_types-SKN8090526', 'gene_ids-SKN8090527', 'feature_types-SKN8090527', 'gene_ids-SKN8090528', 'feature_types-SKN8090528', 'gene_ids-SKN8090529', 'feature_types-SKN8090529', 'gene_ids-SKN8090530', 'feature_types-SKN8090530', 'gene_ids-SKN8090531', 'feature_types-SKN8090531', 'gene_ids-SKN8090536', 'feature_types-SKN8090536', 'gene_ids-SKN8090537', 'feature_types-SKN8090537', 'gene_ids-SKN8090538', 'feature_types-SKN8090538', 'gene_ids-SKN8090539', 'feature_types-SKN8090539', 'gene_ids-SKN8090540', 'feature_types-SKN8090540', 'gene_ids-SKN8090541', 'feature_types-SKN8090541', 'gene_ids-SKN8090542', 'feature_types-SKN8090542', 'gene_ids-SKN8090543', 'feature_types-SKN8090543', 'gene_ids-SKN8090548', '

In [19]:
araw.X

<195739x28728 sparse matrix of type '<class 'numpy.float32'>'
	with 346152176 stored elements in Compressed Sparse Row format>

##### **Variables(var)**

In [20]:
# View the var of anndata and raw object

In [21]:
adata.var

Unnamed: 0_level_0,gene_ids-SKN8090524,feature_types-SKN8090524,gene_ids-SKN8090525,feature_types-SKN8090525,gene_ids-SKN8090526,feature_types-SKN8090526,gene_ids-SKN8090527,feature_types-SKN8090527,gene_ids-SKN8090528,feature_types-SKN8090528,...,feature_types-4820STDY7389012,gene_ids-4820STDY7389013,feature_types-4820STDY7389013,gene_ids-4820STDY7389014,feature_types-4820STDY7389014,n_cells,highly_variable,means,dispersions,dispersions_norm
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HES4,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,...,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,45674,True,0.150606,1.569678,1.580956
ISG15,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,...,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,98887,True,0.396491,2.325791,1.874672
TNFRSF18,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,...,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,131416,False,0.507757,1.696750,0.466432
TNFRSF4,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,...,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,78378,True,0.427953,2.096149,1.569884
MXRA8,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,...,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,30696,True,0.109826,1.131393,0.858503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MT-ND4L,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,...,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,431192,True,2.513127,3.521863,1.173905
MT-ND4,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,...,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,466390,True,2.513310,3.404372,1.051254
MT-ND5,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,...,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,431867,False,1.976774,2.651981,0.168285
MT-ND6,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,...,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,153150,True,0.533141,1.905717,0.681757


In [22]:
araw.var

Unnamed: 0_level_0,gene_ids-SKN8090524,feature_types-SKN8090524,gene_ids-SKN8090525,feature_types-SKN8090525,gene_ids-SKN8090526,feature_types-SKN8090526,gene_ids-SKN8090527,feature_types-SKN8090527,gene_ids-SKN8090528,feature_types-SKN8090528,...,feature_types-4820STDY7389010,gene_ids-4820STDY7389011,feature_types-4820STDY7389011,gene_ids-4820STDY7389012,feature_types-4820STDY7389012,gene_ids-4820STDY7389013,feature_types-4820STDY7389013,gene_ids-4820STDY7389014,feature_types-4820STDY7389014,n_cells
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MIR1302-2HG,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,...,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,3
AL627309.1,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,...,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,391
AL627309.3,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,...,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,9
AL627309.2,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,...,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,12
AL732372.1,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,...,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AC004556.1,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,...,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,4433
AC233755.2,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,...,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,23
AC233755.1,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,...,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,72
AC240274.1,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,...,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,2853


In [23]:
gene_names1 = adata.var.loc[:,adata.var.columns.str.startswith('gene_ids')]
gene_names2 = araw.var.loc[:,araw.var.columns.str.startswith('gene_ids')]

In [24]:
len(gene_names1)

2219

In [25]:
len(gene_names2)

28728

In [26]:
ensg1 = []

for k in tqdm(range(len(gene_names1))):
    gene_valcount = gene_names1.iloc[k].value_counts()
    ensg_values = gene_valcount.index[gene_valcount.index.str.startswith('ENSG')]
    if len(ensg_values) > 0:
        ensg1.append(ensg_values[0])
    else:
        ensg1.append('nan')

100%|██████████| 2219/2219 [00:00<00:00, 2302.58it/s]


In [27]:
ensg2 = []

for k in tqdm(range(len(gene_names2))):
    gene_valcount = gene_names2.iloc[k].value_counts()
    ensg_values = gene_valcount.index[gene_valcount.index.str.startswith('ENSG')]
    if len(ensg_values) > 0:
        ensg2.append(ensg_values[0])
    else:
        ensg2.append('nan')

100%|██████████| 28728/28728 [00:12<00:00, 2300.06it/s]


In [28]:
len(ensg1)

2219

In [29]:
len(ensg2)

28728

In [30]:
# copy the index column values to a new column called gene_symbols

In [31]:
adata.var['gene_symbols'] = adata.var_names
araw.var['gene_symbols'] = araw.var_names

In [32]:
#set ensembl ids as index column

In [33]:
adata.var_names = ensg1

In [34]:
araw.var_names = ensg2

In [35]:
# Load the approved genes file.

In [36]:
approved_genes = pd.read_csv('/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv')

In [37]:
#Create a dictionary from the approved genes file 

In [38]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [39]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [40]:
len(genedict)

116184

In [41]:
#Filter out the genes which are not in the approved genes file.

In [42]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [43]:
len(var_to_keep_adata)

2216

In [44]:
len(var_to_keep_araw)

28494

In [45]:
adata.var

Unnamed: 0,gene_ids-SKN8090524,feature_types-SKN8090524,gene_ids-SKN8090525,feature_types-SKN8090525,gene_ids-SKN8090526,feature_types-SKN8090526,gene_ids-SKN8090527,feature_types-SKN8090527,gene_ids-SKN8090528,feature_types-SKN8090528,...,gene_ids-4820STDY7389013,feature_types-4820STDY7389013,gene_ids-4820STDY7389014,feature_types-4820STDY7389014,n_cells,highly_variable,means,dispersions,dispersions_norm,gene_symbols
ENSG00000188290,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,...,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,45674,True,0.150606,1.569678,1.580956,HES4
ENSG00000187608,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,...,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,98887,True,0.396491,2.325791,1.874672,ISG15
ENSG00000186891,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,...,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,131416,False,0.507757,1.696750,0.466432,TNFRSF18
ENSG00000186827,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,...,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,78378,True,0.427953,2.096149,1.569884,TNFRSF4
ENSG00000162576,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,...,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,30696,True,0.109826,1.131393,0.858503,MXRA8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000212907,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,...,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,431192,True,2.513127,3.521863,1.173905,MT-ND4L
ENSG00000198886,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,...,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,466390,True,2.513310,3.404372,1.051254,MT-ND4
ENSG00000198786,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,...,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,431867,False,1.976774,2.651981,0.168285,MT-ND5
ENSG00000198695,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,...,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,153150,True,0.533141,1.905717,0.681757,MT-ND6


In [46]:
araw.var

Unnamed: 0,gene_ids-SKN8090524,feature_types-SKN8090524,gene_ids-SKN8090525,feature_types-SKN8090525,gene_ids-SKN8090526,feature_types-SKN8090526,gene_ids-SKN8090527,feature_types-SKN8090527,gene_ids-SKN8090528,feature_types-SKN8090528,...,gene_ids-4820STDY7389011,feature_types-4820STDY7389011,gene_ids-4820STDY7389012,feature_types-4820STDY7389012,gene_ids-4820STDY7389013,feature_types-4820STDY7389013,gene_ids-4820STDY7389014,feature_types-4820STDY7389014,n_cells,gene_symbols
ENSG00000243485,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,...,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,3,MIR1302-2HG
ENSG00000238009,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,...,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,391,AL627309.1
ENSG00000239945,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,...,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,9,AL627309.3
ENSG00000239906,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,...,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,12,AL627309.2
ENSG00000236601,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,...,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,80,AL732372.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000276345,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,...,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,4433,AC004556.1
ENSG00000277856,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,...,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,23,AC233755.2
ENSG00000275063,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,...,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,72,AC233755.1
ENSG00000271254,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,...,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,2853,AC240274.1


In [47]:
# Modify the anndata object by filtering out the filtered genes.

In [48]:
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_araw].copy()

In [49]:
adata.var

Unnamed: 0,gene_ids-SKN8090524,feature_types-SKN8090524,gene_ids-SKN8090525,feature_types-SKN8090525,gene_ids-SKN8090526,feature_types-SKN8090526,gene_ids-SKN8090527,feature_types-SKN8090527,gene_ids-SKN8090528,feature_types-SKN8090528,...,gene_ids-4820STDY7389013,feature_types-4820STDY7389013,gene_ids-4820STDY7389014,feature_types-4820STDY7389014,n_cells,highly_variable,means,dispersions,dispersions_norm,gene_symbols
ENSG00000188290,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,...,ENSG00000188290,Gene Expression,ENSG00000188290,Gene Expression,45674,True,0.150606,1.569678,1.580956,HES4
ENSG00000187608,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,...,ENSG00000187608,Gene Expression,ENSG00000187608,Gene Expression,98887,True,0.396491,2.325791,1.874672,ISG15
ENSG00000186891,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,...,ENSG00000186891,Gene Expression,ENSG00000186891,Gene Expression,131416,False,0.507757,1.696750,0.466432,TNFRSF18
ENSG00000186827,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,...,ENSG00000186827,Gene Expression,ENSG00000186827,Gene Expression,78378,True,0.427953,2.096149,1.569884,TNFRSF4
ENSG00000162576,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,...,ENSG00000162576,Gene Expression,ENSG00000162576,Gene Expression,30696,True,0.109826,1.131393,0.858503,MXRA8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000212907,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,...,ENSG00000212907,Gene Expression,ENSG00000212907,Gene Expression,431192,True,2.513127,3.521863,1.173905,MT-ND4L
ENSG00000198886,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,...,ENSG00000198886,Gene Expression,ENSG00000198886,Gene Expression,466390,True,2.513310,3.404372,1.051254,MT-ND4
ENSG00000198786,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,...,ENSG00000198786,Gene Expression,ENSG00000198786,Gene Expression,431867,False,1.976774,2.651981,0.168285,MT-ND5
ENSG00000198695,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,...,ENSG00000198695,Gene Expression,ENSG00000198695,Gene Expression,153150,True,0.533141,1.905717,0.681757,MT-ND6


In [50]:
araw.var

Unnamed: 0,gene_ids-SKN8090524,feature_types-SKN8090524,gene_ids-SKN8090525,feature_types-SKN8090525,gene_ids-SKN8090526,feature_types-SKN8090526,gene_ids-SKN8090527,feature_types-SKN8090527,gene_ids-SKN8090528,feature_types-SKN8090528,...,gene_ids-4820STDY7389011,feature_types-4820STDY7389011,gene_ids-4820STDY7389012,feature_types-4820STDY7389012,gene_ids-4820STDY7389013,feature_types-4820STDY7389013,gene_ids-4820STDY7389014,feature_types-4820STDY7389014,n_cells,gene_symbols
ENSG00000243485,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,...,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,ENSG00000243485,0,3,MIR1302-2HG
ENSG00000238009,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,...,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,ENSG00000238009,0,391,AL627309.1
ENSG00000239945,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,...,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,ENSG00000239945,0,9,AL627309.3
ENSG00000239906,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,...,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,ENSG00000239906,0,12,AL627309.2
ENSG00000236601,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,...,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,ENSG00000236601,0,80,AL732372.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000276345,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,...,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,ENSG00000276345,0,4433,AC004556.1
ENSG00000277856,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,...,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,ENSG00000277856,0,23,AC233755.2
ENSG00000275063,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,...,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,ENSG00000275063,0,72,AC233755.1
ENSG00000271254,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,...,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,ENSG00000271254,0,2853,AC240274.1


In [51]:
adata.var = adata.var.iloc[:, 0:0]
araw.var = araw.var.iloc[:, 0:0]

In [52]:
# View var

In [53]:
adata.var

ENSG00000188290
ENSG00000187608
ENSG00000186891
ENSG00000186827
ENSG00000162576
...
ENSG00000212907
ENSG00000198886
ENSG00000198786
ENSG00000198695
ENSG00000198727


In [54]:
araw.var

ENSG00000243485
ENSG00000238009
ENSG00000239945
ENSG00000239906
ENSG00000236601
...
ENSG00000276345
ENSG00000277856
ENSG00000275063
ENSG00000271254
ENSG00000277475


feature is filtered

In [55]:
def add_zero():
	global adata
	global araw
	if araw.shape[1] > adata.shape[1]:
		genes_add = [x for x in araw.var.index.to_list() if x not in adata.var.index.to_list()]
		new_matrix = sparse.csr_matrix((adata.X.data, adata.X.indices, adata.X.indptr), shape = araw.shape)
		all_genes = adata.var.index.to_list()
		all_genes.extend(genes_add)
		new_var = pd.DataFrame(index=all_genes)
		new_var = pd.merge(new_var, araw.var, left_index=True, right_index=True, how='left')
		new_var['feature_is_filtered'] = False
		new_var.loc[genes_add, 'feature_is_filtered'] = True
		new_adata = ad.AnnData(X=new_matrix, obs=adata.obs, var=new_var, uns=adata.uns, obsm=adata.obsm)
		if adata.layers:
			for layer in adata.layers:
				new_layer = sparse.csr_matrix((adata.layers[layer].data, adata.layers[layer].indices, adata.layers[layer].indptr), shape = araw.shape)
				new_adata.layers[layer] = new_layer
		new_adata = new_adata[:,araw.var.index.to_list()]
		new_adata.var = new_adata.var.merge(adata.var, left_index=True, right_index=True, how='left')
		adata = new_adata
	else:
		adata.var['feature_is_filtered'] = False


In [56]:
add_zero()

In [57]:
adata.var

Unnamed: 0,feature_is_filtered
ENSG00000243485,True
ENSG00000238009,True
ENSG00000239945,True
ENSG00000239906,True
ENSG00000236601,True
...,...
ENSG00000276345,True
ENSG00000277856,True
ENSG00000275063,True
ENSG00000271254,True


In [58]:
list(adata.var['feature_is_filtered'].unique())

[True, False]

In [59]:
false_count = (adata.var['feature_is_filtered']== False).sum()

In [60]:
false_count

2216

In [61]:
araw.var

ENSG00000243485
ENSG00000238009
ENSG00000239945
ENSG00000239906
ENSG00000236601
...
ENSG00000276345
ENSG00000277856
ENSG00000275063
ENSG00000271254
ENSG00000277475


#### **obs (Cell metadata)**

In [62]:
#view obs

In [63]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte
...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th


In [64]:
# view the column names in obs

In [65]:
adata.obs.columns

Index(['sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex',
       'full_clustering'],
      dtype='object')

In [66]:
list(adata.obs['sample_id'].unique())

['SKN8104894',
 'SKN8104895',
 'SKN8104896',
 'SKN8104897',
 'SKN8104899',
 'SKN8104900',
 'SKN8104901',
 'SKN8104902',
 'SKN8105192',
 'SKN8105193',
 'SKN8105194',
 'SKN8105195',
 'SKN8105197',
 'SKN8105198',
 'SKN8105199',
 'SKN8105200',
 '4820STDY7388991',
 '4820STDY7388992',
 '4820STDY7388993',
 '4820STDY7388994',
 '4820STDY7388995',
 '4820STDY7388996',
 '4820STDY7388997',
 '4820STDY7388998',
 '4820STDY7388999',
 '4820STDY7389000',
 '4820STDY7389001',
 '4820STDY7389002',
 '4820STDY7389003',
 '4820STDY7389004',
 '4820STDY7389005',
 '4820STDY7389006',
 '4820STDY7389007',
 '4820STDY7389008',
 '4820STDY7389009',
 '4820STDY7389010',
 '4820STDY7389011',
 '4820STDY7389012',
 '4820STDY7389013',
 '4820STDY7389014']

#### **assay_ontology_term_id**

In [67]:
adata.obs['assay_ontology_term_id'] = ['EFO:0009899']* len(adata.obs)

In [68]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [69]:
# view adata.obs

In [70]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899
...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899


#### **cell_type_ontology_term_id**

In [71]:
#identify the column in adata.obs related. to cell type annotation

In [72]:
adata.obs.columns

Index(['sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex',
       'full_clustering', 'assay_ontology_term_id'],
      dtype='object')

In [73]:
list(adata.obs['full_clustering'].unique())

['Differentiated_KC',
 'Melanocyte',
 'Undifferentiated_KC',
 'LE2',
 'F2',
 'LE1',
 'F1',
 'migLC',
 'Th',
 'F3',
 'VE1',
 'VE2',
 'Tc',
 'LC',
 'Treg',
 'NK',
 'Macro_1',
 'ILC1_NK',
 'ILC2',
 'ILC1_3',
 'Macro_2',
 'Plasma',
 'DC2',
 'Inf_mac',
 'Mono_mac',
 'moDC',
 'MigDC',
 'DC1',
 'Pericyte_1',
 'Pericyte_2',
 'Schwann_2',
 'VE3',
 'Schwann_1',
 'Mast_cell']

In [74]:
# create a dictionary of cell type and ontology term

In [75]:
mapping= {'Differentiated_KC': 'CL:0000312', #not diff
 'Melanocyte' : 'CL:0000148',
 'Undifferentiated_KC' :'CL:0000312', #not undiff
 'LE2':'CL:0002138',
 'F2':'CL:0002620',
 'LE1':'CL:0002138',
 'F1' :'CL:0002620',
 'migLC' :'CL:0000453', # not mig
 'Th':'CL:0000912',
 'F3':'CL:0002620',
 'VE1' :'CL:0002139',
 'VE2':'CL:0002139',
 'Tc':'CL:0000910',
 'LC' :'CL:0000453',
 'Treg':'CL:0000815',
 'NK' :'CL:0000623',
 'Macro_1':'CL:0000235',
 'ILC1_NK':'CL:0001065', #not sure
 'ILC2':'CL:0001065',
 'ILC1_3':'CL:0001065',
 'Macro_2':'CL:0000235',
 'Plasma':'CL:0000786',
 'DC2':'CL:0000451',
 'Inf_mac':'CL:0000863',
 'Mono_mac':'CL:0000235', # not monocyte derived
 'moDC':'CL:0011031',
 'MigDC':'CL:0000451', #not mig
 'DC1' :'CL:0000990',
 'Pericyte_1':'CL:0000669',
 'Pericyte_2':'CL:0000669',
 'Schwann_2':'CL:0002573',
 'VE3':'CL:0002139',
 'Schwann_1':'CL:0002573',
 'Mast_cell' :'CL:0000097'}

In [76]:
# add the cell_type_ontology_term_id column

In [77]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['full_clustering'].map(mapping)

In [78]:
# change datatype of the column

In [79]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [80]:
# view adata.obs

In [81]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148
...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912


#### **development_stage_ontology_term_id**

In [82]:
# identify the column in adata which corresponds to age

In [83]:
adata.obs['development_stage_ontology_term_id'] = ['HsapDv:0000087']* len(adata.obs)

In [84]:
# change datatype of the column

In [85]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [86]:
# view unique values of development_stage_ontology_term_id column

In [87]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000087']

In [88]:
# view adata.obs

In [89]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087
...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087


#### **donor_id**

In [90]:
#identify the column in adata.obs which provides donor information

In [91]:
adata.obs.columns

Index(['sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex',
       'full_clustering', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [92]:
list(adata.obs['donor_id'].unique())

['S4', 'S5', 'S1', 'S2', 'S3']

In [93]:
# add the donor_id column

In [94]:
adata.obs['donor_id'] = adata.obs['donor_id']

In [95]:
# change datatype of the column

In [96]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [97]:
# view unique values of donor_id column

In [98]:
list(adata.obs['donor_id'].unique())

['S4', 'S5', 'S1', 'S2', 'S3']

In [99]:
#view obs

In [100]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087
...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087


#### **disease_ontology_term_id**

In [101]:
# Assign normal since all are healthy patients

In [102]:
# add the disease_ontology_term_id column

In [103]:
adata.obs['disease_ontology_term_id'] = ['PATO:0000461']* len(adata.obs)

In [104]:
# change datatype of the column

In [105]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [106]:
# view obs

In [107]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087,PATO:0000461
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461


#### **is_primary_data**

In [108]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [109]:
adata.obs['is_primary_data']

index
AAACCTGAGAGTGACC-1-SKN8104894         True
AAACCTGAGGTCGGAT-1-SKN8104894         True
AAACCTGAGTCGAGTG-1-SKN8104894         True
AAACCTGCATACTCTT-1-SKN8104894         True
AAACCTGCATTCACTT-1-SKN8104894         True
                                      ... 
TTTGGTTTCAGGCCCA-1-4820STDY7389014    True
TTTGGTTTCGCCTGTT-1-4820STDY7389014    True
TTTGTCAAGGAATCGC-1-4820STDY7389014    True
TTTGTCAAGGACTGGT-1-4820STDY7389014    True
TTTGTCACACTACAGT-1-4820STDY7389014    True
Name: is_primary_data, Length: 195739, dtype: bool

In [110]:
list(adata.obs['is_primary_data'].unique())

[True]

In [111]:
#change data type of column

In [112]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [113]:
# view obs

In [114]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087,PATO:0000461,True
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True


#### **organism_ontology_term_id**

In [115]:
# assign organism id 

In [116]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [117]:
#change data type of column

In [118]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [119]:
# view obs

In [120]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [121]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [122]:
# change data type

In [123]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [124]:
# view obs

In [125]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown


#### **sex_ontology_term_id**

In [126]:
# identify the column in adata.obs which corresponds to sex

In [127]:
adata.obs.columns

Index(['sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex',
       'full_clustering', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [128]:
# list the unique values 

In [129]:
list(adata.obs['Sex'].unique())

['Female']

In [130]:
# create a dictionary of sex and sex ontology term id

In [131]:
mapping= {'Female': 'PATO:0000383', 'Male': 'PATO:0000384'}

In [132]:
# add sex_ontology_term_id column

In [133]:
adata.obs['sex_ontology_term_id'] = adata.obs['Sex'].map(mapping)

In [134]:
# change data type

In [135]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [136]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383


#### **suspension_type**

In [137]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [138]:
# change data type of column

In [139]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [140]:
# view obs

In [141]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell


#### **tissue_ontology_term_id**

In [142]:
# identify the column in adata.obs which corresponds to tissue

In [143]:
adata.obs.columns

Index(['sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex',
       'full_clustering', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type'],
      dtype='object')

In [144]:
list(adata.obs['Tissue'].unique())

['Epidermis', 'Dermis']

In [145]:
# create a dictionary with tissue and corresponding tissue ontology term id

In [146]:
mapping= {'Epidermis' :'UBERON:0001003', 'Dermis':'UBERON:0002067'}

In [147]:
# add 'tissue_ontology_term_id' column

In [148]:
adata.obs['tissue_ontology_term_id'] = adata.obs['Tissue'].map(mapping)

In [149]:
# change data type of column

In [150]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [151]:
#list the unique values in 'tissue_ontology_term_id' column

In [152]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0001003', 'UBERON:0002067']

In [153]:
# view obs

In [154]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003


#### **obsm (Embeddings)**

In [155]:
# view obsm

In [156]:
# check whether all columns are prefixed with X

In [157]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

In [158]:
adata.obsm['X_umap']

array([[  5.0807314 ,  -3.8733916 ],
       [  4.596959  ,  -3.6801147 ],
       [  4.1807704 ,  -2.3423617 ],
       ...,
       [-12.053621  ,  -0.87002707],
       [ -9.127302  ,   3.8453593 ],
       [-10.755628  ,   0.8345877 ]], dtype=float32)

In [198]:
adata.obsm['X_pca']

array([[ 9.9841194e+00, -2.2185330e+00,  1.6403893e-01, ...,
        -1.5081295e-01,  1.1193346e-01, -4.2834993e-02],
       [ 9.9650440e+00, -2.2218668e+00,  1.1430279e+00, ...,
        -5.9738815e-01, -8.8433594e-01, -1.9773588e-01],
       [ 8.6658745e+00, -1.6018285e+00,  6.0335267e-01, ...,
        -5.7250727e-02,  3.8694251e-01,  3.2225635e-03],
       ...,
       [-3.0367842e+00, -2.8482955e+00, -5.3698788e+00, ...,
         1.6399151e-01, -1.1160437e+00,  8.6227775e-01],
       [-3.7346234e+00, -4.7002740e+00, -6.1988091e+00, ...,
        -3.4111005e-01, -1.6505019e-01, -1.3037416e-02],
       [-3.1587920e+00, -4.2325478e+00, -6.5852427e+00, ...,
        -2.8956270e-01, -1.4929320e-01,  6.8257344e-01]], dtype=float32)

#### **uns (Dataset Metadata)**

In [160]:
# View

In [161]:
adata.uns

OverloadedDict, wrapping:
	{'Status_colors': array(['#ff7f0e'], dtype=object), 'anno_colors': array(['#0e6c8b', '#b8bae5', '#d1db94', '#98d3e2', '#f59670', '#ec6f8a',
       '#156515', '#98d76a', '#89168d', '#8d0de7', '#0f21e6', '#c49573',
       '#a34891', '#dcadc0', '#314f90'], dtype=object), 'dendrogram_leiden': {'categories_idx_ordered': array([25,  5, 13, 12,  2, 11,  8, 16, 10,  0,  3, 21, 22,  6,  9, 24, 18,
       14,  4, 15, 23, 19, 20, 17,  1,  7]), 'cor_method': array(['pearson'], dtype=object), 'correlation_matrix': array([[ 1.00000000e+00, -2.16178999e-01, -3.31594207e-01,
         6.84818167e-01, -1.36117944e-01, -2.64201282e-01,
         8.99005749e-02, -2.11576067e-01, -3.36302662e-01,
         1.20403944e-01,  4.66394444e-01, -3.56058911e-01,
        -3.24003081e-01, -2.42023345e-01, -2.18801435e-01,
        -2.04811963e-01, -3.10970129e-01, -2.11908160e-01,
         4.95883296e-02, -2.44351089e-01, -1.71514076e-01,
         6.43439354e-02,  2.77316845e-01,  8.50104218

In [162]:
adata.uns.keys

<bound method OverloadedDict.keys of OverloadedDict, wrapping:
	{'Status_colors': array(['#ff7f0e'], dtype=object), 'anno_colors': array(['#0e6c8b', '#b8bae5', '#d1db94', '#98d3e2', '#f59670', '#ec6f8a',
       '#156515', '#98d76a', '#89168d', '#8d0de7', '#0f21e6', '#c49573',
       '#a34891', '#dcadc0', '#314f90'], dtype=object), 'dendrogram_leiden': {'categories_idx_ordered': array([25,  5, 13, 12,  2, 11,  8, 16, 10,  0,  3, 21, 22,  6,  9, 24, 18,
       14,  4, 15, 23, 19, 20, 17,  1,  7]), 'cor_method': array(['pearson'], dtype=object), 'correlation_matrix': array([[ 1.00000000e+00, -2.16178999e-01, -3.31594207e-01,
         6.84818167e-01, -1.36117944e-01, -2.64201282e-01,
         8.99005749e-02, -2.11576067e-01, -3.36302662e-01,
         1.20403944e-01,  4.66394444e-01, -3.56058911e-01,
        -3.24003081e-01, -2.42023345e-01, -2.18801435e-01,
        -2.04811963e-01, -3.10970129e-01, -2.11908160e-01,
         4.95883296e-02, -2.44351089e-01, -1.71514076e-01,
         6.43439

In [163]:
# Give a title for the dataset

In [164]:
adata.uns['title'] = 'Human healthy adult skin scRNA-seq data'

In [165]:
# Set the default embedding

In [166]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [167]:
# view anndata object

In [168]:
adata

AnnData object with n_obs × n_vars = 195739 × 28494
    obs: 'sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex', 'full_clustering', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'
    var: 'feature_is_filtered'
    uns: 'Status_colors', 'anno_colors', 'dendrogram_leiden', 'full_clustering_colors', 'leiden', 'leiden_colors', 'neighbors', 'pca', 'rank_genes_groups', 'title', 'default_embedding'
    obsm: 'X_pca', 'X_umap'

In [169]:
# view obs and var data types

In [170]:
adata.obs.dtypes

sample_id                                   category
Status                                      category
Site                                        category
Tissue                                      category
Location                                    category
donor_id                                    category
Sex                                         category
full_clustering                             category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
disease_ontology_term_id                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
suspension_type                             category
tissue_ontology_term_id                     category
dtype: object

In [171]:
adata.var.dtypes

feature_is_filtered    bool
dtype: object

In [172]:
adata.obsm.dtypes

AttributeError: 'AxisArrays' object has no attribute 'dtypes'

In [173]:
adata.uns.dtypes

AttributeError: 'OverloadedDict' object has no attribute 'dtypes'

In [174]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [175]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

In [176]:
# view obs

In [177]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Tissue,Location,donor_id,Sex,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Epidermis,Breast,S4,Female,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,migLC,EFO:0009899,CL:0000453,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Epidermis,Breast,S3,Female,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003


In [178]:
adata.obs.columns

Index(['sample_id', 'Status', 'Site', 'Tissue', 'Location', 'donor_id', 'Sex',
       'full_clustering', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [179]:
# delete unwanted columns in obs

In [180]:
del adata.obs['Tissue']
del adata.obs['Sex']

In [181]:
# view obs

In [182]:
adata.obs

Unnamed: 0_level_0,sample_id,Status,Site,Location,donor_id,full_clustering,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
AAACCTGAGAGTGACC-1-SKN8104894,SKN8104894,Healthy,non_lesion,Breast,S4,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGAGGTCGGAT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Breast,S4,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGAGTCGAGTG-1-SKN8104894,SKN8104894,Healthy,non_lesion,Breast,S4,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGCATACTCTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Breast,S4,Differentiated_KC,EFO:0009899,CL:0000312,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
AAACCTGCATTCACTT-1-SKN8104894,SKN8104894,Healthy,non_lesion,Breast,S4,Melanocyte,EFO:0009899,CL:0000148,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCAGGCCCA-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Breast,S3,migLC,EFO:0009899,CL:0000453,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
TTTGGTTTCGCCTGTT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Breast,S3,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
TTTGTCAAGGAATCGC-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Breast,S3,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003
TTTGTCAAGGACTGGT-1-4820STDY7389014,4820STDY7389014,Healthy,non_lesion,Breast,S3,Th,EFO:0009899,CL:0000912,HsapDv:0000087,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0001003


In [183]:
adata.obs.columns

Index(['sample_id', 'Status', 'Site', 'Location', 'donor_id',
       'full_clustering', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [184]:
# view var

In [185]:
adata.var

Unnamed: 0,feature_is_filtered
ENSG00000243485,True
ENSG00000238009,True
ENSG00000239945,True
ENSG00000239906,True
ENSG00000236601,True
...,...
ENSG00000276345,True
ENSG00000277856,True
ENSG00000275063,True
ENSG00000271254,True


In [186]:
#view uns

In [187]:
adata.uns

OverloadedDict, wrapping:
	{'Status_colors': array(['#ff7f0e'], dtype=object), 'anno_colors': array(['#0e6c8b', '#b8bae5', '#d1db94', '#98d3e2', '#f59670', '#ec6f8a',
       '#156515', '#98d76a', '#89168d', '#8d0de7', '#0f21e6', '#c49573',
       '#a34891', '#dcadc0', '#314f90'], dtype=object), 'dendrogram_leiden': {'categories_idx_ordered': array([25,  5, 13, 12,  2, 11,  8, 16, 10,  0,  3, 21, 22,  6,  9, 24, 18,
       14,  4, 15, 23, 19, 20, 17,  1,  7]), 'cor_method': array(['pearson'], dtype=object), 'correlation_matrix': array([[ 1.00000000e+00, -2.16178999e-01, -3.31594207e-01,
         6.84818167e-01, -1.36117944e-01, -2.64201282e-01,
         8.99005749e-02, -2.11576067e-01, -3.36302662e-01,
         1.20403944e-01,  4.66394444e-01, -3.56058911e-01,
        -3.24003081e-01, -2.42023345e-01, -2.18801435e-01,
        -2.04811963e-01, -3.10970129e-01, -2.11908160e-01,
         4.95883296e-02, -2.44351089e-01, -1.71514076e-01,
         6.43439354e-02,  2.77316845e-01,  8.50104218

In [188]:
list(adata.uns.keys())

['Status_colors',
 'anno_colors',
 'dendrogram_leiden',
 'full_clustering_colors',
 'leiden',
 'leiden_colors',
 'neighbors',
 'pca',
 'rank_genes_groups',
 'title',
 'default_embedding']

In [189]:
adata.obs.columns

Index(['sample_id', 'Status', 'Site', 'Location', 'donor_id',
       'full_clustering', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'],
      dtype='object')

In [190]:
# Remove unwanted columns in uns

In [191]:
del adata.uns['anno_colors']

In [192]:
#check the format of expression matrix

In [193]:
adata.X

<195739x28494 sparse matrix of type '<class 'numpy.float32'>'
	with 60023278 stored elements in Compressed Sparse Row format>

In [194]:
araw.X

<195739x28494 sparse matrix of type '<class 'numpy.float32'>'
	with 345974985 stored elements in Compressed Sparse Row format>

In [195]:
#Copy raw counts to adata.raw

In [196]:
adata.raw = araw

In [197]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/inflammatory_skin_disease/Final_objects/Human_adult_healthy_10x_data.h5ad', compression = 'gzip')