### **Curating bcc_and_normal-CG_portal_fat.h5ad**

Article: Multi-scale spatial mapping of cell populations across anatomical sites in healthy human skin and basal cell carcinoma

DOI: https://doi.org/10.1101/2023.08.08.551504

Data Source : https://spatial-skin-atlas.cellgeni.sanger.ac.uk

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Data/bcc_and_normal-CG_portal_fat.h5ad')

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 155401 × 32983
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names', 'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters', 'celltypes', 'subcelltypes'
    uns: 'celltypes_colors', 'log1p', 'subcelltypes_colors'
    obsm: 'X_umap'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<155401x32983 sparse matrix of type '<class 'numpy.float32'>'
	with 216318227 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [10]:
print(adata.X)

  (0, 18)	1.6368257
  (0, 24)	1.1214871
  (0, 108)	1.6368257
  (0, 228)	1.1214871
  (0, 256)	1.1214871
  (0, 260)	1.1214871
  (0, 309)	1.9752257
  (0, 322)	1.6368257
  (0, 330)	1.1214871
  (0, 332)	2.9767911
  (0, 336)	1.1214871
  (0, 358)	1.1214871
  (0, 376)	1.1214871
  (0, 387)	1.9752257
  (0, 389)	1.9752257
  (0, 391)	1.1214871
  (0, 400)	1.9752257
  (0, 423)	1.1214871
  (0, 425)	1.1214871
  (0, 446)	1.1214871
  (0, 461)	1.1214871
  (0, 510)	1.1214871
  (0, 528)	1.1214871
  (0, 582)	1.1214871
  (0, 595)	1.6368257
  :	:
  (155400, 24767)	1.1307192
  (155400, 24775)	1.1307192
  (155400, 25199)	1.1307192
  (155400, 25224)	1.1307192
  (155400, 25351)	1.1307192
  (155400, 25608)	1.1307192
  (155400, 25657)	1.1307192
  (155400, 25662)	1.1307192
  (155400, 25715)	1.1307192
  (155400, 25832)	1.1307192
  (155400, 25878)	1.1307192
  (155400, 25930)	1.1307192
  (155400, 26027)	1.1307192
  (155400, 26134)	1.1307192
  (155400, 26343)	1.1307192
  (155400, 26387)	1.1307192
  (155400, 26426)	1.130

##### **Raw counts matrix**

In [11]:
# If X has normalized counts, check for the raw counts matrix.

In [12]:
# check whether raw counts are present in adata.raw

In [13]:
print(adata.raw.X)

  (0, 18)	2.0
  (0, 24)	1.0
  (0, 108)	2.0
  (0, 228)	1.0
  (0, 256)	1.0
  (0, 260)	1.0
  (0, 309)	3.0
  (0, 322)	2.0
  (0, 330)	1.0
  (0, 332)	9.0
  (0, 336)	1.0
  (0, 358)	1.0
  (0, 376)	1.0
  (0, 387)	3.0
  (0, 389)	3.0
  (0, 391)	1.0
  (0, 400)	3.0
  (0, 423)	1.0
  (0, 425)	1.0
  (0, 446)	1.0
  (0, 461)	1.0
  (0, 510)	1.0
  (0, 528)	1.0
  (0, 582)	1.0
  (0, 595)	2.0
  :	:
  (155400, 24767)	1.0
  (155400, 24775)	1.0
  (155400, 25199)	1.0
  (155400, 25224)	1.0
  (155400, 25351)	1.0
  (155400, 25608)	1.0
  (155400, 25657)	1.0
  (155400, 25662)	1.0
  (155400, 25715)	1.0
  (155400, 25832)	1.0
  (155400, 25878)	1.0
  (155400, 25930)	1.0
  (155400, 26027)	1.0
  (155400, 26134)	1.0
  (155400, 26343)	1.0
  (155400, 26387)	1.0
  (155400, 26426)	1.0
  (155400, 26472)	1.0
  (155400, 26679)	1.0
  (155400, 26802)	3.0
  (155400, 27181)	1.0
  (155400, 27183)	1.0
  (155400, 27223)	2.0
  (155400, 27274)	2.0
  (155400, 27432)	1.0


In [14]:
adata.obs['barcodes'] = adata.obs_names

In [15]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [16]:
adata.obs['barcodes']

AAACCTGAGTTAAGTG-1_1     AAACCTGAGTTAAGTG
AAACCTGGTGCAACTT-1_1     AAACCTGGTGCAACTT
AAACCTGTCATGTGGT-1_1     AAACCTGTCATGTGGT
AAACCTGTCCATTCTA-1_1     AAACCTGTCCATTCTA
AAACCTGTCGGATGGA-1_1     AAACCTGTCGGATGGA
                               ...       
TTTGGTTCAACCAATC-1_27    TTTGGTTCAACCAATC
TTTGGTTTCACCTTAT-1_27    TTTGGTTTCACCTTAT
TTTGGTTTCCTACACC-1_27    TTTGGTTTCCTACACC
TTTGTTGCAGAACGCA-1_27    TTTGTTGCAGAACGCA
TTTGTTGTCACGGGAA-1_27    TTTGTTGTCACGGGAA
Name: barcodes, Length: 155401, dtype: object

In [17]:
barcode_count = adata.obs.loc[adata.obs['orig.ident'] == 'bcc_nose1', 'barcodes'].nunique()
print("Number of barcodes for orig.ident='bcc_nose1':", barcode_count)

Number of barcodes for orig.ident='bcc_nose1': 3125


In [18]:
filtered_barcodes = adata.obs.loc[adata.obs['orig.ident'] == 'bcc_nose1', 'barcodes']

In [19]:
num_duplicates = filtered_barcodes.duplicated().sum()

In [20]:
num_duplicates

3125

In [21]:
filtered_adata = adata[adata.obs['orig.ident'] == 'bcc_nose1']

In [22]:
filtered_adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes
AAACCCACACCTGCTT-1_24,bcc_nose1,2086,1059,3.643337,AAACCCACACCTGCTT-1_24,BCCface,nose,2,2,NK cells,NK,AAACCCACACCTGCTT
AAACCCACACGGTCTG-1_24,bcc_nose1,5169,1450,2.940607,AAACCCACACGGTCTG-1_24,BCCface,nose,5,5,T cells,T reg,AAACCCACACGGTCTG
AAACCCATCAGAATAG-1_24,bcc_nose1,3589,1132,0.390081,AAACCCATCAGAATAG-1_24,BCCface,nose,2,2,NK cells,NK,AAACCCATCAGAATAG
AAACCCATCGCTAAAC-1_24,bcc_nose1,5082,1624,0.924833,AAACCCATCGCTAAAC-1_24,BCCface,nose,1,1,T cells,Th,AAACCCATCGCTAAAC
AAACGAAAGCCGATTT-1_24,bcc_nose1,5015,1712,2.452642,AAACGAAAGCCGATTT-1_24,BCCface,nose,5,5,T cells,T reg,AAACGAAAGCCGATTT
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTTCTGGTCAA-1_25,bcc_nose1,8789,2211,1.183297,TTTGGTTTCTGGTCAA-1_25,BCCface,nose,1,1,T cells,Th,TTTGGTTTCTGGTCAA
TTTGTTGAGCACGATG-1_25,bcc_nose1,8262,2377,3.159041,TTTGTTGAGCACGATG-1_25,BCCface,nose,20,20,Melanocytes,Melanocytes,TTTGTTGAGCACGATG
TTTGTTGCACGGCGTT-1_25,bcc_nose1,1411,712,1.771793,TTTGTTGCACGGCGTT-1_25,BCCface,nose,13,13,T cells,Tc,TTTGTTGCACGGCGTT
TTTGTTGGTGGCGCTT-1_25,bcc_nose1,2925,1161,0.547009,TTTGTTGGTGGCGCTT-1_25,BCCface,nose,2,2,NK cells,NK,TTTGTTGGTGGCGCTT


In [23]:
index_string_check = any('25' in str(index) for index in filtered_adata.obs.index)

In [24]:
index_string_check

True

In [25]:
adata = adata[~(adata.obs.duplicated(subset=['barcodes'], keep='first') & (adata.obs['orig.ident'] == 'bcc_nose1'))]

In [26]:
adata

View of AnnData object with n_obs × n_vars = 152189 × 32983
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names', 'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters', 'celltypes', 'subcelltypes', 'barcodes'
    uns: 'celltypes_colors', 'log1p', 'subcelltypes_colors'
    obsm: 'X_umap'

In [27]:
filtered_adata = adata[adata.obs['orig.ident'] == 'bcc_nose1']

In [28]:
filtered_adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes
AAACCCACACCTGCTT-1_24,bcc_nose1,2086,1059,3.643337,AAACCCACACCTGCTT-1_24,BCCface,nose,2,2,NK cells,NK,AAACCCACACCTGCTT
AAACCCACACGGTCTG-1_24,bcc_nose1,5169,1450,2.940607,AAACCCACACGGTCTG-1_24,BCCface,nose,5,5,T cells,T reg,AAACCCACACGGTCTG
AAACCCATCAGAATAG-1_24,bcc_nose1,3589,1132,0.390081,AAACCCATCAGAATAG-1_24,BCCface,nose,2,2,NK cells,NK,AAACCCATCAGAATAG
AAACCCATCGCTAAAC-1_24,bcc_nose1,5082,1624,0.924833,AAACCCATCGCTAAAC-1_24,BCCface,nose,1,1,T cells,Th,AAACCCATCGCTAAAC
AAACGAAAGCCGATTT-1_24,bcc_nose1,5015,1712,2.452642,AAACGAAAGCCGATTT-1_24,BCCface,nose,5,5,T cells,T reg,AAACGAAAGCCGATTT
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTGTTAGAAAC-1_24,bcc_nose1,3511,1288,2.962119,TTTGGTTGTTAGAAAC-1_24,BCCface,nose,2,2,NK cells,NK,TTTGGTTGTTAGAAAC
TTTGGTTTCTGGTCAA-1_24,bcc_nose1,8789,2211,1.183297,TTTGGTTTCTGGTCAA-1_24,BCCface,nose,1,1,T cells,Th,TTTGGTTTCTGGTCAA
TTTGTTGAGCACGATG-1_24,bcc_nose1,8262,2377,3.159041,TTTGTTGAGCACGATG-1_24,BCCface,nose,20,20,Melanocytes,Melanocytes,TTTGTTGAGCACGATG
TTTGTTGGTGGCGCTT-1_24,bcc_nose1,2925,1161,0.547009,TTTGTTGGTGGCGCTT-1_24,BCCface,nose,2,2,NK cells,NK,TTTGTTGGTGGCGCTT


In [29]:
index_string_check = any('25' in str(index) for index in filtered_adata.obs.index)

In [30]:
index_string_check

False

In [31]:
barcode_count = adata.obs.loc[adata.obs['orig.ident'] == 'bcc_nose1', 'barcodes'].nunique()
print("Number of barcodes for orig.ident='bcc_nose1':", barcode_count)

Number of barcodes for orig.ident='bcc_nose1': 3038


In [32]:
adata

View of AnnData object with n_obs × n_vars = 152189 × 32983
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names', 'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters', 'celltypes', 'subcelltypes', 'barcodes'
    uns: 'celltypes_colors', 'log1p', 'subcelltypes_colors'
    obsm: 'X_umap'

In [33]:
adata.obs['barcodes_with_suffix'] = adata.obs.apply(lambda x: x['barcodes'] + '_' + x['orig.ident'], axis=1)

  adata.obs['barcodes_with_suffix'] = adata.obs.apply(lambda x: x['barcodes'] + '_' + x['orig.ident'], axis=1)


In [34]:
araw = adata.raw.to_adata()

In [35]:
araw.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Raw/bcc_and_normal-CG_portal_fat_raw.h5ad', compression = 'gzip')

In [36]:
araw

AnnData object with n_obs × n_vars = 152189 × 32983
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names', 'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters', 'celltypes', 'subcelltypes', 'barcodes', 'barcodes_with_suffix'
    uns: 'celltypes_colors', 'log1p', 'subcelltypes_colors'
    obsm: 'X_umap'

In [37]:
#araw = araw[~araw.obs.index.duplicated(keep='first')]

In [38]:
adata

AnnData object with n_obs × n_vars = 152189 × 32983
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names', 'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters', 'celltypes', 'subcelltypes', 'barcodes', 'barcodes_with_suffix'
    uns: 'celltypes_colors', 'log1p', 'subcelltypes_colors'
    obsm: 'X_umap'

##### **Variables(var)**

In [39]:
# View the var of anndata and raw object

In [40]:
adata.var

RP11-34P13.7
AL627309.1
AP006222.2
RP4-669L17.10
RP11-206L10.3
...
AC008878.1
AC025809.1
PSG8
AL008638.1
LINC01670


In [41]:
adata.var['name'] = adata.var.index

In [42]:
adata.var

Unnamed: 0,name
RP11-34P13.7,RP11-34P13.7
AL627309.1,AL627309.1
AP006222.2,AP006222.2
RP4-669L17.10,RP4-669L17.10
RP11-206L10.3,RP11-206L10.3
...,...
AC008878.1,AC008878.1
AC025809.1,AC025809.1
PSG8,PSG8
AL008638.1,AL008638.1


In [43]:
araw.var

RP11-34P13.7
AL627309.1
AP006222.2
RP4-669L17.10
RP11-206L10.3
...
AC008878.1
AC025809.1
PSG8
AL008638.1
LINC01670


In [44]:
araw.var['name'] = araw.var.index

In [45]:
araw.var

Unnamed: 0,name
RP11-34P13.7,RP11-34P13.7
AL627309.1,AL627309.1
AP006222.2,AP006222.2
RP4-669L17.10,RP4-669L17.10
RP11-206L10.3,RP11-206L10.3
...,...
AC008878.1,AC008878.1
AC025809.1,AC025809.1
PSG8,PSG8
AL008638.1,AL008638.1


In [46]:
#Since ensembl ids are not provided in adata.var. Load the GRCh38-1.2.0 reference to match ensembl ids to gene symbols

In [47]:
#gene_info = pd.read_table('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/_genelists/GRCh38-3.0.0.gene_names.tsv')

In [48]:
gene_info = pd.read_table('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/GRCh38-3.0.0.gene_names-2.tsv')

In [49]:
gene_info

Unnamed: 0,ensembl_ids,gene
0,ENSG00000243485,MIR1302-2HG
1,ENSG00000237613,FAM138A
2,ENSG00000186092,OR4F5
3,ENSG00000238009,AL627309.1
4,ENSG00000239945,AL627309.3
...,...,...
33533,ENSG00000277856,AC233755.2
33534,ENSG00000275063,AC233755.1
33535,ENSG00000271254,AC240274.1
33536,ENSG00000277475,AC213203.1


In [50]:
column_names = ['Gene ID','Gene Name']

In [51]:
gene_info.columns = column_names 

In [52]:
gene_info

Unnamed: 0,Gene ID,Gene Name
0,ENSG00000243485,MIR1302-2HG
1,ENSG00000237613,FAM138A
2,ENSG00000186092,OR4F5
3,ENSG00000238009,AL627309.1
4,ENSG00000239945,AL627309.3
...,...,...
33533,ENSG00000277856,AC233755.2
33534,ENSG00000275063,AC233755.1
33535,ENSG00000271254,AC240274.1
33536,ENSG00000277475,AC213203.1


In [53]:
dupl_gene_name = gene_info[gene_info.duplicated(subset= 'Gene Name' , keep=False)]

In [54]:
dupl_gene_name 

Unnamed: 0,Gene ID,Gene Name
2230,ENSG00000143248,RGS5
2232,ENSG00000232995,RGS5
2997,ENSG00000285053,TBCE
2999,ENSG00000284770,TBCE
4798,ENSG00000128655,PDE11A
4799,ENSG00000284741,PDE11A
5435,ENSG00000237940,LINC01238
5438,ENSG00000261186,LINC01238
5832,ENSG00000283706,PRSS50
5833,ENSG00000206549,PRSS50


In [55]:
len(dupl_gene_name)

48

In [56]:
#create a dictionary with gene symbols and ensembl ids from the gene information file

In [57]:
#gene_info_genesym_to_ensembl = dict(zip(gene_info['ensembl_id'],gene_info['gene_symbols']))

In [58]:
gene_info_genesym_to_ensembl=dict(gene_info.drop_duplicates('Gene Name').set_index('Gene Name')['Gene ID'])

In [59]:
gene_info_genesym_to_ensembl

{'MIR1302-2HG': 'ENSG00000243485',
 'FAM138A': 'ENSG00000237613',
 'OR4F5': 'ENSG00000186092',
 'AL627309.1': 'ENSG00000238009',
 'AL627309.3': 'ENSG00000239945',
 'AL627309.2': 'ENSG00000239906',
 'AL627309.4': 'ENSG00000241599',
 'AL732372.1': 'ENSG00000236601',
 'OR4F29': 'ENSG00000284733',
 'AC114498.1': 'ENSG00000235146',
 'OR4F16': 'ENSG00000284662',
 'AL669831.2': 'ENSG00000229905',
 'AL669831.5': 'ENSG00000237491',
 'FAM87B': 'ENSG00000177757',
 'LINC00115': 'ENSG00000225880',
 'FAM41C': 'ENSG00000230368',
 'AL645608.7': 'ENSG00000272438',
 'AL645608.3': 'ENSG00000230699',
 'AL645608.5': 'ENSG00000241180',
 'AL645608.1': 'ENSG00000223764',
 'SAMD11': 'ENSG00000187634',
 'NOC2L': 'ENSG00000188976',
 'KLHL17': 'ENSG00000187961',
 'PLEKHN1': 'ENSG00000187583',
 'PERM1': 'ENSG00000187642',
 'AL645608.8': 'ENSG00000272512',
 'HES4': 'ENSG00000188290',
 'ISG15': 'ENSG00000187608',
 'AL645608.2': 'ENSG00000224969',
 'AGRN': 'ENSG00000188157',
 'AL645608.9': 'ENSG00000273443',
 'RNF223

In [60]:
len(gene_info_genesym_to_ensembl)

33514

In [61]:
#Store ensembl ids in a new column in adata.var by matching gene symbols and ensembl ids from the gene information file

In [62]:
adata.var['ensembl_id'] = adata.var['name'].map(gene_info_genesym_to_ensembl)

In [63]:
araw.var['ensembl_id'] = araw.var['name'].map(gene_info_genesym_to_ensembl)

In [64]:
adata.var

Unnamed: 0,name,ensembl_id
RP11-34P13.7,RP11-34P13.7,
AL627309.1,AL627309.1,ENSG00000238009
AP006222.2,AP006222.2,
RP4-669L17.10,RP4-669L17.10,
RP11-206L10.3,RP11-206L10.3,
...,...,...
AC008878.1,AC008878.1,ENSG00000267952
AC025809.1,AC025809.1,ENSG00000267662
PSG8,PSG8,ENSG00000124467
AL008638.1,AL008638.1,ENSG00000231405


In [65]:
araw.var

Unnamed: 0,name,ensembl_id
RP11-34P13.7,RP11-34P13.7,
AL627309.1,AL627309.1,ENSG00000238009
AP006222.2,AP006222.2,
RP4-669L17.10,RP4-669L17.10,
RP11-206L10.3,RP11-206L10.3,
...,...,...
AC008878.1,AC008878.1,ENSG00000267952
AC025809.1,AC025809.1,ENSG00000267662
PSG8,PSG8,ENSG00000124467
AL008638.1,AL008638.1,ENSG00000231405


In [66]:
nan_ensembl_names = adata.var.loc[adata.var['ensembl_id'].isna(), 'name'].tolist()

In [67]:
nan_ensembl_names

['RP11-34P13.7',
 'AP006222.2',
 'RP4-669L17.10',
 'RP11-206L10.3',
 'RP11-206L10.2',
 'RP11-206L10.9',
 'RP11-54O7.1',
 'C1orf170',
 'RP11-54O7.17',
 'RP11-465B22.8',
 'FAM132A',
 'RP5-902P8.12',
 'CPSF3L',
 'GLTPD1',
 'RP4-758J18.2',
 'RP4-758J18.13',
 'RP4-758J18.10',
 'C1orf233',
 'RP11-345P4.9',
 'RP11-345P4.10',
 'RP11-345P4.7',
 'SLC35E2',
 'RP1-140A9.1',
 'C1orf222',
 'RP11-547D24.1',
 'RP5-892K4.1',
 'RP11-181G12.2',
 'C1orf86',
 'RP11-181G12.4',
 'RP3-395M20.12',
 'RP3-395M20.9',
 'RP11-46F15.2',
 'TP73-AS1',
 'RP1-120G22.11',
 'RP1-202O8.3',
 'RP11-242F24.1',
 'RP11-312B8.1',
 'RP4-549F15.1',
 'RP11-338N10.2',
 'RP3-467L1.4',
 'RP11-431K24.1',
 'RP5-1115A15.1',
 'RP4-633I8.4',
 'RP3-510D11.2',
 'RP11-558F24.4',
 'APITD1',
 'APITD1-CORT',
 'RP5-1113E3.3',
 'RP4-734G22.3',
 'RP4-635E18.8',
 'RP4-635E18.7',
 'RP4-635E18.6',
 'PTCHD2',
 'RP11-56N19.5',
 'RP11-474O21.5',
 'RP3-467K16.2',
 'RP3-467K16.7',
 'RP3-467K16.4',
 'RP4-680D5.2',
 'RP11-169K16.9',
 'C1orf64',
 'RP11-276H7.

In [68]:
adata.var.loc[adata.var['ensembl_id'].isna(), 'name'].to_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Suppl_info/spatial_ensembl_na.txt',index=False)

In [69]:
len(nan_ensembl_names)

5288

In [70]:
adata.var_names = adata.var['ensembl_id']

In [71]:
araw.var_names = araw.var['ensembl_id']

In [72]:
adata.var['ensembl_id'].isna().sum()

5288

In [73]:
adata.var.dropna(inplace=True)

In [74]:
araw.var.dropna(inplace=True)

In [75]:
adata.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [76]:
araw.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [77]:
# Load the approved genes file.

In [78]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [79]:
#Create a dictionary from the approved genes file 

In [80]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [81]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [82]:
len(genedict)

119799

In [83]:
#Filter out the genes which are not in the approved genes file.

In [84]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [85]:
len(var_to_keep_adata)

27405

In [86]:
len(var_to_keep_araw)

27405

In [87]:
adata.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [88]:
araw.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [89]:
# Modify the anndata object by filtering out the filtered genes.

In [90]:
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_araw].copy()

In [91]:
adata.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [92]:
araw.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [93]:
adata.var = adata.var.iloc[:, 0:0]
araw.var = araw.var.iloc[:, 0:0]

In [94]:
# View var

In [95]:
adata.var

ENSG00000238009
ENSG00000177757
ENSG00000225880
ENSG00000230368
ENSG00000187634
...
ENSG00000267952
ENSG00000267662
ENSG00000124467
ENSG00000231405
ENSG00000279094


In [96]:
araw.var

ENSG00000238009
ENSG00000177757
ENSG00000225880
ENSG00000230368
ENSG00000187634
...
ENSG00000267952
ENSG00000267662
ENSG00000124467
ENSG00000231405
ENSG00000279094


feature is filtered

In [97]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [98]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
ensembl_id,Unnamed: 1_level_1
ENSG00000238009,False
ENSG00000177757,False
ENSG00000225880,False
ENSG00000230368,False
ENSG00000187634,False
...,...
ENSG00000267952,False
ENSG00000267662,False
ENSG00000124467,False
ENSG00000231405,False


In [99]:
araw.var

ENSG00000238009
ENSG00000177757
ENSG00000225880
ENSG00000230368
ENSG00000187634
...
ENSG00000267952
ENSG00000267662
ENSG00000124467
ENSG00000231405
ENSG00000279094


#### **obs (Cell metadata)**

In [100]:
#view obs

In [101]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2


In [102]:
# view the column names in obs

In [103]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names',
       'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters',
       'celltypes', 'subcelltypes', 'barcodes', 'barcodes_with_suffix'],
      dtype='object')

#### **assay_ontology_term_id**

In [104]:
mapping = {'body_solebordo':'EFO:0009899','body_tabib':'EFO:0009901','face_cheek1':'EFO:0009922', 'face_cheek2':'EFO:0009922', 'face_cheek3b':'EFO:0009922', 'face_cheek4':'EFO:0009922', 'face_ear1':'EFO:0009922', 'face_ear2':'EFO:0009922', 'face_forehead1':'EFO:0009922', 'face_forehead2':'EFO:0009922', 'face_forehead3':'EFO:0009922', 'face_forehead4':'EFO:0009922', 'face_forehead5':'EFO:0009922', 'face_nose1':'EFO:0009922', 'face_temple1':'EFO:0009922', 'face_temple2':'EFO:0009922', 'bcc_cheek1':'EFO:0009922', 'bcc_cheek2':'EFO:0009922', 'bcc_ear1':'EFO:0009922', 'bcc_ear2':'EFO:0009922', 'bcc_forehead1':'EFO:0009922', 'bcc_nose1':'EFO:0009922', 'bcc_temple1':'EFO:0009922', 'bcc_temple2':'EFO:0009922'}

In [105]:
adata.obs['assay_ontology_term_id'] = adata.obs['orig.ident'].map(mapping)

In [106]:
list(adata.obs['assay_ontology_term_id'].unique())

['EFO:0009899', 'EFO:0009922', 'EFO:0009901']

In [107]:
#assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Megagut/Suppl_info/barcode_assay_summary.csv')

In [108]:
#mapping = dict(zip(assay_info['barcode'], assay_info['assay_summary']))

In [109]:
#adata.obs['assay'] = adata.obs['barcodes'].map(mapping)

In [110]:
#list(adata.obs['assay'].unique())

In [111]:
#unique_combinations = adata.obs[['orig.ident', 'assay']].drop_duplicates()

In [112]:
#unique_combinations

In [113]:
#unique_combinations.to_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Megagut/Suppl_info/unique_combinations.csv', index=False)


In [114]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [115]:
# view adata.obs

In [116]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,assay_ontology_term_id
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,EFO:0009899
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,EFO:0009899
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,EFO:0009899
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,EFO:0009899
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,EFO:0009899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,EFO:0009922
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,EFO:0009922
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,EFO:0009922
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,EFO:0009922


#### **cell_type_ontology_term_id**

In [117]:
#identify the column in adata.obs related. to cell type annotation

In [118]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names',
       'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters',
       'celltypes', 'subcelltypes', 'barcodes', 'barcodes_with_suffix',
       'assay_ontology_term_id'],
      dtype='object')

In [119]:
list(adata.obs['subcelltypes'].unique())

['ILC_NK',
 'DC1',
 'RGS5+ pericytes',
 'Th',
 'Macro1_2',
 'TAGLN+ pericytes',
 'APOD+ fibroblasts',
 'Suprabasal keratinocytes',
 'Chondrocytes',
 'PTGDS+ fibroblasts',
 'POSTN+ fibroblasts',
 'SFRP2+ fibroblasts',
 'DC2',
 'VEC',
 'CD8+ T RM',
 'Neuronal_SchwannC',
 'T reg',
 'Basal keratinocytes',
 'LEC',
 'Monocytes',
 'NK',
 'MigDC',
 'SMC',
 'PlasmaC',
 'Melanocytes',
 'MastC',
 'IL8+ DC1',
 'Tc',
 'BC',
 'Skeletal muscle cells']

In [120]:
# create a dictionary of cell type and ontology term

In [121]:
mapping= {'ILC_NK':'CL:0001065', #not NK 
 'DC1':'CL:0000990',
 'RGS5+ pericytes':'CL:0000669',
 'Th' :'CL:0000912',
 'Macro1_2':'CL:0000235',
 'TAGLN+ pericytes':'CL:0000669',
 'APOD+ fibroblasts':'CL:0000057',
 'Suprabasal keratinocytes':'CL:4033013',
 'Chondrocytes':'CL:0000138',
 'PTGDS+ fibroblasts':'CL:0000057',
 'POSTN+ fibroblasts':'CL:0000057',
 'SFRP2+ fibroblasts':'CL:0000057',
 'DC2':'CL:0000784',
 'VEC':'CL:0002139',
 'CD8+ T RM':'CL:0001203',
 'Neuronal_SchwannC':'CL:0002573',
 'T reg':'CL:0000815',
 'Basal keratinocytes':'CL:0002187',
 'LEC':'CL:0002138',
 'Monocytes':'CL:0000576',
 'NK':'CL:0000623',
 'MigDC':'CL:0000451', #not mig
 'SMC':'CL:0000192',
 'PlasmaC':'CL:0000786',
 'Melanocytes':'CL:1000458',
 'MastC':'CL:0000097',
 'IL8+ DC1':'CL:0000990',
 'Tc':'CL:0000910',
 'BC':'CL:0000646',
 'Skeletal muscle cells':'CL:0000188'}

In [122]:
# add the cell_type_ontology_term_id column

In [123]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['subcelltypes'].map(mapping)

In [124]:
# change datatype of the column

In [125]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [126]:
# view adata.obs

In [127]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,EFO:0009899,CL:0001065
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,EFO:0009899,CL:0000990
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,EFO:0009899,CL:0000669
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,EFO:0009899,CL:0000912
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,EFO:0009899,CL:0000235
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,EFO:0009922,CL:0000912
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,EFO:0009922,CL:0000097
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,EFO:0009922,CL:0001065
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,EFO:0009922,CL:0000912


In [128]:
list(adata.obs['orig.ident'].unique())

['body_solebordo',
 'face_cheek1',
 'body_tabib',
 'face_cheek2',
 'face_cheek3b',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2']

In [129]:
adata.obs['orig.ident']= adata.obs['orig.ident'].replace('face_cheek3b', 'face_cheek3')

In [130]:
sum(adata.obs['orig.ident'] == 'body_solebordo') + sum(adata.obs['orig.ident'] == 'body_tabib')

20562

#### **development_stage_ontology_term_id**

In [131]:
# identify the column in adata which corresponds to age

In [132]:
list(adata.obs['orig.ident'].unique())

['body_solebordo',
 'face_cheek1',
 'body_tabib',
 'face_cheek2',
 'face_cheek3',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2']

In [133]:
mapping= {'body_solebordo':'HsapDv:0000087',
 'face_cheek1':'HsapDv:0000211',
 'body_tabib':'HsapDv:0000087',
 'face_cheek2':'HsapDv:0000156',
 'face_cheek3':'HsapDv:0000207',
 'face_cheek4':'HsapDv:0000216',
 'face_ear1':'HsapDv:0000171',
 'face_ear2':'HsapDv:0000212',
 'face_forehead1':'HsapDv:0000211',
 'face_forehead2':'HsapDv:0000164',
 'face_forehead3':'HsapDv:0000206',
 'face_forehead4':'HsapDv:0000216',
 'face_forehead5':'HsapDv:0000167',
 'face_nose1':'HsapDv:0000150',
 'face_temple1':'HsapDv:0000153',
 'face_temple2':'HsapDv:0000172',
 'bcc_cheek1':'HsapDv:0000133',
 'bcc_cheek2':'HsapDv:0000211',
 'bcc_ear1':'HsapDv:0000164',
 'bcc_ear2':'HsapDv:0000163',
 'bcc_forehead1':'HsapDv:0000167',
 'bcc_nose1':'HsapDv:0000217',
 'bcc_temple1':'HsapDv:0000145',
 'bcc_temple2':'HsapDv:0000153'}

In [134]:
adata.obs['development_stage_ontology_term_id']=  adata.obs['orig.ident'].map(mapping)

In [135]:
# change datatype of the column

In [136]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [137]:
# view unique values of development_stage_ontology_term_id column

In [138]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000087',
 'HsapDv:0000211',
 'HsapDv:0000156',
 'HsapDv:0000207',
 'HsapDv:0000216',
 'HsapDv:0000171',
 'HsapDv:0000212',
 'HsapDv:0000164',
 'HsapDv:0000206',
 'HsapDv:0000167',
 'HsapDv:0000150',
 'HsapDv:0000153',
 'HsapDv:0000172',
 'HsapDv:0000133',
 'HsapDv:0000163',
 'HsapDv:0000217',
 'HsapDv:0000145']

In [139]:
# view adata.obs

In [140]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,EFO:0009899,CL:0001065,HsapDv:0000087
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,EFO:0009899,CL:0000990,HsapDv:0000087
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,EFO:0009899,CL:0000669,HsapDv:0000087
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,EFO:0009899,CL:0000912,HsapDv:0000087
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,EFO:0009899,CL:0000235,HsapDv:0000087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,EFO:0009922,CL:0000097,HsapDv:0000153
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,EFO:0009922,CL:0001065,HsapDv:0000153
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153


#### **donor_id**

In [141]:
#identify the column in adata.obs which provides donor information

In [142]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names',
       'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters',
       'celltypes', 'subcelltypes', 'barcodes', 'barcodes_with_suffix',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id'],
      dtype='object')

In [143]:
# add the donor_id column

In [144]:
adata.obs['donor_id'] = adata.obs['orig.ident']

In [145]:
adata.obs['donor_id'].replace({'body_tabib': 'pooled_body_tabib'}, inplace=True)

In [146]:
adata.obs['donor_id'].replace({'body_solebordo': 'pooled_body_solebordo'}, inplace=True)

In [147]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [148]:
list(adata.obs['donor_id'].unique())

['pooled_body_solebordo',
 'face_cheek1',
 'pooled_body_tabib',
 'face_cheek2',
 'face_cheek3',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2']

#### **disease_ontology_term_id**

In [149]:
# Assign normal since all are healthy patients

In [150]:
# add the disease_ontology_term_id column

In [151]:
adata.obs['disease_ontology_term_id']= ['MONDO:0020804' if 'bcc' in val else 'PATO:0000461' for val in adata.obs['orig.ident']]

In [152]:
# change datatype of the column

In [153]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [154]:
# view obs

In [155]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,EFO:0009899,CL:0001065,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,EFO:0009899,CL:0000990,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,EFO:0009899,CL:0000669,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,EFO:0009899,CL:0000912,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,EFO:0009899,CL:0000235,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,EFO:0009922,CL:0000097,HsapDv:0000153,bcc_temple2,MONDO:0020804
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,EFO:0009922,CL:0001065,HsapDv:0000153,bcc_temple2,MONDO:0020804
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804


#### **is_primary_data**

In [156]:
adata.obs['is_primary_data'] = [True] * len(adata.obs)

In [157]:
adata.obs['is_primary_data']= ~adata.obs['orig.ident'].eq('body_solebordo') & adata.obs['is_primary_data']

In [158]:
unique_bcc_assay_values = adata.obs.loc[adata.obs['orig.ident'] == 'body_solebordo', 'is_primary_data'].unique()

In [159]:
unique_bcc_assay_values = adata.obs.loc[adata.obs['orig.ident'] == 'body_tabib', 'is_primary_data'].unique()

In [160]:
list(adata.obs['is_primary_data'].unique())

[False, True]

In [161]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,EFO:0009899,CL:0001065,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,EFO:0009899,CL:0000990,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,EFO:0009899,CL:0000669,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,EFO:0009899,CL:0000912,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,EFO:0009899,CL:0000235,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,EFO:0009922,CL:0000097,HsapDv:0000153,bcc_temple2,MONDO:0020804,True
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,EFO:0009922,CL:0001065,HsapDv:0000153,bcc_temple2,MONDO:0020804,True
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True


In [162]:
#change data type of column

In [163]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [164]:
# view obs

In [165]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,EFO:0009899,CL:0001065,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,EFO:0009899,CL:0000990,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,EFO:0009899,CL:0000669,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,EFO:0009899,CL:0000912,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,EFO:0009899,CL:0000235,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,EFO:0009922,CL:0000097,HsapDv:0000153,bcc_temple2,MONDO:0020804,True
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,EFO:0009922,CL:0001065,HsapDv:0000153,bcc_temple2,MONDO:0020804,True
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True


In [166]:
filtered_is_values = adata.obs.loc[adata.obs['orig.ident'] == 'body_tabib', 'is_primary_data']

In [167]:
unique_values = filtered_is_values.unique()


In [168]:
unique_values

array([ True])

#### **organism_ontology_term_id**

In [169]:
# assign organism id 

In [170]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [171]:
#change data type of column

In [172]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [173]:
# view obs

In [174]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,EFO:0009899,CL:0001065,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,EFO:0009899,CL:0000990,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,EFO:0009899,CL:0000669,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,EFO:0009899,CL:0000912,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,EFO:0009899,CL:0000235,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,EFO:0009922,CL:0000097,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,EFO:0009922,CL:0001065,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [175]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [176]:
# change data type

In [177]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [178]:
# view obs

In [179]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,...,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,...,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,EFO:0009899,CL:0001065,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,...,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,EFO:0009899,CL:0000990,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,...,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,EFO:0009899,CL:0000669,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,...,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,EFO:0009899,CL:0000912,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,...,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,EFO:0009899,CL:0000235,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,...,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,...,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,EFO:0009922,CL:0000097,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,...,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,EFO:0009922,CL:0001065,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,...,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown


#### **sex_ontology_term_id**

In [180]:
# identify the column in adata.obs which corresponds to sex

In [181]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names',
       'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters',
       'celltypes', 'subcelltypes', 'barcodes', 'barcodes_with_suffix',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [182]:
list(adata.obs['orig.ident'].unique())

['body_solebordo',
 'face_cheek1',
 'body_tabib',
 'face_cheek2',
 'face_cheek3',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2']

In [183]:
mapping = {'body_solebordo' :'PATO:0000384',
 'face_cheek1':'PATO:0000384',
 'body_tabib':'unknown',
 'face_cheek2':'PATO:0000384',
 'face_cheek3':'PATO:0000383',
 'face_cheek4':'PATO:0000384',
 'face_ear1':'PATO:0000384',
 'face_ear2':'PATO:0000384',
 'face_forehead1':'PATO:0000384',
 'face_forehead2':'PATO:0000384',
 'face_forehead3':'PATO:0000384',
 'face_forehead4':'PATO:0000384',
 'face_forehead5':'PATO:0000384',
 'face_nose1':'PATO:0000384',
 'face_temple1':'PATO:0000384',
 'face_temple2':'PATO:0000384',
 'bcc_cheek1':'PATO:0000384',
 'bcc_cheek2':'PATO:0000384',
 'bcc_ear1':'PATO:0000384',
 'bcc_ear2':'PATO:0000384',
 'bcc_forehead1':'PATO:0000384',
 'bcc_nose1':'PATO:0000384',
 'bcc_temple1':'PATO:0000384',
 'bcc_temple2':'PATO:0000384'}

In [184]:
adata.obs['sex_ontology_term_id'] = adata.obs['orig.ident'].map(mapping)

In [185]:
# change data type

In [186]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [187]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,...,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,...,AAACCTGAGTTAAGTG_body_solebordo,EFO:0009899,CL:0001065,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,...,AAACCTGGTGCAACTT_body_solebordo,EFO:0009899,CL:0000990,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,...,AAACCTGTCATGTGGT_body_solebordo,EFO:0009899,CL:0000669,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,...,AAACCTGTCCATTCTA_body_solebordo,EFO:0009899,CL:0000912,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,...,AAACCTGTCGGATGGA_body_solebordo,EFO:0009899,CL:0000235,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,...,TTTGGTTCAACCAATC_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,...,TTTGGTTTCACCTTAT_bcc_temple2,EFO:0009922,CL:0000097,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,...,TTTGGTTTCCTACACC_bcc_temple2,EFO:0009922,CL:0001065,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,...,TTTGTTGCAGAACGCA_bcc_temple2,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384


In [188]:
list(adata.obs['sex_ontology_term_id'].unique())

['PATO:0000384', 'unknown', 'PATO:0000383']

#### **suspension_type**

In [189]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [190]:
# change data type of column

In [191]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [192]:
# view obs

In [193]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,...,EFO:0009899,CL:0001065,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,...,EFO:0009899,CL:0000990,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,...,EFO:0009899,CL:0000669,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,...,EFO:0009899,CL:0000912,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,...,EFO:0009899,CL:0000235,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,...,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,...,EFO:0009922,CL:0000097,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,...,EFO:0009922,CL:0001065,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,...,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell


#### **tissue_type**

In [194]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [195]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [196]:
# identify the column in adata.obs which corresponds to tissue

In [197]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names',
       'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters',
       'celltypes', 'subcelltypes', 'barcodes', 'barcodes_with_suffix',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type'],
      dtype='object')

In [198]:
list(adata.obs['location'].unique())

['abdomen', 'cheek', 'arm', 'ear', 'forehead', 'nose', 'temple']

In [199]:
# create a dictionary with tissue and corresponding tissue ontology term id

In [200]:
mapping ={'abdomen':'UBERON:0001416', 'cheek':'UBERON:0008803', 'arm':'UBERON:0002427', 'ear':'UBERON:0001459', 'forehead':'UBERON:0016475', 'nose':'UBERON:0015476', 'temple':'UBERON:8480028'}

In [201]:
# add 'tissue_ontology_term_id' column

In [202]:
adata.obs['tissue_ontology_term_id'] = adata.obs['location'].map(mapping)

In [203]:
# change data type of column

In [204]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [205]:
#list the unique values in 'tissue_ontology_term_id' column

In [206]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0001416',
 'UBERON:0008803',
 'UBERON:0002427',
 'UBERON:0001459',
 'UBERON:0016475',
 'UBERON:0015476',
 'UBERON:8480028']

In [207]:
# view obs

In [208]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,...,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,...,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,...,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,...,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028


#### **obsm (Embeddings)**

In [209]:
# view obsm

In [210]:
# check whether all columns are prefixed with X

In [211]:
adata.obsm

AxisArrays with keys: X_umap

#### **uns (Dataset Metadata)**

In [212]:
# View

In [213]:
adata.uns

OverloadedDict, wrapping:
	{'celltypes_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf', '#aec7e8', '#ffbb78', '#98df8a',
       '#ff9896', '#c5b0d5', '#c49c94', '#f7b6d2', '#dbdb8d', '#9edae5',
       '#ad494a', '#8c6d31'], dtype=object), 'log1p': {}, 'subcelltypes_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
       '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
       '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
       '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
       '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa'],
      dtype=object)}
With overloaded keys:
	['neighbors'].

In [214]:
adata.uns.keys

<bound method OverloadedDict.keys of OverloadedDict, wrapping:
	{'celltypes_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf', '#aec7e8', '#ffbb78', '#98df8a',
       '#ff9896', '#c5b0d5', '#c49c94', '#f7b6d2', '#dbdb8d', '#9edae5',
       '#ad494a', '#8c6d31'], dtype=object), 'log1p': {}, 'subcelltypes_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
       '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
       '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
       '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
       '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa'],
      dtype=object)}
With overloaded keys:
	['neighbors'].>

In [215]:
# Give a title for the dataset

In [216]:
adata.uns['title'] = 'BCC and Normal - scRNA-seq data'

In [217]:
# Set the default embedding

In [218]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [219]:
# view anndata object

In [220]:
adata

AnnData object with n_obs × n_vars = 152189 × 27405
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names', 'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters', 'celltypes', 'subcelltypes', 'barcodes', 'barcodes_with_suffix', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'feature_is_filtered'
    uns: 'celltypes_colors', 'log1p', 'subcelltypes_colors', 'title', 'default_embedding'
    obsm: 'X_umap'

In [221]:
# view obs and var data types

In [222]:
adata.obs.dtypes

orig.ident                                  category
nCount_RNA                                     int32
nFeature_RNA                                   int32
percent.mt                                   float64
names                                         object
group                                       category
location                                    category
integrated_snn_res.0.65                     category
seurat_clusters                             category
celltypes                                   category
subcelltypes                                category
barcodes                                      object
barcodes_with_suffix                          object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
donor_id                                    category
disease_ontology_term_id                    category
is_primary_data                               

In [223]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [224]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed percent.mt from float64 to float32
changed names from object to category
changed barcodes from object to category
changed barcodes_with_suffix from object to category


In [225]:
# view obs

In [226]:
adata.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,...,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,...,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,...,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,...,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028


In [227]:
adata.obs.columns

Index(['orig.ident', 'nCount_RNA', 'nFeature_RNA', 'percent.mt', 'names',
       'group', 'location', 'integrated_snn_res.0.65', 'seurat_clusters',
       'celltypes', 'subcelltypes', 'barcodes', 'barcodes_with_suffix',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [228]:
# delete unwanted columns in obs

In [229]:
del adata.obs['names']
del adata.obs['orig.ident']
del adata.obs['location']
#del adata.obs['assay']
del adata.obs['barcodes']
del adata.obs['barcodes_with_suffix']


In [230]:
# view obs

In [231]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,percent.mt,group,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGTTAAGTG-1_1,1971,852,2.790462,body,14,14,ILC_NK,ILC_NK,EFO:0009899,CL:0001065,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGGTGCAACTT-1_1,5114,1263,2.718029,body,7,7,DC,DC1,EFO:0009899,CL:0000990,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCATGTGGT-1_1,1860,754,4.623656,body,11,11,Pericytes,RGS5+ pericytes,EFO:0009899,CL:0000669,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCCATTCTA-1_1,3148,978,1.016518,body,1,1,T cells,Th,EFO:0009899,CL:0000912,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCGGATGGA-1_1,2912,928,2.712912,body,6,6,Macrophages,Macro1_2,EFO:0009899,CL:0000235,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,7440,2434,0.268817,BCCface,1,1,T cells,Th,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGGTTTCACCTTAT-1_27,862,492,0.928074,BCCface,19,19,Mast cells,MastC,EFO:0009922,CL:0000097,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGGTTTCCTACACC-1_27,5155,1573,0.290980,BCCface,14,14,ILC_NK,ILC_NK,EFO:0009922,CL:0001065,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGTTGCAGAACGCA-1_27,2654,1117,0.339111,BCCface,1,1,T cells,Th,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028


In [232]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', 'group',
       'integrated_snn_res.0.65', 'seurat_clusters', 'celltypes',
       'subcelltypes', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [233]:
# view var

In [234]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
ensembl_id,Unnamed: 1_level_1
ENSG00000238009,False
ENSG00000177757,False
ENSG00000225880,False
ENSG00000230368,False
ENSG00000187634,False
...,...
ENSG00000267952,False
ENSG00000267662,False
ENSG00000124467,False
ENSG00000231405,False


In [235]:
araw.var

ENSG00000238009
ENSG00000177757
ENSG00000225880
ENSG00000230368
ENSG00000187634
...
ENSG00000267952
ENSG00000267662
ENSG00000124467
ENSG00000231405
ENSG00000279094


In [236]:
#view uns

In [237]:
adata.uns

OverloadedDict, wrapping:
	{'celltypes_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b',
       '#e377c2', '#b5bd61', '#17becf', '#aec7e8', '#ffbb78', '#98df8a',
       '#ff9896', '#c5b0d5', '#c49c94', '#f7b6d2', '#dbdb8d', '#9edae5',
       '#ad494a', '#8c6d31'], dtype=object), 'log1p': {}, 'subcelltypes_colors': array(['#ffff00', '#1ce6ff', '#ff34ff', '#ff4a46', '#008941', '#006fa6',
       '#a30059', '#ffdbe5', '#7a4900', '#0000a6', '#63ffac', '#b79762',
       '#004d43', '#8fb0ff', '#997d87', '#5a0007', '#809693', '#6a3a4c',
       '#1b4400', '#4fc601', '#3b5dff', '#4a3b53', '#ff2f80', '#61615a',
       '#ba0900', '#6b7900', '#00c2a0', '#ffaa92', '#ff90c9', '#b903aa'],
      dtype=object), 'title': 'BCC and Normal - scRNA-seq data', 'default_embedding': 'X_umap'}
With overloaded keys:
	['neighbors'].

In [238]:
list(adata.uns.keys())

['celltypes_colors',
 'log1p',
 'subcelltypes_colors',
 'title',
 'default_embedding']

In [239]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', 'group',
       'integrated_snn_res.0.65', 'seurat_clusters', 'celltypes',
       'subcelltypes', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [240]:
# Remove unwanted columns in uns

In [241]:
#check the format of expression matrix

In [242]:
adata.X

<152189x27405 sparse matrix of type '<class 'numpy.float32'>'
	with 209215019 stored elements in Compressed Sparse Row format>

In [243]:
araw.X

<152189x27405 sparse matrix of type '<class 'numpy.float32'>'
	with 209215019 stored elements in Compressed Sparse Row format>

In [244]:
#Copy raw counts to adata.raw

In [245]:
adata.raw = araw

In [246]:
del adata.uns['log1p']

In [247]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Final_objects/bcc_and_normal-CG_portal_fat.h5ad', compression = 'gzip')

In [248]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,percent.mt,group,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGTTAAGTG-1_1,1971,852,2.790462,body,14,14,ILC_NK,ILC_NK,EFO:0009899,CL:0001065,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGGTGCAACTT-1_1,5114,1263,2.718029,body,7,7,DC,DC1,EFO:0009899,CL:0000990,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCATGTGGT-1_1,1860,754,4.623656,body,11,11,Pericytes,RGS5+ pericytes,EFO:0009899,CL:0000669,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCCATTCTA-1_1,3148,978,1.016518,body,1,1,T cells,Th,EFO:0009899,CL:0000912,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
AAACCTGTCGGATGGA-1_1,2912,928,2.712912,body,6,6,Macrophages,Macro1_2,EFO:0009899,CL:0000235,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,7440,2434,0.268817,BCCface,1,1,T cells,Th,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGGTTTCACCTTAT-1_27,862,492,0.928074,BCCface,19,19,Mast cells,MastC,EFO:0009922,CL:0000097,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGGTTTCCTACACC-1_27,5155,1573,0.290980,BCCface,14,14,ILC_NK,ILC_NK,EFO:0009922,CL:0001065,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028
TTTGTTGCAGAACGCA-1_27,2654,1117,0.339111,BCCface,1,1,T cells,Th,EFO:0009922,CL:0000912,HsapDv:0000153,bcc_temple2,MONDO:0020804,True,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:8480028


In [249]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
ensembl_id,Unnamed: 1_level_1
ENSG00000238009,False
ENSG00000177757,False
ENSG00000225880,False
ENSG00000230368,False
ENSG00000187634,False
...,...
ENSG00000267952,False
ENSG00000267662,False
ENSG00000124467,False
ENSG00000231405,False


In [250]:
adata.raw.var

ENSG00000238009
ENSG00000177757
ENSG00000225880
ENSG00000230368
ENSG00000187634
...
ENSG00000267952
ENSG00000267662
ENSG00000124467
ENSG00000231405
ENSG00000279094


In [251]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', 'group',
       'integrated_snn_res.0.65', 'seurat_clusters', 'celltypes',
       'subcelltypes', 'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [252]:
list(adata.obs['is_primary_data'].unique())

[False, True]

In [253]:
unique_combinations = adata.obs[['donor_id', 'assay_ontology_term_id']].drop_duplicates()

In [254]:
unique_combinations

Unnamed: 0,donor_id,assay_ontology_term_id
AAACCTGAGTTAAGTG-1_1,pooled_body_solebordo,EFO:0009899
AAACCCAAGGAAAGGT-1_2,face_cheek1,EFO:0009922
SC1control_AAACATACAGAACA.1_4,pooled_body_tabib,EFO:0009901
AAACCCACAATGACCT-1_5,face_cheek2,EFO:0009922
AAACCCAAGACTTAAG-1_7,face_cheek3,EFO:0009922
AAACCCAAGATCGCTT-1_8,face_cheek4,EFO:0009922
AAACCCAAGCTCGAAG-1_9,face_ear1,EFO:0009922
AAACCCATCACCGGTG-1_10,face_ear2,EFO:0009922
AAACCCAAGACATCCT-1_11,face_forehead1,EFO:0009922
AAACCCAGTGAATTGA-1_12,face_forehead2,EFO:0009922


In [255]:
index_string_check = any('25' in str(index) for index in adata.obs.index)

In [256]:
index_string_check

False