### **Curating epithelial_bcc_and_normal.h5ad**

Article: Multi-scale spatial mapping of cell populations across anatomical sites in healthy human skin and basal cell carcinoma

DOI: https://doi.org/10.1101/2023.08.08.551504

Data Source : https://spatial-skin-atlas.cellgeni.sanger.ac.uk

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
adata = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Data/epithelial_bcc_and_normal.h5ad')

In [5]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

In [6]:
# View the AnnData object

In [7]:
adata

AnnData object with n_obs × n_vars = 9857 × 36565
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions', '01_sample', '02_group', '03_location', '04_celltypes', '05_subcelltypes', '06_seurat_clusters'
    var: 'name'
    obsm: 'X_pca', 'X_umap'

##### **X - expression matrix**

In [8]:
# View the expression matrix of the anndata object.

In [9]:
adata.X

<9857x36565 sparse matrix of type '<class 'numpy.float64'>'
	with 18770964 stored elements in Compressed Sparse Column format>

In [10]:
# Print the matrix to check whether they are normalized counts or raw counts. if the matrix has floating numbers,they are normalized counts.if they are integers, they are raw counts.

In [11]:
print(adata.X)

  (1043, 0)	0.8139979599279599
  (22, 1)	0.8302222287915094
  (28, 1)	0.8988908101732906
  (34, 1)	1.4766784225881597
  (100, 1)	0.9135693976389069
  (109, 1)	1.2545515577756177
  (243, 1)	0.9065623362498655
  (246, 1)	0.9468338607466571
  (309, 1)	1.0763196173518712
  (337, 1)	1.655118533461911
  (448, 1)	1.0809300990931605
  (511, 1)	1.1959824891028281
  (872, 1)	1.4951953148267552
  (960, 1)	1.628589678521531
  (1335, 1)	0.774349402446096
  (1872, 1)	1.1358175201837903
  (1979, 1)	1.3745601009675068
  (2629, 1)	0.7113832066527307
  (2719, 1)	0.6064010746599753
  (3026, 1)	1.3599587110955507
  (3732, 1)	1.2444329839550443
  (3929, 1)	0.781282984358385
  (3953, 1)	1.1632195643177228
  (4294, 1)	0.9741084501718084
  (4724, 1)	1.6791689908094365
  :	:
  (8728, 33028)	1.0096323446036377
  (9837, 33028)	0.5530214618127588
  (9569, 33029)	0.7524572947145531
  (8681, 33030)	0.7502093654867563
  (9642, 33030)	0.8278213108895978
  (9817, 33030)	0.620135714229806
  (8894, 33031)	2.461135816587

In [12]:
adata.obs['barcodes-1'] = adata.obs_names

In [13]:
adata.obs['barcodes'] = adata.obs_names

In [14]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [15]:
adata.obs['barcodes']

bcc_AAAGATGAGAGCAATT-1_1    AAAGATGAGAGCAATT
bcc_AACTGGTGTACAAGTA-1_1    AACTGGTGTACAAGTA
bcc_AAGCCGCTCCAGAAGG-1_1    AAGCCGCTCCAGAAGG
bcc_AAGGTTCAGGTTCCTA-1_1    AAGGTTCAGGTTCCTA
bcc_ACATACGTCAGATAAG-1_1    ACATACGTCAGATAAG
                                  ...       
ife_TTTGGAGGTTGGTAGG-1      TTTGGAGGTTGGTAGG
ife_TTTGGTTCACTACACA-1      TTTGGTTCACTACACA
ife_TTTGGTTGTGCCTGCA-1      TTTGGTTGTGCCTGCA
ife_TTTGGTTTCGTTACCC-1      TTTGGTTTCGTTACCC
ife_TTTGTTGTCCTCACGT-1      TTTGTTGTCCTCACGT
Name: barcodes, Length: 9857, dtype: object

In [16]:
adata.obs['01_sample'] = adata.obs['01_sample'].str.replace('body_soleboldo', 'body_solebordo')

In [17]:
adata.obs['barcodes_with_suffix'] = adata.obs.apply(lambda x: x['barcodes'] + '_' + x['01_sample'], axis=1)

In [18]:
#adata = adata[~(adata.obs.duplicated(subset=['barcodes'], keep='first') & (adata.obs['01_sample'] == 'bcc_nose1'))]

##### **Raw counts matrix**

In [19]:
# If X has normalized counts, check for the raw counts matrix.

In [20]:
# check whether raw counts are present in adata.raw

In [21]:
araw = sc.read_h5ad('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Raw/bcc_and_normal-CG_portal_fat_raw.h5ad')

In [22]:
araw.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix
AAACCTGAGTTAAGTG-1_1,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo
AAACCTGGTGCAACTT-1_1,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo
AAACCTGTCATGTGGT-1_1,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo
AAACCTGTCCATTCTA-1_1,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo
AAACCTGTCGGATGGA-1_1,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC-1_27,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2
TTTGGTTTCACCTTAT-1_27,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2
TTTGGTTTCCTACACC-1_27,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2
TTTGTTGCAGAACGCA-1_27,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2


In [23]:
araw.obs['barcodes-1'] = araw.obs.index

In [24]:
araw.obs.index = araw.obs['barcodes_with_suffix']

In [25]:
adata.obs.index = adata.obs['barcodes_with_suffix']

In [26]:
araw = araw[~araw.obs.index.duplicated(keep='first')]

In [27]:
adata = adata[~adata.obs.index.duplicated(keep='first')]

In [28]:
araw.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,barcodes-1
barcodes_with_suffix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AAACCTGAGTTAAGTG_body_solebordo,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,AAACCTGAGTTAAGTG-1_1
AAACCTGGTGCAACTT_body_solebordo,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,AAACCTGGTGCAACTT-1_1
AAACCTGTCATGTGGT_body_solebordo,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,AAACCTGTCATGTGGT-1_1
AAACCTGTCCATTCTA_body_solebordo,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,AAACCTGTCCATTCTA-1_1
AAACCTGTCGGATGGA_body_solebordo,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,AAACCTGTCGGATGGA-1_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC_bcc_temple2,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,TTTGGTTCAACCAATC-1_27
TTTGGTTTCACCTTAT_bcc_temple2,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,TTTGGTTTCACCTTAT-1_27
TTTGGTTTCCTACACC_bcc_temple2,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,TTTGGTTTCCTACACC-1_27
TTTGTTGCAGAACGCA_bcc_temple2,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,TTTGTTGCAGAACGCA-1_27


In [29]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix
barcodes_with_suffix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAAGATGAGAGCAATT_body_solebordo,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo
AACTGGTGTACAAGTA_body_solebordo,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo
AAGCCGCTCCAGAAGG_body_solebordo,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo
AAGGTTCAGGTTCCTA_body_solebordo,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo
ACATACGTCAGATAAG_body_solebordo,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGGTTGGTAGG_ife1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1
TTTGGTTCACTACACA_ife1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1
TTTGGTTGTGCCTGCA_ife1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1
TTTGGTTTCGTTACCC_ife1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1


In [30]:
count_adata = (adata.obs.index == 'TTTGGTTCAGACAAGC_bcc_nose1').sum()

In [31]:
count_araw = (araw.obs.index == 'TTTGGTTCAGACAAGC_bcc_nose1').sum()

In [32]:
count_adata

1

In [33]:
count_araw

0

In [34]:
araw_psu=sc.read_10x_mtx('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Raw/psu', var_names='gene_symbols', cache=True)
araw_ife1=sc.read_10x_mtx('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Raw/ife', var_names='gene_symbols', cache=True)

In [35]:
araw_psu.obs

AAACCCAAGGGAGGGT-1
AAACCCAGTAGAATGT-1
AAACGAAGTATCCCTC-1
AAACGAAGTGTGCTTA-1
AAACGAAGTTCCGCGA-1
...
TTTGGTTTCCCATAGA-1
TTTGGTTTCCCTCGTA-1
TTTGTTGCACGTCGGT-1
TTTGTTGTCCCGAGGT-1
TTTGTTGTCTCTGCCA-1


In [36]:
araw_ife1.obs

AAACCCAAGAGCCGTA-1
AAACCCAAGCCTAACT-1
AAACCCAAGGACATCG-1
AAACCCAGTAGTCTGT-1
AAACGAACAACTGTGT-1
...
TTTGGTTCACTACACA-1
TTTGGTTGTGCCTGCA-1
TTTGGTTTCGAGAAGC-1
TTTGGTTTCGTTACCC-1
TTTGTTGTCCTCACGT-1


In [37]:
araw_psu.obs['barcodes-1'] = araw_psu.obs_names
araw_ife1.obs['barcodes-1'] = araw_ife1.obs_names
araw_psu.obs['barcodes'] = araw_psu.obs_names
araw_ife1.obs['barcodes'] = araw_ife1.obs_names

In [38]:
araw_psu.obs['barcodes'] = araw_psu.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)
araw_ife1.obs['barcodes'] = araw_ife1.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

In [39]:
araw_psu.obs['barcodes'] = araw_psu.obs['barcodes'] + '_psu1'

In [40]:
araw_ife1.obs['barcodes'] = araw_ife1.obs['barcodes'] + '_ife1'

In [41]:
araw_ife1.obs

Unnamed: 0,barcodes-1,barcodes
AAACCCAAGAGCCGTA-1,AAACCCAAGAGCCGTA-1,AAACCCAAGAGCCGTA_ife1
AAACCCAAGCCTAACT-1,AAACCCAAGCCTAACT-1,AAACCCAAGCCTAACT_ife1
AAACCCAAGGACATCG-1,AAACCCAAGGACATCG-1,AAACCCAAGGACATCG_ife1
AAACCCAGTAGTCTGT-1,AAACCCAGTAGTCTGT-1,AAACCCAGTAGTCTGT_ife1
AAACGAACAACTGTGT-1,AAACGAACAACTGTGT-1,AAACGAACAACTGTGT_ife1
...,...,...
TTTGGTTCACTACACA-1,TTTGGTTCACTACACA-1,TTTGGTTCACTACACA_ife1
TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA_ife1
TTTGGTTTCGAGAAGC-1,TTTGGTTTCGAGAAGC-1,TTTGGTTTCGAGAAGC_ife1
TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC_ife1


In [42]:
araw_psu.obs

Unnamed: 0,barcodes-1,barcodes
AAACCCAAGGGAGGGT-1,AAACCCAAGGGAGGGT-1,AAACCCAAGGGAGGGT_psu1
AAACCCAGTAGAATGT-1,AAACCCAGTAGAATGT-1,AAACCCAGTAGAATGT_psu1
AAACGAAGTATCCCTC-1,AAACGAAGTATCCCTC-1,AAACGAAGTATCCCTC_psu1
AAACGAAGTGTGCTTA-1,AAACGAAGTGTGCTTA-1,AAACGAAGTGTGCTTA_psu1
AAACGAAGTTCCGCGA-1,AAACGAAGTTCCGCGA-1,AAACGAAGTTCCGCGA_psu1
...,...,...
TTTGGTTTCCCATAGA-1,TTTGGTTTCCCATAGA-1,TTTGGTTTCCCATAGA_psu1
TTTGGTTTCCCTCGTA-1,TTTGGTTTCCCTCGTA-1,TTTGGTTTCCCTCGTA_psu1
TTTGTTGCACGTCGGT-1,TTTGTTGCACGTCGGT-1,TTTGTTGCACGTCGGT_psu1
TTTGTTGTCCCGAGGT-1,TTTGTTGTCCCGAGGT-1,TTTGTTGTCCCGAGGT_psu1


In [43]:
araw_ife1.obs.index = araw_ife1.obs['barcodes']

In [44]:
araw_psu.obs.index = araw_psu.obs['barcodes']

In [45]:
araw_ife1.obs

Unnamed: 0_level_0,barcodes-1,barcodes
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACCCAAGAGCCGTA_ife1,AAACCCAAGAGCCGTA-1,AAACCCAAGAGCCGTA_ife1
AAACCCAAGCCTAACT_ife1,AAACCCAAGCCTAACT-1,AAACCCAAGCCTAACT_ife1
AAACCCAAGGACATCG_ife1,AAACCCAAGGACATCG-1,AAACCCAAGGACATCG_ife1
AAACCCAGTAGTCTGT_ife1,AAACCCAGTAGTCTGT-1,AAACCCAGTAGTCTGT_ife1
AAACGAACAACTGTGT_ife1,AAACGAACAACTGTGT-1,AAACGAACAACTGTGT_ife1
...,...,...
TTTGGTTCACTACACA_ife1,TTTGGTTCACTACACA-1,TTTGGTTCACTACACA_ife1
TTTGGTTGTGCCTGCA_ife1,TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA_ife1
TTTGGTTTCGAGAAGC_ife1,TTTGGTTTCGAGAAGC-1,TTTGGTTTCGAGAAGC_ife1
TTTGGTTTCGTTACCC_ife1,TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC_ife1


In [46]:
araw_psu.obs

Unnamed: 0_level_0,barcodes-1,barcodes
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACCCAAGGGAGGGT_psu1,AAACCCAAGGGAGGGT-1,AAACCCAAGGGAGGGT_psu1
AAACCCAGTAGAATGT_psu1,AAACCCAGTAGAATGT-1,AAACCCAGTAGAATGT_psu1
AAACGAAGTATCCCTC_psu1,AAACGAAGTATCCCTC-1,AAACGAAGTATCCCTC_psu1
AAACGAAGTGTGCTTA_psu1,AAACGAAGTGTGCTTA-1,AAACGAAGTGTGCTTA_psu1
AAACGAAGTTCCGCGA_psu1,AAACGAAGTTCCGCGA-1,AAACGAAGTTCCGCGA_psu1
...,...,...
TTTGGTTTCCCATAGA_psu1,TTTGGTTTCCCATAGA-1,TTTGGTTTCCCATAGA_psu1
TTTGGTTTCCCTCGTA_psu1,TTTGGTTTCCCTCGTA-1,TTTGGTTTCCCTCGTA_psu1
TTTGTTGCACGTCGGT_psu1,TTTGTTGCACGTCGGT-1,TTTGTTGCACGTCGGT_psu1
TTTGTTGTCCCGAGGT_psu1,TTTGTTGTCCCGAGGT-1,TTTGTTGTCCCGAGGT_psu1


In [47]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix
barcodes_with_suffix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAAGATGAGAGCAATT_body_solebordo,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo
AACTGGTGTACAAGTA_body_solebordo,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo
AAGCCGCTCCAGAAGG_body_solebordo,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo
AAGGTTCAGGTTCCTA_body_solebordo,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo
ACATACGTCAGATAAG_body_solebordo,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGGTTGGTAGG_ife1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1
TTTGGTTCACTACACA_ife1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1
TTTGGTTGTGCCTGCA_ife1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1
TTTGGTTTCGTTACCC_ife1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1


In [48]:
adata.obs.index = adata.obs['barcodes_with_suffix']

In [49]:
araw.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,barcodes-1
barcodes_with_suffix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AAACCTGAGTTAAGTG_body_solebordo,body_solebordo,1971,852,2.790462,AAACCTGAGTTAAGTG-1_1,body,abdomen,14,14,ILC_NK,ILC_NK,AAACCTGAGTTAAGTG,AAACCTGAGTTAAGTG_body_solebordo,AAACCTGAGTTAAGTG-1_1
AAACCTGGTGCAACTT_body_solebordo,body_solebordo,5114,1263,2.718029,AAACCTGGTGCAACTT-1_1,body,abdomen,7,7,DC,DC1,AAACCTGGTGCAACTT,AAACCTGGTGCAACTT_body_solebordo,AAACCTGGTGCAACTT-1_1
AAACCTGTCATGTGGT_body_solebordo,body_solebordo,1860,754,4.623656,AAACCTGTCATGTGGT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AAACCTGTCATGTGGT,AAACCTGTCATGTGGT_body_solebordo,AAACCTGTCATGTGGT-1_1
AAACCTGTCCATTCTA_body_solebordo,body_solebordo,3148,978,1.016518,AAACCTGTCCATTCTA-1_1,body,abdomen,1,1,T cells,Th,AAACCTGTCCATTCTA,AAACCTGTCCATTCTA_body_solebordo,AAACCTGTCCATTCTA-1_1
AAACCTGTCGGATGGA_body_solebordo,body_solebordo,2912,928,2.712912,AAACCTGTCGGATGGA-1_1,body,abdomen,6,6,Macrophages,Macro1_2,AAACCTGTCGGATGGA,AAACCTGTCGGATGGA_body_solebordo,AAACCTGTCGGATGGA-1_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGTTCAACCAATC_bcc_temple2,bcc_temple2,7440,2434,0.268817,TTTGGTTCAACCAATC-1_27,BCCface,temple,1,1,T cells,Th,TTTGGTTCAACCAATC,TTTGGTTCAACCAATC_bcc_temple2,TTTGGTTCAACCAATC-1_27
TTTGGTTTCACCTTAT_bcc_temple2,bcc_temple2,862,492,0.928074,TTTGGTTTCACCTTAT-1_27,BCCface,temple,19,19,Mast cells,MastC,TTTGGTTTCACCTTAT,TTTGGTTTCACCTTAT_bcc_temple2,TTTGGTTTCACCTTAT-1_27
TTTGGTTTCCTACACC_bcc_temple2,bcc_temple2,5155,1573,0.290980,TTTGGTTTCCTACACC-1_27,BCCface,temple,14,14,ILC_NK,ILC_NK,TTTGGTTTCCTACACC,TTTGGTTTCCTACACC_bcc_temple2,TTTGGTTTCCTACACC-1_27
TTTGTTGCAGAACGCA_bcc_temple2,bcc_temple2,2654,1117,0.339111,TTTGTTGCAGAACGCA-1_27,BCCface,temple,1,1,T cells,Th,TTTGTTGCAGAACGCA,TTTGTTGCAGAACGCA_bcc_temple2,TTTGTTGCAGAACGCA-1_27


In [50]:
araw = araw[araw.obs.index.isin(adata.obs.index)]
araw_psu = araw_psu[araw_psu.obs.index.isin(adata.obs.index)]
araw_ife1 = araw_ife1[araw_ife1.obs.index.isin(adata.obs.index)]

In [51]:
araw.obs

Unnamed: 0_level_0,orig.ident,nCount_RNA,nFeature_RNA,percent.mt,names,group,location,integrated_snn_res.0.65,seurat_clusters,celltypes,subcelltypes,barcodes,barcodes_with_suffix,barcodes-1
barcodes_with_suffix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
AAAGATGAGAGCAATT_body_solebordo,body_solebordo,2887,1229,3.221337,AAAGATGAGAGCAATT-1_1,body,abdomen,15,15,Suprabasal keratinocytes,Suprabasal keratinocytes,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo,AAAGATGAGAGCAATT-1_1
AACCATGGTTCTGTTT_body_solebordo,body_solebordo,3052,1267,2.457405,AACCATGGTTCTGTTT-1_1,body,abdomen,11,11,Pericytes,RGS5+ pericytes,AACCATGGTTCTGTTT,AACCATGGTTCTGTTT_body_solebordo,AACCATGGTTCTGTTT-1_1
AACTGGTGTACAAGTA_body_solebordo,body_solebordo,3956,1329,4.802831,AACTGGTGTACAAGTA-1_1,body,abdomen,15,15,Suprabasal keratinocytes,Suprabasal keratinocytes,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo,AACTGGTGTACAAGTA-1_1
AAGCCGCTCCAGAAGG_body_solebordo,body_solebordo,3698,1256,4.245538,AAGCCGCTCCAGAAGG-1_1,body,abdomen,15,15,Suprabasal keratinocytes,Suprabasal keratinocytes,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo,AAGCCGCTCCAGAAGG-1_1
AAGGTTCAGGTTCCTA_body_solebordo,body_solebordo,2037,762,0.147275,AAGGTTCAGGTTCCTA-1_1,body,abdomen,15,15,Suprabasal keratinocytes,Suprabasal keratinocytes,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo,AAGGTTCAGGTTCCTA-1_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTCCTAATCCATTTCA_bcc_temple1,bcc_temple1,5818,2432,3.970437,TTCCTAATCCATTTCA-1_26,BCCface,temple,17,17,Basal keratinocytes,Basal keratinocytes,TTCCTAATCCATTTCA,TTCCTAATCCATTTCA_bcc_temple1,TTCCTAATCCATTTCA-1_26
CTCAACCAGGCATGCA_bcc_temple2,bcc_temple2,2140,873,0.140187,CTCAACCAGGCATGCA-1_27,BCCface,temple,15,15,Suprabasal keratinocytes,Suprabasal keratinocytes,CTCAACCAGGCATGCA,CTCAACCAGGCATGCA_bcc_temple2,CTCAACCAGGCATGCA-1_27
CTCCACAAGCTTTGTG_bcc_temple2,bcc_temple2,639,385,3.442879,CTCCACAAGCTTTGTG-1_27,BCCface,temple,20,20,Melanocytes,Melanocytes,CTCCACAAGCTTTGTG,CTCCACAAGCTTTGTG_bcc_temple2,CTCCACAAGCTTTGTG-1_27
GTGAGTTAGGGATGTC_bcc_temple2,bcc_temple2,4148,1851,3.061716,GTGAGTTAGGGATGTC-1_27,BCCface,temple,17,17,Basal keratinocytes,Basal keratinocytes,GTGAGTTAGGGATGTC,GTGAGTTAGGGATGTC_bcc_temple2,GTGAGTTAGGGATGTC-1_27


In [52]:
list(adata.obs['01_sample'].unique())

['body_solebordo',
 'body_tabib',
 'face_cheek1',
 'face_cheek2',
 'face_cheek3b',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2',
 'psu1',
 'ife1']

In [53]:
araw_psu.obs

Unnamed: 0_level_0,barcodes-1,barcodes
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACCCAAGGGAGGGT_psu1,AAACCCAAGGGAGGGT-1,AAACCCAAGGGAGGGT_psu1
AAACCCAGTAGAATGT_psu1,AAACCCAGTAGAATGT-1,AAACCCAGTAGAATGT_psu1
AAACGAAGTTCCGCGA_psu1,AAACGAAGTTCCGCGA-1,AAACGAAGTTCCGCGA_psu1
AAACGAATCACTCGAA_psu1,AAACGAATCACTCGAA-1,AAACGAATCACTCGAA_psu1
AAACGAATCTCCAATT_psu1,AAACGAATCTCCAATT-1,AAACGAATCTCCAATT_psu1
...,...,...
TTTGGAGTCTGTGCTC_psu1,TTTGGAGTCTGTGCTC-1,TTTGGAGTCTGTGCTC_psu1
TTTGGTTAGGCACCAA_psu1,TTTGGTTAGGCACCAA-1,TTTGGTTAGGCACCAA_psu1
TTTGTTGCACGTCGGT_psu1,TTTGTTGCACGTCGGT-1,TTTGTTGCACGTCGGT_psu1
TTTGTTGTCCCGAGGT_psu1,TTTGTTGTCCCGAGGT-1,TTTGTTGTCCCGAGGT_psu1


In [54]:
araw_ife1.obs

Unnamed: 0_level_0,barcodes-1,barcodes
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACCCAAGAGCCGTA_ife1,AAACCCAAGAGCCGTA-1,AAACCCAAGAGCCGTA_ife1
AAACGCTCATGTCAGT_ife1,AAACGCTCATGTCAGT-1,AAACGCTCATGTCAGT_ife1
AAACGCTGTACAGAAT_ife1,AAACGCTGTACAGAAT-1,AAACGCTGTACAGAAT_ife1
AAAGGGCCAACTCGAT_ife1,AAAGGGCCAACTCGAT-1,AAAGGGCCAACTCGAT_ife1
AAAGGGCGTATGGTTC_ife1,AAAGGGCGTATGGTTC-1,AAAGGGCGTATGGTTC_ife1
...,...,...
TTTGGAGGTTGGTAGG_ife1,TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG_ife1
TTTGGTTCACTACACA_ife1,TTTGGTTCACTACACA-1,TTTGGTTCACTACACA_ife1
TTTGGTTGTGCCTGCA_ife1,TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA_ife1
TTTGGTTTCGTTACCC_ife1,TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC_ife1


In [55]:
1233+1796+6813

9842

In [56]:
araw_psu.obs['01_sample'] = 'psu1'
araw_ife1.obs['01_sample'] = 'ife1'
araw.obs.rename(columns={'orig.ident': '01_sample'}, inplace=True)

  araw_psu.obs['01_sample'] = 'psu1'
  araw_ife1.obs['01_sample'] = 'ife1'


In [57]:
common_genes = set(araw.var_names) & set(araw_psu.var_names) & set(araw_ife1.var_names)

In [58]:
araw = araw[:, araw.var_names.isin(common_genes)]
araw_psu = araw_psu[:, araw_psu.var_names.isin(common_genes)]
araw_ife1 = araw_ife1[:, araw_ife1.var_names.isin(common_genes)]

In [59]:
araw.var

AL627309.1
FAM87B
LINC00115
FAM41C
SAMD11
...
AC008878.1
AC025809.1
PSG8
AL008638.1
LINC01670


In [60]:
araw_psu.var

Unnamed: 0,gene_ids,feature_types
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
AL627309.4,ENSG00000241599,Gene Expression
AL732372.1,ENSG00000236601,Gene Expression
AL669831.2,ENSG00000229905,Gene Expression
...,...,...
AL354822.1,ENSG00000278384,Gene Expression
AC004556.1,ENSG00000276345,Gene Expression
AC233755.2,ENSG00000277856,Gene Expression
AC233755.1,ENSG00000275063,Gene Expression


In [61]:
araw_ife1.var

Unnamed: 0,gene_ids,feature_types
AL627309.1,ENSG00000238009,Gene Expression
AL627309.3,ENSG00000239945,Gene Expression
AL627309.4,ENSG00000241599,Gene Expression
AL732372.1,ENSG00000236601,Gene Expression
AL669831.2,ENSG00000229905,Gene Expression
...,...,...
AL354822.1,ENSG00000278384,Gene Expression
AC004556.1,ENSG00000276345,Gene Expression
AC233755.2,ENSG00000277856,Gene Expression
AC233755.1,ENSG00000275063,Gene Expression


In [62]:
concatenated_ad = ad.concat([araw, araw_psu, araw_ife1])

In [63]:
concatenated_ad.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Raw/concatenated_araw.h5ad')

In [64]:
concatenated_ad.X

<9841x27695 sparse matrix of type '<class 'numpy.float32'>'
	with 18381392 stored elements in Compressed Sparse Row format>

In [65]:
adata.obsm

AxisArraysView with keys: X_pca, X_umap

In [66]:
adata = adata[adata.obs.index != 'TTTGGTTCAGACAAGC_bcc_nose1']

In [67]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix
barcodes_with_suffix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAAGATGAGAGCAATT_body_solebordo,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo
AACTGGTGTACAAGTA_body_solebordo,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo
AAGCCGCTCCAGAAGG_body_solebordo,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo
AAGGTTCAGGTTCCTA_body_solebordo,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo
ACATACGTCAGATAAG_body_solebordo,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGGTTGGTAGG_ife1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1
TTTGGTTCACTACACA_ife1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1
TTTGGTTGTGCCTGCA_ife1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1
TTTGGTTTCGTTACCC_ife1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1


In [68]:
adata.obsm

AxisArraysView with keys: X_pca, X_umap

In [69]:
adata

View of AnnData object with n_obs × n_vars = 9841 × 36565
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions', '01_sample', '02_group', '03_location', '04_celltypes', '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes', 'barcodes_with_suffix'
    var: 'name'
    obsm: 'X_pca', 'X_umap'

In [70]:
araw = concatenated_ad

In [71]:
araw.obs

Unnamed: 0,barcodes,barcodes-1
AAAGATGAGAGCAATT_body_solebordo,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT-1_1
AACCATGGTTCTGTTT_body_solebordo,AACCATGGTTCTGTTT,AACCATGGTTCTGTTT-1_1
AACTGGTGTACAAGTA_body_solebordo,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA-1_1
AAGCCGCTCCAGAAGG_body_solebordo,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG-1_1
AAGGTTCAGGTTCCTA_body_solebordo,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA-1_1
...,...,...
TTTGGAGGTTGGTAGG_ife1,TTTGGAGGTTGGTAGG_ife1,TTTGGAGGTTGGTAGG-1
TTTGGTTCACTACACA_ife1,TTTGGTTCACTACACA_ife1,TTTGGTTCACTACACA-1
TTTGGTTGTGCCTGCA_ife1,TTTGGTTGTGCCTGCA_ife1,TTTGGTTGTGCCTGCA-1
TTTGGTTTCGTTACCC_ife1,TTTGGTTTCGTTACCC_ife1,TTTGGTTTCGTTACCC-1


In [72]:
araw.var

AL627309.1
FAM87B
LINC00115
FAM41C
SAMD11
...
AC008878.1
AC025809.1
PSG8
AL008638.1
LINC01670


In [73]:
adata.var

Unnamed: 0,name
RP11-34P13.7,RP11-34P13.7
AL627309.1,AL627309.1
AP006222.2,AP006222.2
RP4-669L17.10,RP4-669L17.10
RP11-206L10.3,RP11-206L10.3
...,...
AP001505.10,AP001505.10
LL21NC02-21A1.1,LL21NC02-21A1.1
BX004987.4,BX004987.4
AC145212.4,AC145212.4


In [74]:
adata_genes = set(adata.var_names)
araw_genes = set(araw.var_names)
genes_to_remove = adata_genes - araw_genes
adata = adata[:, ~adata.var_names.isin(genes_to_remove)]

In [75]:
adata

View of AnnData object with n_obs × n_vars = 9841 × 27695
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions', '01_sample', '02_group', '03_location', '04_celltypes', '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes', 'barcodes_with_suffix'
    var: 'name'
    obsm: 'X_pca', 'X_umap'

In [76]:
value_counts_adata = adata.obs['01_sample'].value_counts()

In [77]:
value_counts_adata

body_solebordo    2043
psu1              1796
body_tabib        1524
bcc_cheek1        1439
ife1              1233
bcc_forehead1      947
bcc_ear1           235
face_cheek1        184
face_temple1        96
face_forehead3      52
face_forehead4      45
bcc_cheek2          45
bcc_ear2            31
face_forehead1      30
bcc_temple1         28
face_forehead2      28
face_ear2           26
bcc_nose1           11
face_forehead5      10
face_cheek4          9
face_ear1            8
face_cheek3b         7
bcc_temple2          4
face_temple2         4
face_cheek2          3
face_nose1           3
Name: 01_sample, dtype: int64

In [78]:
#indices_adata = adata.obs.index

In [79]:
#concatenated_ad = concatenated_ad[concatenated_ad.obs.index.isin(indices_adata)]

In [80]:
concatenated_ad.obs

Unnamed: 0,barcodes,barcodes-1
AAAGATGAGAGCAATT_body_solebordo,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT-1_1
AACCATGGTTCTGTTT_body_solebordo,AACCATGGTTCTGTTT,AACCATGGTTCTGTTT-1_1
AACTGGTGTACAAGTA_body_solebordo,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA-1_1
AAGCCGCTCCAGAAGG_body_solebordo,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG-1_1
AAGGTTCAGGTTCCTA_body_solebordo,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA-1_1
...,...,...
TTTGGAGGTTGGTAGG_ife1,TTTGGAGGTTGGTAGG_ife1,TTTGGAGGTTGGTAGG-1
TTTGGTTCACTACACA_ife1,TTTGGTTCACTACACA_ife1,TTTGGTTCACTACACA-1
TTTGGTTGTGCCTGCA_ife1,TTTGGTTGTGCCTGCA_ife1,TTTGGTTGTGCCTGCA-1
TTTGGTTTCGTTACCC_ife1,TTTGGTTTCGTTACCC_ife1,TTTGGTTTCGTTACCC-1


In [81]:
concatenated_ad.obs['barcodes_with_suffix'] = concatenated_ad.obs.index

In [82]:
adata_barcodes = set(adata.obs['barcodes_with_suffix'])
araw_barcodes = set(concatenated_ad.obs['barcodes_with_suffix'])

In [83]:
different_elements = adata_barcodes.symmetric_difference(araw_barcodes)


In [84]:
different_elements

set()

In [85]:
count_ttggtt = (adata.obs.index == 'TTTGGTTCAGACAAGC_bcc_nose1').sum()

In [86]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix
barcodes_with_suffix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
AAAGATGAGAGCAATT_body_solebordo,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo
AACTGGTGTACAAGTA_body_solebordo,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo
AAGCCGCTCCAGAAGG_body_solebordo,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo
AAGGTTCAGGTTCCTA_body_solebordo,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo
ACATACGTCAGATAAG_body_solebordo,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGGAGGTTGGTAGG_ife1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1
TTTGGTTCACTACACA_ife1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1
TTTGGTTGTGCCTGCA_ife1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1
TTTGGTTTCGTTACCC_ife1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1


In [87]:
araw= concatenated_ad

In [88]:
araw

AnnData object with n_obs × n_vars = 9841 × 27695
    obs: 'barcodes', 'barcodes-1', 'barcodes_with_suffix'

In [89]:
adata.raw = concatenated_ad

  adata.raw = concatenated_ad


In [90]:
print(adata.X)

  (22, 0)	0.83022225
  (28, 0)	0.8988908
  (34, 0)	1.4766784
  (100, 0)	0.9135694
  (109, 0)	1.2545515
  (243, 0)	0.9065623
  (246, 0)	0.94683385
  (309, 0)	1.0763196
  (337, 0)	1.6551186
  (448, 0)	1.0809301
  (511, 0)	1.1959825
  (872, 0)	1.4951953
  (960, 0)	1.6285896
  (1335, 0)	0.7743494
  (1872, 0)	1.1358175
  (1979, 0)	1.3745601
  (2628, 0)	0.7113832
  (2718, 0)	0.6064011
  (3025, 0)	1.3599588
  (3729, 0)	1.2444329
  (3926, 0)	0.78128296
  (3950, 0)	1.1632196
  (4291, 0)	0.97410846
  (4721, 0)	1.6791689
  (4886, 0)	1.1365354
  :	:
  (6843, 27481)	0.42173466
  (6851, 27481)	0.46856067
  (6871, 27481)	0.4734644
  (6841, 27482)	0.4004618
  (6964, 27482)	0.29123402
  (7529, 27482)	0.35842356
  (7158, 27483)	0.27133977
  (7165, 27487)	0.35682073
  (7590, 27487)	0.19234538
  (7645, 27540)	0.7025274
  (7983, 27540)	0.7074999
  (8561, 27577)	0.2702038
  (7048, 27591)	0.27169582
  (7459, 27591)	0.52655923
  (7987, 27591)	0.38500103
  (8069, 27591)	0.76415706
  (8310, 27591)	0.48423716
  

In [91]:
print(adata.raw.X)

  (0, 10)	1.0
  (0, 14)	1.0
  (0, 16)	1.0
  (0, 18)	1.0
  (0, 24)	1.0
  (0, 34)	1.0
  (0, 69)	2.0
  (0, 75)	7.0
  (0, 92)	1.0
  (0, 93)	1.0
  (0, 98)	1.0
  (0, 101)	2.0
  (0, 157)	1.0
  (0, 168)	1.0
  (0, 174)	1.0
  (0, 175)	1.0
  (0, 179)	1.0
  (0, 184)	1.0
  (0, 192)	2.0
  (0, 193)	1.0
  (0, 199)	1.0
  (0, 201)	1.0
  (0, 220)	1.0
  (0, 229)	7.0
  (0, 245)	9.0
  :	:
  (9840, 101)	10.0
  (9840, 100)	2.0
  (9840, 97)	6.0
  (9840, 94)	1.0
  (9840, 93)	1.0
  (9840, 92)	1.0
  (9840, 88)	1.0
  (9840, 85)	1.0
  (9840, 80)	1.0
  (9840, 77)	2.0
  (9840, 75)	55.0
  (9840, 49)	2.0
  (9840, 40)	2.0
  (9840, 38)	2.0
  (9840, 34)	2.0
  (9840, 32)	1.0
  (9840, 26)	2.0
  (9840, 25)	3.0
  (9840, 24)	2.0
  (9840, 20)	1.0
  (9840, 19)	1.0
  (9840, 18)	2.0
  (9840, 16436)	6.0
  (9840, 17)	1.0
  (9840, 14)	2.0


##### **Variables(var)**

In [92]:
# View the var of anndata and raw object

In [93]:
adata.var

Unnamed: 0,name
AL627309.1,AL627309.1
FAM87B,FAM87B
LINC00115,LINC00115
FAM41C,FAM41C
SAMD11,SAMD11
...,...
AC008878.1,AC008878.1
AC025809.1,AC025809.1
PSG8,PSG8
AL008638.1,AL008638.1


In [94]:
adata.var['name'] = adata.var.index

In [95]:
adata.var

Unnamed: 0,name
AL627309.1,AL627309.1
FAM87B,FAM87B
LINC00115,LINC00115
FAM41C,FAM41C
SAMD11,SAMD11
...,...
AC008878.1,AC008878.1
AC025809.1,AC025809.1
PSG8,PSG8
AL008638.1,AL008638.1


In [96]:
araw.var

AL627309.1
FAM87B
LINC00115
FAM41C
SAMD11
...
AC008878.1
AC025809.1
PSG8
AL008638.1
LINC01670


In [97]:
araw.var['name'] = araw.var.index

In [98]:
araw.var

Unnamed: 0,name
AL627309.1,AL627309.1
FAM87B,FAM87B
LINC00115,LINC00115
FAM41C,FAM41C
SAMD11,SAMD11
...,...
AC008878.1,AC008878.1
AC025809.1,AC025809.1
PSG8,PSG8
AL008638.1,AL008638.1


In [99]:
#Since ensembl ids are not provided in adata.var. Load the GRCh38-1.2.0 reference to match ensembl ids to gene symbols

In [100]:
#gene_info = pd.read_table('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/_genelists/GRCh38-3.0.0.gene_names.tsv')

In [101]:
gene_info = pd.read_table('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/GRCh38-3.0.0.gene_names-2.tsv')

In [102]:
gene_info

Unnamed: 0,ensembl_ids,gene
0,ENSG00000243485,MIR1302-2HG
1,ENSG00000237613,FAM138A
2,ENSG00000186092,OR4F5
3,ENSG00000238009,AL627309.1
4,ENSG00000239945,AL627309.3
...,...,...
33533,ENSG00000277856,AC233755.2
33534,ENSG00000275063,AC233755.1
33535,ENSG00000271254,AC240274.1
33536,ENSG00000277475,AC213203.1


In [103]:
column_names = ['Gene ID','Gene Name']

In [104]:
gene_info.columns = column_names 

In [105]:
gene_info

Unnamed: 0,Gene ID,Gene Name
0,ENSG00000243485,MIR1302-2HG
1,ENSG00000237613,FAM138A
2,ENSG00000186092,OR4F5
3,ENSG00000238009,AL627309.1
4,ENSG00000239945,AL627309.3
...,...,...
33533,ENSG00000277856,AC233755.2
33534,ENSG00000275063,AC233755.1
33535,ENSG00000271254,AC240274.1
33536,ENSG00000277475,AC213203.1


In [106]:
dupl_gene_name = gene_info[gene_info.duplicated(subset= 'Gene Name' , keep=False)]

In [107]:
dupl_gene_name 

Unnamed: 0,Gene ID,Gene Name
2230,ENSG00000143248,RGS5
2232,ENSG00000232995,RGS5
2997,ENSG00000285053,TBCE
2999,ENSG00000284770,TBCE
4798,ENSG00000128655,PDE11A
4799,ENSG00000284741,PDE11A
5435,ENSG00000237940,LINC01238
5438,ENSG00000261186,LINC01238
5832,ENSG00000283706,PRSS50
5833,ENSG00000206549,PRSS50


In [108]:
len(dupl_gene_name)

48

In [109]:
#create a dictionary with gene symbols and ensembl ids from the gene information file

In [110]:
#gene_info_genesym_to_ensembl = dict(zip(gene_info['ensembl_id'],gene_info['gene_symbols']))

In [111]:
gene_info_genesym_to_ensembl=dict(gene_info.drop_duplicates('Gene Name').set_index('Gene Name')['Gene ID'])

In [112]:
gene_info_genesym_to_ensembl

{'MIR1302-2HG': 'ENSG00000243485',
 'FAM138A': 'ENSG00000237613',
 'OR4F5': 'ENSG00000186092',
 'AL627309.1': 'ENSG00000238009',
 'AL627309.3': 'ENSG00000239945',
 'AL627309.2': 'ENSG00000239906',
 'AL627309.4': 'ENSG00000241599',
 'AL732372.1': 'ENSG00000236601',
 'OR4F29': 'ENSG00000284733',
 'AC114498.1': 'ENSG00000235146',
 'OR4F16': 'ENSG00000284662',
 'AL669831.2': 'ENSG00000229905',
 'AL669831.5': 'ENSG00000237491',
 'FAM87B': 'ENSG00000177757',
 'LINC00115': 'ENSG00000225880',
 'FAM41C': 'ENSG00000230368',
 'AL645608.7': 'ENSG00000272438',
 'AL645608.3': 'ENSG00000230699',
 'AL645608.5': 'ENSG00000241180',
 'AL645608.1': 'ENSG00000223764',
 'SAMD11': 'ENSG00000187634',
 'NOC2L': 'ENSG00000188976',
 'KLHL17': 'ENSG00000187961',
 'PLEKHN1': 'ENSG00000187583',
 'PERM1': 'ENSG00000187642',
 'AL645608.8': 'ENSG00000272512',
 'HES4': 'ENSG00000188290',
 'ISG15': 'ENSG00000187608',
 'AL645608.2': 'ENSG00000224969',
 'AGRN': 'ENSG00000188157',
 'AL645608.9': 'ENSG00000273443',
 'RNF223

In [113]:
len(gene_info_genesym_to_ensembl)

33514

In [114]:
#Store ensembl ids in a new column in adata.var by matching gene symbols and ensembl ids from the gene information file

In [115]:
adata.var['ensembl_id'] = adata.var['name'].map(gene_info_genesym_to_ensembl)

In [116]:
araw.var['ensembl_id'] = araw.var['name'].map(gene_info_genesym_to_ensembl)

In [117]:
adata.var

Unnamed: 0,name,ensembl_id
AL627309.1,AL627309.1,ENSG00000238009
FAM87B,FAM87B,ENSG00000177757
LINC00115,LINC00115,ENSG00000225880
FAM41C,FAM41C,ENSG00000230368
SAMD11,SAMD11,ENSG00000187634
...,...,...
AC008878.1,AC008878.1,ENSG00000267952
AC025809.1,AC025809.1,ENSG00000267662
PSG8,PSG8,ENSG00000124467
AL008638.1,AL008638.1,ENSG00000231405


In [118]:
araw.var

Unnamed: 0,name,ensembl_id
AL627309.1,AL627309.1,ENSG00000238009
FAM87B,FAM87B,ENSG00000177757
LINC00115,LINC00115,ENSG00000225880
FAM41C,FAM41C,ENSG00000230368
SAMD11,SAMD11,ENSG00000187634
...,...,...
AC008878.1,AC008878.1,ENSG00000267952
AC025809.1,AC025809.1,ENSG00000267662
PSG8,PSG8,ENSG00000124467
AL008638.1,AL008638.1,ENSG00000231405


In [119]:
nan_ensembl_names = adata.var.loc[adata.var['ensembl_id'].isna(), 'name'].tolist()

In [120]:
nan_ensembl_names

[]

In [121]:
adata.var.loc[adata.var['ensembl_id'].isna(), 'name'].to_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Suppl_info/spatial_ensembl_na.txt',index=False)

In [122]:
len(nan_ensembl_names)

0

In [123]:
adata.var_names = adata.var['ensembl_id']

In [124]:
araw.var_names = araw.var['ensembl_id']

In [125]:
adata.var['ensembl_id'].isna().sum()

0

In [126]:
adata.var.dropna(inplace=True)

In [127]:
araw.var.dropna(inplace=True)

In [128]:
adata.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [129]:
araw.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [130]:
# Load the approved genes file.

In [131]:
approved_genes = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv')

In [132]:
#Create a dictionary from the approved genes file 

In [133]:
genedict = {key: 1 for key in list(approved_genes.feature_id)}

In [134]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [135]:
len(genedict)

119799

In [136]:
#Filter out the genes which are not in the approved genes file.

In [137]:
var_to_keep_adata = [x for x in adata.var_names if (x in genedict)]
var_to_keep_araw = [x for x in araw.var_names if (x in genedict)]

In [138]:
len(var_to_keep_adata)

27405

In [139]:
len(var_to_keep_araw)

27405

In [140]:
adata.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [141]:
araw.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [142]:
# Modify the anndata object by filtering out the filtered genes.

In [143]:
adata = adata[:, var_to_keep_adata].copy()
araw = araw[:, var_to_keep_araw].copy()

In [144]:
adata.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [145]:
araw.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


adata_genes = set(adata.var_names)
araw_genes = set(araw.var_names)
genes_to_remove = adata_genes - araw_genes
adata = adata[:, ~adata.var_names.isin(genes_to_remove)]

In [146]:
adata.var

Unnamed: 0_level_0,name,ensembl_id
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000238009,AL627309.1,ENSG00000238009
ENSG00000177757,FAM87B,ENSG00000177757
ENSG00000225880,LINC00115,ENSG00000225880
ENSG00000230368,FAM41C,ENSG00000230368
ENSG00000187634,SAMD11,ENSG00000187634
...,...,...
ENSG00000267952,AC008878.1,ENSG00000267952
ENSG00000267662,AC025809.1,ENSG00000267662
ENSG00000124467,PSG8,ENSG00000124467
ENSG00000231405,AL008638.1,ENSG00000231405


In [147]:
adata.var = adata.var.iloc[:, 0:0]
araw.var = araw.var.iloc[:, 0:0]

In [148]:
# View var

In [149]:
adata.var

ENSG00000238009
ENSG00000177757
ENSG00000225880
ENSG00000230368
ENSG00000187634
...
ENSG00000267952
ENSG00000267662
ENSG00000124467
ENSG00000231405
ENSG00000279094


In [150]:
araw.var

ENSG00000238009
ENSG00000177757
ENSG00000225880
ENSG00000230368
ENSG00000187634
...
ENSG00000267952
ENSG00000267662
ENSG00000124467
ENSG00000231405
ENSG00000279094


feature is filtered

In [151]:
adata.var['feature_is_filtered'] = [False] * len(adata.var)

In [152]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
ensembl_id,Unnamed: 1_level_1
ENSG00000238009,False
ENSG00000177757,False
ENSG00000225880,False
ENSG00000230368,False
ENSG00000187634,False
...,...
ENSG00000267952,False
ENSG00000267662,False
ENSG00000124467,False
ENSG00000231405,False


In [153]:
araw.var

ENSG00000238009
ENSG00000177757
ENSG00000225880
ENSG00000230368
ENSG00000187634
...
ENSG00000267952
ENSG00000267662
ENSG00000124467
ENSG00000231405
ENSG00000279094


In [154]:
common_genes = list(set(adata.var_names) & set(araw.var_names))

In [155]:
adata_filtered= adata[:, common_genes]

In [156]:
adata = adata_filtered

In [157]:
araw_filtered = araw[:, common_genes]

In [158]:
araw = araw_filtered

In [159]:
araw

View of AnnData object with n_obs × n_vars = 9841 × 27405
    obs: 'barcodes', 'barcodes-1', 'barcodes_with_suffix'

In [160]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
ensembl_id,Unnamed: 1_level_1
ENSG00000167614,False
ENSG00000111906,False
ENSG00000166922,False
ENSG00000104970,False
ENSG00000260213,False
...,...
ENSG00000225783,False
ENSG00000285100,False
ENSG00000166595,False
ENSG00000225216,False


In [161]:
araw.var

ENSG00000167614
ENSG00000111906
ENSG00000166922
ENSG00000104970
ENSG00000260213
...
ENSG00000225783
ENSG00000285100
ENSG00000166595
ENSG00000225216
ENSG00000039123


In [162]:
araw.obs

Unnamed: 0,barcodes,barcodes-1,barcodes_with_suffix
AAAGATGAGAGCAATT_body_solebordo,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT_body_solebordo
AACCATGGTTCTGTTT_body_solebordo,AACCATGGTTCTGTTT,AACCATGGTTCTGTTT-1_1,AACCATGGTTCTGTTT_body_solebordo
AACTGGTGTACAAGTA_body_solebordo,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA_body_solebordo
AAGCCGCTCCAGAAGG_body_solebordo,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG_body_solebordo
AAGGTTCAGGTTCCTA_body_solebordo,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA_body_solebordo
...,...,...,...
TTTGGAGGTTGGTAGG_ife1,TTTGGAGGTTGGTAGG_ife1,TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG_ife1
TTTGGTTCACTACACA_ife1,TTTGGTTCACTACACA_ife1,TTTGGTTCACTACACA-1,TTTGGTTCACTACACA_ife1
TTTGGTTGTGCCTGCA_ife1,TTTGGTTGTGCCTGCA_ife1,TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA_ife1
TTTGGTTTCGTTACCC_ife1,TTTGGTTTCGTTACCC_ife1,TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC_ife1


In [163]:
print(araw.X)

  (0, 10056)	1.0
  (0, 17645)	1.0
  (0, 4019)	1.0
  (0, 19403)	1.0
  (0, 5525)	1.0
  (0, 12279)	1.0
  (0, 24599)	2.0
  (0, 14942)	7.0
  (0, 5783)	1.0
  (0, 18593)	1.0
  (0, 4830)	1.0
  (0, 14797)	2.0
  (0, 891)	1.0
  (0, 9493)	1.0
  (0, 29)	1.0
  (0, 18386)	1.0
  (0, 303)	1.0
  (0, 24216)	1.0
  (0, 5303)	2.0
  (0, 21082)	1.0
  (0, 4238)	1.0
  (0, 14384)	1.0
  (0, 17787)	1.0
  (0, 11916)	7.0
  (0, 26540)	9.0
  :	:
  (9840, 14797)	10.0
  (9840, 23334)	2.0
  (9840, 6164)	6.0
  (9840, 23785)	1.0
  (9840, 18593)	1.0
  (9840, 5783)	1.0
  (9840, 12234)	1.0
  (9840, 16589)	1.0
  (9840, 9279)	1.0
  (9840, 9044)	2.0
  (9840, 14942)	55.0
  (9840, 8860)	2.0
  (9840, 21334)	2.0
  (9840, 16481)	2.0
  (9840, 12279)	2.0
  (9840, 26647)	1.0
  (9840, 21009)	2.0
  (9840, 17749)	3.0
  (9840, 5525)	2.0
  (9840, 7865)	1.0
  (9840, 8907)	1.0
  (9840, 19403)	2.0
  (9840, 27354)	6.0
  (9840, 32)	1.0
  (9840, 17645)	2.0


#### **obs (Cell metadata)**

In [164]:
#view obs

In [165]:
adata.obs.index = adata.obs['barcodes-1']

In [166]:
araw.obs.index = araw.obs['barcodes-1']

In [167]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1


In [168]:
#mask = (adata.obs_names.str.contains('_24')) & (adata.obs['orig.ident'] == 'bcc_nose1')

In [169]:
#filtered_cells = adata[~mask]

In [170]:
#adata = filtered_cells.copy()

In [171]:
adata

View of AnnData object with n_obs × n_vars = 9841 × 27405
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions', '01_sample', '02_group', '03_location', '04_celltypes', '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes', 'barcodes_with_suffix'
    var: 'feature_is_filtered'
    obsm: 'X_pca', 'X_umap'

In [172]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1


In [173]:
# view the column names in obs

In [174]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions',
       '01_sample', '02_group', '03_location', '04_celltypes',
       '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes',
       'barcodes_with_suffix'],
      dtype='object')

#### **assay_ontology_term_id**

In [175]:
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)

  adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(r'([ATGC]{10,16})', expand=False)


In [176]:
list(adata.obs['01_sample'].unique())

['body_solebordo',
 'body_tabib',
 'face_cheek1',
 'face_cheek2',
 'face_cheek3b',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2',
 'psu1',
 'ife1']

In [177]:
# adata.obs['assay']= None

In [178]:
mapping = {'psu1' : 'EFO:0009922','ife1':'EFO:0009922', 'body_solebordo':'EFO:0009899','body_tabib':'EFO:0009901','face_cheek1':'EFO:0009922', 'face_cheek2':'EFO:0009922', 'face_cheek3b':'EFO:0009922', 'face_cheek4':'EFO:0009922', 'face_ear1':'EFO:0009922', 'face_ear2':'EFO:0009922', 'face_forehead1':'EFO:0009922', 'face_forehead2':'EFO:0009922', 'face_forehead3':'EFO:0009922', 'face_forehead4':'EFO:0009922', 'face_forehead5':'EFO:0009922', 'face_nose1':'EFO:0009922', 'face_temple1':'EFO:0009922', 'face_temple2':'EFO:0009922', 'bcc_cheek1':'EFO:0009922', 'bcc_cheek2':'EFO:0009922', 'bcc_ear1':'EFO:0009922', 'bcc_ear2':'EFO:0009922', 'bcc_forehead1':'EFO:0009922', 'bcc_nose1':'EFO:0009922', 'bcc_temple1':'EFO:0009922', 'bcc_temple2':'EFO:0009922',}

In [179]:
adata.obs['assay_ontology_term_id'] = adata.obs['01_sample'].map(mapping)

In [180]:
list(adata.obs['assay_ontology_term_id'].unique())

['EFO:0009899', 'EFO:0009901', 'EFO:0009922']

In [181]:
adata.obs['assay_ontology_term_id'] = adata.obs['assay_ontology_term_id'].astype('category')

In [182]:
#assay_info = pd.read_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Megagut/Suppl_info/barcode_assay_summary.csv')

In [183]:
#mapping = dict(zip(assay_info['barcode'], assay_info['assay_summary']))

In [184]:
#adata.obs['assay'] = adata.obs['barcodes'].map(mapping)

In [185]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions',
       '01_sample', '02_group', '03_location', '04_celltypes',
       '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes',
       'barcodes_with_suffix', 'assay_ontology_term_id'],
      dtype='object')

In [186]:
#unique_combinations = adata.obs[['01_sample', 'assay']].drop_duplicates()

In [187]:
#unique_combinations.to_csv('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Megagut/Suppl_info/unique_combinations.csv', index=False)


In [188]:
# view adata.obs

In [189]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix,assay_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo,EFO:0009899
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo,EFO:0009899
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo,EFO:0009899
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo,EFO:0009899
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo,EFO:0009899
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1,EFO:0009922
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1,EFO:0009922
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1,EFO:0009922
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1,EFO:0009922


#### **cell_type_ontology_term_id**

In [190]:
#identify the column in adata.obs related. to cell type annotation

In [191]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions',
       '01_sample', '02_group', '03_location', '04_celltypes',
       '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes',
       'barcodes_with_suffix', 'assay_ontology_term_id'],
      dtype='object')

In [192]:
list(adata.obs['05_subcelltypes'].unique())

['IFE spinous K',
 'IFE basal K',
 'Transitional K',
 'Dividing K',
 'Outer bulb K',
 'Secretory luminal cells',
 'Upper hair follicle K',
 'Inner bulb K',
 'Sebocytes',
 'Granular K']

In [193]:
mapping= {'IFE spinous K' :'CL:2000092',
 'IFE basal K' :'CL:0002187',
 'Transitional K':'CL:2000092',
 'Dividing K':'CL:2000092',
 'Outer bulb K':'CL:2000092',
 'Secretory luminal cells' :'CL:2000092',
 'Upper hair follicle K':'CL:2000092',
 'Inner bulb K':'CL:0000312',
 'Sebocytes':'CL:0000317',
 'Granular K':'CL:0002189'}

In [194]:
# create a dictionary of cell type and ontology term

In [195]:
# add the cell_type_ontology_term_id column

In [196]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['05_subcelltypes'].map(mapping)

In [197]:
# change datatype of the column

In [198]:
adata.obs['cell_type_ontology_term_id'] = adata.obs['cell_type_ontology_term_id'].astype('category')

In [199]:
# view adata.obs

In [200]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo,EFO:0009899,CL:2000092
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo,EFO:0009899,CL:0002187
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo,EFO:0009899,CL:0002187
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo,EFO:0009899,CL:2000092
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo,EFO:0009899,CL:0002187
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1,EFO:0009922,CL:2000092
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1,EFO:0009922,CL:2000092
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1,EFO:0009922,CL:2000092
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1,EFO:0009922,CL:0002189


In [201]:
list(adata.obs['01_sample'].unique())

['body_solebordo',
 'body_tabib',
 'face_cheek1',
 'face_cheek2',
 'face_cheek3b',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2',
 'psu1',
 'ife1']

#### **development_stage_ontology_term_id**

In [202]:
list(adata.obs['01_sample'].unique())

['body_solebordo',
 'body_tabib',
 'face_cheek1',
 'face_cheek2',
 'face_cheek3b',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2',
 'psu1',
 'ife1']

In [203]:
# identify the column in adata which corresponds to age

In [204]:
mapping= {'body_solebordo':'HsapDv:0000087',
 'face_cheek1':'HsapDv:0000211',
 'body_tabib':'HsapDv:0000087',
 'face_cheek2':'HsapDv:0000156',
 'face_cheek3b':'HsapDv:0000207',
 'face_cheek4':'HsapDv:0000216',
 'face_ear1':'HsapDv:0000171',
 'face_ear2':'HsapDv:0000212',
 'face_forehead1':'HsapDv:0000211',
 'face_forehead2':'HsapDv:0000164',
 'face_forehead3':'HsapDv:0000206',
 'face_forehead4':'HsapDv:0000216',
 'face_forehead5':'HsapDv:0000167',
 'face_nose1':'HsapDv:0000150',
 'face_temple1':'HsapDv:0000153',
 'face_temple2':'HsapDv:0000172',
 'bcc_cheek1':'HsapDv:0000133',
 'bcc_cheek2':'HsapDv:0000211',
 'bcc_ear1':'HsapDv:0000164',
 'bcc_ear2':'HsapDv:0000163',
 'bcc_forehead1':'HsapDv:0000167',
 'bcc_nose1':'HsapDv:0000217',
 'bcc_temple1':'HsapDv:0000145',
 'bcc_temple2':'HsapDv:0000153',
 'psu1' :'HsapDv:0000148',
 'ife1':'HsapDv:0000148'}

In [205]:
adata.obs['development_stage_ontology_term_id']=  adata.obs['01_sample'].map(mapping)

In [206]:
# change datatype of the column

In [207]:
adata.obs['development_stage_ontology_term_id'] = adata.obs['development_stage_ontology_term_id'].astype('category')

In [208]:
# view unique values of development_stage_ontology_term_id column

In [209]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['HsapDv:0000087',
 'HsapDv:0000211',
 'HsapDv:0000156',
 'HsapDv:0000207',
 'HsapDv:0000216',
 'HsapDv:0000171',
 'HsapDv:0000212',
 'HsapDv:0000164',
 'HsapDv:0000206',
 'HsapDv:0000167',
 'HsapDv:0000150',
 'HsapDv:0000153',
 'HsapDv:0000172',
 'HsapDv:0000133',
 'HsapDv:0000163',
 'HsapDv:0000217',
 'HsapDv:0000145',
 'HsapDv:0000148']

In [210]:
# view adata.obs

In [211]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1,EFO:0009922,CL:2000092,HsapDv:0000148
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1,EFO:0009922,CL:0002189,HsapDv:0000148


#### **donor_id**

In [212]:
#identify the column in adata.obs which provides donor information

In [213]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions',
       '01_sample', '02_group', '03_location', '04_celltypes',
       '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes',
       'barcodes_with_suffix', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id'],
      dtype='object')

In [214]:
# add the donor_id column

In [215]:
adata.obs['donor_id'] = adata.obs['01_sample']

In [216]:
adata.obs['donor_id'].replace({'body_tabib': 'pooled_body_tabib'}, inplace=True)

In [217]:
adata.obs['donor_id'].replace({'body_solebordo': 'pooled_body_solebordo'}, inplace=True)

In [218]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [219]:
list(adata.obs['donor_id'].unique())

['pooled_body_solebordo',
 'pooled_body_tabib',
 'face_cheek1',
 'face_cheek2',
 'face_cheek3b',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2',
 'psu1',
 'ife1']

#### **disease_ontology_term_id**

In [220]:
# Assign normal since all are healthy patients

In [221]:
# add the disease_ontology_term_id column

In [222]:
adata.obs['disease_ontology_term_id']= ['MONDO:0020804' if 'bcc' in val else 'PATO:0000461' for val in adata.obs['01_sample']]

In [223]:
# change datatype of the column

In [224]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [225]:
# view obs

In [226]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461


#### **is_primary_data**

In [227]:
adata.obs['is_primary_data'] = [False] * len(adata.obs)

In [228]:
adata.obs['is_primary_data'][(adata.obs['01_sample'] == 'psu1') | (adata.obs['01_sample']== 'ife1')] = True

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adata.obs['is_primary_data'][(adata.obs['01_sample'] == 'psu1') | (adata.obs['01_sample']== 'ife1')] = True


In [229]:
list(adata.obs['is_primary_data'].unique())

[False, True]

In [230]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461,True


In [231]:
#change data type of column

In [232]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [233]:
# view obs

In [234]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461,True


#### **organism_ontology_term_id**

In [235]:
# assign organism id 

In [236]:
adata.obs['organism_ontology_term_id'] = ['NCBITaxon:9606'] * len(adata.obs)

In [237]:
#change data type of column

In [238]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [239]:
# view obs

In [240]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,barcodes-1,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,bcc_AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,bcc_ACATACGTCAGATAAG-1_1,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,ife_TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTCACTACACA-1,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,ife_TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,ife_TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606


#### **self_reported_ethnicity_ontology_term_id**

In [241]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = ['unknown'] * len(adata.obs)

In [242]:
# change data type

In [243]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [244]:
# view obs

In [245]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,...,barcodes,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,ACATACGTCAGATAAG,ACATACGTCAGATAAG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,...,TTTGGAGGTTGGTAGG,TTTGGAGGTTGGTAGG_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,TTTGGTTCACTACACA,TTTGGTTCACTACACA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,TTTGGTTGTGCCTGCA,TTTGGTTGTGCCTGCA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,...,TTTGGTTTCGTTACCC,TTTGGTTTCGTTACCC_ife1,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown


#### **sex_ontology_term_id**

In [246]:
# identify the column in adata.obs which corresponds to sex

In [247]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions',
       '01_sample', '02_group', '03_location', '04_celltypes',
       '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes',
       'barcodes_with_suffix', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [248]:
list(adata.obs['01_sample'].unique())

['body_solebordo',
 'body_tabib',
 'face_cheek1',
 'face_cheek2',
 'face_cheek3b',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2',
 'psu1',
 'ife1']

In [249]:
mapping = {'body_solebordo' :'PATO:0000384',
 'face_cheek1':'PATO:0000384',
 'body_tabib':'unknown',
 'face_cheek2':'PATO:0000384',
 'face_cheek3b':'PATO:0000383',
 'face_cheek4':'PATO:0000384',
 'face_ear1':'PATO:0000384',
 'face_ear2':'PATO:0000384',
 'face_forehead1':'PATO:0000384',
 'face_forehead2':'PATO:0000384',
 'face_forehead3':'PATO:0000384',
 'face_forehead4':'PATO:0000384',
 'face_forehead5':'PATO:0000384',
 'face_nose1':'PATO:0000384',
 'face_temple1':'PATO:0000384',
 'face_temple2':'PATO:0000384',
 'bcc_cheek1':'PATO:0000384',
 'bcc_cheek2':'PATO:0000384',
 'bcc_ear1':'PATO:0000384',
 'bcc_ear2':'PATO:0000384',
 'bcc_forehead1':'PATO:0000384',
 'bcc_nose1':'PATO:0000384',
 'bcc_temple1':'PATO:0000384',
 'bcc_temple2':'PATO:0000384',
 'psu1' :'PATO:0000383',
 'ife1' :'PATO:0000383'}

In [250]:
adata.obs['sex_ontology_term_id'] = adata.obs['01_sample'].map(mapping)

In [251]:
# change data type

In [252]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [253]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,...,barcodes_with_suffix,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,AAAGATGAGAGCAATT_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,AACTGGTGTACAAGTA_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,AAGCCGCTCCAGAAGG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,AAGGTTCAGGTTCCTA_body_solebordo,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,ACATACGTCAGATAAG_body_solebordo,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,...,TTTGGAGGTTGGTAGG_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,TTTGGTTCACTACACA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,TTTGGTTGTGCCTGCA_ife1,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,...,TTTGGTTTCGTTACCC_ife1,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383


In [254]:
list(adata.obs['sex_ontology_term_id'].unique())

['PATO:0000384', 'unknown', 'PATO:0000383']

#### **suspension_type**

In [255]:
adata.obs['suspension_type'] = ['cell'] * len(adata.obs)

In [256]:
# change data type of column

In [257]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [258]:
# view obs

In [259]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,...,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,...,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,...,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell


#### **tissue_type**

In [260]:
adata.obs['tissue_type'] = ['tissue'] * len(adata.obs)

In [261]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [262]:
# identify the column in adata.obs which corresponds to tissue

In [263]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions',
       '01_sample', '02_group', '03_location', '04_celltypes',
       '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes',
       'barcodes_with_suffix', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type'],
      dtype='object')

In [264]:
list(adata.obs['03_location'].unique())

['abdomen', 'arm', 'cheek', 'ear', 'forehead', 'nose', 'temple', 'scalp']

In [265]:
# create a dictionary with tissue and corresponding tissue ontology term id

In [266]:
mapping ={'abdomen':'UBERON:0001416', 'cheek':'UBERON:0008803', 'arm':'UBERON:0002427', 'ear':'UBERON:0001459', 'forehead':'UBERON:0016475', 'nose':'UBERON:0015476', 'temple':'UBERON:8480028', 'scalp' :'UBERON:0000403'}

In [267]:
# add 'tissue_ontology_term_id' column

In [268]:
adata.obs['tissue_ontology_term_id'] = adata.obs['03_location'].map(mapping)

In [269]:
# change data type of column

In [270]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [271]:
#list the unique values in 'tissue_ontology_term_id' column

In [272]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0001416',
 'UBERON:0002427',
 'UBERON:0008803',
 'UBERON:0001459',
 'UBERON:0016475',
 'UBERON:0015476',
 'UBERON:8480028',
 'UBERON:0000403']

In [273]:
# view obs

In [274]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,...,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,...,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403


In [275]:
adata.obs['01_sample']= adata.obs['01_sample'].replace('face_cheek3b', 'face_cheek3')

#### **obsm (Embeddings)**

In [276]:
# view obsm

In [277]:
# check whether all columns are prefixed with X

In [278]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

#### **uns (Dataset Metadata)**

In [279]:
# View

In [280]:
adata.uns

OverloadedDict, wrapping:
	OrderedDict()
With overloaded keys:
	['neighbors'].

In [281]:
adata.uns.keys

<bound method OverloadedDict.keys of OverloadedDict, wrapping:
	OrderedDict()
With overloaded keys:
	['neighbors'].>

In [282]:
# Give a title for the dataset

In [283]:
adata.uns['title'] = 'Epithelial bcc and normal_scRNAseq'

In [284]:
# Set the default embedding

In [285]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [286]:
# view anndata object

In [287]:
adata

AnnData object with n_obs × n_vars = 9841 × 27405
    obs: 'nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions', '01_sample', '02_group', '03_location', '04_celltypes', '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes', 'barcodes_with_suffix', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'feature_is_filtered'
    uns: 'title', 'default_embedding'
    obsm: 'X_pca', 'X_umap'

In [288]:
# view obs and var data types

In [289]:
adata.obs.dtypes

nCount_RNA                                   float64
nFeature_RNA                                   int32
percent.mt                                   float64
00_conditions                               category
01_sample                                     object
02_group                                    category
03_location                                 category
04_celltypes                                category
05_subcelltypes                             category
06_seurat_clusters                          category
barcodes-1                                    object
barcodes                                      object
barcodes_with_suffix                          object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
donor_id                                    category
disease_ontology_term_id                    category
is_primary_data                               

In [290]:
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.var[c] = adata.var[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.var[c] = adata.var[c].astype('int32') 
    print(f"changed {c} from int64 to int32")

In [291]:
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for c in dty[dty['dtype'] == 'float64'].index.values:
    adata.obs[c] = adata.obs[c].astype('float32')
    print(f"changed {c} from float64 to float32")
for c in dty[dty['dtype'] == 'int64'].index.values:
    adata.obs[c] = adata.obs[c].astype('int32') 
    print(f"changed {c} from int64 to int32")
for c in dty[dty['dtype'] == 'object'].index.values:
    adata.obs[c] = adata.obs[c].astype('category') 
    print(f"changed {c} from object to category")

changed nCount_RNA from float64 to float32
changed percent.mt from float64 to float32
changed 01_sample from object to category
changed barcodes-1 from object to category
changed barcodes from object to category
changed barcodes_with_suffix from object to category


In [292]:
# view obs

In [293]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,01_sample,02_group,03_location,04_celltypes,05_subcelltypes,06_seurat_clusters,...,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE spinous K,1,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body_solebordo,body,abdomen,Suprabasal K,IFE basal K,2,...,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,ife1,face,scalp,N/A (microdissected data),Transitional K,3,...,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,ife1,face,scalp,N/A (microdissected data),Outer bulb K,4,...,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,ife1,face,scalp,N/A (microdissected data),Granular K,9,...,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403


In [294]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions',
       '01_sample', '02_group', '03_location', '04_celltypes',
       '05_subcelltypes', '06_seurat_clusters', 'barcodes-1', 'barcodes',
       'barcodes_with_suffix', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [295]:
adata.obs.index=  adata.obs['barcodes-1']

In [296]:
araw.obs.index=  araw.obs['barcodes-1']

In [297]:
# delete unwanted columns in obs

In [298]:
del adata.obs['01_sample']
del adata.obs['03_location']

del adata.obs['barcodes']
del adata.obs['barcodes-1']
del adata.obs['barcodes_with_suffix']

In [299]:
# view obs

In [300]:
adata.obs

Unnamed: 0_level_0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,02_group,04_celltypes,05_subcelltypes,06_seurat_clusters,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body,Suprabasal K,IFE spinous K,1,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body,Suprabasal K,IFE basal K,2,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body,Suprabasal K,IFE basal K,2,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body,Suprabasal K,IFE spinous K,1,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body,Suprabasal K,IFE basal K,2,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,face,N/A (microdissected data),Transitional K,3,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,face,N/A (microdissected data),Outer bulb K,4,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,face,N/A (microdissected data),Outer bulb K,4,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,face,N/A (microdissected data),Granular K,9,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403


In [301]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions', '02_group',
       '04_celltypes', '05_subcelltypes', '06_seurat_clusters',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [302]:
# view var

In [303]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
ensembl_id,Unnamed: 1_level_1
ENSG00000167614,False
ENSG00000111906,False
ENSG00000166922,False
ENSG00000104970,False
ENSG00000260213,False
...,...
ENSG00000225783,False
ENSG00000285100,False
ENSG00000166595,False
ENSG00000225216,False


In [304]:
araw.var

ENSG00000167614
ENSG00000111906
ENSG00000166922
ENSG00000104970
ENSG00000260213
...
ENSG00000225783
ENSG00000285100
ENSG00000166595
ENSG00000225216
ENSG00000039123


In [305]:
#view uns

In [306]:
adata.uns

OverloadedDict, wrapping:
	OrderedDict([('title', 'Epithelial bcc and normal_scRNAseq'), ('default_embedding', 'X_umap')])
With overloaded keys:
	['neighbors'].

In [307]:
list(adata.uns.keys())

['title', 'default_embedding']

In [308]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions', '02_group',
       '04_celltypes', '05_subcelltypes', '06_seurat_clusters',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [309]:
# Remove unwanted columns in uns

In [310]:
#check the format of expression matrix

In [311]:
adata.X

<9841x27405 sparse matrix of type '<class 'numpy.float32'>'
	with 18336633 stored elements in Compressed Sparse Column format>

In [312]:
adata.X = scipy.sparse.csr_matrix(adata.X)

In [313]:
adata.X

<9841x27405 sparse matrix of type '<class 'numpy.float32'>'
	with 18336633 stored elements in Compressed Sparse Row format>

In [314]:
araw.X

<9841x27405 sparse matrix of type '<class 'numpy.float32'>'
	with 18344980 stored elements in Compressed Sparse Row format>

In [315]:
#Copy raw counts to adata.raw

In [316]:
adata.raw = araw

In [317]:
print(adata.X)

  (0, 1)	1.4960011
  (0, 19)	1.4960011
  (0, 21)	2.698351
  (0, 22)	1.4960011
  (0, 24)	2.4328597
  (0, 27)	1.4960011
  (0, 29)	1.4960011
  (0, 58)	2.9079397
  (0, 128)	3.0811217
  (0, 140)	1.4960011
  (0, 211)	2.4328597
  (0, 214)	2.698351
  (0, 244)	2.0703511
  (0, 273)	1.4960011
  (0, 303)	1.4960011
  (0, 377)	2.0703511
  (0, 417)	1.4960011
  (0, 478)	1.4960011
  (0, 487)	1.4960011
  (0, 496)	1.4960011
  (0, 518)	1.4960011
  (0, 528)	1.4960011
  (0, 543)	1.4960011
  (0, 566)	1.4960011
  (0, 626)	1.4960011
  :	:
  (9840, 27247)	0.62944984
  (9840, 27252)	0.36345416
  (9840, 27253)	0.36345416
  (9840, 27265)	0.36345416
  (9840, 27267)	1.0127478
  (9840, 27268)	0.62944984
  (9840, 27270)	0.36345416
  (9840, 27271)	0.62944984
  (9840, 27282)	0.36345416
  (9840, 27284)	0.36345416
  (9840, 27285)	0.36345416
  (9840, 27295)	0.83935213
  (9840, 27301)	0.36345416
  (9840, 27305)	0.36345416
  (9840, 27320)	0.83935213
  (9840, 27327)	0.62944984
  (9840, 27337)	0.62944984
  (9840, 27347)	0.3634

In [318]:
print(adata.raw.X)

  (0, 10056)	1.0
  (0, 17645)	1.0
  (0, 4019)	1.0
  (0, 19403)	1.0
  (0, 5525)	1.0
  (0, 12279)	1.0
  (0, 24599)	2.0
  (0, 14942)	7.0
  (0, 5783)	1.0
  (0, 18593)	1.0
  (0, 4830)	1.0
  (0, 14797)	2.0
  (0, 891)	1.0
  (0, 9493)	1.0
  (0, 29)	1.0
  (0, 18386)	1.0
  (0, 303)	1.0
  (0, 24216)	1.0
  (0, 5303)	2.0
  (0, 21082)	1.0
  (0, 4238)	1.0
  (0, 14384)	1.0
  (0, 17787)	1.0
  (0, 11916)	7.0
  (0, 26540)	9.0
  :	:
  (9840, 14797)	10.0
  (9840, 23334)	2.0
  (9840, 6164)	6.0
  (9840, 23785)	1.0
  (9840, 18593)	1.0
  (9840, 5783)	1.0
  (9840, 12234)	1.0
  (9840, 16589)	1.0
  (9840, 9279)	1.0
  (9840, 9044)	2.0
  (9840, 14942)	55.0
  (9840, 8860)	2.0
  (9840, 21334)	2.0
  (9840, 16481)	2.0
  (9840, 12279)	2.0
  (9840, 26647)	1.0
  (9840, 21009)	2.0
  (9840, 17749)	3.0
  (9840, 5525)	2.0
  (9840, 7865)	1.0
  (9840, 8907)	1.0
  (9840, 19403)	2.0
  (9840, 27354)	6.0
  (9840, 32)	1.0
  (9840, 17645)	2.0


In [319]:
adata.raw.X[0, 5680]

3.0

In [320]:
adata.X[0, 5680]

2.4328597

In [321]:
adata.obs_names.name = None

In [322]:
adata.write('/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/spatial-skin/Final_objects/epithelial_bcc_and_normal.h5ad', compression = 'gzip')

In [323]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,02_group,04_celltypes,05_subcelltypes,06_seurat_clusters,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body,Suprabasal K,IFE spinous K,1,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body,Suprabasal K,IFE basal K,2,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body,Suprabasal K,IFE basal K,2,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body,Suprabasal K,IFE spinous K,1,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body,Suprabasal K,IFE basal K,2,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,face,N/A (microdissected data),Transitional K,3,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,face,N/A (microdissected data),Outer bulb K,4,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,face,N/A (microdissected data),Outer bulb K,4,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,face,N/A (microdissected data),Granular K,9,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403


In [324]:
araw.obs

Unnamed: 0_level_0,barcodes,barcodes-1,barcodes_with_suffix
barcodes-1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT,AAAGATGAGAGCAATT-1_1,AAAGATGAGAGCAATT_body_solebordo
AACCATGGTTCTGTTT-1_1,AACCATGGTTCTGTTT,AACCATGGTTCTGTTT-1_1,AACCATGGTTCTGTTT_body_solebordo
AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA,AACTGGTGTACAAGTA-1_1,AACTGGTGTACAAGTA_body_solebordo
AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG,AAGCCGCTCCAGAAGG-1_1,AAGCCGCTCCAGAAGG_body_solebordo
AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA,AAGGTTCAGGTTCCTA-1_1,AAGGTTCAGGTTCCTA_body_solebordo
...,...,...,...
TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG_ife1,TTTGGAGGTTGGTAGG-1,TTTGGAGGTTGGTAGG_ife1
TTTGGTTCACTACACA-1,TTTGGTTCACTACACA_ife1,TTTGGTTCACTACACA-1,TTTGGTTCACTACACA_ife1
TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA_ife1,TTTGGTTGTGCCTGCA-1,TTTGGTTGTGCCTGCA_ife1
TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC_ife1,TTTGGTTTCGTTACCC-1,TTTGGTTTCGTTACCC_ife1


In [325]:
adata.obs

Unnamed: 0,nCount_RNA,nFeature_RNA,percent.mt,00_conditions,02_group,04_celltypes,05_subcelltypes,06_seurat_clusters,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
bcc_AAAGATGAGAGCAATT-1_1,2887.0,1229,3.221337,healthy,body,Suprabasal K,IFE spinous K,1,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AACTGGTGTACAAGTA-1_1,3956.0,1329,4.802831,healthy,body,Suprabasal K,IFE basal K,2,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGCCGCTCCAGAAGG-1_1,3698.0,1256,4.245538,healthy,body,Suprabasal K,IFE basal K,2,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_AAGGTTCAGGTTCCTA-1_1,2037.0,762,0.147275,healthy,body,Suprabasal K,IFE spinous K,1,EFO:0009899,CL:2000092,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
bcc_ACATACGTCAGATAAG-1_1,7270.0,1900,3.658872,healthy,body,Suprabasal K,IFE basal K,2,EFO:0009899,CL:0002187,HsapDv:0000087,pooled_body_solebordo,PATO:0000461,False,NCBITaxon:9606,unknown,PATO:0000384,cell,tissue,UBERON:0001416
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ife_TTTGGAGGTTGGTAGG-1,571.0,442,0.175131,healthy,face,N/A (microdissected data),Transitional K,3,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTCACTACACA-1,11141.0,2884,0.430841,healthy,face,N/A (microdissected data),Outer bulb K,4,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTGTGCCTGCA-1,11746.0,2779,2.843521,healthy,face,N/A (microdissected data),Outer bulb K,4,EFO:0009922,CL:2000092,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403
ife_TTTGGTTTCGTTACCC-1,2844.0,998,1.793249,healthy,face,N/A (microdissected data),Granular K,9,EFO:0009922,CL:0002189,HsapDv:0000148,ife1,PATO:0000461,True,NCBITaxon:9606,unknown,PATO:0000383,cell,tissue,UBERON:0000403


In [326]:
list(adata.obs['donor_id'].unique())

['pooled_body_solebordo',
 'pooled_body_tabib',
 'face_cheek1',
 'face_cheek2',
 'face_cheek3b',
 'face_cheek4',
 'face_ear1',
 'face_ear2',
 'face_forehead1',
 'face_forehead2',
 'face_forehead3',
 'face_forehead4',
 'face_forehead5',
 'face_nose1',
 'face_temple1',
 'face_temple2',
 'bcc_cheek1',
 'bcc_cheek2',
 'bcc_ear1',
 'bcc_ear2',
 'bcc_forehead1',
 'bcc_nose1',
 'bcc_temple1',
 'bcc_temple2',
 'psu1',
 'ife1']

In [327]:
unique_combinations = adata.obs[['donor_id', 'assay_ontology_term_id']].drop_duplicates()

In [328]:
unique_combinations

Unnamed: 0,donor_id,assay_ontology_term_id
bcc_AAAGATGAGAGCAATT-1_1,pooled_body_solebordo,EFO:0009899
bcc_SC1control_AAATCCCTCTTAGG.1_4,pooled_body_tabib,EFO:0009901
bcc_AAACGCTCATGAATAG-1_2,face_cheek1,EFO:0009922
bcc_AGTGCCGTCCATTTGT-1_5,face_cheek2,EFO:0009922
bcc_ACGTAGTGTACGCTTA-1_7,face_cheek3b,EFO:0009922
bcc_CCCATTGCAGCTTTCC-1_8,face_cheek4,EFO:0009922
bcc_ACGTAGTAGTAACGAT-1_9,face_ear1,EFO:0009922
bcc_AAAGAACGTCTGATCA-1_10,face_ear2,EFO:0009922
bcc_AATTCCTCAGGCTATT-1_11,face_forehead1,EFO:0009922
bcc_AAATGGACAAGTGGTG-1_12,face_forehead2,EFO:0009922


In [329]:
adata.var

Unnamed: 0_level_0,feature_is_filtered
ensembl_id,Unnamed: 1_level_1
ENSG00000167614,False
ENSG00000111906,False
ENSG00000166922,False
ENSG00000104970,False
ENSG00000260213,False
...,...
ENSG00000225783,False
ENSG00000285100,False
ENSG00000166595,False
ENSG00000225216,False


In [330]:
adata.raw.var

ENSG00000167614
ENSG00000111906
ENSG00000166922
ENSG00000104970
ENSG00000260213
...
ENSG00000225783
ENSG00000285100
ENSG00000166595
ENSG00000225216
ENSG00000039123


In [331]:
print(adata.X)

  (0, 1)	1.4960011
  (0, 19)	1.4960011
  (0, 21)	2.698351
  (0, 22)	1.4960011
  (0, 24)	2.4328597
  (0, 27)	1.4960011
  (0, 29)	1.4960011
  (0, 58)	2.9079397
  (0, 128)	3.0811217
  (0, 140)	1.4960011
  (0, 211)	2.4328597
  (0, 214)	2.698351
  (0, 244)	2.0703511
  (0, 273)	1.4960011
  (0, 303)	1.4960011
  (0, 377)	2.0703511
  (0, 417)	1.4960011
  (0, 478)	1.4960011
  (0, 487)	1.4960011
  (0, 496)	1.4960011
  (0, 518)	1.4960011
  (0, 528)	1.4960011
  (0, 543)	1.4960011
  (0, 566)	1.4960011
  (0, 626)	1.4960011
  :	:
  (9840, 27247)	0.62944984
  (9840, 27252)	0.36345416
  (9840, 27253)	0.36345416
  (9840, 27265)	0.36345416
  (9840, 27267)	1.0127478
  (9840, 27268)	0.62944984
  (9840, 27270)	0.36345416
  (9840, 27271)	0.62944984
  (9840, 27282)	0.36345416
  (9840, 27284)	0.36345416
  (9840, 27285)	0.36345416
  (9840, 27295)	0.83935213
  (9840, 27301)	0.36345416
  (9840, 27305)	0.36345416
  (9840, 27320)	0.83935213
  (9840, 27327)	0.62944984
  (9840, 27337)	0.62944984
  (9840, 27347)	0.3634

In [332]:
print(adata.raw.X)

  (0, 10056)	1.0
  (0, 17645)	1.0
  (0, 4019)	1.0
  (0, 19403)	1.0
  (0, 5525)	1.0
  (0, 12279)	1.0
  (0, 24599)	2.0
  (0, 14942)	7.0
  (0, 5783)	1.0
  (0, 18593)	1.0
  (0, 4830)	1.0
  (0, 14797)	2.0
  (0, 891)	1.0
  (0, 9493)	1.0
  (0, 29)	1.0
  (0, 18386)	1.0
  (0, 303)	1.0
  (0, 24216)	1.0
  (0, 5303)	2.0
  (0, 21082)	1.0
  (0, 4238)	1.0
  (0, 14384)	1.0
  (0, 17787)	1.0
  (0, 11916)	7.0
  (0, 26540)	9.0
  :	:
  (9840, 14797)	10.0
  (9840, 23334)	2.0
  (9840, 6164)	6.0
  (9840, 23785)	1.0
  (9840, 18593)	1.0
  (9840, 5783)	1.0
  (9840, 12234)	1.0
  (9840, 16589)	1.0
  (9840, 9279)	1.0
  (9840, 9044)	2.0
  (9840, 14942)	55.0
  (9840, 8860)	2.0
  (9840, 21334)	2.0
  (9840, 16481)	2.0
  (9840, 12279)	2.0
  (9840, 26647)	1.0
  (9840, 21009)	2.0
  (9840, 17749)	3.0
  (9840, 5525)	2.0
  (9840, 7865)	1.0
  (9840, 8907)	1.0
  (9840, 19403)	2.0
  (9840, 27354)	6.0
  (9840, 32)	1.0
  (9840, 17645)	2.0


In [333]:
adata.layers.keys

<bound method Mapping.keys of Layers with keys: >

In [334]:
adata.obs.columns

Index(['nCount_RNA', 'nFeature_RNA', 'percent.mt', '00_conditions', '02_group',
       '04_celltypes', '05_subcelltypes', '06_seurat_clusters',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [335]:
adata.uns

OverloadedDict, wrapping:
	OrderedDict([('title', 'Epithelial bcc and normal_scRNAseq'), ('default_embedding', 'X_umap')])
With overloaded keys:
	['neighbors'].