### **Curating BCs-PCs_allgenes**

Article: Immune microniches shape intestinal Treg function

DOI: https://doi.org/10.1038/s41586-024-07251-0

Data Source : https://treg-gut-niches.cellgeni.sanger.ac.uk

##### **Mount farm**

mount-farm

##### **Packages required for curation**

In [1]:
#Import all packages required for curation

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess
import math
import re
from collections import Counter

### **Curation Schema**

##### **X (Matrix Layers)**

##### **AnnData object**

In [3]:
# Load the AnnData object

In [4]:
# Path of the BCs-PCs_allgenes object that this notebook curates.
adata_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Treg_Gut_Niches/Data/BCs-PCs_allgenes.h5ad'
adata = sc.read_h5ad(adata_path)

In [5]:
# View the AnnData object

In [6]:
adata

AnnData object with n_obs × n_vars = 3912 × 19330
    obs: 'final_annotation', 'lineage', 'location'
    var: 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'location_colors'
    obsm: 'X_pca', 'X_umap'

##### **X - expression matrix**

In [7]:
# View the expression matrix of the anndata object.

In [8]:
adata.X

<3912x19330 sparse matrix of type '<class 'numpy.float32'>'
	with 7916964 stored elements in Compressed Sparse Row format>

In [9]:
# Print the matrix to check whether it holds normalized or raw counts: non-integer values indicate normalized counts, whole numbers (e.g. 7.0) indicate raw counts.

In [10]:
print(adata.X)

  (0, 3)	2.3085635
  (0, 5)	0.82580304
  (0, 15)	0.83009386
  (0, 18)	0.8195916
  (0, 32)	0.8286092
  (0, 40)	3.2906787
  (0, 44)	0.8251624
  (0, 66)	0.8249553
  (0, 67)	0.82418835
  (0, 78)	0.8256777
  (0, 86)	2.306515
  (0, 92)	1.2756549
  (0, 99)	1.2739407
  (0, 103)	1.2763335
  (0, 104)	0.8231335
  (0, 111)	1.5784442
  (0, 112)	0.82539594
  (0, 115)	1.8203938
  (0, 128)	1.2718738
  (0, 129)	1.2749233
  (0, 138)	0.8251257
  (0, 140)	1.5779023
  (0, 167)	1.2745485
  (0, 170)	0.82341534
  (0, 193)	0.80613863
  :	:
  (3911, 18952)	3.363887
  (3911, 18959)	1.1136417
  (3911, 18965)	1.1294258
  (3911, 18982)	1.1367664
  (3911, 19042)	1.1354567
  (3911, 19057)	2.8923721
  (3911, 19080)	1.6570591
  (3911, 19125)	1.9989483
  (3911, 19148)	1.1379305
  (3911, 19173)	1.0212976
  (3911, 19223)	1.6641432
  (3911, 19230)	1.136623
  (3911, 19235)	1.128641
  (3911, 19255)	1.1370549
  (3911, 19293)	1.1322955
  (3911, 19294)	1.6481347
  (3911, 19296)	2.228501
  (3911, 19297)	2.889403
  (3911, 19298)	

In [11]:
adata.layers.keys()

KeysView(Layers with keys: )

##### **Raw counts matrix**

In [12]:
# Path of the global object that carries the raw counts for all cells.
raw_counts_path = '/nfs/team205/rb29/mice_gutTCRtg_project/Src/Mice-gut.TCRtg/new_analysis/h5ad_files/cellxgene/global_object_rawcounts.h5ad'
araw = sc.read_h5ad(raw_counts_path)

In [13]:
araw

AnnData object with n_obs × n_vars = 10831 × 19330
    obs: 'original_annotation', 'anno', 'Dataset'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells'

In [14]:
# Barcodes of the curated object, in the row order used by adata.X.
barcodes_of_interest = adata.obs_names

# Fail early if any curated cell has no raw counts, instead of silently
# ending up with fewer rows than adata.
missing_barcodes = barcodes_of_interest.difference(araw.obs_names)
if len(missing_barcodes) > 0:
    raise KeyError(
        f"{len(missing_barcodes)} barcodes from adata are absent from araw, "
        f"e.g. {list(missing_barcodes[:5])}"
    )

# Select by label so the rows of subset_araw follow adata.obs_names exactly.
# A boolean `isin` mask would keep araw's own row order, which only matches
# adata when both objects happen to be ordered identically.
subset_araw = araw[barcodes_of_interest, :]

In [15]:
subset_araw

View of AnnData object with n_obs × n_vars = 3912 × 19330
    obs: 'original_annotation', 'anno', 'Dataset'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells'

In [16]:
araw.layers.keys()

KeysView(Layers with keys: )

In [17]:
subset_araw.X

<3912x19330 sparse matrix of type '<class 'numpy.float32'>'
	with 7925088 stored elements in Compressed Sparse Row format>

In [18]:
# Rebuild a standalone AnnData from copies of the subset's matrix and
# annotations, and rebind `araw` to it.
raw_components = dict(
    X=subset_araw.X.copy(),
    obs=subset_araw.obs.copy(),
    var=subset_araw.var.copy(),
)
araw = ad.AnnData(**raw_components)

In [19]:
araw

AnnData object with n_obs × n_vars = 3912 × 19330
    obs: 'original_annotation', 'anno', 'Dataset'
    var: 'gene_ids', 'feature_types', 'genome', 'n_cells'

In [20]:
print(araw.X)

  (0, 3)	7.0
  (0, 5)	1.0
  (0, 15)	1.0
  (0, 18)	1.0
  (0, 32)	1.0
  (0, 40)	20.0
  (0, 44)	1.0
  (0, 66)	1.0
  (0, 67)	1.0
  (0, 78)	1.0
  (0, 86)	7.0
  (0, 92)	2.0
  (0, 99)	2.0
  (0, 103)	2.0
  (0, 104)	1.0
  (0, 111)	3.0
  (0, 112)	1.0
  (0, 115)	4.0
  (0, 128)	2.0
  (0, 129)	2.0
  (0, 138)	1.0
  (0, 140)	3.0
  (0, 167)	2.0
  (0, 170)	1.0
  (0, 193)	1.0
  :	:
  (3911, 18952)	13.0
  (3911, 18959)	1.0
  (3911, 18965)	1.0
  (3911, 18982)	1.0
  (3911, 19042)	1.0
  (3911, 19057)	8.0
  (3911, 19080)	2.0
  (3911, 19125)	3.0
  (3911, 19148)	1.0
  (3911, 19173)	1.0
  (3911, 19223)	2.0
  (3911, 19230)	1.0
  (3911, 19235)	1.0
  (3911, 19255)	1.0
  (3911, 19293)	1.0
  (3911, 19294)	2.0
  (3911, 19296)	4.0
  (3911, 19297)	8.0
  (3911, 19298)	2.0
  (3911, 19299)	4.0
  (3911, 19300)	1.0
  (3911, 19302)	4.0
  (3911, 19303)	1.0
  (3911, 19306)	2.0
  (3911, 19329)	3.0


##### **Variables(var)**

In [21]:
# View the var of anndata and raw object

In [22]:
adata.var

Unnamed: 0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm
Sox17,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534
Mrpl15,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321
Lypla1,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850
Tcea1,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413
Rgs20,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720
...,...,...,...,...,...,...,...,...,...,...,...
CR974586.2,False,False,67,0.007599,99.611343,131.000000,67,False,0.017431,1.512446,0.225248
CR974586.4,False,False,237,0.042484,98.625214,732.385132,237,True,0.101700,2.757137,2.609761
Ccl21a.1,False,False,30,0.002436,99.825974,42.000000,30,False,0.008137,1.713660,0.610724
CAAA01147332.1,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001


In [23]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,n_cells
Sox17,ENSMUSG00000025902,Gene Expression,mm10,221
Mrpl15,ENSMUSG00000033845,Gene Expression,mm10,3218
Lypla1,ENSMUSG00000025903,Gene Expression,mm10,2460
Tcea1,ENSMUSG00000033813,Gene Expression,mm10,6241
Rgs20,ENSMUSG00000002459,Gene Expression,mm10,3
...,...,...,...,...
CR974586.2,ENSMUSG00000095585,Gene Expression,mm10,67
CR974586.4,ENSMUSG00000096506,Gene Expression,mm10,237
Ccl21a-1,ENSMUSG00000095320,Gene Expression,mm10,30
CAAA01147332.1,ENSMUSG00000095742,Gene Expression,mm10,1400


In [24]:
# Gene symbols whose suffix is written '.1' in adata.var but '-1' in araw.var
# (e.g. 'Ccl21a.1' vs 'Ccl21a-1' in the var tables displayed earlier;
# presumably the same holds for the other symbols — verify if the list changes).
symbol_pairs = [
    ('Ptp4a1.1', 'Ptp4a1-1'),
    ('Zc3h11a.1', 'Zc3h11a-1'),
    ('Gm16701.1', 'Gm16701-1'),
    ('Pakap.1', 'Pakap-1'),
    ('Fam220a.1', 'Fam220a-1'),
    ('Gm16499.1', 'Gm16499-1'),
    ('Aldoa.1', 'Aldoa-1'),
    ('Gm16364.1', 'Gm16364-1'),
    ('St6galnac2.1', 'St6galnac2-1'),
    ('Gm41392.1', 'Gm41392-1'),
    ('Snhg4.1', 'Snhg4-1'),
    ('Fam90a1b.1', 'Fam90a1b-1'),
    ('Ccl21a.1', 'Ccl21a-1'),
]
old_names = [dotted for dotted, _ in symbol_pairs]
new_names = [dashed for _, dashed in symbol_pairs]

# Lookup table: symbol as found in adata -> symbol as found in araw
rename_dict = dict(symbol_pairs)

# Swap the listed symbols in adata.var.index; every other symbol is kept as is
adata.var.index = adata.var.index.map(lambda symbol: rename_dict.get(symbol, symbol))



In [25]:
# Lookup from araw's var index (gene symbol) to its 'gene_ids' value.
gene_name_to_id = araw.var['gene_ids'].to_dict()

In [26]:
gene_name_to_id

{'Sox17': 'ENSMUSG00000025902',
 'Mrpl15': 'ENSMUSG00000033845',
 'Lypla1': 'ENSMUSG00000025903',
 'Tcea1': 'ENSMUSG00000033813',
 'Rgs20': 'ENSMUSG00000002459',
 'Atp6v1h': 'ENSMUSG00000033793',
 'Rb1cc1': 'ENSMUSG00000025907',
 '4732440D04Rik': 'ENSMUSG00000090031',
 'St18': 'ENSMUSG00000033740',
 'Pcmtd1': 'ENSMUSG00000051285',
 'Gm26901': 'ENSMUSG00000097797',
 'Sntg1': 'ENSMUSG00000025909',
 'Rrs1': 'ENSMUSG00000061024',
 'Adhfe1': 'ENSMUSG00000025911',
 '2610203C22Rik': 'ENSMUSG00000079671',
 'Mybl1': 'ENSMUSG00000025912',
 'Vcpip1': 'ENSMUSG00000045210',
 '1700034P13Rik': 'ENSMUSG00000097893',
 'Sgk3': 'ENSMUSG00000025915',
 'Mcmdc2': 'ENSMUSG00000046101',
 'Snhg6': 'ENSMUSG00000098234',
 'Tcf24': 'ENSMUSG00000099032',
 'Cops5': 'ENSMUSG00000025917',
 'Cspp1': 'ENSMUSG00000056763',
 'Arfgef1': 'ENSMUSG00000067851',
 'Cpa6': 'ENSMUSG00000042501',
 'Prex2': 'ENSMUSG00000048960',
 'Sulf1': 'ENSMUSG00000016918',
 'Slco5a1': 'ENSMUSG00000025938',
 'Ncoa2': 'ENSMUSG00000005886',
 'Gm2

In [27]:
# Look up each var index label in gene_name_to_id; labels without an entry become NaN.
looked_up_ids = adata.var.index.to_series().map(gene_name_to_id)
adata.var['gene_ids'] = looked_up_ids.to_numpy()

In [28]:
# Count the genes that received no ID from the lookup.
gene_id_is_missing = adata.var['gene_ids'].isna()
nan_values = gene_id_is_missing.sum()

print(f"Number of NaN values in 'gene_ids' column: {nan_values}")

Number of NaN values in 'gene_ids' column: 0


In [29]:
adata.var

Unnamed: 0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm,gene_ids
Sox17,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534,ENSMUSG00000025902
Mrpl15,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321,ENSMUSG00000033845
Lypla1,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850,ENSMUSG00000025903
Tcea1,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413,ENSMUSG00000033813
Rgs20,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720,ENSMUSG00000002459
...,...,...,...,...,...,...,...,...,...,...,...,...
CR974586.2,False,False,67,0.007599,99.611343,131.000000,67,False,0.017431,1.512446,0.225248,ENSMUSG00000095585
CR974586.4,False,False,237,0.042484,98.625214,732.385132,237,True,0.101700,2.757137,2.609761,ENSMUSG00000096506
Ccl21a-1,False,False,30,0.002436,99.825974,42.000000,30,False,0.008137,1.713660,0.610724,ENSMUSG00000095320
CAAA01147332.1,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001,ENSMUSG00000095742


In [30]:
araw.var

Unnamed: 0,gene_ids,feature_types,genome,n_cells
Sox17,ENSMUSG00000025902,Gene Expression,mm10,221
Mrpl15,ENSMUSG00000033845,Gene Expression,mm10,3218
Lypla1,ENSMUSG00000025903,Gene Expression,mm10,2460
Tcea1,ENSMUSG00000033813,Gene Expression,mm10,6241
Rgs20,ENSMUSG00000002459,Gene Expression,mm10,3
...,...,...,...,...
CR974586.2,ENSMUSG00000095585,Gene Expression,mm10,67
CR974586.4,ENSMUSG00000096506,Gene Expression,mm10,237
Ccl21a-1,ENSMUSG00000095320,Gene Expression,mm10,30
CAAA01147332.1,ENSMUSG00000095742,Gene Expression,mm10,1400


In [31]:
# Same check as the earlier cell: number of genes still lacking an ID.
nan_values = adata.var['gene_ids'].isnull().sum()
print(f"Number of NaN values in 'gene_ids' column: {nan_values}")

Number of NaN values in 'gene_ids' column: 0


In [32]:
# List the var index labels whose 'gene_ids' entry is NaN.
missing_id_mask = adata.var['gene_ids'].isna()
nan_gene_ids_indices = adata.var.index[missing_id_mask]

print("Index values with NaN in 'gene_ids' column:", nan_gene_ids_indices, sep='\n')

Index values with NaN in 'gene_ids' column:
Index([], dtype='object')


In [33]:
# Store the current var index in a 'gene_name' column on both objects.
for annotated in (adata, araw):
    annotated.var['gene_name'] = annotated.var.index

In [34]:
# Use the 'gene_ids' column as the var index of both objects.
for annotated in (adata, araw):
    annotated.var.index = annotated.var['gene_ids']

In [35]:
# Load the approved genes file.

In [36]:
# Table of approved genes; its `feature_id` column is used for filtering below.
approved_genes_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Endometrium_reference_integrated_atlas/genes_approved.csv'
approved_genes = pd.read_csv(approved_genes_path)

In [37]:
#Create a dictionary from the approved genes file 

In [38]:
# Every approved feature_id becomes a key (value 1) so membership tests are fast.
genedict = dict.fromkeys(approved_genes.feature_id, 1)

In [39]:
genedict

{'ERCC-00002': 1,
 'ERCC-00003': 1,
 'ERCC-00004': 1,
 'ERCC-00009': 1,
 'ERCC-00012': 1,
 'ERCC-00013': 1,
 'ERCC-00014': 1,
 'ERCC-00016': 1,
 'ERCC-00017': 1,
 'ERCC-00019': 1,
 'ERCC-00022': 1,
 'ERCC-00024': 1,
 'ERCC-00025': 1,
 'ERCC-00028': 1,
 'ERCC-00031': 1,
 'ERCC-00033': 1,
 'ERCC-00034': 1,
 'ERCC-00035': 1,
 'ERCC-00039': 1,
 'ERCC-00040': 1,
 'ERCC-00041': 1,
 'ERCC-00042': 1,
 'ERCC-00043': 1,
 'ERCC-00044': 1,
 'ERCC-00046': 1,
 'ERCC-00048': 1,
 'ERCC-00051': 1,
 'ERCC-00053': 1,
 'ERCC-00054': 1,
 'ERCC-00057': 1,
 'ERCC-00058': 1,
 'ERCC-00059': 1,
 'ERCC-00060': 1,
 'ERCC-00061': 1,
 'ERCC-00062': 1,
 'ERCC-00067': 1,
 'ERCC-00069': 1,
 'ERCC-00071': 1,
 'ERCC-00073': 1,
 'ERCC-00074': 1,
 'ERCC-00075': 1,
 'ERCC-00076': 1,
 'ERCC-00077': 1,
 'ERCC-00078': 1,
 'ERCC-00079': 1,
 'ERCC-00081': 1,
 'ERCC-00083': 1,
 'ERCC-00084': 1,
 'ERCC-00085': 1,
 'ERCC-00086': 1,
 'ERCC-00092': 1,
 'ERCC-00095': 1,
 'ERCC-00096': 1,
 'ERCC-00097': 1,
 'ERCC-00098': 1,
 'ERCC-000

In [40]:
len(genedict)

119799

In [41]:
#Filter out the genes which are not in the approved genes file.

In [42]:
# Keep, in their existing order, the var names that appear as keys of genedict.
var_to_keep_adata = list(filter(genedict.__contains__, adata.var_names))
var_to_keep_araw = list(filter(genedict.__contains__, araw.var_names))

In [43]:
len(var_to_keep_adata)

19283

In [44]:
len(var_to_keep_araw)

19283

In [45]:
adata.var

Unnamed: 0_level_0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ENSMUSG00000025902,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534,ENSMUSG00000025902,Sox17
ENSMUSG00000033845,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321,ENSMUSG00000033845,Mrpl15
ENSMUSG00000025903,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850,ENSMUSG00000025903,Lypla1
ENSMUSG00000033813,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413,ENSMUSG00000033813,Tcea1
ENSMUSG00000002459,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720,ENSMUSG00000002459,Rgs20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000095585,False,False,67,0.007599,99.611343,131.000000,67,False,0.017431,1.512446,0.225248,ENSMUSG00000095585,CR974586.2
ENSMUSG00000096506,False,False,237,0.042484,98.625214,732.385132,237,True,0.101700,2.757137,2.609761,ENSMUSG00000096506,CR974586.4
ENSMUSG00000095320,False,False,30,0.002436,99.825974,42.000000,30,False,0.008137,1.713660,0.610724,ENSMUSG00000095320,Ccl21a-1
ENSMUSG00000095742,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001,ENSMUSG00000095742,CAAA01147332.1


In [46]:
araw.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,n_cells,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSMUSG00000025902,ENSMUSG00000025902,Gene Expression,mm10,221,Sox17
ENSMUSG00000033845,ENSMUSG00000033845,Gene Expression,mm10,3218,Mrpl15
ENSMUSG00000025903,ENSMUSG00000025903,Gene Expression,mm10,2460,Lypla1
ENSMUSG00000033813,ENSMUSG00000033813,Gene Expression,mm10,6241,Tcea1
ENSMUSG00000002459,ENSMUSG00000002459,Gene Expression,mm10,3,Rgs20
...,...,...,...,...,...
ENSMUSG00000095585,ENSMUSG00000095585,Gene Expression,mm10,67,CR974586.2
ENSMUSG00000096506,ENSMUSG00000096506,Gene Expression,mm10,237,CR974586.4
ENSMUSG00000095320,ENSMUSG00000095320,Gene Expression,mm10,30,Ccl21a-1
ENSMUSG00000095742,ENSMUSG00000095742,Gene Expression,mm10,1400,CAAA01147332.1


In [47]:
# Modify the anndata object by filtering out the filtered genes.

In [48]:
# Keep only the approved features. `.copy()` turns the subsets into real
# AnnData objects rather than views, so the in-place `.var` edits made further
# down (adding `feature_is_filtered`, deleting helper columns) act on these
# objects directly instead of modifying a view.
adata = adata[:, adata.var.index.isin(var_to_keep_adata)].copy()
araw = araw[:, araw.var.index.isin(var_to_keep_araw)].copy()

In [49]:
adata.var

Unnamed: 0_level_0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ENSMUSG00000025902,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534,ENSMUSG00000025902,Sox17
ENSMUSG00000033845,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321,ENSMUSG00000033845,Mrpl15
ENSMUSG00000025903,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850,ENSMUSG00000025903,Lypla1
ENSMUSG00000033813,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413,ENSMUSG00000033813,Tcea1
ENSMUSG00000002459,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720,ENSMUSG00000002459,Rgs20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000094728,False,False,70,0.004056,99.593941,69.925201,70,False,0.006127,0.903737,-0.940885,ENSMUSG00000094728,AC132444.2
ENSMUSG00000062783,False,False,91,0.005321,99.472130,91.735985,91,False,0.011063,1.130046,-0.507334,ENSMUSG00000062783,Csprs
ENSMUSG00000096808,False,False,8,0.000464,99.953590,8.000000,8,False,0.000741,1.017444,-0.723052,ENSMUSG00000096808,AC132444.6
ENSMUSG00000095742,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001,ENSMUSG00000095742,CAAA01147332.1


In [50]:
# View var

In [51]:
adata.var

Unnamed: 0_level_0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
ENSMUSG00000025902,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534,ENSMUSG00000025902,Sox17
ENSMUSG00000033845,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321,ENSMUSG00000033845,Mrpl15
ENSMUSG00000025903,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850,ENSMUSG00000025903,Lypla1
ENSMUSG00000033813,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413,ENSMUSG00000033813,Tcea1
ENSMUSG00000002459,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720,ENSMUSG00000002459,Rgs20
...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000094728,False,False,70,0.004056,99.593941,69.925201,70,False,0.006127,0.903737,-0.940885,ENSMUSG00000094728,AC132444.2
ENSMUSG00000062783,False,False,91,0.005321,99.472130,91.735985,91,False,0.011063,1.130046,-0.507334,ENSMUSG00000062783,Csprs
ENSMUSG00000096808,False,False,8,0.000464,99.953590,8.000000,8,False,0.000741,1.017444,-0.723052,ENSMUSG00000096808,AC132444.6
ENSMUSG00000095742,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001,ENSMUSG00000095742,CAAA01147332.1


In [52]:
araw.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,n_cells,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSMUSG00000025902,ENSMUSG00000025902,Gene Expression,mm10,221,Sox17
ENSMUSG00000033845,ENSMUSG00000033845,Gene Expression,mm10,3218,Mrpl15
ENSMUSG00000025903,ENSMUSG00000025903,Gene Expression,mm10,2460,Lypla1
ENSMUSG00000033813,ENSMUSG00000033813,Gene Expression,mm10,6241,Tcea1
ENSMUSG00000002459,ENSMUSG00000002459,Gene Expression,mm10,3,Rgs20
...,...,...,...,...,...
ENSMUSG00000094728,ENSMUSG00000094728,Gene Expression,mm10,70,AC132444.2
ENSMUSG00000062783,ENSMUSG00000062783,Gene Expression,mm10,91,Csprs
ENSMUSG00000096808,ENSMUSG00000096808,Gene Expression,mm10,8,AC132444.6
ENSMUSG00000095742,ENSMUSG00000095742,Gene Expression,mm10,1400,CAAA01147332.1


##### **feature_is_filtered**

In [53]:
# Flag every feature of adata as not filtered (all False).
adata.var['feature_is_filtered'] = np.zeros(len(adata.var), dtype=bool)

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [54]:
adata.var

Unnamed: 0_level_0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm,gene_ids,gene_name,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
ENSMUSG00000025902,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534,ENSMUSG00000025902,Sox17,False
ENSMUSG00000033845,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321,ENSMUSG00000033845,Mrpl15,False
ENSMUSG00000025903,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850,ENSMUSG00000025903,Lypla1,False
ENSMUSG00000033813,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413,ENSMUSG00000033813,Tcea1,False
ENSMUSG00000002459,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720,ENSMUSG00000002459,Rgs20,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000094728,False,False,70,0.004056,99.593941,69.925201,70,False,0.006127,0.903737,-0.940885,ENSMUSG00000094728,AC132444.2,False
ENSMUSG00000062783,False,False,91,0.005321,99.472130,91.735985,91,False,0.011063,1.130046,-0.507334,ENSMUSG00000062783,Csprs,False
ENSMUSG00000096808,False,False,8,0.000464,99.953590,8.000000,8,False,0.000741,1.017444,-0.723052,ENSMUSG00000096808,AC132444.6,False
ENSMUSG00000095742,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001,ENSMUSG00000095742,CAAA01147332.1,False


In [55]:
araw.var

Unnamed: 0_level_0,gene_ids,feature_types,genome,n_cells,gene_name
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ENSMUSG00000025902,ENSMUSG00000025902,Gene Expression,mm10,221,Sox17
ENSMUSG00000033845,ENSMUSG00000033845,Gene Expression,mm10,3218,Mrpl15
ENSMUSG00000025903,ENSMUSG00000025903,Gene Expression,mm10,2460,Lypla1
ENSMUSG00000033813,ENSMUSG00000033813,Gene Expression,mm10,6241,Tcea1
ENSMUSG00000002459,ENSMUSG00000002459,Gene Expression,mm10,3,Rgs20
...,...,...,...,...,...
ENSMUSG00000094728,ENSMUSG00000094728,Gene Expression,mm10,70,AC132444.2
ENSMUSG00000062783,ENSMUSG00000062783,Gene Expression,mm10,91,Csprs
ENSMUSG00000096808,ENSMUSG00000096808,Gene Expression,mm10,8,AC132444.6
ENSMUSG00000095742,ENSMUSG00000095742,Gene Expression,mm10,1400,CAAA01147332.1


In [56]:
# Remove the helper columns from both objects; the IDs remain as the var index.
for annotated in (adata, araw):
    for helper_column in ('gene_ids', 'gene_name'):
        del annotated.var[helper_column]

In [57]:
adata.var

Unnamed: 0_level_0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSMUSG00000025902,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534,False
ENSMUSG00000033845,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321,False
ENSMUSG00000025903,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850,False
ENSMUSG00000033813,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413,False
ENSMUSG00000002459,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720,False
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000094728,False,False,70,0.004056,99.593941,69.925201,70,False,0.006127,0.903737,-0.940885,False
ENSMUSG00000062783,False,False,91,0.005321,99.472130,91.735985,91,False,0.011063,1.130046,-0.507334,False
ENSMUSG00000096808,False,False,8,0.000464,99.953590,8.000000,8,False,0.000741,1.017444,-0.723052,False
ENSMUSG00000095742,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001,False


In [58]:
araw.var

Unnamed: 0_level_0,feature_types,genome,n_cells
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSMUSG00000025902,Gene Expression,mm10,221
ENSMUSG00000033845,Gene Expression,mm10,3218
ENSMUSG00000025903,Gene Expression,mm10,2460
ENSMUSG00000033813,Gene Expression,mm10,6241
ENSMUSG00000002459,Gene Expression,mm10,3
...,...,...,...
ENSMUSG00000094728,Gene Expression,mm10,70
ENSMUSG00000062783,Gene Expression,mm10,91
ENSMUSG00000096808,Gene Expression,mm10,8
ENSMUSG00000095742,Gene Expression,mm10,1400


#### **obs (Cell metadata)**

In [59]:
#view obs

In [60]:
adata.obs

Unnamed: 0,final_annotation,lineage,location
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN
...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP


In [61]:
# view the column names in obs

In [62]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location'], dtype='object')

In [63]:
adata.obs

Unnamed: 0,final_annotation,lineage,location
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN
...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP


In [64]:
araw.obs

Unnamed: 0,original_annotation,anno,Dataset
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting_MLN,GC.BC_LZ-resting,NicheSeq
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2_MLN,PCs.CD19-IgM+.2,NicheSeq
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+_MLN,PCs.CD19+IgM+,NicheSeq
AAACCTGGTACCGGCT-1-0,Naive.Bcells_MLN,Naive.Bcells,NicheSeq
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2_MLN,GC.BC_DZ.2,NicheSeq
...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+_LP,PCs.IgA+,NicheSeq
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1_LP,PCs.CD19-IgM+.1,NicheSeq
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+_LP,PCs.IgA+,NicheSeq
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1_LP,PCs.CD19-IgM+.1,NicheSeq


#### **assay_ontology_term_id**

In [65]:
# Copy the obs names into a regular column so they can be parsed below.
adata.obs['barcodes'] = adata.obs_names.to_numpy()

In [66]:
# Keep only the first run of 10-16 A/T/G/C characters of each obs name,
# which drops suffixes such as '-1-0'.
barcode_pattern = r'([ATGC]{10,16})'
adata.obs['barcodes'] = adata.obs['barcodes'].str.extract(barcode_pattern, expand=False)

In [67]:
# Barcode -> assay lookup table; its `barcode` and `assay` columns are used below.
assay_table_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/covid_cell_atlas_ALI_organoids/10X_barcode_table_assay.csv'
assay_info = pd.read_csv(assay_table_path)

In [68]:
# Barcode -> assay label. The name `mapping` is reassigned later in the
# notebook for the cell-type dictionary.
mapping = dict(assay_info[['barcode', 'assay']].itertuples(index=False, name=None))

In [69]:
# Look up the assay label of each barcode; barcodes absent from the table get NaN.
adata.obs['assay'] = [mapping.get(barcode, np.nan) for barcode in adata.obs['barcodes']]

In [70]:
list(adata.obs['assay'].unique())

['3pv2_5pv1_5pv2', '3pv2_5pv1_5pv2+3pv3', '3pv2_5pv1_5pv2+multiome']

In [71]:
# Every cell gets the same assay term.
# NOTE(review): the barcode lookup above only returned combined labels such as
# '3pv2_5pv1_5pv2', so this term is set by hand — confirm it against the protocol.
assay_term = 'EFO:0009900'
adata.obs['assay_ontology_term_id'] = assay_term

In [72]:
# Store the assay term as a categorical column.
adata.obs['assay_ontology_term_id'] = pd.Categorical(adata.obs['assay_ontology_term_id'])

In [73]:
# view adata.obs

In [74]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900
...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900


#### **cell_type_ontology_term_id**

In [75]:
# Identify the column in adata.obs that holds the cell type annotation

In [76]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location', 'barcodes', 'assay',
       'assay_ontology_term_id'],
      dtype='object')

In [77]:
list(adata.obs['final_annotation'].unique())

['GC.BC_LZ-resting',
 'PCs.CD19-IgM+.2',
 'PCs.CD19+IgM+',
 'Naive.Bcells',
 'GC.BC_DZ.2',
 'Plasmablast-IgM+_cycling',
 'GC.BC_DZ.1',
 'eMBCs',
 'PCs.CD19-IgM+.1',
 'PCs.IgA+',
 'pre-GC.BCs',
 'PCs_low_count',
 'Naive.Bcells.ISGhi',
 'GC.BC_DZ-pre-memory BC']

In [78]:
# create a dictionary of cell type and ontology term

In [79]:
# Author cell-type labels (values of `final_annotation`) -> Cell Ontology IDs.
# The table lists more labels than the ones present in this object.
# NOTE(review): 'GC.BC_DZ.1' maps to CL:0009104 while 'GC.BC_DZ.2' and the other
# GC.BC labels map to CL:0000844 — confirm this difference is intended.
mapping = dict([
    ('GC.BC_LZ-resting', 'CL:0000844'),
    ('Lymph.DC.Sirpa+', 'CL:0000990'),
    ('PCs.CD19-IgM+.2', 'CL:0000786'),
    ('cTregs', 'CL:0000815'),
    ('PCs.CD19+IgM+', 'CL:0000786'),
    ('Naive.Bcells', 'CL:0000788'),
    ('Naive.CD4+_MLN', 'CL:0000895'),
    ('GC.BC_DZ.2', 'CL:0000844'),
    ('Plasmablast-IgM+_cycling', 'CL:0000983'),
    ('Lymph.DC.Cd81+', 'CL:0000990'),
    ('DC.Sirpa+Cd81+', 'CL:0000451'),
    ('GC.BC_DZ.1', 'CL:0009104'),
    ('eMBCs', 'CL:0000787'),
    ('PCs.CD19-IgM+.1', 'CL:0000786'),
    ('DC1.Cd8+Xcr1+', 'CL:0001000'),
    ('Naive.CD8+_MLN', 'CL:0000900'),
    ('DC.Il1b_low.Cd103-Sirpa+', 'CL:0000990'),
    ('Fibroblast_Cxcl12+', 'CL:0000057'),
    ('PCs.IgA+', 'CL:0000786'),
    ('DC1.apopt.', 'CL:0000990'),
    ('Th17', 'CL:0000899'),
    ('DC.prolif.', 'CL:0000990'),
    ('NK-NKT-gdTCs', 'CL:0000798'),
    ('Naive.CD4+_CP', 'CL:0000895'),
    ('Mono.Sirpa+', 'CL:0000576'),
    ('Endothelial.cells', 'CL:0000115'),
    ('Neutrophils', 'CL:0000775'),
    ('Lymph.DC.Cd8+', 'CL:0000990'),
    ('pDCs', 'CL:0000784'),
    ('pre-GC.BCs', 'CL:0000844'),
    ('PCs_low_count', 'CL:0000786'),
    ('gdTCs.Th17', 'CL:0000798'),
    ('Naive.Bcells.ISGhi', 'CL:0000788'),
    ('Lymphatic_capillary', 'CL:0002144'),
    ('Naive.CD4+CD8+', 'CL:0000895'),
    ('LTi-like.ILC3s', 'CL:0001071'),
    ('Naive.CD8+_CP', 'CL:0000900'),
    ('Fibroblast_Endosialin+', 'CL:0000057'),
    ('Mono.Il1b+', 'CL:0000576'),
    ('CD4.memory', 'CL:0000897'),
    ('Lymph.DC.Il1b_high.prolif.', 'CL:0000990'),
    ('Prolif-Tregs', 'CL:0000815'),
    ('eTregs', 'CL:0000815'),
    ('Mac.Sirpa+', 'CL:0000235'),
    ('pre-DC', 'CL:0000451'),
    ('Mac.Il1b+', 'CL:0000235'),
    ('ILC3s.Rorgt+', 'CL:0001071'),
    ('ILC3s.Rorgt+.MHCIIhi', 'CL:0001071'),
    ('DC.Il1b_high.Cd103+', 'CL:0002461'),
    ('DC.Il1b_high.Cd103+Sirpa+', 'CL:0002461'),
    ('ILC2s', 'CL:0001069'),
    ('Pericytes', 'CL:0000669'),
    ('Mac.Cd206+', 'CL:0000235'),
    ('Epithelial.MHCII+', 'CL:0000066'),
    ('Mast cells', 'CL:0000097'),
    ('GC.BC_DZ-pre-memory BC', 'CL:0000844'),
    ('Lymphatic_endothelium', 'CL:0002138'),
    ('Fibroblast.Adamdec1+Pdgfra_high', 'CL:0000057'),
    ('Fibroblast_Adamdec1+Pdgfra_low', 'CL:0000057'),
    ('Fibroblast_Acta2+', 'CL:0000057'),
    ('Epithelial', 'CL:0000066'),
    ('Myofibroblast', 'CL:0000186'),
    ('Glial cells', 'CL:0000125'),
])

In [80]:
# add the cell_type_ontology_term_id column

In [81]:
# Translate each author label in `final_annotation` into its ontology term via `mapping`.
cell_type_terms = adata.obs['final_annotation'].map(mapping)

# `map` yields NaN for labels that are not keys of the dictionary; stop here
# rather than carrying empty ontology terms into the curated object.
unmapped_labels = sorted(
    adata.obs.loc[cell_type_terms.isna(), 'final_annotation'].astype(str).unique()
)
if unmapped_labels:
    raise ValueError(f"No cell type ontology term for: {unmapped_labels}")

adata.obs['cell_type_ontology_term_id'] = cell_type_terms

In [82]:
# change datatype of the column

In [83]:
# Store the cell type term as a categorical column.
adata.obs['cell_type_ontology_term_id'] = pd.Categorical(adata.obs['cell_type_ontology_term_id'])

In [84]:
# view adata.obs

In [85]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844
...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786


#### **development_stage_ontology_term_id**

In [86]:
# identify the column in adata which corresponds to age

In [87]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id'],
      dtype='object')

In [88]:
# Every cell gets the same development stage term.
development_stage_term = 'MmusDv:0000061'
adata.obs['development_stage_ontology_term_id'] = development_stage_term

In [89]:
# change datatype of the column

In [90]:
# Store the development stage term as a categorical column.
adata.obs['development_stage_ontology_term_id'] = pd.Categorical(
    adata.obs['development_stage_ontology_term_id']
)

In [91]:
# view unique values of development_stage_ontology_term_id column

In [92]:
list(adata.obs['development_stage_ontology_term_id'].unique())

['MmusDv:0000061']

In [93]:
# view adata.obs

In [94]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061
...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061


#### **donor_id**

In [95]:
#identify the column in adata.obs which provides donor information

In [96]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id'],
      dtype='object')

In [97]:
adata.obs['donor_id'] = 'pooled'

In [98]:
#adata.obs['donor_id'] = ['unknown'] * len(adata.obs['names'])

In [99]:
# change datatype of the column

In [100]:
adata.obs['donor_id'] = adata.obs['donor_id'].astype('category')

In [101]:
# view unique values of donor_id column

In [102]:
list(adata.obs['donor_id'].unique())

['pooled']

In [103]:
#view obs

In [104]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled
...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled


In [105]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id'],
      dtype='object')

#### **disease_ontology_term_id**

In [106]:
# Every cell gets the same disease term (PATO:0000461, the schema's "normal" term),
# so a scalar is broadcast down the whole column.
adata.obs['disease_ontology_term_id'] = 'PATO:0000461'

In [107]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461


In [108]:
# change datatype of the column

In [109]:
adata.obs['disease_ontology_term_id'] = adata.obs['disease_ontology_term_id'].astype('category')

In [110]:
# view obs

In [111]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461
...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461


#### **is_primary_data**

In [112]:
# mark every cell as primary data, then make sure the column is boolean

In [113]:
# Same flag for every cell: broadcast the scalar rather than building a list.
adata.obs['is_primary_data'] = True

In [114]:
adata.obs['is_primary_data'] = adata.obs['is_primary_data'].astype('bool')

In [115]:
# view obs

In [116]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True
...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True


#### **organism_ontology_term_id**

In [117]:
# assign organism id 

In [118]:
# One organism term for the whole dataset (NCBITaxon:10090), broadcast to all cells.
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:10090'

In [119]:
#change data type of column

In [120]:
adata.obs['organism_ontology_term_id'] = adata.obs['organism_ontology_term_id'].astype('category')

In [121]:
# view obs

In [122]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090
...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090


#### **self_reported_ethnicity_ontology_term_id**

In [123]:
# Constant 'na' for every cell (NOTE(review): the value the schema expects for
# non-human data - confirm against the schema version being targeted).
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'na'

In [124]:
# change data type

In [125]:
adata.obs['self_reported_ethnicity_ontology_term_id'] = adata.obs['self_reported_ethnicity_ontology_term_id'].astype('category')

In [126]:
# view obs

In [127]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na


In [128]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

#### **sex_ontology_term_id**

In [129]:
# identify the column in adata.obs which corresponds to sex

In [130]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id',
       'self_reported_ethnicity_ontology_term_id'],
      dtype='object')

In [131]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na


In [132]:
# No obs column carries sex information (see the column listing above),
# so every cell is labelled 'unknown'.
adata.obs['sex_ontology_term_id'] = 'unknown'

In [133]:
# change data type

In [134]:
adata.obs['sex_ontology_term_id'] = adata.obs['sex_ontology_term_id'].astype('category')

In [135]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown


#### **suspension_type**

In [136]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown


In [137]:
# Single suspension type for all cells, broadcast as a scalar.
adata.obs['suspension_type'] = 'cell'

In [138]:
# change data type of column

In [139]:
adata.obs['suspension_type'] = adata.obs['suspension_type'].astype('category')

In [140]:
# view obs

In [141]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell


#### **tissue_type**

In [142]:
# All cells share the same tissue_type value; broadcast the scalar.
adata.obs['tissue_type'] = 'tissue'

In [143]:
adata.obs['tissue_type'] = adata.obs['tissue_type'].astype('category')

#### **tissue_ontology_term_id**

In [144]:
# identify the column in adata.obs which corresponds to tissue

In [145]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type'],
      dtype='object')

In [146]:
list(adata.obs['location'].unique())

['MLN', 'CP', 'LA', 'LP']

In [147]:
# Lookup table: value of adata.obs['location'] -> UBERON tissue term.
# One entry per label returned by adata.obs['location'].unique() above.
mapping = {
    'MLN': 'UBERON:0002509',
    'CP': 'UBERON:0001153',
    'LA': 'UBERON:0003454',
    'LP': 'UBERON:0001238',
}

In [148]:
# add 'tissue_ontology_term_id' column

In [149]:
# Translate each 'location' label into its UBERON term via `mapping`.
adata.obs['tissue_ontology_term_id'] = adata.obs['location'].map(mapping)

# Series.map leaves NaN for any label that is missing from the dict; stop here
# instead of silently carrying cells without a tissue term into the final object.
unmapped_locations = adata.obs.loc[adata.obs['tissue_ontology_term_id'].isna(), 'location'].unique()
assert len(unmapped_locations) == 0, f"locations without a tissue ontology term: {list(unmapped_locations)}"

In [150]:
# change data type of column

In [151]:
adata.obs['tissue_ontology_term_id'] = adata.obs['tissue_ontology_term_id'].astype('category')

In [152]:
#list the unique values in 'tissue_ontology_term_id' column

In [153]:
list(adata.obs['tissue_ontology_term_id'].unique())

['UBERON:0002509', 'UBERON:0001153', 'UBERON:0003454', 'UBERON:0001238']

In [154]:
# view obs

In [155]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238


In [156]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

#### **obsm (Embeddings)**

In [157]:
# view obsm

In [158]:
# check whether all obsm keys are prefixed with 'X_'

In [159]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

#### **uns (Dataset Metadata)**

In [160]:
# view uns

In [161]:
adata.uns

{'location_colors': array(['#1f77b4', '#ff7f0e', '#279e68', '#d62728'], dtype=object)}

In [162]:
# keys() must be called; the bare attribute only displays the method object
list(adata.uns.keys())

['location_colors']

In [163]:
# Give a title for the dataset

In [164]:
adata.uns['title'] = 'BCs-PCs'

In [165]:
# Set the default embedding

In [166]:
adata.uns['default_embedding'] = 'X_umap'

### **Final check**

In [167]:
# view anndata object

In [168]:
adata

AnnData object with n_obs × n_vars = 3912 × 19283
    obs: 'final_annotation', 'lineage', 'location', 'barcodes', 'assay', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'development_stage_ontology_term_id', 'donor_id', 'disease_ontology_term_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex_ontology_term_id', 'suspension_type', 'tissue_type', 'tissue_ontology_term_id'
    var: 'mt', 'rb', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'feature_is_filtered'
    uns: 'location_colors', 'title', 'default_embedding'
    obsm: 'X_pca', 'X_umap'

In [169]:
# view obs and var data types

In [170]:
adata.obs.dtypes

final_annotation                            category
lineage                                     category
location                                    category
barcodes                                      object
assay                                         object
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
donor_id                                    category
disease_ontology_term_id                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
suspension_type                             category
tissue_type                                 category
tissue_ontology_term_id                     category
dtype: object

In [171]:
# Downcast 64-bit numeric columns of adata.var to 32-bit.
# The dtype table is taken once up front, so the casts below do not affect
# which columns are selected.
dty = pd.DataFrame(adata.var.dtypes, columns = ['dtype'])
for wide, narrow in (('float64', 'float32'), ('int64', 'int32')):
    for c in dty.loc[dty['dtype'] == wide].index:
        adata.var[c] = adata.var[c].astype(narrow)
        print(f"changed {c} from {wide} to {narrow}")

In [172]:
# Normalise adata.obs dtypes: 64-bit numerics become 32-bit and plain object
# columns become categoricals. The dtype table is taken once up front, so the
# casts below do not affect which columns are selected.
dty = pd.DataFrame(adata.obs.dtypes, columns = ['dtype'])
for wide, narrow in (('float64', 'float32'), ('int64', 'int32'), ('object', 'category')):
    for c in dty.loc[dty['dtype'] == wide].index:
        adata.obs[c] = adata.obs[c].astype(narrow)
        print(f"changed {c} from {wide} to {narrow}")

changed barcodes from object to category
changed assay from object to category


In [173]:
# view obs

In [174]:
adata.obs

Unnamed: 0,final_annotation,lineage,location,barcodes,assay,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,MLN,AAACCTGAGCCCAATT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,MLN,AAACCTGAGGTGATTA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,MLN,AAACCTGCATGGTTGT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,MLN,AAACCTGGTACCGGCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,MLN,AAACCTGGTCGAGATG,3pv2_5pv1_5pv2,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,LP,TTTATGCCAATGGATA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTATGCTCGTAGATC,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,LP,TTTGCGCAGGCCCTCA,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,LP,TTTGCGCAGGGTTTCT,3pv2_5pv1_5pv2,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238


In [175]:
adata.obs.columns

Index(['final_annotation', 'lineage', 'location', 'barcodes', 'assay',
       'assay_ontology_term_id', 'cell_type_ontology_term_id',
       'development_stage_ontology_term_id', 'donor_id',
       'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [176]:
# delete unwanted columns in obs and the matching 'location_colors' entry in uns

In [177]:
# Remove the working obs columns that are not kept in the final object ...
for obs_col in ('barcodes', 'assay', 'location'):
    del adata.obs[obs_col]
# ... and the colour palette in uns that belonged to the removed 'location' column.
del adata.uns['location_colors']

In [178]:
# view obs

In [179]:
adata.obs

Unnamed: 0,final_annotation,lineage,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238


In [180]:
# view var

In [181]:
adata.var

Unnamed: 0_level_0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSMUSG00000025902,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534,False
ENSMUSG00000033845,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321,False
ENSMUSG00000025903,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850,False
ENSMUSG00000033813,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413,False
ENSMUSG00000002459,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720,False
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000094728,False,False,70,0.004056,99.593941,69.925201,70,False,0.006127,0.903737,-0.940885,False
ENSMUSG00000062783,False,False,91,0.005321,99.472130,91.735985,91,False,0.011063,1.130046,-0.507334,False
ENSMUSG00000096808,False,False,8,0.000464,99.953590,8.000000,8,False,0.000741,1.017444,-0.723052,False
ENSMUSG00000095742,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001,False


In [182]:
araw.var

Unnamed: 0_level_0,feature_types,genome,n_cells
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSMUSG00000025902,Gene Expression,mm10,221
ENSMUSG00000033845,Gene Expression,mm10,3218
ENSMUSG00000025903,Gene Expression,mm10,2460
ENSMUSG00000033813,Gene Expression,mm10,6241
ENSMUSG00000002459,Gene Expression,mm10,3
...,...,...,...
ENSMUSG00000094728,Gene Expression,mm10,70
ENSMUSG00000062783,Gene Expression,mm10,91
ENSMUSG00000096808,Gene Expression,mm10,8
ENSMUSG00000095742,Gene Expression,mm10,1400


In [183]:
# View the uns (unstructured metadata) of adata

In [184]:
# Display the uns mapping of adata.
adata.uns

{'title': 'BCs-PCs', 'default_embedding': 'X_umap'}

In [185]:
# List the keys currently stored in adata.uns (iterating a mapping yields its keys).
list(adata.uns)

['title', 'default_embedding']

In [186]:
# List the column names of adata.obs.
adata.obs.columns

Index(['final_annotation', 'lineage', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [187]:
# Remove unwanted keys in uns -- nothing to remove here: only 'title' and 'default_embedding' are present

In [188]:
# Check the format of the expression matrices (adata.X and araw.X)

In [189]:
# Show the sparse-matrix summary (shape, dtype, stored elements, format) of adata.X.
adata.X

<3912x19283 sparse matrix of type '<class 'numpy.float32'>'
	with 7902226 stored elements in Compressed Sparse Row format>

In [190]:
# Show the sparse-matrix summary (shape, dtype, stored elements, format) of araw.X.
araw.X

<3912x19283 sparse matrix of type '<class 'numpy.float32'>'
	with 7910284 stored elements in Compressed Sparse Row format>

In [191]:
#Copy raw counts to adata.raw

In [192]:
# adata.raw carries no obs table of its own: rows of raw.X are paired with
# adata.obs purely by position. Verify that araw lists exactly the same cells
# and genes, in the same order, before attaching it; otherwise raw counts would
# be silently attributed to the wrong cells/genes in the written object.
assert araw.obs_names.equals(adata.obs_names), (
    "araw and adata do not contain the same cells in the same order"
)
assert araw.var_names.equals(adata.var_names), (
    "araw and adata do not contain the same genes in the same order"
)

# Attach the raw-count object as adata.raw (X and var are taken from araw).
adata.raw = araw

In [193]:
# Store the dtype of every adata.obs column so it can be inspected.
obs_dtype = adata.obs.dtypes

In [194]:
# Display the per-column dtypes of adata.obs held in obs_dtype.
obs_dtype

final_annotation                            category
lineage                                     category
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
development_stage_ontology_term_id          category
donor_id                                    category
disease_ontology_term_id                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex_ontology_term_id                        category
suspension_type                             category
tissue_type                                 category
tissue_ontology_term_id                     category
dtype: object

In [195]:
# Display adata.var again, this time after adata.raw has been assigned.
adata.var

Unnamed: 0_level_0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSMUSG00000025902,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534,False
ENSMUSG00000033845,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321,False
ENSMUSG00000025903,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850,False
ENSMUSG00000033813,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413,False
ENSMUSG00000002459,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720,False
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000094728,False,False,70,0.004056,99.593941,69.925201,70,False,0.006127,0.903737,-0.940885,False
ENSMUSG00000062783,False,False,91,0.005321,99.472130,91.735985,91,False,0.011063,1.130046,-0.507334,False
ENSMUSG00000096808,False,False,8,0.000464,99.953590,8.000000,8,False,0.000741,1.017444,-0.723052,False
ENSMUSG00000095742,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001,False


In [196]:
# Write the curated AnnData object to the Final_objects directory as a
# gzip-compressed .h5ad file.
adata.write_h5ad(
    filename='/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/Treg_Gut_Niches/Final_objects/BCs-PCs_allgenes.h5ad',
    compression='gzip',
)

In [197]:
# Display the obs (cell-level) annotation table of adata.
adata.obs

Unnamed: 0,final_annotation,lineage,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,donor_id,disease_ontology_term_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_type,tissue_ontology_term_id
AAACCTGAGCCCAATT-1-0,GC.BC_LZ-resting,B-PCs,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGAGGTGATTA-1-0,PCs.CD19-IgM+.2,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGCATGGTTGT-1-0,PCs.CD19+IgM+,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGGTACCGGCT-1-0,Naive.Bcells,B-PCs,EFO:0009900,CL:0000788,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
AAACCTGGTCGAGATG-1-0,GC.BC_DZ.2,B-PCs,EFO:0009900,CL:0000844,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0002509
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTATGCCAATGGATA-1-3,PCs.IgA+,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTATGCTCGTAGATC-1-3,PCs.CD19-IgM+.1,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTGCGCAGGCCCTCA-1-3,PCs.IgA+,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238
TTTGCGCAGGGTTTCT-1-3,PCs.CD19-IgM+.1,B-PCs,EFO:0009900,CL:0000786,MmusDv:0000061,pooled,PATO:0000461,True,NCBITaxon:10090,na,unknown,cell,tissue,UBERON:0001238


In [198]:
# List the column names of adata.obs once more (same check as earlier in the notebook).
adata.obs.columns

Index(['final_annotation', 'lineage', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'development_stage_ontology_term_id',
       'donor_id', 'disease_ontology_term_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex_ontology_term_id', 'suspension_type', 'tissue_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [199]:
# Display the var annotation table stored on adata.raw.
adata.raw.var

Unnamed: 0_level_0,feature_types,genome,n_cells
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ENSMUSG00000025902,Gene Expression,mm10,221
ENSMUSG00000033845,Gene Expression,mm10,3218
ENSMUSG00000025903,Gene Expression,mm10,2460
ENSMUSG00000033813,Gene Expression,mm10,6241
ENSMUSG00000002459,Gene Expression,mm10,3
...,...,...,...
ENSMUSG00000094728,Gene Expression,mm10,70
ENSMUSG00000062783,Gene Expression,mm10,91
ENSMUSG00000096808,Gene Expression,mm10,8
ENSMUSG00000095742,Gene Expression,mm10,1400


In [200]:
# Display adata.var for comparison with adata.raw.var.
adata.var

Unnamed: 0_level_0,mt,rb,n_cells_by_counts,mean_counts,pct_dropout_by_counts,total_counts,n_cells,highly_variable,means,dispersions,dispersions_norm,feature_is_filtered
gene_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ENSMUSG00000025902,False,False,221,0.026731,98.718025,460.818756,221,True,0.131385,2.748144,2.592534,False
ENSMUSG00000033845,False,False,3218,0.297666,81.333023,5131.465332,3218,False,0.480022,1.226943,-0.471321,False
ENSMUSG00000025903,False,False,2460,0.188250,85.730034,3245.247803,2460,False,0.353775,1.208837,-0.495850,False
ENSMUSG00000033813,False,False,6241,0.829154,63.797203,14293.792969,6241,False,1.043117,1.383716,-0.455413,False
ENSMUSG00000002459,False,False,3,0.000174,99.982597,2.995511,3,False,0.000562,1.542446,0.282720,False
...,...,...,...,...,...,...,...,...,...,...,...,...
ENSMUSG00000094728,False,False,70,0.004056,99.593941,69.925201,70,False,0.006127,0.903737,-0.940885,False
ENSMUSG00000062783,False,False,91,0.005321,99.472130,91.735985,91,False,0.011063,1.130046,-0.507334,False
ENSMUSG00000096808,False,False,8,0.000464,99.953590,8.000000,8,False,0.000741,1.017444,-0.723052,False
ENSMUSG00000095742,False,False,1400,0.095975,91.878876,1654.511108,1400,False,0.240455,1.431409,0.070001,False


In [201]:
# Show the sparse-matrix summary (shape, dtype, stored elements, format) of adata.raw.X.
adata.raw.X

<3912x19283 sparse matrix of type '<class 'numpy.float32'>'
	with 7910284 stored elements in Compressed Sparse Row format>