# **Curating single-cell transcriptomic datasets of kidney cancer patients**

Article: 'Mapping single-cell transcriptomes in the intra-tumoral and associated territories of kidney cancer'

DOI : '10.1016/j.ccell.2022.11.001'

Data Source : 'https://data.mendeley.com/datasets/g67bkbnhhg/1'

### Mount farm

mount-farm

### Packages required for curation

In [2]:
# Import all packages required for the curation

In [3]:
import numpy as np
import pandas as pd
import scanpy as sc
import scipy
from tqdm import tqdm
from scipy import sparse
from scipy.sparse import csr_matrix
import anndata as ad
import os
import subprocess

### Curation Schema

#### X (Matrix Layers)

##### AnnData object

In [4]:
# Load the AnnData object

In [5]:
# Read the h5ad file that holds the normalised counts.
adata = sc.read_h5ad(
    filename='/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/kidney-cancer/RCC_upload_final_normalised_counts.h5ad'
)

In [6]:
# View the AnnData object

In [7]:
adata

AnnData object with n_obs × n_vars = 270855 × 19736
    obs: 'orig.ident', 'patient', 'summaryDescription', 'annotation', 'region', 'broad_type', 'n_counts'
    var: 'name'
    uns: 'annotation_colors', 'broad_type_colors', 'region_colors', 'summaryDescription_colors'
    obsm: 'X_pca', 'X_umap'

##### Raw counts matrix

In [8]:
# Check whether adata has raw counts or normalized counts

In [9]:
print(adata.X)

  (720, 0)	1.6043375
  (842, 0)	1.6081591
  (912, 0)	1.6724615
  (920, 0)	1.6018018
  (1007, 0)	1.6259605
  (1106, 0)	1.7318614
  (1825, 0)	1.6924223
  (5769, 0)	1.7716298
  (6109, 0)	1.5112342
  (9106, 0)	1.450639
  (9169, 0)	1.4350846
  (9379, 0)	1.4275725
  (10132, 0)	1.4531442
  (11518, 0)	1.567888
  (11948, 0)	1.6174862
  (11995, 0)	1.6139331
  (12047, 0)	1.679879
  (12053, 0)	1.6276026
  (12134, 0)	1.6217101
  (12573, 0)	1.6971401
  (12661, 0)	1.8498801
  (13108, 0)	1.6385416
  (14508, 0)	1.747735
  (14832, 0)	1.7627742
  (16361, 0)	1.717089
  :	:
  (152813, 19735)	1.0348034
  (152840, 19735)	1.1493376
  (152891, 19735)	1.012413
  (153067, 19735)	1.1288344
  (153240, 19735)	1.1585511
  (153282, 19735)	1.3138173
  (153333, 19735)	1.1546675
  (153471, 19735)	1.2834092
  (153531, 19735)	1.127562
  (153614, 19735)	1.3023945
  (153649, 19735)	1.1462536
  (153691, 19735)	1.2896675
  (153910, 19735)	1.1699181
  (153972, 19735)	1.0939673
  (154095, 19735)	1.5644413
  (154157, 19735)	1.12

In [10]:
# Load the raw counts matrix if it is provided as another object in data source

In [11]:
# Read the companion h5ad file that holds the raw counts.
araw = sc.read_h5ad(
    filename='/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/kidney-cancer/RCC_upload_final_raw_counts.h5ad'
)

In [12]:
# View raw data

In [13]:
araw

AnnData object with n_obs × n_vars = 270855 × 19736
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'patient', 'percent.mt', 'summaryDescription', 'annotation', 'region', 'broad_type'
    var: 'name'
    uns: 'annotation_colors', 'broad_type_colors', 'region_colors', 'summaryDescription_colors'
    obsm: 'X_pca', 'X_umap'

In [14]:
# Hold the raw counts in compressed-sparse-row format.
araw.X = csr_matrix(araw.X)

In [15]:
print(araw.X)

  (0, 259)	1.0
  (0, 359)	1.0
  (0, 384)	1.0
  (0, 440)	1.0
  (0, 570)	1.0
  (0, 624)	1.0
  (0, 632)	1.0
  (0, 805)	1.0
  (0, 862)	1.0
  (0, 907)	4.0
  (0, 1066)	1.0
  (0, 1077)	1.0
  (0, 1108)	1.0
  (0, 1131)	1.0
  (0, 1153)	1.0
  (0, 1227)	1.0
  (0, 1284)	1.0
  (0, 1294)	1.0
  (0, 1302)	2.0
  (0, 1338)	1.0
  (0, 1345)	1.0
  (0, 1347)	1.0
  (0, 1420)	1.0
  (0, 1430)	1.0
  (0, 1467)	1.0
  :	:
  (270854, 15379)	43.0
  (270854, 15659)	2.0
  (270854, 15669)	1.0
  (270854, 15865)	1.0
  (270854, 15920)	1.0
  (270854, 15990)	1.0
  (270854, 16187)	1.0
  (270854, 16229)	1.0
  (270854, 16267)	1.0
  (270854, 16522)	1.0
  (270854, 16667)	1.0
  (270854, 16742)	1.0
  (270854, 16749)	1.0
  (270854, 17297)	1.0
  (270854, 17332)	9.0
  (270854, 17783)	1.0
  (270854, 17854)	1.0
  (270854, 18071)	1.0
  (270854, 18092)	1.0
  (270854, 18168)	1.0
  (270854, 18676)	1.0
  (270854, 18884)	1.0
  (270854, 19103)	1.0
  (270854, 19593)	1.0
  (270854, 19618)	1.0


In [16]:
araw.X

<270855x19736 sparse matrix of type '<class 'numpy.float32'>'
	with 333922833 stored elements in Compressed Sparse Row format>

#### Variables (var)

In [17]:
# View var

In [18]:
adata.var

Unnamed: 0,name
FO538757.3,FO538757.3
FO538757.2,FO538757.2
AP006222.2,AP006222.2
RP5-857K21.4,RP5-857K21.4
RP11-206L10.9,RP11-206L10.9
...,...
BACH1-IT2,BACH1-IT2
AP000282.2,AP000282.2
LINC01436,LINC01436
CRYAA,CRYAA


In [19]:
araw.var

Unnamed: 0,name
FO538757.3,FO538757.3
FO538757.2,FO538757.2
AP006222.2,AP006222.2
RP5-857K21.4,RP5-857K21.4
RP11-206L10.9,RP11-206L10.9
...,...
BACH1-IT2,BACH1-IT2
AP000282.2,AP000282.2
LINC01436,LINC01436
CRYAA,CRYAA


##### Ensembl IDs

In [20]:
# Ensembl IDs are not provided in adata.var, so load the GRCh38-1.2.0 reference to match gene symbols to Ensembl IDs

In [21]:
# Tab-separated reference table pairing Ensembl ids with gene symbols.
gene_info = pd.read_csv(
    '/home/jovyan/CXG_DATASETS_PORTAL/gene_info/GRCh38-1.2.0.gene_names.tsv',
    sep='\t',
)

In [22]:
gene_info

Unnamed: 0,ensembl_ids,gene
0,ENSG00000243485,RP11-34P13.3
1,ENSG00000237613,FAM138A
2,ENSG00000186092,OR4F5
3,ENSG00000238009,RP11-34P13.7
4,ENSG00000239945,RP11-34P13.8
...,...,...
33689,ENSG00000277856,AC233755.2
33690,ENSG00000275063,AC233755.1
33691,ENSG00000271254,AC240274.1
33692,ENSG00000277475,AC213203.1


In [23]:
#create a dictionary with gene symbols and ensembl ids from the gene information file

In [24]:
# Gene symbol -> Ensembl id lookup; when a symbol occurs in several rows the last row wins.
gene_info_genesym_to_ensembl = {
    symbol: ensembl_id
    for symbol, ensembl_id in zip(gene_info['gene'], gene_info['ensembl_ids'])
}

In [25]:
gene_info_genesym_to_ensembl

{'RP11-34P13.3': 'ENSG00000243485',
 'FAM138A': 'ENSG00000237613',
 'OR4F5': 'ENSG00000186092',
 'RP11-34P13.7': 'ENSG00000238009',
 'RP11-34P13.8': 'ENSG00000239945',
 'RP11-34P13.14': 'ENSG00000239906',
 'RP11-34P13.9': 'ENSG00000241599',
 'FO538757.3': 'ENSG00000279928',
 'FO538757.2': 'ENSG00000279457',
 'AP006222.2': 'ENSG00000228463',
 'RP5-857K21.15': 'ENSG00000236743',
 'RP4-669L17.2': 'ENSG00000236601',
 'RP4-669L17.10': 'ENSG00000237094',
 'OR4F29': 'ENSG00000278566',
 'RP5-857K21.4': 'ENSG00000230021',
 'RP5-857K21.2': 'ENSG00000235146',
 'OR4F16': 'ENSG00000273547',
 'RP11-206L10.4': 'ENSG00000229905',
 'RP11-206L10.9': 'ENSG00000237491',
 'FAM87B': 'ENSG00000177757',
 'LINC00115': 'ENSG00000225880',
 'FAM41C': 'ENSG00000230368',
 'RP11-54O7.16': 'ENSG00000272438',
 'RP11-54O7.1': 'ENSG00000230699',
 'RP11-54O7.2': 'ENSG00000241180',
 'RP11-54O7.3': 'ENSG00000223764',
 'SAMD11': 'ENSG00000187634',
 'NOC2L': 'ENSG00000188976',
 'KLHL17': 'ENSG00000187961',
 'PLEKHN1': 'ENSG0

In [26]:
#Store ensembl ids in a new column in adata.var by matching gene symbols and ensembl ids from the gene information file

In [27]:
# Look each gene symbol up in the reference; symbols without an entry become NaN.
adata_gene_symbols = adata.var['name']
adata.var['ensembl_id'] = adata_gene_symbols.map(gene_info_genesym_to_ensembl)

In [28]:
# Same symbol -> Ensembl id lookup for the raw-counts object; unmatched symbols become NaN.
araw_gene_symbols = araw.var['name']
araw.var['ensembl_id'] = araw_gene_symbols.map(gene_info_genesym_to_ensembl)

In [29]:
adata.var

Unnamed: 0,name,ensembl_id
FO538757.3,FO538757.3,ENSG00000279928
FO538757.2,FO538757.2,ENSG00000279457
AP006222.2,AP006222.2,ENSG00000228463
RP5-857K21.4,RP5-857K21.4,ENSG00000230021
RP11-206L10.9,RP11-206L10.9,ENSG00000237491
...,...,...
BACH1-IT2,BACH1-IT2,ENSG00000228817
AP000282.2,AP000282.2,ENSG00000227757
LINC01436,LINC01436,ENSG00000231106
CRYAA,CRYAA,ENSG00000160202


In [30]:
adata.var['ensembl_id'].isna().sum()

44

In [31]:
araw.var

Unnamed: 0,name,ensembl_id
FO538757.3,FO538757.3,ENSG00000279928
FO538757.2,FO538757.2,ENSG00000279457
AP006222.2,AP006222.2,ENSG00000228463
RP5-857K21.4,RP5-857K21.4,ENSG00000230021
RP11-206L10.9,RP11-206L10.9,ENSG00000237491
...,...,...
BACH1-IT2,BACH1-IT2,ENSG00000228817
AP000282.2,AP000282.2,ENSG00000227757
LINC01436,LINC01436,ENSG00000231106
CRYAA,CRYAA,ENSG00000160202


In [32]:
#set the column ensembl_id as the index column

In [33]:
# Re-index the genes by Ensembl id; the 'ensembl_id' column becomes the index and 'name' keeps the symbol.
adata.var = adata.var.set_index(keys='ensembl_id')

In [34]:
# Re-index the raw-counts genes by Ensembl id as well, so both objects share the same var index.
araw.var = araw.var.set_index(keys='ensembl_id')

In [35]:
adata.var

Unnamed: 0_level_0,name
ensembl_id,Unnamed: 1_level_1
ENSG00000279928,FO538757.3
ENSG00000279457,FO538757.2
ENSG00000228463,AP006222.2
ENSG00000230021,RP5-857K21.4
ENSG00000237491,RP11-206L10.9
...,...
ENSG00000228817,BACH1-IT2
ENSG00000227757,AP000282.2
ENSG00000231106,LINC01436
ENSG00000160202,CRYAA


In [36]:
araw.var

Unnamed: 0_level_0,name
ensembl_id,Unnamed: 1_level_1
ENSG00000279928,FO538757.3
ENSG00000279457,FO538757.2
ENSG00000228463,AP006222.2
ENSG00000230021,RP5-857K21.4
ENSG00000237491,RP11-206L10.9
...,...
ENSG00000228817,BACH1-IT2
ENSG00000227757,AP000282.2
ENSG00000231106,LINC01436
ENSG00000160202,CRYAA


In [37]:
#Load the approved genes file 

In [38]:
# Table of approved genes; its 'feature_id' column is used below to filter var.
approved_genes = pd.read_csv(
    filepath_or_buffer='/home/jovyan/CXG_DATASETS_PORTAL/gene_info/genes_approved.csv'
)

In [39]:
# Approved symbol -> feature id lookup; a repeated symbol keeps the value from its last row.
approved_gene_symbol_to_ensembl = {
    symbol: feature_id
    for symbol, feature_id in zip(approved_genes['symb'], approved_genes['feature_id'])
}

In [40]:
approved_gene_symbol_to_ensembl

{'ERCC-00002 (spike-in control)': 'ERCC-00002',
 'ERCC-00003 (spike-in control)': 'ERCC-00003',
 'ERCC-00004 (spike-in control)': 'ERCC-00004',
 'ERCC-00009 (spike-in control)': 'ERCC-00009',
 'ERCC-00012 (spike-in control)': 'ERCC-00012',
 'ERCC-00013 (spike-in control)': 'ERCC-00013',
 'ERCC-00014 (spike-in control)': 'ERCC-00014',
 'ERCC-00016 (spike-in control)': 'ERCC-00016',
 'ERCC-00017 (spike-in control)': 'ERCC-00017',
 'ERCC-00019 (spike-in control)': 'ERCC-00019',
 'ERCC-00022 (spike-in control)': 'ERCC-00022',
 'ERCC-00024 (spike-in control)': 'ERCC-00024',
 'ERCC-00025 (spike-in control)': 'ERCC-00025',
 'ERCC-00028 (spike-in control)': 'ERCC-00028',
 'ERCC-00031 (spike-in control)': 'ERCC-00031',
 'ERCC-00033 (spike-in control)': 'ERCC-00033',
 'ERCC-00034 (spike-in control)': 'ERCC-00034',
 'ERCC-00035 (spike-in control)': 'ERCC-00035',
 'ERCC-00039 (spike-in control)': 'ERCC-00039',
 'ERCC-00040 (spike-in control)': 'ERCC-00040',
 'ERCC-00041 (spike-in control)': 'ERCC-

In [41]:
# Filter out genes that don't appear in the approved annotation

In [42]:
# Store all ensembl ids in adata.var to a list called var_to_keep_adata

In [43]:
# All var index entries (Ensembl ids) of the normalised object, as a plain list.
var_to_keep_adata = adata.var_names.tolist()

In [44]:
# All var index entries (Ensembl ids) of the raw-counts object, as a plain list.
var_to_keep_araw = araw.var_names.tolist()

In [45]:
#View the list

In [46]:
var_to_keep_adata

['ENSG00000279928',
 'ENSG00000279457',
 'ENSG00000228463',
 'ENSG00000230021',
 'ENSG00000237491',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000188157',
 'ENSG00000131591',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000160087',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000169962',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',
 'ENSG00000242485',
 'ENSG00000179403',
 'ENSG00000215915',
 'ENSG00000160072',
 'ENSG00000197785',
 'ENSG00000160075',
 'ENSG00000215014',
 'ENSG00000228594',
 'ENSG00000272106',
 'ENSG00000197530',
 'ENSG00000189409',
 'ENSG00000248333',
 'ENSG00000272004',
 'ENSG00000189339',
 'ENSG00000008128',
 'ENSG00000215790',
 'ENSG00000008130',
 'ENSG00000078369',
 'ENSG00000178821',


In [47]:
var_to_keep_araw

['ENSG00000279928',
 'ENSG00000279457',
 'ENSG00000228463',
 'ENSG00000230021',
 'ENSG00000237491',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000188157',
 'ENSG00000131591',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000160087',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000169962',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',
 'ENSG00000242485',
 'ENSG00000179403',
 'ENSG00000215915',
 'ENSG00000160072',
 'ENSG00000197785',
 'ENSG00000160075',
 'ENSG00000215014',
 'ENSG00000228594',
 'ENSG00000272106',
 'ENSG00000197530',
 'ENSG00000189409',
 'ENSG00000248333',
 'ENSG00000272004',
 'ENSG00000189339',
 'ENSG00000008128',
 'ENSG00000215790',
 'ENSG00000008130',
 'ENSG00000078369',
 'ENSG00000178821',


In [48]:
#Check number of elements in var_to_keep_adata

In [49]:
len(var_to_keep_adata)

19736

In [50]:
len(var_to_keep_araw)

19736

In [51]:
# Select the var index entries that are listed as a feature_id in the approved gene table.
adata_is_approved = adata.var.index.isin(approved_genes['feature_id'])
var_in_approved_adata = adata.var.index[adata_is_approved].tolist()

In [52]:
var_in_approved_adata

['ENSG00000279928',
 'ENSG00000279457',
 'ENSG00000228463',
 'ENSG00000230021',
 'ENSG00000237491',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000188157',
 'ENSG00000131591',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000160087',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000169962',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',
 'ENSG00000242485',
 'ENSG00000179403',
 'ENSG00000215915',
 'ENSG00000160072',
 'ENSG00000197785',
 'ENSG00000160075',
 'ENSG00000215014',
 'ENSG00000228594',
 'ENSG00000272106',
 'ENSG00000197530',
 'ENSG00000189409',
 'ENSG00000248333',
 'ENSG00000272004',
 'ENSG00000189339',
 'ENSG00000008128',
 'ENSG00000215790',
 'ENSG00000008130',
 'ENSG00000078369',
 'ENSG00000178821',


In [53]:
# Select the raw-counts var index entries that are listed as a feature_id in the approved gene table.
araw_is_approved = araw.var.index.isin(approved_genes['feature_id'])
var_in_approved_araw = araw.var.index[araw_is_approved].tolist()

In [54]:
var_in_approved_araw

['ENSG00000279928',
 'ENSG00000279457',
 'ENSG00000228463',
 'ENSG00000230021',
 'ENSG00000237491',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000188157',
 'ENSG00000131591',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000160087',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000169962',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',
 'ENSG00000242485',
 'ENSG00000179403',
 'ENSG00000215915',
 'ENSG00000160072',
 'ENSG00000197785',
 'ENSG00000160075',
 'ENSG00000215014',
 'ENSG00000228594',
 'ENSG00000272106',
 'ENSG00000197530',
 'ENSG00000189409',
 'ENSG00000248333',
 'ENSG00000272004',
 'ENSG00000189339',
 'ENSG00000008128',
 'ENSG00000215790',
 'ENSG00000008130',
 'ENSG00000078369',
 'ENSG00000178821',


In [55]:
len(var_in_approved_adata)

19507

In [56]:
len(var_in_approved_araw)

19507

In [57]:
# Keep the ids from var_to_keep_adata that are approved, preserving their order.
# Membership is tested against a set: `e in <list>` scans the whole list for every
# element, which is quadratic for the ~19.7k ids handled here.
approved_ids_adata = set(var_in_approved_adata)
var_to_keep_adata2 = [e for e in var_to_keep_adata if e in approved_ids_adata]

In [58]:
var_to_keep_adata2

['ENSG00000279928',
 'ENSG00000279457',
 'ENSG00000228463',
 'ENSG00000230021',
 'ENSG00000237491',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000188157',
 'ENSG00000131591',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000160087',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000169962',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',
 'ENSG00000242485',
 'ENSG00000179403',
 'ENSG00000215915',
 'ENSG00000160072',
 'ENSG00000197785',
 'ENSG00000160075',
 'ENSG00000215014',
 'ENSG00000228594',
 'ENSG00000272106',
 'ENSG00000197530',
 'ENSG00000189409',
 'ENSG00000248333',
 'ENSG00000272004',
 'ENSG00000189339',
 'ENSG00000008128',
 'ENSG00000215790',
 'ENSG00000008130',
 'ENSG00000078369',
 'ENSG00000178821',


In [59]:
len(var_to_keep_adata2)

19507

In [60]:
# Keep the ids from var_to_keep_araw that are approved, preserving their order.
# Membership is tested against a set: `e in <list>` scans the whole list for every
# element, which is quadratic for the ~19.7k ids handled here.
approved_ids_araw = set(var_in_approved_araw)
var_to_keep_araw2 = [e for e in var_to_keep_araw if e in approved_ids_araw]

In [61]:
var_to_keep_araw2

['ENSG00000279928',
 'ENSG00000279457',
 'ENSG00000228463',
 'ENSG00000230021',
 'ENSG00000237491',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000188157',
 'ENSG00000131591',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000160087',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',
 'ENSG00000169962',
 'ENSG00000107404',
 'ENSG00000162576',
 'ENSG00000175756',
 'ENSG00000221978',
 'ENSG00000242485',
 'ENSG00000179403',
 'ENSG00000215915',
 'ENSG00000160072',
 'ENSG00000197785',
 'ENSG00000160075',
 'ENSG00000215014',
 'ENSG00000228594',
 'ENSG00000272106',
 'ENSG00000197530',
 'ENSG00000189409',
 'ENSG00000248333',
 'ENSG00000272004',
 'ENSG00000189339',
 'ENSG00000008128',
 'ENSG00000215790',
 'ENSG00000008130',
 'ENSG00000078369',
 'ENSG00000178821',


In [62]:
len(var_to_keep_araw2)

19507

In [63]:
# Restrict the normalised object to the approved genes.
# Slicing an AnnData returns a view; .copy() materialises it so that the later
# writes to adata.var / adata.obs modify a real object instead of implicitly
# converting the view (which emits a warning on first assignment).
adata = adata[:, adata.var.index.isin(var_to_keep_adata2)].copy()

In [64]:
# Restrict the raw-counts object to the approved genes (result stays an AnnData view).
araw_keep_mask = araw.var_names.isin(var_to_keep_araw2)
araw = araw[:, araw_keep_mask]

In [65]:
# Flag every remaining gene as not filtered; the scalar is broadcast to all rows.
adata.var['feature_is_filtered'] = False

  adata.var['feature_is_filtered'] = [False] * len(adata.var)


In [66]:
adata.var

Unnamed: 0_level_0,name,feature_is_filtered
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000279928,FO538757.3,False
ENSG00000279457,FO538757.2,False
ENSG00000228463,AP006222.2,False
ENSG00000230021,RP5-857K21.4,False
ENSG00000237491,RP11-206L10.9,False
...,...,...
ENSG00000228817,BACH1-IT2,False
ENSG00000227757,AP000282.2,False
ENSG00000231106,LINC01436,False
ENSG00000160202,CRYAA,False


In [67]:
araw.var

Unnamed: 0_level_0,name
ensembl_id,Unnamed: 1_level_1
ENSG00000279928,FO538757.3
ENSG00000279457,FO538757.2
ENSG00000228463,AP006222.2
ENSG00000230021,RP5-857K21.4
ENSG00000237491,RP11-206L10.9
...,...
ENSG00000228817,BACH1-IT2
ENSG00000227757,AP000282.2
ENSG00000231106,LINC01436
ENSG00000160202,CRYAA


In [68]:
adata

AnnData object with n_obs × n_vars = 270855 × 19507
    obs: 'orig.ident', 'patient', 'summaryDescription', 'annotation', 'region', 'broad_type', 'n_counts'
    var: 'name', 'feature_is_filtered'
    uns: 'annotation_colors', 'broad_type_colors', 'region_colors', 'summaryDescription_colors'
    obsm: 'X_pca', 'X_umap'

In [69]:
araw

View of AnnData object with n_obs × n_vars = 270855 × 19507
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'patient', 'percent.mt', 'summaryDescription', 'annotation', 'region', 'broad_type'
    var: 'name'
    uns: 'annotation_colors', 'broad_type_colors', 'region_colors', 'summaryDescription_colors'
    obsm: 'X_pca', 'X_umap'

In [70]:
#Store araw to adata.raw

In [71]:
adata.raw = araw

In [72]:
print(adata.raw.X)

  (0, 257)	1.0
  (0, 356)	1.0
  (0, 381)	1.0
  (0, 436)	1.0
  (0, 566)	1.0
  (0, 620)	1.0
  (0, 628)	1.0
  (0, 799)	1.0
  (0, 856)	1.0
  (0, 900)	4.0
  (0, 1058)	1.0
  (0, 1069)	1.0
  (0, 1100)	1.0
  (0, 1123)	1.0
  (0, 1145)	1.0
  (0, 1218)	1.0
  (0, 1275)	1.0
  (0, 1285)	1.0
  (0, 1293)	2.0
  (0, 1329)	1.0
  (0, 1336)	1.0
  (0, 1338)	1.0
  (0, 1410)	1.0
  (0, 1418)	1.0
  (0, 1454)	1.0
  :	:
  (270854, 15253)	43.0
  (270854, 15529)	2.0
  (270854, 15538)	1.0
  (270854, 15731)	1.0
  (270854, 15786)	1.0
  (270854, 15854)	1.0
  (270854, 16050)	1.0
  (270854, 16092)	1.0
  (270854, 16129)	1.0
  (270854, 16382)	1.0
  (270854, 16523)	1.0
  (270854, 16597)	1.0
  (270854, 16603)	1.0
  (270854, 17139)	1.0
  (270854, 17173)	9.0
  (270854, 17607)	1.0
  (270854, 17675)	1.0
  (270854, 17885)	1.0
  (270854, 17906)	1.0
  (270854, 17982)	1.0
  (270854, 18474)	1.0
  (270854, 18675)	1.0
  (270854, 18891)	1.0
  (270854, 19366)	1.0
  (270854, 19390)	1.0


## obs (Cell metadata)

In [73]:
#view obs

In [74]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0
...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0


#### assay_ontology_term_id

In [75]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0
...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0


In [76]:
# Every cell gets the same assay term; the scalar is broadcast to all rows.
adata.obs['assay_ontology_term_id'] = 'EFO:0030004'

In [77]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004
...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004


#### cell_type_ontology_term_id

In [78]:
# Get the column in adata.obs related to cell type annotation

In [79]:
list(adata.obs['annotation'].unique())

['Plasma-IgG',
 'Switched-memory_B',
 'Plasma-IgA',
 'Naïve_B.2',
 'Plasma-cycling',
 'Low quality',
 'Naïve_B.1',
 'Non-switched-memory_B',
 'Activated_B-AREG',
 'Activated_B-RHOB',
 'Stress_B',
 'INF-response_B',
 'Unknown',
 'CD4+T_EM',
 'CD4+T_Naïve/CM',
 'CD4+Treg',
 'CD4+T_Naïve/CM-patient-specific.1',
 'CD4+T_INF-response',
 'CD4+T_Act-CXCR4',
 'CD4+T_Act-NR4A1',
 'CD4+T_Act-CCL5',
 'CD4+T_Act-CH25H',
 'CD4+T_Naïve/CM-patient-specific.2',
 'CD8+T_EM',
 'gdT_Vd2',
 'CD8+T_EMRA',
 'CD8+T_Naïve/CM',
 'CD8+T_Cycling-G2/M',
 'MAIT',
 'CD8+T_Cycling-G1/S',
 'gdT_Vd1',
 'CD8+T_EX-IL10',
 'CD8+T_EX-CCL4L2',
 'CD8+T_preEX-CXCR4',
 'CD8+T_EFF.2',
 'CD8+T_EFF.1',
 'CD8+T_Act-XCL1/2',
 'CD8+T_RM-CXCL13',
 'CD8+T_preEX-PMCH',
 'CCL5+_EC',
 'ACKR+_EC',
 'Collagen_EC',
 'DNASE1L3+_EC',
 'IGF2+_EC',
 'CRHBP+_EC',
 'Cycling_EC',
 'Pericyte',
 'IGFBP3+_EC',
 'FABP4+_EC',
 'Lymphatic_EC',
 'CRYAB+ Epi',
 'LoH ATL',
 'PT1/2',
 'PT3',
 'LoH TAL',
 'DCT cell',
 'Type A intercalated cell',
 'Podocyte'

In [80]:
# Lookup from the author-provided `annotation` label to a Cell Ontology (CL) term id.
# - Several labels share one term (e.g. every '..._EC' label except 'Collagen_EC' and
#   'Lymphatic_EC' maps to CL:0000115).
# - Labels that do not name a specific cell type ('Low quality', 'Unknown', 'EMT',
#   'Cycling', 'Stress', 'CD') all map to CL:0000003.
# NOTE(review): the name `mapping` is reused further down for the age -> HsapDv lookup,
# so this cell must be (re-)run before the cell that maps `annotation` through it.
mapping= {'Plasma-IgG': 'CL:0000985',
'Switched-memory_B': 'CL:0000972',
'Plasma-IgA': 'CL:0000987',
'Naïve_B.2': 'CL:0000788',
'Plasma-cycling': 'CL:0000786',
'Low quality': 'CL:0000003',
'Naïve_B.1': 'CL:0000788',
'Non-switched-memory_B': 'CL:0000970',
'Activated_B-AREG': 'CL:0000236',
'Activated_B-RHOB': 'CL:0000236',
'Stress_B': 'CL:0000236',
'INF-response_B': 'CL:0000236',
'Unknown': 'CL:0000003',
'CD4+T_EM': 'CL:0000905',
'CD4+T_Naïve/CM': 'CL:0000895',
'CD4+Treg': 'CL:0000792',
'CD4+T_Naïve/CM-patient-specific.1': 'CL:0000895',
'CD4+T_INF-response': 'CL:0000624',
'CD4+T_Act-CXCR4': 'CL:0000624',
'CD4+T_Act-NR4A1': 'CL:0000624',
'CD4+T_Act-CCL5': 'CL:0000624',
'CD4+T_Act-CH25H': 'CL:0000624',
'CD4+T_Naïve/CM-patient-specific.2': 'CL:0000895',
'CD8+T_EM': 'CL:0000913',
'gdT_Vd2': 'CL:0000798',
'CD8+T_EMRA': 'CL:0000913',
'CD8+T_Naïve/CM': 'CL:0000900',
'CD8+T_Cycling-G2/M': 'CL:0000625',
'MAIT': 'CL:0000940',
'CD8+T_Cycling-G1/S': 'CL:0000625',
'gdT_Vd1': 'CL:0000798',
'CD8+T_EX-IL10': 'CL:0000625',
'CD8+T_EX-CCL4L2': 'CL:0000625',
'CD8+T_preEX-CXCR4': 'CL:0000625',
'CD8+T_EFF.2': 'CL:0000625',
'CD8+T_EFF.1': 'CL:0000625',
'CD8+T_Act-XCL1/2': 'CL:0000625',
'CD8+T_RM-CXCL13': 'CL:0000625',
'CD8+T_preEX-PMCH': 'CL:0000625',
'CCL5+_EC': 'CL:0000115',
'ACKR+_EC': 'CL:0000115',
'Collagen_EC': 'CL:0000667',
'DNASE1L3+_EC': 'CL:0000115',
'IGF2+_EC': 'CL:0000115',
'CRHBP+_EC': 'CL:0000115',
'Cycling_EC': 'CL:0000115',
'Pericyte': 'CL:0000669',
'IGFBP3+_EC': 'CL:0000115',
'FABP4+_EC': 'CL:0000115',
'Lymphatic_EC': 'CL:0002138',
'CRYAB+ Epi': 'CL:0000066',
'LoH ATL': 'CL:1001016',
'PT1/2': 'CL:4030009',
'PT3': 'CL:4030011',
'LoH TAL': 'CL:1001106',
'DCT cell': 'CL:1000849',
'Type A intercalated cell': 'CL:0005011',
'Podocyte': 'CL:0000653',
'CD principal cell': 'CL:1001431',
'Connecting tubule cell': 'CL:4030018',
'Type B intercalated cell': 'CL:0002201',
'Pelvic urothelial cell': 'CL:1000703',
'CYGB+ Fibro': 'CL:0000057',
'MHC-II Fibro': 'CL:0000057',
'CNN1+ Fibro': 'CL:0000057',
'Collagen Fibro': 'CL:0000667',
'MMP Fibro': 'CL:0000057',
'CDH19+ Fibro': 'CL:0000057',
'Stress Fibro': 'CL:0000057',
'SMC-like Fibro': 'CL:0000057',
'NFKBIZ+ Fibro': 'CL:0000057',
'Non-classical Mono': 'CL:0000875',
'Classical Mono.2': 'CL:0000860',
'Classical Mono.4': 'CL:0000860',
'Classical Mono.3': 'CL:0000860',
'cDC1': 'CL:0000990',
'pDC': 'CL:0000784',
'cDC2': 'CL:0000990',
'TR Mac.2': 'CL:0000235',
'Classical Mono.1': 'CL:0000860',
'RGS+ TAM': 'CL:0000235',
'Pro-infla. TAM': 'CL:0000235',
'MHC-II TAM': 'CL:0000235',
'TR Mac.1': 'CL:0000235',
'GPNMB+ TAM': 'CL:0000235',
'TR Mac.3': 'CL:0000235',
'SPP1+ TAM': 'CL:0000235',
'FN1+ TAM': 'CL:0000235',
'Mast cell': 'CL:0000097',
'NK-CX3CR1-high': 'CL:0000623',
'cir_NK-PTGDS': 'CL:0000623',
'NK-MTRNR2L12': 'CL:0000623',
'ILC': 'CL:0001065',
'Inflam NK': 'CL:0000623',
'Activated KRT+ NK': 'CL:0000931',
'Activated NK': 'CL:0000931',
'NK-IFNG-high': 'CL:0000623',
'cir_NK-S100B': 'CL:0000623',
'NK-FXYD2': 'CL:0000623',
'NK-SPARC': 'CL:0000623',
'NK-CD65-bright': 'CL:0000623',
'PT': 'CL:0002306',
'MHCII': 'CL:0002470',
'EMT': 'CL:0000003',
'Cycling': 'CL:0000003',
'Stress': 'CL:0000003',
'CD': 'CL:0000003'}

In [80]:
# Translate each author annotation into its Cell Ontology term id.
cell_type_ids = adata.obs['annotation'].map(mapping)

# Series.map yields NaN for any label missing from `mapping`; stop here with the
# offending labels rather than silently writing NaN term ids into obs.
unmapped_annotations = sorted(
    adata.obs.loc[cell_type_ids.isna(), 'annotation'].astype(str).unique()
)
if unmapped_annotations:
    raise ValueError(
        f"No cell type ontology term defined for annotation(s): {unmapped_annotations}"
    )

adata.obs['cell_type_ontology_term_id'] = cell_type_ids

In [81]:
# Store the term ids as a pandas categorical column.
cell_type_term_ids = adata.obs['cell_type_ontology_term_id']
adata.obs['cell_type_ontology_term_id'] = cell_type_term_ids.astype('category')

In [82]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987
...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306


#### development_stage_ontology_term_id

In [83]:
# Since no age information is found in adata.obs, the supplementary table containing it is loaded

In [84]:
# Per-patient supplementary table (includes the Patient_ID and age columns used below).
suppl_info = pd.read_csv(
    filepath_or_buffer='/home/jovyan/CXG_DATASETS_PORTAL/kidney_cancer/suppl_info.csv'
)

In [85]:
suppl_info

Unnamed: 0,Patient_ID,sex,age,stage,metastases,grade,necrosis,sarcomatoid changes,lymphovascular invasion,Leibovich,...,PBRM1,BAP1,SETD2,TSC2,KDM5C,ARID1A,grade - region a,grade - region c,grade - region d,grade - region e
0,PD43824,male,41-50,1b,0,2.0,no,no,no,2.0,...,,,,,,,2.0,2.0,2.0,2.0
1,PD43948,female,71-80,3a,1,4.0,yes,yes,yes,8.0,...,,,,bifs,,,3.0,2.0,3.0,3.0
2,PD44714,male,51-60,,0,,,,,,...,,,,,,,,,,
3,PD44966,male,51-60,1a,0,3.0,no,no,no,1.0,...,,fs,,,,,2.0,,,
4,PD44967,male,71-80,,1,4.0,no,no,no,8.0,...,-,-,-,-,-,-,,,,
5,PD45814,male,61-70,3a,0,4.0,yes,yes,yes,8.0,...,fs,,,,,,3.0,3.0,3.0,2.0
6,PD45815,male,51-60,3a,0,2.0,no,no,no,4.0,...,,,,,,ns-sub-subclonal,2.0,2.0,2.0,2.0
7,PD45816,female,71-80,3a,0,4.0,yes,yes,yes,9.0,...,,ns-sub,,,,,3.0,3.0,3.0,3.0
8,PD47171,female,51-60,3a,0,4.0,yes,yes,yes,7.0,...,fs,ns-sub,,,,,2.0,2.0,2.0,2.0
9,PD47172,male,51-60,,0,,,,,,...,,,,,,,,,,


In [86]:
# Map each patient's age band onto the cells in adata.obs

In [87]:
# Patient_ID -> age band lookup taken from the supplementary table.
age = suppl_info.set_index('Patient_ID')['age'].to_dict()

In [88]:
age

{'PD43824': '41-50',
 'PD43948': '71-80',
 'PD44714': '51-60',
 'PD44966': '51-60',
 'PD44967': '71-80',
 'PD45814': '61-70',
 'PD45815': '51-60',
 'PD45816': '71-80',
 'PD47171': '51-60',
 'PD47172': '51-60',
 'PD47465': '61-70',
 'PD47512': '51-60'}

In [89]:
# Look up every cell's age band through its patient ID.
age_by_cell = adata.obs['patient'].map(age)
adata.obs['age'] = age_by_cell

In [90]:
list(suppl_info['age'].unique())

['41-50', '71-80', '51-60', '61-70']

In [91]:
# Age band (as written in the supplementary table) -> HsapDv term ID.
# NOTE(review): presumably decade-level HsapDv stages; confirm the IDs against the ontology.
mapping = dict([
    ('41-50', 'HsapDv:0000239'),
    ('71-80', 'HsapDv:0000242'),
    ('51-60', 'HsapDv:0000240'),
    ('61-70', 'HsapDv:0000241'),
])

In [92]:
# Translate each cell's age band into its HsapDv term via `mapping`.
dev_stage_terms = adata.obs['age'].map(mapping)
adata.obs['development_stage_ontology_term_id'] = dev_stage_terms

In [93]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,age,development_stage_ontology_term_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,71-80,HsapDv:0000242
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,71-80,HsapDv:0000242
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,71-80,HsapDv:0000242
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242
...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,51-60,HsapDv:0000240
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240


#### disease_ontology_term_id

In [94]:
# Patient_ID -> histology lookup taken from the supplementary table.
# Bind only `disease` here: `age` must keep holding the Patient_ID -> age-band
# dict, otherwise re-running the age-mapping cell would fill adata.obs['age']
# with histology values.
disease = dict(zip(suppl_info['Patient_ID'], suppl_info['Histology']))

In [95]:
# Attach the patient's histology label to every cell.
disease_by_cell = adata.obs['patient'].map(disease)
adata.obs['disease'] = disease_by_cell

In [96]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,age,development_stage_ontology_term_id,disease
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,71-80,HsapDv:0000242,ccRCC
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,71-80,HsapDv:0000242,ccRCC
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,71-80,HsapDv:0000242,ccRCC
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC
...,...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,51-60,HsapDv:0000240,ccRCC
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC


In [97]:
list(suppl_info['Histology'].unique())

['ccRCC', 'Benign', 'oncocytoma']

In [98]:
# Histology label from the supplementary table -> MONDO term ID.
# NOTE(review): the term IDs are copied as-is; confirm each against MONDO.
mapping = dict(
    ccRCC='MONDO:0007763',
    Benign='MONDO:0002513',
    oncocytoma='MONDO:0003825',
)

In [99]:
# Translate each cell's histology label into its MONDO term via `mapping`.
disease_terms = adata.obs['disease'].map(mapping)
adata.obs['disease_ontology_term_id'] = disease_terms

In [100]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,age,development_stage_ontology_term_id,disease,disease_ontology_term_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,71-80,HsapDv:0000242,ccRCC,MONDO:0007763
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,71-80,HsapDv:0000242,ccRCC,MONDO:0007763
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,71-80,HsapDv:0000242,ccRCC,MONDO:0007763
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,51-60,HsapDv:0000240,ccRCC,MONDO:0007763
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763


#### donor_id

In [101]:
# The patient identifier is used as the donor ID.
donor_ids = adata.obs['patient']
adata.obs['donor_id'] = donor_ids

In [102]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,age,development_stage_ontology_term_id,disease,disease_ontology_term_id,donor_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512


#### is_primary_data

In [103]:
# Flag every cell as primary data (scalar is broadcast to all rows).
adata.obs['is_primary_data'] = True

In [104]:
# Make sure the flag column is stored with a plain boolean dtype.
primary_flags = adata.obs['is_primary_data']
adata.obs['is_primary_data'] = primary_flags.astype(bool)

#### organism_ontology_term_id

In [105]:
# Same organism term for every cell (scalar is broadcast to all rows).
adata.obs['organism_ontology_term_id'] = 'NCBITaxon:9606'

In [106]:
# Store the organism term as a categorical column.
organism_terms = adata.obs['organism_ontology_term_id']
adata.obs['organism_ontology_term_id'] = organism_terms.astype('category')

In [107]:
adata.obs['organism_ontology_term_id']

5739STDY7958790_AACTCAGTCCTCAACC-1    NCBITaxon:9606
5739STDY7958790_AAGTCTGTCGGATGGA-1    NCBITaxon:9606
5739STDY7958790_ACACCCTGTGTGAAAT-1    NCBITaxon:9606
5739STDY7958790_ACATGGTTCAGTTTGG-1    NCBITaxon:9606
5739STDY7958790_ACCGTAAGTCTTCTCG-1    NCBITaxon:9606
                                           ...      
5739STDY9266997_TGCTACCAGTGGAGAA-1    NCBITaxon:9606
5739STDY9266997_TTCTCCTTCCTTAATC-1    NCBITaxon:9606
5739STDY9266997_TTGAACGGTGCGATAG-1    NCBITaxon:9606
5739STDY9266997_TTGACTTCACCCAGTG-1    NCBITaxon:9606
5739STDY9266997_TTTGCGCGTCTGCGGT-1    NCBITaxon:9606
Name: organism_ontology_term_id, Length: 270855, dtype: category
Categories (1, object): ['NCBITaxon:9606']

#### self_reported_ethnicity_ontology_term_id

In [108]:
# Ethnicity is recorded as 'unknown' for every cell (scalar is broadcast to all rows).
adata.obs['self_reported_ethnicity_ontology_term_id'] = 'unknown'

In [109]:
# Store the ethnicity term as a categorical column.
ethnicity_terms = adata.obs['self_reported_ethnicity_ontology_term_id']
adata.obs['self_reported_ethnicity_ontology_term_id'] = pd.Categorical(ethnicity_terms)

In [110]:
adata.obs['self_reported_ethnicity_ontology_term_id']

5739STDY7958790_AACTCAGTCCTCAACC-1    unknown
5739STDY7958790_AAGTCTGTCGGATGGA-1    unknown
5739STDY7958790_ACACCCTGTGTGAAAT-1    unknown
5739STDY7958790_ACATGGTTCAGTTTGG-1    unknown
5739STDY7958790_ACCGTAAGTCTTCTCG-1    unknown
                                       ...   
5739STDY9266997_TGCTACCAGTGGAGAA-1    unknown
5739STDY9266997_TTCTCCTTCCTTAATC-1    unknown
5739STDY9266997_TTGAACGGTGCGATAG-1    unknown
5739STDY9266997_TTGACTTCACCCAGTG-1    unknown
5739STDY9266997_TTTGCGCGTCTGCGGT-1    unknown
Name: self_reported_ethnicity_ontology_term_id, Length: 270855, dtype: category
Categories (1, object): ['unknown']

In [111]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,age,development_stage_ontology_term_id,disease,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown


#### sex_ontology_term_id

In [112]:
# Patient_ID -> sex lookup taken from the supplementary table.
sex = suppl_info.set_index('Patient_ID')['sex'].to_dict()

In [113]:
sex

{'PD43824': 'male',
 'PD43948': 'female',
 'PD44714': 'male',
 'PD44966': 'male',
 'PD44967': 'male',
 'PD45814': 'male',
 'PD45815': 'male',
 'PD45816': 'female',
 'PD47171': 'female',
 'PD47172': 'male',
 'PD47465': 'female',
 'PD47512': 'male'}

In [114]:
# Attach the patient's sex to every cell.
sex_by_cell = adata.obs['patient'].map(sex)
adata.obs['sex'] = sex_by_cell

In [115]:
adata.obs['sex']

5739STDY7958790_AACTCAGTCCTCAACC-1    female
5739STDY7958790_AAGTCTGTCGGATGGA-1    female
5739STDY7958790_ACACCCTGTGTGAAAT-1    female
5739STDY7958790_ACATGGTTCAGTTTGG-1    female
5739STDY7958790_ACCGTAAGTCTTCTCG-1    female
                                       ...  
5739STDY9266997_TGCTACCAGTGGAGAA-1      male
5739STDY9266997_TTCTCCTTCCTTAATC-1      male
5739STDY9266997_TTGAACGGTGCGATAG-1      male
5739STDY9266997_TTGACTTCACCCAGTG-1      male
5739STDY9266997_TTTGCGCGTCTGCGGT-1      male
Name: sex, Length: 270855, dtype: object

In [116]:
# Sex label from the supplementary table -> PATO term ID.
mapping = dict(female='PATO:0000383', male='PATO:0000384')

In [117]:
# Translate each cell's sex label into its PATO term via `mapping`.
sex_terms = adata.obs['sex'].map(mapping)
adata.obs['sex_ontology_term_id'] = sex_terms

In [118]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,age,development_stage_ontology_term_id,disease,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384


#### suspension_type

In [119]:
# Assay term ID -> suspension type; the first two assays map to 'cell', the last to 'nucleus'.
mapping = {
    **dict.fromkeys(('EFO:0030004', 'EFO:0009922'), 'cell'),
    'EFO:0030059': 'nucleus',
}

In [120]:
# Derive the suspension type of each cell from its assay term via `mapping`.
suspension_by_cell = adata.obs['assay_ontology_term_id'].map(mapping)
adata.obs['suspension_type'] = suspension_by_cell

In [121]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,age,development_stage_ontology_term_id,disease,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id,suspension_type
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,71-80,HsapDv:0000242,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,51-60,HsapDv:0000240,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell


#### tissue_ontology_term_id

In [122]:
list(adata.obs['summaryDescription'].unique())

['Blood',
 'Metastasis',
 'Normal adrenal',
 'Tumour',
 'Tumour-normal',
 'Fat',
 'Normal kidney',
 'Thrombus']

In [123]:
# Sample description (adata.obs['summaryDescription']) -> UBERON term ID.
# Several descriptions deliberately share one term.
# NOTE(review): term IDs are copied as-is; confirm each against UBERON.
mapping = dict([
    ('Blood', 'UBERON:0000178'),
    ('Metastasis', 'UBERON:0018303'),
    ('Normal adrenal', 'UBERON:0018303'),
    ('Tumour', 'UBERON:0002113'),
    ('Tumour-normal', 'UBERON:0002113'),
    ('Fat', 'UBERON:0005406'),
    ('Normal kidney', 'UBERON:0002113'),
    ('Thrombus', 'UBERON:0010210'),
])

In [124]:
# Translate each cell's sample description into its UBERON term via `mapping`.
tissue_terms = adata.obs['summaryDescription'].map(mapping)
adata.obs['tissue_ontology_term_id'] = tissue_terms

In [125]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,age,...,disease,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,51-60,...,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell,UBERON:0002113
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,51-60,...,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell,UBERON:0002113
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,51-60,...,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell,UBERON:0002113
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,51-60,...,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell,UBERON:0002113


##### Data types

In [126]:
adata.obs.dtypes

orig.ident                                  category
patient                                     category
summaryDescription                          category
annotation                                  category
region                                      category
broad_type                                  category
n_counts                                     float32
assay_ontology_term_id                        object
cell_type_ontology_term_id                  category
age                                           object
development_stage_ontology_term_id            object
disease                                       object
disease_ontology_term_id                      object
donor_id                                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex                                           object
sex_ontology_term_id                          

In [127]:
# Snapshot the obs dtypes once; column selection below uses this snapshot,
# not the dtypes as they change during the casts.
dty = pd.DataFrame(adata.obs.dtypes, columns=['dtype'])
# (current dtype, target dtype) pairs, applied in this order.
obs_casts = [('float64', 'float32'), ('int64', 'int32'), ('object', 'category')]
for old_dtype, new_dtype in obs_casts:
    for c in dty.loc[dty['dtype'] == old_dtype].index:
        adata.obs[c] = adata.obs[c].astype(new_dtype)
        print(f"changed {c} from {old_dtype} to {new_dtype}")

changed assay_ontology_term_id from object to category
changed age from object to category
changed development_stage_ontology_term_id from object to category
changed disease from object to category
changed disease_ontology_term_id from object to category
changed sex from object to category
changed sex_ontology_term_id from object to category
changed suspension_type from object to category
changed tissue_ontology_term_id from object to category


In [128]:
adata.var.dtypes

name                   object
feature_is_filtered      bool
dtype: object

In [129]:
# Snapshot the var dtypes once; column selection below uses this snapshot.
dty = pd.DataFrame(adata.var.dtypes, columns=['dtype'])
# Only 64-bit numeric columns are downcast here; object columns are left alone.
var_casts = [('float64', 'float32'), ('int64', 'int32')]
for old_dtype, new_dtype in var_casts:
    for c in dty.loc[dty['dtype'] == old_dtype].index:
        adata.var[c] = adata.var[c].astype(new_dtype)
        print(f"changed {c} from {old_dtype} to {new_dtype}")

### obsm (Embeddings)

In [130]:
adata.obsm

AxisArrays with keys: X_pca, X_umap

#### uns (Dataset Metadata)

In [131]:
adata.uns

OverloadedDict, wrapping:
	{'annotation_colors': array(['#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4', '#1f77b4',
       '#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8', '#aec7e8', '#ff7f0e',
       '#ff7f0e', '#ff7f0e', '#ff7f0e', '#ff7f0e', '#ffbb78', '#ffbb78',
       '#ffbb78', '#ffbb78', '#ffbb78', '#ffbb78', '#2ca02c', '#2ca02c',
       '#2ca02c', '#2ca02c', '#2ca02c', '#98df8a', '#98df8a', '#98df8a',
       '#98df8a', '#98df8a', '#d62728', '#d62728', '#d62728', '#d62728',
       '#d62728', '#d62728', '#ff9896', '#ff9896', '#ff9896', '#ff9896',
       '#ff9896', '#9467bd', '#9467bd', '#9467bd', '#9467bd', '#9467bd',
       '#c5b0d5', '#c5b0d5', '#c5b0d5', '#c5b0d5', '#c5b0d5', '#8c564b',
       '#8c564b', '#8c564b', '#8c564b', '#8c564b', '#8c564b', '#c49c94',
       '#c49c94', '#c49c94', '#c49c94', '#c49c94', '#e377c2', '#e377c2',
       '#e377c2', '#e377c2', '#e377c2', '#f7b6d2', '#f7b6d2', '#f7b6d2',
       '#f7b6d2', '#f7b6d2', '#f7b6d2', '#7f7f7f', '#7f7f7f', '#7f7f7f',
  

In [132]:
# Record the schema version this object is curated against.
schema_version = '3.0.0'
adata.uns['schema_version'] = schema_version

In [133]:
# Dataset title stored in the object's metadata.
dataset_title = 'Single-cell transcriptomic datasets of Renal cell carcinoma patients'
adata.uns['title'] = dataset_title

In [134]:
# Name of the default embedding; 'X_umap' is one of the keys present in adata.obsm.
default_embedding = 'X_umap'
adata.uns['default_embedding'] = default_embedding

### Final checks and adjustments

In [135]:
adata

AnnData object with n_obs × n_vars = 270855 × 19507
    obs: 'orig.ident', 'patient', 'summaryDescription', 'annotation', 'region', 'broad_type', 'n_counts', 'assay_ontology_term_id', 'cell_type_ontology_term_id', 'age', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue_ontology_term_id'
    var: 'name', 'feature_is_filtered'
    uns: 'annotation_colors', 'broad_type_colors', 'region_colors', 'summaryDescription_colors', 'schema_version', 'title', 'default_embedding'
    obsm: 'X_pca', 'X_umap'

In [136]:
adata.obs.dtypes

orig.ident                                  category
patient                                     category
summaryDescription                          category
annotation                                  category
region                                      category
broad_type                                  category
n_counts                                     float32
assay_ontology_term_id                      category
cell_type_ontology_term_id                  category
age                                         category
development_stage_ontology_term_id          category
disease                                     category
disease_ontology_term_id                    category
donor_id                                    category
is_primary_data                                 bool
organism_ontology_term_id                   category
self_reported_ethnicity_ontology_term_id    category
sex                                         category
sex_ontology_term_id                        ca

In [137]:
adata.obs

Unnamed: 0,orig.ident,patient,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,age,...,disease,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,PD43948,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,PD43948,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,PD43948,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,PD43948,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,71-80,...,ccRCC,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,female,PATO:0000383,cell,UBERON:0000178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,51-60,...,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell,UBERON:0002113
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,PD47512,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,51-60,...,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell,UBERON:0002113
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,51-60,...,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell,UBERON:0002113
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,PD47512,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,51-60,...,ccRCC,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,male,PATO:0000384,cell,UBERON:0002113


In [138]:
adata.obs.columns

Index(['orig.ident', 'patient', 'summaryDescription', 'annotation', 'region',
       'broad_type', 'n_counts', 'assay_ontology_term_id',
       'cell_type_ontology_term_id', 'age',
       'development_stage_ontology_term_id', 'disease',
       'disease_ontology_term_id', 'donor_id', 'is_primary_data',
       'organism_ontology_term_id', 'self_reported_ethnicity_ontology_term_id',
       'sex', 'sex_ontology_term_id', 'suspension_type',
       'tissue_ontology_term_id'],
      dtype='object')

In [139]:
# Remove the helper columns: 'age', 'disease' and 'sex' were only used to derive
# their *_ontology_term_id counterparts, and 'patient' was copied to 'donor_id'.
for helper_col in ('age', 'disease', 'sex', 'patient'):
    del adata.obs[helper_col]

In [140]:
adata.obs

Unnamed: 0,orig.ident,summaryDescription,annotation,region,broad_type,n_counts,assay_ontology_term_id,cell_type_ontology_term_id,development_stage_ontology_term_id,disease_ontology_term_id,donor_id,is_primary_data,organism_ontology_term_id,self_reported_ethnicity_ontology_term_id,sex_ontology_term_id,suspension_type,tissue_ontology_term_id
5739STDY7958790_AACTCAGTCCTCAACC-1,5739STDY7958790,Blood,Plasma-IgG,b,Plasma,4916.0,EFO:0030004,CL:0000985,HsapDv:0000242,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_AAGTCTGTCGGATGGA-1,5739STDY7958790,Blood,Switched-memory_B,b,B-cell,2708.0,EFO:0030004,CL:0000972,HsapDv:0000242,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_ACACCCTGTGTGAAAT-1,5739STDY7958790,Blood,Plasma-IgA,b,Plasma,4575.0,EFO:0030004,CL:0000987,HsapDv:0000242,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_ACATGGTTCAGTTTGG-1,5739STDY7958790,Blood,Naïve_B.2,b,B-cell,2931.0,EFO:0030004,CL:0000788,HsapDv:0000242,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0000178
5739STDY7958790_ACCGTAAGTCTTCTCG-1,5739STDY7958790,Blood,Plasma-IgA,b,Plasma,5965.0,EFO:0030004,CL:0000987,HsapDv:0000242,MONDO:0007763,PD43948,True,NCBITaxon:9606,unknown,PATO:0000383,cell,UBERON:0000178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5739STDY9266997_TGCTACCAGTGGAGAA-1,5739STDY9266997,Tumour-normal,PT,g,RCC,4556.0,EFO:0030004,CL:0002306,HsapDv:0000240,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,PATO:0000384,cell,UBERON:0002113
5739STDY9266997_TTCTCCTTCCTTAATC-1,5739STDY9266997,Tumour-normal,CD,g,RCC,4771.0,EFO:0030004,CL:0000003,HsapDv:0000240,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,PATO:0000384,cell,UBERON:0002113
5739STDY9266997_TTGAACGGTGCGATAG-1,5739STDY9266997,Tumour-normal,PT,g,Epi_non-PT,5502.0,EFO:0030004,CL:0002306,HsapDv:0000240,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,PATO:0000384,cell,UBERON:0002113
5739STDY9266997_TTGACTTCACCCAGTG-1,5739STDY9266997,Tumour-normal,PT,g,RCC,5027.0,EFO:0030004,CL:0002306,HsapDv:0000240,MONDO:0007763,PD47512,True,NCBITaxon:9606,unknown,PATO:0000384,cell,UBERON:0002113


In [141]:
adata.var

Unnamed: 0_level_0,name,feature_is_filtered
ensembl_id,Unnamed: 1_level_1,Unnamed: 2_level_1
ENSG00000279928,FO538757.3,False
ENSG00000279457,FO538757.2,False
ENSG00000228463,AP006222.2,False
ENSG00000230021,RP5-857K21.4,False
ENSG00000237491,RP11-206L10.9,False
...,...,...
ENSG00000228817,BACH1-IT2,False
ENSG00000227757,AP000282.2,False
ENSG00000231106,LINC01436,False
ENSG00000160202,CRYAA,False


In [142]:
#check the format of expression matrix

In [143]:
adata.X

<270855x19507 sparse matrix of type '<class 'numpy.float32'>'
	with 332590964 stored elements in Compressed Sparse Column format>

In [144]:
# Convert expression matrix from CSC to CSR sparse format

In [145]:
# Re-encode the expression matrix as Compressed Sparse Row.
x_csr = csr_matrix(adata.X)
adata.X = x_csr

In [146]:
adata.X

<270855x19507 sparse matrix of type '<class 'numpy.float32'>'
	with 332590964 stored elements in Compressed Sparse Row format>

In [147]:
araw.X

<270855x19507 sparse matrix of type '<class 'numpy.float32'>'
	with 332590964 stored elements in Compressed Sparse Row format>

In [148]:
print(adata.X[720,0])

1.6043375


In [149]:
print(adata.raw.X[720,0])

1.0


In [81]:
list(adata.obs['assay_ontology_term_id'].unique())

['EFO:0030004']

In [150]:
# Write the curated object to disk as gzip-compressed h5ad.
final_h5ad_path = '/lustre/scratch127/cellgen/cellgeni/cxgportal_sets/kidney-cancer/final_objects/RCC2.h5ad'
adata.write_h5ad(final_h5ad_path, compression='gzip')