# 2023-04-17-Curation: Norman19 Combo Screen

Add cross-validation splits to the Norman19 dataset.

In [1]:
import scanpy as sc
import os
import subprocess as sp
from perturbench.analysis.preprocess import preprocess
from scipy.sparse import csr_matrix

%reload_ext autoreload
%autoreload 2

Download from: https://zenodo.org/records/7041849/files/NormanWeissman2019_filtered.h5ad?download=1

In [2]:
# Source file on Zenodo and the local cache location for the raw download.
data_url = 'https://zenodo.org/records/7041849/files/NormanWeissman2019_filtered.h5ad?download=1'
data_cache_dir = '../perturbench_data' ## Change this to your local data directory

# exist_ok=True makes the cell safely re-runnable without a check-then-create race.
os.makedirs(data_cache_dir, exist_ok=True)

tmp_data_dir = f'{data_cache_dir}/norman19_downloaded.h5ad'

if not os.path.exists(tmp_data_dir):
    # Download to a sibling ".part" file first: `wget -O` creates the output file
    # even when the transfer fails, and a truncated file at the final path would be
    # mistaken for a valid cache entry by the existence check above on later runs.
    partial_download_path = f'{tmp_data_dir}.part'
    try:
        # Argument list (no shell) so the '?' in the URL is never shell-interpreted;
        # check=True raises CalledProcessError instead of silently ignoring failures.
        sp.run(['wget', data_url, '-O', partial_download_path], check=True)
        os.replace(partial_download_path, tmp_data_dir)
    finally:
        # After a successful rename the partial file no longer exists; after a
        # failure or interrupt this removes the incomplete download.
        if os.path.exists(partial_download_path):
            os.remove(partial_download_path)

In [3]:
# Read the cached download from disk (path is defined in the download cell).
adata = sc.read_h5ad(tmp_data_dir)
# Bare last expression so the notebook displays the loaded object's summary.
adata

AnnData object with n_obs × n_vars = 111445 × 33694
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_line', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo'
    var: 'ensemble_id', 'ncounts', 'ncells'

In [4]:
# Standardise the covariate column name expected by the preprocessing call below.
# The downloaded file already uses 'ncounts', 'ngenes' and 'percent_mito' in obs,
# so 'cell_line' -> 'cell_type' is the only rename that is actually needed.
# Assigning the result (rather than inplace=True) keeps the cell idempotent:
# re-running it when 'cell_line' is already gone is a harmless no-op.
adata.obs = adata.obs.rename(columns={'cell_line': 'cell_type'})

# Fail here with a clear message rather than later inside preprocessing.
assert 'cell_type' in adata.obs.columns, "expected a 'cell_type' column in adata.obs"

In [5]:
# List the distinct perturbation labels present in the raw annotation.
adata.obs['perturbation'].unique()

['ARID1A', 'BCORL1', 'FOSB', 'SET_KLF1', 'OSR2', ..., 'CEBPB_OSR2', 'PRDM1_CBFA2T3', 'FOSB_CEBPB', 'ZBTB10_DLX2', 'FEV_CBFA2T3']
Length: 237
Categories (237, object): ['AHR', 'AHR_FEV', 'AHR_KLF1', 'ARID1A', ..., 'ZC3HAV1_HOXC13', 'ZNF318', 'ZNF318_FOXL2', 'control']

In [6]:
# Switch the separator in the perturbation labels from '_' to '+', then store the
# relabelled column as a categorical again.
adata.obs['perturbation'] = (
    adata.obs['perturbation']
    .str.replace('_', '+')
    .astype('category')
)
# Number of cells per perturbation label.
adata.obs['perturbation'].value_counts()

perturbation
control          11855
KLF1              1960
BAK1              1457
CEBPE             1233
CEBPE+RUNX1T1     1219
                 ...  
CEBPB+CEBPA         64
CBL+UBASH3A         64
C3orf72+FOXL2       59
JUN+CEBPB           59
JUN+CEBPA           54
Name: count, Length: 237, dtype: int64

In [7]:
# Duplicate the perturbation labels into 'condition', the key passed to preprocess below.
adata.obs['condition'] = adata.obs['perturbation'].copy()

In [8]:
# Store the expression matrix as a SciPy CSR sparse matrix.
# NOTE(review): presumably preprocess expects sparse input — confirm.
adata.X = csr_matrix(adata.X)

In [9]:
# Run the PerturBench preprocessing step with 'condition' as the perturbation key
# and 'cell_type' as the single covariate key.
adata = preprocess(adata, perturbation_key='condition', covariate_keys=['cell_type'])

Preprocessing ...
Filtering for highly variable genes or differentially expressed genes ...
Processed dataset summary:
View of AnnData object with n_obs × n_vars = 111445 × 5666
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_type', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo', 'condition', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'ensemble_id', 'ncounts', 'ncells', 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'log1p', 'h

In [12]:
# Destination for the processed dataset, next to the raw download in the cache directory.
output_data_path = f'{data_cache_dir}/norman19_processed.h5ad'
# The preprocessing summary reports a "View of AnnData"; copy it into a
# standalone object before writing to disk.
adata = adata.copy()
adata.write_h5ad(output_data_path)