# 2023-07-26-Curation: Srivatsan20 Chemical Perturbation Screen

In [2]:
import scanpy as sc
import numpy as np
import subprocess as sp
import os
from perturbench.analysis.utils import get_ensembl_mappings
from perturbench.analysis.preprocess import preprocess

%reload_ext autoreload
%autoreload 2

Download from: https://zenodo.org/records/7041849/files/SrivatsanTrapnell2020_sciplex3.h5ad?download=1

In [3]:
data_url = 'https://zenodo.org/records/7041849/files/SrivatsanTrapnell2020_sciplex3.h5ad?download=1'
data_cache_dir = '../perturbench_data' ## Change this to your local data directory

# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(data_cache_dir, exist_ok=True)

tmp_data_dir = f'{data_cache_dir}/srivatsan20_downloaded.h5ad'

if not os.path.exists(tmp_data_dir):
    # Download to a side file and only move it into place on success, so an
    # interrupted or failed transfer can never be mistaken for a complete
    # download by the existence check above on the next run.
    partial_download = f'{tmp_data_dir}.part'
    try:
        # Argument list (no shell): the '?' in the URL and any spaces in the
        # cache path are passed to wget verbatim. check=True raises
        # CalledProcessError when wget exits non-zero instead of continuing.
        sp.run(['wget', data_url, '-O', partial_download], check=True)
        os.replace(partial_download, tmp_data_dir)
    finally:
        # After a successful os.replace the side file no longer exists, so
        # this only cleans up after a failure.
        if os.path.exists(partial_download):
            os.remove(partial_download)

--2024-06-12 20:34:01--  https://zenodo.org/records/7041849/files/SrivatsanTrapnell2020_sciplex3.h5ad?download=1
Resolving zenodo.org (zenodo.org)... 188.185.79.172, 188.184.98.238, 188.184.103.159, ...
Connecting to zenodo.org (zenodo.org)|188.185.79.172|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2456030368 (2.3G) [application/octet-stream]
Saving to: ‘./perturbench_data/srivatsan20_downloaded.h5ad’

     0K .......... .......... .......... .......... ..........  0%  159K 4h11m
    50K .......... .......... .......... .......... ..........  0%  318K 3h8m
   100K .......... .......... .......... .......... ..........  0%  265K 2h56m
   150K .......... .......... .......... .......... ..........  0%  131M 2h12m
   200K .......... .......... .......... .......... ..........  0%  167M 1h45m
   250K .......... .......... .......... .......... ..........  0%  319K 1h49m
   300K .......... .......... .......... .......... ..........  0%  109M 93m30s
   350K ...

In [4]:
# Load the downloaded h5ad file into memory; the trailing bare name displays
# the AnnData summary (dimensions plus obs/var column names).
adata = sc.read_h5ad(tmp_data_dir)
adata

AnnData object with n_obs × n_vars = 799317 × 110984
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [5]:
# Number of cells per cell line in the raw data.
adata.obs.cell_line.value_counts()

cell_line
MCF7    344862
A549    244281
K562    173652
Name: count, dtype: int64

In [6]:
# Peek at the gene annotation table to see how genes are identified.
adata.var.head()

Unnamed: 0_level_0,ensembl_id,ncounts,ncells
gene_symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,id gene_short_name,26582.0,23228
nan:1,ENSG00000000003,35.0,33
nan:2,ENSG00000000005,163109.0,116153
nan:3,ENSG00000000419,49655.0,41883
nan:4,ENSG00000000457,57943.0,49609


In [7]:
# Boolean mask that is True for the first occurrence of each Ensembl ID and
# False for every later repeat; the sum is the number of genes that survive.
unique_genes = np.logical_not(adata.var['ensembl_id'].duplicated())
unique_genes.sum()

110939

In [8]:
# Keep only the gene columns flagged in the `unique_genes` mask (first
# occurrence of each Ensembl ID).
adata = adata[:,unique_genes]
# Use the Ensembl ID (cast to str) as the gene index from here on.
adata.var_names = adata.var.ensembl_id.astype(str)

In [9]:
# Collect the gene IDs that contain the 'ENSG' tag, preserving their order,
# and report how many there are.
contains_ensg = adata.var_names.str.contains('ENSG', regex=False)
human_ids = adata.var_names[contains_ensg].tolist()
len(human_ids)

58302

In [10]:
# Restrict the matrix to the gene IDs collected in `human_ids`, then show the
# resulting (cells, genes) shape.
adata = adata[:,human_ids]
adata.shape

(799317, 58302)

In [12]:
# Lookup table from perturbench; later cells index it by Ensembl ID to obtain
# a gene symbol and drop entries whose value is not a non-empty string.
gene_mappings = get_ensembl_mappings()

In [13]:
# Number of entries in the raw mapping.
len(gene_mappings)

70611

In [14]:
# Discard entries whose mapped value is missing: anything that is not a
# string, or is the empty string. The trailing expression shows how many
# usable entries remain.
gene_mappings = {
    ensembl_id: symbol
    for ensembl_id, symbol in gene_mappings.items()
    if isinstance(symbol, str) and symbol
}
len(gene_mappings)

46522

In [15]:
# How many of the current gene IDs have an entry in the filtered mapping.
has_symbol = [gene_id in gene_mappings for gene_id in adata.var_names]
np.sum(has_symbol)

40729

In [16]:
# Keep only the genes whose ID is present in the mapping, then display the
# resulting object.
mapped_gene_mask = [gene_id in gene_mappings for gene_id in adata.var_names]
adata = adata[:, mapped_gene_mask]
adata

View of AnnData object with n_obs × n_vars = 799317 × 40729
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID'
    var: 'ensembl_id', 'ncounts', 'ncells'

In [17]:
# The preceding subsetting step leaves `adata` as a view of the larger object.
# Writing to `.var` of a view makes AnnData copy it implicitly and emit a
# warning; materialize the copy explicitly so the mutation below is
# intentional and warning-free.
if adata.is_view:
    adata = adata.copy()

# Annotate each gene with its symbol and switch the gene index from Ensembl
# IDs to symbols. Every ID is guaranteed to be in `gene_mappings` because the
# genes were filtered on membership beforehand.
# NOTE(review): several Ensembl IDs may share one symbol, which would make
# var_names non-unique — confirm whether downstream steps tolerate that.
adata.var['gene_symbol'] = [gene_mappings[ensembl_id] for ensembl_id in adata.var_names]
adata.var_names = adata.var['gene_symbol']

  adata.var['gene_symbol'] = [gene_mappings[x] for x in adata.var_names]


In [18]:
# Spot-check the first five gene names now that the index holds gene symbols.
adata.var_names[0:5]

Index(['TSPAN6', 'TNMD', 'DPM1', 'SCYL3', 'FIRRM'], dtype='object', name='gene_symbol')

In [19]:
# Number of cells per perturbation label.
adata.obs.perturbation.value_counts()

perturbation
control                              17578
Ellagic acid                          6257
Divalproex Sodium                     6203
Ruxolitinib (INCB018424)              6143
MC1568                                6126
                                     ...  
Alvespimycin (17-DMAG) HCl            2089
Patupilone (EPO906, Epothilone B)     1822
Flavopiridol HCl                      1729
Epothilone A                          1426
YM155 (Sepantronium Bromide)          1007
Name: count, Length: 189, dtype: int64

In [20]:
# Normalize alternative spellings of the QC columns, if present, to the names
# used in this notebook.
qc_column_aliases = {
    'n_genes': 'ngenes',
    'n_counts': 'ncounts',
}
adata.obs = adata.obs.rename(columns=qc_column_aliases)

# Standard metadata columns, written in this order: constants describing the
# dataset plus copies of the source columns under their standardized names.
standard_columns = {
    'perturbation_type': 'drug',
    'dataset': 'srivatsan20',
    'cell_type': adata.obs['cell_line'].copy(),
    'treatment': 'none',
    'condition': adata.obs['perturbation'].copy(),
}
for column_name, column_values in standard_columns.items():
    adata.obs[column_name] = column_values

adata

AnnData object with n_obs × n_vars = 799317 × 40729
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID', 'dataset', 'cell_type', 'treatment', 'condition'
    var: 'ensembl_id', 'ncounts', 'ncells', 'gene_symbol'

In [21]:
# Number of cells per value of the newly added cell_type column.
adata.obs.cell_type.value_counts()

cell_type
MCF7    344862
A549    244281
K562    173652
Name: count, dtype: int64

In [None]:
# Keep only cells annotated with one of the three cell lines (this also drops
# cells whose cell line is missing) and store the lower-cased name as
# cell_type.
#
# Both steps read from `cell_line`, which this notebook never modifies, rather
# than from `cell_type` (a copy of it). Filtering on the column that is then
# lower-cased would make a second run of this cell match nothing and silently
# empty the dataset.
cell_lines_to_keep = ['MCF7', 'A549', 'K562']

# .copy() materializes the subset so the column assignment below does not
# mutate a view (which would trigger an implicit copy and a warning).
adata = adata[adata.obs['cell_line'].isin(cell_lines_to_keep)].copy()
adata.obs['cell_type'] = [name.lower() for name in adata.obs['cell_line']]
adata.obs.cell_type.unique()

Ensure doses are in micromolar (uM)

In [23]:
# Scale the raw dose values down by a factor of 1000 and show the resulting
# distribution.
# NOTE(review): this presumes dose_value is recorded in nM so the result is
# in uM — confirm against the source dataset.
dose_divisor = 1000
adata.obs['dose'] = adata.obs['dose_value'] / dose_divisor
adata.obs.dose.value_counts()

dose
0.01     202725
0.10     192858
1.00     183356
10.00    166278
0.00      17578
Name: count, dtype: int64

In [24]:
# Overwrite the unit label for every cell so it matches the rescaled `dose`
# column.
adata.obs['dose_unit'] = 'uM'

Ensure no perturbation name has a "+" in it since we use "+" as the perturbation delimiter

In [25]:
# List every condition name that contains the reserved "+" delimiter, one per
# line.
names_with_plus = (name for name in adata.obs.condition.unique() if "+" in name)
for name in names_with_plus:
    print(name)

(+)-JQ1
ENMD-2076 L-(+)-Tartaric acid 


In [26]:
# Replacement names for the perturbations whose label contains "+", which is
# reserved as the perturbation delimiter.
perturbation_remap = {
    '(+)-JQ1': 'JQ1',
    'ENMD-2076 L-(+)-Tartaric acid': 'ENMD-2076',
}

# Look each name up with surrounding whitespace stripped so that stray
# leading/trailing spaces in a raw label cannot make a remap entry silently
# miss. Names without a remap entry are kept exactly as they were.
adata.obs['perturbation'] = [
    perturbation_remap.get(name.strip(), name)
    for name in adata.obs.perturbation.astype(str)
]
adata.obs['condition'] = adata.obs['perturbation'].copy()

# Fail loudly if any "+" survived instead of letting it be parsed as a
# combination of perturbations downstream.
names_still_with_plus = sorted(
    name for name in adata.obs['condition'].unique() if '+' in name
)
assert not names_still_with_plus, (
    f'Perturbation names still contain "+": {names_still_with_plus}'
)

Subset to the highest dose (10 uM), keeping the control cells as well

In [27]:
# Re-check the number of cells at each dose before subsetting.
adata.obs['dose'].value_counts()

dose
0.01     202725
0.10     192858
1.00     183356
10.00    166278
0.00      17578
Name: count, dtype: int64

In [28]:
# Current (cells, genes) dimensions, for comparison after subsetting.
adata.shape

(762795, 40729)

In [29]:
# Number of cells that will be kept: dose equal to 10.0, or condition
# labelled 'control'.
np.sum((adata.obs.dose == 10.0) | (adata.obs.condition == 'control'))

183856

In [None]:
# Retain control cells together with cells treated at dose 10.0; copy so the
# result is a standalone object rather than a view of the full dataset.
is_control = adata.obs.condition == 'control'
is_top_dose = adata.obs.dose == 10.0
adata = adata[is_top_dose | is_control].copy()
adata.shape

In [31]:
# Force a garbage-collection pass; gc.collect() returns the number of
# unreachable objects it found, which is the value this cell displays.
# Presumably done to release memory held by the pre-subset data — confirm.
import gc
gc.collect()

2315

Run preprocessing

In [32]:
# Run the perturbench preprocessing step. Its log reports filtering for highly
# variable or differentially expressed genes and returns a reduced object.
# perturbation_key / covariate_keys name obs columns; presumably they identify
# the perturbation label and the grouping covariates — confirm against the
# perturbench `preprocess` documentation.
adata = preprocess(
    adata,
    perturbation_key='condition',
    covariate_keys=['cell_type'],
)

Preprocessing ...
Filtering for highly variable genes or differentially expressed genes ...
Processed dataset summary:
View of AnnData object with n_obs × n_vars = 183856 × 9198
    obs: 'ncounts', 'well', 'plate', 'cell_line', 'replicate', 'time', 'dose_value', 'pathway_level_1', 'pathway_level_2', 'perturbation', 'target', 'pathway', 'dose_unit', 'celltype', 'disease', 'cancer', 'tissue_type', 'organism', 'perturbation_type', 'ngenes', 'percent_mito', 'percent_ribo', 'nperts', 'chembl-ID', 'dataset', 'cell_type', 'treatment', 'condition', 'dose', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'ensembl_id', 'ncounts', 'ncells', 'gene_symbol', 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'highly_variab

In [33]:
# preprocess() hands back a view (see its printed summary); copy it into a
# standalone object so the parent data it references can be released.
# The stray bare `adata` expression that used to follow was never displayed
# (only a cell's last expression is rendered), so it has been removed.
adata = adata.copy()

# Reclaim memory now that the view's parent is unreferenced; the return value
# (number of unreachable objects found) is what the cell displays.
gc.collect()

2751

In [37]:
# Inspect the gene annotation table after preprocessing.
adata.var.head()

Unnamed: 0,ensembl_id,ncounts,ncells,gene_symbol,n_cells,n_cells_by_counts,mean_counts,log1p_mean_counts,pct_dropout_by_counts,total_counts,log1p_total_counts,highly_variable,highly_variable_rank,means,variances,variances_norm,highly_variable_nbatches
RNU1-5P,ENSG00000277234,140.0,131,RNU1-5P,37,37,0.000218,0.000218,99.979876,40.0,3.713572,True,662.0,0.000218,0.00025,1.130177,1
ADAM6,ENSG00000271968,183.0,179,ADAM6,50,50,0.000277,0.000277,99.972805,51.0,3.951244,True,818.0,0.000277,0.000288,1.074129,1
DMXL1,ENSG00000172869,57438.0,51155,DMXL1,11491,11491,0.070082,0.067735,93.75,12885.0,9.463897,False,,0.070082,0.082456,0.928153,0
TRPM6,ENSG00000119121,69767.0,52577,TRPM6,12079,12079,0.088395,0.084704,93.430184,16252.0,9.696033,True,2116.5,0.088395,0.156098,1.099608,2
CSGALNACT2,ENSG00000169826,1495.0,1321,CSGALNACT2,322,322,0.001947,0.001945,99.824863,358.0,5.883322,True,1132.0,0.001947,0.0024,1.089488,1


In [36]:
# Clear the gene index name. It was 'gene_symbol', the same as an existing
# var column; presumably cleared to avoid that name clash when the object is
# written to disk — TODO confirm.
adata.var.index.name = None

In [38]:
# Save the curated, preprocessed dataset into the same cache directory as the
# raw download.
output_data_path = f'{data_cache_dir}/srivatsan20_processed.h5ad'
adata.write_h5ad(output_data_path)