# 2023-07-26-Curation: McFaline-Figuroa23

PerturbSeq screen of interactions between chemical and genetic perturbations

In [1]:
import scanpy as sc
import numpy as np
import anndata as ad
import gc
from scipy.sparse import csr_matrix

from perturbench.analysis.utils import get_ensembl_mappings
from perturbench.analysis.preprocess import preprocess

%load_ext autoreload
%autoreload 2

Get gene names from ENSEMBL IDs

In [3]:
id_to_gene = get_ensembl_mappings()
id_to_gene = {k:v for k,v in id_to_gene.items() if isinstance(v, str) and v != ''}
len(id_to_gene.keys())

## Load data

In [None]:
data_cache_dir = '../perturbench_data'

In [4]:
data_paths = [
    f'{data_cache_dir}/gxe1.h5ad',
    f'{data_cache_dir}/gxe2_A172.h5ad',
    f'{data_cache_dir}/gxe2_T98G.h5ad',
    f'{data_cache_dir}/gxe2_U87MG.h5ad',
]

In [None]:
adata_list = []
for path in data_paths:
    adata_list.append(sc.read_h5ad(path))
adata = ad.concat(adata_list)
adata

In [None]:
adata.X = csr_matrix(adata.X)

In [6]:
adata.obs_names_make_unique()

In [7]:
del adata_list
gc.collect()

608

In [8]:
adata.obs.dose.value_counts()

dose
1.0      405444
10.0     389067
0.0      202315
0.1        3360
0.5        3048
5.0        2177
50.0       1301
100.0      1172
Name: count, dtype: int64

In [9]:
adata.obs['drug_dose'] = adata.obs.dose.copy()

In [10]:
adata.obs.cell_type = [x.lower() for x in adata.obs.cell_type]
adata.obs.cell_type = adata.obs.cell_type.astype('category')
adata.obs.cell_type.value_counts()

cell_type
a172     353673
u87mg    328756
t98g     325455
Name: count, dtype: int64

In [11]:
adata.obs.treatment.value_counts()

treatment
lapatinib       297934
nintedanib      207303
vehicle         199200
zstk474         168355
trametinib      116507
thioguanine       7766
temozolomide      7704
dmso              3115
Name: count, dtype: int64

In [12]:
adata.obs.gene_id.value_counts()

gene_id
NA                   190904
NTC                   12556
HPRT1                  7588
random                 6672
SGK3                   2854
                      ...  
GRK5,PKN2                 1
PRKCG,SCYL2               1
ERBB3,STK39               1
ERBB3,MARK4,PLK2          1
HUNK,MAP3K11,MELK         1
Name: count, Length: 83474, dtype: int64

## Rename metadata columns

In [13]:
adata.obs.rename(columns = {
    'nCount_RNA': 'ncounts',
    'nFeature_RNA': 'ngenes',
}, inplace=True)
adata.obs['perturbation_type'] = 'CRISPRi'
adata.obs['dataset'] = 'mcfaline23'

## Rename perturbations

In [14]:
adata.obs['gene_id'] = [x.replace(',', '+') for x in adata.obs.gene_id]

gene_controls = ['NA', 'NTC', 'random']
for ctrl in gene_controls:
    adata.obs['gene_id'] = [x.replace(ctrl, 'control') for x in adata.obs['gene_id']]
adata.obs.gene_id.value_counts()

gene_id
control             210132
HPRT1                 7588
SGK3                  2854
MARK3                 2732
PRKD2                 2686
                     ...  
CDC42BPG+ERBB3           1
SGK1+TIE1                1
PLK2+PRKCB               1
ACVR1+BRAF+EPHA7         1
BRSK1+EGFR               1
Name: count, Length: 83067, dtype: int64

In [15]:
single_gene_perts = [x for x in adata.obs.gene_id.unique() if '+' not in x]
adata = adata[adata.obs.gene_id.isin(single_gene_perts),:]
adata

View of AnnData object with n_obs × n_vars = 894285 × 58347
    obs: 'orig.ident', 'ncounts', 'ngenes', 'cell', 'sample', 'Size_Factor', 'n.umi', 'PCR_plate', 'new_cell', 'dose', 'treatment', 'gRNA_id', 'gene_id', 'guide_number', 'cell_type', 'drug_dose', 'perturbation_type', 'dataset'

In [16]:
drug_controls = ['vehicle', 'dmso']
for ctrl in drug_controls:
    adata.obs['treatment'] = [x.replace(ctrl, 'none') for x in adata.obs['treatment']]
adata.obs.treatment.value_counts()

  adata.obs['treatment'] = [x.replace(ctrl, 'none') for x in adata.obs['treatment']]


treatment
lapatinib       265013
nintedanib      184054
none            179486
zstk474         148358
trametinib      102774
thioguanine       7378
temozolomide      7222
Name: count, dtype: int64

In [18]:
gene_dose = []
for gene in adata.obs.gene_id:
    ngenes = len(gene.split('+'))
    dose = '+'.join(['1']*ngenes)
    gene_dose.append(dose)
    
adata.obs['gene_dose'] = gene_dose
adata.obs.gene_dose.value_counts()

gene_dose
1    894285
Name: count, dtype: int64

In [19]:
adata.obs['perturbation'] = adata.obs.gene_id.astype('category').copy()
adata.obs.perturbation.value_counts()

perturbation
control    210132
HPRT1        7588
SGK3         2854
MARK3        2732
PRKD2        2686
            ...  
RIOK2         223
TTK           212
PLK1          186
BUB1B         179
AURKB          89
Name: count, Length: 530, dtype: int64

In [20]:
adata.obs['pert_cl_tr'] = adata.obs['perturbation'].astype(str) + '_' + adata.obs['cell_type'].astype(str) + '_' + adata.obs['treatment'].astype(str)
pert_cl_tr_counts = adata.obs.pert_cl_tr.value_counts()
pert_cl_tr_keep = list(pert_cl_tr_counts.loc[pert_cl_tr_counts >= 20].index)
print(len(pert_cl_tr_keep))

7745


In [21]:
adata.shape

(894285, 58347)

In [22]:
adata = adata[adata.obs.pert_cl_tr.isin(pert_cl_tr_keep)]
adata.shape

(892800, 58347)

In [26]:
adata.obs['condition'] = adata.obs['perturbation'].copy()
adata.obs.condition = adata.obs.condition.astype('category')
adata.obs.perturbation = adata.obs.perturbation.astype('category')

  adata.obs['condition'] = adata.obs['perturbation'].copy()


In [27]:
adata

AnnData object with n_obs × n_vars = 892800 × 58347
    obs: 'orig.ident', 'ncounts', 'ngenes', 'cell', 'sample', 'Size_Factor', 'n.umi', 'PCR_plate', 'new_cell', 'dose', 'treatment', 'gRNA_id', 'gene_id', 'guide_number', 'cell_type', 'drug_dose', 'perturbation_type', 'dataset', 'gene_dose', 'perturbation', 'pert_cl_tr', 'condition'

In [29]:
adata.var['gene_id'] = [x.split('.')[0] for x in adata.var_names]
adata = adata[:,[x in id_to_gene for x in adata.var['gene_id']]]
adata.shape

(892800, 57598)

In [30]:
adata.var['gene_name'] = [str(id_to_gene[x]) for x in adata.var['gene_id']]
adata = adata[:,[x != '' for x in adata.var['gene_name']]]
adata.shape

  adata.var['gene_name'] = [str(id_to_gene[x]) for x in adata.var['gene_id']]


(892800, 57598)

In [31]:
adata.var_names = adata.var.gene_name.astype(str).copy()

In [32]:
adata.var.head()

Unnamed: 0_level_0,gene_id,gene_name
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1
TSPAN6,ENSG00000000003,TSPAN6
TNMD,ENSG00000000005,TNMD
DPM1,ENSG00000000419,DPM1
SCYL3,ENSG00000000457,SCYL3
FIRRM,ENSG00000000460,FIRRM


In [33]:
adata = adata[:,['nan' not in x for x in adata.var_names]]
adata.shape

(892800, 42010)

In [34]:
duplicated_genes = adata.var.index.duplicated()
adata = adata[:,~duplicated_genes]
adata.shape

(892800, 40775)

In [35]:
required_cols = [
    'condition',
    'cell_type',
    'treatment',
    'perturbation_type',
    'dataset',
    'ngenes',
    'ncounts',
]

for col in required_cols:
    assert col in adata.obs.columns
    if np.any(adata.obs[col].isnull()):
        print(col)
    if np.any(adata.obs[col].isna()):
        print(col)

In [38]:
adata = adata.copy()

In [39]:
gc.collect()

679

In [5]:
condition_plus_treatment = []
for condition, treatment in zip(adata.obs.condition, adata.obs.treatment):
    if treatment == 'none':
        condition_plus_treatment.append(str(condition))
    else:
        condition_plus_treatment.append(str(condition) + '+' + str(treatment))

adata.obs['condition_plus_treatment'] = condition_plus_treatment
adata.obs['condition_plus_treatment'] = adata.obs['condition_plus_treatment'].astype('category')
adata.obs.condition_plus_treatment.value_counts()

condition_plus_treatment
control+lapatinib     61328
control+nintedanib    43874
control               41614
control+zstk474       35229
control+trametinib    25190
                      ...  
TTK                      25
TTK+zstk474              24
RIOK2+nintedanib         21
BUB1B+nintedanib         21
MGMT+thioguanine         20
Name: count, Length: 2617, dtype: int64

In [3]:
unique_obs = adata.obs.loc[:,['condition', 'cell_type', 'treatment']].drop_duplicates()
unique_obs.treatment.value_counts()

treatment
lapatinib       1563
none            1555
nintedanib      1551
zstk474         1544
trametinib      1518
temozolomide       8
thioguanine        6
Name: count, dtype: int64

In [5]:
treatments_remove = [
    'temozolomide',
    'thioguanine'
]

adata = adata[~adata.obs.treatment.isin(treatments_remove)].to_memory()
adata

AnnData object with n_obs × n_vars = 878229 × 40775
    obs: 'orig.ident', 'ncounts', 'ngenes', 'cell', 'sample', 'Size_Factor', 'n.umi', 'PCR_plate', 'new_cell', 'dose', 'treatment', 'gRNA_id', 'gene_id', 'guide_number', 'cell_type', 'drug_dose', 'perturbation_type', 'dataset', 'gene_dose', 'perturbation', 'pert_cl_tr', 'condition', 'condition_plus_treatment'
    var: 'ensembl_id'

In [None]:
adata = preprocess(
    adata,
    perturbation_key='condition',
    covariate_keys=['cell_type', 'treatment'],
)

Preprocessing ...
Filtering for highly variable genes or differentially expressed genes ...
Processed dataset summary:
View of AnnData object with n_obs × n_vars = 111445 × 5666
    obs: 'guide_id', 'read_count', 'UMI_count', 'coverage', 'gemgroup', 'good_coverage', 'number_of_cells', 'tissue_type', 'cell_type', 'cancer', 'disease', 'perturbation_type', 'celltype', 'organism', 'perturbation', 'nperts', 'ngenes', 'ncounts', 'percent_mito', 'percent_ribo', 'condition', 'cov_merged', 'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts', 'log1p_total_counts', 'pct_counts_in_top_50_genes', 'pct_counts_in_top_100_genes', 'pct_counts_in_top_200_genes', 'pct_counts_in_top_500_genes'
    var: 'ensemble_id', 'ncounts', 'ncells', 'n_cells', 'n_cells_by_counts', 'mean_counts', 'log1p_mean_counts', 'pct_dropout_by_counts', 'total_counts', 'log1p_total_counts', 'highly_variable', 'highly_variable_rank', 'means', 'variances', 'variances_norm', 'highly_variable_nbatches'
    uns: 'log1p', 'h

In [6]:
adata.write_h5ad(f'{data_cache_dir}/mcfaline23_gxe_processed.h5ad')