In [1]:
import anndata
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Downloaded from the GSE accession at: https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE279945
# !wget "https://ftp.ncbi.nlm.nih.gov/geo/series/GSE279nnn/GSE279945/suppl/GSE279945%5Fsc%5Fcounts%5Fprocessed.h5ad" -O op3_processed.h5ad

AnnData object with n_obs × n_vars = 298087 × 21265
    obs: 'dose_uM', 'timepoint_hr', 'well', 'row', 'col', 'plate_name', 'cell_id', 'cell_type', 'split', 'donor_id', 'sm_name', 'control', 'SMILES', 'sm_lincs_id', 'library_id', 'leiden_res1', 'group', 'cell_type_orig', 'plate_well_celltype_reannotated', 'cell_count_by_well_celltype', 'cell_count_by_plate_well'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'cell_type_colors', 'celltypist_celltype_colors', 'donor_id_colors', 'hvg', 'leiden_res1_colors', 'log1p', 'neighbors', 'over_clustering', 'rank_genes_groups'
    obsm: 'HTO_clr', 'X_pca', 'X_umap', 'protein_counts'
    obsp: 'connectivities', 'distances'

In [None]:
# Load the processed OP3 AnnData file (the filename matches the `-O` target of the
# wget command in the download cell) and display its summary.
op3_dataset = anndata.read_h5ad('op3_processed.h5ad')
op3_dataset

#### Important note: the original split has issues — the same compound can appear in more than one split (see the example below)

In [3]:
# Example compound: count its cells per value of the original `split` column.
example_smiles = 'COc1cc2c(c(OC)c1OC)-c1ccc(OC)c(=O)cc1[C@@H](NC(C)=O)CC2'
is_example_compound = op3_dataset.obs['SMILES'] == example_smiles
op3_dataset[is_example_compound].obs['split'].value_counts()

split
train          802
public_test    411
Name: count, dtype: int64

In [4]:
# Same example compound, restricted to its 'public_test' cells: count cells per cell type.
example_smiles = 'COc1cc2c(c(OC)c1OC)-c1ccc(OC)c(=O)cc1[C@@H](NC(C)=O)CC2'
is_example_compound = op3_dataset.obs['SMILES'] == example_smiles
is_public_test = op3_dataset.obs['split'] == 'public_test'
op3_dataset[is_example_compound & is_public_test].obs['cell_type'].value_counts()

cell_type
Myeloid cells    261
B cells          147
T cells            2
NK cells           1
Name: count, dtype: int64

##### C[S+](C)[O-] (DMSO) is the negative control (essentially no perturbation at all, so it serves as the control in our benchmark)
The other two (belinostat, dabrafenib) are positive controls — known to induce a large transcriptomic response.
"The dose of belinostat is 0.1µM, DMSO 14.1µM, and the rest of the compounds 1µM" (from the OP3 paper)

In [5]:
# List the distinct compounds (SMILES) among cells flagged by the boolean `control` column.
is_control = op3_dataset.obs['control']
op3_dataset[is_control].obs['SMILES'].unique()

['O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO', 'CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c..., 'C[S+](C)[O-]']
Categories (3, object): ['O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO', 'CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c..., 'C[S+](C)[O-]']

In [6]:
# Map every compound (SMILES) to the cell types it has cells for within the 'train' split.
# The train mask does not depend on the compound, so it is computed once.
in_train = op3_dataset.obs['split'] == 'train'
smile_to_ct = {
    smiles: op3_dataset[(op3_dataset.obs['SMILES'] == smiles) & in_train].obs['cell_type'].unique()
    for smiles in op3_dataset.obs['SMILES'].unique()
}

In [7]:
# Number of train-split cell types per compound, in dictionary order.
np.array([len(cell_types) for cell_types in smile_to_ct.values()])

array([0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 4, 2, 2, 2, 2, 3,
       2, 2, 2, 4, 2, 2, 2, 2, 2, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 2, 2,
       4, 2, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 4, 2, 2, 2, 4, 4, 4, 3, 3, 2,
       2, 4, 2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2, 2, 2, 4, 2, 2, 2, 3, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 4, 3, 2, 4, 2, 2,
       2, 2, 2, 2, 2, 4, 2, 2, 2, 2, 2, 2])

In [8]:
# First three keys of the mapping, i.e. the compounds whose train-split cell-type count is 0 above.
list(smile_to_ct)[:3]

['O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO',
 'CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2F)c(-c2ccnc(N)n2)s1',
 'C[S+](C)[O-]']

In [9]:
# Total number of distinct compounds (SMILES) in the dataset.
len(smile_to_ct)

144

##### 15 compounds are profiled on the transfer cell types (B cells and Myeloid cells) in the train split

In [10]:
# Count the compounds that have all four cell types in the train split.
n_train_cell_types = np.array([len(cell_types) for cell_types in smile_to_ct.values()])
(n_train_cell_types == 4).sum()

15

In [11]:
# Number of distinct compounds with cells in the public test split.
public_test_adata = op3_dataset[op3_dataset.obs['split'] == 'public_test']
len(public_test_adata.obs['SMILES'].unique())

50

In [12]:
# Number of distinct compounds with cells in the private test split.
private_test_adata = op3_dataset[op3_dataset.obs['split'] == 'private_test']
len(private_test_adata.obs['SMILES'].unique())

78

In [13]:
# Compounds shared by public test and private test (an empty set means no overlap).
public_test_smiles = set(op3_dataset[op3_dataset.obs['split'] == 'public_test'].obs['SMILES'].unique())
private_test_smiles = set(op3_dataset[op3_dataset.obs['split'] == 'private_test'].obs['SMILES'].unique())
public_test_smiles.intersection(private_test_smiles)

set()

In [14]:
# Compounds in public test that also have all four cell types in the train split
# (two overlapping compounds between train and public test).
public_test_smiles = set(op3_dataset[op3_dataset.obs['split'] == 'public_test'].obs['SMILES'].unique())
train_four_cell_type_smiles = [
    smiles for smiles, cell_types in smile_to_ct.items() if len(cell_types) == 4
]
public_test_smiles.intersection(train_four_cell_type_smiles)

{'C[C@@H]1O[C@@H](O[C@@H]2C=C3CC[C@@H]4[C@H](CC[C@]5(C)[C@@H](c6ccc(=O)oc6)CC[C@]45O)[C@@]3(C)CC2)[C@H](O)[C@H](O)[C@H]1O',
 'Nc1ncnc2c1c(I)cn2[C@@H]1O[C@H](CO)[C@@H](O)[C@H]1O'}

In [15]:
# Compounds in private test that also have all four cell types in the train split
# (no overlap between train and private test).
private_test_smiles = set(op3_dataset[op3_dataset.obs['split'] == 'private_test'].obs['SMILES'].unique())
train_four_cell_type_smiles = [
    smiles for smiles, cell_types in smile_to_ct.items() if len(cell_types) == 4
]
private_test_smiles.intersection(train_four_cell_type_smiles)

set()

In [16]:
# So, for the held-out cell types (B cells and Myeloid cells), there are 15 compounds in train, 50 in val, 78 in test.
# Minus the 2 overlapping compounds, plus the 3 controls, we get 144: the number of unique compounds in the dataset.
15 + 50 + 78 - 2 + 3

144

#### Note that the public test and private test splits also contain some T cells and NK cells

In [17]:
# Cells per cell type in the original 'train' split.
train_adata = op3_dataset[op3_dataset.obs['split'] == 'train']
train_adata.obs['cell_type'].value_counts()

cell_type
T cells          136693
NK cells          15302
B cells            3058
Myeloid cells      2502
Name: count, dtype: int64

In [18]:
# Cells per cell type in the original 'public_test' split.
public_test_adata = op3_dataset[op3_dataset.obs['split'] == 'public_test']
public_test_adata.obs['cell_type'].value_counts()

cell_type
B cells          12118
Myeloid cells    11202
T cells            411
NK cells             5
Name: count, dtype: int64

In [19]:
# Cells per cell type in the original 'private_test' split.
private_test_adata = op3_dataset[op3_dataset.obs['split'] == 'private_test']
private_test_adata.obs['cell_type'].value_counts()

cell_type
B cells          18713
Myeloid cells    18634
T cells            603
NK cells             6
Name: count, dtype: int64

In [20]:
# Distinct cell types present in the dataset (the column is categorical).
op3_dataset.obs['cell_type'].unique()

['B cells', 'T cells', 'Myeloid cells', 'NK cells']
Categories (4, object): ['B cells', 'Myeloid cells', 'NK cells', 'T cells']

##### create splits

In [21]:
# Every cell starts as 'none' and is relabelled by the steps below.
op3_split = pd.Series('none', index=op3_dataset.obs_names)
original_split = op3_dataset.obs['split']
is_negative_control = op3_dataset.obs['SMILES'] == 'C[S+](C)[O-]'

# Step 1: reuse the dataset's own split, renaming public/private test to val/test.
for original_label, new_label in (('train', 'train'), ('public_test', 'val'), ('private_test', 'test')):
    op3_split.loc[original_split == original_label] = new_label

# Step 2: positive-control cells all go to train
# (I assume positive controls are not meaningful for evaluation).
op3_split.loc[(original_split == 'control') & ~is_negative_control] = 'train'

# Step 3: negative-control cells are split 50/25/25 into train/val/test, separately per cell type.
for cell_type in op3_dataset.obs['cell_type'].unique():
    is_cell_type = op3_dataset.obs['cell_type'] == cell_type
    negative_control_names = op3_dataset.obs_names[(is_negative_control & is_cell_type).to_numpy()]

    # Two successive halvings (approach taken from the frangieh split code):
    # first train vs. held-out, then held-out into val vs. test.
    train_names, heldout_names = train_test_split(negative_control_names, test_size=0.5, random_state=11)
    val_names, test_names = train_test_split(heldout_names, test_size=0.5, random_state=21)

    for label, names in (('train', train_names), ('val', val_names), ('test', test_names)):
        op3_split.loc[names] = label

op3_split.value_counts()

train    222168
test      45070
val       30849
Name: count, dtype: int64

In [22]:
# Check the train/val/test proportions of negative-control T cells.
is_control_t_cell = (op3_dataset.obs['SMILES'] == 'C[S+](C)[O-]') & (op3_dataset.obs['cell_type'] == 'T cells')
op3_split.loc[is_control_t_cell].value_counts()

train    8432
test     4216
val      4216
Name: count, dtype: int64

In [23]:
# Check the train/val/test proportions of negative-control NK cells.
is_control_nk_cell = (op3_dataset.obs['SMILES'] == 'C[S+](C)[O-]') & (op3_dataset.obs['cell_type'] == 'NK cells')
op3_split.loc[is_control_nk_cell].value_counts()

train    978
test     490
val      489
Name: count, dtype: int64

In [24]:
# Check the train/val/test proportions of negative-control B cells.
is_control_b_cell = (op3_dataset.obs['SMILES'] == 'C[S+](C)[O-]') & (op3_dataset.obs['cell_type'] == 'B cells')
op3_split.loc[is_control_b_cell].value_counts()

train    2198
val      1099
test     1099
Name: count, dtype: int64

In [25]:
# Check the train/val/test proportions of negative-control Myeloid cells.
is_control_myeloid_cell = (op3_dataset.obs['SMILES'] == 'C[S+](C)[O-]') & (op3_dataset.obs['cell_type'] == 'Myeloid cells')
op3_split.loc[is_control_myeloid_cell].value_counts()

train    2617
test     1309
val      1309
Name: count, dtype: int64

#### There are some bad apples! Throwing away chemical + cell_type groups in the val/test sets that have too few cells

In [26]:
def discard_small_groups(adata, split, split_name, min_cells=100):
    """Relabel under-populated compound + cell-type groups of one split as 'none'.

    For every compound (SMILES) that has cells labelled ``split_name``, the cells
    are counted per cell type. Each compound + cell-type group with strictly fewer
    than ``min_cells`` cells is relabelled ``'none'`` in ``split``. Affected
    compounds and their per-cell-type counts are printed.

    Parameters
    ----------
    adata
        AnnData object whose ``obs`` has 'SMILES' and 'cell_type' columns.
    split : pd.Series
        Split label per cell, indexed like ``adata.obs``. Modified in place.
    split_name : str
        Label of the split to clean (e.g. 'val' or 'test').
    min_cells : int
        Minimum number of cells a compound + cell-type group needs to be kept.

    Returns
    -------
    list of int
        Number of cells discarded, one entry per discarded group.
    """
    obs = adata.obs
    discarded = []

    # The compound list is taken once, before any relabelling happens.
    for smile in adata[split == split_name].obs['SMILES'].unique():
        # NOTE(review): the counts are taken through an AnnData subset on purpose.
        # 'cell_type' is categorical, and counting on `obs` directly could report
        # zero-count categories as "too small" — confirm before simplifying.
        in_split_and_compound = (split == split_name) & (obs['SMILES'] == smile)
        ct_counts = adata[in_split_and_compound].obs['cell_type'].value_counts()

        too_small = ct_counts < min_cells
        if not too_small.any():
            continue

        # too few cells in at least one drug+cell_type group
        print(smile)
        print(ct_counts)
        print('='*20)

        for cell_type in ct_counts[too_small].index:
            group_mask = (
                (obs['SMILES'] == smile)
                & (obs['cell_type'] == cell_type)
                & (split == split_name)
            )
            # Boolean-mask assignment replaces the positional np.where/.iloc detour.
            split.loc[group_mask] = 'none'
            discarded.append(int(group_mask.sum()))

    return discarded


# Clean the validation split first, then the test split; `counts` collects the
# size of every discarded group across both.
counts = []
for eval_split in ('val', 'test'):
    counts.extend(discard_small_groups(op3_dataset, op3_split, eval_split))
    

Cc1nc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc2n1C(C)C
cell_type
B cells          225
Myeloid cells     29
Name: count, dtype: int64
CC[C@@]1(O)C(=O)OCc2c1cc1n(c2=O)Cc2cc3c(CN(C)C)c(O)ccc3nc2-1
cell_type
Myeloid cells    324
B cells          314
T cells           26
Name: count, dtype: int64
CCCCOc1c(C(=O)c2c(F)cc(C)cc2F)cnc2[nH]ncc12
cell_type
Myeloid cells    432
B cells          409
T cells           12
Name: count, dtype: int64
NCCCCN(Cc1nc2ccccc2[nH]1)[C@H]1CCCc2cccnc21
cell_type
Myeloid cells    242
B cells          219
T cells            8
Name: count, dtype: int64
CC(C)c1cc(-c2n[nH]c(=O)n2-c2ccc3c(ccn3C)c2)c(O)cc1O
cell_type
B cells          238
Myeloid cells     29
T cells            1
Name: count, dtype: int64
Cc1cc(-c2nc(-c3ccc(OC(F)(F)F)cc3)no2)nn1Cc1ccnc(N2CCN(C3CC3)CC2)c1
cell_type
B cells          260
Myeloid cells    242
T cells           16
Name: count, dtype: int64
COc1ccc(-c2cc3c(C)nc(N)nc3n([C@H]3CC[C@H](OCCO)CC3)c2=O)cn1
cell_type
B cells          177
Myeloid cells     46
Nam

In [27]:
# Size of the largest discarded group (must be below the threshold of 100 cells).
max(counts)

96

In [28]:
# Split sizes after filtering; discarded cells now carry the label 'none'.
op3_split.value_counts()

train    222168
test      44046
val       29933
none       1940
Name: count, dtype: int64

In [29]:
# Total number of discarded cells, summed over all discarded groups.
f'{sum(counts)} cells discarded from validation and test splits'

'1940 cells discarded from validation and test splits'

#### saving datafile and split

In [None]:
# Write the per-cell split labels to CSV without a header row
# (the Series index, i.e. the cell names, is written alongside each label).
op3_split.to_csv('op3_split.csv', header=False)

In [33]:
# Check: the dose is fixed per compound, so dose is not modelled.
# Shows one row per distinct (dose_uM, SMILES) combination found in `obs`.
op3_dataset.obs[['dose_uM', 'SMILES']].drop_duplicates()

Unnamed: 0_level_0,dose_uM,SMILES
obs_id,Unnamed: 1_level_1,Unnamed: 2_level_1
AAACGAAAGAGCGACT-1_SRTP0006403-0,0.1,O=C(/C=C/c1cccc(S(=O)(=O)Nc2ccccc2)c1)NO
AAACGCTTCATGTCAG-1_SRTP0006403-0,1.0,CC(C)(C)c1nc(-c2cccc(NS(=O)(=O)c3c(F)cccc3F)c2...
AAACCCATCGACGTCG-1_SRTP0006403-0,14.1,C[S+](C)[O-]
AAACCCATCAGCTTCC-1_SRTP0006403-0,1.0,CC(C)c1c(C(=O)Nc2ccccc2)c(-c2ccccc2)c(-c2ccc(F...
AAACGCTAGCGCTGCT-2_SRTP0006404-0,1.0,Cc1nc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc2n1C(C)C
...,...,...
AAACCCAAGCGCTGAA-12_SRTP0006414-1,1.0,Cc1cc(Nc2ncc(Cl)c(Nc3ccccc3S(=O)(=O)C(C)C)n2)c...
AAAGTCCAGCTATCCA-13_SRTP0006415-1,1.0,COc1cc(Nc2c(C#N)cnc3cc(OCCCN4CCN(C)CC4)c(OC)cc...
AAAGGTAGTCATACCA-14_SRTP0006416-1,1.0,COc1cc2c(Nc3ccc(Br)cc3F)ncnc2cc1OCC1CCN(C)CC1
AAAGAACCAGTCGGTC-15_SRTP0006417-1,1.0,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...
