In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline
%config InlineBackend.figure_format='retina'

%cd /home/gk922433/PycharmProjects/perturbx_remote
!export PYTHONPATH=$(pwd)

from matplotlib import rcParams
# Set scanpy figure size.
rcParams["figure.figsize"] = 6, 6

/home/gk922433/PycharmProjects/perturbx_remote


In [2]:
import os
from itertools import product
import random
import numpy as np
import scanpy as sc
from sklearn.preprocessing import OneHotEncoder

from aiml_perturbx.utils.adata import read_adata_mixseq
from aiml_perturbx.utils.plot import get_color, plot_cov, plot_latent, plot_umap, plot_heatmap, plot_cor, plot_anno_heatmp

In [3]:
# Batch identifiers grouped by experiment. One of these lists is selected by
# experiment key and handed to read_adata_mixseq together with mixseq_path.
# Entries that are commented out are deliberately excluded from loading.
batch_dict = {
    "experiment1": [
        "DMSO_6hr_expt1",
        "DMSO_24hr_expt1",
        "Idasanutlin_6hr_expt1",
        "Idasanutlin_24hr_expt1",
#         "Trametinib_6hr_expt1",
#         "Trametinib_24hr_expt1",
        "Bortezomib_6hr_expt1",
        "Bortezomib_24hr_expt1",
    ],
    "experiment3": [
        "DMSO_24hr_expt3",
        "BRD3379_24hr_expt3",
        "Dabrafenib_24hr_expt3",
#         "Navitoclax_24hr_expt3",
        "Trametinib_24hr_expt3",
    ],
    "experiment10": [
        "DMSO_expt10",
        "Everolimus_expt10",
        "Afatinib_expt10",
        "Taselisib_expt10",
        "AZD5591_expt10",
        "JQ1_expt10",
        "Gemcitabine_expt10",
        "Trametinib_expt10",
        "Prexasertib_expt10",
    ]
}
# Root directory of the MIX-Seq data; the processed output is also written
# beneath it at the end of the notebook.
mixseq_path = "/hpc/aiml/upt/causality_data/mixseq"

In [4]:
# Select the experiment-3 batches and load them through the project helper.
# NOTE(review): presumably the helper combines the per-batch data into one
# AnnData object - confirm in aiml_perturbx.utils.adata.
# The output file name in the final cell is hard-coded to "experiment3.h5ad";
# keep the two in sync when switching experiments.
batch_list = batch_dict["experiment3"]
adata = read_adata_mixseq(batch_list, mixseq_path)

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


In [5]:
# Number of distinct cell-type labels; the fixed slice boundaries of the
# train/hold split further down were chosen against this count.
len(adata.obs['celltype'].unique())

97

In [6]:
# Preprocess adata (return values are discarded, so the calls act on adata
# itself). Order matters: per-cell normalization, then log1p, then
# highly-variable-gene selection with n_top_genes=5000 and subset=True.
# NOTE(review): normalize_per_cell is a legacy scanpy function and newer
# releases point to sc.pp.normalize_total; the two are not drop-in
# equivalents, so confirm the effect on results before migrating.
sc.pp.normalize_per_cell(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=True)

In [7]:
# One-hot encode the two attribute columns of adata.obs. For each attribute the
# dense indicator matrix goes into adata.obsm and the fitted encoder's category
# list goes into adata.uns so the columns can be mapped back to labels.

# Cell type.
attr_enc_celltype = OneHotEncoder()
celltype_onehot = attr_enc_celltype.fit_transform(adata.obs[["celltype"]])
adata.obsm["attr_celltype"] = np.array(celltype_onehot.todense())
adata.uns["attr_enc_celltype"] = attr_enc_celltype.categories_

# Perturbation.
attr_enc_perturbation = OneHotEncoder()
perturbation_onehot = attr_enc_perturbation.fit_transform(adata.obs[["perturbation"]])
adata.obsm["attr_perturbation"] = np.array(perturbation_onehot.todense())
adata.uns["attr_enc_perturbation"] = attr_enc_perturbation.categories_

In [9]:
# train_hold_schemas = {}  # dictionary of tuples for different train-hold schemas.
# train_hold_schemas["hold1"] = ['DKMG_CENTRAL_NERVOUS_SYSTEM', 'NCIH226_LUNG', 'COLO680N_OESOPHAGUS', 'RCM1_LARGE_INTESTINE', 'SQ1_LUNG', 'BICR31_UPPER_AERODIGESTIVE_TRACT']
# train_hold_schemas["hold2"] = ['DKMG_CENTRAL_NERVOUS_SYSTEM', 'NCIH226_LUNG', 'COLO680N_OESOPHAGUS', 'RCM1_LARGE_INTESTINE', 'SQ1_LUNG', 'BICR31_UPPER_AERODIGESTIVE_TRACT', 'COV434_OVARY', 'UMUC1_URINARY_TRACT']
# train_hold_schemas["hold3"] = ['DKMG_CENTRAL_NERVOUS_SYSTEM', 'SNU1079_BILIARY_TRACT', 'COLO680N_OESOPHAGUS', 'RCM1_LARGE_INTESTINE', 'SQ1_LUNG', 'BICR31_UPPER_AERODIGESTIVE_TRACT']
# train_hold_schemas["hold4"] = ['DKMG_CENTRAL_NERVOUS_SYSTEM', 'SNU1079_BILIARY_TRACT', 'COLO680N_OESOPHAGUS', 'RCM1_LARGE_INTESTINE', 'SQ1_LUNG', 'BICR31_UPPER_AERODIGESTIVE_TRACT', 'COV434_OVARY', 'UMUC1_URINARY_TRACT']

In [8]:
# Split samples for training and OOD prediction based on attributes.
# The shuffle uses a dedicated, seeded generator so that the hold-out
# assignment saved with the data is the same on every Restart & Run All; the
# unseeded global random.shuffle produced a different split on each run.
HOLD_SPLIT_SEED = 0
celltypes = list(adata.obs["celltype"].unique())
random.Random(HOLD_SPLIT_SEED).shuffle(celltypes)

# Maps schema name -> list of cell types held out under that schema. The first
# three schemas hold 24 cell types each; the last one takes whatever remains.
train_hold_schemas = {}
train_hold_schemas["hold1"] = celltypes[:24]
train_hold_schemas["hold2"] = celltypes[24:48]
train_hold_schemas["hold3"] = celltypes[48:72]
train_hold_schemas["hold4"] = celltypes[72:]

# One obs column per schema: "hold" for cells whose cell type is held out,
# "train" for every other cell. A boolean mask is used instead of selecting
# rows by obs name, so rows are matched positionally (robust to duplicated
# obs names) and the column is scanned once per schema, not once per cell type.
for train_hold, hold_celltypes in train_hold_schemas.items():
    is_hold = adata.obs["celltype"].isin(hold_celltypes)
    adata.obs[train_hold] = "train"
    adata.obs.loc[is_hold, train_hold] = "hold"

In [9]:
# Save the processed adata under <mixseq_path>/processed.
mixseq_processed_dir = os.path.join(mixseq_path, "processed")
# makedirs(exist_ok=True) replaces the isdir()/mkdir() pair: it cannot fail if
# the directory appears between the check and the creation, and it also
# creates missing parent directories.
os.makedirs(mixseq_processed_dir, exist_ok=True)

adata.write_h5ad(os.path.join(mixseq_processed_dir, "experiment3.h5ad"))