In [1]:
import gc
import time
import umap

import torch
import collections
import numpy as np
import pandas as pd
import scanpy as sc

from scipy.sparse import csr_matrix

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.cluster import AgglomerativeClustering

from scipy.cluster.hierarchy import dendrogram

In [3]:
import scvi
from cellcap.module_v1 import CellCap

Global seed set to 0


[easydl] tensorflow not available!


Prepare the data for use

In [4]:
# Load the AnnData object that every later cell reads from and annotates.
adata = sc.read_h5ad('../data/scLevyAll_neuron20.h5ad')

In [5]:
# Perturbation annotation table; its 'Perturbation' and 'Vehicle' columns are
# used below to pair every condition with its vehicle.
drugtype = pd.read_csv(
    '../data/LevyDrug_class.csv',
    header=0,
    index_col=False,
)
drugtype

Unnamed: 0,Perturbation,Category,Vehicle
0,A1CYTO,Inflammatory response,PBS
1,ATOR,Cholesterol biosynthesis,DMSO
2,AZT,Oxidative stress,DMSO
3,C1Q,Inflammatory response,PBS
4,CLOZ,Antipsychotic,DMSO
5,DMSO,Vehicle control,DMSO
6,EFA,Cholesterol biosynthesis,DMSO
7,GluN2a,NMDA/excitability,PBS
8,GLUT,NMDA/excitability,PBS
9,H2O2,Oxidative stress,PBS


In [6]:
# Integer-encode the condition labels; uniques[k] is the label behind code k.
codes, uniques = pd.factorize(adata.obs['condition'])
uniques = list(uniques)
adata.obs['Condition'] = codes

# One-hot matrix (cells x conditions): row i gets a 1 in column codes[i].
n_cells = len(codes)
drugY = np.zeros((n_cells, len(uniques)))
drugY[np.arange(n_cells), codes] = 1

# Control, DMSO and PBS are excluded from the drug matrix: blank their columns,
# then drop every column that ended up empty.
for no_drug in ('Control', 'DMSO', 'PBS'):
    drugY[:, uniques.index(no_drug)] = 0
drugY = drugY[:, np.sum(drugY, 0) > 0]

In [7]:
# Condition labels with Control, DMSO and PBS removed, in `uniques` order.
drug_names = list(uniques)
for no_drug in ('Control', 'DMSO', 'PBS'):
    drug_names.remove(no_drug)

In [8]:
# Vehicle design matrix (cells x vehicles): each cell gets a 1 in the column of
# the vehicle that `drugtype` lists for its condition.
#
# The perturbation -> vehicle lookup is built once up front rather than by
# filtering the `drugtype` frame again for every single cell. A condition that
# is missing from `drugtype` raises KeyError.
vehicle_of = (
    drugtype.drop_duplicates('Perturbation')  # first row wins, like .values[0]
    .set_index('Perturbation')['Vehicle']
    .to_dict()
)
# vehicle_col[k]: position in `uniques` of the vehicle paired with condition k.
vehicle_col = np.array(
    [uniques.index(vehicle_of[label]) for label in uniques], dtype=int
)

contY = np.zeros((len(codes), len(uniques)))
contY[np.arange(len(codes)), vehicle_col[codes]] = 1
# Control is not kept as a vehicle column; drop it and any other empty column.
contY[:, uniques.index('Control')] = 0
contY = contY[:, np.sum(contY, 0) > 0]

In [9]:
# Target matrix (cells x conditions): each cell is flagged both in the column
# of its own condition and in the column of the vehicle that `drugtype` lists
# for that condition.
#
# The perturbation -> vehicle lookup is built once up front rather than by
# filtering the `drugtype` frame again for every single cell. A condition that
# is missing from `drugtype` raises KeyError.
vehicle_of = (
    drugtype.drop_duplicates('Perturbation')  # first row wins, like .values[0]
    .set_index('Perturbation')['Vehicle']
    .to_dict()
)
# vehicle_col[k]: position in `uniques` of the vehicle paired with condition k.
vehicle_col = np.array(
    [uniques.index(vehicle_of[label]) for label in uniques], dtype=int
)

cell_rows = np.arange(len(codes))
target_label = np.zeros((len(codes), len(uniques)))
target_label[cell_rows, codes] = 1
target_label[cell_rows, vehicle_col[codes]] = 1
# Control is not kept as a target column; drop it and any other empty column.
target_label[:, uniques.index('Control')] = 0
target_label = target_label[:, np.sum(target_label, 0) > 0]

In [10]:
# Per-cell vehicle name, looked up from `drugtype` via the cell's condition.
#
# The perturbation -> vehicle lookup is built once up front rather than by
# filtering the `drugtype` frame again for every single cell. A condition that
# is missing from `drugtype` raises KeyError.
vehicle_of = (
    drugtype.drop_duplicates('Perturbation')  # first row wins, like .values[0]
    .set_index('Perturbation')['Vehicle']
    .to_dict()
)
# vehicle_by_code[k]: vehicle name for the condition encoded as k.
vehicle_by_code = [vehicle_of[label] for label in uniques]
vehicles = [vehicle_by_code[j] for j in codes]
adata.obs['Vehicle'] = vehicles

In [11]:
# Report the shape of each design matrix, then attach all three to `adata.obsm`
# under the keys that CellCap.setup_anndata is given later.
design_matrices = {
    'X_drug': drugY,
    'X_cont': contY,
    'X_target': target_label,
}
for design in design_matrices.values():
    print(design.shape)
for obsm_key, design in design_matrices.items():
    adata.obsm[obsm_key] = design

(88419, 13)
(88419, 2)
(88419, 15)


In [12]:
# One-hot donor matrix (cells x donors), stored as adata.obsm['X_donor'].
donor_codes, donor_uniques = pd.factorize(adata.obs['DONOR'])
donor_uniques = list(donor_uniques)

n_obs = len(donor_codes)
donorY = np.zeros((n_obs, len(donor_uniques)))
donorY[np.arange(n_obs), donor_codes] = 1  # row i -> column donor_codes[i]
adata.obsm['X_donor'] = donorY

In [13]:
# Sanity check: (number of cells, number of distinct donors).
print(donorY.shape)

(88419, 39)


In [14]:
# Number of cells per annotated cell type. `collections` is already imported
# in the notebook's first cell, so it is not imported again here.
collections.Counter(adata.obs['cell_type'])

Counter({'Neuron': 88419})

In [15]:
# Number of cells per condition label.
collections.Counter(adata.obs['condition'])

Counter({'INFa': 3510,
         'DMSO': 18274,
         'CLOZ': 3855,
         'HALO': 3559,
         'AZT': 3426,
         'INFy': 8609,
         'EFA': 2452,
         'Control': 17279,
         'TNFa': 3860,
         'ATOR': 2629,
         'ISRD': 4118,
         'GluN2a': 3922,
         'PBS': 5638,
         'H2O2': 1046,
         'GLUT': 4877,
         'SIM': 1365})

Train CellCap model

In [16]:
# Register with CellCap which AnnData fields to read: the "counts" layer plus
# the obs column and obsm matrices prepared earlier in this notebook.
# NOTE(review): 'control' is not created in this notebook - presumably it is an
# obs column that ships with the h5ad file; confirm.
CellCap.setup_anndata(
    adata,
    labels_key='control',
    pert_key='Condition',
    layer="counts",
    cond_key='X_drug',
    cont_key='X_cont',
    target_key='X_target',
    donor_key='X_donor',
)

[34mINFO    [0m Generating sequential column names                                                  
[34mINFO    [0m Generating sequential column names                                                  
[34mINFO    [0m Generating sequential column names                                                  
[34mINFO    [0m Generating sequential column names                                                  


In [17]:
# Build the model. The design-matrix widths are read from the matrices stored
# in `adata.obsm` instead of being typed in by hand, so they cannot drift out
# of sync with the data when the set of conditions or donors changes.
cellcap = CellCap(
    adata,
    n_latent=10,
    n_layers=3,
    n_drug=adata.obsm['X_drug'].shape[1],
    n_control=adata.obsm['X_cont'].shape[1],
    n_target=adata.obsm['X_target'].shape[1],
    n_donor=adata.obsm['X_donor'].shape[1],
    gene_likelihood='nb',
)

In [None]:
# Fit the model: at most 1000 epochs with a batch size of 2048.
cellcap.train(max_epochs=1000,batch_size=2048)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Epoch 4/1000:   0%|          | 3/1000 [00:07<42:01,  2.53s/it, loss=1.31e+04, v_num=1]

Plot latent space

In [None]:
# Latent embedding from the trained model, stored under 'X_basal' so the
# neighbour graph in the next cell can be built on it.
z = cellcap.get_latent_embedding(adata)
adata.obsm['X_basal']=z

In [None]:
# Neighbour graph on the 'X_basal' embedding, followed by a UMAP layout.
sc.pp.neighbors(
    adata,
    n_neighbors=15,
    use_rep='X_basal',
    random_state=0,
    metric='euclidean',
)
# NOTE(review): method='rapids' presumably requires a GPU RAPIDS install - confirm.
sc.tl.umap(adata, min_dist=0.15, method='rapids')

# Colour the layout by cell type (labels drawn on the plot) and by condition.
sc.set_figure_params(scanpy=True, dpi=75, dpi_save=75)
sc.pl.umap(adata, color='cell_type', title='', legend_loc='on data')
sc.pl.umap(adata, color='condition', title='')

In [None]:
# Same UMAP layout, coloured by each of four obs columns in turn.
sc.set_figure_params(scanpy=True, dpi=75, dpi_save=75)
for obs_key in ('DONOR', 'Sex', 'Clinical', 'Age'):
    sc.pl.umap(adata, color=obs_key, title='')

In [32]:
##ROC to evaluate basal state
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [33]:
adata.obsm['X_latent'] = z
fpr, tpr, roc_auc = {}, {}, {}

# Every condition other than 'Control' gets its own Control-vs-condition
# logistic-regression classifier on the latent embedding.
conditions = list(set(adata[adata.obs['condition'] != 'Control'].obs['condition'].values))
for c in conditions:
    keep = np.logical_or(adata.obs['condition'] == 'Control', adata.obs['condition'] == c)
    subad = adata[keep]

    X = subad.obsm['X_latent']
    # Binary target: 0 for Control cells, 1 for cells of condition c.
    y = (subad.obs['condition'] == c).to_numpy().astype(np.intp)

    # Shuffle, then hold out half of the cells for scoring.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.5, random_state=0
    )

    classifier = LogisticRegression(random_state=np.random.RandomState(0))
    classifier.fit(X_train, y_train)
    y_score = classifier.decision_function(X_test)

    # ROC curve and its area on the held-out half.
    fpr[c], tpr[c], _ = roc_curve(y_test, y_score, pos_label=classifier.classes_[1])
    roc_auc[c] = auc(fpr[c], tpr[c])

In [34]:
# Sort in place so the ROC legend and the AUC printout follow a fixed order.
conditions.sort()

In [35]:
# Line colours for the ROC curves: the "Paired" palette followed by 8 "hls" hues.
colors = list(sns.color_palette("Paired"))+list(sns.color_palette("hls", 8))

In [None]:
sc.set_figure_params(scanpy=True, dpi=200, dpi_save=200)
plt.figure()

# One ROC curve per condition; colours are taken in palette order.
for condition, color in zip(conditions, colors):
    plt.plot(
        fpr[condition],
        tpr[condition],
        color=color,
        lw=1,
        label="{0}".format(condition),
    )

plt.grid(False)
plt.plot([0, 1], [0, 1], "k--", lw=1)  # diagonal reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("")

# Small legend with a transparent frame so it does not hide the curves.
legend = plt.legend(loc="lower right", prop={'size': 4.5})
legend.get_frame().set_facecolor('none')
plt.show()

In [None]:
# Print the AUC of every condition. Iterate over `conditions` directly (not
# zipped with the colour palette) so no condition is silently skipped when
# there are more conditions than colours.
for condition in conditions:
    print("{0} (AUC = {1:0.4f})".format(condition, roc_auc[condition]))

In [38]:
# Perturbation embedding from the trained model. The model object created in
# this notebook is `cellcap`; the name `cpa` is never defined.
# NOTE(review): the second return value is assumed to be the attention output
# that the next cell stores as 'X_attn' - confirm against get_pert_embedding.
zP, zAttn = cellcap.get_pert_embedding(adata)

In [39]:
# Store zP and zAttn on the AnnData object (presumably the perturbation
# embedding and its attention values).
# NOTE(review): `zAttn` must already exist when this cell runs - confirm it is
# produced by the perturbation-embedding step.
adata.obsm['X_pert']=zP
adata.obsm['X_attn']=zAttn