In [None]:
import scanpy as sc
import numpy as np

In [None]:
# Directory that holds the input .h5ad files. load_data() builds the file name
# by plain string concatenation (dir_path + dataset + '.h5ad'), so the trailing
# slash is required.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
dir_path = "/home/krushna/Documents/Data_integration/SCRNA_Datasets/All_h5ad/"
def load_data(dataset,batch):
    """Read one dataset and return it log-normalised and restricted to 2000 HVGs.

    Steps, in order:
      1. read ``dir_path + dataset + '.h5ad'``;
      2. drop genes with fewer than 3 total counts;
      3. keep a copy of the matrix as loaded in ``adata.layers["counts"]``
         (assumed to be raw counts — TODO confirm for every input file);
      4. normalise every cell to a total of 1e4 and apply ``log1p``;
      5. snapshot the log-normalised object in ``adata.raw``;
      6. select the top 2000 highly variable genes, batch-aware, and subset.

    Parameters
    ----------
    dataset : str
        File stem of the ``.h5ad`` file inside ``dir_path``.
    batch : str
        Name of the ``adata.obs`` column passed as ``batch_key`` to the
        highly-variable-gene selection.

    Returns
    -------
    AnnData
        The processed object; ``X`` is log-normalised and
        ``layers["counts"]`` holds the pre-normalisation matrix.
    """
    adata = sc.read_h5ad(dir_path + dataset + '.h5ad')
    sc.pp.filter_genes(adata, min_counts=3)
    adata.layers["counts"] = adata.X.copy()
    sc.pp.normalize_total(adata, target_sum=1e4)
    sc.pp.log1p(adata)
    adata.raw = adata
    # flavor="seurat" expects logarithmised expression. The call used to pass
    # layer="counts", which fed it the unnormalised matrix instead; without
    # `layer` it reads the log-normalised adata.X prepared above.
    sc.pp.highly_variable_genes(
        adata,
        flavor="seurat",
        n_top_genes=2000,
        batch_key=batch,
        subset=True,
    )
    return adata
    
# Dataset name -> name of the adata.obs column that identifies the batch.
batch_key_dic = {
    'Immune_Human': 'batch',
    'Immune_human_mouse': 'batch',
    'Lung': 'batch',
    'Mouse_brain': 'batch',
    'Pancreas': 'tech',
    'Simulation1': 'Batch',
    'Simulation2': 'Batch',
}
# Dataset name -> name of the adata.obs column that holds the cell-type label.
cell_type_key_dic = {
    'Immune_Human': 'final_annotation',
    'Immune_human_mouse': 'final_annotation',
    'Lung': 'cell_type',
    'Mouse_brain': 'cell_type',
    'Pancreas': 'celltype',
    'Simulation1': 'Group',
    'Simulation2': 'Group',
}

In [None]:
# Dataset to integrate; it selects the batch and cell-type column names.
dataset = 'Immune_Human'
batch = batch_key_dic[dataset]
cell_type = cell_type_key_dic[dataset]
adata = load_data(dataset,batch)

# Separate copy that is handed to the R cells; `adata` itself stays untouched.
adata_sce = adata.copy()
# Densify whatever is not already a numpy array. X and the counts layer are
# checked independently: the counts layer used to be converted only when X
# was not dense, and the exact-type comparison rejected ndarray subclasses.
if not isinstance(adata_sce.X, np.ndarray):
    adata_sce.X = adata_sce.X.toarray()
if not isinstance(adata_sce.layers['counts'], np.ndarray):
    adata_sce.layers['counts'] = adata_sce.layers['counts'].toarray()
# Drop the unstructured and per-gene annotations from the copy.
# NOTE(review): presumably done to keep the conversion to R simple — confirm.
del adata_sce.uns
del adata_sce.var

In [None]:
# Show the full object first, then the trimmed copy prepared for R.
print(adata)
print(adata_sce)

In [None]:
import anndata2ri
# NOTE(review): presumably registers the AnnData <-> R object conversion used
# when variables are passed to %%R cells with -i — confirm against anndata2ri.
anndata2ri.activate()

# Loading the rpy2 extension enables the %%R cell magic,
# which runs R code in Jupyter notebook cells.
%load_ext rpy2.ipython

# Raise scanpy's log verbosity and print the installed package versions.
sc.settings.verbosity = 3
sc.logging.print_versions()

In [None]:
%%R -i adata_sce --i batch

# Seurat provides as.Seurat(), used on the next line.
library(Seurat)
# Convert the object imported with `-i adata_sce` into a Seurat object: the
# "counts" entry supplies the count matrix and "X" the normalised data.
adata_seu <- as.Seurat(adata_sce, counts = "counts", data = "X")


In [None]:
%%R
# Rename the assay "originalexp" to "RNA", then expose the object as `sobj`,
# the name the following cells work on.
adata_seu <- RenameAssays(object = adata_seu, originalexp = "RNA")
sobj <- adata_seu

In [None]:
import time
# Wall-clock start of the timed section; subtracted from end_time when the
# total run time is printed.
start_time = time.time()

In [None]:
%%R
# Load LIGER under whichever package name is installed: try `liger` first and
# fall back to `rliger` if require() raises a warning.
tryCatch(
  require(liger),
  warning = function(w) require(rliger)
)
require(Seurat)

# Only counts is converted to the liger object. To pass our own normalised
# data, store it in the "counts" slot.
if (is.null(sobj@assays$RNA)) {
  # Seurat v4: no assay reachable as sobj@assays$RNA, use the accessors.
  data <- GetAssayData(sobj, slot = "data")
  # SetAssayData() returns the modified object instead of changing `sobj` in
  # place; the result used to be discarded, which made this branch a no-op.
  sobj <- SetAssayData(sobj, slot = "counts", new.data = as.matrix(data))
} else {
  # Seurat v3: overwrite the counts slot of the RNA assay directly.
  sobj@assays$RNA@counts <- sobj@assays$RNA@data
}



In [None]:
%%R
# Feature (row) names of the Seurat object.
# NOTE(review): `hvg` is not referenced by any later cell shown here.
hvg <- rownames(sobj)

In [None]:
%%R
  # Build the Liger object from the single combined Seurat object.
  # `batch` (imported from Python) is passed as meta.var; renormalisation and
  # removal of missing entries are both switched off.
  lobj <- seuratToLiger(sobj, combined.seurat = TRUE, meta.var = batch,
                        renormalize = FALSE, remove.missing = FALSE)


In [None]:
%%R
  # Only normalised data was passed in, so register it as the normalised slot.
  lobj@norm.data <- lobj@raw.data

  # Select 2000 variable genes with LIGER's own routine.
  lobj <- selectGenes(lobj, num.genes = 2000)

 

In [None]:
%%R
# Scale without centring; supplying our own scaling is not possible at the moment.
lobj <- scaleNotCenter(lobj, remove.missing = FALSE)


In [None]:
%%R

# `small.clust.thresh` is not defined by any earlier cell, so the
# quantileAlignSNF() call below failed with "object not found" on a fresh
# kernel. Keep a value that was already set; otherwise fall back to 20.
# NOTE(review): 20 is assumed to match the default of scIB's runLiger() — confirm.
if (!exists("small.clust.thresh")) {
  small.clust.thresh <- 20
}

# Factorise, then align the factor loadings across datasets.
lobj <- optimizeALS(lobj, k = 20, thresh = 5e-5, nrep = 3)

lobj <- quantileAlignSNF(lobj, resolution = 0.4, small.clust.thresh = small.clust.thresh)

# Store embedding in initial Seurat object
# Code taken from ligerToSeurat() function from LIGER
inmf.obj <- new(
  Class = "DimReduc", feature.loadings = t(lobj@W),
  cell.embeddings = lobj@H.norm, key = "X_emb"
)
sobj@reductions['X_emb'] <- inmf.obj



In [None]:
# Stop the clock and report the elapsed wall-clock seconds.
end_time = time.time()
print(f'total time taken {end_time - start_time}')

In [None]:
%%R -i dataset
# Dump the normalised factor matrix to "<dataset>.csv" so Python can read it back.
write.csv(lobj@H.norm, file = paste0(dataset, ".csv"))

In [None]:
import pandas as pd
# Reload a fresh copy of the data and attach the LIGER embedding, with the
# CSV rows put into the same order as adata.obs_names.
adata = load_data(dataset, batch)
adata.obsm['X_emb'] = (
    pd.read_csv(dataset + '.csv', index_col=0)
    .loc[list(adata.obs_names), :]
    .values
)

In [None]:
import scIB
# Full metric sweep on the 'X_emb' embedding; the same object is passed for
# both positional arguments. The trajectory metric needs a precomputed
# pseudotime, so it is switched off here.
# NOTE(review): organism='mouse' although dataset is 'Immune_Human' — confirm
# this is irrelevant while cell_cycle_=False.
results, ilisi_all, clisi_all, kbet_all = scIB.metrics.metrics(
    adata,
    adata,
    batch_key=batch,
    label_key=cell_type,
    hvg_score_=False,
    cluster_key='cluster',
    cluster_nmi=None,
    ari_=True,
    nmi_=True,
    nmi_method='arithmetic',
    nmi_dir=None,
    silhouette_=True,
    embed='X_emb',
    si_metric='euclidean',
    pcr_=True,
    cell_cycle_=False,
    organism='mouse',
    isolated_labels_=True,  # backwards compatibility
    isolated_labels_f1_=True,
    isolated_labels_asw_=True,
    n_isolated=None,
    graph_conn_=True,
    kBET_=True,
    kBET_sub=0.5,
    lisi_graph_=True,
    lisi_raw=True,
    trajectory_=False,
    type_=None,
    verbose=False,
)

In [None]:
# Display the first value returned by the metrics call above.
results

In [None]:
import numpy as np
# Write the iLISI, cLISI and kBET outputs to CSV files prefixed with the
# dataset name.
np.savetxt(dataset + "_ilisi.csv", ilisi_all, delimiter=",")
np.savetxt(dataset + "_clisi.csv", clisi_all, delimiter=",")
# Every kBET entry is flattened into one row before the rows are stacked.
np.savetxt(
    dataset + "_kbet_all.csv",
    np.vstack([np.array(scores).reshape(1, -1) for scores in kbet_all]),
    delimiter=',',
)

In [None]:
import matplotlib as mpl
# Render figures at 300 dpi from here on.
mpl.rcParams['figure.dpi'] = 300

In [None]:
# Neighbour graph on the LIGER embedding, then UMAP coloured first by cell
# type and then by batch.
sc.pp.neighbors(adata, use_rep='X_emb')
sc.tl.umap(adata)
for _obs_key in (cell_type, batch):
    sc.pl.umap(adata, color=_obs_key, frameon=False)

In [None]:
import scIB
# Second pass with only ARI and the trajectory metric enabled.
# NOTE(review): the trajectory metric asks for a precomputed pseudotime —
# confirm that `adata` carries one before running this cell.
results, ilisi_all, clisi_all, kbet_all = scIB.metrics.metrics(
    adata,
    adata,
    batch_key=batch,
    label_key=cell_type,
    hvg_score_=False,
    cluster_key='cluster',
    cluster_nmi=None,
    ari_=True,
    nmi_=False,
    nmi_method='arithmetic',
    nmi_dir=None,
    silhouette_=False,
    embed='X_emb',
    si_metric='euclidean',
    pcr_=False,
    cell_cycle_=False,
    organism='mouse',
    isolated_labels_=False,  # backwards compatibility
    isolated_labels_f1_=False,
    isolated_labels_asw_=False,
    n_isolated=None,
    graph_conn_=False,
    kBET_=False,
    kBET_sub=0.5,
    lisi_graph_=False,
    lisi_raw=False,
    trajectory_=True,
    type_=None,
    verbose=False,
)
results

In [None]:
import scIB
# Third pass with only ARI and NMI enabled; the trajectory metric (which
# needs a precomputed pseudotime) and every other score are off.
results, ilisi_all, clisi_all, kbet_all = scIB.metrics.metrics(
    adata,
    adata,
    batch_key=batch,
    label_key=cell_type,
    hvg_score_=False,
    cluster_key='cluster',
    cluster_nmi=None,
    ari_=True,
    nmi_=True,
    nmi_method='arithmetic',
    nmi_dir=None,
    silhouette_=False,
    embed='X_emb',
    si_metric='euclidean',
    pcr_=False,
    cell_cycle_=False,
    organism='mouse',
    isolated_labels_=False,  # backwards compatibility
    isolated_labels_f1_=False,
    isolated_labels_asw_=False,
    n_isolated=None,
    graph_conn_=False,
    kBET_=False,
    kBET_sub=0.5,
    lisi_graph_=False,
    lisi_raw=False,
    trajectory_=False,
    type_=None,
    verbose=False,
)
results

In [None]:
# Save figures into a per-dataset folder. The folder name used to be
# hard-coded to 'Immune_Human'; deriving it from `dataset` yields the same
# path for that dataset and keeps other datasets from writing into it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
sc.settings.figdir = '/home/krushna/Documents/Data_integration/Figures/' + dataset + '/'
import matplotlib as mpl
mpl.rcParams['figure.dpi'] = 200
# Recompute the neighbour graph on the LIGER embedding and the UMAP layout.
sc.pp.neighbors(adata, use_rep = 'X_emb')
sc.tl.umap(adata)

In [None]:
# Save one UMAP per colouring: cell type, batch, and the 'cluster' column.
sc.pl.umap(adata, color=cell_type, frameon=False, save=f'_liger_{dataset}_cell_type.png')
sc.pl.umap(adata, color=batch, frameon=False, save=f'_liger_{dataset}_batch.png')
sc.pl.umap(adata, color='cluster', frameon=False, save=f'_liger_{dataset}_optimal_resolution.png')

In [None]:
# Display a summary of the annotated object.
adata

In [None]:
import pandas as pd
# Contingency table: rows are the values of adata.obs['cluster'], columns the
# cell-type labels. The label column is looked up through `cell_type` instead
# of the hard-coded 'final_annotation', which only exists for some datasets.
cluster_cell_type = pd.crosstab(
    np.array(adata.obs['cluster']),
    np.array(adata.obs[cell_type]),
)
cluster_cell_type.index.name = 'cluster'
cluster_cell_type

In [None]:
# `entropy` was not imported by any cell shown here, so this cell raised a
# NameError on a fresh kernel.
# NOTE(review): scipy.stats.entropy is assumed to be the intended function — confirm.
from scipy.stats import entropy

# One [column label, entropy of that column's cluster counts] row per column
# of the contingency table.
li = [[column, entropy(cluster_cell_type[column])] for column in cluster_cell_type]
pd.DataFrame(li)