# Time comparison

In this notebook we analyze well-known datasets of different sizes to see how long triku takes to process them. The time comparison is done with 3 settings:
* All features (after basic removal), 1 processor
* Filtered features (after removal, to get ~10000 features), 1 processor
* Filtered features, several processors (2, 4, 8, 16)

We will compare these times with the time required by the remaining steps of a basic preprocessing pipeline (neighbors graph, PCA, UMAP, leiden, etc.) to see how long running triku takes relative to the rest of the pipeline.

In [None]:
!pip uninstall triku -y
!cd ../triku && python setup.py install

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Imports here
import scanpy as sc
import scanpy.external as sce
import harmonypy as hpy

import numpy as np
import scipy.stats as sts
import scipy.optimize as opt
import scipy.signal as sgn
import pandas as pd

import triku as tk

import logging
import os
from tqdm.notebook import tqdm

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Special imports
from sklearn.decomposition import PCA
from umap.umap_ import fuzzy_simplicial_set, nearest_neighbors

# Parallel imports
import ray
from numba import jit, njit

random_state = 10

import time
import mygene

In [None]:
# Only let ray log messages of level ERROR or above.
ray.logger.setLevel(logging.ERROR)

# Suppress every Python warning from here on.
import warnings
warnings.filterwarnings('ignore')

In [None]:
from triku_nb_code.palettes_and_cmaps import magma, bold_and_vivid

In [None]:
def read_adata(dataset_file, backup_url):
    """Read a dataset, trying several scanpy readers in turn.

    The readers are tried in this order, falling through to the next one
    when the previous raises:

    1. ``sc.read(dataset_file, backup_url=backup_url)``
    2. ``sc.read_10x_h5(dataset_file, backup_url=backup_url)``
    3. ``sc.read_10x_mtx(dataset_file)`` (no ``backup_url``)

    Parameters
    ----------
    dataset_file : str
        Path handed to each reader.
    backup_url : str
        URL forwarded as ``backup_url`` to the first two readers.

    Returns
    -------
    Whatever the first successful reader returns.

    Raises
    ------
    Exception
        Whatever ``sc.read_10x_mtx`` raises if all three readers fail.
    """
    try:
        adata = sc.read(dataset_file, backup_url=backup_url)
    except Exception:
        # `except Exception` (not a bare `except`) so that KeyboardInterrupt /
        # SystemExit still abort a long read or download instead of silently
        # triggering the next reader.
        try:
            adata = sc.read_10x_h5(dataset_file, backup_url=backup_url)
        except Exception:
            adata = sc.read_10x_mtx(dataset_file)

    return adata

class _StepTimer:
    """Context manager that times one pipeline step.

    On a clean exit it prints ``label elapsed_seconds`` and stores the elapsed
    seconds in ``t_dict[label]``. Exceptions are never swallowed.
    """

    def __init__(self, label, t_dict):
        self.label = label
        self.t_dict = t_dict

    def __enter__(self):
        self._t0 = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if exc_type is None:
            elapsed = time.time() - self._t0
            print(self.label, elapsed)
            self.t_dict[self.label] = elapsed
        return False


def processing_pipeline(dataset_file, n_genes_by_counts=4000, n_neighbors=None,
                        min_pct_counts_mt=0, pct_counts_mt=25, min_genes=50, min_cells=10, transpose=False, backup_url='',
                        get_int=True, plot_graphs=True, prefilter=True, tsne_n_jobs=16):
    """Run a basic single-cell pipeline on a dataset and time every step.

    Parameters
    ----------
    dataset_file : str, list of str, or AnnData
        * str: path read through ``read_adata`` together with ``backup_url``.
        * list: URLs; each one is read through ``read_adata`` from
          ``<cwd>/data/<last URL component>`` with the URL itself as backup,
          and the resulting objects are concatenated.
        * AnnData: copied and used directly (``transpose`` is not applied).
    n_genes_by_counts : int
        Cells with ``n_genes_by_counts`` at or above this value are removed.
    n_neighbors : int or None
        Neighbors for ``sc.pp.neighbors``; ``None`` uses
        ``int(0.5 * sqrt(number of cells))``.
    min_pct_counts_mt, pct_counts_mt : float
        Only cells with ``min_pct_counts_mt < pct_counts_mt < pct_counts_mt``
        (both strict) are kept; with the default lower bound of 0, cells with
        exactly 0 % mitochondrial counts are removed.
    min_genes, min_cells : int
        Thresholds for the initial ``filter_cells`` / ``filter_genes`` calls.
    transpose : bool
        Transpose the data after reading (str and list inputs only).
    backup_url : str
        Backup URL used when ``dataset_file`` is a str.
    get_int : bool
        If False (and ``prefilter`` is True) the data is normalized and
        log1p-transformed; if True the values are left untouched.
    plot_graphs : bool
        Show QC violin/scatter plots (their time counts towards 'QC_metrics').
    prefilter : bool
        Run QC metrics, QC-based cell filtering and optional normalization.
    tsne_n_jobs : int
        ``n_jobs`` forwarded to ``sc.tl.tsne``.

    Returns
    -------
    dict
        Maps each step label (e.g. 'pca', 'knn', 'triku', 'UMAP') to the
        elapsed wall-clock seconds measured with ``time.time()``.

    Raises
    ------
    TypeError
        If ``dataset_file`` is not a str, list or AnnData.
    """
    t_dict = {}

    # Dataset loading and basic profiling
    if isinstance(dataset_file, str):
        adata = read_adata(dataset_file, backup_url).copy()
        if transpose:
            adata = adata.transpose()

    elif isinstance(dataset_file, list):
        list_adatas = [read_adata(os.getcwd() + '/data/' + i.split('/')[-1], backup_url=i) for i in dataset_file]
        if transpose:
            list_adatas = [i.transpose() for i in list_adatas]
        adata = sc.AnnData.concatenate(*list_adatas)

    elif isinstance(dataset_file, sc.AnnData):
        adata = dataset_file.copy()

    else:
        # Previously this fell through and failed later with an unbound `adata`.
        raise TypeError(
            'dataset_file must be a str, a list of str or an AnnData, got '
            + type(dataset_file).__name__
        )

    # If var names look like Ensembl gene ids (contain 'ENSMU' or 'ENSG'),
    # query mygene and replace them with gene symbols ('' when no symbol is returned).
    if any('ENSMU' in i or 'ENSG' in i for i in adata.var_names):
        mg = mygene.MyGeneInfo()
        query = mg.querymany(adata.var_names, scopes='ensembl.gene', returnall=True)
        # NOTE(review): assumes query['out'] has exactly one entry per var name, in order — confirm.
        genes = [i['symbol'] if 'symbol' in i else '' for i in query['out']]
        adata.var_names = genes

    adata.var_names_make_unique()

    with _StepTimer('gene/cell filter', t_dict):
        sc.pp.filter_cells(adata, min_genes=min_genes)
        sc.pp.filter_genes(adata, min_cells=min_cells)

    if prefilter:
        # Dataset profiling - get the graphs to set the n_genes_by_counts and pct_counts_mt
        with _StepTimer('QC_metrics', t_dict):
            # annotate the group of mitochondrial genes as 'mt'
            adata.var['mt'] = adata.var_names.str.startswith('MT-') | adata.var_names.str.startswith('mt-')
            sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)

            if plot_graphs:
                sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
                             jitter=0.4, multi_panel=True)
                sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
                sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')

        with _StepTimer('filter_cells_genes', t_dict):
            adata = adata[adata.obs.n_genes_by_counts < n_genes_by_counts, :]
            adata = adata[adata.obs.pct_counts_mt < pct_counts_mt, :]
            adata = adata[min_pct_counts_mt < adata.obs.pct_counts_mt, :]
            sc.pp.filter_cells(adata, min_genes=1)
            sc.pp.filter_genes(adata, min_counts=1)

        if not get_int:
            # Normalize and log-transforming
            with _StepTimer('normalize', t_dict):
                sc.pp.normalize_total(adata)

            with _StepTimer('log1p', t_dict):
                sc.pp.log1p(adata)

    print(adata)

    with _StepTimer('pca', t_dict):
        sc.pp.pca(adata)

    if n_neighbors is None:
        n_neighbors = int(0.5 * len(adata) ** 0.5)

    # Preparation for FS (PCA, kNN, etc.)
    if 'batch' in adata.obs:
        print('batch in adata')
        # NOTE(review): bbknn is timed here, but sc.pp.neighbors below presumably
        # replaces the graph it stores — confirm bbknn is only run for its timing.
        with _StepTimer('bbknn', t_dict):
            sce.pp.bbknn(adata)

        with _StepTimer('harmony', t_dict):
            sce.pp.harmony_integrate(adata, key='batch')

        with _StepTimer('knn', t_dict):
            sc.pp.neighbors(adata, n_neighbors=n_neighbors, metric='cosine', use_rep='X_pca_harmony')
    else:
        print('batch NOT in adata')
        with _StepTimer('knn', t_dict):
            sc.pp.neighbors(adata, n_neighbors=n_neighbors, metric='cosine')

    # FS (the extra gene filter is deliberately left out of the triku timing)
    sc.pp.filter_genes(adata, min_cells=5)

    with _StepTimer('triku', t_dict):
        tk.tl.triku(adata)

    # Dimensionality reduction
    with _StepTimer('t-SNE', t_dict):
        sc.tl.tsne(adata, n_jobs=tsne_n_jobs)

    with _StepTimer('UMAP', t_dict):
        sc.tl.umap(adata)

    # Clustering
    with _StepTimer('leiden', t_dict):
        sc.tl.leiden(adata)

    with _StepTimer('louvain', t_dict):
        sc.tl.louvain(adata)

    # PAGA / trajectory analysis
    with _StepTimer('PAGA', t_dict):
        sc.tl.paga(adata)

    return t_dict

# PBMC datasets (1, 5, 10, 68k)

## 1k

In [None]:
# Time the pipeline on the 1k PBMC dataset; the 10x URL is passed as backup_url.
t_dict_pbmc_1k = processing_pipeline(
    dataset_file=os.getcwd() + '/data/10x/pbmc_1k_v3_filtered_feature_bc_matrix.h5',
    backup_url='https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_1k_v3/pbmc_1k_v3_filtered_feature_bc_matrix.h5',
    n_genes_by_counts=3000,
    pct_counts_mt=20,
    plot_graphs=False,
)

In [None]:
# Show the per-step timings (seconds) for the 1k PBMC run.
t_dict_pbmc_1k

## 5k

In [None]:
# Time the pipeline on the 5k PBMC dataset; the 10x URL is passed as backup_url.
t_dict_pbmc_5k = processing_pipeline(
    dataset_file=os.getcwd() + '/data/10x/5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5',
    backup_url='https://cf.10xgenomics.com/samples/cell-exp/3.0.2/5k_pbmc_protein_v3/5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5',
    n_genes_by_counts=4000,
    pct_counts_mt=20,
    plot_graphs=False,
)

In [None]:
# Show the per-step timings (seconds) for the 5k PBMC run.
t_dict_pbmc_5k

## 10k

In [None]:
# Time the pipeline on the 10k PBMC dataset (raw feature matrix).
# The backup URL needs its `https://` scheme, like the 1k and 5k cells; without it
# the URL is not a valid download address when the local file is missing.
t_dict_pbmc_10k = processing_pipeline(dataset_file=os.getcwd() + '/data/10x/pbmc_10k_v3_raw_feature_bc_matrix.h5', n_genes_by_counts=4000, pct_counts_mt=20, plot_graphs=False,
                                     backup_url='https://cf.10xgenomics.com/samples/cell-exp/3.0.0/pbmc_10k_v3/pbmc_10k_v3_raw_feature_bc_matrix.h5')

In [None]:
# Show the per-step timings (seconds) for the 10k PBMC run.
t_dict_pbmc_10k

## 68k

In [None]:
# Download the 68k PBMC archive into <cwd>/data/10x (wget only runs if the `cd` succeeds).
!cd {os.getcwd()}/data/10x && wget https://cf.10xgenomics.com/samples/cell-exp/1.1.0/fresh_68k_pbmc_donor_a/fresh_68k_pbmc_donor_a_filtered_gene_bc_matrices.tar.gz

In [None]:
# Unpack the downloaded archive inside <cwd>/data/10x.
!cd {os.getcwd()}/data/10x && tar zxvf fresh_68k_pbmc_donor_a_filtered_gene_bc_matrices.tar.gz

In [None]:
# Time the pipeline on the 68k PBMC dataset, read from the extracted matrix directory
# (no backup URL; the files come from the download/extract cells above).
t_dict_pbmc_68k = processing_pipeline(
    dataset_file=os.getcwd() + '/data/10x/filtered_matrices_mex/hg19',
    backup_url='',
    n_genes_by_counts=1000,
    pct_counts_mt=5,
    n_neighbors=30,
    plot_graphs=False,
)

In [None]:
# Show the per-step timings (seconds) for the 68k PBMC run.
t_dict_pbmc_68k

# Kidney organoid (300k, batches)


In [None]:
# Sample identifiers used to pick out each kidney sample's files by name.
# NOTE: `list_GSM` is reassigned later in the T-cell section.
list_GSM = ['GSM4044536', 'GSM4044537', 'GSM4044538', 'GSM4044539', 'GSM4044540', 'GSM4044541', 'GSM4044542', 'GSM4044543', 'GSM4044544']

In [None]:
# Working directory for the kidney organoid data; created if it does not exist.
kidney_org_dir = os.getcwd() + '/data/kidney'
os.makedirs(kidney_org_dir, exist_ok=True)

In [None]:
# If it doesn't work, download it manually
!wget https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE136314&format=file -O {kidney_org_dir}/GSE136314_RAW.tar

In [None]:
# Unpack the downloaded archive into the kidney data directory.
!tar -xvf {kidney_org_dir}/GSE136314_RAW.tar -C {kidney_org_dir}

In [None]:
# Move each sample's matrix / barcodes / genes files into its own folder under
# the standard names matrix.mtx.gz, barcodes.tsv.gz and features.tsv.gz.
# The cell can be re-run: files that were already moved are left alone, and a
# missing file raises a clear error instead of an IndexError.
for GSM in list_GSM:
    files = [i for i in os.listdir(kidney_org_dir) if GSM in i]
    sample_dir = f'{kidney_org_dir}/{GSM}'
    os.makedirs(sample_dir, exist_ok=True)
    for pattern, target in (('gene_count.mtx', 'matrix.mtx.gz'),
                            ('barcodes.tsv.gz', 'barcodes.tsv.gz'),
                            ('genes.tsv.gz', 'features.tsv.gz')):
        matches = [i for i in files if pattern in i]
        if matches:
            os.rename(f'{kidney_org_dir}/{matches[0]}', f'{sample_dir}/{target}')
        elif not os.path.exists(f'{sample_dir}/{target}'):
            raise FileNotFoundError(
                f'No file matching {pattern!r} for {GSM} in {kidney_org_dir}, '
                f'and {sample_dir}/{target} does not exist'
            )

In [None]:
# Load every kidney sample (matrix transposed after reading; names come from the
# first column of the features/barcodes files), then concatenate with an outer
# join and save the result as h5ad.
list_adatas = []
for GSM in list_GSM:
    print(GSM)
    sample_dir = f'{kidney_org_dir}/{GSM}'
    adata = sc.read_mtx(f'{sample_dir}/matrix.mtx.gz').transpose()
    feature_table = pd.read_csv(f'{sample_dir}/features.tsv.gz', header=None)
    barcode_table = pd.read_csv(f'{sample_dir}/barcodes.tsv.gz', header=None)
    adata.var_names = feature_table[0]
    adata.obs_names = barcode_table[0]
    list_adatas.append(adata)

adata_kidney = sc.AnnData.concatenate(*list_adatas, join='outer')
adata_kidney.write_h5ad(f'{kidney_org_dir}/adata_kidney.h5ad')

In [None]:
# Reload the concatenated kidney object from the h5ad file written above.
adata_kidney = sc.read(f'{kidney_org_dir}/adata_kidney.h5ad')

In [None]:
# Time the pipeline on the concatenated kidney organoid object.
t_dict_kidney = processing_pipeline(
    adata_kidney,
    n_genes_by_counts=3500,
    n_neighbors=30,
    pct_counts_mt=5,
    min_cells=100,
    plot_graphs=False,
    prefilter=True,
)

# Reynolds skin (500k, batches)

In [None]:
# Time the pipeline on the Reynolds skin dataset; the Zenodo URL is passed as backup_url.
t_dict_reynolds = processing_pipeline(
    dataset_file=os.getcwd() + '/data/submission_210120.h5ad',
    backup_url='https://zenodo.org/record/4536165/files/submission_210120.h5ad',
    n_genes_by_counts=3000,
    n_neighbors=30,
    min_pct_counts_mt=0,
    pct_counts_mt=15,
    min_genes=50,
    min_cells=10,
    transpose=False,
    get_int=True,
    plot_graphs=False,
    prefilter=True,
)

In [None]:
# NOTE: the original content of this cell was the leftover debugging expression
#     arr_counts.sum(0) == 0
# `arr_counts` is never defined in this notebook, so it raised NameError on
# Restart & Run All. Show the timings instead, as the PBMC sections do.
t_dict_reynolds

# T cells (60k, batches)

In [None]:
# Working directory for the T-cell data; created if it does not exist.
tcell_dir = os.getcwd() + '/data/tcell'
os.makedirs(tcell_dir, exist_ok=True)

In [None]:
# If it doesn't work, download it manually
!wget https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE150132 -O {tcell_dir}/GSE150132_RAW.tar

In [None]:
# Unpack the downloaded archive into the T-cell data directory.
!tar -xvf {tcell_dir}/GSE150132_RAW.tar -C {tcell_dir}

In [None]:
# Move each T-cell sample's matrix / barcodes / features files into its own folder
# under the standard names matrix.mtx.gz, barcodes.tsv.gz and features.tsv.gz.
# The cell can be re-run: files that were already moved are left alone, and a
# missing file raises a clear error instead of an IndexError.
list_GSM = ['GSM4524029', 'GSM4524030', 'GSM4524031', 'GSM4524032', 'GSM4524033']
for GSM in list_GSM:
    files = [i for i in os.listdir(tcell_dir) if GSM in i]
    sample_dir = f'{tcell_dir}/{GSM}'
    os.makedirs(sample_dir, exist_ok=True)
    for pattern, target in (('matrix.mtx', 'matrix.mtx.gz'),
                            ('barcodes.tsv.gz', 'barcodes.tsv.gz'),
                            ('features.tsv.gz', 'features.tsv.gz')):
        matches = [i for i in files if pattern in i]
        if matches:
            os.rename(f'{tcell_dir}/{matches[0]}', f'{sample_dir}/{target}')
        elif not os.path.exists(f'{sample_dir}/{target}'):
            raise FileNotFoundError(
                f'No file matching {pattern!r} for {GSM} in {tcell_dir}, '
                f'and {sample_dir}/{target} does not exist'
            )

In [None]:
# Load every T-cell sample (matrix transposed after reading; names come from the
# first column of the features/barcodes files) and concatenate with an outer join.
list_adatas = []
for GSM in list_GSM:
    print(GSM)
    sample_dir = f'{tcell_dir}/{GSM}'
    adata = sc.read_mtx(f'{sample_dir}/matrix.mtx.gz').transpose()
    feature_table = pd.read_csv(f'{sample_dir}/features.tsv.gz', header=None)
    barcode_table = pd.read_csv(f'{sample_dir}/barcodes.tsv.gz', header=None)
    adata.var_names = feature_table[0]
    adata.obs_names = barcode_table[0]
    list_adatas.append(adata)

adata_tcell = sc.AnnData.concatenate(*list_adatas, join='outer')
# Each var name is a tab-separated record; keep only its second field.
adata_tcell.var_names = [record.split('\t')[1] for record in adata_tcell.var_names]
adata_tcell.write_h5ad(f'{tcell_dir}/adata_tcell.h5ad')

In [None]:
# Reload the concatenated T-cell object from the h5ad file written above.
adata_tcell = sc.read(f'{tcell_dir}/adata_tcell.h5ad')

In [None]:
# Time the pipeline on the concatenated T-cell object.
t_dict_tcell = processing_pipeline(
    adata_tcell,
    n_genes_by_counts=2000,
    n_neighbors=30,
    min_pct_counts_mt=0,
    pct_counts_mt=8,
    min_genes=50,
    min_cells=10,
    transpose=False,
    backup_url='',
    get_int=True,
    plot_graphs=False,
    prefilter=True,
)