In [None]:
# Load IPython's autoreload extension so imported modules can be reloaded in place.
%load_ext autoreload

In [None]:
# Mode 2: reload all imported modules before each cell is executed.
%autoreload 2

In [None]:
import triku as tk
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as spr
import scipy.stats as sts
import os
import gc
from itertools import product
import pickle
import ray
import itertools

from tqdm.notebook import tqdm

from bokeh.io import show, output_notebook, reset_output
from bokeh.plotting import figure
from bokeh.models import LinearColorMapper

import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib.lines import Line2D

from sklearn.metrics import adjusted_rand_score as ARS
from sklearn.metrics import adjusted_mutual_info_score as NMI
from sklearn.metrics import silhouette_score, davies_bouldin_score

import time 

# Clear any previously configured Bokeh output state, then direct Bokeh
# output to the notebook.
reset_output()
output_notebook()

In [None]:
# Render every matplotlib figure in this notebook at 150 dpi.
mpl.rcParams.update({'figure.dpi': 150})

In [None]:
!python setup.py install

In [None]:
# `save_dir` is where the input loom file is read from; `data_dir` is where
# the timing results are written.
save_dir, data_dir = "../data/splatter/", "../data/comp_times/"

# Make sure the results directory exists before anything is written into it.
os.makedirs(data_dir, exist_ok=True)

In [None]:
# Read the full dataset that the benchmark below subsamples from.
adata = sc.read_loom(f"{save_dir}/splatter_large.loom")

In [None]:
# Benchmark grid. Every combination of these values is timed once per trial.
n_cores = [2 ** exponent for exponent in range(6)]  # 1, 2, 4, 8, 16, 32 processes
per_cells = [1, 2, 5, 10, 20, 50]                   # percentage of cells kept
per_genes = [5, 10, 20, 50, 100]                    # percentage of genes kept
trials = range(5)                                   # repeat index per combination

In [None]:
# (Re)create the results file containing only the tab-separated header row.
# The context manager guarantees the handle is closed even if the write fails.
with open(data_dir + '/results_triku_parallel_scatter.txt', 'w') as f:
    f.write("n_cores\tper_cell\tn_cells\tper_gene\tn_genes\ttrial\ttime\n")

In [None]:
# Time tk.tl.triku on random subsamples of `adata` for every combination of
# core count, cell percentage, gene percentage and trial, appending one
# tab-separated row per run to the results file.
results_path = data_dir + '/results_triku_parallel_scatter.txt'
n_total_genes, n_total_cells = len(adata.var_names), len(adata.obs_names)

# Dedicated seeded generator: subsamples are reproducible across reruns of
# this cell without touching NumPy's global random state.
rng = np.random.RandomState(0)

for n_core, per_cell, per_gene, trial in itertools.product(n_cores, per_cells, per_genes, trials):
    n_genes = int(n_total_genes * per_gene / 100)
    n_cells = int(n_total_cells * per_cell / 100)

    # Sample WITHOUT replacement so the subsample holds exactly n_genes
    # distinct genes and n_cells distinct cells (the default, replace=True,
    # would duplicate rows/columns and overstate the recorded sizes).
    gene_idx = np.sort(rng.choice(n_total_genes, n_genes, replace=False))
    cell_idx = np.sort(rng.choice(n_total_cells, n_cells, replace=False))
    print(n_genes, n_cells)

    # Slice rows first, then columns, and wrap the matrix in a fresh AnnData.
    adatasub = sc.AnnData(adata.X[cell_idx, :][:, gene_idx])
    print(adatasub)

    # Remove genes with no counts in the subsample; done outside the timed
    # region. The n_genes written below is the requested size, taken before
    # this filter.
    sc.pp.filter_genes(adatasub, min_counts=1)

    # perf_counter is monotonic and high resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    t0 = time.perf_counter()
    tk.tl.triku(adatasub, n_procs=n_core)
    dt = time.perf_counter() - t0

    # Append immediately so finished runs survive an interruption.
    with open(results_path, 'a') as f:
        f.write(f"{n_core}\t{per_cell}\t{n_cells}\t{per_gene}\t{n_genes}\t{trial}\t{dt}\n")

    # Release the subsample before the next iteration allocates a new one.
    del adatasub

In [None]:
# Load the tab-separated timing table written by the benchmark loop.
df = pd.read_csv(f"{data_dir}/results_triku_parallel_scatter.txt", sep="\t")

In [None]:
# Plot configuration: the plot keeps rows where `col_fix == fix_val` and
# draws one line per distinct value of `col_var`.
col_fix, fix_val = 'n_cells', 2000
col_var = 'n_genes'

# Distinct values present in the results table, in ascending order.
n_cores_vals = list(set(df['n_cores'].values))
n_cores_vals.sort()
n_var_vals = list(set(df[col_var].values))
n_var_vals.sort()

In [None]:
# Mean triku run time against the number of cores, one line per distinct
# value of `col_var`, restricted to runs where `col_fix == fix_val`.
palette = ['#00429d', '#455dad', '#6b7abe', '#8c98ce', '#adb7df', '#cdd7ef', '#edf9ff']

fig, ax = plt.subplots(1, 1)

# Apply the fixed-column filter once instead of on every inner iteration.
fixed_df = df[df[col_fix] == fix_val]

for var_val_idx, var_val in enumerate(n_var_vals):
    # Cycle through the palette so more than len(palette) series cannot
    # raise an IndexError.
    color = palette[var_val_idx % len(palette)]

    # Series.mean() yields NaN for an empty selection, leaving a gap in the
    # line instead of emitting a NumPy empty-slice warning.
    means = [
        fixed_df.loc[(fixed_df[col_var] == var_val) & (fixed_df['n_cores'] == core), 'time'].mean()
        for core in n_cores_vals
    ]

    ax.plot(n_cores_vals, means, color=color, label=var_val)
    ax.scatter(n_cores_vals, means, color=color, s=10)

ax.set_xlabel('n_cores')
ax.set_ylabel('time (s)')
ax.set_title(f"{col_fix} = {fix_val}")
ax.legend(title=col_var);

In [None]:
# Display the distinct values of the varied column (one per plotted line).
n_var_vals

In [None]:
# Display the distinct cell counts present in the results, in ascending order.
sorted({*df['n_cells'].values})