# Additional comparisons

In this notebook we gather secondary comparisons that were added during the review phase of the paper. Some of these comparisons may not appear in the paper, but are used to answer reviewers' questions.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import scanpy as sc
import triku as tk
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import seaborn as sns
from sklearn.decomposition import PCA
from itertools import product

In [None]:
import ray

In [None]:
from triku_nb_code.comparing_feat_sel import plot_CV_scores

In [None]:
# Feature-selection (FS) method identifiers; passed as `FS_methods` to plot_CV_scores
# and iterated over in the DEG analysis further down.
list_methods_all = ['triku', 'm3drop', 'nbumi', 'scanpy', 'seurat', 'sct', 'scry', 'std', 'brennecke', 'all', 'random']

# One colour per method, passed as `palette` together with `list_methods_all`.
# NOTE(review): the labels below list 'std' before 'scry', whereas `list_methods_all`
# lists 'scry' before 'std'. If plot_CV_scores pairs colours with methods by position,
# these two labels (or the two colours) are swapped -- confirm against plot_CV_scores.
palette = [
        '#e91e63',  # triku
        '#81c784',  # m3drop
        '#388e3c',  # nbumi
        '#90caf9',  # scanpy
        '#2196f3',  # seurat
        '#1565c0',  # sctransform
        '#ff9800',  # std
        '#ff5722',  # scry
        '#ffca28',  # brennecke
        '#A5B1C2',  # all
        '#4B6584',  # random
]

In [None]:
# Dataset folders, resolved against the directory the notebook is run from.
mereu_dir, ding_dir = (f'{os.getcwd()}/data/{lab_name}_2020/' for lab_name in ('Mereu', 'Ding'))

In [None]:
# Optimized number of features to select per dataset: one dict per lab, built from
# the `n_features` column of the lab's CSV and keyed by that CSV's index column.
ding_n_feat, mereu_n_feat = (
    dict(pd.read_csv(f'{os.getcwd()}/data/{lab_name}_n_genes.csv', sep=',', index_col=0)['n_features'])
    for lab_name in ('Ding', 'Mereu')
)

In [None]:
def preprocessing(adatax):
    """Run the notebook's standard scanpy preprocessing on `adatax` and return it.

    The steps are applied in this order: `filter_cells(min_genes=100)`,
    `filter_genes(min_cells=3)`, `normalize_total(target_sum=1e4)` and `log1p`.
    None of the calls captures a return value, so the object passed in is the
    one that gets returned.
    """
    pipeline = (
        (sc.pp.filter_cells, {'min_genes': 100}),
        (sc.pp.filter_genes, {'min_cells': 3}),
        (sc.pp.normalize_total, {'target_sum': 1e4}),
        (sc.pp.log1p, {}),
    )
    for step, options in pipeline:
        step(adatax, **options)
    return adatax

def PCA_knn(adatax, seed):
    """Store a 30-component whitened PCA in `adatax` and build a cosine kNN graph.

    Parameters
    ----------
    adatax
        Annotated data object. `adatax.X` may be a sparse matrix (anything
        exposing `toarray`) or an already dense array.
    seed
        Random state forwarded to both `PCA` and `sc.pp.neighbors`.

    Side effects
    ------------
    Writes the PCA coordinates to `adatax.obsm['X_pca']` and calls
    `sc.pp.neighbors` on `adatax`. Returns None.
    """
    # Decide explicitly whether X needs densifying. The previous bare `except:`
    # also swallowed genuine PCA failures and silently retried on the raw matrix.
    X = adatax.X.toarray() if hasattr(adatax.X, 'toarray') else adatax.X

    adatax.obsm['X_pca'] = PCA(
        n_components=30, whiten=True, svd_solver="auto", random_state=seed,
    ).fit_transform(X)

    # Neighbourhood size: floor of the square root of len(adatax).
    sc.pp.neighbors(adatax, random_state=seed, metric='cosine', n_neighbors=int(len(adatax) ** 0.5))

## Train a classifier on labeled data with different FS methods and study accuracy

* Get dataset (for instance, Ding 2020: 10X human)
* Use different FS methods on that dataset to obtain one matrix each time
* Train classifier on each differently feature-selected dataset
* Compute accuracy (10-fold cross validation)

In [None]:
def cross_validation(adata_whole, ranking, model, n_features, n_folds, seed=0):
    """Cross-validate a classifier on PCA features obtained after feature selection.

    For every column of `ranking` (one per FS method), plus the two baselines
    `'all'` (every ranked feature) and `'random'` (`n_features` features drawn
    without replacement), the function subsets `adata_whole` to the selected
    features, recomputes the PCA on that subset and scores `model` with
    stratified k-fold cross-validation (accuracy).

    Parameters
    ----------
    adata_whole
        Annotated data object with labels in `obs['cell_types']`.
    ranking : pandas.DataFrame
        Indexed by feature name, one column of ranks per FS method; the
        `n_features` smallest values of a column are selected.
    model : {'decision_tree', 'knn', 'svc'}
        Classifier to evaluate.
    n_features : int
        Number of features kept for each FS method and for `'random'`.
    n_folds : int
        Number of cross-validation folds.
    seed : int, default 0
        Seed for the PCA, the classifiers, the fold shuffling and the random
        feature baseline.

    Returns
    -------
    pandas.DataFrame
        `n_folds` rows, one column of accuracies per FS method / baseline.

    Raises
    ------
    ValueError
        If `model` is not one of the supported identifiers.
    """
    classifier_factories = {
        'decision_tree': lambda: DecisionTreeClassifier(class_weight='balanced', random_state=seed),
        'knn': lambda: KNeighborsClassifier(n_neighbors=10, n_jobs=8),
        'svc': lambda: SVC(class_weight='balanced', random_state=seed),
    }
    # Fail early with a clear message instead of hitting an undefined `clf` later.
    if model not in classifier_factories:
        raise ValueError(f"Unknown model {model!r}; expected one of {sorted(classifier_factories)}")

    # Dedicated generator so that the 'random' baseline is reproducible for a given seed.
    rng = np.random.RandomState(seed)

    fs_methods = ranking.columns.tolist() + ['all', 'random']
    cv_scores = pd.DataFrame(index=np.arange(n_folds), columns=fs_methods)
    for fs in fs_methods:
        ###### SELECT FEATURES ######
        if fs == 'all':
            selected_features = ranking.index.tolist()
        elif fs == 'random':
            idxs = rng.choice(len(ranking), n_features, replace=False)
            selected_features = ranking.index[idxs].tolist()
        else:
            selected_features = ranking.sort_values(by=fs)[fs][0:n_features].index.tolist()

        # Set lookup keeps the filter linear in the number of genes.
        selected_set = set(selected_features)
        # Materialize the subset: PCA_knn writes into the object it receives.
        adata_sel = adata_whole[:, [var for var in adata_whole.var_names if var in selected_set]].copy()

        # The PCA must be computed on the feature-selected subset. The previous code
        # passed the notebook-global `adata`, so the selection had no effect on X.
        PCA_knn(adata_sel, seed)

        ###### BUILD CLASSIFIER INPUT ######
        X = adata_sel.obsm['X_pca']
        y = adata_sel.obs['cell_types']

        ###### SELECT MODEL ######
        clf = classifier_factories[model]()

        ###### COMPUTE CROSS-VALIDATION SCORE ######
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        scores = cross_val_score(clf, X, y, cv=skf, scoring='accuracy')
        cv_scores.loc[:, fs] = scores
    return cv_scores

In [None]:
# datasets: base names (extension stripped) of the .h5ad files found in each data folder.
# - os.path.splitext removes only the final extension, so dots inside a dataset name survive
#   (split('.')[0] would truncate the name and break the later f'{dset}.h5ad' reads).
# - the suffix test includes the dot so that only real `.h5ad` files match.
# - sorted() gives a stable processing order; os.listdir returns entries in arbitrary order.
ding_datasets = sorted(os.path.splitext(fname)[0] for fname in os.listdir(ding_dir) if fname.endswith('.h5ad'))
mereu_datasets = sorted(os.path.splitext(fname)[0] for fname in os.listdir(mereu_dir) if fname.endswith('.h5ad'))

In [None]:
# models: classifier identifiers to evaluate; each one must match a `model` value
# that cross_validation checks for ('decision_tree', 'knn' or 'svc').
models = ['decision_tree', 'svc', 'knn']

### Mereu datasets

In [None]:
# Cross-validate every classifier on each Mereu dataset and write one CSV of
# per-fold scores per (dataset, model) pair.
n_folds = 10
mereu_cv_dir = f'{os.getcwd()}/exports/effect_FS_on_classifiers/fixed_n_features/mereu/'
os.makedirs(mereu_cv_dir, exist_ok=True)

for dset in mereu_datasets:
    print(dset)
    adata = preprocessing(sc.read(f'{mereu_dir}/{dset}.h5ad'))
    n_feat = int(mereu_n_feat[dset])

    # cross_validation reads labels from obs['cell_types']; mirror 'CellType' there when present.
    if 'CellType' in adata.obs.columns:
        adata.obs['cell_types'] = adata.obs['CellType']

    ranking = pd.read_csv(f'{os.getcwd()}/exports/comparisons/mereu_{dset}-log_feature_ranks.csv', index_col=0)
    for model in models:
        cv_scores = cross_validation(adata, ranking, model, n_features=n_feat, n_folds=n_folds)
        cv_scores.to_csv(f'{mereu_cv_dir}CV_scores_{dset}_{model}_{n_feat}_{n_folds}-fold.csv')

### Ding datasets

In [None]:
# Cross-validate every classifier on each Ding dataset and write one CSV of
# per-fold scores per (dataset, model) pair.
n_folds = 10
ding_cv_dir = f'{os.getcwd()}/exports/effect_FS_on_classifiers/fixed_n_features/ding/'
os.makedirs(ding_cv_dir, exist_ok=True)

for dset in ding_datasets:
    print(dset)
    adata = preprocessing(sc.read(f'{ding_dir}/{dset}.h5ad'))
    n_feat = int(ding_n_feat[dset])

    # cross_validation reads labels from obs['cell_types']; mirror 'CellType' there when present.
    if 'CellType' in adata.obs.columns:
        adata.obs['cell_types'] = adata.obs['CellType']

    ranking = pd.read_csv(f'{os.getcwd()}/exports/comparisons/ding_{dset}-log_feature_ranks.csv', index_col=0)
    for model in models:
        cv_scores = cross_validation(adata, ranking, model, n_features=n_feat, n_folds=n_folds)
        cv_scores.to_csv(f'{ding_cv_dir}CV_scores_{dset}_{model}_{n_feat}_{n_folds}-fold.csv')

#### Figure 4C and 5C

In [None]:
# Decision-tree CV scores for each lab, read from that lab's export folder.
for lab in ('ding', 'mereu'):
    plot_CV_scores(
        lab=lab,
        org='',
        CV_method='decision_tree',
        FS_methods=list_methods_all,
        palette=palette,
        sort_values='descending',
        read_dir=f'{os.getcwd()}/exports/effect_FS_on_classifiers/fixed_n_features/{lab}',
        filename=f"decision_tree_{lab}.pdf",
    )

#### Figure S2C and S3C

In [None]:
# kNN CV scores for each lab, read from that lab's export folder.
for lab in ('ding', 'mereu'):
    plot_CV_scores(
        lab=lab,
        org='',
        CV_method='knn',
        FS_methods=list_methods_all,
        palette=palette,
        sort_values='descending',
        read_dir=f'{os.getcwd()}/exports/effect_FS_on_classifiers/fixed_n_features/{lab}',
        filename=f"knn_{lab}.pdf",
    )

## DEG analysis

In this section, we are going to analyze the ability to select DEGs from more informative populations in biological datasets. This would be a complement to the use of ARI in biological datasets. The approach will be the following:
* Control: populations are the manually curated populations.
* FS + all + random: create the specific adata with those selected features and run leiden to match the number of populations manually labelled.
* Run DEGs in all cases with alpha = 0.01. 
* Extract relevant information from that analysis.

In [None]:
# Output folder for the UMAP coordinates, DEG tables and cluster assignments
# written by extract_DEGs_UMAP_leiden_adata.
os.makedirs(f'{os.getcwd()}/exports/DEGs_UMAP_leiden', exist_ok=True)

In [None]:
@ray.remote
def extract_DEGs_UMAP_leiden_adata(lab, org, method, seed=0):
    """Cluster one dataset after each feature selection and export UMAPs and DEGs.

    For every entry of `list_methods_all` the dataset is subset to the selected
    features, embedded (PCA + kNN + UMAP) and clustered with leiden. The leiden
    resolution is searched by bisection so that the number of clusters matches
    the number of annotated cell types whenever a tried resolution achieves it.
    DEGs between the resulting clusters are ranked with a Wilcoxon test.

    Parameters
    ----------
    lab : {'ding', 'mereu'}
        Selects the data folder and the per-dataset feature counts.
    org, method : str
        Organism and technique; the dataset file is `{method}_{org}.h5ad`.
    seed : int, default 0
        Seed for PCA/kNN, UMAP, leiden and the random feature baseline.

    Returns
    -------
    0 when the dataset file does not exist, otherwise None.

    Side effects
    ------------
    Writes into `exports/DEGs_UMAP_leiden/`: one UMAP `.txt` and three DEG
    `.csv` files (names, pvals_adj, scores) per FS method, plus one
    `populations_*.csv` with the cluster assignment of every cell.

    Raises
    ------
    ValueError
        If `lab` is not 'ding' or 'mereu'.
    KeyError
        If the dataset has neither a 'cell_types' nor a 'CellType' annotation.
    """
    adata_path = f'{os.getcwd()}/data/{lab.capitalize()}_2020/{method}_{org}.h5ad'
    # Not every lab/organism/technique combination exists; skip the missing ones
    # explicitly instead of hiding every possible read error behind a bare except.
    if not os.path.exists(adata_path):
        return 0
    adata = sc.read(adata_path)
    df_ranks = pd.read_csv(f'{os.getcwd()}/exports/comparisons/{lab}_{method}_{org}-log_feature_ranks.csv', index_col=0)

    n_feat_by_lab = {'ding': ding_n_feat, 'mereu': mereu_n_feat}
    if lab not in n_feat_by_lab:
        raise ValueError(f"Unknown lab {lab!r}; expected 'ding' or 'mereu'")
    n_HVG = n_feat_by_lab[lab][f'{method}_{org}']

    preprocessing(adata)
    if 'CellType' in adata.obs.columns:
        adata.obs['cell_types'] = adata.obs['CellType']
    try:
        cell_types = adata.obs['cell_types']
    except KeyError as err:
        # Raising a plain string is itself a TypeError; raise a real exception instead.
        raise KeyError(f"ERROR IN {lab}, {org}, {method}, {seed}") from err

    n_cell_types = len(set(cell_types))
    df_populations = pd.DataFrame(columns=['cell_types'] + list_methods_all, index=adata.obs_names)
    df_populations['cell_types'] = cell_types.cat.codes

    # Dedicated generator so that the 'random' baseline depends on `seed`.
    rng = np.random.RandomState(seed)

    for FS_method in list_methods_all:
        if FS_method == 'all':
            selected_features = df_ranks.index.tolist()
        elif FS_method == 'random':
            # Draw positions within df_ranks, the index that is actually subscripted.
            idxs = rng.choice(len(df_ranks), n_HVG, replace=False)
            selected_features = df_ranks.index[idxs].tolist()
        else:
            selected_features = df_ranks.sort_values(by=FS_method)[FS_method][0:n_HVG].index.tolist()

        # Subset first, then copy: avoids duplicating the full matrix and gives
        # PCA_knn a real object (not a view) to write into.
        selected_set = set(selected_features)
        adata_sub = adata[:, [gene in selected_set for gene in adata.var_names]].copy()
        PCA_knn(adata_sub, seed)

        sc.tl.umap(adata_sub, random_state=seed)
        X_UMAP_FS = adata_sub.obsm['X_umap']
        np.savetxt(f'{os.getcwd()}/exports/DEGs_UMAP_leiden/UMAP_{lab}_{method}_{org}_FSmethod-{FS_method}_seed-{seed}.txt',
                   X_UMAP_FS, fmt='%.4f')

        # Clustering binary search: try both ends of the resolution interval first,
        # then bisect up to `max_depth` times. Whatever clustering was computed last
        # is kept, even if no resolution matched the target number of clusters.
        min_res, max_res, max_depth = 0.1, 2, 7
        matched = False
        for boundary_res in (min_res, max_res):
            sc.tl.leiden(adata_sub, resolution=boundary_res, random_state=seed)
            if adata_sub.obs["leiden"].nunique() == n_cell_types:
                matched = True
                break

        depth = 0
        while not matched and depth < max_depth:
            mid_res = 0.5 * (max_res + min_res)
            sc.tl.leiden(adata_sub, resolution=mid_res, random_state=seed)
            n_clust_mid = adata_sub.obs["leiden"].nunique()
            if n_clust_mid == n_cell_types:
                break

            # Too many clusters: lower the upper bound; too few: raise the lower bound.
            if n_clust_mid > n_cell_types:
                max_res = mid_res
            else:
                min_res = mid_res

            depth += 1

        df_populations[FS_method] = adata_sub.obs["leiden"].copy()
        sc.tl.rank_genes_groups(adata_sub, groupby="leiden", method='wilcoxon')

        for var_DEGs in ['names', 'pvals_adj', 'scores']:
            pd.DataFrame(adata_sub.uns['rank_genes_groups'][var_DEGs]).to_csv(
                f'{os.getcwd()}/exports/DEGs_UMAP_leiden/DEGs_{lab}_{method}_{org}_FSmethod-{FS_method}_seed-{seed}_var-{var_DEGs}.csv',
            index=None)

    df_populations.to_csv(f'{os.getcwd()}/exports/DEGs_UMAP_leiden/populations_{lab}_{method}_{org}_seed-{seed}.csv')

In [None]:
# Grid of lab x organism x technique x seed; combinations without a data file are
# skipped inside extract_DEGs_UMAP_leiden_adata.
labs = ['ding', 'mereu']
orgs = ['human', 'mouse']
methods = ['10X', 'CELseq2', 'ddSEQ', 'Dropseq', 'inDrop', 'QUARTZseq', 'SingleNuclei', 'SMARTseq2',  'sci-RNA-seq', 'Seq-Well']
seeds = [0, 1, 2, 3, 4]

ray.init(ignore_reinit_error=True, num_cpus=8)
try:
    done = ray.get([extract_DEGs_UMAP_leiden_adata.remote(lab=lab, org=org, method=method, seed=seed)
                    for lab, org, method, seed in product(labs, orgs, methods, seeds)])
finally:
    # Shut ray down even when a task raises, so that no workers are left running.
    ray.shutdown()

In [None]:
# Reload one exported run (dataset + seed) for visual inspection.
lab, org, method, seed = 'ding', 'human', '10X', 0

FS_method = 'triku'

adata = sc.read(f'{os.getcwd()}/data/{lab.capitalize()}_2020/{method}_{org}.h5ad')
preprocessing(adata)

degs_dir = f'{os.getcwd()}/exports/DEGs_UMAP_leiden'
run_tag = f'{lab}_{method}_{org}'

df_populations = pd.read_csv(f'{degs_dir}/populations_{run_tag}_seed-{seed}.csv', index_col=0)
# Keep the original labels under 'cell_names': the next line overwrites
# 'cell_types' with the column of the same name from the populations table.
adata.obs['cell_names'] = adata.obs['cell_types']
adata.obs[df_populations.columns] = df_populations.astype(str)

# One dict per exported DEG field, each keyed by FS method.
dict_names, dict_pvals_adj, dict_scores = {}, {}, {}
deg_tables = {'names': dict_names, 'pvals_adj': dict_pvals_adj, 'scores': dict_scores}
for FS_method in list_methods_all:
    fs_tag = f'{run_tag}_FSmethod-{FS_method}_seed-{seed}'
    X_UMAP = np.loadtxt(f'{degs_dir}/UMAP_{fs_tag}.txt')
    adata.obsm[f'X_umap_{FS_method}'] = X_UMAP

    for var_DEGs, table in deg_tables.items():
        table[FS_method] = pd.read_csv(f'{degs_dir}/DEGs_{fs_tag}_var-{var_DEGs}.csv')

In [None]:
# Grid of embeddings: columns are the UMAPs computed after each FS method; the first
# row is coloured by 'cell_types', every following row by the clusters of one FS method.
n_methods = len(list_methods_all)
fig, axs = plt.subplots(n_methods + 1, n_methods, figsize=(n_methods * 3, (n_methods + 1) * 3))

for row_idx, row_name in enumerate(['cell_types'] + list_methods_all):
    for col_idx, col_name in enumerate(list_methods_all):
        ax = axs[row_idx][col_idx]
        sc.pl.embedding(adata, basis=f'X_umap_{col_name}', color=row_name, legend_loc=False, ax=ax, show=False)
        # Row name on the y axis, column name as title, no x label.
        ax.set_xlabel('')
        ax.set_ylabel(row_name)
        ax.set_title(col_name)

In [None]:
# Display the DEG-names table that was loaded above for the 'scry' feature selection.
dict_names['scry']

## Analysis on continuous datasets

To see if triku also works in datasets with continuous differentiation stages, we will use datasets from scvelo (pancreas dataset) and velocyto (dentate gyrus dataset). 

In [None]:
import scvelo as scv

In [None]:
# Load scVelo's pancreas dataset and drop the `var`, `obsm` and `obsp` content it
# ships with (presumably so that everything below is recomputed from scratch).
adata = scv.datasets.pancreas()
del adata.var
del adata.obsm
del adata.obsp

# scVelo preprocessing: gene filtering (min_shared_counts=20), per-cell normalization, log1p.
scv.pp.filter_genes(adata, min_shared_counts=20)
scv.pp.normalize_per_cell(adata)
scv.pp.log1p(adata)

# Moments with 30 PCs and 30 neighbours, followed by triku feature selection.
# NOTE(review): the next cell reads adata.var['highly_variable'], presumably written by triku -- confirm.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
tk.tl.triku(adata)

# RNA velocity and its transition graph.
scv.tl.velocity(adata)
scv.tl.velocity_graph(adata)

# UMAP embedding with the velocity stream plotted on top.
scv.tl.umap(adata)
scv.pl.velocity_embedding_stream(adata, basis='umap')

# PAGA computed on the 'clusters' annotation and drawn on the UMAP.
scv.tl.paga(adata, groups='clusters')
scv.pl.paga(adata, basis='umap', size=50, alpha=.1,
            min_edge_width=2, node_size_scale=1.5)

In [None]:
# Overlap between the highly variable genes and the genes returned by
# scv.tl.rank_velocity_genes, for increasing `min_corr` thresholds.
hvg = list(adata.var_names[adata.var['highly_variable']])
hvg_set = set(hvg)  # constant-time membership tests inside the loop

for corr_val in [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]:
    scv.tl.rank_velocity_genes(adata, groupby='clusters', min_corr=corr_val, n_genes=len(hvg))
    df_names = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
    df_scores = scv.DataFrame(adata.uns['rank_velocity_genes']['scores'])

    # Unique genes ranked in any cluster.
    scvelo_genes = list(set(df_names.values.ravel()))

    print(len(scvelo_genes), len(hvg))
    n_shared = sum(gene in hvg_set for gene in scvelo_genes)
    # Fraction of velocity genes that are also HVGs; NaN (instead of a
    # ZeroDivisionError) when no gene is returned at this threshold.
    print(corr_val, n_shared / len(scvelo_genes) if scvelo_genes else float('nan'))

In [None]:
# Read the dentate gyrus loom file from the local data folder; a `backup_url` is
# given (presumably used to download the file when it is missing locally).
adata = sc.read(f"{os.getcwd()}/data/dentate_gyrus.loom", backup_url="http://pklab.med.harvard.edu/velocyto/DentateGyrus/DentateGyrus.loom")

# Drop the `var`, `obsm` and `obsp` content stored in the file
# (presumably so that everything below is recomputed from scratch).
del adata.var
del adata.obsm
del adata.obsp

# scVelo preprocessing: gene filtering (min_shared_counts=20), per-cell normalization, log1p.
scv.pp.filter_genes(adata, min_shared_counts=20)
scv.pp.normalize_per_cell(adata)
scv.pp.log1p(adata)

# Moments with 30 PCs and 30 neighbours, followed by triku feature selection.
# NOTE(review): the next cell reads adata.var['highly_variable'], presumably written by triku -- confirm.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
tk.tl.triku(adata)

# RNA velocity and its transition graph.
scv.tl.velocity(adata)
scv.tl.velocity_graph(adata)

# UMAP embedding with the velocity stream, coloured by 'ClusterName'.
scv.tl.umap(adata)
scv.pl.velocity_embedding_stream(adata, basis='umap', color='ClusterName')

# PAGA computed on the 'ClusterName' annotation and drawn on the UMAP.
scv.tl.paga(adata, groups='ClusterName')
scv.pl.paga(adata, basis='umap', size=50, alpha=.1,
            min_edge_width=2, node_size_scale=1.5)

In [None]:
# Overlap between the highly variable genes and the genes returned by
# scv.tl.rank_velocity_genes, for increasing `min_corr` thresholds.
hvg = list(adata.var_names[adata.var['highly_variable']])
hvg_set = set(hvg)  # constant-time membership tests inside the loop

for corr_val in [0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9]:
    scv.tl.rank_velocity_genes(adata, groupby='ClusterName', min_corr=corr_val, n_genes=len(hvg))
    df_names = scv.DataFrame(adata.uns['rank_velocity_genes']['names'])
    df_scores = scv.DataFrame(adata.uns['rank_velocity_genes']['scores'])

    # Unique genes ranked in any cluster.
    scvelo_genes = list(set(df_names.values.ravel()))

    print(len(scvelo_genes), len(hvg))
    n_shared = sum(gene in hvg_set for gene in scvelo_genes)
    # Fraction of velocity genes that are also HVGs; NaN (instead of a
    # ZeroDivisionError) when no gene is returned at this threshold.
    print(corr_val, n_shared / len(scvelo_genes) if scvelo_genes else float('nan'))

## Analysis of mean VS median correction

In [None]:
def plot_mean_median(adata, title=None):
    """Compare mean- and median-based correction of the triku distances.

    Genes are binned into 70 equal-width windows of log10 mean expression. In
    each window the mean (respectively the median) of
    `adata.var['triku_distance_uncorrected']` is subtracted from the uncorrected
    distance. A four-panel figure is then shown:

    1. uncorrected distances, with the top genes under each correction
       highlighted (red: mean, blue: median, dark grey: both) and the
       per-window mean and median curves;
    2. mean-corrected distances with their top genes;
    3. median-corrected distances with their top genes;
    4. Jaccard index between the two top-k gene sets as a function of k.

    Parameters
    ----------
    adata
        Annotated data object. `X` may be sparse or dense; `var` must contain
        the columns `'highly_variable'` and `'triku_distance_uncorrected'`.
    title : str, optional
        Figure title. When omitted it is built from the notebook-level variables
        `lab`, `org` and `method` (the original behaviour), which must then exist.

    Returns
    -------
    None. The figure is displayed with `plt.show()`.
    """
    n_windows = 70
    n_HVG = int(adata.var['highly_variable'].sum())
    if title is None:
        # Legacy behaviour: rely on the loop variables of the calling cell.
        title = f'Mean/median correction on {lab} {org}, {method}'

    # Accept sparse and dense matrices alike, as PCA_knn does.
    expression = adata.X.toarray() if hasattr(adata.X, 'toarray') else np.asarray(adata.X)
    X = np.log10(expression.mean(0))
    # Plain arrays: every index below is positional, which a label-indexed
    # pandas Series does not support reliably.
    Y_uncor = adata.var['triku_distance_uncorrected'].to_numpy(dtype=float)

    # Per-window mean / median of the uncorrected distance.
    linspace = np.linspace(np.min(X), np.max(X), n_windows + 1)
    linspace_median, linspace_mean = [], []
    Y_median_arr = np.zeros(len(Y_uncor))
    Y_mean_arr = np.zeros(len(Y_uncor))

    for i in range(n_windows):
        mask = (X >= linspace[i]) & (X <= linspace[i + 1])
        window_median, window_mean = np.median(Y_uncor[mask]), np.mean(Y_uncor[mask])
        Y_median_arr[mask] = window_median
        Y_mean_arr[mask] = window_mean
        linspace_median.append(window_median)
        linspace_mean.append(window_mean)

    Y_mean, Y_median = Y_uncor - Y_mean_arr, Y_uncor - Y_median_arr

    # Sort once; the top-k genes for any k are the last k entries of each order.
    order_mean, order_median = np.argsort(Y_mean), np.argsort(Y_median)
    max_y_mean = order_mean[-n_HVG:]
    max_y_median = order_median[-n_HVG:]
    max_both = np.intersect1d(max_y_mean, max_y_median)

    fig, axs = plt.subplots(1, 4, figsize=(25, 5))
    fig.suptitle(title)
    axs[0].set_title('Uncorrected')
    axs[1].set_title('Mean correction')
    axs[2].set_title('Median correction')
    axs[3].set_title('Jaccard index of 0:i features')

    axs[0].scatter(X, Y_uncor, c='#cbcbcb')
    axs[0].scatter(X[max_y_mean], Y_uncor[max_y_mean], c='#ab0000')
    axs[0].scatter(X[max_y_median], Y_uncor[max_y_median], c='#0000ab')
    axs[0].scatter(X[max_both], Y_uncor[max_both], c='#676767')

    axs[1].scatter(X, Y_mean, c='#cbcbcb')
    axs[1].scatter(X[max_y_mean], Y_mean[max_y_mean], c='#ab0000')

    axs[2].scatter(X, Y_median, c='#cbcbcb')
    axs[2].scatter(X[max_y_median], Y_median[max_y_median], c='#0000ab')

    # Correction curves: per-window statistics on the raw panel, zero lines after correction.
    axs[0].plot(linspace[:-1], linspace_mean, c="#ab0000")
    axs[0].plot(linspace[:-1], linspace_median, c="#0000ab")
    axs[1].plot(linspace[:-1], [0] * n_windows, c="#ab0000")
    axs[2].plot(linspace[:-1], [0] * n_windows, c="#0000ab")

    for idx in range(3):
        axs[idx].set_xlabel('log$_{10}$ mean gene expression')
        axs[idx].set_ylabel('Wasserstein distance')

    # The x axis holds the number of top features and the y axis the Jaccard
    # index (the two labels used to be swapped).
    axs[3].set_xlabel('# top features selected')
    axs[3].set_ylabel('Jaccard index')

    top_sizes = np.arange(20, n_HVG, 5)
    jaccard_index_line = []
    for top_k in top_sizes:
        top_mean, top_median = order_mean[-top_k:], order_median[-top_k:]
        max_and = np.intersect1d(top_mean, top_median)
        max_or = np.union1d(top_mean, top_median)
        jaccard_index_line.append(len(max_and) / len(max_or))

    axs[3].plot(top_sizes, jaccard_index_line)
    plt.show()

In [None]:
# Run triku on every available dataset and plot the mean vs median correction.
labs = ['ding', 'mereu']
orgs = ['human', 'mouse']
methods = ['10X', 'CELseq2', 'ddSEQ', 'Dropseq', 'inDrop', 'QUARTZseq', 'SingleNuclei', 'SMARTseq2',  'sci-RNA-seq', 'Seq-Well']

# The loop variables must stay named lab / org / method: plot_mean_median reads
# them from the notebook namespace to build its figure title.
for lab, org, method in product(labs, orgs, methods):
    h5ad_path = f'{os.getcwd()}/data/{lab.capitalize()}_2020/{method}_{org}.h5ad'
    if not os.path.exists(h5ad_path):
        continue  # this lab/organism/technique combination has no data file

    print(lab, org, method)
    adata_x = sc.read(h5ad_path)
    preprocessing(adata_x)
    PCA_knn(adata_x, 0)
    tk.tl.triku(adata_x, verbose='error', s=0)

    plot_mean_median(adata_x)