In [1]:
import pandas as pd
import scanpy as sc
import anndata as ad
import numpy as np
import scib

from sklearn.metrics import adjusted_rand_score as ARI
from sklearn.metrics import normalized_mutual_info_score as NMI
from sklearn.metrics import fowlkes_mallows_score as FMI
from sklearn.metrics import silhouette_score as SC

from scipy.optimize import linear_sum_assignment
from sklearn.preprocessing import LabelEncoder

from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors

In [2]:
def cluster_acc(y_true, y_pred):
    """
    Calculate clustering accuracy. Require scikit-learn installed

    # Arguments
        y: true labels, numpy.array with shape `(n_samples,)`
        y_pred: predicted labels, numpy.array with shape `(n_samples,)`

    # Return
        accuracy, in [0,1]
    """
    assert y_pred.size == y_true.size
    
    encoder = LabelEncoder()
    encoder = encoder.fit(np.unique(y_true))
    y_true = encoder.transform(y_true).astype(np.int64)
    y_pred = y_pred.astype(np.int64)

    D = max(y_pred.max(), y_true.max()) + 1
    w = np.zeros((D, D), dtype=np.int64)

    for i in range(y_pred.size):
        w[y_pred[i], y_true[i]] += 1

    # ind = linear_assignment(w.max() - w)
    ind = linear_sum_assignment(w.max() - w)
    ind = np.array((ind[0], ind[1])).T

    return sum([w[i, j] for i, j in ind]) * 1.0 / y_pred.size

In [3]:
def label_scores(embeddings, labels):
    nn_ = NearestNeighbors(n_neighbors=20)
    nn_.fit(embeddings)
    knns = nn_.kneighbors(embeddings, return_distance=False)

    res = 0
    for i in range(len(embeddings)):
        num = 0
        for j in range(len(knns[i])):
            if labels[i] == labels[knns[i][j]]:
                num += 1
        res += num / len(knns[i])

    return res / len(embeddings)

In [6]:
method_list = [
    # 'Ours',
    # 'PCA',
    # 'AutoClass',
    'AutoEncoder',
    # 'scPROTEIN',
    'magic',
    # 'PCA',
    'Seurat'
]

data_list = [
    'pbmc',
    # 'reap2',
    # 'reap3',
    # 'sln111',
    # 'sln206',
]

label_list = [
    'celltype.l2',
#     'labels',
#     'labels',
#     'cell_types',
#     'cell_types',
]

In [7]:
for i in range(len(method_list)):
    for j in range(len(data_list)):
        print(method_list[i], data_list[j])
        if method_list[i] == 'Ours':
            adata = sc.read_h5ad('/home/chenjn/rna2adt/A_run_test/new_emb_sc/' + data_list[j] + '.h5ad')
        else:
            embs = pd.read_csv('/home/chenjn/rna2adt/baseline/' + method_list[i] + '/output/' + data_list[j] + '/embeddings.csv', index_col=0)
            adata = ad.AnnData(embs)

        refadata = sc.read_h5ad('/home/chenjn/rna2adt/data/' + data_list[j] + '/ADT.h5ad')
        label_key = label_list[j]
        
        adata.obs = refadata.obs

        sc.pp.neighbors(adata, use_rep="X")
        sc.tl.umap(adata)
        sc.tl.louvain(adata, random_state=3407)
        # sc.pl.umap(adata, color='louvain')
        # sc.pl.umap(adata, color=label_key)

        k_means = KMeans(n_clusters=len(np.unique(adata.obs[label_key])))
        y_predict = k_means.fit_predict(adata.X)

        adata.obs['kmeans'] = y_predict
        # sc.pl.umap(adata, color='kmeans')

        print('-------------------Louvain-----------------------')

        print(ARI(adata.obs['louvain'], adata.obs[label_key]))
        print(NMI(adata.obs['louvain'], adata.obs[label_key]))
        print(cluster_acc(adata.obs[label_key].to_numpy(), adata.obs['louvain'].values.to_numpy()))
        print(FMI(adata.obs['louvain'], adata.obs[label_key]))
        # print(SC(adata.X, adata.obs['louvain'].values.reshape(-1, 1)))
        # print(label_scores(adata.X, adata.obs[label_key]))


        print('-------------------K means-----------------------')

        print(ARI(y_predict, adata.obs[label_key]))
        print(NMI(y_predict, adata.obs[label_key]))
        print(cluster_acc(adata.obs[label_key].to_numpy(), y_predict))
        print(FMI(y_predict, adata.obs[label_key]))
        # print(SC(adata.X, y_predict.reshape(-1, 1)))
        # print(label_scores(adata.X, adata.obs[label_key]))

        print('-------------------scib-----------------------')

        # scib.me.cluster_optimal_resolution(adata, cluster_key="cluster", label_key=label_key)
        # scib.me.ari(adata, cluster_key="cluster", label_key=label_key)

AutoEncoder pbmc


  super()._check_params_vs_input(X, default_n_init=10)


-------------------Louvain-----------------------
0.39900637075074097
0.6605736677444162
0.43558517346257514
0.47152256262817727
-------------------K means-----------------------
0.31505749144306366
0.6424852864249002
0.3734514477881358
0.4095120391524387
-------------------scib-----------------------
magic pbmc


  super()._check_params_vs_input(X, default_n_init=10)


-------------------Louvain-----------------------
0.2722857724158259
0.5770536505663176
0.2974456615810687
0.36407844336376294
-------------------K means-----------------------
0.27474569244299113
0.5292749693609314
0.35682228431542246
0.3499710592600223
-------------------scib-----------------------
Seurat pbmc


  super()._check_params_vs_input(X, default_n_init=10)


-------------------Louvain-----------------------
0.44990329850927335
0.6911632764451112
0.4578954526347024
0.516586816697626
-------------------K means-----------------------
0.29470562359958835
0.6214856965378583
0.34036621250710913
0.39135406493507235
-------------------scib-----------------------
