In [None]:
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from matplotlib.patches import Patch
from pylab import *
from sklearn.preprocessing import MinMaxScaler

mpl.rcParams["figure.dpi"] = 300

In [None]:
# Root folder for exported figures; the dataset cells append the dataset name to it.
save_folder = "./Main_Figures/"

In [None]:
# Workbook with the per-method metrics, and the sheet most cells read from it.
# NOTE(review): the path points at a user-local Downloads folder — adjust before
# running this notebook on another machine.
excel_path = "~/Downloads/All_metrics_15_Mar_23.xlsx"
sheet_name = "All_Metrics"

In [None]:
# Metric columns that are min-max scaled before the composite scores are computed.
columns_to_scale = [
    "NMI cluster/label",
    "ARI cluster/label",
    "ASW label",
    "ASW label/batch",
    "PCR batch",
    "isolated f1 score",
    "isolated silhouette coefficient",
    "graph connectivity",
    "kBET",
    "iLISI",
    "cLISI",
]
# Bar / legend colour (hex) for every method name that can appear in the sheet.
# The "redo" entries reuse the colour of the run they repeat.
method_color_dct = {
    "scVI": "#28DDED",
    "Harmony": "#ED7A28",
    "Seurat": "#994363",
    "BBKNN": "#B626D3",
    "Scanorama": "#EDBF28",
    "INSCT": "#286CED",
    "LIGER": "#90EE90",
    "fastMNN": "#FFB6C1",
    "iMAP": "#964B00",
    "scDML": "#6F3AF9",
    "scDREAMER": "#086E28",
    "scANVI": "#c5b0d5",
    "scGEN": "#d62829",
    "scDREAMER-Sup": "#113f0a",
    "scDREAMER redo": "#086E28",
    "scDREAMER-Sup redo": "#113f0a",
}
# Methods shown in the main figures, in plotting order.
# scGEN, scANVI and scDREAMER-Sup are left out of this list; a later cell rebinds
# `methods_to_plot` to a list that includes them.
methods_to_plot = [
    "scVI",
    "Harmony",
    "Seurat",
    "BBKNN",
    "Scanorama",
    "INSCT",
    "LIGER",
    "iMAP",
    "scDML",
    "scDREAMER",
]

In [None]:
# Stand-alone legend figure: one colour swatch per entry of `methods_to_plot`.
plt.plot()  # draws nothing; only creates the axes the legend is attached to

# Methods whose legend label is highlighted in navy (the colour plot_bar uses for
# its "Supervised" caption).
supervised_methods = {"scGEN", "scANVI", "scDREAMER-Sup"}

patches = [
    Patch(color=method_color_dct[method], label=method) for method in methods_to_plot
]
legend = plt.legend(
    handles=patches, bbox_to_anchor=(1.04, 0.5), loc="center left", borderaxespad=0
)
for text in legend.get_texts():
    text.set(fontfamily="Arial", fontsize=15, fontweight="bold")
    # Highlight by name, not by position: colouring "the last three entries" marks
    # unsupervised methods whenever `methods_to_plot` holds no supervised ones.
    if text.get_text() in supervised_methods:
        text.set_color("#000080")

In [None]:
def plot_bar(
    df_,
    col_name,
    use_color_col=True,
    save_folder=False,
    unsup_x=3.0,
    sup_x=9.8,
    n_sup=3,
):
    """Bar plot of ``df_[col_name]`` with an unsupervised / supervised split.

    One bar is drawn per row of ``df_`` (x tick labels are hidden), each bar is
    annotated with its value rounded to two decimals, and a dashed vertical line
    separates the last ``n_sup`` bars from the rest.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame holding ``col_name`` and, when ``use_color_col`` is truthy, a
        ``"color"`` column with one colour per row.
    col_name : str
        Column to plot; also used as the y label and as the output file name
        (``"/"`` replaced by ``"_"``).
    use_color_col : bool
        Colour the bars from ``df_["color"]`` when truthy, otherwise use the
        default colours.
    save_folder : str or False
        Folder the PNG is written to (created if missing). Nothing is saved when
        falsy.
    unsup_x, sup_x : float
        x positions (data coordinates) of the "Unsupervised" / "Supervised"
        captions drawn below the axis.
    n_sup : int
        Number of trailing bars that belong to the supervised group. The
        separator and captions are skipped when the frame has no bar on both
        sides of the split.
    """
    # Note: this changes the global rc setting, so later figures are affected too.
    mpl.rc("axes", linewidth=2)

    fig, ax = plt.subplots(figsize=(6, 4))
    if use_color_col:
        df_[col_name].plot(kind="bar", ax=ax, color=df_["color"])
    else:
        df_[col_name].plot(kind="bar", ax=ax)

    bars = ax.patches
    ax.tick_params(labelbottom=False, bottom=False)

    # Value label on top of every bar.
    for bar, value in zip(bars, df_[col_name]):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height(),
            round(value, 2),
            ha="center",
            va="bottom",
        )

    # Zoom the y axis onto the data, clamped to [0, 1]; set_ylim returns the
    # limits that were actually applied.
    col_min = df_[col_name].min()
    col_max = df_[col_name].max()
    y_lo, y_hi = ax.set_ylim(max(col_min - 0.05 * col_max, 0), min(col_max * 1.05, 1.0))

    if 0 < n_sup < len(bars):
        # Dashed separator halfway between the last unsupervised bar and the
        # first supervised one; clip_on=False lets it extend past the axes.
        last_unsup = bars[-(n_sup + 1)]
        first_sup = bars[-n_sup]
        sep_line_x = (
            last_unsup.get_x() + last_unsup.get_width() + first_sup.get_x()
        ) / 2
        ax.plot(
            (sep_line_x, sep_line_x),
            (y_lo - 0.05 * y_hi, y_hi * 1.05),
            color="black",
            linestyle="--",
            clip_on=False,
        )

        # Group captions, placed slightly below the lower y limit.
        caption_y = y_lo - (y_hi - y_lo) * 0.06
        ax.text(
            unsup_x,
            caption_y,
            "Unsupervised",
            fontsize=13,
            fontname="Arial",
            weight="bold",
        )
        ax.text(
            sup_x,
            caption_y,
            "Supervised",
            fontsize=13,
            fontname="Arial",
            weight="bold",
            color="#000080",
        )

    ax.set_ylabel(col_name, fontsize=15, fontname="Arial", fontweight="bold")
    ax.set_xlabel("")  # clear the index name pandas puts on the x axis

    # remove axis lines
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

    fig.tight_layout()

    if save_folder:
        os.makedirs(save_folder, exist_ok=True)
        out_path = os.path.join(save_folder, col_name.replace("/", "_") + ".png")
        fig.savefig(out_path, transparent=True, bbox_inches="tight")
        print("plot saved to file : ", out_path)
    plt.show()


def plot_bar_unsup(df_, col_name, use_color_col=True, save_folder=False):
    """Bar plot of ``df_[col_name]`` with the row index as rotated x tick labels.

    Each bar is annotated with its value rounded to two decimals.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Frame holding ``col_name`` and, when ``use_color_col`` is truthy, a
        ``"color"`` column with one colour per row. Its index supplies the x
        tick labels.
    col_name : str
        Column to plot; also used as the y label and as the output file name
        (``"/"`` replaced by ``"_"``).
    use_color_col : bool
        Colour the bars from ``df_["color"]`` when truthy, otherwise use the
        default colours.
    save_folder : str or False
        Folder the PNG is written to (created if missing). Nothing is saved when
        falsy.
    """
    # Note: this changes the global rc setting, so later figures are affected too.
    mpl.rc("axes", linewidth=2)

    fig, ax = plt.subplots(figsize=(6, 4))
    if use_color_col:
        df_[col_name].plot(kind="bar", ax=ax, color=df_["color"])
    else:
        df_[col_name].plot(kind="bar", ax=ax)

    ax.set_xticklabels(df_.index, rotation=75, fontname="Arial", fontsize=10)

    # Value label on top of every bar.
    for bar, value in zip(ax.patches, df_[col_name]):
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar.get_height(),
            round(value, 2),
            ha="center",
            va="bottom",
        )

    # Zoom the y axis onto the data, clamped to [0, 1].
    col_min = df_[col_name].min()
    col_max = df_[col_name].max()
    ax.set_ylim(max(col_min - 0.05 * col_max, 0), min(col_max * 1.05, 1.0))

    ax.set_ylabel(col_name, fontsize=15, fontname="Arial", fontweight="bold")
    ax.set_xlabel("")  # clear the index name pandas puts on the x axis

    # Final tick-label styling for both axes (overrides the size set above).
    for axis in (ax.xaxis, ax.yaxis):
        for tick in axis.get_major_ticks():
            tick.label1.set_fontsize(14)
            tick.label1.set_fontweight("bold")

    # remove axis lines
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

    fig.tight_layout()

    if save_folder:
        os.makedirs(save_folder, exist_ok=True)
        out_path = os.path.join(save_folder, col_name.replace("/", "_") + ".png")
        fig.savefig(out_path, transparent=True, bbox_inches="tight")
        print("plot saved to file : ", out_path)
    plt.show()

# composite score main

In [None]:
def scale(df):
    """Fit a fresh ``MinMaxScaler`` on ``df`` and return the transformed values."""
    return MinMaxScaler().fit_transform(df)


def calculate_composite(df_dataset, color_map=None):
    """Return a copy of ``df_dataset`` with a colour column and composite scores.

    Added columns:

    * ``"color"`` – the row index run through ``color_map``; index values that
      are not keys of the map are kept unchanged.
    * ``"Composite bio-conservation score"`` – row mean of NMI, ARI and ASW label.
    * ``"Composite batch-correction score"`` – row mean of ASW label/batch,
      graph connectivity, PCR batch and kBET.
    * ``"Composite isolated label score"`` – row mean of the two isolated-label
      metrics.
    * ``"Combined composite score"`` – row mean of the bio-conservation and
      batch-correction composites.

    Missing values are skipped by the means. Every non-string cell of the result
    is rounded to two decimals.

    Parameters
    ----------
    df_dataset : pandas.DataFrame
        Metrics indexed by method name. The frame is not modified.
    color_map : dict, optional
        Mapping from index value to colour. Defaults to the notebook-level
        ``method_color_dct``.

    Returns
    -------
    pandas.DataFrame
        New frame with the original columns plus the five columns above.
    """
    if color_map is None:
        color_map = method_color_dct

    # Work on a copy: callers often pass a slice of a larger frame, and adding
    # columns to it in place both mutates their data and triggers
    # SettingWithCopy warnings.
    scored = df_dataset.copy()

    scored["color"] = pd.Series(scored.index, index=scored.index).replace(color_map)

    composite_inputs = {
        "Composite bio-conservation score": [
            "NMI cluster/label",
            "ARI cluster/label",
            "ASW label",
        ],
        "Composite batch-correction score": [
            "ASW label/batch",
            "graph connectivity",
            "PCR batch",
            "kBET",
        ],
        "Composite isolated label score": [
            "isolated silhouette coefficient",
            "isolated f1 score",
        ],
        # Built from the two composites above, so it must come after them.
        "Combined composite score": [
            "Composite bio-conservation score",
            "Composite batch-correction score",
        ],
    }
    for composite_name, input_columns in composite_inputs.items():
        # DataFrame.mean skips NaN by default, e.g. [1, 1, nan] -> 1.
        scored[composite_name] = scored[input_columns].mean(axis=1)

    def _round_cell(value):
        # Leave text (e.g. the colour codes) untouched, round everything else.
        return value if isinstance(value, str) else round(value, 2)

    return scored.apply(lambda column: column.apply(_round_cell))


def plot_composite(df_dataset, save_folder=False, use_color_col=True, **kwargs):
    """Call ``plot_bar`` once for each of the four composite-score columns.

    ``save_folder`` and ``use_color_col`` are passed through as keywords, and any
    extra keyword arguments are forwarded to ``plot_bar`` unchanged.
    """
    score_columns = (
        "Combined composite score",
        "Composite batch-correction score",
        "Composite bio-conservation score",
        "Composite isolated label score",
    )
    for score_column in score_columns:
        plot_bar(
            df_dataset,
            score_column,
            use_color_col=use_color_col,
            save_folder=save_folder,
            **kwargs
        )


def plot_composite_unsup(df_dataset, save_folder=False, use_color_col=True, **kwargs):
    """Call ``plot_bar_unsup`` once for each of the four composite-score columns.

    ``save_folder`` and ``use_color_col`` are passed through as keywords, and any
    extra keyword arguments are forwarded to ``plot_bar_unsup`` unchanged.
    """
    score_columns = (
        "Combined composite score",
        "Composite batch-correction score",
        "Composite bio-conservation score",
        "Composite isolated label score",
    )
    for score_column in score_columns:
        plot_bar_unsup(
            df_dataset,
            score_column,
            use_color_col=use_color_col,
            save_folder=save_folder,
            **kwargs
        )

# Pancreas

In [None]:
# Pancreas: scale the metrics, compute composite scores, plot them, export a CSV.
dataset = "Pancreas"
df = pd.read_excel(excel_path, sheet_name=sheet_name)
# Keep this dataset's rows, indexed by method name.
df_dataset = df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
# The min-max scaling is fitted on every row of the dataset, i.e. before the
# filters below are applied.
df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
df_dataset = df_dataset.loc[df_dataset["Percentage_wrong"] == 0]
df_dataset = df_dataset.loc[methods_to_plot, :]
df_dataset = calculate_composite(df_dataset)
display(df_dataset)
plot_composite_unsup(df_dataset, save_folder + dataset)
df_dataset = df_dataset[
    [
        "Composite bio-conservation score",
        "Composite batch-correction score",
        "Composite isolated label score",
        "Combined composite score",
    ]
]
# to_csv does not create missing directories, so make sure the folder exists.
csv_path = "./Composite_Scores/"
os.makedirs(csv_path, exist_ok=True)
df_dataset.to_csv(csv_path + dataset + "_composite_scores.csv")

# Human_Retina

In [None]:
# Human_Retina: scale the metrics, compute composite scores, plot them, export a CSV.
dataset = "Human_Retina"
df = pd.read_excel(excel_path, sheet_name=sheet_name)
# Keep this dataset's rows, indexed by method name.
df_dataset = df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
# The min-max scaling is fitted on every row of the dataset, i.e. before the
# filters below are applied.
df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
df_dataset = df_dataset.loc[df_dataset["Percentage_wrong"] == 0]
df_dataset = df_dataset.loc[methods_to_plot, :]
df_dataset = calculate_composite(df_dataset)
display(df_dataset)
plot_composite_unsup(df_dataset, save_folder + dataset)
df_dataset = df_dataset[
    [
        "Composite bio-conservation score",
        "Composite batch-correction score",
        "Composite isolated label score",
        "Combined composite score",
    ]
]
# to_csv does not create missing directories, so make sure the folder exists.
csv_path = "./Composite_Scores/"
os.makedirs(csv_path, exist_ok=True)
df_dataset.to_csv(csv_path + dataset + "_composite_scores.csv")

# Lung

In [None]:
# Lung: scale the metrics, compute composite scores, plot them, export a CSV.
dataset = "Lung"
df = pd.read_excel(excel_path, sheet_name=sheet_name)
# Keep this dataset's rows, indexed by method name.
df_dataset = df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
# The min-max scaling is fitted on every row of the dataset, i.e. before the
# filters below are applied.
df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
df_dataset = df_dataset.loc[df_dataset["Percentage_wrong"] == 0]
df_dataset = df_dataset.loc[methods_to_plot, :]
df_dataset = calculate_composite(df_dataset)
display(df_dataset)
plot_composite_unsup(df_dataset, save_folder + dataset)
df_dataset = df_dataset[
    [
        "Composite bio-conservation score",
        "Composite batch-correction score",
        "Composite isolated label score",
        "Combined composite score",
    ]
]
# to_csv does not create missing directories, so make sure the folder exists.
csv_path = "./Composite_Scores/"
os.makedirs(csv_path, exist_ok=True)
df_dataset.to_csv(csv_path + dataset + "_composite_scores.csv")

# Immune_Human

In [None]:
# Immune_Human: scale the metrics, compute composite scores, plot them, export a CSV.
dataset = "Immune_Human"
df = pd.read_excel(excel_path, sheet_name=sheet_name)
# Keep this dataset's rows, indexed by method name.
df_dataset = df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
# The min-max scaling is fitted on every row of the dataset, i.e. before the
# filters below are applied.
df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
df_dataset = df_dataset.loc[df_dataset["Percentage_wrong"] == 0]
df_dataset = df_dataset.loc[methods_to_plot, :]
df_dataset = calculate_composite(df_dataset)
display(df_dataset)
plot_composite_unsup(df_dataset, save_folder + dataset)
df_dataset = df_dataset[
    [
        "Composite bio-conservation score",
        "Composite batch-correction score",
        "Composite isolated label score",
        "Combined composite score",
    ]
]
# to_csv does not create missing directories, so make sure the folder exists.
csv_path = "./Composite_Scores/"
os.makedirs(csv_path, exist_ok=True)
df_dataset.to_csv(csv_path + dataset + "_composite_scores.csv")

# Human Mouse

In [None]:
# Human_Mouse: scale the metrics, compute composite scores, plot them, export a CSV.
dataset = "Human_Mouse"
df = pd.read_excel(excel_path, sheet_name=sheet_name)
# Keep this dataset's rows, indexed by method name.
df_dataset = df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
# The min-max scaling is fitted on every row of the dataset, i.e. before the
# filters below are applied.
df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
df_dataset = df_dataset.loc[df_dataset["Percentage_wrong"] == 0]
# Seurat is left out for this dataset. Filtering (instead of list.remove) keeps
# the cell re-runnable even when `methods_to_plot` has been rebound to a list
# without that entry.
methods_to_plot_hm = [method for method in methods_to_plot if method != "Seurat"]
df_dataset = df_dataset.loc[methods_to_plot_hm, :]
df_dataset = calculate_composite(df_dataset)
display(df_dataset)
plot_composite_unsup(df_dataset, save_folder + dataset)
df_dataset = df_dataset[
    [
        "Composite bio-conservation score",
        "Composite batch-correction score",
        "Composite isolated label score",
        "Combined composite score",
    ]
]
# to_csv does not create missing directories, so make sure the folder exists.
csv_path = "./Composite_Scores/"
os.makedirs(csv_path, exist_ok=True)
df_dataset.to_csv(csv_path + dataset + "_composite_scores.csv")

In [None]:
# Healthy_Human_Heart: same pipeline as the other datasets, but read from its own
# sheet and with fewer methods.
dataset = "Healthy_Human_Heart"
df = pd.read_excel(excel_path, sheet_name="Heart _metrics_wo_Not_Assigned")
# Keep this dataset's rows, indexed by method name.
df_dataset = df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
# The min-max scaling is fitted on every row of the dataset, i.e. before the
# filters below are applied.
df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
df_dataset = df_dataset.loc[df_dataset["Percentage_wrong"] == 0]
# Methods left out for this dataset. Filtering (instead of repeated list.remove)
# keeps the cell re-runnable even when `methods_to_plot` has been rebound to a
# list that lacks one of these entries.
methods_to_plot_hm = [
    method
    for method in methods_to_plot
    if method not in ("Seurat", "iMAP", "LIGER", "scDML", "BBKNN")
]
df_dataset = df_dataset.loc[methods_to_plot_hm, :]
df_dataset = calculate_composite(df_dataset)
display(df_dataset)
plot_composite_unsup(df_dataset, save_folder + dataset)
df_dataset = df_dataset[
    [
        "Composite bio-conservation score",
        "Composite batch-correction score",
        "Composite isolated label score",
        "Combined composite score",
    ]
]
# to_csv does not create missing directories, so make sure the folder exists.
csv_path = "./Composite_Scores/"
os.makedirs(csv_path, exist_ok=True)
df_dataset.to_csv(csv_path + dataset + "_composite_scores.csv")

# All methods

In [None]:
# Rebinds the notebook-level `methods_to_plot`: the ten methods used so far plus
# scGEN, scANVI and scDREAMER-Sup as the last three entries.
# Every cell executed after this one sees the extended list.
methods_to_plot = [
    "scVI",
    "Harmony",
    "Seurat",
    "BBKNN",
    "Scanorama",
    "INSCT",
    "LIGER",
    "iMAP",
    "scDML",
    "scDREAMER",
    "scGEN",
    "scANVI",
    "scDREAMER-Sup",
]

In [None]:
# Human_Retina with every entry of `methods_to_plot`: figures go to the dataset's
# "All" sub-folder, scores to ./Composite_Scores/All/.
dataset = "Human_Retina"
df = pd.read_excel(excel_path, sheet_name=sheet_name)
# Keep this dataset's rows, indexed by method name.
df_dataset = df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
# Scaling is fitted before the row filters below.
df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
df_dataset = df_dataset.loc[df_dataset["Percentage_wrong"] == 0]
df_dataset = calculate_composite(df_dataset.loc[methods_to_plot, :])
display(df_dataset)
plot_composite(df_dataset, save_folder + dataset + "/All/")
df_dataset = df_dataset.loc[
    :,
    [
        "Composite bio-conservation score",
        "Composite batch-correction score",
        "Composite isolated label score",
        "Combined composite score",
    ],
]

csv_path = "./Composite_Scores/All/"
if not os.path.exists(csv_path):
    os.makedirs(csv_path)
df_dataset.to_csv(csv_path + dataset + "_composite_scores.csv")
print("saved composite scores to : ", csv_path + dataset + "_composite_scores.csv")

In [None]:
# Human_Mouse with every entry of `methods_to_plot` except Seurat: figures go to
# the dataset's "All" sub-folder, scores to ./Composite_Scores/All/.
dataset = "Human_Mouse"
df = pd.read_excel(excel_path, sheet_name=sheet_name)
# Keep this dataset's rows, indexed by method name.
df_dataset = df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
# Scaling is fitted before the row filters below.
df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
df_dataset = df_dataset.loc[df_dataset["Percentage_wrong"] == 0]
methods_to_plot_hm = list(methods_to_plot)
methods_to_plot_hm.remove("Seurat")
df_dataset = calculate_composite(df_dataset.loc[methods_to_plot_hm, :])
display(df_dataset)
# Caption positions are shifted because one bar fewer is drawn.
plot_composite(df_dataset, save_folder + dataset + "/All/", unsup_x=3, sup_x=9)
df_dataset = df_dataset.loc[
    :,
    [
        "Composite bio-conservation score",
        "Composite batch-correction score",
        "Composite isolated label score",
        "Combined composite score",
    ],
]

csv_path = "./Composite_Scores/All/"
if not os.path.exists(csv_path):
    os.makedirs(csv_path)
df_dataset.to_csv(csv_path + dataset + "_composite_scores.csv")
print("saved composite scores to : ", csv_path + dataset + "_composite_scores.csv")

In [None]:
# "Human" with every entry of `methods_to_plot` except Seurat: figures go to the
# dataset's "All" sub-folder, scores to ./Composite_Scores/All/.
# NOTE(review): no other cell uses a dataset called "Human" (the others are e.g.
# "Immune_Human", "Human_Retina") — confirm this value exists in the "Dataset"
# column of the sheet.
dataset = "Human"
df = pd.read_excel(excel_path, sheet_name=sheet_name)
# Keep this dataset's rows, indexed by method name.
df_dataset = df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
# Scaling is fitted before the row filters below.
df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
df_dataset = df_dataset.loc[df_dataset["Percentage_wrong"] == 0]
methods_to_plot_hm = list(methods_to_plot)
methods_to_plot_hm.remove("Seurat")
df_dataset = calculate_composite(df_dataset.loc[methods_to_plot_hm, :])
display(df_dataset)
# Caption positions are shifted because one bar fewer is drawn.
plot_composite(df_dataset, save_folder + dataset + "/All/", unsup_x=3, sup_x=9)
df_dataset = df_dataset.loc[
    :,
    [
        "Composite bio-conservation score",
        "Composite batch-correction score",
        "Composite isolated label score",
        "Combined composite score",
    ],
]

csv_path = "./Composite_Scores/All/"
if not os.path.exists(csv_path):
    os.makedirs(csv_path)
df_dataset.to_csv(csv_path + dataset + "_composite_scores.csv")
print("saved composite scores to : ", csv_path + dataset + "_composite_scores.csv")

# composite score ablation studies

In [None]:
# Ablation study: for each dataset, scale the metrics across the baseline methods
# plus the two scDREAMER ablations, then show the composite scores of the three
# scDREAMER variants.
# Note: this cell rebinds the notebook-level `methods_to_plot` and `save_folder`.
methods_to_plot = [
    "scVI",
    "Harmony",
    "Seurat",
    "BBKNN",
    "Scanorama",
    "INSCT",
    "LIGER",
    "iMAP",
    "scDREAMER",
]
methods_to_scale_accross = [*methods_to_plot, "scDREAMER w/o BC", "scDREAMER w/o Dis"]
datasets = ["Lung", "Pancreas", "Immune_Human", "Human_Retina"]
# NOTE(review): `save_folder` is not used anywhere in this cell — confirm whether
# the rebinding is still needed.
save_folder = "./Ablation/"

# Rows with a non-zero "Percentage_wrong" are dropped once, right after loading.
df = pd.read_excel(excel_path, sheet_name=sheet_name)
df = df.loc[df["Percentage_wrong"] == 0]


for dataset in datasets:
    print(dataset)
    # Keep this dataset's rows, indexed by method name.
    df_dataset = (
        df[df["Dataset"] == dataset].set_index("Method").drop(columns=["Dataset"])
    )
    # Unlike the main-figure cells, the method selection happens BEFORE scaling,
    # so the min-max range only covers the methods listed above.
    df_dataset = df_dataset.loc[methods_to_scale_accross, :]
    df_dataset.loc[:, columns_to_scale] = scale(df_dataset.loc[:, columns_to_scale])
    df_dataset = calculate_composite(df_dataset).loc[
        ["scDREAMER", "scDREAMER w/o BC", "scDREAMER w/o Dis"],
        [
            "Composite bio-conservation score",
            "Composite batch-correction score",
            "Composite isolated label score",
            "Combined composite score",
        ],
    ]
    display(df_dataset)

In [None]:
# scVI - cyan
# Harmony - orange
# Seurat - magenta
# BBKNN - purple
# Scanorama - yellow
# INSCT - blue
# iMAP - brown
# Liger - light green
# fastMNN - light pink
# scANVI - removed from main figure
# scDREAMER - Green
# scDREAMER++ - Red

# Scale with per-column minimum values (disabled experiment)

In [None]:
# def scale_with_mini(df,min_values):
#     for col in df.columns:
#         scaler = MinMaxScaler(feature_range=(min_values[col],1))
#         df.loc[:,col] = scaler.fit_transform(np.array(df.loc[:,col]).reshape(-1,1))
#     return df
# df = pd.read_excel(excel_path,sheet_name =sheet_name)
# df_dataset = df[df['Dataset'] == dataset].reset_index(drop = True)
# df_dataset.index = df_dataset['Method']
# df_dataset.drop(['Dataset','Method'], inplace=True, axis=1)
# mini_values = df_dataset.loc[:,columns_to_scale].min(axis = 0)
# df_dataset = df_dataset.loc[df_dataset['Percentage_wrong']==0]
# df_dataset = df_dataset.loc[methods_to_plot,:]
# df_dataset.loc[:,columns_to_scale] =scale_with_mini(df_dataset.loc[:,columns_to_scale],mini_values)
# df_dataset

# scale normal or existing

In [None]:
# df = pd.read_excel(excel_path,sheet_name =sheet_name)
# df_dataset = df[df['Dataset'] == dataset].reset_index(drop = True)
# df_dataset.index = df_dataset['Method']
# df_dataset.drop(['Dataset','Method'], inplace=True, axis=1)
# df_dataset = df_dataset.loc[df_dataset['Percentage_wrong']==0]
# df_dataset = df_dataset.loc[methods_to_plot,:]
# df_dataset.loc[:,columns_to_scale] =scale(df_dataset.loc[:,columns_to_scale]
# df_dataset)