In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns
import dynamo
import dynamo as dyn
from dynamo.preprocessing import Preprocessor
import pearson_residual_normalization_recipe
from benchmark_utils import *
from SCTransform import SCTransform

# Load dynamo's bundled zebrafish sample dataset.
# NOTE(review): in the visible cells `adata_origin` is referenced only by a
# commented-out line inside benchmark_plots -- confirm it is still needed.
adata_origin = dyn.sample_data.zebrafish()

## Convert and Output Data for R Seurat inputs

## sample preprocessor code  


In [None]:
def preprocess_dyn_sctransform_result_adata(
    output_path="./data/zebrafish_sctransform_preprocessor.h5ad",
    n_top_genes=2000,
    n_pca_components=50,
):
    """Preprocess the zebrafish sample data with SCTransform and save it to disk.

    Pipeline, in order: load the zebrafish sample data, configure a
    ``Preprocessor`` with the monocle recipe, filter cells and genes, select
    the top genes by SVR dispersion, keep only genes flagged ``use_for_pca``,
    apply ``SCTransform`` in place, compute PCA into ``obsm["X_pca"]``, clean
    up the object and write it as an h5ad file.

    Args:
        output_path: Destination h5ad file. Its parent directory is created
            if it does not exist yet.
        n_top_genes: Number of genes requested from the gene-selection step.
        n_pca_components: Number of principal components to compute.

    Returns:
        None. The result is written to ``output_path``; summary statistics of
        ``adata.X.data`` before and after SCTransform are printed.
    """
    adata = dyn.sample_data.zebrafish()
    preprocessor = Preprocessor()
    preprocessor.config_monocle_recipe(adata)
    # Cell-wise normalization is disabled; SCTransform is applied below instead.
    preprocessor.normalize_by_cells = None
    preprocessor.select_genes_function = dyn.preprocessing.select_genes_by_dispersion_svr
    # Alternative recipe kept for reference:
    # preprocessor.select_genes_function = pearson_residual_normalization_recipe.select_genes_by_pearson_residual
    # preprocessor.normalize_selected_genes_function = pearson_residual_normalization_recipe.normalize_layers_pearson_residuals
    dyn.preprocessing.unique_var_obs_adata(adata)
    preprocessor.filter_cells_by_outliers(adata, keep_filtered=False)
    preprocessor.filter_genes_by_outliers(adata, inplace=True, min_cell_s=5)
    preprocessor.select_genes(adata, n_top_genes=n_top_genes)
    # Subsetting an AnnData yields a view; take an explicit copy so the
    # in-place SCTransform below mutates a real object rather than relying on
    # anndata's implicit view-to-copy conversion.
    adata = adata[:, adata.var["use_for_pca"]].copy()

    # NOTE(review): `.X.data` assumes adata.X is a scipy sparse matrix -- confirm.
    print("data before sctransform:")
    print(pd.DataFrame(adata.X.data).describe())
    SCTransform(adata, inplace=True)
    print("data after sctransform:")
    print(pd.DataFrame(adata.X.data).describe())
    preprocessor.use_log1p = False
    # preprocessor.preprocess_adata(adata)
    preprocessor.pca(adata, n_pca_components=n_pca_components, pca_key="X_pca")
    dyn.data_io.cleanup(adata)

    # Make sure the target directory exists so the write does not fail on a
    # fresh checkout.
    os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
    adata.write_h5ad(output_path)

preprocess_dyn_sctransform_result_adata()

In [None]:
# Reload the AnnData from the same path that
# preprocess_dyn_sctransform_result_adata() writes to.
dyn_adata_preprocessor = dyn.read_h5ad(
    "./data/zebrafish_sctransform_preprocessor.h5ad")


Load dyn adata results

In [None]:
import scipy
import scipy.io
def write_R_input_data():
    """Export zebrafish matrices and their cell/gene names as inputs for R Seurat.

    Loads the zebrafish sample data, converts layers to CSR, de-duplicates
    var/obs names, runs the cell outlier filter, restricts the genes to
    ``dyn_adata_preprocessor.var_names`` (a notebook-level global), and then
    writes ``X``, ``spliced`` and ``unspliced`` as Matrix Market files plus the
    cell and gene names as CSV files under ``./data``.
    """
    zebrafish = dyn.sample_data.zebrafish()
    print("original data shape:", zebrafish.shape)

    pp = dyn.preprocessing
    pp.convert_layers2csr(zebrafish)
    pp.unique_var_obs_adata(zebrafish)
    pp.filter_cells_by_outliers(zebrafish)

    # use the same gene set as dyn_adata_preprocessor
    shared_genes = zebrafish[:, dyn_adata_preprocessor.var_names]

    # Each matrix is looked up lazily, right before it is written, so the
    # write order and failure points match a plain sequence of mmwrite calls.
    matrix_exports = (
        ("./data/counts.mtx", lambda a: a.X),
        ("./data/spliced.mtx", lambda a: a.layers["spliced"]),
        ("./data/unspliced.mtx", lambda a: a.layers["unspliced"]),
    )
    for mtx_path, pick_matrix in matrix_exports:
        scipy.io.mmwrite(mtx_path, pick_matrix(shared_genes))

    print(shared_genes)

    cell_names = pd.Series(shared_genes.obs_names)
    cell_names.to_csv("./data/cell_names.csv")
    gene_names = pd.Series(shared_genes.var_names)
    gene_names.to_csv("./data/var_names.csv")
# write_R_input_data()


Read R results

In [None]:
# Seurat PCA embeddings exported from R. The first CSV column is used as the
# index so that `seurat_X_pca.index` can be compared with `obs_names` and used
# with `.loc[...]`, and so that `.to_numpy()` yields only the component values.
seurat_X_pca = pd.read_csv("./R-seurat-test/X_pca.csv", index_col=0)
# Per-gene attributes exported from R. The first column is deliberately left as
# the regular column "Unnamed: 0" because benchmark_plots reads it by that name.
seurat_features = pd.read_csv("./R-seurat-test/features_attributes.csv")
seurat_residual_variance = seurat_features["residual_variance"]
seurat_residual_mean = seurat_features["residual_mean"]




In [None]:
# Table dimensions, followed by the first two rows as the cell's rich output.
print(seurat_features.shape)
seurat_features.head(2)

In [None]:
# Pool every value of the Seurat PCA table into one 1-D array and plot its
# distribution.
# NOTE(review): assumes seurat_X_pca holds only numeric component columns
# (cell names in the index, not in a column) -- confirm.
# NOTE(review): `plt` is not imported in the visible cells; presumably it comes
# from `from benchmark_utils import *` -- confirm.
X_pca_arr = seurat_X_pca.to_numpy().flatten()
sns.histplot(X_pca_arr).set_title("seurat X_pca distribution")
plt.show()
# sns.histplot(seurat_residual_mean.to_numpy().flatten()).set_title("seurat residual mean")
# plt.show()
# sns.histplot(seurat_residual_variance.to_numpy().flatten()).set_title("seurat residual variance")
# plt.show()

## Obtain the Seurat adata and set the corresponding fields for the next comparison steps

In [None]:
# Build the AnnData that will carry the Seurat results: the same sample data,
# with the same name de-duplication and cell outlier filter calls used when
# exporting the R inputs.
seurat_adata = dyn.sample_data.zebrafish()
dyn.preprocessing.unique_var_obs_adata(seurat_adata)
dyn.preprocessing.filter_cells_by_outliers(seurat_adata)

# Compare dimensions, then list labels present on one side only.
print(seurat_X_pca.shape)
print(seurat_adata.shape)
print(set(seurat_X_pca.index).difference(seurat_adata.obs_names))
print(set(seurat_adata.obs_names).difference(seurat_X_pca.index))


### Reorder the rows of seurat_X_pca to match the adata obs_names order and store them as X_pca

In [None]:

# Label-based row selection: rows of seurat_X_pca are picked in the order of
# seurat_adata.obs_names and stored as the PCA embedding.
# NOTE(review): this requires seurat_X_pca.index to contain the cell names;
# `.loc` is expected to raise KeyError for any obs_name that is missing -- confirm.
seurat_adata.obsm["X_pca"] = seurat_X_pca.loc[seurat_adata.obs_names, :].to_numpy()


In [None]:
# Print the Seurat PCA table shape next to the dynamo AnnData shape.
print(seurat_X_pca.shape)
print(dyn_adata_preprocessor.shape)

### compare the shape of X_pca

In [None]:
# dynamo embedding shape (printed) vs. Seurat embedding shape (cell output).
print(dyn_adata_preprocessor.obsm["X_pca"].shape)
seurat_adata.obsm["X_pca"].shape


In [None]:
# List the per-gene annotation columns available on the reloaded AnnData.
dyn_adata_preprocessor.var.columns

In [None]:
# Sum of the "genes_step1_sct" column.
# NOTE(review): presumably a boolean flag, in which case this counts the
# flagged genes -- confirm.
dyn_adata_preprocessor.var["genes_step1_sct"].sum()

### distribution comparisons

In [None]:
# Histogram of the per-cell "gene_sct" column stored in .obs.
dyn_adata_preprocessor.obs["gene_sct"].hist()

In [None]:
def _scatter_dyn_vs_seurat(dyn_values, seurat_values, title):
    """Scatter one dynamo per-gene statistic against its Seurat counterpart."""
    plt.scatter(dyn_values, seurat_values)
    plt.xlabel("dynamo")
    plt.ylabel("seurat")
    plt.title(title)
    # No legend here: the scatter has a single unlabelled series, so calling
    # plt.legend() would only emit a "no labelled artists" warning.
    plt.show()
    plt.clf()


def _overlay_hists(dyn_values, seurat_values, title, dyn_label="dynamo", alpha=0.7, bins=10):
    """Overlay semi-transparent histograms of a dynamo and a Seurat statistic."""
    dyn_values.hist(label=dyn_label, alpha=alpha)
    seurat_values.hist(label="seurat", alpha=alpha, bins=bins)
    plt.title(title)
    plt.legend()
    plt.show()
    plt.clf()


def benchmark_plots(dyn_adata_preprocessor, seurat_features):
    """Plot dynamo SCTransform per-gene statistics against the Seurat ones.

    For each of four statistics (step-1 log UMI, step-1 intercept, log UMI,
    theta) a scatter plot is drawn over the genes listed in
    ``seurat_features["Unnamed: 0"]``, followed by overlaid histograms of the
    same statistics. A final histogram compares the per-gene mean of
    ``dyn_adata_preprocessor.X`` with Seurat's ``residual_mean``.

    Args:
        dyn_adata_preprocessor: Object supporting ``[:, genes]`` subsetting and
            exposing ``.var`` (with the ``*_sct`` columns used below) and
            ``.X``. NOTE(review): presumably the AnnData produced by the
            SCTransform preprocessing -- confirm.
        seurat_features: DataFrame with gene names in column ``"Unnamed: 0"``
            and the columns ``step1_log_umi``, ``step1_(Intercept)``,
            ``log_umi``, ``theta`` and ``residual_mean``.

    Returns:
        None. Every figure is shown with ``plt.show()`` and then cleared.

    NOTE(review): ``plt`` is not imported in the visible cells; presumably it
    comes from ``from benchmark_utils import *`` -- confirm.
    """
    genes = seurat_features["Unnamed: 0"]
    # Subset once in Seurat's gene order so scatter points pair up gene by gene.
    matched_var = dyn_adata_preprocessor[:, genes].var

    scatter_specs = (
        ("log_umi_step1_sct", "step1_log_umi", "log umi step1"),
        ("Intercept_step1_sct", "step1_(Intercept)", "intercept step1"),
        ("log_umi_sct", "log_umi", "log umi sct"),
        ("theta_sct", "theta", "theta comparison"),
    )
    for dyn_col, seurat_col, title in scatter_specs:
        _scatter_dyn_vs_seurat(matched_var[dyn_col], seurat_features[seurat_col], title)

    # The histograms use every gene in `.var`, not only those listed by Seurat.
    hist_specs = (
        ("log_umi_step1_sct", "step1_log_umi", "step1 umi log comparison"),
        ("Intercept_step1_sct", "step1_(Intercept)", "Intercept_step1_sct comparison"),
        ("log_umi_sct", "log_umi", "umi log comparison"),
        ("theta_sct", "theta", "theta comparison"),
    )
    for dyn_col, seurat_col, title in hist_specs:
        _overlay_hists(dyn_adata_preprocessor.var[dyn_col], seurat_features[seurat_col], title)

    # Per-gene mean of X (mean over axis 0) vs. Seurat's residual mean.
    dyn_gene_means = pd.Series(np.array(dyn_adata_preprocessor.X.mean(0)).flatten())
    _overlay_hists(
        dyn_gene_means,
        seurat_features["residual_mean"],
        "residual mean comparison",
        dyn_label="dynamo-sctransform",
    )

In [None]:
# Draw all dynamo-vs-Seurat comparison figures.
benchmark_plots(dyn_adata_preprocessor, seurat_features)

In [None]:
# Inspect Seurat's step-1 log UMI column directly.
seurat_features["step1_log_umi"]

In [None]:
# One row with two panels; each cmp_X_pca call below draws into one of them.
fig, axes = plt.subplots(1,2, figsize=(10, 20))
# obsm key compared by cmp_X_pca (read there as a global).
key = "X_pca"
# NOTE(review): `use_for_pca` is not referenced again in the visible cells.
use_for_pca = list(
    set(dyn_adata_preprocessor.var_names[dyn_adata_preprocessor.var["use_for_pca"]]))

# NOTE(review): these two globals share names with cmp_X_pca's parameters and
# are not read by it; the calls below pass explicit quantiles.
lower_quantile = 0
upper_quantile = 1
def cmp_X_pca(lower_quantile, upper_quantile, ax):
    """Scatter sorted dynamo embedding values against sorted Seurat ones.

    Reads the notebook globals ``key``, ``dyn_adata_preprocessor`` and
    ``seurat_adata``. Each embedding is flattened, sorted and restricted to the
    values lying between its own ``lower_quantile`` and ``upper_quantile``; the
    two sorted arrays are then truncated to a common length and plotted against
    each other on ``ax`` together with an identity line.

    Args:
        lower_quantile: Lower quantile (0-1) below which values are dropped.
        upper_quantile: Upper quantile (0-1) above which values are dropped.
        ax: Matplotlib axes to draw on.
    """
    def _sorted_within_quantiles(embedding):
        # Flatten, sort, then keep values inside the requested quantile band.
        ordered = np.sort(embedding.flatten())
        low_cut = np.quantile(ordered, lower_quantile)
        in_band = np.logical_and(low_cut <= ordered, ordered <= np.quantile(ordered, upper_quantile))
        return ordered[in_band]

    dyn_values = _sorted_within_quantiles(dyn_adata_preprocessor.obsm[key])
    seurat_view = seurat_adata[dyn_adata_preprocessor.obs_names, :]
    seurat_values = _sorted_within_quantiles(seurat_view.obsm[key])

    # maybe there is <=3 shape difference
    n_common = min(len(dyn_values), len(seurat_values))
    ax.scatter(dyn_values[:n_common], seurat_values[:n_common])

    band_text = ", lower_quantile:%.2f, upper_quantile:%.2f" % (lower_quantile, upper_quantile)
    ax.set_title(key + band_text)

    # Identity line spanning the combined extent of both axes.
    diag = [
        np.min([ax.get_xlim(), ax.get_ylim()]),
        np.max([ax.get_xlim(), ax.get_ylim()]),
    ]
    ax.plot(diag, diag, 'k-', alpha=0.75, zorder=0)
    ax.set_aspect('equal')

# Left panel: full value range (quantiles 0 to 1).
cmp_X_pca(0, 1, axes[0])

# Right panel: only values between the 0.2 and 0.8 quantiles.
cmp_X_pca(0.2, 0.8, axes[1])





In [None]:

# PCA-basis scatter plots coloured by cell type: dynamo result first, Seurat second.
dyn.pl.scatters(dyn_adata_preprocessor, basis="pca", color="Cell_type")
dyn.pl.scatters(seurat_adata, basis="pca", color="Cell_type")


In [None]:

# Run dynamo's dimension reduction on the Seurat PCA embedding, then plot the UMAP.
dyn.tl.reduceDimension(seurat_adata, basis="pca")
dyn.pl.umap(seurat_adata, color="Cell_type", figsize=(10, 10))


In [None]:
# Same dimension reduction and UMAP plot for the dynamo SCTransform result.
# NOTE(review): affine_transform_degree=270 presumably rotates the plotted
# embedding to ease visual comparison with the Seurat UMAP -- confirm.
dyn.tl.reduceDimension(dyn_adata_preprocessor, basis="pca")
dyn.pl.umap(dyn_adata_preprocessor, color="Cell_type", figsize=(10, 10), affine_transform_degree=270)
