From d7b5616b519d5df0bf5b9deb68fed257f0777dd3 Mon Sep 17 00:00:00 2001
From: Ukyeon <beshine1@gmail.com>
Date: Fri, 19 May 2023 18:05:09 -0400
Subject: [PATCH] remove adata.obsm[X] in future, use adata.obsm.X_pca instead

---
 dynamo/configuration.py         |  9 ++++-----
 dynamo/prediction/fate.py       |  5 +++--
 dynamo/tools/cell_velocities.py |  3 ++-
 dynamo/tools/dynamics.py        |  3 ---
 dynamo/tools/metric_velocity.py |  3 ++-
 dynamo/tools/moments.py         | 28 +++++++++++++++-------------
 dynamo/tools/velocyto_scvelo.py |  4 +++-
 tests/test_preprocess.py        |  2 +-
 8 files changed, 30 insertions(+), 27 deletions(-)

diff --git a/dynamo/configuration.py b/dynamo/configuration.py
index 82946dbf0..4a48ed33f 100755
--- a/dynamo/configuration.py
+++ b/dynamo/configuration.py
@@ -36,6 +36,7 @@ class DynamoAdataKeyManager:
     # special key names frequently used in dynamo
     X_LAYER = "X"
     PROTEIN_LAYER = "protein"
+    X_PCA = "X_pca"
 
     def gen_new_layer_key(layer_name, key, sep="_") -> str:
         """utility function for returning a new key name for a specific layer. By convention layer_name should not have the separator as the last character."""
@@ -106,7 +107,7 @@ def check_if_layer_exist(adata: AnnData, layer: str) -> bool:
     def get_available_layer_keys(adata, layers="all", remove_pp_layers=True, include_protein=True):
         """Get the list of available layers' keys. If `layers` is set to all, return a list of all available layers; if `layers` is set to a list, then the intersetion of available layers and `layers` will be returned."""
         layer_keys = list(adata.layers.keys())
-        if layers is None: # layers=adata.uns["pp"]["experiment_layers"], in calc_sz_factor
+        if layers is None:  # layers=adata.uns["pp"]["experiment_layers"], in calc_sz_factor
             layers = "X"
         if remove_pp_layers:
             layer_keys = [i for i in layer_keys if not i.startswith("X_")]
@@ -143,10 +144,7 @@ def allowed_X_layer_names():
     def init_uns_pp_namespace(adata: AnnData):
         adata.uns[DynamoAdataKeyManager.UNS_PP_KEY] = {}
 
-    def get_excluded_layers(
-        X_total_layers: bool = False,
-        splicing_total_layers: bool = False
-    ) -> List:
+    def get_excluded_layers(X_total_layers: bool = False, splicing_total_layers: bool = False) -> List:
         """Get a list of excluded layers based on the provided arguments.
 
         When splicing_total_layers is False, the function normalize spliced and unspliced RNA separately using each
@@ -199,6 +197,7 @@ def aggregate_layers_into_total(
                 layers.extend(["_total_"])
         return total_layers, layers
 
+
 # TODO discuss alias naming convention
 DKM = DynamoAdataKeyManager
 
diff --git a/dynamo/prediction/fate.py b/dynamo/prediction/fate.py
index 5f96bfb13..acb5e7eda 100755
--- a/dynamo/prediction/fate.py
+++ b/dynamo/prediction/fate.py
@@ -9,6 +9,7 @@
 from sklearn.neighbors import NearestNeighbors
 from tqdm import tqdm
 
+from ..configuration import DKM
 from ..dynamo_logger import (
     LoggerManager,
     main_info,
@@ -173,7 +174,7 @@ def fate(
         ndim = adata.uns["umap_fit"]["fit"]._raw_data.shape[1]
 
         if "X" in adata.obsm_keys():
-            if ndim == adata.obsm["X"].shape[1]:  # lift the dimension up again
+            if ndim == adata.obsm[DKM.X_PCA].shape[1]:  # lift the dimension up again
                 exprs = adata.uns["pca_fit"].inverse_transform(prediction)
 
         if adata.var.use_for_dynamics.sum() == exprs.shape[1]:
@@ -211,7 +212,7 @@ def _fate(
     interpolation_num: int = 250,
     average: bool = True,
     sampling: str = "arc_length",
-    cores:int = 1,
+    cores: int = 1,
 ) -> Tuple[np.ndarray, np.ndarray]:
     """Predict the historical and future cell transcriptomic states over arbitrary time scales by integrating vector
     field functions from one or a set of initial cell state(s).
diff --git a/dynamo/tools/cell_velocities.py b/dynamo/tools/cell_velocities.py
index 822cb5f32..d3c65e5e9 100755
--- a/dynamo/tools/cell_velocities.py
+++ b/dynamo/tools/cell_velocities.py
@@ -13,6 +13,7 @@
 from sklearn.decomposition import PCA
 from sklearn.utils import sparsefuncs
 
+from ..configuration import DKM
 from ..dynamo_logger import LoggerManager, main_info, main_warning
 from ..utils import areinstance
 from .connectivity import _gen_neighbor_keys, adj_to_knn, check_and_recompute_neighbors
@@ -530,7 +531,7 @@ def cell_velocities(
             adata, pca_fit, X_pca = pca(adata, CM, n_pca_components, "X", return_all=True)
             adata.uns["pca_fit"] = pca_fit
 
-        X_pca, pca_fit = adata.obsm["X"], adata.uns["pca_fit"]
+        X_pca, pca_fit = adata.obsm[DKM.X_PCA], adata.uns["pca_fit"]
         V = adata[:, adata.var.use_for_dynamics.values].layers[vkey] if vkey in adata.layers.keys() else None
         CM, V = CM.A if sp.issparse(CM) else CM, V.A if sp.issparse(V) else V
         V[np.isnan(V)] = 0
diff --git a/dynamo/tools/dynamics.py b/dynamo/tools/dynamics.py
index 977c623ca..c5b51c7cf 100755
--- a/dynamo/tools/dynamics.py
+++ b/dynamo/tools/dynamics.py
@@ -344,9 +344,6 @@ def dynamics(
                 del adata.layers[i]
             main_info("making adata smooth...", indent_level=2)
 
-            if filter_gene_mode.lower() == "final" and "X_pca" in adata.obsm.keys():
-                adata.obsm["X"] = adata.obsm["X_pca"]
-
             if group is not None and group in adata.obs.columns:
                 moments(adata, genes=valid_bools, group=group)
             else:
diff --git a/dynamo/tools/metric_velocity.py b/dynamo/tools/metric_velocity.py
index 0d01018b0..05f1dcd07 100755
--- a/dynamo/tools/metric_velocity.py
+++ b/dynamo/tools/metric_velocity.py
@@ -12,6 +12,7 @@
 from sklearn.neighbors import NearestNeighbors
 from tqdm import tqdm
 
+from ..configuration import DKM
 from ..dynamo_logger import LoggerManager, main_critical, main_info, main_warning
 from .connectivity import (
     adj_to_knn,
@@ -114,7 +115,7 @@ def cell_wise_confidence(
         )
 
     n_neigh = n_neigh[0] if type(n_neigh) == np.ndarray else n_neigh
-    n_pca_components = adata.obsm["X"].shape[1]
+    n_pca_components = adata.obsm[DKM.X_PCA].shape[1]
 
     finite_inds = get_finite_inds(V, 0)
     X, V = X[:, finite_inds], V[:, finite_inds]
diff --git a/dynamo/tools/moments.py b/dynamo/tools/moments.py
index 3517c9754..2712406bc 100755
--- a/dynamo/tools/moments.py
+++ b/dynamo/tools/moments.py
@@ -8,7 +8,7 @@
 
 from ..configuration import DKM, DynamoAdataKeyManager
 from ..dynamo_logger import LoggerManager
-from ..preprocessing.utils import pca, normalize_mat_monocle, sz_util
+from ..preprocessing.utils import normalize_mat_monocle, pca, sz_util
 from ..utils import copy_adata
 from .connectivity import mnn, normalize_knn_graph, umap_conn_indices_dist_embedding
 from .utils import elem_prod, get_mapper, inverse_norm
@@ -98,17 +98,13 @@ def moments(
             if X_data is not None:
                 X = X_data
             else:
-                if "X" not in adata.obsm.keys():
+                if DKM.X_PCA not in adata.obsm.keys():
                     if not any([i.startswith("X_") for i in adata.layers.keys()]):
-                        from ..preprocessing.preprocess import recipe_monocle
+                        from ..preprocessing import Preprocessor
 
                         genes_to_use = adata.var_names[genes] if genes.dtype == "bool" else genes
-                        recipe_monocle(
-                            adata,
-                            genes_to_use=genes_to_use,
-                            num_dim=n_pca_components,
-                        )
-                        adata.obsm["X"] = adata.obsm["X_pca"]
+                        preprocessor = Preprocessor(force_gene_list=genes_to_use)
+                        preprocessor.preprocess_adata(adata, recipe="monocle")
                     else:
                         CM = adata.X if genes is None else adata[:, genes].X
                         cm_genesums = CM.sum(axis=0)
@@ -124,7 +120,7 @@ def moments(
 
                         adata.uns["explained_variance_ratio_"] = fit.explained_variance_ratio_[1:]
 
-                X = adata.obsm["X"][:, :n_pca_components]
+                X = adata.obsm[DKM.X_PCA][:, :n_pca_components]
 
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore")
@@ -186,7 +182,9 @@ def moments(
         layer_x = adata.layers[layer].copy()
         matched_x_group_indices = np.where([layer in x for x in [only_splicing, only_labeling, splicing_and_labeling]])
         if len(matched_x_group_indices[0]) == 0:
-            logger.warning(f"layer {layer} is not in any of the {only_splicing, only_labeling, splicing_and_labeling} groups, skipping...")
+            logger.warning(
+                f"layer {layer} is not in any of the {only_splicing, only_labeling, splicing_and_labeling} groups, skipping..."
+            )
             continue
         layer_x_group = matched_x_group_indices[0][0]
         layer_x = inverse_norm(adata, layer_x)
@@ -198,9 +196,13 @@ def moments(
                 else (conn.dot(layer_x), conn)
             )
         for layer2 in layers[i:]:
-            matched_y_group_indices = np.where([layer2 in x for x in [only_splicing, only_labeling, splicing_and_labeling]])
+            matched_y_group_indices = np.where(
+                [layer2 in x for x in [only_splicing, only_labeling, splicing_and_labeling]]
+            )
             if len(matched_y_group_indices[0]) == 0:
-                logger.warning(f"layer {layer2} is not in any of the {only_splicing, only_labeling, splicing_and_labeling} groups, skipping...")
+                logger.warning(
+                    f"layer {layer2} is not in any of the {only_splicing, only_labeling, splicing_and_labeling} groups, skipping..."
+                )
                 continue
             layer_y = adata.layers[layer2].copy()
 
diff --git a/dynamo/tools/velocyto_scvelo.py b/dynamo/tools/velocyto_scvelo.py
index 17c99ed11..64849d647 100755
--- a/dynamo/tools/velocyto_scvelo.py
+++ b/dynamo/tools/velocyto_scvelo.py
@@ -13,6 +13,8 @@
 import pandas as pd
 from scipy.sparse import csr_matrix
 
+from ..configuration import DKM
+
 
 def vlm_to_adata(
     vlm, n_comps: int = 30, basis: str = "umap", trans_mats: Optional[dict] = None, cells_ixs: List[int] = None
@@ -88,7 +90,7 @@ def vlm_to_adata(
 
     # set obsm
     obsm = {}
-    obsm["X"] = vlm.pcs[:, : min(n_comps, vlm.pcs.shape[1])]
+    obsm[DKM.X_PCA] = vlm.pcs[:, : min(n_comps, vlm.pcs.shape[1])]
     # set basis and velocity on the basis
     if basis is not None:
         obsm["X_" + basis] = vlm.ts
diff --git a/tests/test_preprocess.py b/tests/test_preprocess.py
index 20d5d17c3..1bea49b7d 100644
--- a/tests/test_preprocess.py
+++ b/tests/test_preprocess.py
@@ -358,6 +358,6 @@ def test_regress_out():
     # test_highest_frac_genes_plot_prefix_list(adata.copy())
     # test_recipe_monocle_feature_selection_layer_simple0()
     # test_gene_selection_method()
-    test_normalize()
+    # test_normalize()
     # test_regress_out()
     pass