broadinstitute · bistline · Dec 15, 2022 · Dec 5, 2022 · Dec 8, 2022 · Dec 8, 2022
diff --git a/ingest/anndata_.py b/ingest/anndata_.py
@@ -84,7 +84,7 @@ def generate_cluster_body(adata, clustering_name):
             [cluster_cells, pd.DataFrame(adata.obsm[clustering_name])], axis=1
         )
         pd.DataFrame(cluster_body).to_csv(
-            f"{clustering_name}.cluster.anndata_segment.tsv",
+            AnnDataIngestor.set_output_filename(clustering_name),
             sep="\t",
             mode="a",
             header=None,
@@ -94,9 +94,13 @@ def generate_cluster_body(adata, clustering_name):
     @staticmethod
     def files_to_delocalize(arguments):
         # ToDo - check if names using obsm_keys need sanitization
-        cluster_file_names = [name + ".tsv" for name in arguments["obsm_keys"]]
+        cluster_file_names = [AnnDataIngestor.set_output_filename(name) for name in arguments["obsm_keys"]]
         return cluster_file_names
 
+    @staticmethod
+    def set_output_filename(name):  # builds the cluster TSV filename for `name`; nothing is written or stored
+        return f"{name}.cluster.anndata_segment.tsv"
+
     @staticmethod
     def delocalize_cluster_files(file_path, study_file_id, files_to_delocalize):
         """ Copy cluster files to study bucket

diff --git a/tests/data/anndata/trimmed_compliant_pbmc3K.h5ad b/tests/data/anndata/trimmed_compliant_pbmc3K.h5ad
diff --git a/tests/data/annotation/metadata/convention/valid_cell_type__custom_v2.2.1.txt b/tests/data/annotation/metadata/convention/valid_cell_type__custom_v2.2.1.txt
@@ -1,7 +1,7 @@
 NAME	disease__time_since_onset	disease__time_since_onset__unit	organ_region	organ_region__ontology_label	donor	disease__treated	species	species__ontology_label	geographical_region	geographical_region__ontology_label	library_preparation_protocol	library_preparation_protocol__ontology_label	organ	organ__ontology_label	sex	is_living	organism_age__unit	organism_age__unit_label	ethnicity__ontology_label	ethnicity	organism_age	disease	disease__ontology_label	cell_type	cell_type__ontology_label	donor_id	biosample_id	biosample_type	preservation_method	cell_type__custom
 TYPE	numeric	group	group	group	group	group	group	group	group	group	group	group	group	group	group	group	group	group	group	group	numeric	group	group	group	group	group	group	group	group	group
-BM01_16dpp_AAGCAGTGGTAT	12|2	UO_0000035	MBA:000000944	Folium-tuber vermis (VII)	BM01	False|False	NCBITaxon_9606	human	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year	European	HANCESTRO_0005	31	MONDO_0005015|MONDO_0006849	diabetes|mastitis	CL_0000066	epithelial cell	BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	epithelial 
-BM01_16dpp_TAAGCAGTGGTA	1	UO_0000035	MBA:000000302|MBA:000000294|MBA:000000795	"Superior colliculus, sensory related|Superior colliculus, motor related|Periaqueductal gray"	BM01	FALSE	NCBITaxon_9606	Homo Sapiens	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year	white	HANCESTRO_0005	31	MONDO_0005709	common cold	CL_0000066	epithelial cell	BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	epithelial 
-BM01_16dpp_CTAAGCAGTGGT	24|2	UO_0000035	MBA:000000714|MBA:000000972		BM01	True|False	NCBITaxon_9606	Homo sapiens	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year	British	HANCESTRO_0462	31	MONDO_0005015|MONDO_0005709	diabetes mellitus|common cold	CL_0000066	epithelial cell	BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	epithelial 
-BM01_16dpp_CGGTAAACCATT	36|3|1	UO_0000035	MBA:000001041	Paraflocculus	BM01	True|False|False	NCBITaxon_9606	Homo sapiens	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year		HANCESTRO_0462	31	MONDO_0005015|MONDO_0006849|MONDO_0005709	diabetes|breast infection|common cold			BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	sub-epithelial 
-BM01_16dpp_CCGAATTCACCG	0	UO_0000035	MBA:000000909|MBA:000000502	Entorhinal area|Subiculum	BM01	FALSE	NCBITaxon_9606	Homo sapiens	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year	Caucasian	HANCESTRO_0005	31	MONDO_0000001	disease or disorder			BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	sub-epithelial 
+BM01_16dpp_AAGCAGTGGTAT	12|2	UO_0000035	MBA:000000944	Folium-tuber vermis (VII)	BM01	False|False	NCBITaxon_9606	human	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year	European	HANCESTRO_0005	31	MONDO_0005015|MONDO_0006849	diabetes|mastitis	CL_0000066	epithelial cell	BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	epithelial
+BM01_16dpp_TAAGCAGTGGTA	1	UO_0000035	MBA:000000302|MBA:000000294|MBA:000000795	"Superior colliculus, sensory related|Superior colliculus, motor related|Periaqueductal gray"	BM01	FALSE	NCBITaxon_9606	Homo Sapiens	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year	white	HANCESTRO_0005	31	MONDO_0005709	common cold	CL_0000066	epithelial cell	BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	epithelial
+BM01_16dpp_CTAAGCAGTGGT	24|2	UO_0000035	MBA:000000714|MBA:000000972		BM01	True|False	NCBITaxon_9606	Homo sapiens	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year	British	HANCESTRO_0462	31	MONDO_0005015|MONDO_0005709	diabetes mellitus|common cold	CL_0000066	epithelial cell	BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	epithelial
+BM01_16dpp_CGGTAAACCATT	36|3|1	UO_0000035	MBA:000001041	Paraflocculus	BM01	True|False|False	NCBITaxon_9606	Homo sapiens	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year		HANCESTRO_0462	31	MONDO_0005015|MONDO_0006849|MONDO_0005709	diabetes|breast infection|common cold			BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	sub-epithelial
+BM01_16dpp_CCGAATTCACCG	0	UO_0000035	MBA:000000909|MBA:000000502	Entorhinal area|Subiculum	BM01	FALSE	NCBITaxon_9606	Homo sapiens	GAZ_00003181	Boston	EFO_0008919	Seq-Well	UBERON_0001913	milk	female	yes	UO_0000036	year	Caucasian	HANCESTRO_0005	31	MONDO_0000001	disease or disorder			BM01	BM01_16dpp_r3	PrimaryBioSample_BodyFluid	Fresh	sub-epithelial
diff --git a/tests/test_anndata.py b/tests/test_anndata.py
@@ -4,28 +4,37 @@
 
 import unittest
 import sys
+import os
+from unittest.mock import patch
 
 sys.path.append("../ingest")
 from anndata_ import AnnDataIngestor
-
+from ingest_files import IngestFiles
 
 class TestAnnDataIngestor(unittest.TestCase):
+
+    @classmethod
+    def setup_class(cls):
+        # pytest xunit-style hook: receives the class, so fixtures below are class attributes shared by all tests
+        filepath_valid = "../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad"
+        cls.study_id = "addedfeed000000000000000"
+        cls.study_file_id = "dec0dedfeed0000000000000"
+        cls.valid_args = [filepath_valid, cls.study_id, cls.study_file_id]
+        cls.invalid_args = ["../tests/data/anndata/bad.h5", cls.study_id, cls.study_file_id]
+        cls.cluster_name = 'X_tsne'
+        cls.valid_kwargs = {'obsm_keys': [cls.cluster_name]}
+        cls.anndata_ingest = AnnDataIngestor(*cls.valid_args, **cls.valid_kwargs)
+        cls.output_filename = f"{cls.cluster_name}.cluster.anndata_segment.tsv"
+
+    def teardown_method(self, _):
+        if os.path.isfile(self.output_filename):  # not every test writes the cluster file
+            os.remove(self.output_filename)
+
     def test_minimal_valid_anndata(self):
-        good_input = AnnDataIngestor(
-            "../tests/data/anndata/test.h5ad",
-            "addedfeed000000000000000",
-            "dec0dedfeed0000000000000",
-        )
-        self.assertTrue(
-            good_input.validate(), "expect known good file to open with scanpy"
-        )
+        self.assertTrue(self.anndata_ingest.validate(), "expect known good file to open with scanpy")
 
     def test_truncated_anndata(self):
-        truncated_input = AnnDataIngestor(
-            "../tests/data/anndata/bad.h5",
-            "addedfeed000000000000000",
-            "dec0dedfeed0000000000000",
-        )
+        truncated_input = AnnDataIngestor(*self.invalid_args)
         # passing obtain_data function to assertRaises using lambda
         # otherwise truncated_input.obtain_data() is evaluated and triggers
         # an exception before assertRaises gets called
@@ -39,8 +48,8 @@ def test_truncated_anndata(self):
     def test_input_bad_suffix(self):
         bad_input = AnnDataIngestor(
             "../tests/data/anndata/bad.foo",
-            "addedfeed000000000000000",
-            "dec0dedfeed0000000000000",
+            self.study_id,
+            self.study_file_id,
         )
         # passing obtain_data function to assertRaises using lambda
         # otherwise bad_input.obtain_data() is evaluated and triggers
@@ -52,3 +61,46 @@ def test_input_bad_suffix(self):
         )
         self.assertFalse(bad_input.validate())
 
+    def test_set_output_filename(self):
+        # naming contract: "<cluster name>.cluster.anndata_segment.tsv"
+        self.assertEqual(
+            AnnDataIngestor.set_output_filename("X_Umap"),
+            "X_Umap.cluster.anndata_segment.tsv"
+        )
+
+    def test_generate_cluster_header(self):
+        adata = self.anndata_ingest.obtain_adata()
+        self.anndata_ingest.generate_cluster_header(adata, self.cluster_name)
+        with open(self.output_filename) as cluster_file:
+            self.assertEqual(['NAME', 'X', "Y\n"], cluster_file.readline().split("\t"), "did not find expected headers")
+
+    def test_generate_cluster_type_declaration(self):
+        self.anndata_ingest.generate_cluster_type_declaration(self.anndata_ingest.obtain_adata(), self.cluster_name)
+        with open(self.output_filename) as type_file:  # first line of the output is the TYPE row
+            type_row = type_file.readline().split("\t")
+            self.assertEqual(['TYPE', 'numeric', "numeric\n"], type_row, "did not find expected type declarations")
+
+    def test_generate_cluster_body(self):
+        adata = self.anndata_ingest.obtain_adata()
+        self.anndata_ingest.generate_cluster_body(adata, self.cluster_name)
+        expected_line = ['AAACATACAACCAC-1', '16.009954', "-21.073845\n"]
+        with open(self.output_filename) as cluster_file:
+            self.assertEqual(expected_line, cluster_file.readline().split("\t"), 'did not get expected coordinates from cluster body')
+
+    def test_get_files_to_delocalize(self):
+        # obsm_keys holds a single cluster name, so exactly one output file is expected
+        delocalized = AnnDataIngestor.files_to_delocalize(self.valid_kwargs)
+        self.assertEqual([self.output_filename], delocalized)
+
+    def test_delocalize_files(self):
+        # just create header, no reason to run full extract; the bucket upload itself is mocked out
+        self.anndata_ingest.generate_cluster_header(self.anndata_ingest.obtain_adata(), self.cluster_name)
+        with patch('ingest_files.IngestFiles.delocalize_file'):
+            AnnDataIngestor.delocalize_cluster_files(
+                "gs://fake_bucket", self.study_file_id, AnnDataIngestor.files_to_delocalize(self.valid_kwargs)
+            )
+            self.assertEqual(
+                IngestFiles.delocalize_file.call_count,
+                1,
+                "expected 1 call to delocalize output files",
+            )
diff --git a/tests/test_ingest.py b/tests/test_ingest.py
@@ -46,11 +46,10 @@
     validate_arguments,
     IngestPipeline,
     exit_pipeline,
-    run_ingest,
+    run_ingest
 )
 from expression_files.expression_files import GeneExpression
 
-
 def mock_load(self, *args, **kwargs):
     """Enables overwriting normal function with this placeholder.
     Returning the arguments enables tests to verify that the code invokes
@@ -675,6 +674,27 @@ def test_subsample_no_cell_intersection(self, mock_load_subsample):
             exit_pipeline(ingest, status, status_cell_metadata, arguments)
         self.assertEqual(cm.exception.code, 1)
 
+    def test_extract_cluster_file_from_anndata(self):
+        args = [
+            "--study-id",
+            "5d276a50421aa9117c982845",
+            "--study-file-id",
+            "5dd5ae25421aa910a723a337",
+            "ingest_anndata",
+            "--ingest-anndata",
+            "--extract-cluster",
+            "--anndata-file",
+            "../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad",
+            "--obsm-keys",
+            "['X_tsne']",
+        ]
+        ingest, arguments, status, status_cell_metadata = self.execute_ingest(args)
+        self.assertEqual(len(status), 1)
+        self.assertEqual(status[0], 0)
+        filename = 'X_tsne.cluster.anndata_segment.tsv'
+        self.assertTrue(os.path.isfile(filename))
+        os.remove(filename)  # drop the artifact so a stale file cannot satisfy the check on a later run
+
 
 if __name__ == "__main__":
     unittest.main()