Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions ingest/expression_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"""
from __future__ import annotations

import logging
import os
import re
import multiprocessing
Expand Down Expand Up @@ -69,7 +70,8 @@ def __init__(
timestamp = datetime.datetime.now().isoformat(sep="T", timespec="seconds")
url_safe_timestamp = re.sub(':', '', timestamp)
log_name = f"expression_scatter_data_{url_safe_timestamp}_log.txt"
self.dev_logger = setup_logger(__name__, log_name, format="support_configs")
self.log_name = log_name
self.dev_logger = setup_logger(__name__, log_name, level=logging.INFO, format="support_configs")

def get_storage_bucket_name(self):
"""
Expand Down Expand Up @@ -110,7 +112,7 @@ def get_file_seek_points(self) -> list[list]:
if current_byte == '': # eof
current_seek.append(file_size)
seek_points.append(current_seek)
break
return seek_points
while current_byte != "\n":
current_byte = matrix_file.read(1)
seek_point += 1
Expand Down Expand Up @@ -213,14 +215,16 @@ def read_dense_matrix_slice(self, indexes, matrix_cells, cluster_cells, data_dir
:param data_dir: (str) name of output dir
"""
start_pos, end_pos = indexes
self.dev_logger.info(f" reading {self.local_matrix_path} at index {start_pos}:{end_pos}")
with open_file(self.local_matrix_path)[0] as matrix_file:
current_pos = start_pos
matrix_file.seek(current_pos)
while current_pos < end_pos:
line = matrix_file.readline()
process_dense_line(line, matrix_cells, cluster_cells, data_dir)
current_pos += len(line)
if line == '': # eof
break
else:
process_dense_line(line, matrix_cells, cluster_cells, data_dir)
current_pos += len(line)

def render_artifacts(self):
"""
Expand Down Expand Up @@ -254,7 +258,7 @@ def render_artifacts(self):

def delocalize_outputs(self, cluster_name):
"""
Copy all output files to study bucket in parallel using gsutil (since there are usually ~25-30K files)
Write all output files back to source bucket with Content-Encoding: gzip header

:param cluster_name: (str) encoded name of cluster
"""
Expand All @@ -265,9 +269,9 @@ def delocalize_outputs(self, cluster_name):
files_to_push = list(file for file in dir_files if 'gene_entries' not in file)
for file in files_to_push:
local_path = f"{cluster_name}/{file}"
IngestFiles.delocalize_file(None, None, self.matrix_file_path, local_path, f"{bucket_path}/{file}")
IngestFiles.delocalize_file(
None, None, self.matrix_file_path, local_path, f"{bucket_path}/{file}", 'gzip'
)
self.dev_logger.info(" push completed")
handler = self.dev_logger.handlers[0]
log_filename = handler.baseFilename.split("/").pop()
IngestFiles.delocalize_file(None, None, self.matrix_file_path, log_filename, f"parse_logs/{log_filename}")
IngestFiles.delocalize_file(None, None, self.matrix_file_path, self.log_name, f"parse_logs/{self.log_name}")

6 changes: 4 additions & 2 deletions ingest/ingest_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,14 +170,14 @@ def reset_file(self, file_path, start_point, open_as=None):

@staticmethod
def delocalize_file(
study_file_id, study_id, file_path, file_to_delocalize, bucket_destination
study_file_id, study_id, file_path, file_to_delocalize, bucket_destination, content_encoding=None
):
"""Writes local file to Google bucket
Args:
file_path: path of an ingest file (MUST BE GS url)
file_to_delocalize: name of local file to delocalize (ie. errors.txt)
bucket_destination: path to google bucket (ie. parse_logs/{study_file_id}/errors.txt)

content_encoding: set Content-Encoding header, if specified
"""

if IngestFiles.is_remote_file(file_path):
Expand All @@ -187,6 +187,8 @@ def delocalize_file(
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(bucket_destination)
if content_encoding is not None:
blob.content_encoding = content_encoding
blob.upload_from_filename(file_to_delocalize)
IngestFiles.dev_logger.info(
f"File {file_to_delocalize} uploaded to {bucket_destination}."
Expand Down
5 changes: 3 additions & 2 deletions ingest/writer_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ def process_dense_line(line, matrix_cells, cluster_cells, data_dir):
filtered_expression = filter_expression_for_cluster(
cluster_cells, matrix_cells, exp_vals
)
write_gene_scores(gene_name, filtered_expression, data_dir)
if gene_name:
write_gene_scores(gene_name, filtered_expression, data_dir)

def filter_expression_for_cluster(cluster_cells, exp_cells, exp_scores) -> list:
"""
Expand All @@ -197,5 +198,5 @@ def write_gene_scores(gene_name, exp_values, data_dir):
:param exp_values: (list) expression values
:param data_dir: (str) name of output dir
"""
with gzip.open(f"{data_dir}/{gene_name}.json.gz", "wt") as file:
with gzip.open(f"{data_dir}/{gene_name}.json", "wt") as file:
json.dump(list(exp_values), file, separators=(',', ':'))
132 changes: 132 additions & 0 deletions tests/data/expression_writer/slice_testing/cluster.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
NAME X Y Category Intensity
TYPE numeric numeric group numeric
BA_1 70.06229534 15.4750564 A 1
BA_2 8.806899522 123.2885491 A 1
BA_3 24.6668434 16.74763274 A 1
BA_4 140.9592493 20.79997729 A 1
BA_5 148.4521712 21.60995118 A 1
BA_6 1.517014096 0.485673286 A 1
BA_7 167.2990346 13.72250288 A 1
BA_8 185.5424524 5.845698625 A 1
BA_9 161.52395382 0.761101364 A 1
BA_10 146.7656655 24.58489322 A 1
BA_11 159.75926361 11.57313875 A 1
BA_12 145.08338493 10.79800326 A 1
BA_13 150.9113465 4.322737256 A 1
BA_14 172.1772514 9.328130584 A 1
BA_15 163.9284063 9.44714249 A 1
BA_16 156.964066 27.79164097 A 1
BA_17 142.79662488 1.678858174 A 1
BA_18 157.0374717 34.70952586 A 2
BA_19 31.2173586 129.10474489 A 2
BA_20 46.5505305 145.53499128 A 2
BA_21 31.87778438 121.80205008 A 2
BA_22 11.58075367 124.215546281 A 2
BA_23 14.2254167 146.38115435 A 2
BA_24 17.1623056 136.03986355 A 2
BA_25 9.256746399 152.66900506 A 2
BA_26 92.88195962 128.85942001 A 2
BA_27 21.0534343 142.28349177 A 2
BA_28 33.2161642 140.11622647 A 2
BA_29 20.7751554 155.950639146 A 2
BA_30 10.07873261 128.07937405 A 2
BA_31 8.4942204 154.97189474 A 2
BA_32 150.9208831 11.44725646 A 2
BA_33 0.424210306 49.21867232 A 2
BA_34 71.30275388 5.994743174 A 2
BA_35 127.2545029 47.24387824 A 3
BA_36 98.83813661 0.718426176 A 3
BA_37 132.2584735 31.57533535 A 3
BA_38 67.38111152 1.119845514 A 3
BA_39 139.7403707 41.34434324 A 3
BA_40 85.78861589 58.96233194 A 3
BA_41 78.57174785 63.72167007 A 3
BA_42 132.5204335 56.63684983 A 3
BA_43 32.64257016 15.23518802 A 3
BA_44 78.4197206 3.1847521 A 3
BA_45 81.36090575 18.05646077 A 3
BA_46 61.54906 6.837187755 A 3
BA_47 104.5976551 74.28083691 A 3
BA_48 131.9139686 33.24419774 A 3
BA_49 118.1952445 58.94229754 A 3
BA_50 94.69533307 38.20265588 A 3
BA_51 98.54061104 58.5908845 A 3
BA_52 61.97581777 7.333870573 A 3
BA_53 23.50683238 24.53014934 A 3
BA_54 87.07327242 75.21382467 A 4
BA_55 10.47288015 48.46996628 A 4
BA_56 116.565805 0.859084757 A 4
BA_57 66.01394942 84.14061223 A 4
BA_58 24.0864559 19.26830177 A 4
BA_59 33.90947813 4.861695771 A 4
BA_60 25.04189822 10.04715776 A 4
BA_61 25.36921453 24.74097752 A 4
BA_62 24.61156152 38.57784862 A 4
BA_63 10.1522579 5.79175452 A 4
BA_64 13.0302349 35.84194255 A 4
BA_65 26.011124488 32.12754131 A 4
BA_66 5.0803395 18.15514097 A 4
BA_67 19.28749831 23.65124583 B 4
BA_68 19.8382996 33.18104936 B 4
BA_69 27.20239256 12.31891513 B 4
BA_70 35.13339329 3.898564134 B 4
BA_71 6.61786492 28.80320591 B 5
BA_72 8.42778854 17.97710953 B 5
BA_73 11.4551651 12.31783043 B 5
BA_74 49.3611518 47.59145243 B 5
BA_75 30.68246176 14.84335078 B 5
BA_76 19.24083924 8.548619948 B 5
BA_77 33.91691748 42.74321772 B 5
BA_78 9.47015852 16.02367435 B 5
BA_79 10.08003693 19.65298385 B 5
BA_80 28.11169025 20.92169851 B 5
BA_81 36.51370152 15.63933441 B 5
BA_82 42.91618048 11.72576369 B 5
BA_83 43.59390095 93.67225673 B 5
BA_84 14.09804905 31.58528132 B 5
BA_85 17.40598078 100.9513669 B 5
BA_86 75.64724783 100.168992 B 5
BA_87 90.80427682 40.88218955 B 5
BA_88 51.61088169 98.32832101 B 5
BA_89 65.43895097 76.10172565 B 6
BA_90 81.22617002 22.83792967 B 6
BA_91 64.41566946 92.39419771 B 6
BA_92 83.356384 25.27722813 B 6
BA_93 6.58784847 97.72445533 B 6
BA_94 10.22712912 96.28708028 B 6
BA_95 25.30950773 89.08937691 B 6
BA_96 5.064443216 70.27891805 B 6
BA_97 33.31745272 5.358689646 B 6
BA_98 30.75465638 104.2346434 B 6
BA_99 3.167842064 20.56360819 B 6
BA_100 83.58125149 13.6926983 B 6
BA_101 41.45546199 64.79995099 B 6
BA_102 14.97411131 3.197034022 B 6
BA_103 56.01933189 110.7914201 B 6
BA_104 67.84116215 106.7649515 B 6
BA_105 77.87455225 32.3238432 B 6
BA_106 55.57453701 23.04472008 B 6
BA_107 56.34315699 40.66431908 C 7
BA_108 32.17126326 79.48627703 C 7
BA_109 61.05967231 102.9731 C 7
BA_110 34.70836803 18.10615973 C 7
BA_111 16.4597735 41.43657929 C 7
BA_112 21.30203917 47.79592699 C 7
BA_113 29.53413987 26.28481726 C 7
BA_114 4.085451703 54.43549834 C 7
BA_115 24.78908857 98.16155471 C 7
BA_116 65.59349525 23.72494654 C 7
BA_117 13.71881681 120.8683135 C 7
BA_118 59.74758488 81.45964404 C 7
BA_119 48.80242017 37.60205067 C 7
BA_120 72.00492188 135.9106901 C 8
BA_121 40.0077021 143.214796 C 8
BA_122 54.78365687 36.69922926 C 8
BA_123 28.47215303 149.3005494 C 8
BA_124 20.97539554 116.0357647 C 8
BA_125 44.68091852 22.24667467 C 8
BA_126 45.51897826 56.59054585 C 8
BA_127 34.42198152 143.67498 C 8
BA_128 1.032447791 78.83679245 C 8
BA_129 65.79875582 111.0088786 C 8
BA_130 66.44434159 82.45952119 C 8
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
GENE BA_1 BA_2 BA_3 BA_4 BA_5 BA_6 BA_7 BA_8 BA_9 BA_10 BA_11 BA_12 BA_13 BA_14 BA_15 BA_16 BA_17 BA_18 BA_19 BA_20 BA_21 BA_22 BA_23 BA_24 BA_25 BA_26 BA_27 BA_28 BA_29 BA_30 BA_31 BA_32 BA_33 BA_34 BA_35 BA_36 BA_37 BA_38 BA_39 BA_40 BA_41 BA_42 BA_43 BA_44 BA_45 BA_46 BA_47 BA_48 BA_49 BA_50 BA_51 BA_52 BA_53 BA_54 BA_55 BA_56 BA_57 BA_58 BA_59 BA_60 BA_61 BA_62 BA_63 BA_64 BA_65 BA_66 BA_67 BA_68 BA_69 BA_70 BA_71 BA_72 BA_73 BA_74 BA_75 BA_76 BA_77 BA_78 BA_79 BA_80 BA_81 BA_82 BA_83 BA_84 BA_85 BA_86 BA_87 BA_88 BA_89 BA_90 BA_91 BA_92 BA_93 BA_94 BA_95 BA_96 BA_97 BA_98 BA_99 BA_100 BA_101 BA_102 BA_103 BA_104 BA_105 BA_106 BA_107 BA_108 BA_109 BA_110 BA_111 BA_112 BA_113 BA_114 BA_115 BA_116 BA_117 BA_118 BA_119 BA_120 BA_121 BA_122 BA_123 BA_124 BA_125 BA_126 BA_127 BA_128 BA_129 BA_130
Adcy5 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Agpat2 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 7 8 6 6 7 6 0 6 0 6 6 6 6 6 0 6 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Agtr1 6 6 6 6 6 6 6 6 6 6 6 6 4 4 5 5 8 6 6 6 6 6 6 6 6 5 4 6 6 6 4 6 6 6 4 6 7 6 6 4 6 6 8 7 5 6 6 5 6 6 6 6 6 6 8 7 6 7 7 6 6 6 7 6 5 6 8 7 6 6 7 5 6 7 6 8 6 7 6 6 5 6 2 6 6 3 6 4 6 5 6 7 6 9 6 9 6 7 6 6 5 6 6 2 6 4 6 9 6 6 6 7 6 6 6 5 6 6 6 3 6 6 6 3 6 6 6 5 6 6
Aifm1 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Apex1 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Apoc3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
Apoe 0 0 4 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
22 changes: 11 additions & 11 deletions tests/test_expression_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def seed_test_gene_entries(data_dir):

@classmethod
def teardown_class(cls):
logs = glob.glob('expression_scatter_images_*_log.txt')
logs = glob.glob('expression_scatter_data_*_log.txt')
for log in logs:
os.remove(log)
test_dirs = glob.glob(f"{TestExpressionWriter.TEST_PREFIX}*")
Expand All @@ -60,13 +60,13 @@ def test_process_dense_matrix(self):
os.path.exists(cluster_name)
)
self.assertTrue(
os.path.exists(f"{cluster_name}/Sergef.json.gz")
os.path.exists(f"{cluster_name}/Sergef.json")
)
self.assertTrue(
os.path.exists(f"{cluster_name}/Itm2a.json.gz")
os.path.exists(f"{cluster_name}/Itm2a.json")
)
expected_data = json.loads(open(f"data/expression_writer/Sergef.json").read())
rendered_data = json.loads(gzip.open(f"{cluster_name}/Sergef.json.gz").read())
rendered_data = json.loads(gzip.open(f"{cluster_name}/Sergef.json").read())
self.assertEqual(
expected_data, rendered_data
)
Expand All @@ -82,13 +82,13 @@ def test_process_sparse_matrix(self):
genes.remove('HOMER2') # doesn't render gene entry file
for gene in genes:
self.assertTrue(
os.path.exists(f"{cluster_name}/{gene}.json.gz")
os.path.exists(f"{cluster_name}/{gene}.json")
)
self.assertTrue(
os.path.exists(f"{cluster_name}/gene_entries/{gene}__entries.txt")
)
expected_data = json.loads(open(f"data/writer_functions/OXCT2.json").read())
rendered_data = json.loads(gzip.open(f"{cluster_name}/OXCT2.json.gz").read())
expected_data = json.loads(open(f"data/writer_functions/OXCT2.orig.json").read())
rendered_data = json.loads(gzip.open(f"{cluster_name}/OXCT2.json").read())
self.assertEqual(
expected_data, rendered_data
)
Expand Down Expand Up @@ -168,7 +168,7 @@ def test_process_sparse_data_fragments(self):
genes.remove('HOMER2')
for gene in genes:
self.assertTrue(
os.path.exists(f"{cluster_name}/{gene}.json.gz")
os.path.exists(f"{cluster_name}/{gene}.json")
)

def test_write_empty_sparse_genes(self):
Expand All @@ -180,7 +180,7 @@ def test_write_empty_sparse_genes(self):
genes = load_entities_as_list(open(exp_writer.gene_file))
exp_writer.write_empty_sparse_genes(genes, num_cells, cluster_name)
# only empty gene should be HOMER2
gene = 'HOMER2.json.gz'
gene = 'HOMER2.json'
self.assertTrue(
os.path.exists(f"{cluster_name}/{gene}")
)
Expand All @@ -198,8 +198,8 @@ def test_read_dense_matrix_slice(self):
cells = list(f"CELL_000{i}" for i in range(1, 16))
exp_writer.read_dense_matrix_slice(indexes, cells, cells, cluster_name)
self.assertTrue(
os.path.exists(f"{cluster_name}/Sergef.json.gz")
os.path.exists(f"{cluster_name}/Sergef.json")
)
self.assertTrue(
os.path.exists(f"{cluster_name}/Itm2a.json.gz")
os.path.exists(f"{cluster_name}/Itm2a.json")
)
12 changes: 6 additions & 6 deletions tests/test_writer_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ def test_process_sparse_fragment(self):
# barcodes & cluster cells should be identical in this example
process_sparse_fragment('OXCT2__entries.txt', barcodes, barcodes, data_dir)
self.assertTrue(
os.path.exists(f"{data_dir}/OXCT2.json.gz")
os.path.exists(f"{data_dir}/OXCT2.json")
)
rendered_data = json.loads(gzip.open(f"{data_dir}/OXCT2.json.gz").read())
expected_data = json.loads(open(f"{data_dir}/OXCT2.json").read())
rendered_data = json.loads(gzip.open(f"{data_dir}/OXCT2.json").read())
expected_data = json.loads(open(f"{data_dir}/OXCT2.orig.json").read())
self.assertEqual(
expected_data, rendered_data
)
Expand All @@ -133,9 +133,9 @@ def test_process_dense_line(self):
data_dir = 'data/writer_functions'
process_dense_line(line, matrix_cells, cluster_cells, data_dir)
self.assertTrue(
os.path.exists(f"{data_dir}/Gad1.json.gz")
os.path.exists(f"{data_dir}/Gad1.json")
)
rendered_data = json.loads(gzip.open(f"{data_dir}/Gad1.json.gz").read())
rendered_data = json.loads(gzip.open(f"{data_dir}/Gad1.json").read())
self.assertEqual(
expected_data, rendered_data
)
Expand All @@ -162,7 +162,7 @@ def test_write_gene_scores(self):
gene = 'Egfr'
data_dir = 'data/writer_functions'
write_gene_scores(gene, data, data_dir)
rendered_data = json.loads(gzip.open(f"{data_dir}/{gene}.json.gz").read())
rendered_data = json.loads(gzip.open(f"{data_dir}/{gene}.json").read())
self.assertEqual(
data, rendered_data
)