From c8ce0fcf41668743856f5c11fed511345ab81cad Mon Sep 17 00:00:00 2001 From: bistline Date: Wed, 5 Oct 2022 17:46:02 -0400 Subject: [PATCH 1/7] fixing bug with slices landing on line ends --- ingest/expression_writer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/ingest/expression_writer.py b/ingest/expression_writer.py index 044273fb..2333b61e 100755 --- a/ingest/expression_writer.py +++ b/ingest/expression_writer.py @@ -23,6 +23,7 @@ """ from __future__ import annotations +import logging import os import re import multiprocessing @@ -69,7 +70,8 @@ def __init__( timestamp = datetime.datetime.now().isoformat(sep="T", timespec="seconds") url_safe_timestamp = re.sub(':', '', timestamp) log_name = f"expression_scatter_data_{url_safe_timestamp}_log.txt" - self.dev_logger = setup_logger(__name__, log_name, format="support_configs") + self.log_name = log_name + self.dev_logger = setup_logger(__name__, log_name, level=logging.INFO, format="support_configs") def get_storage_bucket_name(self): """ @@ -110,7 +112,7 @@ def get_file_seek_points(self) -> list[list]: if current_byte == '': # eof current_seek.append(file_size) seek_points.append(current_seek) - break + return seek_points while current_byte != "\n": current_byte = matrix_file.read(1) seek_point += 1 @@ -213,12 +215,13 @@ def read_dense_matrix_slice(self, indexes, matrix_cells, cluster_cells, data_dir :param data_dir: (str) name of output dir """ start_pos, end_pos = indexes - self.dev_logger.info(f" reading {self.local_matrix_path} at index {start_pos}:{end_pos}") with open_file(self.local_matrix_path)[0] as matrix_file: current_pos = start_pos matrix_file.seek(current_pos) while current_pos < end_pos: line = matrix_file.readline() + if line == '': # eof + break process_dense_line(line, matrix_cells, cluster_cells, data_dir) current_pos += len(line) From 06319ed11605a357d3aff2469196faa80f0c79da Mon Sep 17 00:00:00 2001 From: bistline Date: Wed, 5 Oct 2022 17:56:20 -0400 Subject: [PATCH 2/7] prevent writing empty genes --- ingest/expression_writer.py | 5 +++-- ingest/writer_functions.py | 3 ++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/ingest/expression_writer.py b/ingest/expression_writer.py index 2333b61e..6b54570e 100755 --- a/ingest/expression_writer.py +++ b/ingest/expression_writer.py @@ -222,8 +222,9 @@ def read_dense_matrix_slice(self, indexes, matrix_cells, cluster_cells, data_dir line = matrix_file.readline() if line == '': # eof break - process_dense_line(line, matrix_cells, cluster_cells, data_dir) - current_pos += len(line) + else: + process_dense_line(line, matrix_cells, cluster_cells, data_dir) + current_pos += len(line) def render_artifacts(self): """ diff --git a/ingest/writer_functions.py b/ingest/writer_functions.py index d40baafd..28410606 100644 --- a/ingest/writer_functions.py +++ b/ingest/writer_functions.py @@ -174,7 +174,8 @@ def process_dense_line(line, matrix_cells, cluster_cells, data_dir): filtered_expression = filter_expression_for_cluster( cluster_cells, matrix_cells, exp_vals ) - write_gene_scores(gene_name, filtered_expression, data_dir) + if gene_name: + write_gene_scores(gene_name, filtered_expression, data_dir) def filter_expression_for_cluster(cluster_cells, exp_cells, exp_scores) -> list: """ From 3cca0ae55d4f734f77cf0627f2984f6e07620741 Mon Sep 17 00:00:00 2001 From: bistline Date: Wed, 5 Oct 2022 18:04:34 -0400 Subject: [PATCH 3/7] adding test data for slice bug --- .../slice_testing/cluster.tsv | 132 ++++++++++++++++++ .../slice_testing/expression_matrix.tsv | 8 ++ 2 files changed, 140 insertions(+) create mode 100644 tests/data/expression_writer/slice_testing/cluster.tsv create mode 100644 tests/data/expression_writer/slice_testing/expression_matrix.tsv diff --git a/tests/data/expression_writer/slice_testing/cluster.tsv b/tests/data/expression_writer/slice_testing/cluster.tsv new file mode 100644 index 00000000..d863a7f0 --- /dev/null +++ b/tests/data/expression_writer/slice_testing/cluster.tsv @@ -0,0 +1,132 @@ +NAME X Y Category Intensity +TYPE numeric numeric group numeric +BA_1 70.06229534 15.4750564 A 1 +BA_2 8.806899522 123.2885491 A 1 +BA_3 24.6668434 16.74763274 A 1 +BA_4 140.9592493 20.79997729 A 1 +BA_5 148.4521712 21.60995118 A 1 +BA_6 1.517014096 0.485673286 A 1 +BA_7 167.2990346 13.72250288 A 1 +BA_8 185.5424524 5.845698625 A 1 +BA_9 161.52395382 0.761101364 A 1 +BA_10 146.7656655 24.58489322 A 1 +BA_11 159.75926361 11.57313875 A 1 +BA_12 145.08338493 10.79800326 A 1 +BA_13 150.9113465 4.322737256 A 1 +BA_14 172.1772514 9.328130584 A 1 +BA_15 163.9284063 9.44714249 A 1 +BA_16 156.964066 27.79164097 A 1 +BA_17 142.79662488 1.678858174 A 1 +BA_18 157.0374717 34.70952586 A 2 +BA_19 31.2173586 129.10474489 A 2 +BA_20 46.5505305 145.53499128 A 2 +BA_21 31.87778438 121.80205008 A 2 +BA_22 11.58075367 124.215546281 A 2 +BA_23 14.2254167 146.38115435 A 2 +BA_24 17.1623056 136.03986355 A 2 +BA_25 9.256746399 152.66900506 A 2 +BA_26 92.88195962 128.85942001 A 2 +BA_27 21.0534343 142.28349177 A 2 +BA_28 33.2161642 140.11622647 A 2 +BA_29 20.7751554 155.950639146 A 2 +BA_30 10.07873261 128.07937405 A 2 +BA_31 8.4942204 154.97189474 A 2 +BA_32 150.9208831 11.44725646 A 2 +BA_33 0.424210306 49.21867232 A 2 +BA_34 71.30275388 5.994743174 A 2 +BA_35 127.2545029 47.24387824 A 3 +BA_36 98.83813661 0.718426176 A 3 +BA_37 132.2584735 31.57533535 A 3 +BA_38 67.38111152 1.119845514 A 3 +BA_39 139.7403707 41.34434324 A 3 +BA_40 85.78861589 58.96233194 A 3 +BA_41 78.57174785 63.72167007 A 3 +BA_42 132.5204335 56.63684983 A 3 +BA_43 32.64257016 15.23518802 A 3 +BA_44 78.4197206 3.1847521 A 3 +BA_45 81.36090575 18.05646077 A 3 +BA_46 61.54906 6.837187755 A 3 +BA_47 104.5976551 74.28083691 A 3 +BA_48 131.9139686 33.24419774 A 3 +BA_49 118.1952445 58.94229754 A 3 +BA_50 94.69533307 38.20265588 A 3 +BA_51 98.54061104 58.5908845 A 3 +BA_52 61.97581777 7.333870573 A 3 +BA_53 23.50683238 24.53014934 A 3 +BA_54 87.07327242 75.21382467 A 4 +BA_55 10.47288015 48.46996628 A 4 +BA_56 116.565805 0.859084757 A 4 +BA_57 66.01394942 84.14061223 A 4 +BA_58 24.0864559 19.26830177 A 4 +BA_59 33.90947813 4.861695771 A 4 +BA_60 25.04189822 10.04715776 A 4 +BA_61 25.36921453 24.74097752 A 4 +BA_62 24.61156152 38.57784862 A 4 +BA_63 10.1522579 5.79175452 A 4 +BA_64 13.0302349 35.84194255 A 4 +BA_65 26.011124488 32.12754131 A 4 +BA_66 5.0803395 18.15514097 A 4 +BA_67 19.28749831 23.65124583 B 4 +BA_68 19.8382996 33.18104936 B 4 +BA_69 27.20239256 12.31891513 B 4 +BA_70 35.13339329 3.898564134 B 4 +BA_71 6.61786492 28.80320591 B 5 +BA_72 8.42778854 17.97710953 B 5 +BA_73 11.4551651 12.31783043 B 5 +BA_74 49.3611518 47.59145243 B 5 +BA_75 30.68246176 14.84335078 B 5 +BA_76 19.24083924 8.548619948 B 5 +BA_77 33.91691748 42.74321772 B 5 +BA_78 9.47015852 16.02367435 B 5 +BA_79 10.08003693 19.65298385 B 5 +BA_80 28.11169025 20.92169851 B 5 +BA_81 36.51370152 15.63933441 B 5 +BA_82 42.91618048 11.72576369 B 5 +BA_83 43.59390095 93.67225673 B 5 +BA_84 14.09804905 31.58528132 B 5 +BA_85 17.40598078 100.9513669 B 5 +BA_86 75.64724783 100.168992 B 5 +BA_87 90.80427682 40.88218955 B 5 +BA_88 51.61088169 98.32832101 B 5 +BA_89 65.43895097 76.10172565 B 6 +BA_90 81.22617002 22.83792967 B 6 +BA_91 64.41566946 92.39419771 B 6 +BA_92 83.356384 25.27722813 B 6 +BA_93 6.58784847 97.72445533 B 6 +BA_94 10.22712912 96.28708028 B 6 +BA_95 25.30950773 89.08937691 B 6 +BA_96 5.064443216 70.27891805 B 6 +BA_97 33.31745272 5.358689646 B 6 +BA_98 30.75465638 104.2346434 B 6 +BA_99 3.167842064 20.56360819 B 6 +BA_100 83.58125149 13.6926983 B 6 +BA_101 41.45546199 64.79995099 B 6 +BA_102 14.97411131 3.197034022 B 6 +BA_103 56.01933189 110.7914201 B 6 +BA_104 67.84116215 106.7649515 B 6 +BA_105 77.87455225 32.3238432 B 6 +BA_106 55.57453701 23.04472008 B 6 +BA_107 56.34315699 40.66431908 C 7 +BA_108 32.17126326 79.48627703 C 7 +BA_109 61.05967231 102.9731 C 7 +BA_110 34.70836803 18.10615973 C 7 +BA_111 16.4597735 41.43657929 C 7 +BA_112 21.30203917 47.79592699 C 7 +BA_113 29.53413987 26.28481726 C 7 +BA_114 4.085451703 54.43549834 C 7 +BA_115 24.78908857 98.16155471 C 7 +BA_116 65.59349525 23.72494654 C 7 +BA_117 13.71881681 120.8683135 C 7 +BA_118 59.74758488 81.45964404 C 7 +BA_119 48.80242017 37.60205067 C 7 +BA_120 72.00492188 135.9106901 C 8 +BA_121 40.0077021 143.214796 C 8 +BA_122 54.78365687 36.69922926 C 8 +BA_123 28.47215303 149.3005494 C 8 +BA_124 20.97539554 116.0357647 C 8 +BA_125 44.68091852 22.24667467 C 8 +BA_126 45.51897826 56.59054585 C 8 +BA_127 34.42198152 143.67498 C 8 +BA_128 1.032447791 78.83679245 C 8 +BA_129 65.79875582 111.0088786 C 8 +BA_130 66.44434159 82.45952119 C 8 \ No newline at end of file diff --git a/tests/data/expression_writer/slice_testing/expression_matrix.tsv b/tests/data/expression_writer/slice_testing/expression_matrix.tsv new file mode 100644 index 00000000..a4c69843 --- /dev/null +++ b/tests/data/expression_writer/slice_testing/expression_matrix.tsv @@ -0,0 +1,8 @@ +GENE BA_1 BA_2 BA_3 BA_4 BA_5 BA_6 BA_7 BA_8 BA_9 BA_10 BA_11 BA_12 BA_13 BA_14 BA_15 BA_16 BA_17 BA_18 BA_19 BA_20 BA_21 BA_22 BA_23 BA_24 BA_25 BA_26 BA_27 BA_28 BA_29 BA_30 BA_31 BA_32 BA_33 BA_34 BA_35 BA_36 BA_37 BA_38 BA_39 BA_40 BA_41 BA_42 BA_43 BA_44 BA_45 BA_46 BA_47 BA_48 BA_49 BA_50 BA_51 BA_52 BA_53 BA_54 BA_55 BA_56 BA_57 BA_58 BA_59 BA_60 BA_61 BA_62 BA_63 BA_64 BA_65 BA_66 BA_67 BA_68 BA_69 BA_70 BA_71 BA_72 BA_73 BA_74 BA_75 BA_76 BA_77 BA_78 BA_79 BA_80 BA_81 BA_82 BA_83 BA_84 BA_85 BA_86 BA_87 BA_88 BA_89 BA_90 BA_91 BA_92 BA_93 BA_94 BA_95 BA_96 BA_97 BA_98 BA_99 BA_100 BA_101 BA_102 BA_103 BA_104 BA_105 BA_106 BA_107 BA_108 BA_109 BA_110 BA_111 BA_112 BA_113 BA_114 BA_115 BA_116 BA_117 BA_118 BA_119 BA_120 BA_121 BA_122 BA_123 BA_124 BA_125 BA_126 BA_127 BA_128 BA_129 BA_130 +Adcy5 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +Agpat2 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 7 8 6 6 7 6 0 6 0 6 6 6 6 6 0 6 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +Agtr1 6 6 6 6 6 6 6 6 6 6 6 6 4 4 5 5 8 6 6 6 6 6 6 6 6 5 4 6 6 6 4 6 6 6 4 6 7 6 6 4 6 6 8 7 5 6 6 5 6 6 6 6 6 6 8 7 6 7 7 6 6 6 7 6 5 6 8 7 6 6 7 5 6 7 6 8 6 7 6 6 5 6 2 6 6 3 6 4 6 5 6 7 6 9 6 9 6 7 6 6 5 6 6 2 6 4 6 9 6 6 6 7 6 6 6 5 6 6 6 3 6 6 6 3 6 6 6 5 6 6 +Aifm1 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +Apex1 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +Apoc3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 +Apoe 0 0 4 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 From 6342a4486ed2739e0a2504e8ced808a2e380ba86 Mon Sep 17 00:00:00 2001 From: bistline Date: Thu, 6 Oct 2022 10:11:01 -0400 Subject: [PATCH 4/7] removing .gz suffix --- ingest/writer_functions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/writer_functions.py b/ingest/writer_functions.py index 28410606..ef10e0f2 100644 --- a/ingest/writer_functions.py +++ b/ingest/writer_functions.py @@ -198,5 +198,5 @@ def write_gene_scores(gene_name, exp_values, data_dir): :param exp_values: (list) expression values :param data_dir: (str) name out output dir """ - with gzip.open(f"{data_dir}/{gene_name}.json.gz", "wt") as file: + with gzip.open(f"{data_dir}/{gene_name}.json", "wt") as file: json.dump(list(exp_values), file, separators=(',', ':')) From daa7436979f3031662b5ac8f5daef709c53b2a8d Mon Sep 17 00:00:00 2001 From: bistline Date: Thu, 6 Oct 2022 10:22:50 -0400 Subject: [PATCH 5/7] test fixes re: .gz suffix, log names --- .../{OXCT2.json => OXCT2.orig.json} | 0 tests/test_expression_writer.py | 22 +++++++++---------- tests/test_writer_functions.py | 12 +++++----- 3 files changed, 17 insertions(+), 17 deletions(-) rename tests/data/writer_functions/{OXCT2.json => OXCT2.orig.json} (100%) diff --git a/tests/data/writer_functions/OXCT2.json b/tests/data/writer_functions/OXCT2.orig.json similarity index 100% rename from tests/data/writer_functions/OXCT2.json rename to tests/data/writer_functions/OXCT2.orig.json diff --git a/tests/test_expression_writer.py b/tests/test_expression_writer.py index e2df4b81..1625b950 100644 --- a/tests/test_expression_writer.py +++ b/tests/test_expression_writer.py @@ -45,7 +45,7 @@ def seed_test_gene_entries(data_dir): @classmethod def teardown_class(cls): - logs = glob.glob('expression_scatter_images_*_log.txt') + logs = glob.glob('expression_scatter_data_*_log.txt') for log in logs: os.remove(log) test_dirs = glob.glob(f"{TestExpressionWriter.TEST_PREFIX}*") @@ -60,13 +60,13 @@ def test_process_dense_matrix(self): os.path.exists(cluster_name) ) self.assertTrue( - os.path.exists(f"{cluster_name}/Sergef.json.gz") + os.path.exists(f"{cluster_name}/Sergef.json") ) self.assertTrue( - os.path.exists(f"{cluster_name}/Itm2a.json.gz") + os.path.exists(f"{cluster_name}/Itm2a.json") ) expected_data = json.loads(open(f"data/expression_writer/Sergef.json").read()) - rendered_data = json.loads(gzip.open(f"{cluster_name}/Sergef.json.gz").read()) + rendered_data = json.loads(gzip.open(f"{cluster_name}/Sergef.json").read()) self.assertEqual( expected_data, rendered_data ) @@ -82,13 +82,13 @@ def test_process_sparse_matrix(self): genes.remove('HOMER2') # doesn't render gene entry file for gene in genes: self.assertTrue( - os.path.exists(f"{cluster_name}/{gene}.json.gz") + os.path.exists(f"{cluster_name}/{gene}.json") ) self.assertTrue( os.path.exists(f"{cluster_name}/gene_entries/{gene}__entries.txt") ) - expected_data = json.loads(open(f"data/writer_functions/OXCT2.json").read()) - rendered_data = json.loads(gzip.open(f"{cluster_name}/OXCT2.json.gz").read()) + expected_data = json.loads(open(f"data/writer_functions/OXCT2.orig.json").read()) + rendered_data = json.loads(gzip.open(f"{cluster_name}/OXCT2.json").read()) self.assertEqual( expected_data, rendered_data ) @@ -168,7 +168,7 @@ def test_process_sparse_data_fragments(self): genes.remove('HOMER2') for gene in genes: self.assertTrue( - os.path.exists(f"{cluster_name}/{gene}.json.gz") + os.path.exists(f"{cluster_name}/{gene}.json") ) def test_write_empty_sparse_genes(self): @@ -180,7 +180,7 @@ def test_write_empty_sparse_genes(self): genes = load_entities_as_list(open(exp_writer.gene_file)) exp_writer.write_empty_sparse_genes(genes, num_cells, cluster_name) # only empty gene should be HOMER2 - gene = 'HOMER2.json.gz' + gene = 'HOMER2.json' self.assertTrue( os.path.exists(f"{cluster_name}/{gene}") ) @@ -198,8 +198,8 @@ def test_read_dense_matrix_slice(self): cells = list(f"CELL_000{i}" for i in range(1, 16)) exp_writer.read_dense_matrix_slice(indexes, cells, cells, cluster_name) self.assertTrue( - os.path.exists(f"{cluster_name}/Sergef.json.gz") + os.path.exists(f"{cluster_name}/Sergef.json") ) self.assertTrue( - os.path.exists(f"{cluster_name}/Itm2a.json.gz") + os.path.exists(f"{cluster_name}/Itm2a.json") ) diff --git a/tests/test_writer_functions.py b/tests/test_writer_functions.py index 04201fa1..039c57ce 100644 --- a/tests/test_writer_functions.py +++ b/tests/test_writer_functions.py @@ -104,10 +104,10 @@ def test_process_sparse_fragment(self): # barcodes & cluster cells should be identical in this example process_sparse_fragment('OXCT2__entries.txt', barcodes, barcodes, data_dir) self.assertTrue( - os.path.exists(f"{data_dir}/OXCT2.json.gz") + os.path.exists(f"{data_dir}/OXCT2.json") ) - rendered_data = json.loads(gzip.open(f"{data_dir}/OXCT2.json.gz").read()) - expected_data = json.loads(open(f"{data_dir}/OXCT2.json").read()) + rendered_data = json.loads(gzip.open(f"{data_dir}/OXCT2.json").read()) + expected_data = json.loads(open(f"{data_dir}/OXCT2.orig.json").read()) self.assertEqual( expected_data, rendered_data ) @@ -133,9 +133,9 @@ def test_process_dense_line(self): data_dir = 'data/writer_functions' process_dense_line(line, matrix_cells, cluster_cells, data_dir) self.assertTrue( - os.path.exists(f"{data_dir}/Gad1.json.gz") + os.path.exists(f"{data_dir}/Gad1.json") ) - rendered_data = json.loads(gzip.open(f"{data_dir}/Gad1.json.gz").read()) + rendered_data = json.loads(gzip.open(f"{data_dir}/Gad1.json").read()) self.assertEqual( expected_data, rendered_data ) @@ -162,7 +162,7 @@ def test_write_gene_scores(self): gene = 'Egfr' data_dir = 'data/writer_functions' write_gene_scores(gene, data, data_dir) - rendered_data = json.loads(gzip.open(f"{data_dir}/{gene}.json.gz").read()) + rendered_data = json.loads(gzip.open(f"{data_dir}/{gene}.json").read()) self.assertEqual( data, rendered_data ) From cede4395d25c15002e3fe9bbe956bea93a3e6de1 Mon Sep 17 00:00:00 2001 From: bistline Date: Thu, 6 Oct 2022 11:47:00 -0400 Subject: [PATCH 6/7] fixing Content-Encoding issue --- ingest/expression_writer.py | 6 ++++-- ingest/ingest_files.py | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/ingest/expression_writer.py b/ingest/expression_writer.py index 6b54570e..2af6fa27 100755 --- a/ingest/expression_writer.py +++ b/ingest/expression_writer.py @@ -258,7 +258,7 @@ def render_artifacts(self): def delocalize_outputs(self, cluster_name): """ - Copy all output files to study bucket in parallel using gsutil (since there are usually ~25-30K files) + Write all output files back to source bucket with Content-Encoding: gzip header :param cluster_name: (str) encoded name of cluster """ @@ -269,7 +269,9 @@ def delocalize_outputs(self, cluster_name): files_to_push = list(file for file in dir_files if 'gene_entries' not in file) for file in files_to_push: local_path = f"{cluster_name}/{file}" - IngestFiles.delocalize_file(None, None, self.matrix_file_path, local_path, f"{bucket_path}/{file}") + IngestFiles.delocalize_file( + None, None, self.matrix_file_path, local_path, f"{bucket_path}/{file}", 'gzip' + ) self.dev_logger.info(" push completed") handler = self.dev_logger.handlers[0] log_filename = handler.baseFilename.split("/").pop() diff --git a/ingest/ingest_files.py b/ingest/ingest_files.py index 70e08172..eadfbcfc 100644 --- a/ingest/ingest_files.py +++ b/ingest/ingest_files.py @@ -170,14 +170,14 @@ def reset_file(self, file_path, start_point, open_as=None): @staticmethod def delocalize_file( - study_file_id, study_id, file_path, file_to_delocalize, bucket_destination + study_file_id, study_id, file_path, file_to_delocalize, bucket_destination, content_encoding=None ): """Writes local file to Google bucket Args: file_path: path of an ingest file (MUST BE GS url) file_to_delocalize: name of local file to delocalize (ie. errors.txt) bucket_destination: path to google bucket (ie. parse_logs/{study_file_id}/errors.txt) - + content_encoding: set Content-Encoding header, if specified """ if IngestFiles.is_remote_file(file_path): @@ -187,6 +187,8 @@ def delocalize_file( storage_client = storage.Client() bucket = storage_client.get_bucket(bucket_name) blob = bucket.blob(bucket_destination) + if content_encoding is not None: + blob.content_encoding = content_encoding blob.upload_from_filename(file_to_delocalize) IngestFiles.dev_logger.info( f"File {file_to_delocalize} uploaded to {bucket_destination}." From c87e6513a04f0ef891d9871feaf853aa8cb6e75a Mon Sep 17 00:00:00 2001 From: bistline Date: Thu, 6 Oct 2022 11:51:12 -0400 Subject: [PATCH 7/7] simplifying logger delocalization --- ingest/expression_writer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/ingest/expression_writer.py b/ingest/expression_writer.py index 2af6fa27..e2114b0c 100755 --- a/ingest/expression_writer.py +++ b/ingest/expression_writer.py @@ -273,7 +273,5 @@ def delocalize_outputs(self, cluster_name): None, None, self.matrix_file_path, local_path, f"{bucket_path}/{file}", 'gzip' ) self.dev_logger.info(" push completed") - handler = self.dev_logger.handlers[0] - log_filename = handler.baseFilename.split("/").pop() - IngestFiles.delocalize_file(None, None, self.matrix_file_path, log_filename, f"parse_logs/{log_filename}") + IngestFiles.delocalize_file(None, None, self.matrix_file_path, self.log_name, f"parse_logs/{self.log_name}")