Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 14 additions & 10 deletions ingest/expression_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
"""
from __future__ import annotations

import logging
import os
import re
import multiprocessing
Expand Down Expand Up @@ -69,7 +70,8 @@ def __init__(
timestamp = datetime.datetime.now().isoformat(sep="T", timespec="seconds")
url_safe_timestamp = re.sub(':', '', timestamp)
log_name = f"expression_scatter_data_{url_safe_timestamp}_log.txt"
self.dev_logger = setup_logger(__name__, log_name, format="support_configs")
self.log_name = log_name
self.dev_logger = setup_logger(__name__, log_name, level=logging.INFO, format="support_configs")

def get_storage_bucket_name(self):
"""
Expand Down Expand Up @@ -110,7 +112,7 @@ def get_file_seek_points(self) -> list[list]:
if current_byte == '': # eof
current_seek.append(file_size)
seek_points.append(current_seek)
break
return seek_points
while current_byte != "\n":
current_byte = matrix_file.read(1)
seek_point += 1
Expand Down Expand Up @@ -213,14 +215,16 @@ def read_dense_matrix_slice(self, indexes, matrix_cells, cluster_cells, data_dir
:param data_dir: (str) name of output dir
"""
start_pos, end_pos = indexes
self.dev_logger.info(f" reading {self.local_matrix_path} at index {start_pos}:{end_pos}")
with open_file(self.local_matrix_path)[0] as matrix_file:
current_pos = start_pos
matrix_file.seek(current_pos)
while current_pos < end_pos:
line = matrix_file.readline()
process_dense_line(line, matrix_cells, cluster_cells, data_dir)
current_pos += len(line)
if line == '': # eof
break
else:
process_dense_line(line, matrix_cells, cluster_cells, data_dir)
current_pos += len(line)

def render_artifacts(self):
"""
Expand Down Expand Up @@ -254,7 +258,7 @@ def render_artifacts(self):

def delocalize_outputs(self, cluster_name):
"""
Copy all output files to study bucket in parallel using gsutil (since there are usually ~25-30K files)
Write all output files back to source bucket with Content-Encoding: gzip header

:param cluster_name: (str) encoded name of cluster
"""
Expand All @@ -265,9 +269,9 @@ def delocalize_outputs(self, cluster_name):
files_to_push = list(file for file in dir_files if 'gene_entries' not in file)
for file in files_to_push:
local_path = f"{cluster_name}/{file}"
IngestFiles.delocalize_file(None, None, self.matrix_file_path, local_path, f"{bucket_path}/{file}")
IngestFiles.delocalize_file(
None, None, self.matrix_file_path, local_path, f"{bucket_path}/{file}", 'gzip'
)
self.dev_logger.info(" push completed")
handler = self.dev_logger.handlers[0]
log_filename = handler.baseFilename.split("/").pop()
IngestFiles.delocalize_file(None, None, self.matrix_file_path, log_filename, f"parse_logs/{log_filename}")
IngestFiles.delocalize_file(None, None, self.matrix_file_path, self.log_name, f"parse_logs/{self.log_name}")

6 changes: 4 additions & 2 deletions ingest/ingest_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,14 +170,14 @@ def reset_file(self, file_path, start_point, open_as=None):

@staticmethod
def delocalize_file(
study_file_id, study_id, file_path, file_to_delocalize, bucket_destination
study_file_id, study_id, file_path, file_to_delocalize, bucket_destination, content_encoding=None
):
"""Writes local file to Google bucket
Args:
file_path: path of an ingest file (MUST BE GS url)
file_to_delocalize: name of local file to delocalize (ie. errors.txt)
bucket_destination: path to google bucket (ie. parse_logs/{study_file_id}/errors.txt)

content_encoding: set Content-Encoding header, if specified
"""

if IngestFiles.is_remote_file(file_path):
Expand All @@ -187,6 +187,8 @@ def delocalize_file(
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucket_name)
blob = bucket.blob(bucket_destination)
if content_encoding is not None:
blob.content_encoding = content_encoding
blob.upload_from_filename(file_to_delocalize)
IngestFiles.dev_logger.info(
f"File {file_to_delocalize} uploaded to {bucket_destination}."
Expand Down
5 changes: 3 additions & 2 deletions ingest/writer_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,8 @@ def process_dense_line(line, matrix_cells, cluster_cells, data_dir):
filtered_expression = filter_expression_for_cluster(
cluster_cells, matrix_cells, exp_vals
)
write_gene_scores(gene_name, filtered_expression, data_dir)
if gene_name:
write_gene_scores(gene_name, filtered_expression, data_dir)

def filter_expression_for_cluster(cluster_cells, exp_cells, exp_scores) -> list:
"""
Expand All @@ -197,5 +198,5 @@ def write_gene_scores(gene_name, exp_values, data_dir):
:param exp_values: (list) expression values
:param data_dir: (str) name of output dir
"""
with gzip.open(f"{data_dir}/{gene_name}.json.gz", "wt") as file:
with gzip.open(f"{data_dir}/{gene_name}.json", "wt") as file:
json.dump(list(exp_values), file, separators=(',', ':'))
132 changes: 132 additions & 0 deletions tests/data/expression_writer/slice_testing/cluster.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
NAME X Y Category Intensity
TYPE numeric numeric group numeric
BA_1 70.06229534 15.4750564 A 1
BA_2 8.806899522 123.2885491 A 1
BA_3 24.6668434 16.74763274 A 1
BA_4 140.9592493 20.79997729 A 1
BA_5 148.4521712 21.60995118 A 1
BA_6 1.517014096 0.485673286 A 1
BA_7 167.2990346 13.72250288 A 1
BA_8 185.5424524 5.845698625 A 1
BA_9 161.52395382 0.761101364 A 1
BA_10 146.7656655 24.58489322 A 1
BA_11 159.75926361 11.57313875 A 1
BA_12 145.08338493 10.79800326 A 1
BA_13 150.9113465 4.322737256 A 1
BA_14 172.1772514 9.328130584 A 1
BA_15 163.9284063 9.44714249 A 1
BA_16 156.964066 27.79164097 A 1
BA_17 142.79662488 1.678858174 A 1
BA_18 157.0374717 34.70952586 A 2
BA_19 31.2173586 129.10474489 A 2
BA_20 46.5505305 145.53499128 A 2
BA_21 31.87778438 121.80205008 A 2
BA_22 11.58075367 124.215546281 A 2
BA_23 14.2254167 146.38115435 A 2
BA_24 17.1623056 136.03986355 A 2
BA_25 9.256746399 152.66900506 A 2
BA_26 92.88195962 128.85942001 A 2
BA_27 21.0534343 142.28349177 A 2
BA_28 33.2161642 140.11622647 A 2
BA_29 20.7751554 155.950639146 A 2
BA_30 10.07873261 128.07937405 A 2
BA_31 8.4942204 154.97189474 A 2
BA_32 150.9208831 11.44725646 A 2
BA_33 0.424210306 49.21867232 A 2
BA_34 71.30275388 5.994743174 A 2
BA_35 127.2545029 47.24387824 A 3
BA_36 98.83813661 0.718426176 A 3
BA_37 132.2584735 31.57533535 A 3
BA_38 67.38111152 1.119845514 A 3
BA_39 139.7403707 41.34434324 A 3
BA_40 85.78861589 58.96233194 A 3
BA_41 78.57174785 63.72167007 A 3
BA_42 132.5204335 56.63684983 A 3
BA_43 32.64257016 15.23518802 A 3
BA_44 78.4197206 3.1847521 A 3
BA_45 81.36090575 18.05646077 A 3
BA_46 61.54906 6.837187755 A 3
BA_47 104.5976551 74.28083691 A 3
BA_48 131.9139686 33.24419774 A 3
BA_49 118.1952445 58.94229754 A 3
BA_50 94.69533307 38.20265588 A 3
BA_51 98.54061104 58.5908845 A 3
BA_52 61.97581777 7.333870573 A 3
BA_53 23.50683238 24.53014934 A 3
BA_54 87.07327242 75.21382467 A 4
BA_55 10.47288015 48.46996628 A 4
BA_56 116.565805 0.859084757 A 4
BA_57 66.01394942 84.14061223 A 4
BA_58 24.0864559 19.26830177 A 4
BA_59 33.90947813 4.861695771 A 4
BA_60 25.04189822 10.04715776 A 4
BA_61 25.36921453 24.74097752 A 4
BA_62 24.61156152 38.57784862 A 4
BA_63 10.1522579 5.79175452 A 4
BA_64 13.0302349 35.84194255 A 4
BA_65 26.011124488 32.12754131 A 4
BA_66 5.0803395 18.15514097 A 4
BA_67 19.28749831 23.65124583 B 4
BA_68 19.8382996 33.18104936 B 4
BA_69 27.20239256 12.31891513 B 4
BA_70 35.13339329 3.898564134 B 4
BA_71 6.61786492 28.80320591 B 5
BA_72 8.42778854 17.97710953 B 5
BA_73 11.4551651 12.31783043 B 5
BA_74 49.3611518 47.59145243 B 5
BA_75 30.68246176 14.84335078 B 5
BA_76 19.24083924 8.548619948 B 5
BA_77 33.91691748 42.74321772 B 5
BA_78 9.47015852 16.02367435 B 5
BA_79 10.08003693 19.65298385 B 5
BA_80 28.11169025 20.92169851 B 5
BA_81 36.51370152 15.63933441 B 5
BA_82 42.91618048 11.72576369 B 5
BA_83 43.59390095 93.67225673 B 5
BA_84 14.09804905 31.58528132 B 5
BA_85 17.40598078 100.9513669 B 5
BA_86 75.64724783 100.168992 B 5
BA_87 90.80427682 40.88218955 B 5
BA_88 51.61088169 98.32832101 B 5
BA_89 65.43895097 76.10172565 B 6
BA_90 81.22617002 22.83792967 B 6
BA_91 64.41566946 92.39419771 B 6
BA_92 83.356384 25.27722813 B 6
BA_93 6.58784847 97.72445533 B 6
BA_94 10.22712912 96.28708028 B 6
BA_95 25.30950773 89.08937691 B 6
BA_96 5.064443216 70.27891805 B 6
BA_97 33.31745272 5.358689646 B 6
BA_98 30.75465638 104.2346434 B 6
BA_99 3.167842064 20.56360819 B 6
BA_100 83.58125149 13.6926983 B 6
BA_101 41.45546199 64.79995099 B 6
BA_102 14.97411131 3.197034022 B 6
BA_103 56.01933189 110.7914201 B 6
BA_104 67.84116215 106.7649515 B 6
BA_105 77.87455225 32.3238432 B 6
BA_106 55.57453701 23.04472008 B 6
BA_107 56.34315699 40.66431908 C 7
BA_108 32.17126326 79.48627703 C 7
BA_109 61.05967231 102.9731 C 7
BA_110 34.70836803 18.10615973 C 7
BA_111 16.4597735 41.43657929 C 7
BA_112 21.30203917 47.79592699 C 7
BA_113 29.53413987 26.28481726 C 7
BA_114 4.085451703 54.43549834 C 7
BA_115 24.78908857 98.16155471 C 7
BA_116 65.59349525 23.72494654 C 7
BA_117 13.71881681 120.8683135 C 7
BA_118 59.74758488 81.45964404 C 7
BA_119 48.80242017 37.60205067 C 7
BA_120 72.00492188 135.9106901 C 8
BA_121 40.0077021 143.214796 C 8
BA_122 54.78365687 36.69922926 C 8
BA_123 28.47215303 149.3005494 C 8
BA_124 20.97539554 116.0357647 C 8
BA_125 44.68091852 22.24667467 C 8
BA_126 45.51897826 56.59054585 C 8
BA_127 34.42198152 143.67498 C 8
BA_128 1.032447791 78.83679245 C 8
BA_129 65.79875582 111.0088786 C 8
BA_130 66.44434159 82.45952119 C 8
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
GENE BA_1 BA_2 BA_3 BA_4 BA_5 BA_6 BA_7 BA_8 BA_9 BA_10 BA_11 BA_12 BA_13 BA_14 BA_15 BA_16 BA_17 BA_18 BA_19 BA_20 BA_21 BA_22 BA_23 BA_24 BA_25 BA_26 BA_27 BA_28 BA_29 BA_30 BA_31 BA_32 BA_33 BA_34 BA_35 BA_36 BA_37 BA_38 BA_39 BA_40 BA_41 BA_42 BA_43 BA_44 BA_45 BA_46 BA_47 BA_48 BA_49 BA_50 BA_51 BA_52 BA_53 BA_54 BA_55 BA_56 BA_57 BA_58 BA_59 BA_60 BA_61 BA_62 BA_63 BA_64 BA_65 BA_66 BA_67 BA_68 BA_69 BA_70 BA_71 BA_72 BA_73 BA_74 BA_75 BA_76 BA_77 BA_78 BA_79 BA_80 BA_81 BA_82 BA_83 BA_84 BA_85 BA_86 BA_87 BA_88 BA_89 BA_90 BA_91 BA_92 BA_93 BA_94 BA_95 BA_96 BA_97 BA_98 BA_99 BA_100 BA_101 BA_102 BA_103 BA_104 BA_105 BA_106 BA_107 BA_108 BA_109 BA_110 BA_111 BA_112 BA_113 BA_114 BA_115 BA_116 BA_117 BA_118 BA_119 BA_120 BA_121 BA_122 BA_123 BA_124 BA_125 BA_126 BA_127 BA_128 BA_129 BA_130
Adcy5 0 0 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Agpat2 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 7 8 6 6 7 6 0 6 0 6 6 6 6 6 0 6 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Agtr1 6 6 6 6 6 6 6 6 6 6 6 6 4 4 5 5 8 6 6 6 6 6 6 6 6 5 4 6 6 6 4 6 6 6 4 6 7 6 6 4 6 6 8 7 5 6 6 5 6 6 6 6 6 6 8 7 6 7 7 6 6 6 7 6 5 6 8 7 6 6 7 5 6 7 6 8 6 7 6 6 5 6 2 6 6 3 6 4 6 5 6 7 6 9 6 9 6 7 6 6 5 6 6 2 6 4 6 9 6 6 6 7 6 6 6 5 6 6 6 3 6 6 6 3 6 6 6 5 6 6
Aifm1 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Apex1 0 0 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
Apoc3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
Apoe 0 0 4 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
22 changes: 11 additions & 11 deletions tests/test_expression_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def seed_test_gene_entries(data_dir):

@classmethod
def teardown_class(cls):
logs = glob.glob('expression_scatter_images_*_log.txt')
logs = glob.glob('expression_scatter_data_*_log.txt')
for log in logs:
os.remove(log)
test_dirs = glob.glob(f"{TestExpressionWriter.TEST_PREFIX}*")
Expand All @@ -60,13 +60,13 @@ def test_process_dense_matrix(self):
os.path.exists(cluster_name)
)
self.assertTrue(
os.path.exists(f"{cluster_name}/Sergef.json.gz")
os.path.exists(f"{cluster_name}/Sergef.json")
)
self.assertTrue(
os.path.exists(f"{cluster_name}/Itm2a.json.gz")
os.path.exists(f"{cluster_name}/Itm2a.json")
)
expected_data = json.loads(open(f"data/expression_writer/Sergef.json").read())
rendered_data = json.loads(gzip.open(f"{cluster_name}/Sergef.json.gz").read())
rendered_data = json.loads(gzip.open(f"{cluster_name}/Sergef.json").read())
self.assertEqual(
expected_data, rendered_data
)
Expand All @@ -82,13 +82,13 @@ def test_process_sparse_matrix(self):
genes.remove('HOMER2') # doesn't render gene entry file
for gene in genes:
self.assertTrue(
os.path.exists(f"{cluster_name}/{gene}.json.gz")
os.path.exists(f"{cluster_name}/{gene}.json")
)
self.assertTrue(
os.path.exists(f"{cluster_name}/gene_entries/{gene}__entries.txt")
)
expected_data = json.loads(open(f"data/writer_functions/OXCT2.json").read())
rendered_data = json.loads(gzip.open(f"{cluster_name}/OXCT2.json.gz").read())
expected_data = json.loads(open(f"data/writer_functions/OXCT2.orig.json").read())
rendered_data = json.loads(gzip.open(f"{cluster_name}/OXCT2.json").read())
self.assertEqual(
expected_data, rendered_data
)
Expand Down Expand Up @@ -168,7 +168,7 @@ def test_process_sparse_data_fragments(self):
genes.remove('HOMER2')
for gene in genes:
self.assertTrue(
os.path.exists(f"{cluster_name}/{gene}.json.gz")
os.path.exists(f"{cluster_name}/{gene}.json")
)

def test_write_empty_sparse_genes(self):
Expand All @@ -180,7 +180,7 @@ def test_write_empty_sparse_genes(self):
genes = load_entities_as_list(open(exp_writer.gene_file))
exp_writer.write_empty_sparse_genes(genes, num_cells, cluster_name)
# only empty gene should be HOMER2
gene = 'HOMER2.json.gz'
gene = 'HOMER2.json'
self.assertTrue(
os.path.exists(f"{cluster_name}/{gene}")
)
Expand All @@ -198,8 +198,8 @@ def test_read_dense_matrix_slice(self):
cells = list(f"CELL_000{i}" for i in range(1, 16))
exp_writer.read_dense_matrix_slice(indexes, cells, cells, cluster_name)
self.assertTrue(
os.path.exists(f"{cluster_name}/Sergef.json.gz")
os.path.exists(f"{cluster_name}/Sergef.json")
)
self.assertTrue(
os.path.exists(f"{cluster_name}/Itm2a.json.gz")
os.path.exists(f"{cluster_name}/Itm2a.json")
)
12 changes: 6 additions & 6 deletions tests/test_writer_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,10 +104,10 @@ def test_process_sparse_fragment(self):
# barcodes & cluster cells should be identical in this example
process_sparse_fragment('OXCT2__entries.txt', barcodes, barcodes, data_dir)
self.assertTrue(
os.path.exists(f"{data_dir}/OXCT2.json.gz")
os.path.exists(f"{data_dir}/OXCT2.json")
)
rendered_data = json.loads(gzip.open(f"{data_dir}/OXCT2.json.gz").read())
expected_data = json.loads(open(f"{data_dir}/OXCT2.json").read())
rendered_data = json.loads(gzip.open(f"{data_dir}/OXCT2.json").read())
expected_data = json.loads(open(f"{data_dir}/OXCT2.orig.json").read())
self.assertEqual(
expected_data, rendered_data
)
Expand All @@ -133,9 +133,9 @@ def test_process_dense_line(self):
data_dir = 'data/writer_functions'
process_dense_line(line, matrix_cells, cluster_cells, data_dir)
self.assertTrue(
os.path.exists(f"{data_dir}/Gad1.json.gz")
os.path.exists(f"{data_dir}/Gad1.json")
)
rendered_data = json.loads(gzip.open(f"{data_dir}/Gad1.json.gz").read())
rendered_data = json.loads(gzip.open(f"{data_dir}/Gad1.json").read())
self.assertEqual(
expected_data, rendered_data
)
Expand All @@ -162,7 +162,7 @@ def test_write_gene_scores(self):
gene = 'Egfr'
data_dir = 'data/writer_functions'
write_gene_scores(gene, data, data_dir)
rendered_data = json.loads(gzip.open(f"{data_dir}/{gene}.json.gz").read())
rendered_data = json.loads(gzip.open(f"{data_dir}/{gene}.json").read())
self.assertEqual(
data, rendered_data
)