From e679c7779d9ec9847def24a4d614752eed60ddfb Mon Sep 17 00:00:00 2001 From: Daniel Lin <> Date: Wed, 8 May 2024 10:45:56 +0000 Subject: [PATCH] Exporting First 200 Text Of DOAB --- python/dolma/core/data_types.py | 1 + python/dolma/core/runtime.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/python/dolma/core/data_types.py b/python/dolma/core/data_types.py index d71bbab3..77b40d19 100644 --- a/python/dolma/core/data_types.py +++ b/python/dolma/core/data_types.py @@ -36,6 +36,7 @@ class InputSpecWithMetadataAndAttributes(InputSpecWithMetadata): class OutputSpec(Struct): id: str attributes: Dict[str, List[TaggerOutputValueType]] + first_two_hundred_text: Optional[str] = None source: Optional[str] = None diff --git a/python/dolma/core/runtime.py b/python/dolma/core/runtime.py index ac5e2a23..52f093a4 100644 --- a/python/dolma/core/runtime.py +++ b/python/dolma/core/runtime.py @@ -217,7 +217,7 @@ def _write_sample_to_streams( for stream_path, attributes in attributes_by_stream.items(): # actually write - output = OutputSpec(source=row.source, id=row.id, attributes=attributes) + output = OutputSpec(source=row.source, id=row.id, attributes=attributes, first_two_hundred_text=row.text[:200]) output_streams[stream_path].write(output)