Patch summary: add an optional `first_two_hundred_text` field to `OutputSpec`
and populate it with `row.text[:200]` when `_write_sample_to_streams` builds
each output record.

NOTE(review): this patch was recovered from a copy whose line breaks and
indentation had been collapsed. Blank context lines and leading whitespace
below were reconstructed from the hunk line counts and Python syntax --
confirm against the target files before applying.

diff --git a/python/dolma/core/data_types.py b/python/dolma/core/data_types.py
index d71bbab3..77b40d19 100644
--- a/python/dolma/core/data_types.py
+++ b/python/dolma/core/data_types.py
@@ -36,6 +36,7 @@ class InputSpecWithMetadataAndAttributes(InputSpecWithMetadata):
 class OutputSpec(Struct):
     id: str
     attributes: Dict[str, List[TaggerOutputValueType]]
+    first_two_hundred_text: Optional[str] = None
     source: Optional[str] = None
 
 
diff --git a/python/dolma/core/runtime.py b/python/dolma/core/runtime.py
index ac5e2a23..52f093a4 100644
--- a/python/dolma/core/runtime.py
+++ b/python/dolma/core/runtime.py
@@ -217,7 +217,7 @@ def _write_sample_to_streams(
 
     for stream_path, attributes in attributes_by_stream.items():
         # actually write
-        output = OutputSpec(source=row.source, id=row.id, attributes=attributes)
+        output = OutputSpec(source=row.source, id=row.id, attributes=attributes, first_two_hundred_text=row.text[:200])
         output_streams[stream_path].write(output)
 
 