Skip to content

Commit

Permalink
progressbar
Browse files: browse the repository at this point in the history
  • Loading branch information
soldni committed May 25, 2024
1 parent 708affc commit 824e11e
Showing 1 changed file with 4 additions and 2 deletions.
6 changes: 4 additions & 2 deletions python/dolma/warc/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from contextlib import ExitStack
from functools import reduce
from itertools import chain
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Union
from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Set, Union

import msgspec
import smart_open
Expand Down Expand Up @@ -303,7 +303,7 @@ def create_and_run_warc_pipeline(
compression: str = "zst",
skip_duplicate_urls: bool = False,
batch_size: int = 1,
logging_progress_bar: bool = False,
progress_bar_mode: Literal["tqdm", "logger"] = "tqdm",
):
"""Create and run pipeline for extracting documents from WARC files.
Expand Down Expand Up @@ -346,6 +346,7 @@ def create_and_run_warc_pipeline(
compression (str, optional): Compression format to use for the output files. Defaults to "zst".
skip_duplicate_urls (bool, optional): Whether to skip duplicate URLs. Defaults to False.
batch_size (int, optional): Number of documents to process in each batch. Defaults to 1.
progress_bar_mode ("tqdm" | "logger", optional): Mode for the progress bar. Defaults to "tqdm".
"""

with ExitStack() as stack:
Expand Down Expand Up @@ -398,6 +399,7 @@ def create_and_run_warc_pipeline(
num_processes=num_processes,
shuffle_src_paths=False,
batch_size=batch_size,
progress_bar_mode=progress_bar_mode,
)

processor(
Expand Down

0 comments on commit 824e11e

Please sign in to comment.