airbyte.progress

A simple progress bar for the command line and IPython notebooks.

  1# Copyright (c) 2023 Airbyte, Inc., all rights reserved.
  2
  3"""A simple progress bar for the command line and IPython notebooks."""
  4from __future__ import annotations
  5
  6import datetime
  7import importlib
  8import math
  9import sys
 10import time
 11from contextlib import suppress
 12from enum import Enum, auto
 13from typing import TYPE_CHECKING, cast
 14
 15from rich.errors import LiveError
 16from rich.live import Live as RichLive
 17from rich.markdown import Markdown as RichMarkdown
 18
 19
 20if TYPE_CHECKING:
 21    from types import ModuleType
 22
 23
 24DEFAULT_REFRESHES_PER_SECOND = 2
 25IS_REPL = hasattr(sys, "ps1")  # True if we're in a Python REPL, in which case we can use Rich.
 26
 27ipy_display: ModuleType | None
 28try:
 29    # Default to IS_NOTEBOOK=False if a TTY is detected.
 30    IS_NOTEBOOK = not sys.stdout.isatty()
 31    ipy_display = importlib.import_module("IPython.display")
 32
 33except ImportError:
 34    # If IPython is not installed, then we're definitely not in a notebook.
 35    ipy_display = None
 36    IS_NOTEBOOK = False
 37
 38
 39class ProgressStyle(Enum):
 40    """An enum of progress bar styles."""
 41
 42    AUTO = auto()
 43    """Automatically select the best style for the environment."""
 44
 45    RICH = auto()
 46    """A Rich progress bar."""
 47
 48    IPYTHON = auto()
 49    """Use IPython display methods."""
 50
 51    PLAIN = auto()
 52    """A plain text progress print."""
 53
 54    NONE = auto()
 55    """Skip progress prints."""
 56
 57
 58MAX_UPDATE_FREQUENCY = 1000
 59"""The max number of records to read before updating the progress bar."""
 60
 61
 62def _to_time_str(timestamp: float) -> str:
 63    """Convert a timestamp float to a local time string.
 64
 65    For now, we'll just use UTC to avoid breaking tests. In the future, we should
 66    return a local time string.
 67    """
 68    datetime_obj = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc)
 69    # TODO: Uncomment this line when we can get tests to properly account for local timezones.
 70    #       For now, we'll just use UTC to avoid breaking tests.
 71    # datetime_obj = datetime_obj.astimezone()
 72    return datetime_obj.strftime("%H:%M:%S")
 73
 74
 75def _get_elapsed_time_str(seconds: int) -> str:
 76    """Return duration as a string.
 77
 78    Seconds are included until 10 minutes is exceeded.
 79    Minutes are always included after 1 minute elapsed.
 80    Hours are always included after 1 hour elapsed.
 81    """
 82    if seconds <= 60:  # noqa: PLR2004  # Magic numbers OK here.
 83        return f"{seconds} seconds"
 84
 85    if seconds < 60 * 10:
 86        minutes = seconds // 60
 87        seconds = seconds % 60
 88        return f"{minutes}min {seconds}s"
 89
 90    if seconds < 60 * 60:
 91        minutes = seconds // 60
 92        seconds = seconds % 60
 93        return f"{minutes}min"
 94
 95    hours = seconds // (60 * 60)
 96    minutes = (seconds % (60 * 60)) // 60
 97    return f"{hours}hr {minutes}min"
 98
 99
class ReadProgress:
    """A simple progress tracker for the command line and IPython notebooks.

    Counters and timestamps are updated through the ``log_*`` methods. Each of
    them triggers `update_display`, which renders a Markdown status message
    through IPython or a Rich live view, depending on the resolved style.
    """

    def __init__(
        self,
        style: ProgressStyle = ProgressStyle.AUTO,
    ) -> None:
        """Initialize the progress tracker.

        Args:
            style: How progress is rendered. `ProgressStyle.AUTO` is resolved here:
                IPython when `IS_NOTEBOOK` is set, plain when `IS_REPL` is set,
                otherwise Rich if a live view can be started and plain if not.
        """
        # Streams expected (for progress bar)
        self.num_streams_expected = 0

        # Reads
        self.read_start_time = time.time()
        self.read_end_time: float | None = None
        self.total_records_read = 0

        # Writes
        self.total_records_written = 0
        self.total_batches_written = 0
        self.written_stream_names: set[str] = set()

        # Finalization
        self.finalize_start_time: float | None = None
        self.finalize_end_time: float | None = None
        self.total_records_finalized = 0
        self.total_batches_finalized = 0
        self.finalized_stream_names: set[str] = set()

        self.last_update_time: float | None = None

        self._rich_view: RichLive | None = None
        self.style: ProgressStyle = style
        if self.style == ProgressStyle.AUTO:
            # Plain is the fallback whenever nothing richer is available.
            self.style = ProgressStyle.PLAIN
            if IS_NOTEBOOK:
                self.style = ProgressStyle.IPYTHON

            elif not IS_REPL:
                # Test for Rich availability by starting and stopping a throwaway view.
                self._rich_view = RichLive()
                try:
                    self._rich_view.start()
                    self._rich_view.stop()
                except LiveError:
                    # Rich live view not available. Using plain text progress.
                    self.style = ProgressStyle.PLAIN
                else:
                    self.style = ProgressStyle.RICH
                self._rich_view = None

    def _start(self) -> None:
        """Create and start the Rich live view when the style is Rich and none exists yet."""
        if self.style == ProgressStyle.RICH and not self._rich_view:
            self._rich_view = RichLive(
                auto_refresh=True,
                refresh_per_second=DEFAULT_REFRESHES_PER_SECOND,
            )
            self._rich_view.start()

    def _stop(self) -> None:
        """Stop the Rich live view, if there is one.

        Errors raised while stopping are deliberately ignored (best effort), but
        the reference to the view is always dropped.
        """
        # `getattr` keeps this safe when `__del__` runs on an instance whose
        # `__init__` never got as far as creating the attribute.
        rich_view = getattr(self, "_rich_view", None)
        if rich_view is None:
            return

        # Drop the reference first so it is released even if `stop()` raises.
        self._rich_view = None
        with suppress(Exception):
            rich_view.stop()

    def __del__(self) -> None:
        """Close the Rich view."""
        self._stop()

    def log_success(self) -> None:
        """Log success and stop tracking progress.

        Does nothing when a finalization end time has already been recorded.
        """
        if self.finalize_end_time is not None:
            return

        # If we haven't already finalized, do so now.
        self.finalize_end_time = time.time()

        self.update_display(force_refresh=True)
        self._stop()

    def reset(self, num_streams_expected: int) -> None:
        """Reset all counters and timestamps and (re)start the display.

        Args:
            num_streams_expected: The number of streams that must be finalized
                before the run is automatically logged as successful.
        """
        # Streams expected (for progress bar)
        self.num_streams_expected = num_streams_expected

        # Reads
        self.read_start_time = time.time()
        self.read_end_time = None
        self.total_records_read = 0

        # Writes
        self.total_records_written = 0
        self.total_batches_written = 0
        self.written_stream_names = set()

        # Finalization
        self.finalize_start_time = None
        self.finalize_end_time = None
        self.total_records_finalized = 0
        self.total_batches_finalized = 0
        self.finalized_stream_names = set()

        # Forget the previous refresh so the first update of the new run is not throttled.
        self.last_update_time = None

        self._start()

    @property
    def elapsed_seconds(self) -> int:
        """Return the whole seconds elapsed since the read operation started.

        Once finalization has ended, the value is frozen at the finalization end time.
        """
        if self.finalize_end_time is not None:
            return int(self.finalize_end_time - self.read_start_time)

        return int(time.time() - self.read_start_time)

    @property
    def elapsed_time_string(self) -> str:
        """Return the total elapsed time as a string."""
        return _get_elapsed_time_str(self.elapsed_seconds)

    @property
    def elapsed_seconds_since_last_update(self) -> float | None:
        """Return the seconds elapsed since the last display update, or `None` if there was none."""
        if self.last_update_time is None:
            return None

        return time.time() - self.last_update_time

    @property
    def elapsed_read_seconds(self) -> int:
        """Return the whole seconds spent reading.

        While reading is still in progress, this is measured up to the current time.
        """
        if self.read_end_time is None:
            return int(time.time() - self.read_start_time)

        return int(self.read_end_time - self.read_start_time)

    @property
    def elapsed_read_time_string(self) -> str:
        """Return the read duration as a string."""
        return _get_elapsed_time_str(self.elapsed_read_seconds)

    @property
    def elapsed_finalization_seconds(self) -> int:
        """Return the whole seconds spent finalizing.

        Returns 0 before finalization has started. While finalization is still in
        progress, this is measured up to the current time.
        """
        if self.finalize_start_time is None:
            return 0
        if self.finalize_end_time is None:
            return int(time.time() - self.finalize_start_time)
        return int(self.finalize_end_time - self.finalize_start_time)

    @property
    def elapsed_finalization_time_str(self) -> str:
        """Return the finalization duration as a string."""
        return _get_elapsed_time_str(self.elapsed_finalization_seconds)

    def log_records_read(self, new_total_count: int) -> None:
        """Log the total number of records read so far.

        Args:
            new_total_count: The new running total (not an increment).
        """
        self.total_records_read = new_total_count

        # This is some math to make updates adaptive to the scale of records read.
        # We want to update the display more often when the count is low, and less
        # often when the count is high. Despite its name, `MAX_UPDATE_FREQUENCY`
        # caps the number of records between updates.
        update_period = min(
            MAX_UPDATE_FREQUENCY, 10 ** math.floor(math.log10(max(self.total_records_read, 1)) / 4)
        )
        if self.total_records_read % update_period != 0:
            return

        self.update_display()

    def log_batch_written(self, stream_name: str, batch_size: int) -> None:
        """Log that a batch has been written.

        Args:
            stream_name: The name of the stream.
            batch_size: The number of records in the batch.
        """
        self.total_records_written += batch_size
        self.total_batches_written += 1
        self.written_stream_names.add(stream_name)
        self.update_display()

    def log_batches_finalizing(self, stream_name: str, num_batches: int) -> None:
        """Log that batches are ready to be finalized.

        In our current implementation, we ignore the stream name and number of batches.
        We just use this as a signal that we're finished reading and have begun to
        finalize any accumulated batches.
        """
        _ = stream_name, num_batches  # unused for now
        if self.finalize_start_time is None:
            # The first call marks both the end of reading and the start of finalization.
            self.read_end_time = time.time()
            self.finalize_start_time = self.read_end_time

        self.update_display(force_refresh=True)

    def log_batches_finalized(self, stream_name: str, num_batches: int) -> None:
        """Log that a number of batches have been finalized."""
        _ = stream_name  # unused for now
        self.total_batches_finalized += num_batches
        self.update_display(force_refresh=True)

    def log_stream_finalized(self, stream_name: str) -> None:
        """Log that a stream has been finalized.

        When the number of finalized streams reaches `num_streams_expected`, the
        run is logged as successful.
        """
        self.finalized_stream_names.add(stream_name)
        self.update_display(force_refresh=True)
        if len(self.finalized_stream_names) == self.num_streams_expected:
            self.log_success()

    def update_display(self, *, force_refresh: bool = False) -> None:
        """Render the current status message using the active style.

        Args:
            force_refresh: If true, bypass the twice-per-second throttle.
        """
        # Don't update more than twice per second unless force_refresh is True.
        if (
            not force_refresh
            and self.last_update_time is not None  # if not set, we definitely need to update
            and cast(float, self.elapsed_seconds_since_last_update) < 0.5  # noqa: PLR2004
        ):
            return

        status_message = self._get_status_message()

        if self.style == ProgressStyle.IPYTHON and ipy_display is not None:
            # We're in a notebook so use the IPython display.
            ipy_display.clear_output(wait=True)
            ipy_display.display(ipy_display.Markdown(status_message))

        elif self.style == ProgressStyle.RICH and self._rich_view is not None:
            self._rich_view.update(RichMarkdown(status_message))

        # `ProgressStyle.PLAIN` and `ProgressStyle.NONE` currently print nothing.
        # TODO: Add a plain text progress print option that isn't too noisy.

        self.last_update_time = time.time()

    def _get_status_message(self) -> str:
        """Compile and return the Markdown status message."""
        start_time_str = _to_time_str(self.read_start_time)
        read_seconds = self.elapsed_read_seconds
        records_per_second: float = 0.0
        if read_seconds > 0:
            records_per_second = round(self.total_records_read / read_seconds, 1)

        parts: list[str] = [
            "## Read Progress\n\n",
            f"Started reading at {start_time_str}.\n\n",
            f"Read **{self.total_records_read:,}** records ",
            f"over **{self.elapsed_read_time_string}** ",
            f"({records_per_second:,} records / second).\n\n",
        ]
        if self.total_records_written > 0:
            parts.append(
                f"Wrote **{self.total_records_written:,}** records "
                f"over {self.total_batches_written:,} batches.\n\n"
            )
        if self.read_end_time is not None:
            read_end_time_str = _to_time_str(self.read_end_time)
            parts.append(f"Finished reading at {read_end_time_str}.\n\n")
        if self.finalize_start_time is not None:
            finalize_start_time_str = _to_time_str(self.finalize_start_time)
            parts.append(f"Started finalizing streams at {finalize_start_time_str}.\n\n")
            parts.append(
                f"Finalized **{self.total_batches_finalized}** batches "
                f"over {self.elapsed_finalization_time_str}.\n\n"
            )
        if self.finalized_stream_names:
            expected = f"out of {self.num_streams_expected} " if self.num_streams_expected else ""
            parts.append(f"Completed {len(self.finalized_stream_names)} {expected}streams:\n\n")
            # Sort the names so the list is stable between refreshes (sets are unordered).
            parts.extend(f"  - {name}\n" for name in sorted(self.finalized_stream_names))

        parts.append("\n\n")

        if self.finalize_end_time is not None:
            completion_time_str = _to_time_str(self.finalize_end_time)
            parts.append(
                f"Completed writing at {completion_time_str}. "
                f"Total time elapsed: {self.elapsed_time_string}\n\n"
            )
        parts.append("\n------------------------------------------------\n")

        return "".join(parts)
388
389
390progress = ReadProgress()
DEFAULT_REFRESHES_PER_SECOND = 2
IS_REPL = False
ipy_display: module | None = None
class ProgressStyle(enum.Enum):
class ProgressStyle(Enum):
    """The supported ways of rendering progress."""

    AUTO = auto()
    """Pick the most suitable style for the current environment."""

    RICH = auto()
    """Render progress with Rich."""

    IPYTHON = auto()
    """Render progress through the IPython display functions."""

    PLAIN = auto()
    """Plain text progress output."""

    NONE = auto()
    """Do not print progress at all."""

An enum of progress bar styles.

AUTO = <ProgressStyle.AUTO: 1>

Automatically select the best style for the environment.

RICH = <ProgressStyle.RICH: 2>

A Rich progress bar.

IPYTHON = <ProgressStyle.IPYTHON: 3>

Use IPython display methods.

PLAIN = <ProgressStyle.PLAIN: 4>

A plain text progress print.

NONE = <ProgressStyle.NONE: 5>

Skip progress prints.

Inherited Members
enum.Enum
name
value
MAX_UPDATE_FREQUENCY = 1000

The max number of records to read before updating the progress bar.

class ReadProgress:
class ReadProgress:
    """A simple progress tracker for the command line and IPython notebooks.

    Counters and timestamps are updated through the ``log_*`` methods. Each of
    them triggers `update_display`, which renders a Markdown status message
    through IPython or a Rich live view, depending on the resolved style.
    """

    def __init__(
        self,
        style: ProgressStyle = ProgressStyle.AUTO,
    ) -> None:
        """Initialize the progress tracker.

        Args:
            style: How progress is rendered. `ProgressStyle.AUTO` is resolved here:
                IPython when `IS_NOTEBOOK` is set, plain when `IS_REPL` is set,
                otherwise Rich if a live view can be started and plain if not.
        """
        # Streams expected (for progress bar)
        self.num_streams_expected = 0

        # Reads
        self.read_start_time = time.time()
        self.read_end_time: float | None = None
        self.total_records_read = 0

        # Writes
        self.total_records_written = 0
        self.total_batches_written = 0
        self.written_stream_names: set[str] = set()

        # Finalization
        self.finalize_start_time: float | None = None
        self.finalize_end_time: float | None = None
        self.total_records_finalized = 0
        self.total_batches_finalized = 0
        self.finalized_stream_names: set[str] = set()

        self.last_update_time: float | None = None

        self._rich_view: RichLive | None = None
        self.style: ProgressStyle = style
        if self.style == ProgressStyle.AUTO:
            # Plain is the fallback whenever nothing richer is available.
            self.style = ProgressStyle.PLAIN
            if IS_NOTEBOOK:
                self.style = ProgressStyle.IPYTHON

            elif not IS_REPL:
                # Test for Rich availability by starting and stopping a throwaway view.
                self._rich_view = RichLive()
                try:
                    self._rich_view.start()
                    self._rich_view.stop()
                except LiveError:
                    # Rich live view not available. Using plain text progress.
                    self.style = ProgressStyle.PLAIN
                else:
                    self.style = ProgressStyle.RICH
                self._rich_view = None

    def _start(self) -> None:
        """Create and start the Rich live view when the style is Rich and none exists yet."""
        if self.style == ProgressStyle.RICH and not self._rich_view:
            self._rich_view = RichLive(
                auto_refresh=True,
                refresh_per_second=DEFAULT_REFRESHES_PER_SECOND,
            )
            self._rich_view.start()

    def _stop(self) -> None:
        """Stop the Rich live view, if there is one.

        Errors raised while stopping are deliberately ignored (best effort), but
        the reference to the view is always dropped.
        """
        # `getattr` keeps this safe when `__del__` runs on an instance whose
        # `__init__` never got as far as creating the attribute.
        rich_view = getattr(self, "_rich_view", None)
        if rich_view is None:
            return

        # Drop the reference first so it is released even if `stop()` raises.
        self._rich_view = None
        with suppress(Exception):
            rich_view.stop()

    def __del__(self) -> None:
        """Close the Rich view."""
        self._stop()

    def log_success(self) -> None:
        """Log success and stop tracking progress.

        Does nothing when a finalization end time has already been recorded.
        """
        if self.finalize_end_time is not None:
            return

        # If we haven't already finalized, do so now.
        self.finalize_end_time = time.time()

        self.update_display(force_refresh=True)
        self._stop()

    def reset(self, num_streams_expected: int) -> None:
        """Reset all counters and timestamps and (re)start the display.

        Args:
            num_streams_expected: The number of streams that must be finalized
                before the run is automatically logged as successful.
        """
        # Streams expected (for progress bar)
        self.num_streams_expected = num_streams_expected

        # Reads
        self.read_start_time = time.time()
        self.read_end_time = None
        self.total_records_read = 0

        # Writes
        self.total_records_written = 0
        self.total_batches_written = 0
        self.written_stream_names = set()

        # Finalization
        self.finalize_start_time = None
        self.finalize_end_time = None
        self.total_records_finalized = 0
        self.total_batches_finalized = 0
        self.finalized_stream_names = set()

        # Forget the previous refresh so the first update of the new run is not throttled.
        self.last_update_time = None

        self._start()

    @property
    def elapsed_seconds(self) -> int:
        """Return the whole seconds elapsed since the read operation started.

        Once finalization has ended, the value is frozen at the finalization end time.
        """
        if self.finalize_end_time is not None:
            return int(self.finalize_end_time - self.read_start_time)

        return int(time.time() - self.read_start_time)

    @property
    def elapsed_time_string(self) -> str:
        """Return the total elapsed time as a string."""
        return _get_elapsed_time_str(self.elapsed_seconds)

    @property
    def elapsed_seconds_since_last_update(self) -> float | None:
        """Return the seconds elapsed since the last display update, or `None` if there was none."""
        if self.last_update_time is None:
            return None

        return time.time() - self.last_update_time

    @property
    def elapsed_read_seconds(self) -> int:
        """Return the whole seconds spent reading.

        While reading is still in progress, this is measured up to the current time.
        """
        if self.read_end_time is None:
            return int(time.time() - self.read_start_time)

        return int(self.read_end_time - self.read_start_time)

    @property
    def elapsed_read_time_string(self) -> str:
        """Return the read duration as a string."""
        return _get_elapsed_time_str(self.elapsed_read_seconds)

    @property
    def elapsed_finalization_seconds(self) -> int:
        """Return the whole seconds spent finalizing.

        Returns 0 before finalization has started. While finalization is still in
        progress, this is measured up to the current time.
        """
        if self.finalize_start_time is None:
            return 0
        if self.finalize_end_time is None:
            return int(time.time() - self.finalize_start_time)
        return int(self.finalize_end_time - self.finalize_start_time)

    @property
    def elapsed_finalization_time_str(self) -> str:
        """Return the finalization duration as a string."""
        return _get_elapsed_time_str(self.elapsed_finalization_seconds)

    def log_records_read(self, new_total_count: int) -> None:
        """Log the total number of records read so far.

        Args:
            new_total_count: The new running total (not an increment).
        """
        self.total_records_read = new_total_count

        # This is some math to make updates adaptive to the scale of records read.
        # We want to update the display more often when the count is low, and less
        # often when the count is high. Despite its name, `MAX_UPDATE_FREQUENCY`
        # caps the number of records between updates.
        update_period = min(
            MAX_UPDATE_FREQUENCY, 10 ** math.floor(math.log10(max(self.total_records_read, 1)) / 4)
        )
        if self.total_records_read % update_period != 0:
            return

        self.update_display()

    def log_batch_written(self, stream_name: str, batch_size: int) -> None:
        """Log that a batch has been written.

        Args:
            stream_name: The name of the stream.
            batch_size: The number of records in the batch.
        """
        self.total_records_written += batch_size
        self.total_batches_written += 1
        self.written_stream_names.add(stream_name)
        self.update_display()

    def log_batches_finalizing(self, stream_name: str, num_batches: int) -> None:
        """Log that batches are ready to be finalized.

        In our current implementation, we ignore the stream name and number of batches.
        We just use this as a signal that we're finished reading and have begun to
        finalize any accumulated batches.
        """
        _ = stream_name, num_batches  # unused for now
        if self.finalize_start_time is None:
            # The first call marks both the end of reading and the start of finalization.
            self.read_end_time = time.time()
            self.finalize_start_time = self.read_end_time

        self.update_display(force_refresh=True)

    def log_batches_finalized(self, stream_name: str, num_batches: int) -> None:
        """Log that a number of batches have been finalized."""
        _ = stream_name  # unused for now
        self.total_batches_finalized += num_batches
        self.update_display(force_refresh=True)

    def log_stream_finalized(self, stream_name: str) -> None:
        """Log that a stream has been finalized.

        When the number of finalized streams reaches `num_streams_expected`, the
        run is logged as successful.
        """
        self.finalized_stream_names.add(stream_name)
        self.update_display(force_refresh=True)
        if len(self.finalized_stream_names) == self.num_streams_expected:
            self.log_success()

    def update_display(self, *, force_refresh: bool = False) -> None:
        """Render the current status message using the active style.

        Args:
            force_refresh: If true, bypass the twice-per-second throttle.
        """
        # Don't update more than twice per second unless force_refresh is True.
        if (
            not force_refresh
            and self.last_update_time is not None  # if not set, we definitely need to update
            and cast(float, self.elapsed_seconds_since_last_update) < 0.5  # noqa: PLR2004
        ):
            return

        status_message = self._get_status_message()

        if self.style == ProgressStyle.IPYTHON and ipy_display is not None:
            # We're in a notebook so use the IPython display.
            ipy_display.clear_output(wait=True)
            ipy_display.display(ipy_display.Markdown(status_message))

        elif self.style == ProgressStyle.RICH and self._rich_view is not None:
            self._rich_view.update(RichMarkdown(status_message))

        # `ProgressStyle.PLAIN` and `ProgressStyle.NONE` currently print nothing.
        # TODO: Add a plain text progress print option that isn't too noisy.

        self.last_update_time = time.time()

    def _get_status_message(self) -> str:
        """Compile and return the Markdown status message."""
        start_time_str = _to_time_str(self.read_start_time)
        read_seconds = self.elapsed_read_seconds
        records_per_second: float = 0.0
        if read_seconds > 0:
            records_per_second = round(self.total_records_read / read_seconds, 1)

        parts: list[str] = [
            "## Read Progress\n\n",
            f"Started reading at {start_time_str}.\n\n",
            f"Read **{self.total_records_read:,}** records ",
            f"over **{self.elapsed_read_time_string}** ",
            f"({records_per_second:,} records / second).\n\n",
        ]
        if self.total_records_written > 0:
            parts.append(
                f"Wrote **{self.total_records_written:,}** records "
                f"over {self.total_batches_written:,} batches.\n\n"
            )
        if self.read_end_time is not None:
            read_end_time_str = _to_time_str(self.read_end_time)
            parts.append(f"Finished reading at {read_end_time_str}.\n\n")
        if self.finalize_start_time is not None:
            finalize_start_time_str = _to_time_str(self.finalize_start_time)
            parts.append(f"Started finalizing streams at {finalize_start_time_str}.\n\n")
            parts.append(
                f"Finalized **{self.total_batches_finalized}** batches "
                f"over {self.elapsed_finalization_time_str}.\n\n"
            )
        if self.finalized_stream_names:
            expected = f"out of {self.num_streams_expected} " if self.num_streams_expected else ""
            parts.append(f"Completed {len(self.finalized_stream_names)} {expected}streams:\n\n")
            # Sort the names so the list is stable between refreshes (sets are unordered).
            parts.extend(f"  - {name}\n" for name in sorted(self.finalized_stream_names))

        parts.append("\n\n")

        if self.finalize_end_time is not None:
            completion_time_str = _to_time_str(self.finalize_end_time)
            parts.append(
                f"Completed writing at {completion_time_str}. "
                f"Total time elapsed: {self.elapsed_time_string}\n\n"
            )
        parts.append("\n------------------------------------------------\n")

        return "".join(parts)

A simple progress bar for the command line and IPython notebooks.

ReadProgress(style: ProgressStyle = <ProgressStyle.AUTO: 1>)
104    def __init__(
105        self,
106        style: ProgressStyle = ProgressStyle.AUTO,
107    ) -> None:
108        """Initialize the progress tracker."""
109        # Streams expected (for progress bar)
110        self.num_streams_expected = 0
111
112        # Reads
113        self.read_start_time = time.time()
114        self.read_end_time: float | None = None
115        self.total_records_read = 0
116
117        # Writes
118        self.total_records_written = 0
119        self.total_batches_written = 0
120        self.written_stream_names: set[str] = set()
121
122        # Finalization
123        self.finalize_start_time: float | None = None
124        self.finalize_end_time: float | None = None
125        self.total_records_finalized = 0
126        self.total_batches_finalized = 0
127        self.finalized_stream_names: set[str] = set()
128
129        self.last_update_time: float | None = None
130
131        self._rich_view: RichLive | None = None
132        self.style: ProgressStyle = style
133        if self.style == ProgressStyle.AUTO:
134            self.style = ProgressStyle.PLAIN
135            if IS_NOTEBOOK:
136                self.style = ProgressStyle.IPYTHON
137
138            elif IS_REPL:
139                self.style = ProgressStyle.PLAIN
140
141            else:
142                # Test for Rich availability:
143                self._rich_view = RichLive()
144                try:
145                    self._rich_view.start()
146                    self._rich_view.stop()
147                    self._rich_view = None
148                    self.style = ProgressStyle.RICH
149                except LiveError:
150                    # Rich live view not available. Using plain text progress.
151                    self._rich_view = None
152                    self.style = ProgressStyle.PLAIN

Initialize the progress tracker.

num_streams_expected
read_start_time
read_end_time: float | None
total_records_read
total_records_written
total_batches_written
written_stream_names: set[str]
finalize_start_time: float | None
finalize_end_time: float | None
total_records_finalized
total_batches_finalized
finalized_stream_names: set[str]
last_update_time: float | None
style: ProgressStyle
def log_success(self) -> None:
174    def log_success(self) -> None:
175        """Log success and stop tracking progress."""
176        if self.finalize_end_time is None:
177            # If we haven't already finalized, do so now.
178
179            self.finalize_end_time = time.time()
180
181            self.update_display(force_refresh=True)
182            self._stop()

Log success and stop tracking progress.

def reset(self, num_streams_expected: int) -> None:
184    def reset(self, num_streams_expected: int) -> None:
185        """Reset the progress tracker."""
186        # Streams expected (for progress bar)
187        self.num_streams_expected = num_streams_expected
188
189        # Reads
190        self.read_start_time = time.time()
191        self.read_end_time = None
192        self.total_records_read = 0
193
194        # Writes
195        self.total_records_written = 0
196        self.total_batches_written = 0
197        self.written_stream_names = set()
198
199        # Finalization
200        self.finalize_start_time = None
201        self.finalize_end_time = None
202        self.total_records_finalized = 0
203        self.total_batches_finalized = 0
204        self.finalized_stream_names = set()
205
206        self._start()

Reset the progress tracker.

elapsed_seconds: int
208    @property
209    def elapsed_seconds(self) -> int:
210        """Return the number of seconds elapsed since the read operation started."""
211        if self.finalize_end_time:
212            return int(self.finalize_end_time - self.read_start_time)
213
214        return int(time.time() - self.read_start_time)

Return the number of seconds elapsed since the read operation started.

elapsed_time_string: str
216    @property
217    def elapsed_time_string(self) -> str:
218        """Return duration as a string."""
219        return _get_elapsed_time_str(self.elapsed_seconds)

Return duration as a string.

elapsed_seconds_since_last_update: float | None
221    @property
222    def elapsed_seconds_since_last_update(self) -> float | None:
223        """Return the number of seconds elapsed since the last update."""
224        if self.last_update_time is None:
225            return None
226
227        return time.time() - self.last_update_time

Return the number of seconds elapsed since the last update.

elapsed_read_seconds: int
229    @property
230    def elapsed_read_seconds(self) -> int:
231        """Return the number of seconds elapsed since the read operation started."""
232        if self.read_end_time is None:
233            return int(time.time() - self.read_start_time)
234
235        return int(self.read_end_time - self.read_start_time)

Return the number of seconds elapsed since the read operation started.

elapsed_read_time_string: str
237    @property
238    def elapsed_read_time_string(self) -> str:
239        """Return duration as a string."""
240        return _get_elapsed_time_str(self.elapsed_read_seconds)

Return duration as a string.

elapsed_finalization_seconds: int
242    @property
243    def elapsed_finalization_seconds(self) -> int:
244        """Return the number of seconds elapsed since the read operation started."""
245        if self.finalize_start_time is None:
246            return 0
247        if self.finalize_end_time is None:
248            return int(time.time() - self.finalize_start_time)
249        return int(self.finalize_end_time - self.finalize_start_time)

Return the number of seconds elapsed since finalization started (0 if finalization has not started).

elapsed_finalization_time_str: str
251    @property
252    def elapsed_finalization_time_str(self) -> str:
253        """Return duration as a string."""
254        return _get_elapsed_time_str(self.elapsed_finalization_seconds)

Return duration as a string.

def log_records_read(self, new_total_count: int) -> None:
256    def log_records_read(self, new_total_count: int) -> None:
257        """Load a number of records read."""
258        self.total_records_read = new_total_count
259
260        # This is some math to make updates adaptive to the scale of records read.
261        # We want to update the display more often when the count is low, and less
262        # often when the count is high.
263        updated_period = min(
264            MAX_UPDATE_FREQUENCY, 10 ** math.floor(math.log10(max(self.total_records_read, 1)) / 4)
265        )
266        if self.total_records_read % updated_period != 0:
267            return
268
269        self.update_display()

Log the number of records read.

def log_batch_written(self, stream_name: str, batch_size: int) -> None:
271    def log_batch_written(self, stream_name: str, batch_size: int) -> None:
272        """Log that a batch has been written.
273
274        Args:
275            stream_name: The name of the stream.
276            batch_size: The number of records in the batch.
277        """
278        self.total_records_written += batch_size
279        self.total_batches_written += 1
280        self.written_stream_names.add(stream_name)
281        self.update_display()

Log that a batch has been written.

Arguments:
  • stream_name: The name of the stream.
  • batch_size: The number of records in the batch.
def log_batches_finalizing(self, stream_name: str, num_batches: int) -> None:
283    def log_batches_finalizing(self, stream_name: str, num_batches: int) -> None:
284        """Log that batch are ready to be finalized.
285
286        In our current implementation, we ignore the stream name and number of batches.
287        We just use this as a signal that we're finished reading and have begun to
288        finalize any accumulated batches.
289        """
290        _ = stream_name, num_batches  # unused for now
291        if self.finalize_start_time is None:
292            self.read_end_time = time.time()
293            self.finalize_start_time = self.read_end_time
294
295        self.update_display(force_refresh=True)

Log that batches are ready to be finalized.

In our current implementation, we ignore the stream name and number of batches. We just use this as a signal that we're finished reading and have begun to finalize any accumulated batches.

def log_batches_finalized(self, stream_name: str, num_batches: int) -> None:
297    def log_batches_finalized(self, stream_name: str, num_batches: int) -> None:
298        """Log that a batch has been finalized."""
299        _ = stream_name  # unused for now
300        self.total_batches_finalized += num_batches
301        self.update_display(force_refresh=True)

Log that a batch has been finalized.

def log_stream_finalized(self, stream_name: str) -> None:
303    def log_stream_finalized(self, stream_name: str) -> None:
304        """Log that a stream has been finalized."""
305        self.finalized_stream_names.add(stream_name)
306        self.update_display(force_refresh=True)
307        if len(self.finalized_stream_names) == self.num_streams_expected:
308            self.log_success()

Log that a stream has been finalized.

def update_display(self, *, force_refresh: bool = False) -> None:
310    def update_display(self, *, force_refresh: bool = False) -> None:
311        """Update the display."""
312        # Don't update more than twice per second unless force_refresh is True.
313        if (
314            not force_refresh
315            and self.last_update_time  # if not set, then we definitely need to update
316            and cast(float, self.elapsed_seconds_since_last_update) < 0.5  # noqa: PLR2004
317        ):
318            return
319
320        status_message = self._get_status_message()
321
322        if self.style == ProgressStyle.IPYTHON and ipy_display is not None:
323            # We're in a notebook so use the IPython display.
324            assert ipy_display is not None
325            ipy_display.clear_output(wait=True)
326            ipy_display.display(ipy_display.Markdown(status_message))
327
328        elif self.style == ProgressStyle.RICH and self._rich_view is not None:
329            self._rich_view.update(RichMarkdown(status_message))
330
331        elif self.style == ProgressStyle.PLAIN:
332            # TODO: Add a plain text progress print option that isn't too noisy.
333            pass
334
335        elif self.style == ProgressStyle.NONE:
336            pass
337
338        self.last_update_time = time.time()

Update the display.

progress = <ReadProgress object>