airbyte.progress
A simple progress bar for the command line and IPython notebooks.
1# Copyright (c) 2023 Airbyte, Inc., all rights reserved. 2 3"""A simple progress bar for the command line and IPython notebooks.""" 4from __future__ import annotations 5 6import datetime 7import importlib 8import math 9import sys 10import time 11from contextlib import suppress 12from enum import Enum, auto 13from typing import TYPE_CHECKING, cast 14 15from rich.errors import LiveError 16from rich.live import Live as RichLive 17from rich.markdown import Markdown as RichMarkdown 18 19 20if TYPE_CHECKING: 21 from types import ModuleType 22 23 24DEFAULT_REFRESHES_PER_SECOND = 2 25IS_REPL = hasattr(sys, "ps1") # True if we're in a Python REPL, in which case we can use Rich. 26 27ipy_display: ModuleType | None 28try: 29 # Default to IS_NOTEBOOK=False if a TTY is detected. 30 IS_NOTEBOOK = not sys.stdout.isatty() 31 ipy_display = importlib.import_module("IPython.display") 32 33except ImportError: 34 # If IPython is not installed, then we're definitely not in a notebook. 35 ipy_display = None 36 IS_NOTEBOOK = False 37 38 39class ProgressStyle(Enum): 40 """An enum of progress bar styles.""" 41 42 AUTO = auto() 43 """Automatically select the best style for the environment.""" 44 45 RICH = auto() 46 """A Rich progress bar.""" 47 48 IPYTHON = auto() 49 """Use IPython display methods.""" 50 51 PLAIN = auto() 52 """A plain text progress print.""" 53 54 NONE = auto() 55 """Skip progress prints.""" 56 57 58MAX_UPDATE_FREQUENCY = 1000 59"""The max number of records to read before updating the progress bar.""" 60 61 62def _to_time_str(timestamp: float) -> str: 63 """Convert a timestamp float to a local time string. 64 65 For now, we'll just use UTC to avoid breaking tests. In the future, we should 66 return a local time string. 67 """ 68 datetime_obj = datetime.datetime.fromtimestamp(timestamp, tz=datetime.timezone.utc) 69 # TODO: Uncomment this line when we can get tests to properly account for local timezones. 70 # For now, we'll just use UTC to avoid breaking tests. 
71 # datetime_obj = datetime_obj.astimezone() 72 return datetime_obj.strftime("%H:%M:%S") 73 74 75def _get_elapsed_time_str(seconds: int) -> str: 76 """Return duration as a string. 77 78 Seconds are included until 10 minutes is exceeded. 79 Minutes are always included after 1 minute elapsed. 80 Hours are always included after 1 hour elapsed. 81 """ 82 if seconds <= 60: # noqa: PLR2004 # Magic numbers OK here. 83 return f"{seconds} seconds" 84 85 if seconds < 60 * 10: 86 minutes = seconds // 60 87 seconds = seconds % 60 88 return f"{minutes}min {seconds}s" 89 90 if seconds < 60 * 60: 91 minutes = seconds // 60 92 seconds = seconds % 60 93 return f"{minutes}min" 94 95 hours = seconds // (60 * 60) 96 minutes = (seconds % (60 * 60)) // 60 97 return f"{hours}hr {minutes}min" 98 99 100class ReadProgress: 101 """A simple progress bar for the command line and IPython notebooks.""" 102 103 def __init__( 104 self, 105 style: ProgressStyle = ProgressStyle.AUTO, 106 ) -> None: 107 """Initialize the progress tracker.""" 108 # Streams expected (for progress bar) 109 self.num_streams_expected = 0 110 111 # Reads 112 self.read_start_time = time.time() 113 self.read_end_time: float | None = None 114 self.total_records_read = 0 115 116 # Writes 117 self.total_records_written = 0 118 self.total_batches_written = 0 119 self.written_stream_names: set[str] = set() 120 121 # Finalization 122 self.finalize_start_time: float | None = None 123 self.finalize_end_time: float | None = None 124 self.total_records_finalized = 0 125 self.total_batches_finalized = 0 126 self.finalized_stream_names: set[str] = set() 127 128 self.last_update_time: float | None = None 129 130 self._rich_view: RichLive | None = None 131 self.style: ProgressStyle = style 132 if self.style == ProgressStyle.AUTO: 133 self.style = ProgressStyle.PLAIN 134 if IS_NOTEBOOK: 135 self.style = ProgressStyle.IPYTHON 136 137 elif IS_REPL: 138 self.style = ProgressStyle.PLAIN 139 140 else: 141 # Test for Rich availability: 142 
self._rich_view = RichLive() 143 try: 144 self._rich_view.start() 145 self._rich_view.stop() 146 self._rich_view = None 147 self.style = ProgressStyle.RICH 148 except LiveError: 149 # Rich live view not available. Using plain text progress. 150 self._rich_view = None 151 self.style = ProgressStyle.PLAIN 152 153 def _start(self) -> None: 154 """Start the progress bar.""" 155 if self.style == ProgressStyle.RICH and not self._rich_view: 156 self._rich_view = RichLive( 157 auto_refresh=True, 158 refresh_per_second=DEFAULT_REFRESHES_PER_SECOND, 159 ) 160 self._rich_view.start() 161 162 def _stop(self) -> None: 163 """Stop the progress bar.""" 164 if self._rich_view: 165 with suppress(Exception): 166 self._rich_view.stop() 167 self._rich_view = None 168 169 def __del__(self) -> None: 170 """Close the Rich view.""" 171 self._stop() 172 173 def log_success(self) -> None: 174 """Log success and stop tracking progress.""" 175 if self.finalize_end_time is None: 176 # If we haven't already finalized, do so now. 
177 178 self.finalize_end_time = time.time() 179 180 self.update_display(force_refresh=True) 181 self._stop() 182 183 def reset(self, num_streams_expected: int) -> None: 184 """Reset the progress tracker.""" 185 # Streams expected (for progress bar) 186 self.num_streams_expected = num_streams_expected 187 188 # Reads 189 self.read_start_time = time.time() 190 self.read_end_time = None 191 self.total_records_read = 0 192 193 # Writes 194 self.total_records_written = 0 195 self.total_batches_written = 0 196 self.written_stream_names = set() 197 198 # Finalization 199 self.finalize_start_time = None 200 self.finalize_end_time = None 201 self.total_records_finalized = 0 202 self.total_batches_finalized = 0 203 self.finalized_stream_names = set() 204 205 self._start() 206 207 @property 208 def elapsed_seconds(self) -> int: 209 """Return the number of seconds elapsed since the read operation started.""" 210 if self.finalize_end_time: 211 return int(self.finalize_end_time - self.read_start_time) 212 213 return int(time.time() - self.read_start_time) 214 215 @property 216 def elapsed_time_string(self) -> str: 217 """Return duration as a string.""" 218 return _get_elapsed_time_str(self.elapsed_seconds) 219 220 @property 221 def elapsed_seconds_since_last_update(self) -> float | None: 222 """Return the number of seconds elapsed since the last update.""" 223 if self.last_update_time is None: 224 return None 225 226 return time.time() - self.last_update_time 227 228 @property 229 def elapsed_read_seconds(self) -> int: 230 """Return the number of seconds elapsed since the read operation started.""" 231 if self.read_end_time is None: 232 return int(time.time() - self.read_start_time) 233 234 return int(self.read_end_time - self.read_start_time) 235 236 @property 237 def elapsed_read_time_string(self) -> str: 238 """Return duration as a string.""" 239 return _get_elapsed_time_str(self.elapsed_read_seconds) 240 241 @property 242 def elapsed_finalization_seconds(self) -> int: 243 
"""Return the number of seconds elapsed since the read operation started.""" 244 if self.finalize_start_time is None: 245 return 0 246 if self.finalize_end_time is None: 247 return int(time.time() - self.finalize_start_time) 248 return int(self.finalize_end_time - self.finalize_start_time) 249 250 @property 251 def elapsed_finalization_time_str(self) -> str: 252 """Return duration as a string.""" 253 return _get_elapsed_time_str(self.elapsed_finalization_seconds) 254 255 def log_records_read(self, new_total_count: int) -> None: 256 """Load a number of records read.""" 257 self.total_records_read = new_total_count 258 259 # This is some math to make updates adaptive to the scale of records read. 260 # We want to update the display more often when the count is low, and less 261 # often when the count is high. 262 updated_period = min( 263 MAX_UPDATE_FREQUENCY, 10 ** math.floor(math.log10(max(self.total_records_read, 1)) / 4) 264 ) 265 if self.total_records_read % updated_period != 0: 266 return 267 268 self.update_display() 269 270 def log_batch_written(self, stream_name: str, batch_size: int) -> None: 271 """Log that a batch has been written. 272 273 Args: 274 stream_name: The name of the stream. 275 batch_size: The number of records in the batch. 276 """ 277 self.total_records_written += batch_size 278 self.total_batches_written += 1 279 self.written_stream_names.add(stream_name) 280 self.update_display() 281 282 def log_batches_finalizing(self, stream_name: str, num_batches: int) -> None: 283 """Log that batch are ready to be finalized. 284 285 In our current implementation, we ignore the stream name and number of batches. 286 We just use this as a signal that we're finished reading and have begun to 287 finalize any accumulated batches. 
288 """ 289 _ = stream_name, num_batches # unused for now 290 if self.finalize_start_time is None: 291 self.read_end_time = time.time() 292 self.finalize_start_time = self.read_end_time 293 294 self.update_display(force_refresh=True) 295 296 def log_batches_finalized(self, stream_name: str, num_batches: int) -> None: 297 """Log that a batch has been finalized.""" 298 _ = stream_name # unused for now 299 self.total_batches_finalized += num_batches 300 self.update_display(force_refresh=True) 301 302 def log_stream_finalized(self, stream_name: str) -> None: 303 """Log that a stream has been finalized.""" 304 self.finalized_stream_names.add(stream_name) 305 self.update_display(force_refresh=True) 306 if len(self.finalized_stream_names) == self.num_streams_expected: 307 self.log_success() 308 309 def update_display(self, *, force_refresh: bool = False) -> None: 310 """Update the display.""" 311 # Don't update more than twice per second unless force_refresh is True. 312 if ( 313 not force_refresh 314 and self.last_update_time # if not set, then we definitely need to update 315 and cast(float, self.elapsed_seconds_since_last_update) < 0.5 # noqa: PLR2004 316 ): 317 return 318 319 status_message = self._get_status_message() 320 321 if self.style == ProgressStyle.IPYTHON and ipy_display is not None: 322 # We're in a notebook so use the IPython display. 323 assert ipy_display is not None 324 ipy_display.clear_output(wait=True) 325 ipy_display.display(ipy_display.Markdown(status_message)) 326 327 elif self.style == ProgressStyle.RICH and self._rich_view is not None: 328 self._rich_view.update(RichMarkdown(status_message)) 329 330 elif self.style == ProgressStyle.PLAIN: 331 # TODO: Add a plain text progress print option that isn't too noisy. 
332 pass 333 334 elif self.style == ProgressStyle.NONE: 335 pass 336 337 self.last_update_time = time.time() 338 339 def _get_status_message(self) -> str: 340 """Compile and return a status message.""" 341 # Format start time as a friendly string in local timezone: 342 start_time_str = _to_time_str(self.read_start_time) 343 records_per_second: float = 0.0 344 if self.elapsed_read_seconds > 0: 345 records_per_second = round(self.total_records_read / self.elapsed_read_seconds, 1) 346 status_message = ( 347 f"## Read Progress\n\n" 348 f"Started reading at {start_time_str}.\n\n" 349 f"Read **{self.total_records_read:,}** records " 350 f"over **{self.elapsed_read_time_string}** " 351 f"({records_per_second:,} records / second).\n\n" 352 ) 353 if self.total_records_written > 0: 354 status_message += ( 355 f"Wrote **{self.total_records_written:,}** records " 356 f"over {self.total_batches_written:,} batches.\n\n" 357 ) 358 if self.read_end_time is not None: 359 read_end_time_str = _to_time_str(self.read_end_time) 360 status_message += f"Finished reading at {read_end_time_str}.\n\n" 361 if self.finalize_start_time is not None: 362 finalize_start_time_str = _to_time_str(self.finalize_start_time) 363 status_message += f"Started finalizing streams at {finalize_start_time_str}.\n\n" 364 status_message += ( 365 f"Finalized **{self.total_batches_finalized}** batches " 366 f"over {self.elapsed_finalization_time_str}.\n\n" 367 ) 368 if self.finalized_stream_names: 369 status_message += ( 370 f"Completed {len(self.finalized_stream_names)} " 371 + (f"out of {self.num_streams_expected} " if self.num_streams_expected else "") 372 + "streams:\n\n" 373 ) 374 for stream_name in self.finalized_stream_names: 375 status_message += f" - {stream_name}\n" 376 377 status_message += "\n\n" 378 379 if self.finalize_end_time is not None: 380 completion_time_str = _to_time_str(self.finalize_end_time) 381 status_message += ( 382 f"Completed writing at {completion_time_str}. 
" 383 f"Total time elapsed: {self.elapsed_time_string}\n\n" 384 ) 385 status_message += "\n------------------------------------------------\n" 386 387 return status_message 388 389 390progress = ReadProgress()
class ProgressStyle(Enum):
    """The available ways of rendering progress output."""

    AUTO = auto()
    """Pick the most suitable style for the current environment."""

    RICH = auto()
    """Render with a Rich live view."""

    IPYTHON = auto()
    """Render through the IPython display machinery."""

    PLAIN = auto()
    """Render as plain text."""

    NONE = auto()
    """Do not render progress at all."""
An enum of progress bar styles.
Inherited Members
- enum.Enum
- name
- value
The max number of records to read before updating the progress bar.
class ReadProgress:
    """A simple progress bar for the command line and IPython notebooks.

    Tracks counters and timestamps for the read, write and finalize phases and renders
    them as a Markdown status message using the selected `ProgressStyle`.
    """

    def __init__(
        self,
        style: ProgressStyle = ProgressStyle.AUTO,
    ) -> None:
        """Initialize the progress tracker.

        Args:
            style: How progress should be displayed. `ProgressStyle.AUTO` resolves to
                IPYTHON when `IS_NOTEBOOK` is set, PLAIN when `IS_REPL` is set, and
                otherwise RICH if a Rich live view can be started (PLAIN if not).
        """
        # Streams expected (for progress bar)
        self.num_streams_expected = 0

        # Reads
        self.read_start_time = time.time()
        self.read_end_time: float | None = None
        self.total_records_read = 0

        # Writes
        self.total_records_written = 0
        self.total_batches_written = 0
        self.written_stream_names: set[str] = set()

        # Finalization
        self.finalize_start_time: float | None = None
        self.finalize_end_time: float | None = None
        self.total_records_finalized = 0
        self.total_batches_finalized = 0
        self.finalized_stream_names: set[str] = set()

        self.last_update_time: float | None = None

        self._rich_view: RichLive | None = None
        self.style: ProgressStyle = style
        if self.style == ProgressStyle.AUTO:
            self.style = ProgressStyle.PLAIN
            if IS_NOTEBOOK:
                self.style = ProgressStyle.IPYTHON

            elif IS_REPL:
                self.style = ProgressStyle.PLAIN

            else:
                # Test for Rich availability by starting and stopping a throwaway view:
                self._rich_view = RichLive()
                try:
                    self._rich_view.start()
                    self._rich_view.stop()
                    self._rich_view = None
                    self.style = ProgressStyle.RICH
                except LiveError:
                    # Rich live view not available. Using plain text progress.
                    self._rich_view = None
                    self.style = ProgressStyle.PLAIN

    def _start(self) -> None:
        """Start the Rich live view if the RICH style is active and none is running."""
        if self.style == ProgressStyle.RICH and self._rich_view is None:
            self._rich_view = RichLive(
                auto_refresh=True,
                refresh_per_second=DEFAULT_REFRESHES_PER_SECOND,
            )
            self._rich_view.start()

    def _stop(self) -> None:
        """Stop the Rich live view, if any, and always detach it."""
        # `getattr` keeps this safe when called from `__del__` on an instance whose
        # `__init__` did not get as far as creating the attribute.
        rich_view = getattr(self, "_rich_view", None)
        if rich_view is None:
            return

        # Best effort: a failure to stop the view must not break the caller.
        with suppress(Exception):
            rich_view.stop()

        # Detach the view even if stopping failed so it is never reused.
        self._rich_view = None

    def __del__(self) -> None:
        """Close the Rich view."""
        self._stop()

    def log_success(self) -> None:
        """Log success and stop tracking progress.

        Has no effect if a finalize end time has already been recorded.
        """
        if self.finalize_end_time is None:
            # If we haven't already finalized, do so now.
            self.finalize_end_time = time.time()

            self.update_display(force_refresh=True)
            self._stop()

    def reset(self, num_streams_expected: int) -> None:
        """Reset the progress tracker and start the display.

        Args:
            num_streams_expected: The number of streams expected to be finalized.
        """
        # Streams expected (for progress bar)
        self.num_streams_expected = num_streams_expected

        # Reads
        self.read_start_time = time.time()
        self.read_end_time = None
        self.total_records_read = 0

        # Writes
        self.total_records_written = 0
        self.total_batches_written = 0
        self.written_stream_names = set()

        # Finalization
        self.finalize_start_time = None
        self.finalize_end_time = None
        self.total_records_finalized = 0
        self.total_batches_finalized = 0
        self.finalized_stream_names = set()

        # Forget the previous run's refresh timestamp so the first update of the new
        # run is never throttled.
        self.last_update_time = None

        self._start()

    @property
    def elapsed_seconds(self) -> int:
        """Return the whole seconds elapsed since the read operation started.

        Measured up to the finalize end time once set, otherwise up to now.
        """
        if self.finalize_end_time is not None:
            return int(self.finalize_end_time - self.read_start_time)

        return int(time.time() - self.read_start_time)

    @property
    def elapsed_time_string(self) -> str:
        """Return the total elapsed duration as a string."""
        return _get_elapsed_time_str(self.elapsed_seconds)

    @property
    def elapsed_seconds_since_last_update(self) -> float | None:
        """Return the seconds elapsed since the last update, or None if never updated."""
        if self.last_update_time is None:
            return None

        return time.time() - self.last_update_time

    @property
    def elapsed_read_seconds(self) -> int:
        """Return the whole seconds spent reading.

        Measured up to the read end time once set, otherwise up to now.
        """
        if self.read_end_time is None:
            return int(time.time() - self.read_start_time)

        return int(self.read_end_time - self.read_start_time)

    @property
    def elapsed_read_time_string(self) -> str:
        """Return the read duration as a string."""
        return _get_elapsed_time_str(self.elapsed_read_seconds)

    @property
    def elapsed_finalization_seconds(self) -> int:
        """Return the whole seconds elapsed since finalization started.

        Returns 0 if finalization has not started. Measured up to the finalize end time
        once set, otherwise up to now.
        """
        if self.finalize_start_time is None:
            return 0
        if self.finalize_end_time is None:
            return int(time.time() - self.finalize_start_time)
        return int(self.finalize_end_time - self.finalize_start_time)

    @property
    def elapsed_finalization_time_str(self) -> str:
        """Return the finalization duration as a string."""
        return _get_elapsed_time_str(self.elapsed_finalization_seconds)

    def log_records_read(self, new_total_count: int) -> None:
        """Log the total number of records read so far.

        Args:
            new_total_count: The new running total (not an increment).
        """
        self.total_records_read = new_total_count

        # This is some math to make updates adaptive to the scale of records read.
        # We want to update the display more often when the count is low, and less
        # often when the count is high.
        updated_period = min(
            MAX_UPDATE_FREQUENCY, 10 ** math.floor(math.log10(max(self.total_records_read, 1)) / 4)
        )
        if self.total_records_read % updated_period != 0:
            return

        self.update_display()

    def log_batch_written(self, stream_name: str, batch_size: int) -> None:
        """Log that a batch has been written.

        Args:
            stream_name: The name of the stream.
            batch_size: The number of records in the batch.
        """
        self.total_records_written += batch_size
        self.total_batches_written += 1
        self.written_stream_names.add(stream_name)
        self.update_display()

    def log_batches_finalizing(self, stream_name: str, num_batches: int) -> None:
        """Log that batches are ready to be finalized.

        In our current implementation, we ignore the stream name and number of batches.
        We just use this as a signal that we're finished reading and have begun to
        finalize any accumulated batches.
        """
        _ = stream_name, num_batches  # unused for now
        if self.finalize_start_time is None:
            self.read_end_time = time.time()
            self.finalize_start_time = self.read_end_time

        self.update_display(force_refresh=True)

    def log_batches_finalized(self, stream_name: str, num_batches: int) -> None:
        """Log that `num_batches` batches have been finalized."""
        _ = stream_name  # unused for now
        self.total_batches_finalized += num_batches
        self.update_display(force_refresh=True)

    def log_stream_finalized(self, stream_name: str) -> None:
        """Log that a stream has been finalized.

        Calls `log_success` once the number of finalized streams equals the expected
        number of streams.
        """
        self.finalized_stream_names.add(stream_name)
        self.update_display(force_refresh=True)
        if len(self.finalized_stream_names) == self.num_streams_expected:
            self.log_success()

    def update_display(self, *, force_refresh: bool = False) -> None:
        """Update the display.

        Args:
            force_refresh: If True, bypass the twice-per-second throttle.
        """
        # Don't update more than twice per second unless force_refresh is True.
        if (
            not force_refresh
            and self.last_update_time is not None  # if not set, we definitely update
            and cast(float, self.elapsed_seconds_since_last_update) < 0.5  # noqa: PLR2004
        ):
            return

        status_message = self._get_status_message()

        if self.style == ProgressStyle.IPYTHON and ipy_display is not None:
            # We're in a notebook so use the IPython display.
            ipy_display.clear_output(wait=True)
            ipy_display.display(ipy_display.Markdown(status_message))

        elif self.style == ProgressStyle.RICH and self._rich_view is not None:
            self._rich_view.update(RichMarkdown(status_message))

        elif self.style == ProgressStyle.PLAIN:
            # TODO: Add a plain text progress print option that isn't too noisy.
            pass

        elif self.style == ProgressStyle.NONE:
            pass

        self.last_update_time = time.time()

    def _get_status_message(self) -> str:
        """Compile and return a Markdown status message."""
        records_per_second: float = 0.0
        elapsed_read_seconds = self.elapsed_read_seconds
        if elapsed_read_seconds > 0:
            records_per_second = round(self.total_records_read / elapsed_read_seconds, 1)

        # Collect fragments and join once instead of repeated string concatenation.
        parts: list[str] = [
            "## Read Progress\n\n",
            f"Started reading at {_to_time_str(self.read_start_time)}.\n\n",
            f"Read **{self.total_records_read:,}** records ",
            f"over **{self.elapsed_read_time_string}** ",
            f"({records_per_second:,} records / second).\n\n",
        ]
        if self.total_records_written > 0:
            parts.append(
                f"Wrote **{self.total_records_written:,}** records "
                f"over {self.total_batches_written:,} batches.\n\n"
            )
        if self.read_end_time is not None:
            parts.append(f"Finished reading at {_to_time_str(self.read_end_time)}.\n\n")
        if self.finalize_start_time is not None:
            parts.append(
                f"Started finalizing streams at {_to_time_str(self.finalize_start_time)}.\n\n"
            )
            parts.append(
                f"Finalized **{self.total_batches_finalized}** batches "
                f"over {self.elapsed_finalization_time_str}.\n\n"
            )
        if self.finalized_stream_names:
            parts.append(
                f"Completed {len(self.finalized_stream_names)} "
                + (f"out of {self.num_streams_expected} " if self.num_streams_expected else "")
                + "streams:\n\n"
            )
            # Sets have no stable order; sort so the list doesn't reshuffle between refreshes.
            parts.extend(f" - {stream_name}\n" for stream_name in sorted(self.finalized_stream_names))

        parts.append("\n\n")

        if self.finalize_end_time is not None:
            parts.append(
                f"Completed writing at {_to_time_str(self.finalize_end_time)}. "
                f"Total time elapsed: {self.elapsed_time_string}\n\n"
            )
        parts.append("\n------------------------------------------------\n")

        return "".join(parts)
A simple progress bar for the command line and IPython notebooks.
def __init__(
    self,
    style: ProgressStyle = ProgressStyle.AUTO,
) -> None:
    """Set up zeroed counters and timestamps and resolve the display style.

    Args:
        style: Requested display style. `ProgressStyle.AUTO` is resolved here:
            IPYTHON when `IS_NOTEBOOK` is set, PLAIN when `IS_REPL` is set, otherwise
            RICH when a Rich live view can be started and stopped, else PLAIN.
    """
    # Display bookkeeping
    self.last_update_time: float | None = None
    self._rich_view: RichLive | None = None
    self.style: ProgressStyle = style

    # Number of streams we expect to finalize (drives the "N out of M" message)
    self.num_streams_expected = 0

    # Read phase
    self.read_start_time = time.time()
    self.read_end_time: float | None = None
    self.total_records_read = 0

    # Write phase
    self.total_records_written = 0
    self.total_batches_written = 0
    self.written_stream_names: set[str] = set()

    # Finalize phase
    self.finalize_start_time: float | None = None
    self.finalize_end_time: float | None = None
    self.total_records_finalized = 0
    self.total_batches_finalized = 0
    self.finalized_stream_names: set[str] = set()

    if style != ProgressStyle.AUTO:
        # An explicit style was requested; nothing to resolve.
        return

    # Plain text is the fallback unless something better is detected below.
    self.style = ProgressStyle.PLAIN
    if IS_NOTEBOOK:
        self.style = ProgressStyle.IPYTHON
        return
    if IS_REPL:
        return

    # Probe Rich by starting and immediately stopping a throwaway live view.
    self._rich_view = RichLive()
    try:
        self._rich_view.start()
        self._rich_view.stop()
    except LiveError:
        # Live view unavailable: keep the plain-text fallback.
        self._rich_view = None
    else:
        self._rich_view = None
        self.style = ProgressStyle.RICH
Initialize the progress tracker.
def log_success(self) -> None:
    """Record completion, render the final state once, and shut the display down.

    Does nothing when a finalize end time is already recorded.
    """
    if self.finalize_end_time is not None:
        return

    # First completion signal: stamp the end time before the final refresh so the
    # rendered message includes it.
    self.finalize_end_time = time.time()
    self.update_display(force_refresh=True)
    self._stop()
Log success and stop tracking progress.
def reset(self, num_streams_expected: int) -> None:
    """Reset the progress tracker for a new run and start the display.

    Args:
        num_streams_expected: The number of streams expected to be finalized.
    """
    # Streams expected (for progress bar)
    self.num_streams_expected = num_streams_expected

    # Reads
    self.read_start_time = time.time()
    self.read_end_time = None
    self.total_records_read = 0

    # Writes
    self.total_records_written = 0
    self.total_batches_written = 0
    self.written_stream_names = set()

    # Finalization
    self.finalize_start_time = None
    self.finalize_end_time = None
    self.total_records_finalized = 0
    self.total_batches_finalized = 0
    self.finalized_stream_names = set()

    # Forget the previous run's refresh timestamp; otherwise the first update of the
    # new run could be skipped by the refresh throttle.
    self.last_update_time = None

    self._start()
Reset the progress tracker.
@property
def elapsed_seconds(self) -> int:
    """Return the whole seconds elapsed since the read operation started.

    Measured up to the finalize end time once it is set, otherwise up to now.
    """
    end_time = self.finalize_end_time
    if end_time is None:
        # Still running: measure against the current time.
        end_time = time.time()

    # Compare against None rather than truthiness so a 0.0 end time still counts as set.
    return int(end_time - self.read_start_time)
Return the number of seconds elapsed since the read operation started.
@property
def elapsed_time_string(self) -> str:
    """Return the total elapsed time formatted as a duration string."""
    total_seconds = self.elapsed_seconds
    return _get_elapsed_time_str(total_seconds)
Return duration as a string.
@property
def elapsed_seconds_since_last_update(self) -> float | None:
    """Return the seconds since the display was last updated, or None if it never was."""
    previous_update = self.last_update_time
    return None if previous_update is None else time.time() - previous_update
Return the number of seconds elapsed since the last update.
@property
def elapsed_read_seconds(self) -> int:
    """Return the whole seconds spent reading.

    Measured up to the read end time once it is set, otherwise up to now.
    """
    read_end = time.time() if self.read_end_time is None else self.read_end_time
    return int(read_end - self.read_start_time)
Return the number of seconds elapsed since the read operation started.
@property
def elapsed_read_time_string(self) -> str:
    """Return the time spent reading formatted as a duration string."""
    read_seconds = self.elapsed_read_seconds
    return _get_elapsed_time_str(read_seconds)
Return duration as a string.
@property
def elapsed_finalization_seconds(self) -> int:
    """Return the whole seconds elapsed since finalization started.

    Returns 0 when finalization has not started. Measured up to the finalize end time
    once it is set, otherwise up to now.
    """
    started_at = self.finalize_start_time
    if started_at is None:
        return 0

    finished_at = self.finalize_end_time
    if finished_at is None:
        # Finalization is still in progress.
        finished_at = time.time()

    return int(finished_at - started_at)
Return the number of seconds elapsed since finalization started (0 if finalization has not started).
@property
def elapsed_finalization_time_str(self) -> str:
    """Return the time spent finalizing formatted as a duration string."""
    finalization_seconds = self.elapsed_finalization_seconds
    return _get_elapsed_time_str(finalization_seconds)
Return duration as a string.
def log_records_read(self, new_total_count: int) -> None:
    """Log the running total of records read and refresh the display when due.

    Args:
        new_total_count: The new total number of records read (not an increment).
    """
    self.total_records_read = new_total_count

    # Adaptive refresh: the period is a power of ten that grows with the order of
    # magnitude of the count (capped at MAX_UPDATE_FREQUENCY), so small counts
    # refresh on every record and large counts refresh less often.
    order_of_magnitude = math.log10(max(self.total_records_read, 1))
    updated_period = min(MAX_UPDATE_FREQUENCY, 10 ** math.floor(order_of_magnitude / 4))

    if self.total_records_read % updated_period == 0:
        self.update_display()
Log the total number of records read.
def log_batch_written(self, stream_name: str, batch_size: int) -> None:
    """Record one written batch and request a (throttled) display refresh.

    Args:
        stream_name: The name of the stream the batch belongs to.
        batch_size: The number of records in the batch.
    """
    self.written_stream_names.add(stream_name)
    self.total_batches_written += 1
    self.total_records_written += batch_size
    self.update_display()
Log that a batch has been written.
Arguments:
- stream_name: The name of the stream.
- batch_size: The number of records in the batch.
def log_batches_finalizing(self, stream_name: str, num_batches: int) -> None:
    """Log that batches are ready to be finalized.

    The stream name and batch count are currently ignored. The first call marks the
    end of reading and the start of finalization with the same timestamp.
    """
    del stream_name, num_batches  # unused for now

    if self.finalize_start_time is None:
        # Reading ends at the instant finalization begins.
        transition_time = time.time()
        self.read_end_time = transition_time
        self.finalize_start_time = transition_time

    self.update_display(force_refresh=True)
Log that batches are ready to be finalized.
In our current implementation, we ignore the stream name and number of batches. We just use this as a signal that we're finished reading and have begun to finalize any accumulated batches.
def log_batches_finalized(self, stream_name: str, num_batches: int) -> None:
    """Add `num_batches` to the finalized-batch total and force a display refresh.

    The stream name is currently ignored.
    """
    del stream_name  # unused for now
    self.total_batches_finalized = self.total_batches_finalized + num_batches
    self.update_display(force_refresh=True)
Log that one or more batches have been finalized.
def log_stream_finalized(self, stream_name: str) -> None:
    """Record a finalized stream, refresh the display, and finish if all are done.

    `log_success` is called once the number of distinct finalized streams equals the
    expected number of streams.
    """
    self.finalized_stream_names.add(stream_name)
    self.update_display(force_refresh=True)

    all_streams_done = len(self.finalized_stream_names) == self.num_streams_expected
    if all_streams_done:
        self.log_success()
Log that a stream has been finalized.
def update_display(self, *, force_refresh: bool = False) -> None:
    """Render the current status message using the active style.

    Args:
        force_refresh: When True, skip the throttle that limits refreshes to at most
            two per second.
    """
    throttled = (
        not force_refresh
        and self.last_update_time  # never updated yet means we always render
        and cast(float, self.elapsed_seconds_since_last_update) < 0.5  # noqa: PLR2004
    )
    if throttled:
        return

    status_message = self._get_status_message()
    active_style = self.style

    if active_style == ProgressStyle.IPYTHON and ipy_display is not None:
        # Notebook: replace the previous output with freshly rendered Markdown.
        ipy_display.clear_output(wait=True)
        ipy_display.display(ipy_display.Markdown(status_message))
    elif active_style == ProgressStyle.RICH and self._rich_view is not None:
        self._rich_view.update(RichMarkdown(status_message))
    # PLAIN and NONE currently render nothing.
    # TODO: Add a plain text progress print option that isn't too noisy.

    # The timestamp is recorded even when nothing was rendered.
    self.last_update_time = time.time()
Update the display.