Add Codec for ZStandard Compression (#801)
* Add log_start_offset to message protocol parsing

Based on: https://github.com/dpkp/kafka-python/pull/2020/files

* Add zstd compression support

* Add changelog entry
ods committed Dec 6, 2021
1 parent 67f55c0 commit 5370a4c
Showing 25 changed files with 213 additions and 60 deletions.
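
For context, a minimal usage sketch of the new codec (illustrative, not part of the diff): it assumes a broker reachable at localhost:9092 running Kafka >= 2.1.0 and that the zstd library used by kafka.codec is installed.

    import asyncio

    from aiokafka import AIOKafkaProducer


    async def produce():
        # "zstd" is the compression_type value added by this commit; it needs
        # Kafka >= 2.1.0 brokers and the zstd bindings behind kafka.codec.
        producer = AIOKafkaProducer(
            bootstrap_servers="localhost:9092",  # assumed broker address
            compression_type="zstd",
        )
        await producer.start()
        try:
            # Whole batches are compressed, so more batching means better ratios.
            metadata = await producer.send_and_wait("my-topic", b"hello zstd")
            # log_start_offset is the new field threaded through RecordMetadata
            # by this commit (attribute name assumed from the diff).
            print(metadata.offset, metadata.log_start_offset)
        finally:
            await producer.stop()


    asyncio.run(produce())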
2 changes: 1 addition & 1 deletion .github/workflows/publish.yml
@@ -183,7 +183,7 @@ jobs:
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libsnappy-dev libkrb5-dev
sudo apt-get install -y libsnappy-dev libzstd-dev libkrb5-dev
- name: Install python dependencies
run: |
pip install --upgrade pip setuptools wheel
4 changes: 2 additions & 2 deletions .github/workflows/tests.yml
@@ -27,7 +27,7 @@ jobs:
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libsnappy-dev libkrb5-dev
sudo apt-get install -y libsnappy-dev libzstd-dev libkrb5-dev
- name: Get pip cache dir
id: pip-cache
@@ -270,7 +270,7 @@ jobs:
- name: Install system dependencies
run: |
sudo apt-get update
sudo apt-get install -y libsnappy-dev libkrb5-dev krb5-user
sudo apt-get install -y libsnappy-dev libzstd-dev libkrb5-dev krb5-user
- name: Get pip cache dir
id: pip-cache
1 change: 1 addition & 0 deletions .travis.yml_bak
@@ -134,6 +134,7 @@ addons:
apt:
packages:
- libsnappy-dev
- libzstd-dev
- krb5-user

install:
1 change: 1 addition & 0 deletions CHANGES/801.feature
@@ -0,0 +1 @@
Add Codec for ZStandard Compression (KIP-110)
2 changes: 1 addition & 1 deletion README.rst
@@ -81,7 +81,7 @@ generate ssh keys for some tests.

Setting up tests requirements (assuming you're within virtualenv on ubuntu 14.04+)::

sudo apt-get install -y libsnappy-dev
sudo apt-get install -y libsnappy-dev libzstd-dev
make setup

Running tests with coverage::
8 changes: 5 additions & 3 deletions aiokafka/producer/message_accumulator.py
@@ -143,7 +143,7 @@ def append(self, key, value, timestamp_ms, _create_future=create_future,
self._msg_futures.append((future, metadata))
return future

def done(self, base_offset, timestamp=None,
def done(self, base_offset, timestamp=None, log_start_offset=None,
_record_metadata_class=RecordMetadata):
"""Resolve all pending futures"""
tp = self._tp
@@ -157,7 +157,8 @@ def done(self, base_offset, timestamp=None,
# Set main batch future
if not self.future.done():
self.future.set_result(_record_metadata_class(
topic, partition, tp, base_offset, timestamp, timestamp_type))
topic, partition, tp, base_offset, timestamp, timestamp_type,
log_start_offset))

# Set message futures
for future, metadata in self._msg_futures:
@@ -169,7 +170,8 @@ def done(self, base_offset, timestamp=None,
timestamp = metadata.timestamp
offset = base_offset + metadata.offset
future.set_result(_record_metadata_class(
topic, partition, tp, offset, timestamp, timestamp_type))
topic, partition, tp, offset, timestamp, timestamp_type,
log_start_offset))

def done_noack(self):
""" Resolve all pending futures to None """
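
Illustrative only: the shape of the metadata that done() now resolves futures with, assuming RecordMetadata in aiokafka.structs gained a trailing log_start_offset field, as the calls above imply.

    from collections import namedtuple

    # Hypothetical stand-in mirroring the argument order used in done() above;
    # the real RecordMetadata lives in aiokafka.structs.
    RecordMetadata = namedtuple("RecordMetadata", [
        "topic", "partition", "topic_partition", "offset",
        "timestamp", "timestamp_type", "log_start_offset",
    ])

    meta = RecordMetadata("my-topic", 0, ("my-topic", 0), 42, 1638748800000, 0, 17)
    print(meta.offset, meta.log_start_offset)  # 42 17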
23 changes: 14 additions & 9 deletions aiokafka/producer/producer.py
@@ -5,11 +5,12 @@
import warnings

from kafka.partitioner.default import DefaultPartitioner
from kafka.codec import has_gzip, has_snappy, has_lz4
from kafka.codec import has_gzip, has_snappy, has_lz4, has_zstd

from aiokafka.client import AIOKafkaClient
from aiokafka.errors import (
MessageSizeTooLargeError, UnsupportedVersionError, IllegalOperation)
from aiokafka.record.default_records import DefaultRecordBatch
from aiokafka.record.legacy_records import LegacyRecordBatchBuilder
from aiokafka.structs import TopicPartition
from aiokafka.util import (
@@ -86,10 +87,10 @@ class AIOKafkaProducer:
If unset, defaults to *acks=1*. If ``enable_idempotence`` is
``True`` defaults to *acks=all*
compression_type (str): The compression type for all data generated by
the producer. Valid values are 'gzip', 'snappy', 'lz4', or None.
Compression is of full batches of data, so the efficacy of batching
will also impact the compression ratio (more batching means better
compression). Default: None.
the producer. Valid values are 'gzip', 'snappy', 'lz4', 'zstd' or
None. Compression is of full batches of data, so the efficacy of
batching will also impact the compression ratio (more batching
means better compression). Default: None.
max_batch_size (int): Maximum size of buffered data per partition.
After this amount `send` coroutine will block until batch is
drained.
@@ -167,9 +168,10 @@ class AIOKafkaProducer:
_PRODUCER_CLIENT_ID_SEQUENCE = 0

_COMPRESSORS = {
'gzip': (has_gzip, LegacyRecordBatchBuilder.CODEC_GZIP),
'snappy': (has_snappy, LegacyRecordBatchBuilder.CODEC_SNAPPY),
'lz4': (has_lz4, LegacyRecordBatchBuilder.CODEC_LZ4),
'gzip': (has_gzip, DefaultRecordBatch.CODEC_GZIP),
'snappy': (has_snappy, DefaultRecordBatch.CODEC_SNAPPY),
'lz4': (has_lz4, DefaultRecordBatch.CODEC_LZ4),
'zstd': (has_zstd, DefaultRecordBatch.CODEC_ZSTD),
}

_closed = None # Serves as an uninitialized flag for __del__
@@ -203,7 +205,7 @@ def __init__(self, *, loop=None, bootstrap_servers='localhost',

if acks not in (0, 1, -1, 'all', _missing):
raise ValueError("Invalid ACKS parameter")
if compression_type not in ('gzip', 'snappy', 'lz4', None):
if compression_type not in ('gzip', 'snappy', 'lz4', 'zstd', None):
raise ValueError("Invalid compression type!")
if compression_type:
checker, compression_attrs = self._COMPRESSORS[compression_type]
@@ -298,6 +300,9 @@ async def start(self):
if self._compression_type == 'lz4':
assert self.client.api_version >= (0, 8, 2), \
'LZ4 Requires >= Kafka 0.8.2 Brokers'
elif self._compression_type == 'zstd':
assert self.client.api_version >= (2, 1, 0), \
'Zstd Requires >= Kafka 2.1.0 Brokers'

if self._txn_manager is not None and self.client.api_version < (0, 11):
raise UnsupportedVersionError(
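
A sketch of how the table and checks above fit together: each _COMPRESSORS entry maps a codec name to an availability checker and the attribute bits written into the record batch, and the constructor rejects unknown names or missing libraries early (the resolve_compression helper here is hypothetical).

    from kafka.codec import has_gzip, has_snappy, has_lz4, has_zstd

    from aiokafka.record.default_records import DefaultRecordBatch

    _COMPRESSORS = {
        'gzip': (has_gzip, DefaultRecordBatch.CODEC_GZIP),
        'snappy': (has_snappy, DefaultRecordBatch.CODEC_SNAPPY),
        'lz4': (has_lz4, DefaultRecordBatch.CODEC_LZ4),
        'zstd': (has_zstd, DefaultRecordBatch.CODEC_ZSTD),
    }


    def resolve_compression(compression_type):
        # Hypothetical helper: returns the attribute bits for a codec name,
        # raising if the name is unknown or its library is not installed.
        if compression_type is None:
            return 0
        if compression_type not in _COMPRESSORS:
            raise ValueError("Invalid compression type!")
        checker, compression_attrs = _COMPRESSORS[compression_type]
        if not checker():
            raise RuntimeError(
                f"Libraries for {compression_type} compression codec not found")
        return compression_attrs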
32 changes: 27 additions & 5 deletions aiokafka/producer/sender.py
@@ -677,7 +677,15 @@ def create_request(self):
(tp.partition, batch.get_data_buffer())
)

if self._client.api_version >= (0, 11):
if self._client.api_version >= (2, 1):
version = 7
elif self._client.api_version >= (2, 0):
version = 6
elif self._client.api_version >= (1, 1):
version = 5
elif self._client.api_version >= (1, 0):
version = 4
elif self._client.api_version >= (0, 11):
version = 3
elif self._client.api_version >= (0, 10):
version = 2
@@ -737,20 +745,34 @@ async def do(self, node_id):
def handle_response(self, response):
for topic, partitions in response.topics:
for partition_info in partitions:
global_error = None
log_start_offset = None
if response.API_VERSION < 2:
partition, error_code, offset = partition_info
# Mimic CREATE_TIME to take user provided timestamp
timestamp = -1
else:
elif 2 <= response.API_VERSION <= 4:
partition, error_code, offset, timestamp = partition_info
elif 5 <= response.API_VERSION <= 7:
(
partition, error_code, offset, timestamp,
log_start_offset
) = partition_info
else:
# the ignored parameter is record_error of type
# list[(batch_index: int, error_message: str)]
(
partition, error_code, offset, timestamp,
log_start_offset, _, global_error
) = partition_info
tp = TopicPartition(topic, partition)
error = Errors.for_code(error_code)
batch = self._batches.get(tp)
if batch is None:
continue

if error is Errors.NoError:
batch.done(offset, timestamp)
batch.done(offset, timestamp, log_start_offset)
elif error is DuplicateSequenceNumber:
# If we have received a duplicate sequence error,
# it means that the sequence number has advanced
@@ -761,7 +783,7 @@ def handle_response(self, response):
# The only thing we can do is to return success to
# the user and not return a valid offset and
# timestamp.
batch.done(offset, timestamp)
batch.done(offset, timestamp, log_start_offset)
elif not self._can_retry(error(), batch):
if error is InvalidProducerEpoch:
exc = ProducerFenced()
@@ -773,7 +795,7 @@ def handle_response(self, response):
else:
log.warning(
"Got error produce response on topic-partition"
" %s, retrying. Error: %s", tp, error)
" %s, retrying. Error: %s", tp, global_error or error)
# Ok, we can retry this batch
if getattr(error, "invalid_metadata", False):
self._client.force_metadata_update()
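
A condensed sketch of the version-dependent unpacking in handle_response above: each ProduceResponse version carries a different per-partition tuple, and v5+ is where log_start_offset appears (the standalone helper is hypothetical, for illustration).

    def unpack_partition_info(api_version, partition_info):
        # Hypothetical helper mirroring handle_response: normalize the
        # per-partition tuple of a ProduceResponse into one shape.
        global_error = None
        log_start_offset = None
        if api_version < 2:
            partition, error_code, offset = partition_info
            timestamp = -1  # mimic CREATE_TIME to take the user-provided timestamp
        elif 2 <= api_version <= 4:
            partition, error_code, offset, timestamp = partition_info
        elif 5 <= api_version <= 7:
            (partition, error_code, offset, timestamp,
             log_start_offset) = partition_info
        else:
            # v8+ also carries record_errors (ignored here) and error_message
            (partition, error_code, offset, timestamp,
             log_start_offset, _, global_error) = partition_info
        return (partition, error_code, offset, timestamp,
                log_start_offset, global_error)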
80 changes: 78 additions & 2 deletions aiokafka/protocol/produce.py
@@ -80,6 +80,51 @@ class ProduceResponse_v5(Response):
)


class ProduceResponse_v6(Response):
"""
The version number is bumped to indicate that on quota violation brokers
send out responses before throttling.
"""
API_KEY = 0
API_VERSION = 6
SCHEMA = ProduceResponse_v5.SCHEMA


class ProduceResponse_v7(Response):
"""
V7 bumped up to indicate ZStandard capability. (see KIP-110)
"""
API_KEY = 0
API_VERSION = 7
SCHEMA = ProduceResponse_v6.SCHEMA


class ProduceResponse_v8(Response):
"""
V8 bumped up to add two new fields record_errors offset list and error_message
(See KIP-467)
"""
API_KEY = 0
API_VERSION = 8
SCHEMA = Schema(
('topics', Array(
('topic', String('utf-8')),
('partitions', Array(
('partition', Int32),
('error_code', Int16),
('offset', Int64),
('timestamp', Int64),
('log_start_offset', Int64)),
('record_errors', (Array(
('batch_index', Int32),
('batch_index_error_message', String('utf-8'))
))),
('error_message', String('utf-8'))
))),
('throttle_time_ms', Int32)
)


class ProduceRequest(Request):
API_KEY = 0

@@ -152,11 +197,42 @@ class ProduceRequest_v5(ProduceRequest):
SCHEMA = ProduceRequest_v4.SCHEMA


class ProduceRequest_v6(ProduceRequest):
"""
The version number is bumped to indicate that on quota violation brokers
send out responses before throttling.
"""
API_VERSION = 6
RESPONSE_TYPE = ProduceResponse_v6
SCHEMA = ProduceRequest_v5.SCHEMA


class ProduceRequest_v7(ProduceRequest):
"""
V7 bumped up to indicate ZStandard capability. (see KIP-110)
"""
API_VERSION = 7
RESPONSE_TYPE = ProduceResponse_v7
SCHEMA = ProduceRequest_v6.SCHEMA


class ProduceRequest_v8(ProduceRequest):
"""
V8 bumped up to add two new fields record_errors offset list and error_message
to PartitionResponse (See KIP-467)
"""
API_VERSION = 8
RESPONSE_TYPE = ProduceResponse_v8
SCHEMA = ProduceRequest_v7.SCHEMA


ProduceRequest = [
ProduceRequest_v0, ProduceRequest_v1, ProduceRequest_v2,
ProduceRequest_v3, ProduceRequest_v4, ProduceRequest_v5
ProduceRequest_v3, ProduceRequest_v4, ProduceRequest_v5,
ProduceRequest_v6, ProduceRequest_v7, ProduceRequest_v8,
]
ProduceResponse = [
ProduceResponse_v0, ProduceResponse_v1, ProduceResponse_v2,
ProduceResponse_v3, ProduceResponse_v4, ProduceResponse_v5
ProduceResponse_v3, ProduceResponse_v4, ProduceResponse_v5,
ProduceResponse_v6, ProduceResponse_v7, ProduceResponse_v8,
]
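
The ProduceRequest and ProduceResponse names are rebound to lists indexed by API version, so choosing a request version (as sender.create_request does) also pins the response schema through RESPONSE_TYPE; a quick illustrative check, not part of the diff:

    from aiokafka.protocol.produce import ProduceRequest, ProduceResponse

    # v7 is the first produce version allowed to carry zstd-compressed batches.
    version = 7
    request_class = ProduceRequest[version]
    assert request_class.API_VERSION == version
    assert request_class.RESPONSE_TYPE is ProduceResponse[version]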
3 changes: 2 additions & 1 deletion aiokafka/record/_crecords/consts.pxi
@@ -5,6 +5,7 @@ DEF _ATTR_CODEC_NONE = 0x00
DEF _ATTR_CODEC_GZIP = 0x01
DEF _ATTR_CODEC_SNAPPY = 0x02
DEF _ATTR_CODEC_LZ4 = 0x03
DEF _ATTR_CODEC_ZSTD = 0x04

DEF _TIMESTAMP_TYPE_MASK = 0x08
DEF _TRANSACTIONAL_MASK = 0x10
@@ -21,4 +22,4 @@ DEF _LEGACY_RECORD_FREELIST_SIZE = 100

DEF _DEFAULT_RECORD_METADATA_FREELIST_SIZE = 20
DEF _DEFAULT_RECORD_BATCH_FREELIST_SIZE = 100
DEF _DEFAULT_RECORD_FREELIST_SIZE = 100
DEF _DEFAULT_RECORD_FREELIST_SIZE = 100
11 changes: 9 additions & 2 deletions aiokafka/record/_crecords/default_records.pyx
@@ -57,8 +57,8 @@

from aiokafka.errors import CorruptRecordException, UnsupportedCodecError
from kafka.codec import (
gzip_encode, snappy_encode, lz4_encode,
gzip_decode, snappy_decode, lz4_decode
gzip_encode, snappy_encode, lz4_encode, zstd_encode,
gzip_decode, snappy_decode, lz4_decode, zstd_decode
)
import kafka.codec as codecs

@@ -116,6 +116,8 @@ cdef _assert_has_codec(char compression_type):
checker, name = codecs.has_snappy, "snappy"
elif compression_type == _ATTR_CODEC_LZ4:
checker, name = codecs.has_lz4, "lz4"
elif compression_type == _ATTR_CODEC_ZSTD:
checker, name = codecs.has_zstd, "zstd"
else:
raise UnsupportedCodecError(
f"Unknown compression codec {compression_type:#04x}")
@@ -134,6 +136,7 @@ cdef class DefaultRecordBatch:
CODEC_GZIP = _ATTR_CODEC_GZIP
CODEC_SNAPPY = _ATTR_CODEC_SNAPPY
CODEC_LZ4 = _ATTR_CODEC_LZ4
CODEC_ZSTD = _ATTR_CODEC_ZSTD

def __init__(self, object buffer):
PyObject_GetBuffer(buffer, &self._buffer, PyBUF_SIMPLE)
@@ -240,6 +243,8 @@ cdef class DefaultRecordBatch:
uncompressed = snappy_decode(data.tobytes())
elif compression_type == _ATTR_CODEC_LZ4:
uncompressed = lz4_decode(data.tobytes())
elif compression_type == _ATTR_CODEC_ZSTD:
uncompressed = zstd_decode(data.tobytes())

PyBuffer_Release(&self._buffer)
PyObject_GetBuffer(uncompressed, &self._buffer, PyBUF_SIMPLE)
@@ -694,6 +699,8 @@ cdef class DefaultRecordBatchBuilder:
compressed = snappy_encode(data)
elif self._compression_type == _ATTR_CODEC_LZ4:
compressed = lz4_encode(data)
elif self._compression_type == _ATTR_CODEC_ZSTD:
compressed = zstd_encode(data)
size = (<Py_ssize_t> len(compressed)) + FIRST_RECORD_OFFSET
# We will just write the result into the same memory space.
PyByteArray_Resize(self._buffer, size)
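
The codec dispatch above lives in the Cython record implementation, but the same kafka.codec helpers it imports can be exercised directly; a rough roundtrip sketch (assumes the zstd library backing has_zstd is installed):

    import kafka.codec as codecs
    from kafka.codec import zstd_encode, zstd_decode

    from aiokafka.errors import UnsupportedCodecError

    payload = b"some record batch bytes" * 100

    # Mirror _assert_has_codec above: refuse to (de)compress when the
    # binding backing the codec cannot be imported.
    if not codecs.has_zstd():
        raise UnsupportedCodecError(
            "Libraries for zstd compression codec not found")

    compressed = zstd_encode(payload)
    assert zstd_decode(compressed) == payload
    print(len(payload), "->", len(compressed))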
