apache · pitrou · Oct 19, 2023 · Oct 11, 2023 · pitrou · Oct 19, 2023
@@ -23,8 +23,8 @@ arrow_dir=${1}
 gold_dir=$arrow_dir/testing/data/arrow-ipc-stream/integration
 
 pip install -e $arrow_dir/dev/archery[integration]
-# For C# C Data Interface testing
-pip install pythonnet
+# For C Data Interface testing
+pip install jpype1 pythonnet
 
 # Get more detailed context on crashes
 export PYTHONFAULTHANDLER=1

@@ -80,6 +80,15 @@ def ffi() -> cffi.FFI:
     return ffi
 
 
+def _release_memory_steps(exporter: CDataExporter, importer: CDataImporter):
+    yield
+    for i in range(max(exporter.required_gc_runs, importer.required_gc_runs)):
+        importer.run_gc()
+        yield
+        exporter.run_gc()
+        yield
+
+
 @contextmanager
 def check_memory_released(exporter: CDataExporter, importer: CDataImporter):
     """
@@ -96,12 +105,13 @@ def check_memory_released(exporter: CDataExporter, importer: CDataImporter):
     if do_check:
         before = exporter.record_allocation_state()
     yield
-    # We don't use a `finally` clause: if the enclosed block raised an
-    # exception, no need to add another one.
+    # Only check for memory state if `yield` didn't raise.
     if do_check:
-        ok = exporter.compare_allocation_state(before, importer.gc_until)
-        if not ok:
+        for _ in _release_memory_steps(exporter, importer):
             after = exporter.record_allocation_state()
+            if after == before:
+                break
+        if after != before:
             raise RuntimeError(
                 f"Memory was not released correctly after roundtrip: "
                 f"before = {before}, after = {after} (should have been equal)")
@@ -1722,7 +1722,6 @@ def generate_dictionary_unsigned_case():
 
     # TODO: JavaScript does not support uint64 dictionary indices, so disabled
     # for now
-
     # dict3 = Dictionary(3, StringField('dictionary3'), size=5, name='DICT3')
     fields = [
         DictionaryField('f0', get_field('', 'uint8'), dict0),

@@ -421,17 +421,18 @@ def _compare_c_data_implementations(
         # Serial execution is required for proper memory accounting
         serial = True
 
-        exporter = producer.make_c_data_exporter()
-        importer = consumer.make_c_data_importer()
-
-        case_runner = partial(self._run_c_schema_test_case, producer, consumer,
-                              exporter, importer)
-        self._run_test_cases(case_runner, self.json_files, serial=serial)
-
-        if producer.C_DATA_ARRAY_EXPORTER and consumer.C_DATA_ARRAY_IMPORTER:
-            case_runner = partial(self._run_c_array_test_cases, producer, consumer,
-                                  exporter, importer)
-            self._run_test_cases(case_runner, self.json_files, serial=serial)
+        with producer.make_c_data_exporter() as exporter:
+            with consumer.make_c_data_importer() as importer:
+                case_runner = partial(self._run_c_schema_test_case,
+                                      producer, consumer,
+                                      exporter, importer)
+                self._run_test_cases(case_runner, self.json_files, serial=serial)
+
+                if producer.C_DATA_ARRAY_EXPORTER and consumer.C_DATA_ARRAY_IMPORTER:
+                    case_runner = partial(self._run_c_array_test_cases,
+                                          producer, consumer,
+                                          exporter, importer)
+                    self._run_test_cases(case_runner, self.json_files, serial=serial)
 
     def _run_c_schema_test_case(self,
                                 producer: Tester, consumer: Tester,

@@ -68,52 +68,52 @@ def supports_releasing_memory(self) -> bool:
         Whether the implementation is able to release memory deterministically.
 
         Here, "release memory" means that, after the `release` callback of
-        a C Data Interface export is called, `compare_allocation_state` is
-        able to trigger the deallocation of the memory underlying the export
-        (for example buffer data).
+        a C Data Interface export is called, `run_gc` is able to trigger
+        the deallocation of the memory underlying the export (such as buffer data).
 
-        If false, then `record_allocation_state` and `compare_allocation_state`
-        are allowed to raise NotImplementedError.
+        If false, then `record_allocation_state` is allowed to raise
+        NotImplementedError.
         """
 
     def record_allocation_state(self) -> object:
         """
-        Record the current memory allocation state.
+        Return the current memory allocation state.
 
         Returns
         -------
         state : object
-            Opaque object representing the allocation state,
-            for example the number of allocated bytes.
+            Equality-comparable object representing the allocation state,
+            for example the number of allocated or exported bytes.
         """
         raise NotImplementedError
 
-    def compare_allocation_state(self, recorded: object,
-                                 gc_until: typing.Callable[[_Predicate], bool]
-                                 ) -> bool:
+    def run_gc(self):
         """
-        Compare the current memory allocation state with the recorded one.
+        Run the GC if necessary.
 
-        Parameters
-        ----------
-        recorded : object
-            The previous allocation state returned by
-            `record_allocation_state()`
-        gc_until : callable
-            A callable itself accepting a callable predicate, and
-            returning a boolean.
-            `gc_until` should try to release memory until the predicate
-            becomes true, or until it decides to give up. The final value
-            of the predicate should be returned.
-            `gc_until` is typically provided by the C Data Interface importer.
+        This should ensure that any temporary objects and data created by
+        previous exporter calls are collected.
+        """
 
-        Returns
-        -------
-        success : bool
-            Whether memory allocation state finally reached its previously
-            recorded value.
+    @property
+    def required_gc_runs(self):
         """
-        raise NotImplementedError
+        The maximum number of calls to `run_gc` that need to be issued to
+        ensure proper deallocation. Some implementations may require this
+        to be greater than one.
+        """
+        return 1
+
+    def close(self):
+        """
+        Final cleanup after usage.
+        """
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, *exc):
+        self.close()
 
 
 class CDataImporter(ABC):
@@ -163,32 +163,40 @@ def supports_releasing_memory(self) -> bool:
         """
         Whether the implementation is able to release memory deterministically.
 
-        Here, "release memory" means calling the `release` callback of
-        a C Data Interface export (which should then trigger a deallocation
-        mechanism on the exporter).
+        Here, "release memory" means `run_gc()` is able to trigger the
+        `release` callback of a C Data Interface export (which would then
+        induce a deallocation mechanism on the exporter).
+        """
 
-        If false, then `gc_until` is allowed to raise NotImplementedError.
+    def run_gc(self):
         """
+        Run the GC if necessary.
 
-    def gc_until(self, predicate: _Predicate):
+        This should ensure that any imported data has its release callback called.
         """
-        Try to release memory until the predicate becomes true, or fail.
 
-        Depending on the CDataImporter implementation, this may for example
-        try once, or run a garbage collector a given number of times, or
-        any other implementation-specific strategy for releasing memory.
+    @property
+    def required_gc_runs(self):
+        """
+        The maximum number of calls to `run_gc` that need to be issued to
+        ensure release callbacks are triggered. Some implementations may
+        require this to be greater than one.
+        """
+        return 1
 
-        The running time should be kept reasonable and compatible with
-        execution of multiple C Data integration tests.
+    def close(self):
+        """
+        Final cleanup after usage.
+        """
 
-        This should not raise if `supports_releasing_memory` is true.
+    def __enter__(self):
+        return self
 
-        Returns
-        -------
-        success : bool
-            The final value of the predicate.
-        """
-        raise NotImplementedError
+    def __exit__(self, *exc):
+        # Make sure any exported data is released.
+        for i in range(self.required_gc_runs):
+            self.run_gc()
+        self.close()
 
 
 class Tester:

@@ -223,13 +223,6 @@ def supports_releasing_memory(self):
     def record_allocation_state(self):
         return self.dll.ArrowCpp_BytesAllocated()
 
-    def compare_allocation_state(self, recorded, gc_until):
-        def pred():
-            # No GC on our side, so just compare allocation state
-            return self.record_allocation_state() == recorded
-
-        return gc_until(pred)
-
 
 class CppCDataImporter(CDataImporter, _CDataBase):
 
@@ -247,7 +240,3 @@ def import_batch_and_compare_to_json(self, json_path, num_batch,
     @property
     def supports_releasing_memory(self):
         return True
-
-    def gc_until(self, predicate):
-        # No GC on our side, so can evaluate predicate immediately
-        return predicate()
@@ -16,7 +16,6 @@
 # under the License.
 
 from contextlib import contextmanager
-import gc
 import os
 
 from . import cdata
@@ -82,6 +81,10 @@ def _read_batch_from_json(self, json_path, num_batch):
         schema = jf.Schema.ToArrow()
         return schema, jf.Batches[num_batch].ToArrow(schema)
 
+    def _run_gc(self):
+        from Apache.Arrow.IntegrationTest import CDataInterface
+        CDataInterface.RunGC()
+
 
 class CSharpCDataExporter(CDataExporter, _CDataBase):
 
@@ -105,6 +108,9 @@ def supports_releasing_memory(self):
         # XXX the C# GC doesn't give reliable allocation measurements
         return False
 
+    def run_gc(self):
+        self._run_gc()
+
 
 class CSharpCDataImporter(CDataImporter, _CDataBase):
 
@@ -134,15 +140,8 @@ def import_batch_and_compare_to_json(self, json_path, num_batch,
     def supports_releasing_memory(self):
         return True
 
-    def gc_until(self, predicate):
-        from Apache.Arrow.IntegrationTest import CDataInterface
-        for i in range(3):
-            if predicate():
-                return True
-            # Collect any C# objects hanging around through Python
-            gc.collect()
-            CDataInterface.RunGC()
-        return predicate()
+    def run_gc(self):
+        self._run_gc()
 
 
 class CSharpTester(Tester):

@@ -200,9 +200,6 @@ def _check_go_error(self, go_error):
             finally:
                 self.dll.ArrowGo_FreeError(go_error)
 
-    def _run_gc(self):
-        self.dll.ArrowGo_RunGC()
-
 
 class GoCDataExporter(CDataExporter, _CDataBase):
     # Note: the Arrow Go C Data export functions expect their output
@@ -225,14 +222,10 @@ def supports_releasing_memory(self):
         return True
 
     def record_allocation_state(self):
-        self._run_gc()
         return self.dll.ArrowGo_BytesAllocated()
 
-    def compare_allocation_state(self, recorded, gc_until):
-        def pred():
-            return self.record_allocation_state() == recorded
-
-        return gc_until(pred)
+    # Note: no need to call the Go GC anywhere thanks to Arrow Go's
+    # explicit refcounting.
 
 
 class GoCDataImporter(CDataImporter, _CDataBase):
@@ -252,10 +245,3 @@ def import_batch_and_compare_to_json(self, json_path, num_batch,
     @property
     def supports_releasing_memory(self):
         return True
-
-    def gc_until(self, predicate):
-        for i in range(10):
-            if predicate():
-                return True
-            self._run_gc()
-        return False