
[Release][Python] Parquet test fails on AlmaLinux8 #30173

Closed

asfimport opened this issue Nov 8, 2021 · 3 comments
When running the verification tests on AlmaLinux 8, a Parquet test fails.

Main steps to reproduce:

dnf -y update
dnf clean all
dnf -y install \
  dnf-plugins-core \
  yum-utils
dnf config-manager --set-enabled powertools
dnf -y update
dnf -y module disable ruby
dnf -y module enable ruby:2.7
dnf -y groupinstall "Development Tools"
dnf -y install \
  epel-release \
  ninja-build \
  libcurl-devel \
  python3-pip \
  python3-devel \
  cmake \
  git \
  ncurses-devel \
  gobject-introspection-devel \
  libffi-devel \
  openssl-devel \
  maven \
  java-1.8.0-openjdk-devel \
  wget \
  readline-devel \
  gdbm-devel \
  ruby-devel \
  llvm-toolset \
  llvm-devel
dnf -y update
alias pip=pip3
alternatives --set python /usr/bin/python3
ln -s /usr/bin/pip3 /usr/bin/pip
git clone https://github.com/apache/arrow/
pip install -r arrow/python/requirements-build.txt \
     -r arrow/python/requirements-test.txt
cd arrow
mkdir dist
export ARROW_HOME=$(pwd)/dist
export LD_LIBRARY_PATH=$(pwd)/dist/lib:$LD_LIBRARY_PATH
cd cpp
mkdir build
cd build
cmake -DCMAKE_INSTALL_PREFIX=$ARROW_HOME \
      -DCMAKE_INSTALL_LIBDIR=lib \
      -DARROW_WITH_BZ2=ON \
      -DARROW_WITH_ZLIB=ON \
      -DARROW_WITH_ZSTD=ON \
      -DARROW_WITH_LZ4=ON \
      -DARROW_WITH_SNAPPY=ON \
      -DARROW_WITH_BROTLI=ON \
      -DARROW_PARQUET=ON \
      -DARROW_PYTHON=ON \
      -DARROW_BUILD_TESTS=ON \
      ..
make -j4
make install
cd ..
cd ..
cd python
export PYARROW_WITH_PARQUET=1
python setup.py build_ext --inplace
export PYARROW_TEST_PARQUET=ON
python -m pytest -r s --pyargs pyarrow
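
Note that the CMake invocation above enables Parquet but not the Arrow dataset module: neither -DARROW_DATASET=ON is passed nor PYARROW_WITH_DATASET exported, so the resulting pyarrow build lacks the pyarrow._dataset extension. A sketch of the extra steps that would pull it in, assuming the directory layout and environment from the steps above:

# Rebuild Arrow C++ with the dataset module, then rebuild pyarrow.
# Assumes we are back in the arrow checkout root, with ARROW_HOME still set.
cd cpp/build
cmake -DARROW_DATASET=ON ..   # other cached options are kept
make -j4
make install
cd ../../python
export PYARROW_WITH_DATASET=1  # tell setup.py to build pyarrow._dataset
python setup.py build_ext --inplace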

Resulting error:

============================================ FAILURES =============================================
________________________________ test_permutation_of_column_order _________________________________

source = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
columns = None, use_threads = True, metadata = None, use_pandas_metadata = False
memory_map = False, read_dictionary = None
filesystem = <pyarrow._fs.LocalFileSystem object at 0x7f70875b7e30>, filters = None
buffer_size = 0, partitioning = 'hive', use_legacy_dataset = False, ignore_prefixes = None
pre_buffer = True, coerce_int96_timestamp_unit = None

    def read_table(source, columns=None, use_threads=True, metadata=None,
                   use_pandas_metadata=False, memory_map=False,
                   read_dictionary=None, filesystem=None, filters=None,
                   buffer_size=0, partitioning="hive", use_legacy_dataset=False,
                   ignore_prefixes=None, pre_buffer=True,
                   coerce_int96_timestamp_unit=None):
        if not use_legacy_dataset:
            if metadata is not None:
                raise ValueError(
                    "The 'metadata' keyword is no longer supported with the new "
                    "datasets-based implementation. Specify "
                    "'use_legacy_dataset=True' to temporarily recover the old "
                    "behaviour."
                )
            try:
                dataset = _ParquetDatasetV2(
                    source,
                    filesystem=filesystem,
                    partitioning=partitioning,
                    memory_map=memory_map,
                    read_dictionary=read_dictionary,
                    buffer_size=buffer_size,
                    filters=filters,
                    ignore_prefixes=ignore_prefixes,
                    pre_buffer=pre_buffer,
>                   coerce_int96_timestamp_unit=coerce_int96_timestamp_unit
                )

pyarrow/parquet.py:1960: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <pyarrow.parquet._ParquetDatasetV2 object at 0x7f7087556da0>
path_or_paths = '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation'
filesystem = None, filters = None, partitioning = 'hive', read_dictionary = None, buffer_size = 0
memory_map = False, ignore_prefixes = None, pre_buffer = True, coerce_int96_timestamp_unit = None
kwargs = {}

    def __init__(self, path_or_paths, filesystem=None, filters=None,
                 partitioning="hive", read_dictionary=None, buffer_size=None,
                 memory_map=False, ignore_prefixes=None, pre_buffer=True,
                 coerce_int96_timestamp_unit=None, **kwargs):
>       import pyarrow.dataset as ds

pyarrow/parquet.py:1680: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

    """Dataset is currently unstable. APIs subject to change without notice."""
    
    import pyarrow as pa
    from pyarrow.util import _is_iterable, _stringify_path, _is_path_like
    
>   from pyarrow._dataset import (  # noqa
        CsvFileFormat,
        CsvFragmentScanOptions,
        Expression,
        Dataset,
        DatasetFactory,
        DirectoryPartitioning,
        FileFormat,
        FileFragment,
        FileSystemDataset,
        FileSystemDatasetFactory,
        FileSystemFactoryOptions,
        FileWriteOptions,
        Fragment,
        HivePartitioning,
        IpcFileFormat,
        IpcFileWriteOptions,
        InMemoryDataset,
        ParquetDatasetFactory,
        ParquetFactoryOptions,
        ParquetFileFormat,
        ParquetFileFragment,
        ParquetFileWriteOptions,
        ParquetFragmentScanOptions,
        ParquetReadOptions,
        Partitioning,
        PartitioningFactory,
        RowGroupInfo,
        Scanner,
        TaggedRecordBatch,
        UnionDataset,
        UnionDatasetFactory,
        _get_partition_keys,
        _filesystemdataset_write,
    )
E   ModuleNotFoundError: No module named 'pyarrow._dataset'

pyarrow/dataset.py:23: ModuleNotFoundError

During handling of the above exception, another exception occurred:

tempdir = PosixPath('/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0')

    def test_permutation_of_column_order(tempdir):
        # ARROW-2366
        case = tempdir / "dataset_column_order_permutation"
        case.mkdir(exist_ok=True)
    
        data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
        pq.write_table(data1, case / "data1.parquet")
    
        data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
        pq.write_table(data2, case / "data2.parquet")
    
>       table = pq.read_table(str(case))

pyarrow/tests/parquet/test_basic.py:645: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
pyarrow/parquet.py:1977: in read_table
    source = filesystem.open_input_file(path)
pyarrow/_fs.pyx:588: in pyarrow._fs.FileSystem.open_input_file
    in_handle = GetResultValue(self.fs.OpenInputFile(pathstr))
pyarrow/error.pxi:143: in pyarrow.lib.pyarrow_internal_check_status
    return check_status(status)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

>   raise IOError(message)
E   OSError: Cannot open for reading: path '/tmp/pytest-of-root/pytest-9/test_permutation_of_column_ord0/dataset_column_order_permutation' is a directory

pyarrow/error.pxi:114: OSError
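
The two chained errors tell the story: pyarrow._dataset was never compiled, so constructing _ParquetDatasetV2 fails with ModuleNotFoundError, and read_table then falls back to opening the source directly, which fails because the path is a directory. A quick way to probe whether a given pyarrow build includes the optional dataset extension (an illustrative check, not part of the test suite):

import pyarrow as pa

# Feature probe: pyarrow.dataset is optional and is only importable when
# Arrow C++ was built with -DARROW_DATASET=ON and pyarrow with
# PYARROW_WITH_DATASET=1.
try:
    import pyarrow.dataset  # noqa: F401
    print("pyarrow", pa.__version__, "includes the dataset module")
except ImportError:
    print("pyarrow", pa.__version__, "was built without the dataset module")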

Reporter: Benson Muite / @bkmgit
Assignee: Joris Van den Bossche / @jorisvandenbossche

PRs and other links:

Note: This issue was originally created as ARROW-14629. Please see the migration documentation for further details.


Joris Van den Bossche / @jorisvandenbossche:
Ah, we forgot to add a dataset marker in this case. (It's easy to forget, since we don't currently have a CI build that excludes the dataset module.)
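
For context, the fix amounts to marking the test so it is skipped when the dataset module is unavailable. A minimal sketch of that pattern, assuming the "dataset" marker group that pyarrow's test suite registers in its tests/conftest.py (the actual change is in the PR linked below):

import pytest

import pyarrow as pa
import pyarrow.parquet as pq


# Sketch only: the "dataset" marker is auto-disabled by pyarrow's conftest.py
# when the optional pyarrow._dataset extension is missing.
@pytest.mark.dataset
def test_permutation_of_column_order(tempdir):
    # ARROW-2366
    case = tempdir / "dataset_column_order_permutation"
    case.mkdir(exist_ok=True)

    data1 = pa.table([[1, 2, 3], [.1, .2, .3]], names=['a', 'b'])
    pq.write_table(data1, case / "data1.parquet")

    data2 = pa.table([[.4, .5, .6], [4, 5, 6]], names=['b', 'a'])
    pq.write_table(data2, case / "data2.parquet")

    table = pq.read_table(str(case))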


Joris Van den Bossche / @jorisvandenbossche:
Issue resolved by pull request #11643.


Kouhei Sutou / @kou:
The verification steps verify master, not 6.0.1, so this fix isn't needed for the 6.0.1 release.
