Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Python][CI] Windows tests are failing with latest pandas 2.0 #34880

Closed
jorisvandenbossche opened this issue Apr 4, 2023 · 1 comment · Fixed by #34881
Closed

[Python][CI] Windows tests are failing with latest pandas 2.0 #34880

jorisvandenbossche opened this issue Apr 4, 2023 · 1 comment · Fixed by #34881

Comments

@jorisvandenbossche
Copy link
Member

See e.g. https://ci.appveyor.com/project/ApacheSoftwareFoundation/arrow/builds/46696219. The failures seem to be related to int32 vs. int64 dtypes being created on Windows:

================================== FAILURES ===================================
_____________ TestZeroCopyConversion.test_zero_copy_dictionaries ______________
self = <pyarrow.tests.test_pandas.TestZeroCopyConversion object at 0x000001947D9150E0>
    def test_zero_copy_dictionaries(self):
        arr = pa.DictionaryArray.from_arrays(
            np.array([0, 0]),
            np.array([5]))
    
        result = arr.to_pandas(zero_copy_only=True)
        values = pd.Categorical([5, 5])
    
>       tm.assert_series_equal(pd.Series(result), pd.Series(values),
                               check_names=False)
E       AssertionError: Attributes of Series are different
E       
E       Attribute "dtype" are different
E       [left]:  CategoricalDtype(categories=[5], ordered=False)
E       [right]: CategoricalDtype(categories=[5], ordered=False)
pyarrow\tests\test_pandas.py:2578: AssertionError
_______ test_dataset_read_pandas_common_metadata[_metadata-False-True] ________
tempdir = WindowsPath('C:/Users/appveyor/AppData/Local/Temp/1/pytest-of-appveyor/pytest-0/test_dataset_read_pandas_commo2')
use_legacy_dataset = True, preserve_index = False, metadata_fname = '_metadata'
    @pytest.mark.pandas
    @parametrize_legacy_dataset
    @pytest.mark.parametrize('preserve_index', [True, False, None])
    @pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"])
    def test_dataset_read_pandas_common_metadata(
        tempdir, use_legacy_dataset, preserve_index, metadata_fname
    ):
        # ARROW-1103
        nfiles = 5
        size = 5
    
        dirpath = tempdir / guid()
        dirpath.mkdir()
    
        test_data = []
        frames = []
        paths = []
        for i in range(nfiles):
            df = _test_dataframe(size, seed=i)
            df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')
    
            path = dirpath / '{}.parquet'.format(i)
    
            table = pa.Table.from_pandas(df, preserve_index=preserve_index)
    
            # Obliterate metadata
            table = table.replace_schema_metadata(None)
            assert table.schema.metadata is None
    
            _write_table(table, path)
            test_data.append(table)
            frames.append(df)
            paths.append(path)
    
        # Write _metadata common file
        table_for_metadata = pa.Table.from_pandas(
            df, preserve_index=preserve_index
        )
        pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname)
    
        dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset)
        columns = ['uint8', 'strings']
        result = dataset.read_pandas(columns=columns).to_pandas()
        expected = pd.concat([x[columns] for x in frames])
        expected.index.name = (
            df.index.name if preserve_index is not False else None)
>       tm.assert_frame_equal(result, expected)
pyarrow\tests\parquet\test_pandas.py:698: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
left = RangeIndex(start=0, stop=25, step=1)
right = Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24],
      dtype='int32')
obj = 'DataFrame.index'
    def _check_types(left, right, obj: str = "Index") -> None:
        if not exact:
            return
    
        assert_class_equal(left, right, exact=exact, obj=obj)
        assert_attr_equal("inferred_type", left, right, obj=obj)
    
        # Skip exact dtype checking when `check_categorical` is False
        if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype):
            if check_categorical:
                assert_attr_equal("dtype", left, right, obj=obj)
                assert_index_equal(left.categories, right.categories, exact=exact)
            return
    
>       assert_attr_equal("dtype", left, right, obj=obj)
E       AssertionError: DataFrame.index are different
E       
E       Attribute "dtype" are different
E       [left]:  int64
E       [right]: int32
C:\Miniconda38-x64\envs\arrow\lib\site-packages\pandas\_testing\asserters.py:247: AssertionError
_______ test_dataset_read_pandas_common_metadata[_metadata-False-False] _______
tempdir = WindowsPath('C:/Users/appveyor/AppData/Local/Temp/1/pytest-of-appveyor/pytest-0/test_dataset_read_pandas_commo3')
use_legacy_dataset = False, preserve_index = False, metadata_fname = '_metadata'
    @pytest.mark.pandas
    @parametrize_legacy_dataset
    @pytest.mark.parametrize('preserve_index', [True, False, None])
    @pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"])
    def test_dataset_read_pandas_common_metadata(
        tempdir, use_legacy_dataset, preserve_index, metadata_fname
    ):
        # ARROW-1103
        nfiles = 5
        size = 5
    
        dirpath = tempdir / guid()
        dirpath.mkdir()
    
        test_data = []
        frames = []
        paths = []
        for i in range(nfiles):
            df = _test_dataframe(size, seed=i)
            df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')
    
            path = dirpath / '{}.parquet'.format(i)
    
            table = pa.Table.from_pandas(df, preserve_index=preserve_index)
    
            # Obliterate metadata
            table = table.replace_schema_metadata(None)
            assert table.schema.metadata is None
    
            _write_table(table, path)
            test_data.append(table)
            frames.append(df)
            paths.append(path)
    
        # Write _metadata common file
        table_for_metadata = pa.Table.from_pandas(
            df, preserve_index=preserve_index
        )
        pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname)
    
        dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset)
        columns = ['uint8', 'strings']
        result = dataset.read_pandas(columns=columns).to_pandas()
        expected = pd.concat([x[columns] for x in frames])
        expected.index.name = (
            df.index.name if preserve_index is not False else None)
>       tm.assert_frame_equal(result, expected)
pyarrow\tests\parquet\test_pandas.py:698: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
left = RangeIndex(start=0, stop=25, step=1)
right = Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24],
      dtype='int32')
obj = 'DataFrame.index'
    def _check_types(left, right, obj: str = "Index") -> None:
        if not exact:
            return
    
        assert_class_equal(left, right, exact=exact, obj=obj)
        assert_attr_equal("inferred_type", left, right, obj=obj)
    
        # Skip exact dtype checking when `check_categorical` is False
        if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype):
            if check_categorical:
                assert_attr_equal("dtype", left, right, obj=obj)
                assert_index_equal(left.categories, right.categories, exact=exact)
            return
    
>       assert_attr_equal("dtype", left, right, obj=obj)
E       AssertionError: DataFrame.index are different
E       
E       Attribute "dtype" are different
E       [left]:  int64
E       [right]: int32
C:\Miniconda38-x64\envs\arrow\lib\site-packages\pandas\_testing\asserters.py:247: AssertionError
____ test_dataset_read_pandas_common_metadata[_common_metadata-False-True] ____
tempdir = WindowsPath('C:/Users/appveyor/AppData/Local/Temp/1/pytest-of-appveyor/pytest-0/test_dataset_read_pandas_commo8')
use_legacy_dataset = True, preserve_index = False
metadata_fname = '_common_metadata'
    @pytest.mark.pandas
    @parametrize_legacy_dataset
    @pytest.mark.parametrize('preserve_index', [True, False, None])
    @pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"])
    def test_dataset_read_pandas_common_metadata(
        tempdir, use_legacy_dataset, preserve_index, metadata_fname
    ):
        # ARROW-1103
        nfiles = 5
        size = 5
    
        dirpath = tempdir / guid()
        dirpath.mkdir()
    
        test_data = []
        frames = []
        paths = []
        for i in range(nfiles):
            df = _test_dataframe(size, seed=i)
            df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')
    
            path = dirpath / '{}.parquet'.format(i)
    
            table = pa.Table.from_pandas(df, preserve_index=preserve_index)
    
            # Obliterate metadata
            table = table.replace_schema_metadata(None)
            assert table.schema.metadata is None
    
            _write_table(table, path)
            test_data.append(table)
            frames.append(df)
            paths.append(path)
    
        # Write _metadata common file
        table_for_metadata = pa.Table.from_pandas(
            df, preserve_index=preserve_index
        )
        pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname)
    
        dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset)
        columns = ['uint8', 'strings']
        result = dataset.read_pandas(columns=columns).to_pandas()
        expected = pd.concat([x[columns] for x in frames])
        expected.index.name = (
            df.index.name if preserve_index is not False else None)
>       tm.assert_frame_equal(result, expected)
pyarrow\tests\parquet\test_pandas.py:698: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
left = RangeIndex(start=0, stop=25, step=1)
right = Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24],
      dtype='int32')
obj = 'DataFrame.index'
    def _check_types(left, right, obj: str = "Index") -> None:
        if not exact:
            return
    
        assert_class_equal(left, right, exact=exact, obj=obj)
        assert_attr_equal("inferred_type", left, right, obj=obj)
    
        # Skip exact dtype checking when `check_categorical` is False
        if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype):
            if check_categorical:
                assert_attr_equal("dtype", left, right, obj=obj)
                assert_index_equal(left.categories, right.categories, exact=exact)
            return
    
>       assert_attr_equal("dtype", left, right, obj=obj)
E       AssertionError: DataFrame.index are different
E       
E       Attribute "dtype" are different
E       [left]:  int64
E       [right]: int32
C:\Miniconda38-x64\envs\arrow\lib\site-packages\pandas\_testing\asserters.py:247: AssertionError
___ test_dataset_read_pandas_common_metadata[_common_metadata-False-False] ____
tempdir = WindowsPath('C:/Users/appveyor/AppData/Local/Temp/1/pytest-of-appveyor/pytest-0/test_dataset_read_pandas_commo9')
use_legacy_dataset = False, preserve_index = False
metadata_fname = '_common_metadata'
    @pytest.mark.pandas
    @parametrize_legacy_dataset
    @pytest.mark.parametrize('preserve_index', [True, False, None])
    @pytest.mark.parametrize('metadata_fname', ["_metadata", "_common_metadata"])
    def test_dataset_read_pandas_common_metadata(
        tempdir, use_legacy_dataset, preserve_index, metadata_fname
    ):
        # ARROW-1103
        nfiles = 5
        size = 5
    
        dirpath = tempdir / guid()
        dirpath.mkdir()
    
        test_data = []
        frames = []
        paths = []
        for i in range(nfiles):
            df = _test_dataframe(size, seed=i)
            df.index = pd.Index(np.arange(i * size, (i + 1) * size), name='index')
    
            path = dirpath / '{}.parquet'.format(i)
    
            table = pa.Table.from_pandas(df, preserve_index=preserve_index)
    
            # Obliterate metadata
            table = table.replace_schema_metadata(None)
            assert table.schema.metadata is None
    
            _write_table(table, path)
            test_data.append(table)
            frames.append(df)
            paths.append(path)
    
        # Write _metadata common file
        table_for_metadata = pa.Table.from_pandas(
            df, preserve_index=preserve_index
        )
        pq.write_metadata(table_for_metadata.schema, dirpath / metadata_fname)
    
        dataset = pq.ParquetDataset(dirpath, use_legacy_dataset=use_legacy_dataset)
        columns = ['uint8', 'strings']
        result = dataset.read_pandas(columns=columns).to_pandas()
        expected = pd.concat([x[columns] for x in frames])
        expected.index.name = (
            df.index.name if preserve_index is not False else None)
>       tm.assert_frame_equal(result, expected)
pyarrow\tests\parquet\test_pandas.py:698: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
left = RangeIndex(start=0, stop=25, step=1)
right = Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24],
      dtype='int32')
obj = 'DataFrame.index'
    def _check_types(left, right, obj: str = "Index") -> None:
        if not exact:
            return
    
        assert_class_equal(left, right, exact=exact, obj=obj)
        assert_attr_equal("inferred_type", left, right, obj=obj)
    
        # Skip exact dtype checking when `check_categorical` is False
        if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype):
            if check_categorical:
                assert_attr_equal("dtype", left, right, obj=obj)
                assert_index_equal(left.categories, right.categories, exact=exact)
            return
    
>       assert_attr_equal("dtype", left, right, obj=obj)
E       AssertionError: DataFrame.index are different
E       
E       Attribute "dtype" are different
E       [left]:  int64
E       [right]: int32
C:\Miniconda38-x64\envs\arrow\lib\site-packages\pandas\_testing\asserters.py:247: AssertionError
@jorisvandenbossche
Copy link
Member Author

I suppose this might need some additional fixes along the same lines as #34498 (there we only fixed the failures that appeared on non-Windows builds).

assignUser pushed a commit that referenced this issue Apr 4, 2023
….0 (#34881)

* Closes: #34880

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Jacob Wujciak-Jens <jacob@wujciak.de>
@assignUser assignUser added this to the 12.0.0 milestone Apr 4, 2023
ArgusLi pushed a commit to Bit-Quill/arrow that referenced this issue May 15, 2023
…ndas 2.0 (apache#34881)

* Closes: apache#34880

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Jacob Wujciak-Jens <jacob@wujciak.de>
rtpsw pushed a commit to rtpsw/arrow that referenced this issue May 16, 2023
…ndas 2.0 (apache#34881)

* Closes: apache#34880

Authored-by: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Signed-off-by: Jacob Wujciak-Jens <jacob@wujciak.de>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment