ARROW-4867: [Python] Respect ordering of columns argument passed to T…

…able.from_pandas I read through the discussion on ARROW-3766 where this change was originally made, and I think there was a misunderstanding about a comment I made where I said "The columns argument in Table.from_pandas is just for column filtering". I admit it's a bit ambiguous what the right thing to do is, but it seems like the user intent of passing `columns` is to use that order in the resulting schema, but not to error on columns that are not found. We could also introduce "null" type columns for "not found" column names, but we could do that in a separate patch. Author: Wes McKinney <wesm+git@apache.org> Closes #3930 from wesm/ARROW-4867 and squashes the following commits: 4b4ad64 <Wes McKinney> Respect ordering of columns argument passed to Table.from_pandas
apache · Mar 16, 2019 · 76e8fe9 · 76e8fe9
1 parent c707822
commit 76e8fe9
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 7 deletions.
diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
@@ -411,9 +411,7 @@ def _resolve_columns_of_interest(df, schema, columns):
     elif schema is not None:
         columns = schema.names
     elif columns is not None:
-        # columns is only for filtering, the function must keep the column
-        # ordering of either the dataframe or the passed schema
-        columns = [c for c in df.columns if c in columns]
+        columns = [c for c in columns if c in df.columns]
     else:
         columns = df.columns
 

diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
@@ -140,7 +140,8 @@ def test_non_string_columns(self):
         assert table.column(0).name == '0'
 
     def test_from_pandas_with_columns(self):
-        df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]})
+        df = pd.DataFrame({0: [1, 2, 3], 1: [1, 3, 3], 2: [2, 4, 5]},
+                          columns=[1, 0])
 
         table = pa.Table.from_pandas(df, columns=[0, 1])
         expected = pa.Table.from_pandas(df[[0, 1]])
@@ -2495,15 +2496,15 @@ def test_table_from_pandas_columns_argument_only_does_filtering():
 
     columns1 = ['arrays', 'floats', 'partition']
     schema1 = pa.schema([
-        ('partition', pa.int64()),
         ('arrays', pa.list_(pa.int64())),
         ('floats', pa.float64()),
+        ('partition', pa.int64())
     ])
 
     columns2 = ['floats', 'partition']
     schema2 = pa.schema([
-        ('partition', pa.int64()),
-        ('floats', pa.float64())
+        ('floats', pa.float64()),
+        ('partition', pa.int64())
     ])
 
     table1 = pa.Table.from_pandas(df, columns=columns1, preserve_index=False)