In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa

def get_types(df):
    """Return the dtype of every column of ``df`` as a tuple of strings.

    Entries follow the order of ``df.dtypes``; each one is ``str(dtype)``.
    """
    return tuple(str(dtype) for dtype in df.dtypes)

In [2]:
# Dtype classes compared in this notebook:
#   pandas.core.dtypes.dtypes.ArrowDtype
#   pandas.core.arrays.string_.StringDtype

In [3]:
# Small example frame: one numeric, one string and one categorical column,
# each with a missing value in the last row.
df = pd.DataFrame({
    'colA': [1, 2, 3, None],
    'colB': ['A', 'B', 'C', None],
    'colC': ['aa', 'bb', 'cc', None],
}).astype({'colC': 'category'})

In [4]:
# Inspect the dtypes of the freshly built frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   colA    3 non-null      float64 
 1   colB    3 non-null      object  
 2   colC    3 non-null      category
dtypes: category(1), float64(1), object(1)
memory usage: 332.0+ bytes


In [41]:
def canon1(df): # works, but doesn't use pyarrow categorical
    """Canonicalize dtypes via a pyarrow-backed conversion and an Arrow round trip.

    ``df`` is converted with ``convert_dtypes(dtype_backend='pyarrow')``,
    turned into a ``pa.Table`` and converted back with a plain ``to_pandas()``
    (no ``types_mapper``).  In the outputs recorded in this notebook the
    categorical column comes back as pandas ``category`` rather than as an
    Arrow dictionary type.

    Returns the round-tripped DataFrame; ``df`` itself is not modified.
    """
    arrow_backed = df.convert_dtypes(dtype_backend='pyarrow')
    return pa.Table.from_pandas(arrow_backed).to_pandas()

def canon2(df): # not idempotent; gives error
    """Canonicalize dtypes so that every column ends up Arrow-backed.

    ``df`` is converted with ``convert_dtypes(dtype_backend='pyarrow')``,
    turned into a ``pa.Table`` and converted back with
    ``to_pandas(types_mapper=pd.ArrowDtype)``.  In the outputs recorded in
    this notebook the categorical column becomes
    ``dictionary<values=string, indices=int8, ordered=0>[pyarrow]``.

    Known limitation (demonstrated by later cells): passing the result through
    ``canon2`` again, or through a plain Arrow round trip, raises
    ``ValueError`` because that dictionary dtype string is not recognized.

    Returns the converted DataFrame; ``df`` itself is not modified.
    """
    arrow_backed = df.convert_dtypes(dtype_backend='pyarrow')
    return pa.Table.from_pandas(arrow_backed).to_pandas(types_mapper=pd.ArrowDtype)

In [42]:
# Dtypes after canon1: the categorical column is reported as pandas `category`.
canon1(df).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   colA    3 non-null      int64[pyarrow]
 1   colB    3 non-null      string        
 2   colC    3 non-null      category      
dtypes: category(1), int64[pyarrow](1), string(1)
memory usage: 337.0 bytes


In [43]:
# Same check via .dtypes, which prints the string column as `string[pyarrow]`
# where .info() printed just `string`.
canon1(df).dtypes

colA     int64[pyarrow]
colB    string[pyarrow]
colC           category
dtype: object

In [44]:
# The dtype object behind colB is pandas' StringDtype, even though it prints
# as `string[pyarrow]`.
type(canon1(df).colB.dtype)

pandas.core.arrays.string_.StringDtype

In [31]:
# Dtypes after canon2: every column, including the categorical one, is
# reported with a `[pyarrow]` dtype.
canon2(df).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype                                                      
---  ------  --------------  -----                                                      
 0   colA    3 non-null      int64[pyarrow]                                             
 1   colB    3 non-null      string[pyarrow]                                            
 2   colC    3 non-null      dictionary<values=string, indices=int8, ordered=0>[pyarrow]
dtypes: dictionary<values=string, indices=int8, ordered=0>[pyarrow](1), int64[pyarrow](1), string[pyarrow](1)
memory usage: 208.0 bytes


In [45]:
# Idempotency check for canon2: applying it to its own output raises ValueError.
canon2(canon2(df))

ValueError: format number 1 of "dictionary<values=string, indices=int8, ordered=0>[pyarrow]" is not recognized

In [46]:
# A second canon2 call is not required to trigger the failure: a plain Arrow
# round trip of canon2's output raises the same ValueError.
pa.Table.from_pandas(canon2(df)).to_pandas()

ValueError: format number 1 of "dictionary<values=string, indices=int8, ordered=0>[pyarrow]" is not recognized

In [12]:
# Same round trip with .info() appended; it fails with the same ValueError.
pa.Table.from_pandas(canon2(df)).to_pandas().info()

ValueError: format number 1 of "dictionary<values=string, indices=int8, ordered=0>[pyarrow]" is not recognized

In [54]:
# Show the values (not just the dtypes) produced by canon1.
canon1(df)

Unnamed: 0,colA,colB,colC
0,1.0,A,aa
1,2.0,B,bb
2,3.0,C,cc
3,,,


In [46]:
# Idempotency check. The original cell called `canon`, which no cell in this
# notebook defines (NameError on a fresh kernel); the recorded ValueError is
# the one canon2 produces, so the call uses that name.
canon2(canon2(df))

ValueError: format number 1 of "dictionary<values=string, indices=int8, ordered=0>[pyarrow]" is not recognized

In [43]:
# Inspect the dtype object of the categorical column after canonicalize.
# NOTE(review): `canonicalize` is only defined in later cells of this notebook,
# so this cell fails on a fresh top-to-bottom run.
canonicalize(df).colC.dtype

CategoricalDtype(categories=['aa', 'bb', 'cc'], ordered=False, categories_dtype=object)

In [19]:
# Inspect the string column after canonicalize.
# NOTE(review): `canonicalize` is only defined in later cells of this notebook,
# so this cell fails on a fresh top-to-bottom run.
canonicalize(df)['colB']

0       A
1       B
2       C
3    <NA>
Name: colB, dtype: string[pyarrow]

In [48]:
# Round-trip canonicalize's output through Arrow once more and inspect the dtypes.
# NOTE(review): the recorded output lists two columns, unlike the three-column
# `df` built at the top of the notebook — the output looks stale; re-run to confirm.
pa.Table.from_pandas(canonicalize(df)).to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   colA    3 non-null      int64[pyarrow]
 1   colB    3 non-null      category      
dtypes: category(1), int64[pyarrow](1)
memory usage: 301.0 bytes


In [49]:
def canonicalize(df):
    """Canonicalize dtypes with the default ``convert_dtypes`` and an Arrow round trip.

    ``df`` is converted with ``convert_dtypes()`` (no ``dtype_backend``
    argument), turned into a ``pa.Table`` and converted back with a plain
    ``to_pandas()``.  The outputs recorded in this notebook show ``Int64`` and
    ``category`` columns for this variant.

    NOTE: a later cell defines ``canonicalize`` again with
    ``types_mapper=pd.ArrowDtype``; whichever definition was executed last is
    the one in effect.

    Returns the round-tripped DataFrame; ``df`` itself is not modified.
    """
    nullable = df.convert_dtypes()
    return pa.Table.from_pandas(nullable).to_pandas()

In [51]:
# Dtypes after the default-backend canonicalize defined in the previous cell.
canonicalize(df).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   colA    3 non-null      Int64   
 1   colB    3 non-null      category
dtypes: Int64(1), category(1)
memory usage: 304.0 bytes


In [52]:
# Round-tripping canonicalize's output through Arrow again reports the same
# dtypes as the previous cell.
pa.Table.from_pandas(canonicalize(df)).to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   colA    3 non-null      Int64   
 1   colB    3 non-null      category
dtypes: Int64(1), category(1)
memory usage: 304.0 bytes


In [36]:
# Repeat of the dtype check above (recorded with an earlier execution count).
canonicalize(df).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   colA    3 non-null      Int64   
 1   colB    3 non-null      category
dtypes: Int64(1), category(1)
memory usage: 304.0 bytes


In [53]:
def canonicalize(df): # not idempotent because error
    """Canonicalize dtypes with default ``convert_dtypes`` and an ArrowDtype-mapped round trip.

    ``df`` is converted with ``convert_dtypes()``, turned into a ``pa.Table``
    and converted back with ``to_pandas(types_mapper=pd.ArrowDtype)``.  The
    output recorded in this notebook shows the categorical column becoming
    ``dictionary<values=string, indices=int8, ordered=0>[pyarrow]``.

    NOTE: this definition shadows the earlier ``canonicalize`` in this
    notebook, which used a plain ``to_pandas()``.

    Returns the converted DataFrame; ``df`` itself is not modified.
    """
    nullable = df.convert_dtypes()
    return pa.Table.from_pandas(nullable).to_pandas(types_mapper=pd.ArrowDtype)

In [54]:
# Dtypes after the ArrowDtype-mapped canonicalize: the categorical column is
# reported as an Arrow dictionary type.
canonicalize(df).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype                                                      
---  ------  --------------  -----                                                      
 0   colA    3 non-null      int64[pyarrow]                                             
 1   colB    3 non-null      dictionary<values=string, indices=int8, ordered=0>[pyarrow]
dtypes: dictionary<values=string, indices=int8, ordered=0>[pyarrow](1), int64[pyarrow](1)
memory usage: 185.0 bytes


In [16]:
# pyarrow-backed convert_dtypes, Arrow round trip, then a second pyarrow-backed
# convert_dtypes; the recorded output still reports the categorical column as
# pandas `category`.
(
    pa.Table
    .from_pandas(df.convert_dtypes(dtype_backend='pyarrow'))
    .to_pandas()
    .convert_dtypes(dtype_backend='pyarrow')
    .info()
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   colA    3 non-null      int64[pyarrow]
 1   colB    3 non-null      category      
dtypes: category(1), int64[pyarrow](1)
memory usage: 301.0 bytes


In [6]:
# Plain Arrow round trip with no convert_dtypes: the recorded dtypes are
# float64 and category.
pa.Table.from_pandas(df).to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   colA    3 non-null      float64 
 1   colB    3 non-null      category
dtypes: category(1), float64(1)
memory usage: 300.0 bytes


In [7]:
# Mapping to ArrowDtype first and then calling convert_dtypes() raises
# KeyError on the Arrow dictionary type.
pa.Table.from_pandas(df).to_pandas(types_mapper=pd.ArrowDtype).convert_dtypes()

KeyError: DictionaryType(dictionary<values=string, indices=int8, ordered=0>)