In [1]:
import pandas as pd
import numpy as np
import pyarrow as pa

def get_types(df):
    """Return the dtype of every column of ``df`` as a tuple of strings, in column order."""
    return tuple(str(dtype) for dtype in df.dtypes)

In [2]:
# Toy frame: one numeric and one string column, each ending in a missing value.
df = pd.DataFrame(
    dict(
        colA=[1, 2, 3, None],
        colB=['A', 'B', 'C', None],
    )
)

In [3]:
# Build the Arrow table once and reuse its two pandas views below.
arrow_table = pa.Table.from_pandas(df)
via_default = arrow_table.to_pandas()
via_arrow_dtype = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)

# Rows 0-2: pandas only; rows 3-5: default Arrow round-trip;
# rows 6-8: Arrow round-trip with types_mapper=pd.ArrowDtype.
out = [
    df,
    df.convert_dtypes(),
    df.convert_dtypes(dtype_backend='pyarrow'),
    via_default,
    via_default.convert_dtypes(),
    via_default.convert_dtypes(dtype_backend='pyarrow'),
    via_arrow_dtype,
    via_arrow_dtype.convert_dtypes(),
    via_arrow_dtype.convert_dtypes(dtype_backend='pyarrow'),
]
pd.DataFrame(list(map(get_types, out)))

Unnamed: 0,0,1
0,float64,object
1,Int64,string
2,int64[pyarrow],string[pyarrow]
3,float64,object
4,Int64,string
5,int64[pyarrow],string[pyarrow]
6,double[pyarrow],string[pyarrow]
7,Float64,string
8,double[pyarrow],string[pyarrow]


In [4]:
# Repeat the comparison with colB stored as a pandas categorical.
df['colB'] = df['colB'].astype('category')

# Build the Arrow table once and reuse its two pandas views below.
arrow_table = pa.Table.from_pandas(df)
via_default = arrow_table.to_pandas()
via_arrow_dtype = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)

out = [
    df,
    df.convert_dtypes(),
    df.convert_dtypes(dtype_backend='pyarrow'),
    via_default,
    via_default.convert_dtypes(),
    via_default.convert_dtypes(dtype_backend='pyarrow'),
    via_arrow_dtype,
    # via_arrow_dtype.convert_dtypes() is deliberately left out: with the
    # dictionary-typed colB it raises KeyError: DictionaryType(...), as the
    # last cell of this notebook records.
    via_arrow_dtype.convert_dtypes(dtype_backend='pyarrow'),
]
pd.DataFrame(list(map(get_types, out)))

Unnamed: 0,0,1
0,float64,category
1,Int64,category
2,int64[pyarrow],category
3,float64,category
4,Int64,category
5,int64[pyarrow],category
6,double[pyarrow],"dictionary<values=string, indices=int8, ordere..."
7,double[pyarrow],"dictionary<values=string, indices=int8, ordere..."


In [46]:
def canonicalize(df):  # best one?
    """Convert ``df`` to pyarrow-backed dtypes, then round-trip it through an Arrow table.

    Returns the pandas frame produced by ``Table.to_pandas()`` with default settings.
    """
    arrow_backed = df.convert_dtypes(dtype_backend='pyarrow')
    return pa.Table.from_pandas(arrow_backed).to_pandas()

In [47]:
# Show which dtypes canonicalize() assigns to each column of df.
canonical = canonicalize(df)
canonical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   colA    3 non-null      int64[pyarrow]
 1   colB    3 non-null      category      
dtypes: category(1), int64[pyarrow](1)
memory usage: 301.0 bytes


In [48]:
# Send the canonicalized frame through Arrow a second time and show the resulting dtypes.
canonical_table = pa.Table.from_pandas(canonicalize(df))
canonical_table.to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   colA    3 non-null      int64[pyarrow]
 1   colB    3 non-null      category      
dtypes: category(1), int64[pyarrow](1)
memory usage: 301.0 bytes


In [49]:
def canonicalize(df):
    """Apply ``convert_dtypes()`` with its default backend, then round-trip through Arrow.

    Returns the pandas frame produced by ``Table.to_pandas()`` with default settings.
    """
    converted = df.convert_dtypes()
    return pa.Table.from_pandas(converted).to_pandas()

In [51]:
# Show which dtypes canonicalize() assigns to each column of df.
canonical = canonicalize(df)
canonical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   colA    3 non-null      Int64   
 1   colB    3 non-null      category
dtypes: Int64(1), category(1)
memory usage: 304.0 bytes


In [52]:
# Send the canonicalized frame through Arrow a second time and show the resulting dtypes.
canonical_table = pa.Table.from_pandas(canonicalize(df))
canonical_table.to_pandas().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   colA    3 non-null      Int64   
 1   colB    3 non-null      category
dtypes: Int64(1), category(1)
memory usage: 304.0 bytes


In [36]:
# Show which dtypes canonicalize() assigns to each column of df.
canonical = canonicalize(df)
canonical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   colA    3 non-null      Int64   
 1   colB    3 non-null      category
dtypes: Int64(1), category(1)
memory usage: 304.0 bytes


In [53]:
def canonicalize(df):  # not idempotent because error
    """Apply ``convert_dtypes()``, then round-trip through Arrow keeping Arrow-backed dtypes.

    ``types_mapper=pd.ArrowDtype`` makes ``to_pandas()`` return ArrowDtype columns.
    NOTE(review): the "error" in the comment above presumably refers to the KeyError
    that ``convert_dtypes()`` raises on a dictionary-typed Arrow column (recorded in
    the last cell of this notebook), which a second call on this function's own
    output would hit -- confirm.
    """
    converted = df.convert_dtypes()
    return pa.Table.from_pandas(converted).to_pandas(types_mapper=pd.ArrowDtype)

In [54]:
# Show which dtypes canonicalize() assigns to each column of df.
canonical = canonicalize(df)
canonical.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype                                                      
---  ------  --------------  -----                                                      
 0   colA    3 non-null      int64[pyarrow]                                             
 1   colB    3 non-null      dictionary<values=string, indices=int8, ordered=0>[pyarrow]
dtypes: dictionary<values=string, indices=int8, ordered=0>[pyarrow](1), int64[pyarrow](1)
memory usage: 185.0 bytes


In [16]:
# Convert to the pyarrow backend, round-trip through an Arrow table, then convert again.
arrow_backed = df.convert_dtypes(dtype_backend='pyarrow')
roundtripped = pa.Table.from_pandas(arrow_backed).to_pandas()
roundtripped.convert_dtypes(dtype_backend='pyarrow').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   colA    3 non-null      int64[pyarrow]
 1   colB    3 non-null      category      
dtypes: category(1), int64[pyarrow](1)
memory usage: 301.0 bytes


In [6]:
# Plain Arrow round-trip with no dtype conversion, for comparison.
plain_roundtrip = pa.Table.from_pandas(df).to_pandas()
plain_roundtrip.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   colA    3 non-null      float64 
 1   colB    3 non-null      category
dtypes: category(1), float64(1)
memory usage: 300.0 bytes


In [7]:
# Known failure: on the library versions this notebook was recorded with,
# convert_dtypes() raises KeyError for the dictionary-typed Arrow column that the
# categorical colB becomes. Report the error instead of letting it abort a
# Restart & Run All of the notebook.
try:
    converted = (
        pa.Table.from_pandas(df)
        .to_pandas(types_mapper=pd.ArrowDtype)
        .convert_dtypes()
    )
except KeyError as exc:
    converted = None
    print(f"{type(exc).__name__}: {exc}")
converted  # displays the frame if the conversion succeeds; None displays nothing

KeyError: DictionaryType(dictionary<values=string, indices=int8, ordered=0>)