In [1]:
import pandas as pd
import numpy as np
import uuid
import random

def make_node():
    """Return a newly generated random UUID (version 4) rendered as a string."""
    node_uuid = uuid.uuid4()
    return str(node_uuid)

def make_int(null_prob=0.1):
    """Return a random integer drawn from [0, 1000), or None.

    Args:
        null_prob: threshold compared against a uniform draw from [0, 1);
            when the draw is <= null_prob the result is replaced by None.

    Returns:
        The drawn integer, or None when the null draw fires.
    """
    # The value is always drawn first so the RNG is consumed in the same
    # order whether or not the result ends up being None.
    value = np.random.randint(1000)
    is_null = np.random.rand() <= null_prob
    return None if is_null else value

def make_float(null_prob=0.1):
    """Return a random float drawn from [0, 100), or None.

    Args:
        null_prob: threshold compared against a uniform draw from [0, 1);
            when the draw is <= null_prob the result is replaced by None.

    Returns:
        The drawn float, or None when the null draw fires.
    """
    # Draw the value before the null check so RNG consumption stays fixed.
    value = np.random.rand() * 100
    is_null = np.random.rand() <= null_prob
    return None if is_null else value

def make_list(null_prob=0.1):
    """Return a list of 0 to 3 node ids from make_node(), or None.

    Args:
        null_prob: threshold compared against a uniform draw from [0, 1);
            when the draw is <= null_prob the result is replaced by None.

    Returns:
        A (possibly empty) list of node id strings, or None.
    """
    size = np.random.randint(4)

    nodes = []
    for _ in range(size):
        nodes.append(make_node())

    # The null draw happens after the list is built, as a final override.
    return None if np.random.rand() <= null_prob else nodes

def make_dict(null_prob=0.1):
    """Return a dict mapping a random subset of 'A'..'D' to random floats, or None.

    The four letters are shuffled, a prefix of 0 to 3 of them is kept, and each
    kept letter is mapped to a uniform draw from [0, 1).

    Args:
        null_prob: threshold compared against a uniform draw from [0, 1);
            when the draw is <= null_prob the result is replaced by None.

    Returns:
        The generated dict (possibly empty), or None.
    """
    letters = ['A', 'B', 'C', 'D']
    random.shuffle(letters)

    # randint's upper bound is exclusive, so at most len(letters) - 1 keys are kept.
    count = np.random.randint(len(letters))

    result = {}
    for letter in letters[:count]:
        result[letter] = np.random.rand()

    if np.random.rand() <= null_prob:
        return None
    return result

def make_category(null_prob=0.1):
    """Return one random letter from 'A'..'G', or None.

    Args:
        null_prob: threshold compared against a uniform draw from [0, 1);
            when the draw is <= null_prob the result is replaced by None.

    Returns:
        A single-character string, or None when the null draw fires.
    """
    labels = 'ABCDEFG'
    # Pick the label first, then decide whether to null it out.
    label = labels[np.random.randint(len(labels))]
    is_null = np.random.rand() <= null_prob
    return None if is_null else label


def make_row():
    """Build one synthetic record with one value per column.

    Returns:
        A dict with keys node_id, the_int, the_float, the_list, the_dict and
        the_category, each filled by the matching make_* helper called with
        its default arguments.
    """
    # Column order here is also the order in which the generators are called.
    generators = (
        ('node_id', make_node),
        ('the_int', make_int),
        ('the_float', make_float),
        ('the_list', make_list),
        ('the_dict', make_dict),
        ('the_category', make_category),
    )
    return {column: generate() for column, generate in generators}

In [2]:
# Seed both RNGs used by the make_* helpers (stdlib `random` for the shuffle,
# numpy for everything else) so the synthetic frame is the same on every
# Restart & Run All. node_id values still differ between runs: uuid4 does not
# draw from either seeded generator.
random.seed(0)
np.random.seed(0)
df = pd.DataFrame.from_records([make_row() for _ in range(1000)])

In [3]:
# Store the category labels with pandas' categorical dtype.
df['the_category'] = df['the_category'].astype('category')

In [4]:
# Baseline summary of the generated frame (dtypes and non-null counts).
# In the recorded output the_int is float64 (presumably because of the injected
# None values) and the list/dict columns are plain object.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   node_id       1000 non-null   object  
 1   the_int       894 non-null    float64 
 2   the_float     901 non-null    float64 
 3   the_list      893 non-null    object  
 4   the_dict      882 non-null    object  
 5   the_category  913 non-null    category
dtypes: category(1), float64(2), object(3)
memory usage: 40.5+ KB


In [5]:
# Check what infer_objects() makes of the frame; the recorded summary is
# identical to the plain df.info() output.
df.infer_objects().info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   node_id       1000 non-null   object  
 1   the_int       894 non-null    float64 
 2   the_float     901 non-null    float64 
 3   the_list      893 non-null    object  
 4   the_dict      882 non-null    object  
 5   the_category  913 non-null    category
dtypes: category(1), float64(2), object(3)
memory usage: 40.5+ KB


In [6]:
# Check convert_dtypes with the pyarrow backend. In the recorded output
# node_id, the_int and the_float become pyarrow-backed, while the_list and
# the_dict stay object and the_category stays category.
df.convert_dtypes(dtype_backend='pyarrow').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype          
---  ------        --------------  -----          
 0   node_id       1000 non-null   string[pyarrow]
 1   the_int       894 non-null    int64[pyarrow] 
 2   the_float     901 non-null    double[pyarrow]
 3   the_list      893 non-null    object         
 4   the_dict      882 non-null    object         
 5   the_category  913 non-null    category       
dtypes: category(1), double[pyarrow](1), int64[pyarrow](1), object(2), string[pyarrow](1)
memory usage: 72.0+ KB


In [7]:
import keepdb

In [8]:
# Show df's summary again; the recorded output matches the first df.info() cell.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   node_id       1000 non-null   object  
 1   the_int       894 non-null    float64 
 2   the_float     901 non-null    float64 
 3   the_list      893 non-null    object  
 4   the_dict      882 non-null    object  
 5   the_category  913 non-null    category
dtypes: category(1), float64(2), object(3)
memory usage: 40.5+ KB


In [9]:
# Hand df to keepdb under the name 'df', targeting 'bah.zip'.
# NOTE(review): presumably this writes the frame into a zip archive on disk —
# confirm against keepdb's documentation.
keepdb.to_zip('bah.zip', {'df': df})

In [10]:
# Load 'bah.zip' back through keepdb's `pa` namespace.
# NOTE(review): later cells index the result by table name and use .schema and
# .to_pandas(), so this is presumably a mapping of Arrow tables — confirm.
tables = keepdb.pa.from_zip('bah.zip')

In [11]:
# Inspect the schema of the reloaded table. In the recorded output the list and
# dict columns carry list<string> and struct<A, B, C, D> types, and the_int is double.
tables['df'].schema

node_id: string
the_int: double
the_float: double
the_list: list<element: string>
  child 0, element: string
the_dict: struct<A: double, B: double, C: double, D: double>
  child 0, A: double
  child 1, B: double
  child 2, C: double
  child 3, D: double
the_category: dictionary<values=string, indices=int32, ordered=0>
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 821

In [12]:
# Convert the reloaded table to pandas, mapping every Arrow type to pd.ArrowDtype.
df2 = tables['df'].to_pandas(types_mapper=pd.ArrowDtype)

In [13]:
# Summary of the round-tripped frame; in the recorded output every column is
# pyarrow-backed, including the list, struct and dictionary columns.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column        Non-Null Count  Dtype                                                       
---  ------        --------------  -----                                                       
 0   node_id       1000 non-null   string[pyarrow]                                             
 1   the_int       894 non-null    double[pyarrow]                                             
 2   the_float     901 non-null    double[pyarrow]                                             
 3   the_list      893 non-null    list<element: string>[pyarrow]                              
 4   the_dict      882 non-null    struct<A: double, B: double, C: double, D: double>[pyarrow] 
 5   the_category  913 non-null    dictionary<values=string, indices=int32, ordered=0>[pyarrow]
dtypes: dictionary<values=string, indices=int32, ordered=0>[pyarrow](1), double[pyarrow](2), list<element: string>[pyarrow](1),

In [None]:
import pyarrow as pa

In [None]:
# Display the original frame.
df

In [None]:
# Convert df to an Arrow table once and reuse it for all three round-trips
# (the original rebuilt the same table for every line).
arrow_table = pa.Table.from_pandas(df)

# dfa: every column mapped to its Arrow-backed pandas dtype.
dfa = arrow_table.to_pandas(types_mapper=pd.ArrowDtype)
# dfb: pyarrow's default conversion. The original passed a positional None,
# which is the default value of to_pandas' first parameter (memory_pool).
dfb = arrow_table.to_pandas()
# dfc: default conversion, then pandas' own dtype inference with the pyarrow backend.
dfc = arrow_table.to_pandas().convert_dtypes(dtype_backend='pyarrow')

In [None]:
# Summary of dfa (Arrow round-trip with types_mapper=pd.ArrowDtype).
dfa.info()

In [None]:
# Summary of dfb (Arrow round-trip with no types_mapper).
dfb.info()

In [None]:
# Summary of dfc (Arrow round-trip with no types_mapper, followed by
# convert_dtypes with the pyarrow backend).
dfc.info()

In [None]:
# Apply convert_dtypes (pyarrow backend) to the already Arrow-backed dfa and
# display the resulting frame.
dfa.convert_dtypes(dtype_backend='pyarrow')