In [1]:
import seaborn as sns

# Names of the seaborn example datasets used in this notebook.
keys = ['diamonds', 'car_crashes']

# Load each dataset once and index the resulting DataFrames by dataset name.
dfs = {name: sns.load_dataset(name) for name in keys}

In [2]:
def print_info(dfs):
    """Print a labelled ``info()`` summary for every entry in ``dfs``.

    Args:
        dfs: Mapping whose ``.items()`` yields ``(name, frame)`` pairs. Each
            ``frame`` must provide an ``info()`` method; this notebook passes
            dicts of pandas DataFrames.

    Returns:
        None. For each entry a ``Dataframe: <name>`` header is printed, then
        ``frame.info()`` is called, then a blank line is printed.
    """
    for name, frame in dfs.items():
        header = f'Dataframe: {name}'
        print(header)
        frame.info()
        # Blank line so consecutive summaries do not run together.
        print()

# Summarise the datasets loaded from seaborn, as a baseline for the round trips below.
print_info(dfs)

Dataframe: diamonds
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB

Dataframe: car_crashes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   total           51 non-null     float64
 1   speeding        51 non-null     float64
 2   alcohol         51 non-null  

In [3]:
import keepdb as kd

# Archive path reused by the later cells (relative to the working directory).
filename = 'test_file.zip'
# Write the dict of DataFrames to the zip, then read it straight back.
kd.to_zip(filename, dfs)
dfs2 = kd.from_zip(filename)

# Same summary as for the originals, so the two outputs can be compared by eye.
print_info(dfs2)

Dataframe: diamonds
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB

Dataframe: car_crashes
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   total           51 non-null     float64
 1   speeding        51 non-null     float64
 2   alcohol         51 non-null  

# PyArrow

You can also work directly with [`pyarrow.Table` objects](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html).
This may be useful to avoid dtype conversion issues that occur with Pandas DataFrames.

In [4]:
# Read the zip file written earlier, this time through the `kd.pa` (PyArrow) interface.

tables = kd.pa.from_zip(filename)
# Bare last expression: the notebook displays the table's schema and a data preview.
tables['diamonds']

pyarrow.Table
carat: double
cut: dictionary<values=string, indices=int32, ordered=0>
color: dictionary<values=string, indices=int32, ordered=0>
clarity: dictionary<values=string, indices=int32, ordered=0>
depth: double
table: double
price: int64
x: double
y: double
z: double
----
carat: [[0.23,0.21,0.23,0.29,0.31,...,0.72,0.72,0.7,0.86,0.75]]
cut: [  -- dictionary:
["Ideal","Premium","Very Good","Good","Fair"]  -- indices:
[0,1,3,1,3,...,0,3,2,1,0]]
color: [  -- dictionary:
["D","E","F","G","H","I","J"]  -- indices:
[1,1,1,5,6,...,0,0,0,4,0]]
clarity: [  -- dictionary:
["IF","VVS1","VVS2","VS1","VS2","SI1","SI2","I1"]  -- indices:
[6,5,3,4,6,...,5,5,5,6,6]]
depth: [[61.5,59.8,56.9,62.4,63.3,...,60.8,63.1,62.8,61,62.2]]
table: [[55,61,65,58,58,...,57,55,60,58,55]]
price: [[326,326,327,334,335,...,2757,2757,2757,2757,2757]]
x: [[3.95,3.89,4.05,4.2,4.34,...,5.75,5.69,5.66,6.15,5.83]]
y: [[3.98,3.84,4.07,4.23,4.35,...,5.76,5.75,5.68,6.12,5.87]]
z: [[2.43,2.31,2.31,2.63,2.75,...,3.5,3.61,3.

Use `keepdb.pa.to_zip()` to write `pyarrow.Table`s to parquet.

Note that `keepdb.to_zip()` is the same as `keepdb.pd.to_zip()`: it simply converts pandas DataFrames
to PyArrow Tables and then calls `keepdb.pa.to_zip()`.

In [5]:
# Write the PyArrow tables read above back out to the same `filename`.
kd.pa.to_zip(filename, tables)

Use `keepdb.pa.from_zip()` to read the zip file back as PyArrow Tables instead of DataFrames.

In [6]:
# Re-read the file that `kd.pa.to_zip()` just wrote; this rebinds `tables` to the fresh result.
tables = kd.pa.from_zip(filename)

In [7]:
# Display the re-read table so it can be compared with the earlier `tables['diamonds']` output.
tables['diamonds']

pyarrow.Table
carat: double
cut: dictionary<values=string, indices=int32, ordered=0>
color: dictionary<values=string, indices=int32, ordered=0>
clarity: dictionary<values=string, indices=int32, ordered=0>
depth: double
table: double
price: int64
x: double
y: double
z: double
----
carat: [[0.23,0.21,0.23,0.29,0.31,...,0.72,0.72,0.7,0.86,0.75]]
cut: [  -- dictionary:
["Ideal","Premium","Very Good","Good","Fair"]  -- indices:
[0,1,3,1,3,...,0,3,2,1,0]]
color: [  -- dictionary:
["D","E","F","G","H","I","J"]  -- indices:
[1,1,1,5,6,...,0,0,0,4,0]]
clarity: [  -- dictionary:
["IF","VVS1","VVS2","VS1","VS2","SI1","SI2","I1"]  -- indices:
[6,5,3,4,6,...,5,5,5,6,6]]
depth: [[61.5,59.8,56.9,62.4,63.3,...,60.8,63.1,62.8,61,62.2]]
table: [[55,61,65,58,58,...,57,55,60,58,55]]
price: [[326,326,327,334,335,...,2757,2757,2757,2757,2757]]
x: [[3.95,3.89,4.05,4.2,4.34,...,5.75,5.69,5.66,6.15,5.83]]
y: [[3.98,3.84,4.07,4.23,4.35,...,5.76,5.75,5.68,6.12,5.87]]
z: [[2.43,2.31,2.31,2.63,2.75,...,3.5,3.61,3.