In [3]:
import time
import os

import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
import h5py

In [4]:
df = pd.DataFrame(np.random.randn(15000000, 3), columns=list('ABC'))

In [13]:
base = '/scratch/midway2/annawoodard/'

In [3]:
def benchmark_csv(df, filename):
    path = os.path.join(base, filename)
    
    start = time.time()
    df.to_csv(path)
    write_time = time.time() - start
    
    start = time.time()
    df = pd.read_csv(path)
    read_time = time.time() - start
    
    size = os.stat(path).st_size / 1e9
    
    print('file size: {:.2f} GB\nwrite time: {:.2f} seconds\nread time: {:.3f} seconds'.format(
        size, write_time, read_time))

In [8]:
def benchmark_pickle(df, filename):
    path = os.path.join(base, filename)
    
    start = time.time()
    df.to_pickle(path)
    write_time = time.time() - start
    
    start = time.time()
    df = pd.read_pickle(path)
    read_time = time.time() - start
    
    size = os.stat(path).st_size / 1e9
    
    print('file size: {:.2f} GB\nwrite time: {:.2f} seconds\nread time: {:.3f} seconds'.format(
        size, write_time, read_time))

In [14]:
def benchmark_h5(df, filename):
    path = os.path.join(base, filename)
    
    start = time.time()
    df.to_hdf(path, key='foo', mode='w')
    write_time = time.time() - start
    
    start = time.time()
    file = h5py.File(path, 'r')
    df = file['foo']
    read_time = time.time() - start
    
    size = os.stat(path).st_size / 1e9
    
    print('file size: {:.2f} GB\nwrite time: {:.2f} seconds\nread time: {:.3f} seconds'.format(
        size, write_time, read_time))

In [20]:
def benchmark_parquet(df, filename):
    path = os.path.join(base, filename)
    
    start = time.time()
    table = pa.Table.from_pandas(df)
    pq.write_table(table, path)
    write_time = time.time() - start
    
    start = time.time()
    table = pq.read_table(path)
    df = table.to_pandas()
    read_time = time.time() - start
    
    size = os.stat(path).st_size / 1e9
    
    print('file size: {:.2f} GB\nwrite time: {:.2f} seconds\nread time: {:.2f} seconds'.format(
        size, write_time, read_time))

# uncompressed csv

In [5]:
benchmark_csv(df, 'foo.csv')

file size: 1.01 GB
write time: 105.53 seconds
read time: 11.20 seconds


# compressed csv

In [6]:
benchmark_csv(df, 'foo.csv.bz2')

file size: 0.40 GB
write time: 247.84 seconds
read time: 65.69 seconds


# pickle

In [9]:
benchmark_pickle(df, 'foo.pkl')

file size: 0.36 GB
write time: 0.91 seconds
read time: 0.64 seconds


# HDF5

In [15]:
benchmark_h5(df, 'foo.h5')

file size: 0.48 GB
write time: 0.25 seconds
read time: 0.001 seconds


# parquet

In [21]:
benchmark_parquet(df, 'foo.parquet')

file size: 0.36 GB
write time: 1.38 seconds
read time: 1.54 seconds
