In [1]:
import pandas as pd
import numpy as np
from time import perf_counter

In [2]:
td = pd.DataFrame.from_records(np.random.randn(1000000, 10), columns=list('abcdefghij'))

In [3]:
tic = perf_counter()
td.to_csv('something.csv')
print(perf_counter() - tic)

21.83264280399453


In [13]:
tic = perf_counter()
td.to_msgpack('something.mspk')
print(perf_counter() - tic)

0.23619673500070348


In [25]:
tic = perf_counter()
td.to_hdf('something.h5', '/test')
print(perf_counter() - tic)

0.16679915400163736


In [7]:
import feather

In [26]:
tic = perf_counter()
feather.write_dataframe(td, 'something.feather')
print(perf_counter() - tic)

0.1381438899989007


In [9]:
import fastparquet

In [29]:
tic = perf_counter()
fastparquet.write('something.parq', td, compression='GZIP')
print(perf_counter() - tic)

3.655557748999854


In [32]:
from sqlalchemy import create_engine, Table, MetaData, Column, Float, BigInteger
engine = create_engine('sqlite:///something.db')

In [33]:
# Do this and blow your memory up for 1M records
tic = perf_counter()
td.to_sql('test1', engine, chunksize=10000) 
print(perf_counter() - tic)

23.568673335001222


In [34]:
%load_ext line_profiler

In [35]:
metadata = MetaData()

test_table = Table('test', metadata, 
                   Column('index', BigInteger, index=True),
                   Column('a', Float),
                   Column('b', Float),
                   Column('c', Float),
                   Column('d', Float),
                   Column('e', Float),
                   Column('f', Float),
                   Column('g', Float),
                   Column('h', Float),
                   Column('i', Float),
                   Column('j', Float))

metadata.create_all(engine)
connection = engine.connect()

def chunked_sql(connection, df):
    def chunker(seq, size):
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    tic = perf_counter()

    for chunk in chunker(df, 10000):
        params = [c[1].to_dict() for c in chunk.iterrows()]
        connection.execute(test_table.insert(), params)
    print(perf_counter() - tic)

In [36]:
chunked_sql(connection, td)

64.51136466200114


In [None]:
%lprun -f chunked_sql chunked_sql(connection, td)

In [37]:
td.values

array([[-0.74390642,  1.0754658 , -0.75182111, ..., -0.22499674,
         0.37310822,  0.23912584],
       [-0.72584056,  0.392192  , -3.44725952, ..., -0.19907306,
         0.59576617,  1.0131095 ],
       [-0.31823582, -0.55654468, -1.31238557, ..., -1.21973423,
        -1.39000928, -1.02860781],
       ..., 
       [ 1.13036761,  0.65445893,  0.83913626, ..., -0.1923888 ,
         2.25438088,  1.00333901],
       [ 1.15152809,  1.80424349, -1.26026771, ..., -0.88880133,
        -1.27889135, -1.78820358],
       [ 0.46277507,  0.59565984,  0.92318522, ..., -0.4336095 ,
         0.34822949,  0.99846409]])

In [39]:
tic = perf_counter()
td.to_sql('test2', engine, chunksize=10000)
print(perf_counter() - tic)

19.3291776210026
