In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dtt
import pyarrow as pa
import pyarrow.csv as pa_csv
import random
import string

In [2]:
def gen_random_string(length: int = 32) -> str:
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length))

In [3]:
def gen_random_df() -> pd.core.frame.DataFrame:
    dt = pd.date_range(start=dtt(2008, 1, 1), end=dtt(2021, 1, 1), freq='min')
    
    np.random.seeed = 42
    df_size = len(dt)
    
    return pd.DataFrame({
        'date': dt,
        'a': np.random.rand(df_size),
        'b': np.random.rand(df_size),
        'c': np.random.rand(df_size),
        'd': np.random.rand(df_size),
        'e': np.random.rand(df_size),
        'str1': [gen_random_string() for x in range(df_size)],
        'str2': [gen_random_string() for x in range(df_size)],
    })

## Pandas

In [4]:
%%time
df = gen_random_df()

CPU times: user 1min 8s, sys: 740 ms, total: 1min 9s
Wall time: 1min 9s


In [5]:
df.head()

Unnamed: 0,date,a,b,c,d,e,str1,str2
0,2008-01-01 00:00:00,0.838608,0.289483,0.903988,0.093236,0.410304,HXN0A3VV9SMHR3ULR76RQ0F4U1Q0CNJ2,KU5NQ85YY583GPCKMDB4RC25QL9HMLIO
1,2008-01-01 00:01:00,0.949606,0.710316,0.638049,0.250692,0.070841,SZSP1MIEECO2OR1Y3X0Q72E5IUC4GUXP,D6N3LWE0PNG20KBOB1AN5MGVEP8F7YVS
2,2008-01-01 00:02:00,0.437038,0.829742,0.954897,0.579914,0.861055,9GK36SUZR1MS6IXJWCXVEUGGSAF17CLR,W9JYROCYMX4CYXYAXUBOMQUR1MFYYXET
3,2008-01-01 00:03:00,0.914051,0.333282,0.498265,0.136487,0.844198,BW8IWSN0B8ZXC2X8NT2COFRHDU7FE5XX,BBXEDSRMCEZW8EHU4MCNWFJUGS8BLGUB
4,2008-01-01 00:04:00,0.801538,0.714043,0.502537,0.500085,0.083226,VMJYM9HWL9EXANLFXQ6F8VC8G4HBB8JY,XKOJN794IP599NVCAS6YX76I5I3SVA00


In [6]:
df.shape

(6838561, 8)

In [7]:
%%time
df.to_csv('datasets/csv_pandas.csv', index=False)

CPU times: user 1min 7s, sys: 1.51 s, total: 1min 8s
Wall time: 1min 11s


In [8]:
%%time
df.to_csv('datasets/csv_pandas.csv.gz', index=False, compression='gzip')

CPU times: user 3min 8s, sys: 1.18 s, total: 3min 9s
Wall time: 3min 12s


In [9]:
%%time
df1 = pd.read_csv('datasets/csv_pandas.csv')

CPU times: user 15.9 s, sys: 4.62 s, total: 20.5 s
Wall time: 21.9 s


In [10]:
%%time
df2 = pd.read_csv('datasets/csv_pandas.csv.gz')

CPU times: user 23.6 s, sys: 4.66 s, total: 28.3 s
Wall time: 30 s


In [11]:
df_pa = df.copy()

In [12]:
df_pa['date'] = df_pa['date'].values.astype(np.int64) // 18 ** 9

In [13]:
df_pa.head()

Unnamed: 0,date,a,b,c,d,e,str1,str2
0,6045321,0.838608,0.289483,0.903988,0.093236,0.410304,HXN0A3VV9SMHR3ULR76RQ0F4U1Q0CNJ2,KU5NQ85YY583GPCKMDB4RC25QL9HMLIO
1,6045321,0.949606,0.710316,0.638049,0.250692,0.070841,SZSP1MIEECO2OR1Y3X0Q72E5IUC4GUXP,D6N3LWE0PNG20KBOB1AN5MGVEP8F7YVS
2,6045321,0.437038,0.829742,0.954897,0.579914,0.861055,9GK36SUZR1MS6IXJWCXVEUGGSAF17CLR,W9JYROCYMX4CYXYAXUBOMQUR1MFYYXET
3,6045321,0.914051,0.333282,0.498265,0.136487,0.844198,BW8IWSN0B8ZXC2X8NT2COFRHDU7FE5XX,BBXEDSRMCEZW8EHU4MCNWFJUGS8BLGUB
4,6045322,0.801538,0.714043,0.502537,0.500085,0.083226,VMJYM9HWL9EXANLFXQ6F8VC8G4HBB8JY,XKOJN794IP599NVCAS6YX76I5I3SVA00


## PyArrow

In [14]:
df_pa_table = pa.Table.from_pandas(df_pa)

In [15]:
%%time
pa_csv.write_csv(df_pa_table, 'datasets/csv_pyarrow.csv')

CPU times: user 6.02 s, sys: 615 ms, total: 6.64 s
Wall time: 8.1 s


In [16]:
%%time

with pa.CompressedOutputStream('datasets/csv_pyarrow.csv.gz', 'gzip') as out:
    pa_csv.write_csv(df_pa_table, out)

CPU times: user 1min 18s, sys: 475 ms, total: 1min 19s
Wall time: 1min 21s


In [17]:
%%time
df_pa1 = pa_csv.read_csv('datasets/csv_pyarrow.csv')

CPU times: user 6.56 s, sys: 2.69 s, total: 9.25 s
Wall time: 5.76 s


In [18]:
%%time
df_pa2 = pa_csv.read_csv('datasets/csv_pyarrow.csv.gz')

CPU times: user 14.1 s, sys: 1.62 s, total: 15.7 s
Wall time: 10.1 s


## Parquet

In [19]:
%%time
table = pa.Table.from_pandas(df)

CPU times: user 1.73 s, sys: 3.98 s, total: 5.72 s
Wall time: 7.42 s


In [20]:
%%time

import pyarrow.parquet as pq

pq.write_table(table, 'datasets/df.parquet')

CPU times: user 1.99 s, sys: 1.15 s, total: 3.15 s
Wall time: 7.95 s


In [21]:
table.schema

date: timestamp[ns]
a: double
b: double
c: double
d: double
e: double
str1: string
str2: string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1113

In [22]:
table.shape

(6838561, 8)

In [23]:
table

pyarrow.Table
date: timestamp[ns]
a: double
b: double
c: double
d: double
e: double
str1: string
str2: string
----
date: [[2008-01-01 00:00:00.000000000,2008-01-01 00:01:00.000000000,2008-01-01 00:02:00.000000000,2008-01-01 00:03:00.000000000,2008-01-01 00:04:00.000000000,...,2020-12-31 23:56:00.000000000,2020-12-31 23:57:00.000000000,2020-12-31 23:58:00.000000000,2020-12-31 23:59:00.000000000,2021-01-01 00:00:00.000000000]]
a: [[0.8386083811816082,0.9496059656573795,0.4370376008891683,0.9140509306355624,0.801538146183963,...,0.027526856750125672,0.05302330882384365,0.4550113035650607,0.13441335183181513,0.669967983404543]]
b: [[0.2894832440712872,0.7103159078904976,0.8297423887118833,0.3332818669946236,0.7140429769582083,...,0.2545229213726765,0.6970564031052601,0.26893050713845223,0.045630783787576346,0.6725564832244854]]
c: [[0.9039875648132133,0.6380485030438576,0.9548968990063806,0.49826522156905606,0.5025372974047053,...,0.12084021482000706,0.24970061864953264,0.9165223577254412,

In [24]:
%%time

table2 = pq.read_table('datasets/df.parquet')

CPU times: user 1.08 s, sys: 1.28 s, total: 2.36 s
Wall time: 1.23 s


In [25]:
table2.shape

(6838561, 8)