In [1]:
import numpy as np
import pandas as pd
from datetime import datetime as dtt
import pyarrow as pa
import pyarrow.csv as pa_csv
import random
import string

In [2]:
def gen_random_string(length: int = 32) -> str:
    return ''.join(random.choices(string.ascii_uppercase + string.digits, k=length))

In [3]:
def gen_random_df() -> pd.core.frame.DataFrame:
    dt = pd.date_range(start=dtt(2008, 1, 1), end=dtt(2021, 1, 1), freq='min')
    
    np.random.seeed = 42
    df_size = len(dt)
    
    return pd.DataFrame({
        'date': dt,
        'a': np.random.rand(df_size),
        'b': np.random.rand(df_size),
        'c': np.random.rand(df_size),
        'd': np.random.rand(df_size),
        'e': np.random.rand(df_size),
        'str1': [gen_random_string() for x in range(df_size)],
        'str2': [gen_random_string() for x in range(df_size)],
    })

## Pandas

In [4]:
%%time
df = gen_random_df()

CPU times: user 1min 9s, sys: 2.52 s, total: 1min 11s
Wall time: 1min 14s


In [5]:
df.head()

Unnamed: 0,date,a,b,c,d,e,str1,str2
0,2008-01-01 00:00:00,0.894641,0.157044,0.136433,0.767557,0.199758,DU5FEWD4CQPKK9ZS0OJFEE6D21822PNQ,XZDFL3QNH2K769JQAR6MZ8C4OS5L3JKW
1,2008-01-01 00:01:00,0.005116,0.395153,0.242322,0.814322,0.052537,1LANI21HPC2RNSAP4F32FMCXIWCCHZ0S,XU7Q26BHLLU297TVL4586TA80V05GII9
2,2008-01-01 00:02:00,0.561317,0.136245,0.418272,0.608092,0.972008,UZPN3WQG4PLBL8R60HFP9X2AMYRQ14J6,3Y8KJGA4UNON1EUPIMJ7G8SOL3KOGE71
3,2008-01-01 00:03:00,0.900031,0.78309,0.376043,0.189969,0.183286,WZ4VX2VQ1WV6JU9JNHBYD0059E3QUAG1,KA73304RGI37QN072DS97I8KGXY5OHJV
4,2008-01-01 00:04:00,0.718266,0.674473,0.578599,0.8378,0.602903,HNFJYANYAUSR7SV420L9YLL0CDCNNWRV,4CM87ATIIR0ZSB4RA57TRAMT18ADV920


In [6]:
df.shape

(6838561, 8)

In [7]:
%%time
df.to_csv('datasets/csv_pandas.csv', index=False)

CPU times: user 1min 5s, sys: 1.69 s, total: 1min 7s
Wall time: 1min 10s


In [8]:
%%time
df.to_csv('datasets/csv_pandas.csv.gz', index=False, compression='gzip')

CPU times: user 2min 58s, sys: 1.01 s, total: 2min 59s
Wall time: 3min 1s


In [9]:
%%time
df1 = pd.read_csv('datasets/csv_pandas.csv')

CPU times: user 14.9 s, sys: 4.06 s, total: 18.9 s
Wall time: 20 s


In [10]:
%%time
df2 = pd.read_csv('datasets/csv_pandas.csv.gz')

CPU times: user 21.6 s, sys: 3.96 s, total: 25.5 s
Wall time: 26 s


In [11]:
df_pa = df.copy()

In [12]:
df_pa['date'] = df_pa['date'].values.astype(np.int64) // 18 ** 9

In [13]:
df_pa.head()

Unnamed: 0,date,a,b,c,d,e,str1,str2
0,6045321,0.894641,0.157044,0.136433,0.767557,0.199758,DU5FEWD4CQPKK9ZS0OJFEE6D21822PNQ,XZDFL3QNH2K769JQAR6MZ8C4OS5L3JKW
1,6045321,0.005116,0.395153,0.242322,0.814322,0.052537,1LANI21HPC2RNSAP4F32FMCXIWCCHZ0S,XU7Q26BHLLU297TVL4586TA80V05GII9
2,6045321,0.561317,0.136245,0.418272,0.608092,0.972008,UZPN3WQG4PLBL8R60HFP9X2AMYRQ14J6,3Y8KJGA4UNON1EUPIMJ7G8SOL3KOGE71
3,6045321,0.900031,0.78309,0.376043,0.189969,0.183286,WZ4VX2VQ1WV6JU9JNHBYD0059E3QUAG1,KA73304RGI37QN072DS97I8KGXY5OHJV
4,6045322,0.718266,0.674473,0.578599,0.8378,0.602903,HNFJYANYAUSR7SV420L9YLL0CDCNNWRV,4CM87ATIIR0ZSB4RA57TRAMT18ADV920


## PyArrow

In [14]:
df_pa_table = pa.Table.from_pandas(df_pa)

In [15]:
%%time
pa_csv.write_csv(df_pa_table, 'datasets/csv_pyarrow.csv')

CPU times: user 5.69 s, sys: 661 ms, total: 6.35 s
Wall time: 7.45 s


In [16]:
%%time

with pa.CompressedOutputStream('datasets/csv_pyarrow.csv.gz', 'gzip') as out:
    pa_csv.write_csv(df_pa_table, out)

CPU times: user 1min 15s, sys: 430 ms, total: 1min 15s
Wall time: 1min 17s


In [17]:
%%time
df_pa1 = pa_csv.read_csv('datasets/csv_pyarrow.csv')

CPU times: user 5.36 s, sys: 2.28 s, total: 7.64 s
Wall time: 4.63 s


In [18]:
%%time
df_pa2 = pa_csv.read_csv('datasets/csv_pyarrow.csv.gz')

CPU times: user 11.3 s, sys: 1.27 s, total: 12.6 s
Wall time: 8.5 s


## Parquet

In [19]:
%%time
table = pa.Table.from_pandas(df)

CPU times: user 1.58 s, sys: 3.4 s, total: 4.98 s
Wall time: 6.95 s


In [20]:
%%time

import pyarrow.parquet as pq

pq.write_table(table, 'datasets/df.parquet')

CPU times: user 1.89 s, sys: 1.03 s, total: 2.92 s
Wall time: 5.71 s


In [21]:
table.schema

date: timestamp[ns]
a: double
b: double
c: double
d: double
e: double
str1: string
str2: string
-- schema metadata --
pandas: '{"index_columns": [{"kind": "range", "name": null, "start": 0, "' + 1113

In [22]:
table.shape

(6838561, 8)

In [23]:
table

pyarrow.Table
date: timestamp[ns]
a: double
b: double
c: double
d: double
e: double
str1: string
str2: string
----
date: [[2008-01-01 00:00:00.000000000,2008-01-01 00:01:00.000000000,2008-01-01 00:02:00.000000000,2008-01-01 00:03:00.000000000,2008-01-01 00:04:00.000000000,...,2020-12-31 23:56:00.000000000,2020-12-31 23:57:00.000000000,2020-12-31 23:58:00.000000000,2020-12-31 23:59:00.000000000,2021-01-01 00:00:00.000000000]]
a: [[0.8946414753120778,0.005116394198519947,0.5613172463870624,0.9000312139092288,0.7182660521239629,...,0.39216593993630156,0.24147259671693877,0.5135818964989683,0.48850886891649714,0.6871488565571172]]
b: [[0.15704372059366511,0.3951526508395937,0.1362447581438836,0.7830895851309322,0.6744726320103409,...,0.6971443897492829,0.12232051061508531,0.49924330998415223,0.792691011704123,0.5943368122649845]]
c: [[0.13643335537041545,0.24232193933645474,0.41827156104085694,0.37604252619018574,0.5785990847333402,...,0.5767158759988779,0.7487147103008416,0.25556372010251

In [24]:
%%time

table2 = pq.read_table('datasets/df.parquet')

CPU times: user 1.24 s, sys: 2.36 s, total: 3.61 s
Wall time: 3.9 s


In [25]:
table2.shape

(6838561, 8)