In [1]:
from pathlib import Path
from uuid import uuid3, NAMESPACE_URL

import numpy as np
import pandas as pd
import pandas.util.testing as tu

# Setup

In [2]:
def file_name(extension: str) -> str:
    """Build a deterministic, UUID-based file name for a file extension.

    The stem is the version-3 UUID of the extension in the URL namespace, so
    the same extension always maps to the same name. Such a name is very
    unlikely to clash with a file that already exists, which is what protects
    existing files from being overwritten; repeated calls do NOT produce
    fresh names.

    Parameters
    ----------
    extension : str
        File extension, with or without leading dot(s); ``'txt'`` and
        ``'.txt'`` are treated alike.

    Returns
    -------
    str
        ``'<uuid>.<extension>'``.
    """
    # Drop leading dots so that '.txt' does not turn into '<uuid>..txt'.
    ext = extension.lstrip('.')
    return f'{uuid3(NAMESPACE_URL, ext)}.{ext}'

In [3]:
# Seed NumPy's global RNG so the generated frame is the same on every run.
# numpy is used directly: the `pd.np` alias is deprecated since pandas 1.0
# and removed in pandas 2.0.
np.random.seed(0)
tu.N = 5  # row count used by the pandas test-data helpers (5 rows shown below)
# NOTE(review): `pandas.util.testing` / `makeDataFrame` are deprecated in newer
# pandas as well — confirm the pandas version this notebook targets.
d = tu.makeDataFrame().set_index('A')
d

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.085487,-0.114103,-0.415745,-0.956931
2.382224,-0.844231,-0.524512,0.067311
-0.406024,0.705641,0.813101,0.206499
0.266445,-0.398786,-0.229251,-0.456881
-1.355714,-0.827197,2.161717,-1.059976


# Export and import

## Parquet

Available with pandas $\ge$ 0.21.0; requires a Parquet engine (`pyarrow` or `fastparquet`) to be installed.

In [4]:
# Name of the Parquet file; reused by every later Parquet cell.
parquet_file = file_name('parquet')
# Write the demo frame (index 'A', columns B, C, D) to that file.
d.to_parquet(parquet_file)

In [5]:
# Read the file back with no extra arguments; 'A' comes back as the index.
pd.read_parquet(parquet_file)

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.085487,-0.114103,-0.415745,-0.956931
2.382224,-0.844231,-0.524512,0.067311
-0.406024,0.705641,0.813101,0.206499
0.266445,-0.398786,-0.229251,-0.456881
-1.355714,-0.827197,2.161717,-1.059976


### Column names must be strings!

In [6]:
try:
    # Integer column labels are rejected when writing to Parquet.
    (
        d.rename(columns={'B': 1, 'C': 2, 'D': 3})
        .to_parquet(parquet_file)
    )
except ValueError as err:
    # Show just the error message instead of a full traceback.
    print(err)

parquet must have string column names


In [7]:
# Quick fix: pass every column label through str before writing.
(
    d.rename(columns={'B': 1, 'C': 2, 'D': 3})
    .rename(columns=str)
    .to_parquet(parquet_file)
)

## Excel

Note: the index is stored as an ordinary column, so it has to be restored after reading!

In [8]:
# Name of the Excel workbook; reused by every later Excel cell.
excel_file = file_name('xlsx')
# Write the demo frame to that workbook.
d.to_excel(excel_file)

In [9]:
# Read back with default arguments: 'A' returns as a regular column, not as the index.
pd.read_excel(excel_file)

Unnamed: 0,A,B,C,D
0,1.085487,-0.114103,-0.415745,-0.956931
1,2.382224,-0.844231,-0.524512,0.067311
2,-0.406024,0.705641,0.813101,0.206499
3,0.266445,-0.398786,-0.229251,-0.456881
4,-1.355714,-0.827197,2.161717,-1.059976


In [10]:
(
    pd.read_excel(excel_file)
    .set_index('A')  # restore index manually!
)

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.085487,-0.114103,-0.415745,-0.956931
2.382224,-0.844231,-0.524512,0.067311
-0.406024,0.705641,0.813101,0.206499
0.266445,-0.398786,-0.229251,-0.456881
-1.355714,-0.827197,2.161717,-1.059976


## Tab-separated txt file with custom float format

Note: the index is stored as an ordinary column, so it has to be restored after reading!

In [11]:
# Name of the text file; reused by every later CSV/txt cell.
csv_file = file_name('txt')
# Tab-separated output; floats are written with two decimals ('%.2f').
d.to_csv(csv_file, sep='\t', float_format='%.2f')

In [12]:
# Read back using the same tab separator; 'A' returns as a regular column.
pd.read_csv(csv_file, sep='\t')

Unnamed: 0,A,B,C,D
0,1.09,-0.11,-0.42,-0.96
1,2.38,-0.84,-0.52,0.07
2,-0.41,0.71,0.81,0.21
3,0.27,-0.4,-0.23,-0.46
4,-1.36,-0.83,2.16,-1.06


In [13]:
(
    pd.read_csv(csv_file, sep='\t')
    .set_index('A')  # restore index manually!
)

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.09,-0.11,-0.42,-0.96
2.38,-0.84,-0.52,0.07
-0.41,0.71,0.81,0.21
0.27,-0.4,-0.23,-0.46
-1.36,-0.83,2.16,-1.06


# Benchmark

In [14]:
# Build a much larger frame for the benchmarks below.
tu.N = 100_000
d = tu.makeDataFrame()
# Show (rows, columns) of the benchmark frame.
d.shape

(100000, 4)

## Write

In [15]:
%%time
# Timed: write the 100000-row frame to Parquet (replaces the earlier small file).
d.to_parquet(parquet_file)

CPU times: user 92.1 ms, sys: 16 ms, total: 108 ms
Wall time: 110 ms


In [16]:
%%time
# Timed: write the 100000-row frame to Excel (replaces the earlier small file).
d.to_excel(excel_file)

CPU times: user 9.78 s, sys: 92.9 ms, total: 9.87 s
Wall time: 9.88 s


In [17]:
%%time
# Timed: write the 100000-row frame as CSV with pandas defaults
# (no tab separator or float_format here, unlike the earlier export).
d.to_csv(csv_file)

CPU times: user 733 ms, sys: 40 ms, total: 773 ms
Wall time: 772 ms


## Read

In [18]:
%%time
# Timed: read the Parquet file back. The result is bound to `__` instead of
# being the cell's last expression — presumably so the large frame is not displayed.
__ = pd.read_parquet(parquet_file)

CPU times: user 51.4 ms, sys: 24.1 ms, total: 75.4 ms
Wall time: 67.1 ms


In [19]:
%%time
# Timed: read the Excel file back. The result is bound to `__` instead of
# being the cell's last expression — presumably so the large frame is not displayed.
__ = pd.read_excel(excel_file)

CPU times: user 7.08 s, sys: 52 ms, total: 7.14 s
Wall time: 7.15 s


In [20]:
%%time
# Timed: read the CSV file back with pandas defaults. The result is bound to `__`
# instead of being the cell's last expression — presumably so the large frame is not displayed.
__ = pd.read_csv(csv_file)

CPU times: user 136 ms, sys: 4.09 ms, total: 141 ms
Wall time: 137 ms


## Size

In [21]:
# One row per exported file: its suffix and its size on disk in bytes.
pd.DataFrame(
    [
        (path.suffix, path.stat().st_size)
        for path in map(Path, (parquet_file, excel_file, csv_file))
    ],
    columns=['type', 'size'],
)

Unnamed: 0,type,size
0,.parquet,5424181
1,.xlsx,7138754
2,.txt,8952867


# Remove exported files

In [22]:
# Delete every file this notebook wrote, in the order they were created.
for exported in (parquet_file, excel_file, csv_file):
    Path(exported).unlink()