In [1]:
import pandas as pd
import pandas.util.testing as tu

from uuid import uuid3, NAMESPACE_URL
from pathlib import Path

# Setup

In [2]:
def file_name(extension: str) -> str:
    """Build a ``<uuid>.<extension>`` file name from a file extension.

    The UUID is name-based (``uuid3`` in the URL namespace), so a given
    extension always yields the same name. The name is unusual enough that it
    is unlikely to overwrite a file that already exists.
    """
    stem = uuid3(NAMESPACE_URL, extension)
    return '{}.{}'.format(stem, extension)

In [3]:
# makeDataFrame() takes its row count from the module-level `tu.N`; keep the demo frame tiny.
tu.N = 5
# Promote column 'A' to the index so the round-trips below show how each format treats it.
d = tu.makeDataFrame().set_index('A')
d

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.146036,-1.673916,-0.031588,1.093347
0.20855,-2.581037,1.090273,0.306071
1.750899,-0.037915,0.262552,-2.295756
0.600635,-0.429778,0.512313,-1.169304
0.759383,0.683251,-0.858447,-1.378045


# Export and import

## Parquet

Available with pandas $\ge$ 0.21.0; requires a parquet engine (`pyarrow` or `fastparquet`) to be installed.

In [4]:
# One generated file name is reused by every parquet example in this notebook.
parquet_file = file_name('parquet')
d.to_parquet(parquet_file)

In [5]:
# Read the file back; the displayed frame has 'A' as its index again without extra steps.
pd.read_parquet(parquet_file)

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.146036,-1.673916,-0.031588,1.093347
0.20855,-2.581037,1.090273,0.306071
1.750899,-0.037915,0.262552,-2.295756
0.600635,-0.429778,0.512313,-1.169304
0.759383,0.683251,-0.858447,-1.378045


### Column names must be strings!

In [6]:
try:
    # Integer column labels are rejected by the parquet writer.
    (d.rename(columns={'B': 1, 'C': 2, 'D': 3})
      .to_parquet(parquet_file))
except ValueError as err:
    # The error message is the point of this cell, so show it instead of a traceback.
    print(err)

parquet must have string column names


In [7]:
# Quick fix: pass every column label through str() before exporting.
(d.rename(columns={'B': 1, 'C': 2, 'D': 3})
  .rename(columns=str)
  .to_parquet(parquet_file))

## Excel

Note: the index is written as an ordinary column, so it is not restored automatically when the file is read back!

In [8]:
# Write the frame, including its index 'A', to an .xlsx workbook.
excel_file = file_name('xlsx')
d.to_excel(excel_file)

In [9]:
# A plain read returns 'A' as a regular column next to a fresh 0..n-1 index.
pd.read_excel(excel_file)

Unnamed: 0,A,B,C,D
0,-0.146036,-1.673916,-0.031588,1.093347
1,0.20855,-2.581037,1.090273,0.306071
2,1.750899,-0.037915,0.262552,-2.295756
3,0.600635,-0.429778,0.512313,-1.169304
4,0.759383,0.683251,-0.858447,-1.378045


In [10]:
# 'A' comes back as a data column, so promote it to the index again after reading.
pd.read_excel(excel_file).set_index('A')  # restore index manually!

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.146036,-1.673916,-0.031588,1.093347
0.20855,-2.581037,1.090273,0.306071
1.750899,-0.037915,0.262552,-2.295756
0.600635,-0.429778,0.512313,-1.169304
0.759383,0.683251,-0.858447,-1.378045


## Tab-separated txt file with custom float format

Note: the index is written as an ordinary column, so it is not restored automatically when the file is read back!

In [11]:
# Tab-separated text export; '%.2f' writes every float with two decimals,
# so the values read back from this file are rounded.
csv_file = file_name('txt')
d.to_csv(csv_file, sep='\t', float_format='%.2f')

In [12]:
# The separator has to be given again on import; 'A' is read as a regular column.
pd.read_csv(csv_file, sep='\t')

Unnamed: 0,A,B,C,D
0,-0.15,-1.67,-0.03,1.09
1,0.21,-2.58,1.09,0.31
2,1.75,-0.04,0.26,-2.3
3,0.6,-0.43,0.51,-1.17
4,0.76,0.68,-0.86,-1.38


In [13]:
# Read the tab-separated file and promote 'A' back to the index.
pd.read_csv(csv_file, sep='\t').set_index('A')  # restore index manually!

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.15,-1.67,-0.03,1.09
0.21,-2.58,1.09,0.31
1.75,-0.04,0.26,-2.3
0.6,-0.43,0.51,-1.17
0.76,0.68,-0.86,-1.38


# Benchmark

In [14]:
# Rebind `d` to a much larger frame for the timings below; this one keeps the
# default index (no set_index), hence four data columns.
tu.N = 100000
d = tu.makeDataFrame()
d.shape

(100000, 4)

## Write

In [15]:
# Single-run timing (%time, not %timeit); overwrites the parquet file written earlier.
%time d.to_parquet(parquet_file)

CPU times: user 75.3 ms, sys: 20.4 ms, total: 95.6 ms
Wall time: 95.4 ms


In [16]:
# Single-run timing (%time, not %timeit); overwrites the workbook written earlier.
%time d.to_excel(excel_file)

CPU times: user 9.87 s, sys: 139 ms, total: 10 s
Wall time: 10 s


In [17]:
# Single-run timing; note this writes default comma-separated output
# (no sep / float_format) to the same .txt path used earlier.
%time d.to_csv(csv_file)

CPU times: user 692 ms, sys: 40 ms, total: 732 ms
Wall time: 731 ms


## Read

In [18]:
%time __ = pd.read_parquet(parquet_file)

CPU times: user 50.9 ms, sys: 20.1 ms, total: 71 ms
Wall time: 66.4 ms


In [19]:
%time __ = pd.read_excel(excel_file)

CPU times: user 6.34 s, sys: 39.8 ms, total: 6.38 s
Wall time: 6.38 s


In [20]:
%time __ = pd.read_csv(csv_file)

CPU times: user 132 ms, sys: 4.09 ms, total: 136 ms
Wall time: 134 ms


## Size

In [21]:
# One row per exported file: its suffix and its size on disk in bytes.
pd.DataFrame(
    [(p.suffix, p.stat().st_size)
     for p in map(Path, [parquet_file, excel_file, csv_file])],
    columns=['type', 'size'])

Unnamed: 0,type,size
0,.parquet,5424228
1,.xlsx,7139053
2,.txt,8952991


# Remove exported files

In [22]:
# Delete every file this notebook exported. A file that is already gone is
# skipped, so re-running this cell does not raise FileNotFoundError.
for exported in (parquet_file, excel_file, csv_file):
    try:
        Path(exported).unlink()
    except FileNotFoundError:
        pass  # already removed, nothing left to clean up