In [5]:
import pandas as pd
import numpy as np

def get_dataset(size, seed=None):
    """Build a synthetic DataFrame with ``size`` rows for the I/O benchmarks.

    Parameters
    ----------
    size : int
        Number of rows to generate.
    seed : int, optional
        When given, all values are drawn from a private
        ``np.random.RandomState(seed)``, so the same seed always yields the
        same frame. When omitted, the global ``np.random`` state is used,
        which is the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Columns ``size``, ``age``, ``team``, ``win``, ``date`` and ``prob``.
    """
    # The module-level functions and RandomState expose the same method names,
    # so either object can serve as the source of randomness.
    rng = np.random if seed is None else np.random.RandomState(seed)
    dates = pd.date_range("2020-01-01", "2022-12-31")

    # Build all columns and create the frame once. The draw order
    # (size, age, team, win, date, prob) is kept so that a globally seeded run
    # still produces the same values.
    return pd.DataFrame(
        {
            "size": rng.choice(["big", "medium", "small"], size),
            "age": rng.randint(1, 50, size),  # upper bound is exclusive: 1..49
            "team": rng.choice(["red", "blue", "yellow", "green"], size),
            "win": rng.choice(["yes", "no"], size),
            "date": rng.choice(dates, size),
            "prob": rng.uniform(0, 1, size),
        }
    )


def set_dtypes(df):
    """Cast the benchmark columns of ``df`` to compact dtypes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``size``, ``age``, ``team``, ``win`` and
        ``prob``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``size``/``team`` as ``category``,
        ``age`` as ``int16``, ``win`` as ``bool`` and ``prob`` as ``float16``.

    Raises
    ------
    ValueError
        If ``win`` is not already boolean and contains a value other than
        ``"yes"`` or ``"no"``.
    """
    df["size"] = df["size"].astype("category")
    df["age"] = df["age"].astype("int16")
    df["team"] = df["team"].astype("category")

    # astype("bool") on strings marks every non-empty string as True, so "no"
    # would silently become True. Translate the labels explicitly instead.
    win = df["win"]
    if not pd.api.types.is_bool_dtype(win):  # already converted: leave as is
        flags = win.map({"yes": True, "no": False})
        if flags.isna().any():
            raise ValueError('column "win" must contain only "yes" or "no"')
        df["win"] = flags.astype("bool")

    # NOTE(review): float16 keeps only about three significant decimal digits;
    # confirm that precision (and half-float support in the file writers used
    # with this frame) is acceptable.
    df["prob"] = df["prob"].astype("float16")
    return df


# Generate BIG dataset

In [8]:
# Benchmark input: one million synthetic rows. set_dtypes() is not applied in
# the cells shown here, so the columns keep the dtypes get_dataset() produced.
df = get_dataset(1_000_000)

## CSV

- WRITE 11 s
- READ 0.539 s
- 46 MB
  
**Perde** il casting una volta ricaricato il file CSV (ad es. la colonna `date` torna `object` se non si usa `parse_dates`)!

In [4]:
%%timeit
# Time a full CSV export; index=False leaves the row index out of the file.
df.to_csv("bigfile.csv", index=False)

11 s ± 295 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [5]:
# On-disk size of the CSV export (same ls flags as the other size checks).
!ls -GFlash bigfile.csv

46M -rw-rw-r-- 1 andrea 46M giu  1 00:55 bigfile.csv


In [6]:
%%timeit
# Plain read_csv with no dtype/parse_dates hints: types are re-inferred from text.
# NOTE(review): names bound inside a %%timeit body are presumably not kept in
# the notebook namespace, so this should not replace the outer `df` - confirm.
df = pd.read_csv("bigfile.csv")

539 ms ± 4.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## PICKLE

- WRITE 0.961 s
- READ 0.330 s
- 43 MB

**Mantiene** il casting una volta ricaricato

In [7]:
# Draws a fresh random dataset (same generator, same row count) and then times
# the pickle export of it.
df = get_dataset(1_000_000)
%timeit df.to_pickle("bigfile.pickle")

961 ms ± 11.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit 
# Time loading the pickle written above.
# NOTE: only unpickle files you created yourself; loading a pickle can run code.
df_pickle = pd.read_pickle("bigfile.pickle")

330 ms ± 5.42 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# On-disk size of the pickle; the first output column is the human-readable size.
!ls -GFlash bigfile.pickle

43M -rw-rw-r-- 1 andrea 43M giu  1 00:55 bigfile.pickle


## Parquet

- WRITE 0.305 s
- READ 0.101 s
- 11 MB

In [10]:
import fastparquet

In [11]:
# Fresh random dataset for the Parquet test (same generator and row count as
# the CSV and pickle runs, but not the same random values).
df = get_dataset(1_000_000)

In [12]:
%%timeit
# No engine argument is passed, so pandas selects the Parquet backend itself.
# NOTE(review): importing fastparquet alone may not decide which backend is
# used - confirm which engine these timings refer to.
df.to_parquet("bigfile.parquet")

305 ms ± 12.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
# Time loading the whole Parquet file (all columns) back into a DataFrame.
df_parquet= pd.read_parquet("bigfile.parquet")

101 ms ± 3.24 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [14]:
# On-disk size of the Parquet file; the first output column is the human-readable size.
!ls -GFlash bigfile.parquet

11M -rw-rw-r-- 1 andrea 11M giu  1 00:55 bigfile.parquet


In [15]:
# Read in specific columns: `columns=` restricts the load to the listed
# columns, so the result contains only "date" and "win".
df_example = pd.read_parquet("bigfile.parquet", columns=["date", "win"])
# Show the first five rows as the cell output.
df_example.head(5)

Unnamed: 0,date,win
0,2022-03-23,no
1,2020-07-01,yes
2,2021-02-07,yes
3,2021-11-08,no
4,2020-11-14,no


In [18]:
# Side-by-side size comparison of the three files written above.
!ls -GFlash big*

46M -rw-rw-r-- 1 andrea 46M giu  1 00:55 bigfile.csv
11M -rw-rw-r-- 1 andrea 11M giu  1 00:55 bigfile.parquet
43M -rw-rw-r-- 1 andrea 43M giu  1 00:55 bigfile.pickle


## CSV

- WRITE 11 s
- READ 0.539 s
- 46 MB

**Perde** il casting una volta ricaricato il file CSV!

## PICKLE

- WRITE 0.961 s
- READ 0.330 s
- 43 MB

**Mantiene** il casting una volta ricaricato

## PARQUET

- WRITE 0.305 s
- READ 0.101 s
- 11 MB
