In [None]:
from gzeus import Chunker, stream_polars_csv_gz
import polars as pl
import pandas as pd

In [None]:

# The dataset is available at:
# https://catalog.data.gov/dataset/insurance-complaints-all-data
# The cells below expect it gzip-compressed at ../data/insurance.csv.gz

In [None]:
def chunk_load_data_pandas(path:str, chunk_size:int = 10_000) -> pd.DataFrame:
    """Read a CSV file chunk by chunk with pandas and return one DataFrame.

    No work is done per chunk; the chunks are simply concatenated. This is
    the pandas baseline for the "no work per chunk" benchmark.

    Parameters
    ----------
    path
        Location of the CSV file to read. It is handed to ``pd.read_csv``
        unchanged, so a ``.gz`` file is decompressed by pandas.
    chunk_size
        Number of rows per chunk (the ``chunksize`` argument of
        ``pd.read_csv``).

    Returns
    -------
    pd.DataFrame
        All chunks concatenated in file order.
    """
    # Read from `path` (previously a hard-coded file name was used and the
    # argument was silently ignored). The context manager closes the
    # underlying file handle once every chunk has been consumed.
    with pd.read_csv(path, iterator=True, chunksize=chunk_size, engine="c") as reader:
        return pd.concat(list(reader))

def chunk_load_data_gzeus2(path:str, chunk_size:int = 1_000_000) -> pl.DataFrame:
    """Stream a gzipped CSV with GZeus, trimming every chunk before it is kept.

    ``path`` and ``chunk_size`` are forwarded to ``stream_polars_csv_gz``
    (the latter as ``buffer_size``). Each chunk is passed through
    ``get_necessary_data`` and the resulting frames are concatenated into a
    single Polars DataFrame.
    """
    # Columns retained in the output. "Keywords" is used for filtering only.
    kept_columns = (
        'Complaint number',
        'Complaint filed against',
        'Complaint filed by',
        'Reason complaint filed',
        'Confirmed complaint',
        'Received date',
        'Closed date',
        'Complaint type',
        'Coverage level',
    )

    # Per-chunk work. Author's note: this is only possible with Polars (or
    # another dataframe package with "lazy" capabilities), because a lazy
    # read combined with the filter means only the necessary bytes are
    # copied into the resulting dataframe.
    def get_necessary_data(df:pl.LazyFrame) -> pl.DataFrame:
        is_confirmed = pl.col("Confirmed complaint") != 'No'
        has_keywords = pl.col("Keywords").is_not_null()
        trimmed = df.filter(is_confirmed & has_keywords).select(*kept_columns)
        return trimmed.collect()

    chunk_frames = stream_polars_csv_gz(path, buffer_size=chunk_size, func = get_necessary_data)
    return pl.concat(chunk_frames)

def chunk_load_data_pandas2(path:str, chunk_size:int = 10_000) -> pd.DataFrame:
    """Read a CSV file chunk by chunk with pandas, filtering every chunk.

    For each chunk, rows are kept when ``'Confirmed complaint'`` is not
    ``'No'`` and ``'Keywords'`` is not missing; only a fixed subset of
    columns is retained. This is the pandas counterpart of the
    "some work per chunk" benchmark.

    Parameters
    ----------
    path
        Location of the CSV file to read. It is handed to ``pd.read_csv``
        unchanged, so a ``.gz`` file is decompressed by pandas.
    chunk_size
        Number of rows per chunk (the ``chunksize`` argument of
        ``pd.read_csv``).

    Returns
    -------
    pd.DataFrame
        The filtered chunks concatenated in file order.
    """
    # Columns retained in the output. "Keywords" is used for filtering only.
    kept_columns = [
        'Complaint number',
        'Complaint filed against',
        'Complaint filed by',
        'Reason complaint filed',
        'Confirmed complaint',
        'Received date',
        'Closed date',
        'Complaint type',
        'Coverage level',
    ]

    def get_necessary_data(df:pd.DataFrame) -> pd.DataFrame:
        keep_rows = (df['Confirmed complaint'] != 'No') & df['Keywords'].notna()
        # Single .loc call selects rows and columns together instead of
        # chaining two indexing operations.
        return df.loc[keep_rows, kept_columns]

    # Read from `path` (previously a hard-coded file name was used and the
    # argument was silently ignored). The context manager closes the
    # underlying file handle once every chunk has been consumed.
    with pd.read_csv(path, iterator=True, chunksize=chunk_size, engine="c") as reader:
        return pd.concat([get_necessary_data(df_chunk) for df_chunk in reader])

In [None]:
# Optional: for more logging detail, pass verbose=True to the GZeus call
# and configure logging yourself. Here the root logger is set to INFO.
import logging
logging.basicConfig(level=logging.INFO)

In [None]:
# Stream the gzipped CSV through GZeus and concatenate the yielded frames into
# one Polars DataFrame; verbose=True asks for extra logging (see the cell above).
df = pl.concat(stream_polars_csv_gz("../data/insurance.csv.gz", verbose=True))
df.head()

In [None]:
# Load the same file with the chunked pandas reader for comparison.
df_pd = chunk_load_data_pandas("../data/insurance.csv.gz")
df_pd.head()

# Benchmark 1 - (GZeus + Polars) vs. Pandas 

No work per chunk.

Tuning the pandas `chunksize` makes little to no difference in performance.

In [None]:
%%timeit
df = pl.concat(stream_polars_csv_gz("../data/insurance.csv.gz"))

In [None]:
%%timeit
df = chunk_load_data_pandas("../data/insurance.csv.gz")

In [None]:
%%timeit
df = pl.concat(stream_polars_csv_gz("../data/insurance.csv.gz", buffer_size=5_000_000)) # bigger chunks: 5 MB buffer per chunk (assuming buffer_size is in bytes)

In [None]:
%%timeit
df = chunk_load_data_pandas("../data/insurance.csv.gz", chunk_size=50_000)

In [None]:
%%timeit
df = chunk_load_data_pandas("../data/insurance.csv.gz", chunk_size=100_000) # the whole dataset is about 260k rows, so this is only ~3 chunks

# Benchmark 2 - (GZeus + Polars) vs. Pandas 

Some work per chunk.

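GZeus + Polars runs faster here because Polars can optimize the per-chunk workload: the filter and column selection are applied to a lazy read of each chunk, so only the necessary data is materialized, which speeds up the process further. Pandas, on the other hand, shows no speed improvement because it reads the full chunk regardless of any work you then do on it.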

In [None]:
%%timeit
df = chunk_load_data_gzeus2("../data/insurance.csv.gz")


In [None]:
%%timeit
df = chunk_load_data_pandas2("../data/insurance.csv.gz")

In [None]:
%%timeit
df = chunk_load_data_gzeus2("../data/insurance.csv.gz", chunk_size=5_000_000) # bigger chunks

In [None]:
%%timeit
df = chunk_load_data_pandas2("../data/insurance.csv.gz", chunk_size=50_000)

In [None]:
%%timeit
df = chunk_load_data_pandas2("../data/insurance.csv.gz", chunk_size=100_000)