# 1 - Starting

Add `BASE_DIR` (the parent of the current working directory) to `sys.path` so that modules located there can be imported.


In [2]:
import pathlib
import sys

# BASE_DIR is the parent of the current working directory.
BASE_DIR = pathlib.Path().absolute().parent

# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if str(BASE_DIR) not in sys.path:
    sys.path.append(str(BASE_DIR))

# Comparing Import Time


In [3]:
import time

# Libraries whose import time is measured, in this order.
bibliotecas = ["pandas", "cudf", "polars"]
separator = "-" * 40

print("Tempo de importação (segundos):")
print(separator)

for lib in bibliotecas:
    # perf_counter() is a higher-resolution clock than time.time().
    started = time.perf_counter()
    try:
        __import__(lib)
    except ImportError:
        # Library is not available in this environment: report it, keep going.
        print(f"{lib:10}: Não instalada")
    else:
        elapsed = time.perf_counter() - started
        print(f"{lib:10}: {elapsed:.6f}")

print(separator)

Tempo de importação (segundos):
----------------------------------------
pandas    : 0.425536
cudf      : 1.550308
polars    : 0.071201
----------------------------------------


# CUDF


In [4]:
import cudf

# Build a small two-column cuDF frame and show it.
cudf_df = cudf.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
display(cudf_df)

# Rebind the name to the result of .sum(); the bare expression on the last
# line makes it the cell's output.
cudf_df = cudf_df.sum()
cudf_df

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


a     6
b    15
dtype: int64

# Polars


In [5]:
import polars as pl

# Table rendering limits for Polars output: rows, columns, and width in characters.
pl.Config.set_tbl_rows(10)
pl.Config.set_tbl_cols(100)
pl.Config.set_tbl_width_chars(10000)

# Turn on the Polars string cache.
pl.enable_string_cache()

# Build a small two-column Polars frame and show it.
polars_df = pl.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
display(polars_df)

# Rebind the name to the result of .sum(); the bare expression on the last
# line makes it the cell's output.
polars_df = polars_df.sum()
polars_df

a,b
i64,i64
1,4
2,5
3,6


a,b
i64,i64
6,15


# Pandas


In [6]:
import pandas as pd

# Build a small two-column pandas frame and show it.
pandas_df = pd.DataFrame(dict(a=[1, 2, 3], b=[4, 5, 6]))
display(pandas_df)

# Rebind the name to the result of .sum(); the bare expression on the last
# line makes it the cell's output.
pandas_df = pandas_df.sum()
pandas_df

Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


a     6
b    15
dtype: int64

# Comparing Execution Time


In [7]:
import time
import importlib
import numpy as np

# --- Config -------------------------------------------------------------
N_ROWS, N_COLS = 10_000_000, 10
np.random.seed(42)
base = np.random.rand(N_ROWS, N_COLS)  # shared input data, shape (N_ROWS, N_COLS)
cols = [f"col{i}" for i in range(N_COLS)]
groups = np.random.randint(0, 10, size=N_ROWS)  # integer group key in [0, 10)
libs = ["pandas", "polars", "cudf"]


# --- Helpers ------------------------------------------------------------
def _timed(fn):
    """Call ``fn()`` once and return ``(result, elapsed_seconds)``."""
    t0 = time.perf_counter()
    out = fn()
    return out, time.perf_counter() - t0


def _make_frame(name, mod):
    """Build the benchmark frame with library module ``mod``.

    The frame holds the ``cols`` columns taken from ``base`` plus a ``grp``
    column taken from ``groups``.

    Raises:
        ValueError: if ``name`` is not one of the supported libraries.
    """
    if name == "pandas":
        df = mod.DataFrame(base, columns=cols)
        df["grp"] = groups
        return df

    by_column = {c: base[:, i] for i, c in enumerate(cols)}
    if name == "polars":
        return mod.DataFrame(by_column).with_columns(mod.Series("grp", groups))
    if name == "cudf":
        df = mod.DataFrame(by_column)
        df["grp"] = groups
        return df
    raise ValueError(f"unsupported library: {name!r}")


def _polars_groupby_mean(df, pl):
    """Group a Polars frame by ``grp`` and take the mean, across Polars versions."""
    try:
        gb = df.group_by("grp")  # polars >= 0.19
    except AttributeError:
        gb = df.groupby("grp")  # polars < 0.19
    try:
        return gb.mean()  # used when the group-by object provides it
    except AttributeError:
        return gb.agg(pl.all().mean())  # fallback


def _bench_library(name):
    """Time create / filter / sort / group-by-mean for one library.

    Returns a dict with keys ``library``, ``create_df_s``, ``filter_s``,
    ``sort_s`` and ``groupby_mean_s`` (all timings in seconds). Any exception
    raised by the import or by an operation propagates to the caller.

    All frames are local to this function, so they can be released when it
    returns instead of staying bound as notebook globals while the next
    library is benchmarked.
    """
    mod = importlib.import_module(name)

    df, make_t = _timed(lambda: _make_frame(name, mod))

    # Only the elapsed time is kept ([1]); each operation's result is dropped
    # right away so it does not stay alive during the next measurement.
    if name == "polars":
        filt_t = _timed(lambda: df.filter(mod.col("col0") > 0.5))[1]
        sort_t = _timed(lambda: df.sort("col1"))[1]
        agg_t = _timed(lambda: _polars_groupby_mean(df, mod))[1]
    else:  # pandas / cudf
        filt_t = _timed(lambda: df[df["col0"] > 0.5])[1]
        sort_t = _timed(lambda: df.sort_values("col1"))[1]
        agg_t = _timed(lambda: df.groupby("grp").mean())[1]

    return dict(library=name, create_df_s=make_t, filter_s=filt_t, sort_s=sort_t, groupby_mean_s=agg_t)


# --- Run ----------------------------------------------------------------
results = []
for name in libs:
    try:
        results.append(_bench_library(name))
    except Exception as e:
        # Deliberate best effort: a library that is missing or fails is
        # recorded as an error row so the remaining libraries still run.
        results.append(dict(library=name, error=str(e)))

# ------------------------------------------------------------------------
bench = pd.DataFrame(results)
bench

Unnamed: 0,library,create_df_s,filter_s,sort_s,groupby_mean_s
0,pandas,0.02886,0.307135,2.103085,0.195894
1,polars,0.482869,0.072265,0.465482,0.123118
2,cudf,0.957405,0.037192,0.160478,0.07112
