In [None]:
import pandas as pd
import polars as pl
import numpy as np
import time

# Configuration: number of synthetic rows shared by both benchmarks
NUM_ROWS = 10_000_000

# Synthetic data generation.
# The global NumPy RNG is seeded so every run produces the same columns.
# The random draws below are made in a fixed order (age, city, score, gender);
# reordering them would change the generated values.
np.random.seed(42)
data = {"id": np.arange(NUM_ROWS)}
data["age"] = np.random.randint(18, 90, NUM_ROWS)  # upper bound 90 is exclusive
data["city"] = np.random.choice(
    ["Paris", "Lyon", "Marseille", "Toulouse", "Nice"],
    NUM_ROWS,
)
data["score"] = np.random.normal(50, 10, NUM_ROWS)  # mean 50, std-dev 10
data["gender"] = np.random.choice(["M", "F"], NUM_ROWS)

### 🔹 Pandas Benchmark
# The DataFrame is built before the clock starts, so only the
# filter + group-by work below is included in the measured time.
df_pandas = pd.DataFrame(data)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start = time.perf_counter()

# Filter: keep rows whose age is strictly greater than 30
filtered_pd = df_pandas[df_pandas["age"] > 30]

# Group by city and aggregate: mean score and max age per city
grouped_pd = filtered_pd.groupby("city").agg({
    "score": "mean",
    "age": "max"
})

end = time.perf_counter()
print(f"⏱ Pandas time: {end - start:.4f} seconds")

### 🔹 Polars Benchmark
# The DataFrame is built before the clock starts, so only the
# filter + group-by work below is included in the measured time.
df_polars = pl.DataFrame(data)

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short intervals; time.time() is wall-clock time and can jump if the system
# clock is adjusted mid-run.
start = time.perf_counter()

# Filter: keep rows whose age is strictly greater than 30
filtered_pl = df_polars.filter(pl.col("age") > 30)

# Group by city and aggregate: mean score and max age per city
grouped_pl = (
    filtered_pl
    .group_by("city")
    .agg([
        pl.col("score").mean().alias("score_mean"),
        pl.col("age").max().alias("age_max")
    ])
)

end = time.perf_counter()
print(f"Polars time: {end - start:.4f} seconds")


⏱ Pandas time: 0.7532 seconds
Polars time: 0.1639 seconds
