In [2]:
import time
import pandas as pd
import numpy as np
import dask.dataframe as dd

def create_datasets(nrows: int, ncols: int) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Build a main table and a smaller reference table of random floats.

    Both frames have ``ncols`` columns named ``col_0`` .. ``col_{ncols - 1}``,
    each filled with values from ``np.random.rand``. The main frame has
    ``nrows`` rows; the reference frame has ``nrows // 10`` rows.

    Args:
        nrows: Number of rows in the main frame.
        ncols: Number of columns in both frames.

    Returns:
        A ``(main_df, ref_df)`` tuple.
    """
    column_names = [f"col_{index}" for index in range(ncols)]
    ref_rows = nrows // 10

    # Draw every main column before any reference column so the sequence of
    # draws from NumPy's global random state is main-first, then reference.
    main_columns = {name: np.random.rand(nrows) for name in column_names}
    ref_columns = {name: np.random.rand(ref_rows) for name in column_names}

    return pd.DataFrame(main_columns), pd.DataFrame(ref_columns)

def pandas_operations(main_df: pd.DataFrame, ref_df: pd.DataFrame) -> tuple[float, float]:
    """Time a pandas group-by mean and a pandas left join on ``col_0``.

    Two operations are measured separately:

    1. ``main_df.groupby("col_0").mean()``
    2. ``main_df.merge(ref_df, on="col_0", how="left")``

    Both frames must contain a ``col_0`` column. The computed results are
    discarded; only the elapsed times are returned.

    Args:
        main_df: Frame that is aggregated and used as the left side of the join.
        ref_df: Frame used as the right side of the join.

    Returns:
        ``(aggregation_seconds, join_seconds)`` as floats.
    """
    # time.perf_counter() is the clock intended for measuring short
    # durations: it is monotonic and uses the highest available resolution,
    # whereas time.time() follows the wall clock and may be coarse or jump
    # if the system clock is adjusted.
    agg_start = time.perf_counter()
    # The result is bound to a local so that freeing it does not happen
    # inside the timed region.
    grouped = main_df.groupby("col_0").mean()
    agg_elapsed = time.perf_counter() - agg_start

    join_start = time.perf_counter()
    joined = main_df.merge(ref_df, on="col_0", how="left")
    join_elapsed = time.perf_counter() - join_start

    return agg_elapsed, join_elapsed

def dask_operations(
    main_df: pd.DataFrame, ref_df: pd.DataFrame, npartitions: int
) -> tuple[float, float]:
    """Time a Dask group-by mean and a Dask left join on ``col_0``.

    Both pandas frames are first converted with ``dd.from_pandas`` using
    ``npartitions`` partitions; this conversion is NOT included in either
    measurement. Each timed region covers building the lazy Dask expression
    and running ``.compute()`` on it.

    After each measurement the task graph is rendered with ``visualize`` to
    ``grouped.svg`` and ``joined.svg`` respectively; these calls are outside
    the timed regions.
    NOTE(review): ``visualize`` presumably needs the optional graphviz
    dependency to be installed -- confirm for the target environment.

    Args:
        main_df: Frame that is aggregated and used as the left side of the join.
        ref_df: Frame used as the right side of the join.
        npartitions: Partition count passed to ``dd.from_pandas`` for both frames.

    Returns:
        ``(aggregation_seconds, join_seconds)`` as floats.
    """
    dmain_df = dd.from_pandas(main_df, npartitions=npartitions)
    dref_df = dd.from_pandas(ref_df, npartitions=npartitions)

    # time.perf_counter() is the clock intended for measuring short
    # durations: it is monotonic and uses the highest available resolution,
    # whereas time.time() follows the wall clock and may be coarse or jump
    # if the system clock is adjusted.
    agg_start = time.perf_counter()
    grouped_task = dmain_df.groupby("col_0").mean()
    # The computed result is bound to a local so that freeing it does not
    # happen inside the timed region.
    grouped = grouped_task.compute()
    agg_elapsed = time.perf_counter() - agg_start
    grouped_task.visualize("grouped.svg")

    join_start = time.perf_counter()
    joined_task = dmain_df.merge(dref_df, on="col_0", how="left")
    joined = joined_task.compute()
    join_elapsed = time.perf_counter() - join_start
    joined_task.visualize("joined.svg")

    return agg_elapsed, join_elapsed

# Benchmark input: a 10,000,000-row main table with 5 columns (the reference
# table size is derived inside create_datasets).
main_df, ref_df = create_datasets(nrows=10_000_000, ncols=5)

# Run the same aggregation and join with pandas, then with Dask split into
# 10 partitions.
pandas_agg_time, pandas_join_time = pandas_operations(main_df, ref_df)
dask_agg_time, dask_join_time = dask_operations(main_df, ref_df, npartitions=10)

# Print one line per measurement. The labels are Korean:
# "집계 시간" = aggregation time, "조인 시간" = join time, "초" = seconds.
for label, elapsed in (
    ("Pandas 집계 시간:", pandas_agg_time),
    ("Pandas 조인 시간:", pandas_join_time),
    ("Dask 집계 시간:", dask_agg_time),
    ("Dask 조인 시간:", dask_join_time),
):
    print(label, elapsed, "초")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Pandas 집계 시간: 3.510481357574463 초
Pandas 조인 시간: 1.1892461776733398 초
Dask 집계 시간: 3.222404956817627 초
Dask 조인 시간: 1.0353312492370605 초
