In [2]:
import modin.pandas as pd
import pandas as old_pd
import dask.dataframe as dd
import numpy as np
import random

from pyspark.sql import SparkSession, functions
from pyspark.sql.types import *

In [3]:
# Number of rows in the synthetic benchmark dataset.
row_size = 10_000_000
# Report the dataset size in whole millions of rows.
print(row_size // 1_000_000, "mio rows")

10 mio rows


In [4]:
# Draw the benchmark values from a seeded generator so that a fresh
# Restart & Run All reproduces the same data (and therefore the same sums).
# Shape is (row_size, 1): a single column of integers in [0, 100).
frame_data = np.random.default_rng(42).integers(0, 100, size=(row_size, 1))

In [5]:
# Candidate group labels; each row is assigned one of them uniformly at random.
foo = ['a', 'b', 'c']
# A dedicated, seeded Random instance keeps the labels reproducible across
# kernel restarts without touching the global `random` state.
# `choices(..., k=row_size)` draws all labels in one call instead of a
# per-row Python loop and still returns a plain list of length row_size.
random_groups = random.Random(42).choices(foo, k=row_size)

In [6]:
%%time
print('### PANDAS ###')
old_df = old_pd.DataFrame(frame_data)
old_df['group'] = random_groups
old_df.columns = ['data', 'group']
print(old_df.groupby('group').sum())
print('\n')

### PANDAS ###
            data
group           
a      165192238
b      165131214
c      164822405


CPU times: user 518 ms, sys: 149 ms, total: 667 ms
Wall time: 666 ms


In [7]:
%%time
print('### MODIN ###')
df = pd.DataFrame(frame_data)
df['group'] = random_groups
df.columns = ['data', 'group']
print(df.groupby('group').sum())
print('\n')

### MODIN ###
            data
group           
a      165192238
b      165131214
c      164822405


CPU times: user 20.9 s, sys: 330 ms, total: 21.3 s
Wall time: 29.4 s


In [8]:
%%time
print('### DASK ###')
df3 = dd.from_pandas(old_df, chunksize=10000)
print(df3.groupby('group').sum().compute())
print('\n')

### DASK ###
            data
group           
a      165192238
b      165131214
c      164822405


CPU times: user 4.01 s, sys: 207 ms, total: 4.21 s
Wall time: 3.75 s


In [16]:
# Explicit schema for the CSV read by Spark: an integer "data" column and a
# string "group" column, both nullable.
schema = StructType([
    StructField("data", IntegerType(), nullable=True),
    StructField("group", StringType(), nullable=True),
])

In [9]:
# Write the pandas frame to foo.csv without the index column; the Spark
# section of the notebook reads this file back.
old_df.to_csv(path_or_buf='foo.csv', index=False)

In [17]:
# Obtain a Spark session for the application named 'test' (getOrCreate
# returns an already-running session if one exists).
spark = (
    SparkSession.builder
    .appName('test')
    .getOrCreate()
)

In [18]:
# Load foo.csv into a Spark DataFrame, treating the first line as a header
# and applying the explicit schema defined in this notebook.
df = (
    spark.read
    .schema(schema)
    .option('header', True)
    .csv('foo.csv')
)

In [19]:
# Make the Spark DataFrame queryable from Spark SQL under the name "df".
# createOrReplaceTempView replaces registerTempTable, which has been
# deprecated since Spark 2.0; the registered name is unchanged.
df.createOrReplaceTempView("df")

In [20]:
# Print the Spark DataFrame's schema as a tree (column name, type, nullability).
df.printSchema()

root
 |-- data: integer (nullable = true)
 |-- group: string (nullable = true)



In [22]:
%%time
print('### PYSPARK ###')
print(spark.sql("select group, sum(data) from df group by 1 order by 1").show())
print('\n')

### PYSPARK ###
+-----+---------+
|group|sum(data)|
+-----+---------+
|    a|165192238|
|    b|165131214|
|    c|164822405|
+-----+---------+

None


CPU times: user 6.54 ms, sys: 39 µs, total: 6.58 ms
Wall time: 5.21 s
