In [1]:
import pandas as pd

# Record the pandas version used for the timings in this notebook.
print(pd.__version__)

1.5.3


In [2]:
import polars as pl

# Record the Polars version used for the timings in this notebook.
print(pl.__version__)

0.17.3


In [3]:
# NOTE(review): this path assumes Google Drive is mounted at /content/drive;
# no mount step is visible in this notebook - confirm before re-running.
abalone_csv = '/content/drive/MyDrive/abalone.csv'
df = pd.read_csv(abalone_csv)

In [4]:
# Build the large benchmark frame from 241 copies of the source data
# (the original frame plus 240 appended copies).
# A single concat avoids re-copying the ever-growing frame on each iteration,
# which is what the previous concat-in-a-loop did.
N_COPIES = 241
df_big = pd.concat([df] * N_COPIES)

# index=False: the (repeated) row index is not written to the file.
df_big.to_csv("abalone_big.csv", index=False)

In [5]:
# Compare shapes: the enlarged frame vs. the original one.
df_big.shape, df.shape

((1006657, 9), (4177, 9))

**Загрузка данных.**

In [6]:
# Polars
%%time

df_pl = pl.read_csv("abalone_big.csv")

CPU times: user 721 ms, sys: 116 ms, total: 837 ms
Wall time: 595 ms


In [7]:
# Pandas
%%time

df_pd = pd.read_csv("abalone_big.csv")

CPU times: user 1.15 s, sys: 79.8 ms, total: 1.23 s
Wall time: 1.29 s


Polars быстрее, чем Pandas, в 2.2 раза.

**Выбор столбцов и фильтрация данных.**

In [14]:
# Polars
%%time

df_pl[['shell_weight', 'age']]

CPU times: user 981 µs, sys: 0 ns, total: 981 µs
Wall time: 1 ms


shell_weight,age
f64,f64
0.15,16.5
0.07,8.5
0.21,10.5
0.155,11.5
0.055,8.5
0.12,9.5
0.33,21.5
0.26,17.5
0.165,10.5
0.32,20.5


In [15]:
# Pandas
%%time

df_pd[['shell_weight', 'age']]

CPU times: user 11.6 ms, sys: 9.99 ms, total: 21.6 ms
Wall time: 23.3 ms


Unnamed: 0,shell_weight,age
0,0.1500,16.5
1,0.0700,8.5
2,0.2100,10.5
3,0.1550,11.5
4,0.0550,8.5
...,...,...
1006652,0.2490,12.5
1006653,0.2605,11.5
1006654,0.3080,10.5
1006655,0.2960,11.5


Polars быстрее, чем Pandas, в 23 раза.

In [10]:
# Polars
%%time

df_pl.filter((pl.col('age') > 5) & (pl.col('whole_weight') < 0.3))

CPU times: user 18.7 ms, sys: 4.59 ms, total: 23.3 ms
Wall time: 18.9 ms


sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,age
str,f64,f64,f64,f64,f64,f64,f64,f64
"""M""",0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,8.5
"""I""",0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,8.5
"""I""",0.355,0.28,0.085,0.2905,0.095,0.0395,0.115,8.5
"""M""",0.365,0.295,0.08,0.2555,0.097,0.043,0.1,8.5
"""M""",0.355,0.28,0.095,0.2455,0.0955,0.062,0.075,12.5
"""I""",0.38,0.275,0.1,0.2255,0.08,0.049,0.085,11.5
"""I""",0.24,0.175,0.045,0.07,0.0315,0.0235,0.02,6.5
"""I""",0.21,0.15,0.05,0.042,0.0175,0.0125,0.015,5.5
"""I""",0.39,0.295,0.095,0.203,0.0875,0.045,0.075,8.5
"""I""",0.325,0.245,0.07,0.161,0.0755,0.0255,0.045,7.5


In [11]:
%%time
# Pandas: keep rows with age > 5 and whole_weight < 0.3.
df_pd.query('age > 5 & whole_weight < 0.3')

CPU times: user 30.7 ms, sys: 1.69 ms, total: 32.4 ms
Wall time: 47.7 ms


Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,age
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,8.5
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,8.5
16,I,0.355,0.280,0.085,0.2905,0.0950,0.0395,0.1150,8.5
18,M,0.365,0.295,0.080,0.2555,0.0970,0.0430,0.1000,8.5
20,M,0.355,0.280,0.095,0.2455,0.0955,0.0620,0.0750,12.5
...,...,...,...,...,...,...,...,...,...
1006629,I,0.280,0.215,0.070,0.1240,0.0630,0.0215,0.0300,7.5
1006630,I,0.330,0.230,0.080,0.1400,0.0565,0.0365,0.0460,8.5
1006631,I,0.350,0.250,0.075,0.1695,0.0835,0.0355,0.0410,7.5
1006632,I,0.370,0.280,0.090,0.2180,0.0995,0.0545,0.0615,8.5


Polars быстрее, чем Pandas, в 2.5 раза.

**Агрегация данных.**

In [20]:
# Polars
%%time

df_pl.groupby('sex').agg([pl.mean('length'), pl.max('shell_weight')])

CPU times: user 73.7 ms, sys: 0 ns, total: 73.7 ms
Wall time: 49.1 ms


sex,length,shell_weight
str,f64,f64
"""I""",0.427273,0.655
"""F""",0.572978,1.005
"""M""",0.560059,0.885


In [21]:
# Pandas
%%time

df.groupby('sex').agg({'length' : 'mean', 'shell_weight' : 'max'})

CPU times: user 4.92 ms, sys: 0 ns, total: 4.92 ms
Wall time: 4.87 ms


Unnamed: 0_level_0,length,shell_weight
sex,Unnamed: 1_level_1,Unnamed: 2_level_1
F,0.572978,1.005
I,0.427273,0.655
M,0.560059,0.885


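Polars медленнее, чем Pandas, в 10 раз. Однако это сравнение некорректно: приведённый выше замер Pandas получен на исходном `df` (4 177 строк), а замер Polars — на `df_pl` (1 006 657 строк), поэтому вывод нужно перепроверить на данных одинакового размера.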