In [1]:
# NumPy can still be inefficient because it relies on intermediate arrays between Python and C
# eval() and query() bypass this

In [2]:
# numpy and pandas are fast for vectorized functions

import numpy as np

rng = np.random.RandomState(42)
x = rng.rand(1000000)
y = rng.rand(1000000)

%timeit x + y

1.61 ms ± 22.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [3]:
# The vectorized sum is much faster than building the same result
# element by element with a Python-level generator:
%timeit np.fromiter((xi + yi for xi, yi in zip(x, y)), dtype=x.dtype, count=len(x))

131 ms ± 1.11 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [4]:
# Vectorized code can still become less efficient for compound expressions

mask = (x > 0.5) & (y < 0.5)

# Each sub-expression is evaluated on its own, so the line above is
# roughly equivalent to:
tmp1 = x > 0.5
tmp2 = y < 0.5
masktmp = tmp1 & tmp2

# i.e. extra memory is allocated for the intermediate steps

In [7]:
# numexpr evaluates this kind of compound expression
# without requiring memory for the intermediate results
import numexpr

mask_numexpr = numexpr.evaluate("(x > 0.5) & (y < 0.5)")
# The numexpr mask must match the one computed with plain NumPy
np.allclose(mask, mask_numexpr)

True

In [8]:
## Pandas eval and query are based on numexpr and work similarly

In [9]:
import pandas as pd

nrows, ncols = 10000, 100
# Four (nrows x ncols) frames of random samples, all drawn from the shared
# generator `rng` created earlier in the notebook. No second generator is
# created here: the frames never used one, so it was dead state.
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols))
                      for _ in range(4))

In [10]:
# Baseline: plain pandas arithmetic on the four frames
%timeit df1 + df2 + df3 + df4

5.92 ms ± 157 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [11]:
# The equivalent sum, computed by pd.eval from a string expression

%timeit pd.eval('df1 + df2 + df3 + df4')

5.44 ms ± 71.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
# pd.eval uses less memory, and gives the same result as plain arithmetic

np.allclose(df1 + df2 + df3 + df4, pd.eval("df1 + df2 + df3 + df4"))

True

In [14]:
## eval operations

# Five 100 x 3 integer frames (values drawn from [0, 1000)) used as operands below
df1, df2, df3, df4, df5 = (
    pd.DataFrame(rng.randint(0, 1000, size=(100, 3)))
    for _ in range(5)
)

In [19]:
# Arithmetic operators: plain pandas vs. the same expression through pd.eval
result1 = (-df1 * df2) / (df3 + df4) - df5
result2 = pd.eval("-df1 * df2 / (df3 + df4) - df5")
np.allclose(result1, result2)

True

In [21]:
# Comparison operators, combined element-wise with &
result1 = (
    (df1 < df2)
    & (df2 <= df3)
    & (df3 != df4)
)
result2 = pd.eval("(df1 < df2) & (df2 <= df3) & (df3 != df4)")
np.allclose(result1, result2)

True

In [22]:
# Bitwise operators: & binds tighter than |, made explicit on the pandas side
result1 = ((df1 < 0.5) & (df2 < 0.5)) | (df3 > df4)
result2 = pd.eval("(df1 < .5) & (df2 < .5) | (df3 > df4)")
np.allclose(result1, result2)

True

In [24]:
# Inside pd.eval the literal words "and" / "or" work as well
result3 = pd.eval("(df1 < 0.5) and (df2 < 0.5) or (df3 > df4)")
np.allclose(result1, result3)

True

In [25]:
# Object attributes and indices:
# attributes are reached with dot notation, indices with [ ] notation

result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval("df2.T[0] + df3.iloc[1]")
np.allclose(result1, result2)

True

In [26]:
# more advanced constructs like function calls, conditional statements, loops, etc are NOT implemented
# to do this, use numexpr library itself

In [27]:
## eval() for column-wise operations

In [29]:
# DataFrames have their own eval() method as well

df = pd.DataFrame(rng.rand(1000, 3), columns=list("ABC"))
df.head()

Unnamed: 0,A,B,C
0,0.444891,0.863699,0.377109
1,0.829335,0.398937,0.393512
2,0.008342,0.689972,0.773722
3,0.534841,0.428441,0.657332
4,0.9999,0.295076,0.989675


In [31]:
# Column arithmetic in plain pandas, then through pd.eval with attribute access on df
result1 = (df.A + df.B) / (df.C - 1)
result2 = pd.eval('(df.A + df.B) / (df.C - 1)')
np.allclose(result1, result2)

True

In [36]:
# Use DataFrame.eval() instead: the expression refers to columns by name directly
result3 = df.eval('(A + B) / (C - 1)')
# Verify the DataFrame.eval() result (result3) against the plain pandas one;
# comparing result2 here would leave result3 unchecked.
np.allclose(result1, result3)


True

In [37]:
## Assignment in DataFrame eval
# Show the first rows of df before any column is assigned through eval()
df.head()

Unnamed: 0,A,B,C
0,0.444891,0.863699,0.377109
1,0.829335,0.398937,0.393512
2,0.008342,0.689972,0.773722
3,0.534841,0.428441,0.657332
4,0.9999,0.295076,0.989675


In [38]:
# Create column D from the existing columns; inplace=True modifies df itself
df.eval('D = (A + B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.444891,0.863699,0.377109,3.470052
1,0.829335,0.398937,0.393512,3.121308
2,0.008342,0.689972,0.773722,0.902538
3,0.534841,0.428441,0.657332,1.465443
4,0.9999,0.295076,0.989675,1.308486


In [39]:
# An existing column can be modified the same way: D is recomputed here
df.eval('D = (A - B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.444891,0.863699,0.377109,-1.110576
1,0.829335,0.398937,0.393512,1.093735
2,0.008342,0.689972,0.773722,-0.880976
3,0.534841,0.428441,0.657332,0.161866
4,0.9999,0.295076,0.989675,0.712177


In [42]:
# Local variables in eval:
# the @ prefix marks a Python variable name rather than a column name.
# The @ notation only works in DataFrame.eval()!

column_mean = df.mean(axis=1)
result1 = df["A"] + column_mean
result2 = df.eval("A + @column_mean")
np.allclose(result1, result2)

True

In [46]:
## DataFrame query() method

# Row filtering with a boolean mask: plain pandas vs. the same expression in pd.eval
result1 = df[(df["A"] < 0.5) & (df["B"] < 0.5)]
result2 = pd.eval("df[(df.A < .5) & (df.B < .5)]")
np.allclose(result1, result2)

True

In [47]:
# The same filter written as a query() expression over column names
result3 = df.query("A < .5 and B < .5")

In [48]:
# The query() result must match the boolean-mask result
np.allclose(result1, result3)

True

In [49]:
# query() accepts the @ prefix for local variables too
Cmean = df.C.mean()
result1 = df.loc[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query("A < @Cmean and B < @Cmean")
np.allclose(result1, result2)

True

In [51]:
## Performance of these

# Consider both computation time and memory use. Compound DataFrame expressions result in temporary arrays; eval() and query() do not.
# For small arrays it doesn't matter, but for large data sets it often does.