# High-Performance Pandas: eval( ) and query( )

## Motivating query( ) and eval( )

In [1]:
# Regular NumPy evaluation materializes a full-size array for every intermediate
# result of a compound expression.

import numpy as np
rng = np.random.RandomState(42)  # fixed seed so the random inputs are reproducible
x = rng.rand(1000000)
y = rng.rand(1000000)
# Baseline: time the vectorized element-wise sum of the two arrays.
%timeit x + y

1.27 ms ± 83.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [2]:
# The same sum computed one element at a time through a Python generator, for
# comparison with the vectorized `x + y`.
%timeit np.fromiter((xi + yi for xi, yi in zip(x,y)), dtype=x.dtype, count=len(x))

270 ms ± 1.48 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [3]:
%timeit mask = (x > 0.5) & (y < 0.5)

# NumPy evaluates the compound expression step by step, roughly equivalent to:
#     tmp1 = (x > 0.5)
#     tmp2 = (y < 0.5)
#     mask = tmp1 & tmp2

1.23 ms ± 52.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [4]:
# Compute the mask once outside %timeit (assignments made inside %timeit are not
# kept in the notebook namespace) so `mask` holds the NumPy result.
mask = (x > 0.5) & (y < 0.5)

In [5]:
import numexpr

In [6]:
# numexpr evaluates the same compound expression, given as a string, without
# allocating a full-size intermediate array for each sub-expression.

%timeit mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')

1.33 ms ± 25.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [7]:
# The two masks are identical, but the numexpr version does not create a full-size
# intermediate array for each of the sub-expressions.

mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)

True

## pandas.eval( ) for Efficient Operations

In [8]:
import pandas as pd
nrows, ncols = 100000, 100
rng = np.random.RandomState(42)  # re-seed so the frames below are reproducible
# Four random DataFrames, each of shape (nrows, ncols), drawn in sequence from rng.
df1, df2, df3, df4 = (pd.DataFrame(rng.rand(nrows, ncols)) for i in range(4))

In [9]:
# Baseline: sum the four frames with ordinary pandas operators.
%timeit df1 + df2 + df3 + df4

62 ms ± 1.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [10]:
%timeit pd.eval('df2 + df2 + df3 + df4')

24.6 ms ± 404 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# Check that pd.eval() produces the same values as the plain pandas expression.
np.allclose(df1 + df2 + df3 + df4, pd.eval('df1 + df2 + df3 + df4'))

True

## Operations supported by pd.eval( )

In [12]:
# Rebind df1..df4 (and add df5) as small integer DataFrames of shape (100, 3),
# with values drawn from [0, 1000), for the operator examples.
df1, df2, df3, df4, df5 = (pd.DataFrame(rng.randint(0, 1000, (100, 3))) for i in range(5))

In [13]:
# Arithmetic operators: the pd.eval() string mirrors the plain pandas expression.

result1 = -df1 * df2 / (df3 + df4) - df5
result2 = pd.eval('-df1 * df2 / (df3 + df4) - df5')
np.allclose(result1, result2)

True

In [14]:
# Comparison operators (<, <=, !=), combined here with &

result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('(df1 < df2) & (df2 <= df3) & (df3 != df4)')
np.allclose(result1, result2)

True

In [15]:
# Bitwise operators (& and |): compare the plain pandas expression with the same
# expression evaluated from a string by pd.eval().

result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1, result2)

True

In [16]:
# pd.eval() also supports the literal 'and' and 'or' operators in boolean expressions.

result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1, result3)

True

In [17]:
# Object attributes and indices: the expression string may use attribute access
# (obj.attr) and indexing (obj[index]).

result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result1, result2)

True

### DataFrame.eval( ) for Column-Wise Operations

In [18]:
# Generate a DataFrame with named columns for the column-wise examples.
df = pd.DataFrame(rng.rand(1000,3), columns=['A', 'B', 'C'])
df.head()

Unnamed: 0,A,B,C
0,0.375506,0.406939,0.069938
1,0.069087,0.235615,0.154374
2,0.677945,0.433839,0.652324
3,0.264038,0.808055,0.347197
4,0.589161,0.252418,0.557789


In [19]:
# Evaluate a column-wise expression with plain pandas and with pd.eval().

result1 = (df['A'] + df['B']) / (df['C']-1)
result2 = pd.eval("(df.A + df.B) / (df.C-1)")
np.allclose(result1, result2)

True

In [20]:
# Calling .eval() directly on the DataFrame lets the expression use column names
# as variables, removing the need for the 'df.' prefix.
result3 = df.eval("(A + B) / (C-1)")
np.allclose(result1, result3)

True

### Assignments in DataFrame.eval( )

In [21]:
# Preview the first rows of df as a reference point for this section.
df.head()

Unnamed: 0,A,B,C
0,0.375506,0.406939,0.069938
1,0.069087,0.235615,0.154374
2,0.677945,0.433839,0.652324
3,0.264038,0.808055,0.347197
4,0.589161,0.252418,0.557789


In [22]:
# DataFrame.eval() can assign the result of an expression to a new column.
# With inplace=True the column is added to df itself.

df.eval('D = (A + B) / C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.375506,0.406939,0.069938,11.18762
1,0.069087,0.235615,0.154374,1.973796
2,0.677945,0.433839,0.652324,1.704344
3,0.264038,0.808055,0.347197,3.087857
4,0.589161,0.252418,0.557789,1.508776


In [23]:
# Assigning to an existing column name overwrites that column.

df.eval('D = (A - B) / C', inplace=True)
df.head()


Unnamed: 0,A,B,C,D
0,0.375506,0.406939,0.069938,-0.449425
1,0.069087,0.235615,0.154374,-1.078728
2,0.677945,0.433839,0.652324,0.374209
3,0.264038,0.808055,0.347197,-1.566886
4,0.589161,0.252418,0.557789,0.603708


### Local Variables in DataFrame.eval( )

In [27]:
# Prefix a name with @ inside the expression to refer to a local Python variable
# instead of a DataFrame column.
# NOTE: despite its name, df.mean(1) averages along axis 1, giving one mean per row.
column_mean = df.mean(1)
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1, result2)

True

## DataFrame.query( ) Method

In [34]:
# Filter rows with the traditional boolean-mask syntax
result1 = df[(df.A < 0.5) & (df.B < 0.5)]

# and with the same expression passed as a string to the top-level pd.eval()
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)

True

In [35]:
# The same filter through DataFrame.eval(): @df refers to the local variable df,
# while A and B resolve to its columns.
# NOTE(review): the unparenthesized 'A < 0.5 & B < 0.5' relies on the pandas parser
# giving & the precedence of `and` — confirm, or parenthesize each comparison.
result3 = df.eval('@df[A < 0.5 & B < 0.5]')
np.allclose(result1, result3)

True

In [36]:
# DataFrame.query() simplifies this further: it is called directly on the DataFrame
# and takes only the filter condition, so the expression no longer needs to be
# wrapped in df[...].
result4 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result4)

True