<h1>High performance Pandas: eval() and query()</h1>

In [1]:
# As of version 0.13, Pandas includes some experimental tools that allow you to directly access C-speed operations
# without costly allocation of intermediate arrays: the eval() and query() functions, which rely on the Numexpr package.

<h3>Motivating query() and eval(): Compound expressions</h3>

In [2]:
# Numpy and Pandas support fast vectorized operations:

import numpy as np

In [3]:
# Draw two arrays of one million random values each from a generator seeded with 42,
# then time their vectorized, element-wise sum.
rng = np.random.RandomState(42)
x = rng.rand(int(1E6))
y = rng.rand(int(1E6))
%timeit x + y;

750 µs ± 6.08 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [4]:
# The vectorized sum is much faster than doing the addition via a Python loop or comprehension.
# Here the same result is built one element at a time from a generator expression:

%timeit np.fromiter((xi + yi for xi, yi in zip(x,y)), dtype=x.dtype, count=len(x))

90 ms ± 132 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [5]:
# Vectorized operations can become less efficient when you are computing compound expressions,
# such as this mask that combines two comparisons:

mask = (x > 0.5) & (y < 0.5)

In [6]:
# The compound expression above is roughly equivalent to evaluating each step separately,
# so every intermediate result is held in its own full-sized temporary array:
tmp1 = (x > 0.5)
tmp2 = (y < 0.5)
mask = tmp1 & tmp2

In [7]:
# Numexpr evaluates the same compound expression, given as a string;
# the final line confirms that its result matches the NumPy mask.
import numexpr
mask_numexpr = numexpr.evaluate('(x > 0.5) & (y < 0.5)')
np.allclose(mask, mask_numexpr)

True

In [8]:
# The benefit here is that Numexpr evaluates the expression in a way that does not use full-sized temporary arrays,
# and thus can be much more efficient than NumPy, especially for large arrays.

# The Pandas eval() and query() tools depend on the Numexpr package.

<h4>pandas.eval() for efficient operations</h4>

In [9]:
# The eval() function in Pandas uses string expressions to efficiently compute operations using DataFrames.
# Set up the frame dimensions and a freshly seeded random generator for the examples that follow.
import pandas as pd
nrows,ncols = 100000,100
rng = np.random.RandomState(42)

In [10]:
# Build four DataFrames of shape (nrows, ncols), each filled with values drawn from rng.
df1,df2,df3,df4 = (pd.DataFrame(rng.rand(nrows,ncols)) for i in range(4))

In [11]:
# To compute the sum of all four DataFrames using the typical Pandas approach:
%timeit df1 + df2 + df3 + df4

21.9 ms ± 288 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
# We can compute the same result with pd.eval() by writing the expression as a string:
%timeit pd.eval('df1 + df2 + df3 + df4')

11.1 ms ± 54.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [13]:
# Confirm that the plain Pandas sum and the pd.eval() version give the same result.
np.allclose(df1 + df2 + df3 + df4, pd.eval('df1 + df2 + df3 + df4'))

True

<h4>Operations Supported by pd.eval()</h4>

In [14]:
# Five small integer DataFrames (100 rows x 3 columns, drawn with randint(0, 1000)) for the operator examples.
# Note: this rebinds df1..df4, replacing the large frames created earlier.
df1,df2,df3,df4,df5 = (pd.DataFrame(rng.randint(0,1000,(100,3))) for i in range(5))

In [15]:
# pd.eval() supports all arithmetic operators.
# result1 is computed with ordinary Pandas arithmetic, result2 with the same expression passed to pd.eval().

result1 = -df1 * df2/(df3 + df4) - df5
result2 = pd.eval('-df1 * df2/(df3 + df4) - df5')
np.allclose(result1,result2)

True

<h4>Comparison operators</h4>

In [16]:
# pd.eval() supports all comparison operators, including chained expressions.
# result1 spells the chain out with explicit & operators; result2 uses the chained form inside pd.eval().

result1 = (df1 < df2) & (df2 <= df3) & (df3 != df4)
result2 = pd.eval('df1 < df2 <= df3 != df4')
np.allclose(result1,result2)

True

<h4>Bitwise Operators</h4>

In [17]:
# pd.eval() supports the & and | bitwise operators.
# The same expression is evaluated directly (result1) and through pd.eval() (result2):

result1 = (df1 < 0.5) & (df2 < 0.5) | (df3 < df4)
result2 = pd.eval('(df1 < 0.5) & (df2 < 0.5) | (df3 < df4)')
np.allclose(result1,result2)

True

In [18]:
# pd.eval() also accepts the literal `and` and `or` keywords in Boolean expressions;
# the result is compared against result1, the bitwise form computed in the previous cell.
result3 = pd.eval('(df1 < 0.5) and (df2 < 0.5) or (df3 < df4)')
np.allclose(result1,result3)

True

<h4>Object attributes and indices</h4>

In [20]:
# pd.eval() supports access to object attributes via the obj.attr syntax, and to indexes via the obj[index] syntax.
# Here both forms appear in one expression (df2.T[0] and df3.iloc[1]):

result1 = df2.T[0] + df3.iloc[1]
result2 = pd.eval('df2.T[0] + df3.iloc[1]')
np.allclose(result1,result2)

True

<h3>DataFrame.eval() for Column-Wise Operations</h3>

In [21]:
# A 1000-row DataFrame with labeled columns A, B and C, used for the column-wise examples below.
df = pd.DataFrame(rng.rand(1000,3),columns=['A','B','C'])
df.head()

Unnamed: 0,A,B,C
0,0.375506,0.406939,0.069938
1,0.069087,0.235615,0.154374
2,0.677945,0.433839,0.652324
3,0.264038,0.808055,0.347197
4,0.589161,0.252418,0.557789


In [25]:
# With pd.eval(), an expression over the three columns has to name the DataFrame for each column (df.A, df.B, df.C):
result1 = (df['A'] + df['B']) / (df['C'] - 1)
result2 = pd.eval("(df.A + df.B) / (df.C - 1)")
np.allclose(result1,result2)

True

In [26]:
# The DataFrame.eval() method allows much more succinct evaluation of expressions involving the columns:
# inside the string, columns are referred to by their bare names.

result3 = df.eval('(A + B)/(C - 1)')
np.allclose(result1, result3)

True

<h4>Assignment in DataFrame.eval()</h4>

In [27]:
# DataFrame.eval() also allows assignment to any column.
# Start from the DataFrame as it currently stands, with columns A, B and C:

df.head()

Unnamed: 0,A,B,C
0,0.375506,0.406939,0.069938
1,0.069087,0.235615,0.154374
2,0.677945,0.433839,0.652324
3,0.264038,0.808055,0.347197
4,0.589161,0.252418,0.557789


In [28]:
# We can use df.eval() to create a new column 'D' and assign it a value computed from the other columns.
# With inplace=True the column is added to df itself, as the output of df.head() shows.

df.eval('D=(A+B)/C', inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.375506,0.406939,0.069938,11.18762
1,0.069087,0.235615,0.154374,1.973796
2,0.677945,0.433839,0.652324,1.704344
3,0.264038,0.808055,0.347197,3.087857
4,0.589161,0.252418,0.557789,1.508776


In [29]:
# In the same way, any existing column can be modified; here column 'D' is overwritten in place:

df.eval('D = (A - B)/C',inplace=True)
df.head()

Unnamed: 0,A,B,C,D
0,0.375506,0.406939,0.069938,-0.449425
1,0.069087,0.235615,0.154374,-1.078728
2,0.677945,0.433839,0.652324,0.374209
3,0.264038,0.808055,0.347197,-1.566886
4,0.589161,0.252418,0.557789,0.603708


<h4>Local Variables in DataFrame.eval()</h4>

In [30]:
# The DataFrame.eval() method supports an additional syntax that lets it work with local Python variables:
# inside the expression, the @ character marks a variable name rather than a column name.

# df.mean(1) takes the mean along axis 1, i.e. one value per row computed across df's columns.
column_mean = df.mean(1)
result1 = df['A'] + column_mean
result2 = df.eval('A + @column_mean')
np.allclose(result1,result2)

True

<h3>DataFrame.query() Method</h3>

In [31]:
# DataFrame has another method based on evaluated strings, called the query() method.
# Consider first a filtering operation written directly (result1) and with pd.eval() (result2):
result1 = df[(df.A < 0.5) & (df.B < 0.5)]
result2 = pd.eval('df[(df.A < 0.5) & (df.B < 0.5)]')
np.allclose(result1, result2)

True

In [32]:
# For this type of filtering operation, the query() method can be used instead,
# again referring to columns by their bare names:
result2 = df.query('A < 0.5 and B < 0.5')
np.allclose(result1, result2)

True

In [33]:
# The query() method also accepts the @ flag to mark local variables:

Cmean = df['C'].mean()
result1 = df[(df.A < Cmean) & (df.B < Cmean)]
result2 = df.query('A < @Cmean and B < @Cmean')
np.allclose(result1, result2)

True

<h3>Performance: When to use the functions</h3>

In [34]:
# When deciding whether to use these functions, there are two considerations: computation time and memory use.

# Memory use is the most predictable aspect.

# Every compound expression involving NumPy arrays or Pandas DataFrames will result in the implicit creation of
# temporary arrays.

# Example (note: this rebinds x, which earlier in the notebook held a NumPy array):
x = df[(df.A < 0.5) & (df.B < 0.5)]

In [35]:
# The above expression is roughly equivalent to the following, where each step produces its own temporary
# (tmp1, tmp2 and tmp3) before the final selection:

tmp1 = df.A < 0.5
tmp2 = df.B < 0.5
tmp3 = tmp1 & tmp2
x = df[tmp3]

In [36]:
# You can check the approximate size of the DataFrame's underlying array in bytes as follows.
# to_numpy() is the accessor the pandas documentation recommends in place of .values.
df.to_numpy().nbytes

32000