# Filtering DataFrames

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the 2016-2017 retail sales data; the path is relative to this notebook's folder
retail_df = pd.read_csv("../DataFrames/retail_2016_2017.csv")
# Display the frame (for a large DataFrame pandas shows only the first and last rows)
retail_df

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,1945944,2016-01-01,1,AUTOMOTIVE,0.000,0
1,1945945,2016-01-01,1,BABY CARE,0.000,0
2,1945946,2016-01-01,1,BEAUTY,0.000,0
3,1945947,2016-01-01,1,BEVERAGES,0.000,0
4,1945948,2016-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
1054939,3000883,2017-08-15,9,POULTRY,438.133,0
1054940,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
1054941,3000885,2017-08-15,9,PRODUCE,2419.729,148
1054942,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


### You can filter the rows in a DataFrame by passing a logical test into the .loc[] accessor, just like filtering a Series or a NumPy array

In [3]:
# The comparison builds a Boolean Series that is True where the date equals "2016-10-28";
# .loc[] then returns only the rows of retail_df where that Series is True
retail_df.loc[retail_df["date"] == "2016-10-28"]

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
536382,2482326,2016-10-28,1,AUTOMOTIVE,8.000,0
536383,2482327,2016-10-28,1,BABY CARE,0.000,0
536384,2482328,2016-10-28,1,BEAUTY,9.000,1
536385,2482329,2016-10-28,1,BEVERAGES,2576.000,38
536386,2482330,2016-10-28,1,BOOKS,0.000,0
...,...,...,...,...,...,...
538159,2484103,2016-10-28,9,POULTRY,391.292,24
538160,2484104,2016-10-28,9,PREPARED FOODS,78.769,1
538161,2484105,2016-10-28,9,PRODUCE,993.760,5
538162,2484106,2016-10-28,9,SCHOOL AND OFFICE SUPPLIES,0.000,0


### You can filter the columns in a DataFrame by passing them into the .loc[] accessor as a list or slice

In [4]:
# The first argument to .loc[] selects the rows (date equal to "2016-10-28") and the second
# selects the columns ("date" and "sales"); .head() limits the display to the first five rows
retail_df.loc[retail_df["date"] == "2016-10-28", ["date", "sales"]].head()

Unnamed: 0,date,sales
536382,2016-10-28,8.0
536383,2016-10-28,0.0
536384,2016-10-28,9.0
536385,2016-10-28,2576.0
536386,2016-10-28,0.0


### You can apply multiple filters by joining the logical tests with an "&" operator
* Try storing the combined tests in a Boolean mask variable when a filter has complex logic

In [5]:
# Keep rows whose family is CLEANING or DAIRY and whose sales are greater than zero
conds = (
    retail_df["family"].isin(["CLEANING", "DAIRY"])
    & retail_df["sales"].gt(0)
)
retail_df.loc[conds]

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
568,1946512,2016-01-01,25,CLEANING,734.0,0
569,1946513,2016-01-01,25,DAIRY,1033.0,11
1789,1947733,2016-01-02,1,CLEANING,526.0,3
1790,1947734,2016-01-02,1,DAIRY,627.0,15
1822,1947766,2016-01-02,10,CLEANING,1216.0,4
...,...,...,...,...,...,...
1054853,3000797,2017-08-15,7,DAIRY,1279.0,25
1054885,3000829,2017-08-15,8,CLEANING,1198.0,13
1054886,3000830,2017-08-15,8,DAIRY,1330.0,24
1054918,3000862,2017-08-15,9,CLEANING,1439.0,25


### More Examples

In [6]:
# Load the oil price data and attach a constant 'benchmark' column of 100, so we can
# think about how to filter the data down to prices that are greater than 100
oil = pd.read_csv("../DataFrames/oil.csv").assign(benchmark=100)
oil.head()

Unnamed: 0,date,dcoilwtico,benchmark
0,2013-01-01,,100
1,2013-01-02,93.14,100
2,2013-01-03,92.97,100
3,2013-01-04,93.12,100
4,2013-01-07,93.2,100


In [7]:
# Summary statistics for the numeric columns; a lower count for dcoilwtico than for
# benchmark indicates that some prices are missing
oil.describe()

Unnamed: 0,dcoilwtico,benchmark
count,1175.0,1218.0
mean,67.714366,100.0
std,25.630476,0.0
min,26.19,100.0
25%,46.405,100.0
50%,53.19,100.0
75%,95.66,100.0
max,110.62,100.0


In [8]:
# This logical test evaluates whether each 'dcoilwtico' price is greater than the literal
# value 100. But we can be more specific in our test.
oil.loc[oil['dcoilwtico'] > 100]

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
407,2014-07-24,102.76,100
408,2014-07-25,105.23,100
409,2014-07-28,105.68,100
410,2014-07-29,104.91,100


In [9]:
# This time each price is compared, row by row, against our predefined 'benchmark' column
oil.loc[oil['dcoilwtico'] > oil['benchmark']]

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
407,2014-07-24,102.76,100
408,2014-07-25,105.23,100
409,2014-07-28,105.68,100
410,2014-07-29,104.91,100


In [10]:
# The date column holds strings, so .str[:4] extracts the first four characters (the year);
# this keeps only the rows whose year is "2013"
oil.loc[oil['date'].str[:4] == "2013"]

Unnamed: 0,date,dcoilwtico,benchmark
0,2013-01-01,,100
1,2013-01-02,93.14,100
2,2013-01-03,92.97,100
3,2013-01-04,93.12,100
4,2013-01-07,93.20,100
...,...,...,...
256,2013-12-25,,100
257,2013-12-26,99.18,100
258,2013-12-27,99.94,100
259,2013-12-30,98.90,100


In [11]:
# Combine both tests into a single Boolean mask:
# price above the benchmark AND a date whose first four characters are "2013"
mask = (
    oil['dcoilwtico'].gt(oil['benchmark'])
    & oil['date'].str[:4].eq("2013")
)

oil.loc[mask]

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
204,2013-10-14,102.46,100
205,2013-10-15,101.15,100
206,2013-10-16,102.34,100
207,2013-10-17,100.72,100


# PRO TIP: Query
### The .query() method lets you use SQL-like syntax to filter DataFrames
* You can specify any number of filtering conditions by using the "and" and "or" keywords

In [12]:
# This query filters rows where the family is "CLEANING" or "DAIRY", and the sales are greater than 0
# Note that you don't need to repeat the DataFrame name for each column, saving keystrokes and making the filter easier to interpret

retail_df.query("family in ['CLEANING', 'DAIRY'] and sales > 0")

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
568,1946512,2016-01-01,25,CLEANING,734.0,0
569,1946513,2016-01-01,25,DAIRY,1033.0,11
1789,1947733,2016-01-02,1,CLEANING,526.0,3
1790,1947734,2016-01-02,1,DAIRY,627.0,15
1822,1947766,2016-01-02,10,CLEANING,1216.0,4
...,...,...,...,...,...,...
1054853,3000797,2017-08-15,7,DAIRY,1279.0,25
1054885,3000829,2017-08-15,8,CLEANING,1198.0,13
1054886,3000830,2017-08-15,8,DAIRY,1330.0,24
1054918,3000862,2017-08-15,9,CLEANING,1439.0,25


* You can also reference external variables using the @ symbol

In [14]:
# Average of the "sales" column of retail_df, kept in a variable for later use
avg_sales = retail_df["sales"].mean()
avg_sales

457.72248700136413

In [15]:
# This query filters rows where the family is "CLEANING" or "DAIRY", and the sales are greater
# than the value of the Python variable avg_sales (the @ prefix refers to a variable outside the DataFrame)
retail_df.query("family in ['CLEANING', 'DAIRY'] and sales > @avg_sales")

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
568,1946512,2016-01-01,25,CLEANING,734.0,0
569,1946513,2016-01-01,25,DAIRY,1033.0,11
1789,1947733,2016-01-02,1,CLEANING,526.0,3
1790,1947734,2016-01-02,1,DAIRY,627.0,15
1822,1947766,2016-01-02,10,CLEANING,1216.0,4
...,...,...,...,...,...,...
1054853,3000797,2017-08-15,7,DAIRY,1279.0,25
1054885,3000829,2017-08-15,8,CLEANING,1198.0,13
1054886,3000830,2017-08-15,8,DAIRY,1330.0,24
1054918,3000862,2017-08-15,9,CLEANING,1439.0,25


### More Examples

In [16]:
# Re-display the first five rows of oil as a reminder of its columns before querying it
oil.head()

Unnamed: 0,date,dcoilwtico,benchmark
0,2013-01-01,,100
1,2013-01-02,93.14,100
2,2013-01-03,92.97,100
3,2013-01-04,93.12,100
4,2013-01-07,93.2,100


In [17]:
# Inspect each column's dtype; note that date is stored as object (strings), not as a datetime type
oil.dtypes

date           object
dcoilwtico    float64
benchmark       int64
dtype: object

In [18]:
# .query() can compare two columns directly by name in a single logical test
oil.query("dcoilwtico > benchmark")

Unnamed: 0,date,dcoilwtico,benchmark
131,2013-07-03,101.92,100
133,2013-07-05,103.09,100
134,2013-07-08,103.03,100
135,2013-07-09,103.46,100
136,2013-07-10,106.41,100
...,...,...,...
407,2014-07-24,102.76,100
408,2014-07-25,105.23,100
409,2014-07-28,105.68,100
410,2014-07-29,104.91,100


In [19]:
# Not every pandas expression works inside .query(): slicing through the .str accessor
# (date.str[:4]) is rejected with ValueError: "slice" is not a supported function.
# The error is the point of this example, so it is caught and printed rather than left
# uncaught; that way "Restart & Run All" continues past this cell instead of stopping here.
# For string slicing, build a Boolean mask and pass it to .loc[] instead.
try:
    oil.query(
        "dcoilwtico > benchmark or date.str[:4]"
    )
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: "slice" is not a supported function