In [1]:
# Filtering DataFrames for values that fit certain conditions

# 1. Filters from conditions
# 2. Filters from functions
# 3. Feature filtering

import numpy as np
import pandas as pd

In [2]:
# 1. Filter conditions
# Sample records, one tuple per row: (playerID, yearID, teamID, HR).
sample_rows = [
    ('bettsmo01', 2016, 'BOS', 31),
    ('canoro01', 2016, 'SEA', 39),
    ('cruzne02', 2016, 'SEA', 43),
    ('ortizda01', 2016, 'BOS', 38),
    ('cruzne02', 2017, 'SEA', 39),
]
df = pd.DataFrame(sample_rows, columns=['playerID', 'yearID', 'teamID', 'HR'])
print(df)

    playerID  yearID teamID  HR
0  bettsmo01    2016    BOS  31
1   canoro01    2016    SEA  39
2   cruzne02    2016    SEA  43
3  ortizda01    2016    BOS  38
4   cruzne02    2017    SEA  39


In [3]:
# Boolean Series: True on rows whose playerID equals 'cruzne02'.
cruzne02 = df['playerID'].eq('cruzne02')
print(cruzne02)

0    False
1    False
2     True
3    False
4     True
Name: playerID, dtype: bool


In [4]:
# Boolean Series: True on rows whose HR value is strictly greater than 40.
hr = df["HR"].gt(40)
print(hr)

0    False
1    False
2     True
3    False
4    False
Name: HR, dtype: bool


In [5]:
# Boolean Series: True on rows whose teamID is anything other than "BOS".
notbos = df["teamID"].ne("BOS")
print(notbos)

0    False
1     True
2     True
3    False
4     True
Name: teamID, dtype: bool


In [8]:
# Each boolean Series is effectively a boolean mask:
# indexing the frame with it keeps only the rows where the mask is True.
filtered = df.loc[cruzne02]
print(filtered)

   playerID  yearID teamID  HR
2  cruzne02    2016    SEA  43
4  cruzne02    2017    SEA  39


In [9]:
# 2. Filters from functions
# Depending on a column's data type, you can call functions on it that
# themselves return a boolean Series, which can then be used as a mask.

# Keep the rows whose playerID starts with 'c'.
str_f1 = df['playerID'].str.startswith('c')
print(df.loc[str_f1])


   playerID  yearID teamID  HR
1  canoro01    2016    SEA  39
2  cruzne02    2016    SEA  43
4  cruzne02    2017    SEA  39


In [10]:
# Keep the rows whose teamID ends with "S".
str_f2 = df["teamID"].str.endswith("S")
print(df.loc[str_f2])

    playerID  yearID teamID  HR
0  bettsmo01    2016    BOS  31
3  ortizda01    2016    BOS  38


In [11]:
# Mask of rows whose playerID contains "o".
str_f3 = df["playerID"].str.contains("o")

# Prefixing a boolean mask with ~ flips every value, giving the negation
# (like `not`): here, the rows whose playerID does NOT contain "o".
print(df.loc[~str_f3])

   playerID  yearID teamID  HR
2  cruzne02    2016    SEA  43
4  cruzne02    2017    SEA  39


In [12]:
# When several values are acceptable, use the membership test
# isin([option1, option2]) instead of chaining equality checks.
wanted_players = ["cruzne02", "ortizda01"]
isin_f1 = df["playerID"].isin(wanted_players)
print(df.loc[isin_f1])

    playerID  yearID teamID  HR
2   cruzne02    2016    SEA  43
3  ortizda01    2016    BOS  38
4   cruzne02    2017    SEA  39


In [14]:
# Watch out for NaN values: the last row of this frame has a missing
# teamID (np.nan), which the missing-value filters below pick up.
# NOTE: this rebinds `df`; the cells that follow operate on this new frame.
df = pd.DataFrame({
  'playerID': ['bettsmo01', 'canoro01', 'doejo01'],
  'yearID': [2016, 2016, 2017],
  'teamID': ['BOS', 'SEA', np.nan],
  'HR': [31, 39, 99]})

print(df)

    playerID  yearID teamID  HR
0  bettsmo01    2016    BOS  31
1   canoro01    2016    SEA  39
2    doejo01    2017    NaN  99


In [17]:
# Looking for missing values: isna() and notna()


In [18]:
# Mask that is True where teamID is missing; keep only those rows.
isna = df["teamID"].isna()
print(df.loc[isna])

  playerID  yearID teamID  HR
2  doejo01    2017    NaN  99


In [19]:
# Mask that is True where teamID is present; keep only those rows.
notna = df["teamID"].notna()
print(df.loc[notna])

    playerID  yearID teamID  HR
0  bettsmo01    2016    BOS  31
1   canoro01    2016    SEA  39
