In [1]:
import pandas as pd
import numpy as np

# Filtering data

In [4]:
# Read the athlete biographies CSV into a DataFrame and preview the first rows.
bios = pd.read_csv(filepath_or_buffer='./data/bios.csv')
bios.head(n=5)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


## create a pandas Series of boolean values for the condition

In [25]:
# Element-wise comparison: one boolean per row, True where height_cm exceeds 215.
is_tall = bios['height_cm'].gt(215)
display(type(is_tall), len(is_tall))     # the mask's type, then its length
# Indexing the mask with itself keeps only the entries that are True.
true_is_tall = is_tall.loc[is_tall]
display(len(true_is_tall))
true_is_tall.head()

pandas.core.series.Series

145500

35

5089    True
5583    True
5673    True
5716    True
5781    True
Name: height_cm, dtype: bool

## use the pandas Series to filter the DataFrame

In [24]:
# `is_tall` is the boolean mask (bios.height_cm > 215) built in the cell above.
# Only the last expression of a cell is echoed automatically, so the row count
# has to go through display() to be shown at all.
display(len(bios[is_tall]))
display(bios[is_tall].head())
# Same filter with the condition written inline instead of via the named mask.
bios[bios['height_cm'] > 215].head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5089,5108,Viktor Pankrashkin,1957-06-19,Moskva (Moscow),Moskva,RUS,Soviet Union,220.0,112.0,1993-07-24
5583,5606,Paulinho Villas Boas,1963-01-26,São Paulo,São Paulo,BRA,Brazil,217.0,106.0,
5673,5696,Gunther Behnke,1963-01-19,Leverkusen,Nordrhein-Westfalen,GER,Germany,221.0,114.0,
5716,5739,Uwe Blab,1962-03-26,München (Munich),Bayern,GER,Germany West Germany,218.0,110.0,
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,


Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5089,5108,Viktor Pankrashkin,1957-06-19,Moskva (Moscow),Moskva,RUS,Soviet Union,220.0,112.0,1993-07-24
5583,5606,Paulinho Villas Boas,1963-01-26,São Paulo,São Paulo,BRA,Brazil,217.0,106.0,
5673,5696,Gunther Behnke,1963-01-19,Leverkusen,Nordrhein-Westfalen,GER,Germany,221.0,114.0,
5716,5739,Uwe Blab,1962-03-26,München (Munich),Bayern,GER,Germany West Germany,218.0,110.0,
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,


## filter df using loc

In [31]:
# .loc[row_selector, column_selector]: the boolean mask picks the rows and
# the list names the columns to return.
bios.loc[bios['height_cm'].gt(215), ['name', 'height_cm']].head()

Unnamed: 0,name,height_cm
5089,Viktor Pankrashkin,220.0
5583,Paulinho Villas Boas,217.0
5673,Gunther Behnke,221.0
5716,Uwe Blab,218.0
5781,Tommy Burleson,223.0


## alternate syntax

In [33]:
(
    bios[bios['height_cm'] > 215]    # step 1: boolean-mask the rows
    [['name', 'height_cm']]          # step 2: select the output columns
    .head()
)

Unnamed: 0,name,height_cm
5089,Viktor Pankrashkin,220.0
5583,Paulinho Villas Boas,217.0
5673,Gunther Behnke,221.0
5716,Uwe Blab,218.0
5781,Tommy Burleson,223.0


## multiple conditions

In [37]:
# Combine two element-wise conditions with `&`; the country comparison is
# case-insensitive because both sides are case-folded first.
bios.loc[
    bios['height_cm'].gt(215)
    & bios['born_country'].str.casefold().eq('usa'.casefold()),
    ['name', 'height_cm', 'born_country'],
]

Unnamed: 0,name,height_cm,born_country
5781,Tommy Burleson,223.0,USA
6722,Shaquille O'Neal,216.0,USA
6937,David Robinson,216.0,USA
123850,Tyson Chandler,216.0,USA


## string accessor

In [40]:
# Case-insensitive regex match on the name column ("bob" OR "keith").
# na=False makes missing names count as "no match"; otherwise a NaN in the
# mask makes boolean indexing raise instead of filtering.
bios[bios['name'].str.contains('bob|keith', case=False, na=False)].head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
756,760,Bobby Powell,1881-04-02,Victoria,British Columbia,CAN,Canada,,,1917-04-28
1897,1907,Keith Hanlon,1966-09-01,,,,Ireland,,,
3422,3434,Bobby Kelsey,1938-12-08,London,England,GBR,Great Britain,177.0,62.0,
3505,3517,Keith Wallace,1961-03-29,Preston,England,GBR,Great Britain,165.0,51.0,1999-12-31
3513,3525,Bobby Wells,1961-05-15,London,England,GBR,Great Britain,193.0,91.0,


## .isin()

In [50]:
# Rows whose lower-cased country code is one of usa/ita/fra AND whose
# lower-cased name starts with "rob".
bios[
    (bios['born_country'].str.lower().isin('usa ita fra'.split()))
    & (bios['name'].str.lower().str.startswith('rob'))
].sample(5, random_state=42)   # fixed seed: the same 5 rows on every run; a trailing .head() would be a no-op on 5 rows

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
61797,62252,Robbie Haines,1954-03-27,San Diego,California,USA,United States,188.0,95.0,
48137,48490,Roberto Cassio,1968-01-08,Roma,Roma,ITA,Italy,178.0,70.0,
13293,13374,Roberto Amadio,1963-07-10,Portogruaro,Venezia,ITA,Italy,188.0,78.0,
78156,78747,Robert Maxwell,1902-06-09,Los Angeles,California,USA,United States,183.0,81.0,1985-08-15
124484,126787,Rob Crane,1986-02-19,Stamford,Connecticut,USA,United States,188.0,84.0,


## using .query

In [53]:
# Same filter expressed as a query string: both column comparisons must hold.
bios.query(
    'born_country == "ITA" '
    'and born_city == "Torino"'
).head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
801,805,Paola Bologna,1898-08-20,Torino,Torino,ITA,Italy,,,1960-01-13
1555,1562,Dante Carbini,1973-08-09,Torino,Torino,ITA,Italy,188.0,86.0,
5512,5533,Rosanna Vergnano,1954-05-21,Torino,Torino,ITA,Italy,172.0,64.0,
10088,10144,Maria Cristina Giai Pron,1974-08-21,Torino,Torino,ITA,Italy,168.0,60.0,
10158,10214,Eligio Valentino,1925-07-19,Torino,Torino,ITA,Italy,,,2012-10-10


In [56]:
# Lower-case both columns inside the expression so the match ignores case.
bios.query('born_country.str.lower() == "ita" and born_city.str.lower() == "torino"').head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
801,805,Paola Bologna,1898-08-20,Torino,Torino,ITA,Italy,,,1960-01-13
1555,1562,Dante Carbini,1973-08-09,Torino,Torino,ITA,Italy,188.0,86.0,
5512,5533,Rosanna Vergnano,1954-05-21,Torino,Torino,ITA,Italy,172.0,64.0,
10088,10144,Maria Cristina Giai Pron,1974-08-21,Torino,Torino,ITA,Italy,168.0,60.0,
10158,10214,Eligio Valentino,1925-07-19,Torino,Torino,ITA,Italy,,,2012-10-10


## using .isin() inside .query()

In [60]:
# Lower-cased country codes to match against; `@states` inside the query
# string refers to this Python variable.
states = 'ITA FRA USA'.lower().split()
bios.query(
    'born_country.str.lower().isin(@states)'
).sample(5, random_state=42)   # fixed seed so the sampled rows are reproducible

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
58481,58899,Dante Ceccatelli,1895-07-14,Prato,Prato,ITA,Italy,,,1936-03-13
48153,48506,Ezio Della Savia,1942-06-24,Cormons,Gorizia,ITA,Italy,185.0,92.0,2021-09-05
90592,91315,Anne-Laure Klein,1983-11-14,Calais,Pas-de-Calais,FRA,France,171.0,53.0,
145339,149061,Katie Hensien,1999-12-01,Redmond,Washington,USA,United States,,,
36640,36933,Clément Dorlia,1870-12-07,Paris XIIIe,Paris,FRA,France,,,1942-01-23
