In [61]:
import pandas as pd
import numpy as np

# Filtering data

In [62]:
# Load the athlete biography table from the CSV file under ./data.
bios = pd.read_csv(
    './data/bios.csv',
)

# Preview the first five rows to check the columns that were read.
bios.head(n=5)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
0,1,Jean-François Blanchy,1886-12-12,Bordeaux,Gironde,FRA,France,,,1960-10-02
1,2,Arnaud Boetsch,1969-04-01,Meulan,Yvelines,FRA,France,183.0,76.0,
2,3,Jean Borotra,1898-08-13,Biarritz,Pyrénées-Atlantiques,FRA,France,183.0,76.0,1994-07-17
3,4,Jacques Brugnon,1895-05-11,Paris VIIIe,Paris,FRA,France,168.0,64.0,1978-03-20
4,5,Albert Canet,1878-04-17,Wandsworth,England,GBR,France,,,1930-07-25


## Create a pandas Series of boolean values for the condition

In [63]:
# Boolean mask with one entry per row of `bios`: True where height_cm exceeds 215.
is_tall = bios['height_cm'].gt(215)

# Show what kind of object the comparison produced and how many entries it has.
display(type(is_tall), len(is_tall))

# Index the mask with itself to keep only the entries that are True.
true_is_tall = is_tall.loc[is_tall]
display(len(true_is_tall))

# Preview the first few True entries (the index labels are the matching rows).
true_is_tall.head()

pandas.core.series.Series

145500

35

5089    True
5583    True
5673    True
5716    True
5781    True
Name: height_cm, dtype: bool

## Use the pandas Series to filter the DataFrame

In [64]:
# Number of rows selected by the `is_tall` mask. A bare expression in the
# middle of a cell is evaluated and thrown away, so it must go through
# display() to actually appear in the output.
display(len(bios[is_tall]))

# Filter the DataFrame with the precomputed boolean Series ...
display(bios[is_tall].head())

# ... or build the same mask inline; both forms select the same rows.
bios[bios['height_cm'] > 215].head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5089,5108,Viktor Pankrashkin,1957-06-19,Moskva (Moscow),Moskva,RUS,Soviet Union,220.0,112.0,1993-07-24
5583,5606,Paulinho Villas Boas,1963-01-26,São Paulo,São Paulo,BRA,Brazil,217.0,106.0,
5673,5696,Gunther Behnke,1963-01-19,Leverkusen,Nordrhein-Westfalen,GER,Germany,221.0,114.0,
5716,5739,Uwe Blab,1962-03-26,München (Munich),Bayern,GER,Germany West Germany,218.0,110.0,
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,


Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
5089,5108,Viktor Pankrashkin,1957-06-19,Moskva (Moscow),Moskva,RUS,Soviet Union,220.0,112.0,1993-07-24
5583,5606,Paulinho Villas Boas,1963-01-26,São Paulo,São Paulo,BRA,Brazil,217.0,106.0,
5673,5696,Gunther Behnke,1963-01-19,Leverkusen,Nordrhein-Westfalen,GER,Germany,221.0,114.0,
5716,5739,Uwe Blab,1962-03-26,München (Munich),Bayern,GER,Germany West Germany,218.0,110.0,
5781,5804,Tommy Burleson,1952-02-24,Crossnore,North Carolina,USA,United States,223.0,102.0,


## Filter the DataFrame using `.loc`

In [65]:
# .loc takes the row filter first and the list of output columns second,
# so rows and columns are selected in a single indexing step.
(
    bios
    .loc[bios['height_cm'].gt(215), ['name', 'height_cm']]
    .head()
)

Unnamed: 0,name,height_cm
5089,Viktor Pankrashkin,220.0
5583,Paulinho Villas Boas,217.0
5673,Gunther Behnke,221.0
5716,Uwe Blab,218.0
5781,Tommy Burleson,223.0


## alternate syntax

In [66]:
# Same selection written as two chained [] lookups instead of one .loc call.
(
    bios[bios['height_cm'].gt(215)]   # first keep the rows matching the mask
    [['name', 'height_cm']]           # then keep only the listed columns
    .head()
)

Unnamed: 0,name,height_cm
5089,Viktor Pankrashkin,220.0
5583,Paulinho Villas Boas,217.0
5673,Gunther Behnke,221.0
5716,Uwe Blab,218.0
5781,Tommy Burleson,223.0


## multiple conditions

In [67]:
# Combine two element-wise conditions with `&`:
#   * height above 215 cm
#   * born_country equal to 'usa', compared case-insensitively via casefold()
# and show only the three columns relevant to the check.
bios.loc[
    bios['height_cm'].gt(215)
    & bios['born_country'].str.casefold().eq('usa'.casefold()),
    ['name', 'height_cm', 'born_country'],
]

Unnamed: 0,name,height_cm,born_country
5781,Tommy Burleson,223.0,USA
6722,Shaquille O'Neal,216.0,USA
6937,David Robinson,216.0,USA
123850,Tyson Chandler,216.0,USA


## string accessor

In [68]:
# Keep rows whose name contains "bob" or "keith" (regex alternation),
# ignoring case. na=False makes a missing name count as "no match", so the
# mask stays strictly boolean; a mask containing NaN cannot be used for
# boolean indexing.
bios[
    bios['name'].str.contains('bob|keith', case=False, na=False)
].head()

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
756,760,Bobby Powell,1881-04-02,Victoria,British Columbia,CAN,Canada,,,1917-04-28
1897,1907,Keith Hanlon,1966-09-01,,,,Ireland,,,
3422,3434,Bobby Kelsey,1938-12-08,London,England,GBR,Great Britain,177.0,62.0,
3505,3517,Keith Wallace,1961-03-29,Preston,England,GBR,Great Britain,165.0,51.0,1999-12-31
3513,3525,Bobby Wells,1961-05-15,London,England,GBR,Great Britain,193.0,91.0,


## .isin()

In [69]:
# Rows whose born_country (lower-cased) is one of usa / ita / fra AND whose
# name starts with "fab" (case-insensitive).
# na=False treats a missing name as a non-match so the mask is purely boolean.
# sample(5) already returns exactly five rows, so no trailing .head() is
# needed; the fixed random_state makes the same rows come back on every
# Restart & Run All.
bios[
    (bios['born_country'].str.lower()
    .isin('usa ita fra'.split())) &
    (bios['name'].str.lower().str.startswith('fab', na=False))
].sample(5, random_state=0)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
33296,33548,Fabien Canu,1960-04-23,Saint-Valery-en-Caux,Seine-Maritime,FRA,France,182.0,87.0,
105036,106066,Fabrice Jeannet,1980-10-20,Fort-de-France,Martinique,FRA,France,193.0,85.0,
140578,144090,Fabrizia Marrone,1996-10-10,Chieti,Chieti,ITA,Italy,160.0,,
71278,71817,Fabia Trabaldo,1972-03-05,Borgosesia,Vercelli,ITA,Italy,168.0,53.0,
37991,38293,Fabrizio Biondi,1954-11-07,Livorno,Livorno,ITA,Italy,181.0,82.0,


## using .query

In [70]:
# .query() takes the filter as an expression string: column names are written
# bare and string values are quoted inside the expression.
(
    bios
    .query('born_country == "ITA" and born_city == "Torino"')
    .head()
)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
801,805,Paola Bologna,1898-08-20,Torino,Torino,ITA,Italy,,,1960-01-13
1555,1562,Dante Carbini,1973-08-09,Torino,Torino,ITA,Italy,188.0,86.0,
5512,5533,Rosanna Vergnano,1954-05-21,Torino,Torino,ITA,Italy,172.0,64.0,
10088,10144,Maria Cristina Giai Pron,1974-08-21,Torino,Torino,ITA,Italy,168.0,60.0,
10158,10214,Eligio Valentino,1925-07-19,Torino,Torino,ITA,Italy,,,2012-10-10


In [71]:
# Same filter as a case-insensitive match: both columns are lower-cased with
# the .str accessor inside the query expression before being compared.
# The expression is split across two adjacent string literals for readability.
(
    bios
    .query(
        'born_country.str.lower() == "ita" '
        'and born_city.str.lower() == "torino"'
    )
    .head()
)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
801,805,Paola Bologna,1898-08-20,Torino,Torino,ITA,Italy,,,1960-01-13
1555,1562,Dante Carbini,1973-08-09,Torino,Torino,ITA,Italy,188.0,86.0,
5512,5533,Rosanna Vergnano,1954-05-21,Torino,Torino,ITA,Italy,172.0,64.0,
10088,10144,Maria Cristina Giai Pron,1974-08-21,Torino,Torino,ITA,Italy,168.0,60.0,
10158,10214,Eligio Valentino,1925-07-19,Torino,Torino,ITA,Italy,,,2012-10-10


## using .isin() inside .query()

In [72]:
# Lower-cased country codes to match against: ['ita', 'fra', 'usa'].
states = 'ITA FRA USA'.lower().split()

# `@states` refers to the Python variable above from inside the query string.
# A fixed random_state makes the five sampled rows identical on every run,
# so the displayed output is reproducible.
bios.query(
    'born_country.str.lower().isin(@states)'
).sample(5, random_state=0)

Unnamed: 0,athlete_id,name,born_date,born_city,born_region,born_country,NOC,height_cm,weight_kg,died_date
67924,68439,Robert Paul,1910-04-20,Facture-Biganos,Gironde,FRA,France,170.0,65.0,1998-12-15
7256,7294,Gabriele Vianello,1938-05-06,Venezia,Venezia,ITA,Italy,191.0,83.0,
129403,132019,Brianna Rollins,1991-08-18,Miami,Florida,USA,United States,165.0,59.0,
77200,77791,Gwyn Coogan,1965-08-21,Trenton,New Jersey,USA,United States,155.0,50.0,
67850,68364,Louis Lesca,1887-02-05,Bordeaux,Gironde,FRA,France,176.0,,1974-04-30
