
## Técnicas de filtrado de datos con Pandas

In [123]:
import pandas as pd

In [124]:
# Load the NBA roster from the CSV file next to the notebook and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="nba.csv")
df.head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
2,John Holland,Boston Celtics,30.0,SG,27.0,6-5,205.0,Boston University,
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
4,Jonas Jerebko,Boston Celtics,8.0,PF,29.0,6-10,231.0,,5000000.0


In [125]:
# Drop, in place, every row that has at least one missing value
# (axis=0 and how="any" are the defaults, spelled out here for clarity).
df.dropna(axis=0, how="any", inplace=True)

In [126]:
# Count the remaining missing values per column (isna is the same method as isnull).
df.isna().sum()

Name        0
Team        0
Number      0
Position    0
Age         0
Height      0
Weight      0
College     0
Salary      0
dtype: int64

### 1.Usando [] para filtrar datos

Por ejemplo, para filtrar todos los datos donde la edad es mayor que su valor promedio y luego ordenar los resultados en orden descendente por edad, podemos hacer lo siguiente:

In [127]:
# Keep the players older than the average age, order them from oldest to
# youngest and show the first five.
(
    df.loc[lambda frame: frame["Age"] > frame["Age"].mean()]
    .sort_values(by="Age", ascending=False)
    .head()
)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
304,Andre Miller,San Antonio Spurs,24.0,PG,40.0,6-3,200.0,Utah,250750.0
298,Tim Duncan,San Antonio Spurs,21.0,C,40.0,6-11,250.0,Wake Forest,5250000.0
261,Vince Carter,Memphis Grizzlies,15.0,SG,39.0,6-6,220.0,North Carolina,4088019.0
256,Jason Terry,Houston Rockets,31.0,SG,38.0,6-2,185.0,Arizona,947276.0
101,Paul Pierce,Los Angeles Clippers,34.0,SF,38.0,6-7,235.0,Kansas,3376000.0


También podemos combinar condiciones con operadores lógicos como & y |. Por ejemplo, en el siguiente caso, además de la condición anterior, agregamos la condición de que el salario debe ser mayor a 250K. Hay que tener en cuenta que cada condición combinada con un operador lógico debe ir entre paréntesis.

In [128]:
# Two conditions joined with &: older than the average age AND salary above 250000.
# Each comparison is wrapped in parentheses because & binds tighter than > in Python.
(
    df.loc[lambda frame: (frame["Age"] > frame["Age"].mean()) & (frame["Salary"] > 250000)]
    .sort_values(by="Salary", ascending=False)
    .head()
)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
33,Carmelo Anthony,New York Knicks,7.0,SF,32.0,6-8,240.0,Syracuse,22875000.0
339,Chris Bosh,Miami Heat,1.0,PF,32.0,6-11,235.0,Georgia Tech,22192730.0
100,Chris Paul,Los Angeles Clippers,3.0,PG,31.0,6-0,175.0,Wake Forest,21468695.0
414,Kevin Durant,Oklahoma City Thunder,35.0,SF,27.0,6-9,240.0,Texas,20158622.0
164,Derrick Rose,Chicago Bulls,1.0,PG,27.0,6-3,190.0,Memphis,20093064.0


### 2.Usando loc/iloc 

loc accede a los datos por etiqueta (nombres de columnas y etiquetas del índice de filas) o mediante una máscara booleana, mientras que iloc accede a los datos por posición entera (0, 1, 2, …), sin importar las etiquetas del índice.

In [129]:
# Boolean mask passed to loc: Chicago Bulls players younger than 25.
df.loc[df["Team"].eq("Chicago Bulls") & df["Age"].lt(25)]

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
159,Doug McDermott,Chicago Bulls,3.0,SF,24.0,6-8,225.0,Creighton,2380440.0
163,Bobby Portis,Chicago Bulls,5.0,PF,21.0,6-11,230.0,Arkansas,1391160.0
165,Tony Snell,Chicago Bulls,20.0,SF,24.0,6-7,200.0,New Mexico,1535880.0


In [130]:
# Row at integer position 3 (the fourth row), all columns. Its index label is 6,
# not 3, because dropna removed earlier rows without renumbering the index.
df.iloc[3, :]

Name         Jordan Mickey
Team        Boston Celtics
Number                55.0
Position                PF
Age                   21.0
Height                 6-8
Weight               235.0
College                LSU
Salary           1170960.0
Name: 6, dtype: object

In [131]:
# Remember: an iloc slice excludes its stop position, so this returns positions 0, 1 and 2.
df.iloc[:3]

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0


### 3.Usando isin 

A veces necesitamos apuntar a valores específicos. En tales casos, utilizamos isin. Por ejemplo, si queremos restringir los valores de la columna Position a 'PG' y 'SG':

In [132]:
# Keep only the guards (Position is 'PG' or 'SG') and show five random rows.
# The fixed random_state makes the sample identical on every run; without it
# the displayed rows change each time the cell is executed.
df.loc[df['Position'].isin(['PG','SG']),:].sample(5, random_state=42)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
103,JJ Redick,Los Angeles Clippers,4.0,SG,31.0,6-4,190.0,Duke,7085000.0
307,Jonathon Simmons,San Antonio Spurs,17.0,SG,26.0,6-6,195.0,Houston,525093.0
214,Jared Cunningham,Milwaukee Bucks,9.0,SG,25.0,6-4,195.0,Oregon State,947276.0
213,Michael Carter-Williams,Milwaukee Bucks,5.0,PG,24.0,6-6,190.0,Syracuse,2399040.0
161,E'Twaun Moore,Chicago Bulls,55.0,SG,27.0,6-4,191.0,Purdue,1015421.0


In [133]:
# Boolean mask that is True for players of either of the two selected teams.
condicion = df["Team"].isin(
    [
        "Chicago Bulls",
        "Utah Jazz",
    ]
)

In [134]:
# Apply the team mask and preview the first five matching rows.
df.loc[condicion].head(n=5)

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
151,Cameron Bairstow,Chicago Bulls,41.0,PF,25.0,6-9,250.0,New Mexico,845059.0
152,Aaron Brooks,Chicago Bulls,0.0,PG,31.0,6-0,161.0,Oregon,2250000.0
153,Jimmy Butler,Chicago Bulls,21.0,SG,26.0,6-7,220.0,Marquette,16407500.0
154,Mike Dunleavy,Chicago Bulls,34.0,SG,35.0,6-9,230.0,Duke,4500000.0
157,Taj Gibson,Chicago Bulls,22.0,PF,30.0,6-9,225.0,USC,8500000.0


### 4.Usando str.contains

In [135]:
# Case-insensitive match of "boston" inside the team name;
# na=False makes missing team names count as non-matches.
condicion2 = df["Team"].str.contains("boston", case=False, na=False)
df.loc[condicion2].head()

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0


### 5.Función where

In [136]:
# where() keeps the frame's shape: rows that fail the condition (Age < 30)
# are not removed, their values are replaced with NaN.
df.where(cond=df["Age"].lt(30))


Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
0,Avery Bradley,Boston Celtics,0.0,PG,25.0,6-2,180.0,Texas,7730337.0
1,Jae Crowder,Boston Celtics,99.0,SF,25.0,6-6,235.0,Marquette,6796117.0
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


### 6.Función query

In [137]:
# query() takes the filter as a string expression that refers to columns by name.
df.query(expr='Salary < 5000000')

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
3,R.J. Hunter,Boston Celtics,28.0,SG,22.0,6-5,185.0,Georgia State,1148640.0
6,Jordan Mickey,Boston Celtics,55.0,PF,21.0,6-8,235.0,LSU,1170960.0
7,Kelly Olynyk,Boston Celtics,41.0,C,25.0,7-0,238.0,Gonzaga,2165160.0
8,Terry Rozier,Boston Celtics,12.0,PG,22.0,6-2,190.0,Louisville,1824360.0
9,Marcus Smart,Boston Celtics,36.0,PG,22.0,6-4,220.0,Oklahoma State,3431040.0
...,...,...,...,...,...,...,...,...,...
449,Rodney Hood,Utah Jazz,5.0,SG,23.0,6-8,206.0,Duke,1348440.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
452,Trey Lyles,Utah Jazz,41.0,PF,20.0,6-10,234.0,Kentucky,2239800.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


In [138]:
# Two conditions in one query string. Inside query() comparisons bind tighter
# than &, so the individual conditions need no parentheses here.
df.query(expr="Salary < 5000000 & Age > 25")

Unnamed: 0,Name,Team,Number,Position,Age,Height,Weight,College,Salary
12,Evan Turner,Boston Celtics,11.0,SG,27.0,6-7,220.0,Ohio State,3425510.0
14,Tyler Zeller,Boston Celtics,44.0,C,26.0,7-0,253.0,North Carolina,2616975.0
17,Wayne Ellington,Brooklyn Nets,21.0,SG,28.0,6-4,200.0,North Carolina,1500000.0
21,Sean Kilpatrick,Brooklyn Nets,6.0,SG,26.0,6-4,219.0,Cincinnati,134215.0
25,Willie Reed,Brooklyn Nets,33.0,PF,26.0,6-10,220.0,Saint Louis,947276.0
...,...,...,...,...,...,...,...,...,...
440,Brian Roberts,Portland Trail Blazers,2.0,PG,30.0,6-1,173.0,Dayton,2854940.0
442,Trevor Booker,Utah Jazz,33.0,PF,28.0,6-8,228.0,Clemson,4775000.0
451,Chris Johnson,Utah Jazz,23.0,SF,26.0,6-6,206.0,Dayton,981348.0
453,Shelvin Mack,Utah Jazz,8.0,PG,26.0,6-3,203.0,Butler,2433333.0


### by María Belén Camandone