# `.dropna()` to drop NULL values

### Import Pandas

In [1]:
import pandas as pd

### U.S. Major League Soccer Salaries

In [2]:
salaries = pd.read_csv("mls_salaries.csv")

### Remove NaN values

In [3]:
# return the DataFrame
salaries

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.00
1,ATL,Ambrose,Mikey,D,65625.0,65625.00
2,ATL,Asad,Yamil,M,150000.0,150000.00
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.00
...,...,...,...,...,...,...
610,VAN,Teibert,Russell,M,126500.0,194000.00
611,VAN,Tornaghi,Paolo,GK,80000.0,80000.00
612,VAN,Waston,Kendall,D,350000.0,368125.00
613,,,,,,


### Return information about the DataFrame including Non-Null Count

In [4]:
# compare each column's Non-Null Count with the total number of entries:
# a lower count means that column contains missing (NaN) values
salaries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 615 entries, 0 to 614
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   club                     614 non-null    object 
 1   last_name                614 non-null    object 
 2   first_name               610 non-null    object 
 3   position                 604 non-null    object 
 4   base_salary              614 non-null    float64
 5   guaranteed_compensation  614 non-null    float64
dtypes: float64(2), object(4)
memory usage: 29.0+ KB


### Drop any rows with missing values

In [5]:
# drop any row that contains a NaN value by default (how='any')
# inplace=True modifies salaries directly instead of returning a new DataFrame
salaries.dropna(inplace=True)

### Return information about the DataFrame including Non-Null Count

In [6]:
# after dropping, every column's Non-Null Count should equal the number of entries
salaries.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 600 entries, 0 to 614
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   club                     600 non-null    object 
 1   last_name                600 non-null    object 
 2   first_name               600 non-null    object 
 3   position                 600 non-null    object 
 4   base_salary              600 non-null    float64
 5   guaranteed_compensation  600 non-null    float64
dtypes: float64(2), object(4)
memory usage: 32.8+ KB


### Return DataFrame without null values

In [7]:
# return the DataFrame with no NaN values
salaries

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.00
1,ATL,Ambrose,Mikey,D,65625.0,65625.00
2,ATL,Asad,Yamil,M,150000.0,150000.00
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.00
...,...,...,...,...,...,...
609,VAN,Techera,Cristian,M,352000.0,377000.00
610,VAN,Teibert,Russell,M,126500.0,194000.00
611,VAN,Tornaghi,Paolo,GK,80000.0,80000.00
612,VAN,Waston,Kendall,D,350000.0,368125.00


### Reset dataset

In [8]:
# reset by re-reading original dataset
salaries = pd.read_csv("mls_salaries.csv")

### Remove rows with NaN values in specific columns

In [9]:
# first, return a column as a Series from the DataFrame
salaries["club"]

0      ATL
1      ATL
2      ATL
3      ATL
4      ATL
      ... 
610    VAN
611    VAN
612    VAN
613    NaN
614    VAN
Name: club, Length: 615, dtype: object

In [10]:
# you can also use a list to specify which columns to target using subset=
# with the default how='any', a row is dropped if it has NaN in any of the listed columns;
# NaN values in columns outside the subset are ignored
salaries.dropna(subset=["club","base_salary"], inplace=True)

In [11]:
# no NaN values remain in the club column
# the index is not renumbered, so the dropped row's label is simply missing from the index
salaries["club"]

0      ATL
1      ATL
2      ATL
3      ATL
4      ATL
      ... 
609    VAN
610    VAN
611    VAN
612    VAN
614    VAN
Name: club, Length: 614, dtype: object

### Reset dataset

In [16]:
# reset by re-reading original dataset
salaries = pd.read_csv("mls_salaries.csv")

In [17]:
salaries

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.00
1,ATL,Ambrose,Mikey,D,65625.0,65625.00
2,ATL,Asad,Yamil,M,150000.0,150000.00
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.00
...,...,...,...,...,...,...
610,VAN,Teibert,Russell,M,126500.0,194000.00
611,VAN,Tornaghi,Paolo,GK,80000.0,80000.00
612,VAN,Waston,Kendall,D,350000.0,368125.00
613,,,,,,


In [18]:
# drop only rows where every value is null (how='all')
# without inplace=True this returns a new DataFrame and leaves salaries itself unchanged
salaries.dropna(how="all")

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.00
1,ATL,Ambrose,Mikey,D,65625.0,65625.00
2,ATL,Asad,Yamil,M,150000.0,150000.00
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.00
...,...,...,...,...,...,...
609,VAN,Techera,Cristian,M,352000.0,377000.00
610,VAN,Teibert,Russell,M,126500.0,194000.00
611,VAN,Tornaghi,Paolo,GK,80000.0,80000.00
612,VAN,Waston,Kendall,D,350000.0,368125.00


In [19]:
# salaries itself is unchanged: the all-NaN row is still present
salaries

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.00
1,ATL,Ambrose,Mikey,D,65625.0,65625.00
2,ATL,Asad,Yamil,M,150000.0,150000.00
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.00
...,...,...,...,...,...,...
610,VAN,Teibert,Russell,M,126500.0,194000.00
611,VAN,Tornaghi,Paolo,GK,80000.0,80000.00
612,VAN,Waston,Kendall,D,350000.0,368125.00
613,,,,,,


In [20]:
# make drop permanent with inplace
salaries.dropna(how="all", inplace=True)

In [21]:
# it is now permanent
salaries

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.00
1,ATL,Ambrose,Mikey,D,65625.0,65625.00
2,ATL,Asad,Yamil,M,150000.0,150000.00
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.00
...,...,...,...,...,...,...
609,VAN,Techera,Cristian,M,352000.0,377000.00
610,VAN,Teibert,Russell,M,126500.0,194000.00
611,VAN,Tornaghi,Paolo,GK,80000.0,80000.00
612,VAN,Waston,Kendall,D,350000.0,368125.00


### Reset dataset

In [33]:
# reset by re-reading original dataset
salaries = pd.read_csv("mls_salaries.csv")

In [34]:
salaries

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.00
1,ATL,Ambrose,Mikey,D,65625.0,65625.00
2,ATL,Asad,Yamil,M,150000.0,150000.00
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.00
...,...,...,...,...,...,...
610,VAN,Teibert,Russell,M,126500.0,194000.00
611,VAN,Tornaghi,Paolo,GK,80000.0,80000.00
612,VAN,Waston,Kendall,D,350000.0,368125.00
613,,,,,,


In [49]:
# drop every row that contains at least one NaN value, modifying salaries in place
salaries.dropna(how='any', inplace=True)

In [50]:
# drop columns that contain any null values
# remember (how='any') is the default
# axis=0 (or axis='index') checks and drops rows -- this is the default
# axis=1 (or axis='columns') checks and drops columns
# variant: salaries.dropna(axis='columns', how='all') drops only columns that are entirely null
# NOTE: the previous cell already removed every row containing NaN in place,
# so no column has nulls left here and this call drops nothing
salaries.dropna(axis=1)

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.00
1,ATL,Ambrose,Mikey,D,65625.0,65625.00
2,ATL,Asad,Yamil,M,150000.0,150000.00
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.00
...,...,...,...,...,...,...
609,VAN,Techera,Cristian,M,352000.0,377000.00
610,VAN,Teibert,Russell,M,126500.0,194000.00
611,VAN,Tornaghi,Paolo,GK,80000.0,80000.00
612,VAN,Waston,Kendall,D,350000.0,368125.00


### Reset dataset

In [51]:
# reset by re-reading original dataset
salaries = pd.read_csv("mls_salaries.csv")

In [52]:
# drop rows with null values in the base_salary column
# (subset= limits the null check to the listed columns; rows, not columns, are removed)
salaries.dropna(subset=['base_salary'])

Unnamed: 0,club,last_name,first_name,position,base_salary,guaranteed_compensation
0,ATL,Almiron,Miguel,M,1912500.0,2297000.00
1,ATL,Ambrose,Mikey,D,65625.0,65625.00
2,ATL,Asad,Yamil,M,150000.0,150000.00
3,ATL,Bloom,Mark,D,99225.0,106573.89
4,ATL,Carleton,Andrew,F,65000.0,77400.00
...,...,...,...,...,...,...
609,VAN,Techera,Cristian,M,352000.0,377000.00
610,VAN,Teibert,Russell,M,126500.0,194000.00
611,VAN,Tornaghi,Paolo,GK,80000.0,80000.00
612,VAN,Waston,Kendall,D,350000.0,368125.00
