## Import Libraries

In [1]:
import pandas as pd
import numpy as np

## Creating dataframe

In [2]:
# Five sample records. Row 1 is missing entirely and row 2 has no test
# scores, so the frame exercises both fully-empty and partially-empty rows.
raw_data = {
    'first_name': ['Jason', np.nan, 'Tina', 'Jake', 'Amy'],
    'last_name': ['Miller', np.nan, 'Ali', 'Milner', 'Cooze'],
    'age': [42, np.nan, 36, 24, 73],
    'sex': ['m', np.nan, 'f', 'm', 'f'],
    'preTestScore': [4, np.nan, np.nan, 2, 3],
    'postTestScore': [25, np.nan, np.nan, 62, 70],
}

# Column order follows the insertion order of the dict keys above.
df = pd.DataFrame(raw_data, columns=list(raw_data))
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


## Dropping rows with missing values

In [4]:
# Keep only the rows in which every column has a value; dropna returns a
# new frame, so df itself is left untouched.
df_no_missing = df.dropna(axis='index', how='any')
df_no_missing

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42,m,4,25
3,Jake,Milner,24,m,2,62
4,Amy,Cooze,73,f,3,70


## Drop rows where every value is NaN

In [5]:
# Discard a row only when all of its columns are NaN; partially filled
# rows are kept.
df_cleaned = df.dropna(axis='index', how='all')
df_cleaned

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42,m,4.0,25.0
2,Tina,Ali,36,f,,
3,Jake,Milner,24,m,2.0,62.0
4,Amy,Cooze,73,f,3.0,70.0


## Creating a new column full of missing values

In [6]:
# Broadcast NaN down a 'location' column (created if absent); this modifies
# df in place, and the cells below rely on the column being present.
df.loc[:, 'location'] = np.nan
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4.0,25.0,
1,,,,,,,
2,Tina,Ali,36.0,f,,,
3,Jake,Milner,24.0,m,2.0,62.0,
4,Amy,Cooze,73.0,f,3.0,70.0,


## Drop columns that contain only missing values


In [7]:
# Remove every column whose values are all NaN (here: 'location').
# The result is only displayed; df is not reassigned.
df.dropna(how='all', axis='columns')

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore
0,Jason,Miller,42.0,m,4.0,25.0
1,,,,,,
2,Tina,Ali,36.0,f,,
3,Jake,Milner,24.0,m,2.0,62.0
4,Amy,Cooze,73.0,f,3.0,70.0


## Drop rows that contain fewer than five observations


In [8]:
# count(axis='columns') gives the number of non-NaN values in each row;
# keep the rows that have at least five of them.
df.loc[df.count(axis='columns') >= 5]

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42,m,4,25,
3,Jake,Milner,24,m,2,62,
4,Amy,Cooze,73,f,3,70,


## Fill in missing data with zeros

In [9]:
# Show a copy of df with every NaN, in every column, replaced by 0.
# df itself is not changed because the result is not assigned back.
df.fillna(value=0)

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42,m,4,25,0
1,0,0,0,0,0,0,0
2,Tina,Ali,36,f,0,0,0
3,Jake,Milner,24,m,2,62,0
4,Amy,Cooze,73,f,3,70,0


## Fill in missing values in preTestScore with the mean of preTestScore

In [10]:
# Assign the filled Series back to the column so the change is stored in df.
# Calling fillna(..., inplace=True) on df["preTestScore"] is chained
# assignment: it acts on an intermediate Series, which pandas warns about
# (2.2) and, under Copy-on-Write, never propagates back to df.
# Series.mean() skips NaN, so the mean is taken over the observed scores only.
df["preTestScore"] = df["preTestScore"].fillna(df["preTestScore"].mean())
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4,25.0,
1,,,,,3,,
2,Tina,Ali,36.0,f,3,,
3,Jake,Milner,24.0,m,2,62.0,
4,Amy,Cooze,73.0,f,3,70.0,


## Fill in missing values in postTestScore with each sex's mean postTestScore



In [11]:
# transform("mean") yields, for every row, the mean postTestScore of that
# row's sex group, aligned on df's index, so it can be used directly as the
# fill values. Rows whose sex is NaN belong to no group and stay NaN.
# The result is assigned back to the column rather than using
# fillna(..., inplace=True) on df["postTestScore"], which is chained
# assignment and does not update df under pandas Copy-on-Write.
df["postTestScore"] = df["postTestScore"].fillna(
    df.groupby("sex")["postTestScore"].transform("mean")
)
df

Unnamed: 0,first_name,last_name,age,sex,preTestScore,postTestScore,location
0,Jason,Miller,42.0,m,4,25.0,
1,,,,,3,,
2,Tina,Ali,36.0,f,3,70.0,
3,Jake,Milner,24.0,m,2,62.0,
4,Amy,Cooze,73.0,f,3,70.0,


## Select rows while ignoring those with missing data points

In [None]:
# Keep the rows of df in which both age and sex are present; a row is
# dropped as soon as either of the two columns is NaN.
df.dropna(subset=['age', 'sex'])