In [1]:
import pandas as pd

In [2]:
# Sometimes our datasets contain a lot of missing values,
# and we don't want to keep them.

In [6]:
# Read a dataset that has missing values, using read_csv's default separator.
df = pd.read_csv('random_data_with_missing_values_semicolon.csv')
df
# As the output shows, the frame is totally messed up: every row ended up in a
# single column, because the file is separated by ';' instead of ','.

Unnamed: 0,Column1;Column2;Column3
0,13.0;0.6149270458154269;
1,;0.9166875083842575;Value
2,35.0;;
3,65.0;0.9261219701329632;Value
4,;0.4024318263697635;Value


In [8]:
# To fix the load above, tell read_csv which separator the file actually uses.
df = pd.read_csv("random_data_with_missing_values_semicolon.csv", sep=';')
df
# Ta-da, fixed: the data is now split into Column1, Column2 and Column3,
# and the empty fields show up as missing values.

Unnamed: 0,Column1,Column2,Column3
0,13.0,0.614927,
1,,0.916688,Value
2,35.0,,
3,65.0,0.926122,Value
4,,0.402432,Value


In [10]:
# Now let's try to remove these missing values.

# With no arguments, dropna() removes every row that contains at least one NaN.
# Here only row 3 has no missing value, so it is the only row that survives.
df.dropna() # dangerous: a single NaN anywhere in a row is enough to drop the whole row.

Unnamed: 0,Column1,Column2,Column3
3,65.0,0.926122,Value


In [11]:
# We can also specify the axis to drop along.
# The default is axis=0, which drops rows; axis=1 drops columns instead.
# Every column in this frame contains at least one NaN, so all three columns
# are removed and only the index (0-4) is left in the output.
df.dropna(axis=1) # removes each column that contains a NaN value.

0
1
2
3
4


In [13]:
# We can also specify how strict the drop condition is with the `how` parameter.
# how='any' is the default: a row/column is dropped if it contains any missing value.
# how='all' drops a row/column only when all of its values are NaN.
# With axis=1 the check is applied per column; no column here is entirely NaN,
# so nothing is dropped and the output is identical to the original frame.
df.dropna(axis=1, how='all')

Unnamed: 0,Column1,Column2,Column3
0,13.0,0.614927,
1,,0.916688,Value
2,35.0,,
3,65.0,0.926122,Value
4,,0.402432,Value


In [18]:
# dropna also has a `subset` parameter that limits the NaN check to the listed columns.
# Column3 has some missing values, and we want to get rid of only the rows
# where the Column3 value is missing; NaNs in the other columns are ignored,
# which is why rows 1 and 4 are kept even though their Column1 is NaN.
# subset expects a list of column names.
df.dropna(axis=0, how='any', subset=['Column3'])

Unnamed: 0,Column1,Column2,Column3
1,,0.916688,Value
3,65.0,0.926122,Value
4,,0.402432,Value
