In [37]:
from io import StringIO

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

In [5]:
# Small in-memory CSV sample with two missing fields: the second data row
# leaves column C empty and the third data row leaves column D empty.
csv_data = """A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,"""
print(csv_data)

A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,


In [7]:
# Parse the CSV text into a DataFrame. StringIO lets read_csv treat the
# in-memory string as if it were a file.
#
# The fields that were empty in csv_data show up as NaN in the output below.
# NaN is a floating-point "not a number" marker for a missing value, not the
# string "NaN".
df = pd.read_csv(StringIO(csv_data))

# Keep the frame as the cell's last expression so that it is what gets displayed.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [12]:
# Count the missing (NaN) entries per column: isna() flags each missing cell
# as True, and summing the booleans column-wise yields the counts.
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [13]:
# The simplest way to handle missing values is to discard the affected
# rows or features (columns) from the dataset altogether.

# Drop every row that contains at least one NaN value
df.dropna(axis="index", how="any")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [14]:
# Likewise, drop every feature (column) that contains at least one NaN value
df.dropna(axis="columns", how="any")

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [15]:
# dropna() accepts further parameters that refine what gets dropped.

# how="all": drop a row only when every one of its columns is NaN
df.dropna(axis="index", how="all")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [19]:
# thresh=4: keep only the rows that have at least four non-NaN values,
# i.e. drop every row with fewer than four
df.dropna(axis="index", thresh=4)

     A    B    C    D
0  1.0  2.0  3.0  4.0


In [20]:
# thresh=3: keep the rows that have at least three non-NaN values.
# Every row of df has three or more, so nothing is dropped here.
df.dropna(axis="index", thresh=3)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [38]:
# subset: only look at the listed columns when deciding what to drop;
# here a row is dropped only if its value in column "D" is NaN
df.dropna(axis="index", subset=["D"])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0


In [None]:
'''
    Handling missing values by removing features or training examples 
    might seem like a convenient approach. However, with this technique we
    might end up removing too many training examples, shrinking our
    training data to the point where reliable analysis becomes impossible.
    Or, if we remove features, we might end up removing valuable predictors
    that our classifier needs in order to discriminate between classes.
'''

In [47]:
# Instead of discarding data, missing values can be estimated from the
# observed ones (imputation). Mean imputation, one of the most common
# techniques, replaces each NaN with the mean of its column.

# SimpleImputer is imported in the notebook's top import cell.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit_transform learns the per-column means from the data and fills the
# NaN entries in a single step; the result is a NumPy array, not a DataFrame.
imputed_data = imp.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [49]:
# Other supported values for `strategy` include 'median' and 'most_frequent'.
# 'most_frequent' fills each NaN with the most frequent value of its column;
# per the scikit-learn docs, ties are resolved in favour of the smallest value.
imp = SimpleImputer(strategy="most_frequent", missing_values=np.nan)
imputed_data = imp.fit_transform(df.to_numpy())
imputed_data

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6.,  3.,  8.],
       [10., 11., 12.,  4.]])

In [50]:
# pandas can do mean imputation on its own: df.mean() yields one mean per
# column, and fillna() uses those values to fill the NaNs column by column.
df.fillna(value=df.mean(axis="index"))

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0
