In [36]:
import pandas as pd
from io import StringIO
import numpy as np
from sklearn.impute import SimpleImputer

In [9]:
csv_data = \
'''A,B,C,D
1.0, 2.0, 3.0, 4.0
5.0, 6.0,, 8.0
10.0, 11.0, 12.0,
'''
# The backslash '\' continues the statement on the next line.
# The newline character \n is automatically included in the triple-quoted string; run print(repr(csv_data)) to see it.
# An empty field (two commas with nothing between them, or a trailing comma) is read as NaN by pandas. DO NOT add a space between the commas!

In [13]:
# repr() shows the raw string with escape sequences (e.g. \n) written out explicitly
print(repr(csv_data))

'A,B,C,D\n1.0, 2.0, 3.0, 4.0\n5.0, 6.0,, 8.0\n10.0, 11.0, 12.0,\n'


In [10]:
# StringIO wraps the CSV text in an in-memory, file-like object,
# so pd.read_csv can consume it exactly as it would a real csv file.
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))

In [11]:
# Display the parsed DataFrame; the empty CSV fields show up as NaN
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [16]:
# Boolean mask of the same shape as df: True wherever a value is missing
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [18]:
# Number of missing values in each column (True counts as 1 when summed)
df.isna().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [19]:
# Number of missing values in each row: sum the mask across the columns
df.isna().sum(axis='columns')

0    0
1    1
2    1
dtype: int64

In [21]:
df.values
# .values exposes the data of the pandas DataFrame as a NumPy array
# sklearn was originally built around NumPy arrays, so NumPy arrays are the recommended input for sklearn

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [22]:
df.dropna()
# Eliminate rows with missing data (default axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [23]:
df.dropna(axis=1)
# Eliminate columns with missing data

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [25]:
# A second sample whose last record consists solely of empty fields
csv_data1 = (
    'A,B,C,D\n'
    '1.0, 2.0, 3.0, 4.0\n'
    '5.0, 6.0,, 8.0\n'
    ',,,\n'
)
df1 = pd.read_csv(StringIO(csv_data1))
df1

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,,,,


In [28]:
df1.dropna(how='all')
# With the default axis=0, this drops only the rows in which every entry is NaN
# (pass axis=1 to drop all-NaN columns instead)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0


In [32]:
df.dropna(thresh=4)
# Drop rows that have fewer than 4 non-NaN values

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [33]:
df.dropna(subset=['C'])
# Drop only the rows where NaN appears in column C

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [37]:
# Mean imputation with scikit-learn
# fit() learns the statistics from the data and returns the imputer itself,
# so construction and fitting can be chained; transform() then fills the NaNs.
imr = SimpleImputer(strategy='mean', missing_values=np.nan).fit(df.values)
imputed_data = imr.transform(df.values)
imputed_data
# Each NaN is replaced by the mean of its column (not of its row).

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [40]:
# SimpleImputer belongs to transformer API in sklearn
# The two essential functions of all the methods in the API are fit and transform
# fit: learn parameters from the data
# transform: transform a data set with learned parameters. The data set must have the same number of features as the set used to fit the model.

# Transformers are similar to the models in Ch3 (linear regression, logistic regression, decision tree, ...).
# But the latter also have a 'predict' method
# The latter are also called 'estimators' in sklearn
# A key difference is that we need to provide labels as well as feature values to fit an estimator. 
# To fit a transformer, we only need to provide feature values.

In [38]:
# Same mean imputation done directly in pandas:
# df.mean() gives one mean per column, and fillna matches them to the columns by label
df.fillna(value=df.mean())

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0


In [41]:
# Final note: The missing data can be replaced by the means, or based on k-nearest neighbors…