Practice the missing-data filling methods you have learned by filling in the missing values generated by the following code. (Note that you can work on the NumPy arrays if you decide to use functions from scikit-learn, or on the pandas DataFrame if you prefer to handle the missing data with pandas methods.) Here is the code that generates two datasets with missing data.

In [1]:
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.datasets import load_diabetes

# Seeded generator used when injecting missing values further down, so the
# positions of the NaNs are reproducible from run to run.
rng = np.random.RandomState(42)

# Load each dataset as a (features, target) pair.
X_diabetes, y_diabetes = load_diabetes(return_X_y=True)
X_california, y_california = fetch_california_housing(return_X_y=True)

# Keep only the first 400 California housing samples.
X_california, y_california = X_california[:400], y_california[:400]

def add_missing_values(X_full, y_full, missing_rate=0.75, random_state=None):
    """Return copies of the inputs with NaNs injected into the feature matrix.

    A fraction ``missing_rate`` of the rows is selected at random and, in each
    selected row, exactly one randomly chosen feature is replaced by
    ``np.nan``. The inputs themselves are never modified.

    Parameters
    ----------
    X_full : numpy.ndarray of shape (n_samples, n_features)
        Complete feature matrix. Non-floating dtypes are converted to float
        in the returned copy, because NaN cannot be stored in integer arrays.
    y_full : numpy.ndarray
        Targets; returned as an unmodified copy.
    missing_rate : float, default=0.75
        Fraction of rows that receive one missing value, in ``[0, 1]``.
    random_state : None, int or numpy.random.RandomState, default=None
        Source of randomness. ``None`` uses the module-level ``rng`` (the
        original behaviour); an int seeds a fresh ``RandomState``.

    Returns
    -------
    X_missing : numpy.ndarray
        Copy of ``X_full`` containing ``int(n_samples * missing_rate)`` NaNs.
    y_missing : numpy.ndarray
        Copy of ``y_full``.

    Raises
    ------
    ValueError
        If ``missing_rate`` is outside ``[0, 1]``.
    """
    if not 0.0 <= missing_rate <= 1.0:
        raise ValueError(
            f"missing_rate must be between 0 and 1, got {missing_rate!r}"
        )

    if random_state is None:
        # Fall back to the shared generator so existing callers keep drawing
        # from the same random stream as before.
        random_state = rng
    elif isinstance(random_state, (int, np.integer)):
        random_state = np.random.RandomState(random_state)

    n_samples, n_features = X_full.shape
    n_missing_samples = int(n_samples * missing_rate)

    # Boolean row mask with exactly n_missing_samples True entries, shuffled
    # so the affected rows are spread randomly over the dataset.
    missing_samples = np.zeros(n_samples, dtype=bool)
    missing_samples[:n_missing_samples] = True
    random_state.shuffle(missing_samples)

    # One feature (column) index per affected row.
    missing_features = random_state.randint(0, n_features, n_missing_samples)

    X_missing = X_full.copy()
    if not np.issubdtype(X_missing.dtype, np.floating):
        # Integer/boolean arrays cannot hold NaN; promote to float first.
        X_missing = X_missing.astype(float)
    X_missing[missing_samples, missing_features] = np.nan
    y_missing = y_full.copy()

    return X_missing, y_missing

# Build the corrupted versions of both datasets. California is processed
# first, then diabetes.
X_miss_california, y_miss_california = add_missing_values(X_california, y_california)
X_miss_diabetes, y_miss_diabetes = add_missing_values(X_diabetes, y_diabetes)

import pandas as pd
# Wrap the corrupted diabetes features in a DataFrame and append the targets
# as an extra 'target' column, then preview the first five rows.
diabetes_pddata = pd.DataFrame(X_miss_diabetes).assign(
    target=pd.Series(y_miss_diabetes)
)

print(diabetes_pddata.head())

          0         1         2         3         4         5         6  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474       NaN -0.008449 -0.019163  0.074412   
2       NaN  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595       NaN  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596       NaN   

          7         8         9  target  
0 -0.002592  0.019907       NaN   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  


In [2]:
# *Solutions Example
#pandas 
import numpy as np
import pandas as pd
# use diabetes_pddata
# Strategy 1: replace every NaN with the constant 0.
filled_diabetes_pddata = diabetes_pddata.fillna(value=0)
# Strategy 2: replace every NaN with the mean of its own column.
diabetes_pddata_mean = diabetes_pddata.fillna(value=diabetes_pddata.mean())

# Show the original frame followed by both filled variants.
print(
    diabetes_pddata.head(),
    filled_diabetes_pddata.head(),
    diabetes_pddata_mean.head(),
    sep="\n",
)

# Sklearn
# use X_miss_california
import numpy as np
from sklearn.impute import SimpleImputer
# Mean imputation with scikit-learn: fit the imputer on the corrupted
# California features, then transform that same array.
california_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
california_filled = california_mean.fit(X_miss_california).transform(
    X_miss_california
)
print(california_filled)


          0         1         2         3         4         5         6  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474       NaN -0.008449 -0.019163  0.074412   
2       NaN  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595       NaN  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596       NaN   

          7         8         9  target  
0 -0.002592  0.019907       NaN   151.0  
1 -0.039493 -0.068332 -0.092204    75.0  
2 -0.002592  0.002861 -0.025930   141.0  
3  0.034309  0.022688 -0.009362   206.0  
4 -0.002592 -0.031988 -0.046641   135.0  
          0         1         2         3         4         5         6  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474  0.000000 -0.008449 -0.019163  0.074412   
2  0.000000  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356 