In [None]:
#based in this tutorial:
#https://towardsdatascience.com/6-different-ways-to-compensate-for-missing-values-data-imputation-with-examples-6022d9ca0779

In [42]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import numpy as np
random.seed(0)

In [6]:
#Fetching the dataset
import pandas as pd
dataset = fetch_california_housing()
train, target = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)
train.columns = ['0','1','2','3','4','5','6','7']
train.insert(loc=len(train.columns), column='target', value=target)

In [10]:
train.shape

(20640, 9)

In [11]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [22]:
#Randomly replace 40% of the first column with NaN values
column = train['0']
print(column.size)
missing_pct = int(column.size * 0.4)
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
column[i] = np.NaN
print(column.shape[0])

20640
20640


In [25]:
train.head() #how the original values of column 0 were changed?

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [26]:
missing_val_count_by_column = (train.isnull().sum())

#print just the columns which have more than 0 null values
print(missing_val_count_by_column[missing_val_count_by_column > 0])

0    11361
dtype: int64


In [17]:
#Impute the values using scikit-learn SimpleImpute Class
from sklearn.impute import SimpleImputer

In [18]:
imp_mean = SimpleImputer( strategy='mean') #for median imputation replace 'mean' with 'median'


In [37]:
imp_mean.fit(train)

imputed_mean_train_df = pd.DataFrame(imp_mean.transform(train))

In [38]:
imputed_mean_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,3.887458,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [39]:
imp_median = SimpleImputer( strategy='median') #for median imputation replace 'mean' with 'median'
imputed_median_train_df = pd.DataFrame( imp_median.fit_transform(train))

In [41]:
imputed_median_train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,3.5481,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [43]:
#imputation removed columns names. Let's put them back!
imputed_median_train_df.columns = train.columns

In [44]:
imputed_median_train_df

Unnamed: 0,0,1,2,3,4,5,6,7,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,3.5481,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,3.5481,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [63]:
#Lets train it?! -------------------------------------------------------###########

In [185]:
from sklearn.model_selection import train_test_split

In [186]:
dataset = fetch_california_housing()
X_full, y = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)

In [187]:
train.columns = dataset.feature_names

In [188]:
X = X_full.select_dtypes(exclude=['object'])

In [189]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [190]:
X_train.shape

(16512, 8)

In [191]:
X_valid.shape

(4128, 8)

In [192]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [193]:
# Function for comparing different approaches
#MAE
def score_dataset_mae(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [194]:
#MSE
def score_dataset_mse(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_squared_error(y_valid, preds)

<h2>1- Do Nothing</h2>

In [195]:
# Number of missing values in each column of training data
missing_val_count_by_column = (X_train.isnull().sum())

#print just the columns which have more than 0 null values
print(missing_val_count_by_column[missing_val_count_by_column > 0])

#there aren't missing values

Series([], dtype: int64)


<h3>Test I: dataset without missings</h3>

In [196]:
score_dataset_mae(X_train, X_valid, y_train, y_valid)

  """


0.3359917207147898

In [93]:
score_dataset_mse(X_train, X_valid, y_train, y_valid)

  after removing the cwd from sys.path.


0.2637015859470573

<h2>2- Imputation Using (Mean/Median) Values:</h2>

In [198]:
dataset = fetch_california_housing()
X, y = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)
X.columns = dataset.feature_names

In [199]:
#Simulating NaN in dataset
#Randomly replace 40% of the first column (MedInc) with NaN values
column = X['MedInc'] #is this a reference? like a pointer?
print(column.size)
missing_pct = int(column.size * 0.4)
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
column[i] = np.NaN
print(column.shape[0])

20640
20640


In [200]:
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [201]:
missing_val_count_by_column = (X.isnull().sum())

#print just the columns which have more than 0 null values
print(missing_val_count_by_column[missing_val_count_by_column > 0])

MedInc    6826
dtype: int64


In [202]:
# Break off validation set from training data
X = X.select_dtypes(exclude=['object'])

In [203]:
# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [204]:
X_train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
12069,,6.0,7.723077,1.169231,228.0,3.507692,33.83,-117.55
15925,4.3898,52.0,5.326622,1.100671,1485.0,3.322148,37.73,-122.44
11162,3.9333,26.0,4.668478,1.046196,1022.0,2.777174,33.83,-118.0
4904,,38.0,3.383495,1.009709,749.0,3.635922,34.01,-118.26
4683,3.1765,52.0,4.119792,1.043403,1135.0,1.970486,34.08,-118.36


In [205]:
X_valid.head(2)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
14740,4.1518,22.0,5.663073,1.075472,1551.0,4.180593,32.58,-117.05
10101,5.7796,32.0,6.107226,0.927739,1296.0,3.020979,33.92,-117.97


In [206]:
from sklearn.impute import SimpleImputer

#default strategy  is mean
imputer_mean = SimpleImputer(strategy='mean')

#train (FIT AND TRANSFORM) the imputer on X_train
imputed_X_train = pd.DataFrame(imputer_mean.fit_transform(X_train))

#..and use it (TRANSFORM) on X_valid
imputed_X_valid = pd.DataFrame(imputer_mean.transform(X_valid)) #We use the imputer already tranformed here

In [207]:
imputed_X_train.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7
0,3.885697,6.0,7.723077,1.169231,228.0,3.507692,33.83,-117.55
1,4.3898,52.0,5.326622,1.100671,1485.0,3.322148,37.73,-122.44


In [208]:
imputed_X_valid.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7
0,4.1518,22.0,5.663073,1.075472,1551.0,4.180593,32.58,-117.05
1,5.7796,32.0,6.107226,0.927739,1296.0,3.020979,33.92,-117.97


In [209]:
#imputation removed column names; put them back
imputed_X_train.columns = X_train.columns
imputed_X_valid.columns = X_valid.columns

In [210]:
imputed_X_train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,3.885697,6.0,7.723077,1.169231,228.0,3.507692,33.83,-117.55
1,4.3898,52.0,5.326622,1.100671,1485.0,3.322148,37.73,-122.44
2,3.9333,26.0,4.668478,1.046196,1022.0,2.777174,33.83,-118.0
3,3.885697,38.0,3.383495,1.009709,749.0,3.635922,34.01,-118.26
4,3.1765,52.0,4.119792,1.043403,1135.0,1.970486,34.08,-118.36


<h3>Test II: imputation with mean</h3>

In [214]:
print("MAE: Imputation: ", score_dataset_mae(imputed_X_train, imputed_X_valid, y_train, y_valid) )
print("MSE: Imputation: ", score_dataset_mse(imputed_X_train, imputed_X_valid, y_train, y_valid) )


  """


MAE: Imputation:  0.33331648463561064


  after removing the cwd from sys.path.


MSE: Imputation:  0.2547847597224546


<h2>3- Imputation Using (Most Frequent) or (Zero/Constant) Values:</h2>

In [216]:
#PRE-PROCESSING SUMMARY CODE:
#get dataset
dataset = fetch_california_housing()
X, y = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)
X.columns = dataset.feature_names

#Simulating NaN in dataset
#Randomly replace 40% of the first column (MedInc) with NaN values
column = X['MedInc'] #is this a reference? like a pointer?
#print(column.size)
missing_pct = int(column.size * 0.4)
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
column[i] = np.NaN
#print(column.shape[0])

# Break off validation set from training data
X = X.select_dtypes(exclude=['object'])

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [219]:
from sklearn.impute import SimpleImputer

#default strategy  is mean
imputer_most_frequency = SimpleImputer(strategy='most_frequent')

#train (FIT AND TRANSFORM) the imputer on X_train
imputed_X_train_most_frequency = pd.DataFrame(imputer_most_frequency.fit_transform(X_train))

#..and use it (TRANSFORM) on X_valid
imputed_X_valid_most_frequency = pd.DataFrame(imputer_most_frequency.transform(X_valid)) #We use the imputer already tranformed here

<h3>Test: imputation with most frequent</h3>

In [224]:
print("MAE: Imputation: ", 
      score_dataset_mae(imputed_X_train_most_frequency, imputed_X_valid_most_frequency, y_train, y_valid) )

print("MSE: Imputation: ", 
      score_dataset_mse(imputed_X_train_most_frequency, imputed_X_valid_most_frequency, y_train, y_valid) )

  """


MAE: Imputation:  0.3369479230008089


  after removing the cwd from sys.path.


MSE: Imputation:  0.26578693999010305


<h3>Test: imputation with zero or constant values</h3>

In [232]:
from sklearn.impute import SimpleImputer

#default strategy  is mean
imputer_zero = SimpleImputer(strategy='constant')

#train (FIT AND TRANSFORM) the imputer on X_train
imputed_X_train_zero = pd.DataFrame(imputer_zero.fit_transform(X_train))

#..and use it (TRANSFORM) on X_valid
imputed_X_valid_zero = pd.DataFrame(imputer_zero.transform(X_valid)) 

#imputation removed column names; put them back
imputed_X_train_zero.columns = X_train.columns
imputed_X_valid_zero.columns = X_valid.columns

In [233]:
X_valid.head(2)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
14740,4.1518,22.0,5.663073,1.075472,1551.0,4.180593,32.58,-117.05
10101,,32.0,6.107226,0.927739,1296.0,3.020979,33.92,-117.97


In [234]:
imputed_X_valid_zero.head(2)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,4.1518,22.0,5.663073,1.075472,1551.0,4.180593,32.58,-117.05
1,0.0,32.0,6.107226,0.927739,1296.0,3.020979,33.92,-117.97


In [235]:
print("MAE: Imputation: ", 
      score_dataset_mae(imputed_X_train_zero, imputed_X_valid_zero, y_train, y_valid) )

print("MSE: Imputation: ", 
      score_dataset_mse(imputed_X_train_zero, imputed_X_valid_zero, y_train, y_valid) )

  """


MAE: Imputation:  0.3156156985937476


  after removing the cwd from sys.path.


MSE: Imputation:  0.23091998398281943


In [237]:
#wow..best results above until this moment!

<h2>4- Imputation Using k-NN</h2>

In [257]:
#PRE-PROCESSING SUMMARY CODE:
#get dataset
dataset = fetch_california_housing()
X, y = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)
X.columns = dataset.feature_names

#Simulating NaN in dataset
#Randomly replace 40% of the first column (MedInc) with NaN values
column = X['MedInc'] #is this a reference? like a pointer?
#print(column.size)
missing_pct = int(column.size * 0.4)
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
column[i] = np.NaN
#print(column.shape[0])

# Break off validation set from training data
X = X.select_dtypes(exclude=['object'])

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [246]:
#IMPYUTE DOESN'T IMPORTS
#import sys
#from impyute.imputation.cs import fast_knn
#sys.setrecursionlimit(100000) #Increase the recursion limit of the OS

# start the KNN training
#imputed_training=fast_knn(train.values, k=30) 

In [260]:
import numpy as np
from sklearn.impute import KNNImputer

X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]

imputer = KNNImputer(n_neighbors=2)
imputed_knn = imputer.fit_transform(X)

In [261]:
pd.DataFrame(X)

Unnamed: 0,0,1,2
0,1.0,2,
1,3.0,4,3.0
2,,6,5.0
3,8.0,8,7.0


In [262]:
pd.DataFrame(imputed_knn)

Unnamed: 0,0,1,2
0,1.0,2.0,4.0
1,3.0,4.0,3.0
2,5.5,6.0,5.0
3,8.0,8.0,7.0


In [283]:
imputer_knn = KNNImputer(n_neighbors=30)
imputed_knn_X_train = pd.DataFrame( imputer_knn.fit_transform(X_train) )
imputed_knn_X_valid = pd.DataFrame( imputer_knn.transform(X_valid) )

In [284]:
imputed_knn_X_train.columns = X_train.columns
imputed_knn_X_valid.columns = X_valid.columns

In [285]:
X_train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
12069,,6.0,7.723077,1.169231,228.0,3.507692,33.83,-117.55
15925,4.3898,52.0,5.326622,1.100671,1485.0,3.322148,37.73,-122.44
11162,3.9333,26.0,4.668478,1.046196,1022.0,2.777174,33.83,-118.0
4904,,38.0,3.383495,1.009709,749.0,3.635922,34.01,-118.26
4683,,52.0,4.119792,1.043403,1135.0,1.970486,34.08,-118.36


In [286]:
imputed_knn_X_train.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,4.28389,6.0,7.723077,1.169231,228.0,3.507692,33.83,-117.55
1,4.3898,52.0,5.326622,1.100671,1485.0,3.322148,37.73,-122.44
2,3.9333,26.0,4.668478,1.046196,1022.0,2.777174,33.83,-118.0
3,3.787693,38.0,3.383495,1.009709,749.0,3.635922,34.01,-118.26
4,3.28651,52.0,4.119792,1.043403,1135.0,1.970486,34.08,-118.36


In [288]:
#let's test
print("MAE: Imputation: ", 
      score_dataset_mae(imputed_knn_X_train, imputed_knn_X_valid, y_train, y_valid) )

print("MSE: Imputation: ", 
      score_dataset_mse(imputed_knn_X_train, imputed_knn_X_valid, y_train, y_valid) )

  """


MAE: Imputation:  0.326360462409547


  after removing the cwd from sys.path.


MSE: Imputation:  0.23873541761304168


<h3>5 - Imputation Using Multivariate Imputation by Chained Equation (MICE)</h3>

In [289]:
#PRE-PROCESSING SUMMARY CODE:
#get dataset
dataset = fetch_california_housing()
X, y = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)
X.columns = dataset.feature_names

#Simulating NaN in dataset
#Randomly replace 40% of the first column (MedInc) with NaN values
column = X['MedInc'] #is this a reference? like a pointer?
#print(column.size)
missing_pct = int(column.size * 0.4)
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
column[i] = np.NaN
#print(column.shape[0])

# Break off validation set from training data
X = X.select_dtypes(exclude=['object'])

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

In [292]:
#doesn't imports
#from impyute.imputation.cs import mice

# start the MICE training
#imputed_training=mice(train.values)

<h3>6- Imputation Using Deep Learning (Datawig):</h3>
<h4>works very well with categorical and non-numerical features.</h4>

In [293]:
#doesn't imports
import datawig

df_train, df_test = datawig.utils.random_split(train)

#Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=['1','2','3','4','5','6','7', 'target'], # column(s) containing information about the column we want to impute
    output_column= '0', # the column we'd like to impute values for
    output_path = 'imputer_model' # stores model data and metrics
    )

#Fit an imputer model on the train data
imputer.fit(train_df=df_train, num_epochs=50)

#Impute missing values and return original dataframe with predictions
imputed = imputer.predict(df_test)

ModuleNotFoundError: No module named 'datawig'