## Import Necessary Libraries

In [25]:
import random

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.impute import KNNImputer

## Fetching the dataset

In [26]:
# Load the California housing data and assemble a single frame: the feature
# matrix under placeholder column names, with the target appended last.
dataset = fetch_california_housing()
feature_names = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven']
train = pd.DataFrame(dataset.data, columns=feature_names)
target = pd.DataFrame(dataset.target)
# `target` has a single column labelled 0; assigning it as a new column puts
# it at the end of the frame.
train['target'] = target[0]

In [27]:
# Preview the assembled frame: the feature columns followed by 'target'.
train

Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


## Randomly replace 40% of the first column with NaN values

In [28]:
# Blank out 40% of column 'zero' so the imputer has values to reconstruct.
MISSING_FRACTION = 0.4
rng = random.Random(42)  # local, seeded RNG: the same rows are blanked on every run

column = train['zero']
missing_pct = int(column.size * MISSING_FRACTION)  # a row count, despite the name
# sample() draws distinct positions. Calling choice() once per draw samples
# with replacement, so repeats leave noticeably fewer than 40% of rows missing.
missing_rows = rng.sample(range(column.size), missing_pct)
# Write through the frame by position. Assigning into `column` is chained
# assignment and does not reach `train` under pandas copy-on-write.
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
train.iloc[missing_rows, train.columns.get_loc('zero')] = np.nan
train

Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [29]:
# Check the frame's dimensions after masking values in 'zero'.
train.shape

(20640, 9)

## Impute the values using the scikit-learn KNNImputer class
- KNNImputer imputes all missing data in the dataframe; only column `zero` is then copied back into `train`

In [30]:
# Fit a KNN imputer (3 neighbours, uniform weights) on the whole frame and
# use it to fill the gaps in column 'zero'.
imputer = KNNImputer(n_neighbors=3, weights="uniform")
imputed_data = imputer.fit_transform(train)  # impute all the missing data
# Rebuild the result with train's own index and columns. The write-back below
# aligns on index labels, so a fresh 0..n-1 index would only line up when
# train happens to use the default index.
train_imputed = pd.DataFrame(imputed_data, index=train.index, columns=train.columns)
train['zero'] = train_imputed['zero']  # update only the desired column
assert train['zero'].notna().all(), "imputation left NaN values in column 'zero'"
train

Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,8.325200,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.301400,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.257400,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,2.952933,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.846200,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,3.045467,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.556800,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.700000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.867200,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847
