In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Melbourne housing snapshot into a DataFrame
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/melbourne-housing-snapshot/melb_data.csv',
)

In [None]:
# Preview the first five rows of the raw data
data.head(n=5)

In [None]:
# One boolean per column: True when that column has at least one missing value
data.isna().any(axis=0)

In [None]:
# Target is the Price column; features are every non-object column except Price
y = data['Price']
X = data.select_dtypes(exclude=['object']).drop(columns=['Price'])

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 train/validation split; the fixed random_state keeps the split repeatable
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid, n_estimators=10, random_state=0):
    """Fit a random forest on the training split and score it on the validation split.

    Parameters
    ----------
    X_train, X_valid
        Feature tables for the training and validation splits.
    y_train, y_valid
        Targets matching ``X_train`` and ``X_valid`` respectively.
    n_estimators : int, default 10
        Number of trees, forwarded to ``RandomForestRegressor``.
    random_state : int, default 0
        Seed forwarded to ``RandomForestRegressor`` so repeated calls with the
        same inputs give the same score.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the model's predictions
    for ``X_valid`` (lower is better).
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

<p>See the missing values</p>

In [None]:
print(X_train.shape)

# Count the missing entries per training column, then show only the columns that have any
missing_val_count_by_column = X_train.isna().sum()
print(missing_val_count_by_column[missing_val_count_by_column.gt(0)])

<p>Here, only the numerical columns are imputed.</p>
<p>A similar approach can be applied to columns of the <em>object</em> data type.</p>
<p>Keep this in mind: if any column passed to the imputer has dtype 'object', every column in the imputed result comes back as 'object'.</p>

<h2>First approach</h2>
<p>Drop the columns that contain missing values</p>

In [None]:
# Names of the training columns that contain at least one missing value
cols_with_missing = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both the training and the validation split
reduced_X_train = X_train.drop(columns=cols_with_missing)
reduced_X_valid = X_valid.drop(columns=cols_with_missing)

print("Score from First approach")
print(format(score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid), '.0f'))

<h2>Second approach</h2>
<p>Impute values</p>

In [None]:
from sklearn.impute import SimpleImputer

# Imputation: the imputer is fitted on the training split only and then reused,
# unchanged, on the validation split.
my_imputer = SimpleImputer()

# The imputer's output is wrapped in a new DataFrame. Pass the original column
# labels and row index along so the imputed frames stay labeled and remain
# index-aligned with y_train / y_valid (a bare pd.DataFrame(array) would get
# integer column names and a fresh 0..n-1 index instead).
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)


In [None]:
# Preview the first five rows of the imputed training features
imputed_X_train.head(n=5)

In [None]:
# Make sure the imputed frames carry the original feature names
imputed_X_train.columns, imputed_X_valid.columns = X_train.columns, X_valid.columns

print("Score from Second approach")
print(format(score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid), '.0f'))

<h4>You can add indicator columns that flag which values were missing and then use the imputer</h4>
<p>Also check the parameters in SimpleImputer</p>

In [None]:
# Work on copies so the original splits are left untouched
X_train_plus, X_valid_plus = X_train.copy(), X_valid.copy()

# For every column that had gaps in training, add a boolean companion column
# recording which rows are missing (and will therefore be imputed)
for col in cols_with_missing:
    for frame in (X_train_plus, X_valid_plus):
        frame[col + '_was_missing'] = frame[col].isna()

In [None]:
# Preview the training features together with the new *_was_missing columns
X_train_plus.head(n=5)

In [None]:
# Imputation: fitted on the training split only, then reused on the validation split.
# NOTE(review): per the scikit-learn docs, fill_value is only consulted when
# strategy='constant', so passing None here should not affect mean imputation.
my_imputer = SimpleImputer(strategy='mean', fill_value=None)

# Wrap the imputer output in labeled DataFrames. Passing columns= and index=
# at construction keeps the feature names and keeps the rows index-aligned
# with y_train / y_valid (a bare pd.DataFrame(array) would reset both).
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer.transform(X_valid_plus),
    columns=X_valid_plus.columns,
    index=X_valid_plus.index,
)

print("Score from New Columns added approach")
print(f'{score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid):.0f}')