In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [11]:
# Read the training and test sets, using the "Id" column as the row index.
x_full = pd.read_csv("../input/home-data-for-ml-course/train.csv", index_col="Id")
x_test_full = pd.read_csv("../input/home-data-for-ml-course/test.csv", index_col="Id")

# Discard rows that have no target value, then separate the target from the
# predictors. Reassignment is used instead of inplace=True so each step's
# result is explicit.
x_full = x_full.dropna(axis=0, subset=["SalePrice"])
y = x_full["SalePrice"]
x_full = x_full.drop(columns=["SalePrice"])

# Keep only the columns whose dtype is not "object".
x = x_full.select_dtypes(exclude=["object"])
x_test = x_test_full.select_dtypes(exclude=["object"])

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable across runs.
x_train, x_valid, y_train, y_valid = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [17]:
# Show the training-split dimensions, followed by the null count of every
# column that contains at least one missing value.
print(x_train.shape)

missing_values = x_train.isnull().sum()
has_missing = missing_values > 0
print(missing_values[has_missing])

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

def score_dataset(x_t, x_v, y_t, y_v, n_estimators=100, random_state=0):
    """Fit a random forest on the training data and return its validation MAE.

    Parameters
    ----------
    x_t, y_t
        Training features and targets passed to ``RandomForestRegressor.fit``.
    x_v, y_v
        Validation features and targets used for scoring.
    n_estimators : int, default 100
        Number of trees in the forest.
    random_state : int, default 0
        Seed forwarded to the forest so repeated calls give the same score.

    Returns
    -------
    The mean absolute error between ``y_v`` and the model's predictions on
    ``x_v``.
    """
    model = RandomForestRegressor(n_estimators=n_estimators, random_state=random_state)
    model.fit(x_t, y_t)
    return mean_absolute_error(y_v, model.predict(x_v))

In [19]:
# Method 1: drop every column that has a missing value in the training split.
# The column list comes from x_train only and is applied to both splits so
# they keep identical features.
cols_missing = x_train.columns[x_train.isnull().any()].tolist()

reduced_x_train = x_train.drop(columns=cols_missing)
reduced_x_valid = x_valid.drop(columns=cols_missing)

mae_dropped = score_dataset(reduced_x_train, reduced_x_valid, y_train, y_valid)
print(mae_dropped)

In [22]:
# Method 2: Imputing missing values with the mean, median, or most frequent value (not a constant)
from sklearn.impute import SimpleImputer

imputation_strategies = ["mean", "median", "most_frequent"]

for strategy in imputation_strategies:
    # The imputer is fitted on the training split only; the validation split
    # is transformed with the statistics learned from training data.
    imputer = SimpleImputer(strategy=strategy)

    # The imputer output is wrapped back into DataFrames. Passing both the
    # column labels and the original index keeps each row labelled with its
    # Id, so the frames stay aligned with y_train / y_valid.
    imputted_x_train = pd.DataFrame(
        imputer.fit_transform(x_train),
        columns=x_train.columns,
        index=x_train.index,
    )
    imputted_x_valid = pd.DataFrame(
        imputer.transform(x_valid),
        columns=x_valid.columns,
        index=x_valid.index,
    )

    mae = score_dataset(imputted_x_train, imputted_x_valid, y_train, y_valid)
    print(f"Using {strategy} MAE obtained is: {mae}")

**Comparing the two methods of handling missing values, imputing missing values with the median resulted in the lowest MAE for this dataset.**

In [23]:
# Fit the median imputer on the training split and apply it to both splits.
# The results are rebuilt as DataFrames with the original column labels and
# index, so rows keep their Id labels and stay aligned with y_train / y_valid.
final_imputer = SimpleImputer(strategy="median")
final_x_train = pd.DataFrame(
    final_imputer.fit_transform(x_train),
    columns=x_train.columns,
    index=x_train.index,
)
final_x_valid = pd.DataFrame(
    final_imputer.transform(x_valid),
    columns=x_valid.columns,
    index=x_valid.index,
)

In [24]:
# Train the final forest on the imputed training data and report its mean
# absolute error on the imputed validation split.
model = RandomForestRegressor(n_estimators=100, random_state=0).fit(final_x_train, y_train)
preds_valid = model.predict(final_x_valid)
valid_mae = mean_absolute_error(y_valid, preds_valid)
print(f"MAE on validation set: {valid_mae}")

In [25]:
# Impute the test features with the already-fitted imputer (no refitting on
# test data), keeping the column labels and the Id index on the result.
final_x_test = pd.DataFrame(
    final_imputer.transform(x_test),
    columns=x_test.columns,
    index=x_test.index,
)

# Predict sale prices and write them, keyed by Id, to the submission file.
preds_test = model.predict(final_x_test)
output = pd.DataFrame({"Id": x_test.index, "SalePrice": preds_test})
output.to_csv("submission.csv", index=False)