In [19]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [20]:
# Read the training and test tables, using the "Id" column as the row index.
# Training rows without a SalePrice cannot be used as targets, so they are
# dropped while loading.
x = pd.read_csv(
    "../input/home-data-for-ml-course/train.csv", index_col="Id"
).dropna(axis=0, subset=["SalePrice"])
x_test = pd.read_csv("../input/home-data-for-ml-course/test.csv", index_col="Id")

# pop() removes the target column from the feature frame and returns it,
# leaving x with predictors only.
y = x.pop("SalePrice")

In [21]:
# Train / Validation split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation. The fixed random_state makes the
# shuffle reproducible, so the scores printed by later cells are comparable
# across kernel restarts instead of changing with every run.
x_train, x_valid, y_train, y_valid = train_test_split(
    x, y, train_size=0.8, test_size=0.2, random_state=0
)

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Scoring dataset based on MAE
def score_dataset(x_t, x_v, y_t, y_v, handle_missing_values=False):
    """Fit a random forest on the training split and return the validation MAE.

    Parameters
    ----------
    x_t, x_v : pandas.DataFrame
        Training and validation feature tables.
    y_t, y_v : array-like
        Training and validation targets.
    handle_missing_values : bool, default False
        When True, missing entries in both tables are replaced with per-column
        medians learned from ``x_t`` only, so nothing is learned from the
        validation split.

    Returns
    -------
    float
        Mean absolute error of the model's predictions on ``x_v``.
    """
    if handle_missing_values:
        imputer = SimpleImputer(strategy="median")
        # The imputer returns bare arrays; rebuild the frames with the original
        # column labels AND row index so they stay aligned with y_t / y_v
        # (previously the index was silently reset to 0..n-1).
        x_t = pd.DataFrame(
            imputer.fit_transform(x_t), columns=x_t.columns, index=x_t.index
        )
        x_v = pd.DataFrame(
            imputer.transform(x_v), columns=x_v.columns, index=x_v.index
        )
    # Fixed seed so repeated calls on the same data give the same score.
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(x_t, y_t)
    preds = model.predict(x_v)
    return mean_absolute_error(y_v, preds)

In [32]:
# Handling categorical data
# 1. Dropping columns with categorical variables
# Columns stored with the "object" dtype are treated as the categorical ones.
object_cols = [name for name, dtype in x_train.dtypes.items() if dtype == "object"]
c_dropped_x_train = x_train.drop(columns=object_cols)
c_dropped_x_valid = x_valid.drop(columns=object_cols)

# Score the remaining columns, imputing their missing values.
print(
    score_dataset(
        c_dropped_x_train,
        c_dropped_x_valid,
        y_train,
        y_valid,
        handle_missing_values=True,
    )
)

In [38]:
# Handling categorical data
# 2. Ordinal encoding
from sklearn.preprocessing import OrdinalEncoder

# A categorical column is "good" when every value seen in the validation split
# also occurs in the training split, so an encoder fitted on the training data
# can encode it. All other categorical columns are "bad" and get dropped.
good_categorical_variables = [
    col for col in object_cols if set(x_valid[col]) <= set(x_train[col])
]
bad_categorical_variables = list(
    set(object_cols).difference(good_categorical_variables)
)

# drop() returns new frames, so x_train / x_valid themselves are left untouched.
copy_x_train = x_train.drop(columns=bad_categorical_variables)
copy_x_valid = x_valid.drop(columns=bad_categorical_variables)

# Fit the encoder on the training split only, then encode both splits with it.
ordinal_encoder = OrdinalEncoder().fit(x_train[good_categorical_variables])

copy_x_train[good_categorical_variables] = ordinal_encoder.transform(
    x_train[good_categorical_variables]
)
copy_x_valid[good_categorical_variables] = ordinal_encoder.transform(
    x_valid[good_categorical_variables]
)

print(
    score_dataset(
        copy_x_train, copy_x_valid, y_train, y_valid, handle_missing_values=True
    )
)

In [39]:
# Handling categorical data
# 3. One-hot encoding
from sklearn.preprocessing import OneHotEncoder

# Only categorical columns with fewer than 10 distinct values are one-hot
# encoded; the remaining categorical columns are dropped.
low_cardinality = [col for col in object_cols if x_train[col].nunique() < 10]
high_cardinality = list(set(object_cols) - set(low_cardinality))

# drop() returns new frames, so x_train / x_valid themselves are left untouched.
oh_x_train = x_train.drop(columns=high_cardinality)
oh_x_valid = x_valid.drop(columns=high_cardinality)

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# name. Try the current spelling first and fall back for older installations.
try:
    oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)
oh_encoder.fit(oh_x_train[low_cardinality])

# transform() returns a bare array; restore the row index so that the concat
# below aligns the encoded columns with the remaining ones row by row.
oh_cols_x_train = pd.DataFrame(
    oh_encoder.transform(oh_x_train[low_cardinality]), index=oh_x_train.index
)
oh_cols_x_valid = pd.DataFrame(
    oh_encoder.transform(oh_x_valid[low_cardinality]), index=oh_x_valid.index
)

# Remove the original categorical columns; their encoded form replaces them.
oh_dropped_x_train = oh_x_train.drop(columns=low_cardinality)
oh_dropped_x_valid = oh_x_valid.drop(columns=low_cardinality)

res_x_train = pd.concat([oh_dropped_x_train, oh_cols_x_train], axis=1)
res_x_valid = pd.concat([oh_dropped_x_valid, oh_cols_x_valid], axis=1)

# The encoded columns are labelled 0, 1, 2, ... while the rest have string
# names. Recent scikit-learn versions refuse to fit on frames with mixed-type
# column labels, so make every label a string.
res_x_train.columns = res_x_train.columns.astype(str)
res_x_valid.columns = res_x_valid.columns.astype(str)

print(score_dataset(res_x_train, res_x_valid, y_train, y_valid, handle_missing_values=True))

In [26]:
# Handling categorical data
# Final solution

In [27]:
# Handling missing data
# 1. Dropping columns

# Columns that have at least one missing value in the training split.
missing_cols = [col for col in x_train.columns if x_train[col].isnull().any()]

# x_train still contains object-dtype (string) columns. No encoding is applied
# in this cell and the random forest inside score_dataset cannot fit on
# strings, so those columns are excluded here as well.
m_dropped_x_train = x_train.drop(columns=missing_cols).select_dtypes(exclude=["object"])
# Select exactly the same columns from the validation split so both frames
# stay aligned.
m_dropped_x_valid = x_valid[m_dropped_x_train.columns]

# NOTE(review): a column whose only gaps are in x_valid is not dropped, and no
# imputation is requested below -- confirm the installed model tolerates that.
print(score_dataset(m_dropped_x_train, m_dropped_x_valid, y_train, y_valid))

In [None]:
# Handling missing data
# 2. Imputing data with mean/median/mode

In [None]:
# Handling missing data
# Final solution