In [79]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Display settings: render every column and every row of a frame without truncation
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Load the raw training and test tables
X_full = pd.read_csv('./input/train.csv')
X_test_full = pd.read_csv('./input/test.csv')

# Keep only the rows that have a target, then split the target off from the predictors
X_full = X_full.dropna(axis=0, subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Discard object-dtype columns so only the non-text predictors remain
X = X_full.select_dtypes(exclude=['object'])
X_test = X_test_full.select_dtypes(exclude=['object'])

# Hold out 20% of the training rows for validation (fixed seed for a repeatable split)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

In [33]:
# Preview the first five rows of the training predictors
X_train.head(n=5)

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
618,619,20,90.0,11694,9,5,2007,2007,452.0,48,0,1774,1822,1828,0,0,1828,0,0,2,0,3,1,9,1,2007.0,3,774,0,108,0,0,260,0,0,7,2007
870,871,20,60.0,6600,5,5,1962,1962,0.0,0,0,894,894,894,0,0,894,0,0,1,0,2,1,5,0,1962.0,1,308,0,0,0,0,0,0,0,8,2009
92,93,30,80.0,13360,5,7,1921,2006,0.0,713,0,163,876,964,0,0,964,1,0,1,0,2,1,5,0,1921.0,2,432,0,0,44,0,0,0,0,8,2009
817,818,20,,13265,8,5,2002,2002,148.0,1218,0,350,1568,1689,0,0,1689,1,0,2,0,3,1,7,2,2002.0,3,857,150,59,0,0,0,0,0,7,2008
302,303,20,118.0,13704,7,5,2001,2002,150.0,0,0,1541,1541,1541,0,0,1541,0,0,2,0,3,1,6,1,2001.0,3,843,468,81,0,0,0,0,0,1,2006


In [37]:
# Show the training-set dimensions, then the NaN count of each column,
# listing only the columns that actually contain missing entries
print(X_train.shape)
missing_val_count_by_col = X_train.isnull().sum()
print(missing_val_count_by_col.loc[missing_val_count_by_col.gt(0)])

(1168, 37)
LotFrontage    212
MasVnrArea       6
GarageYrBlt     58
dtype: int64


In [47]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Shared scoring helper so every missing-value strategy is evaluated the same way
def score_dataset(X_train, X_valid, y_train, y_valid):
    """Fit a random forest on the training split and return its validation MAE.

    Parameters
    ----------
    X_train, y_train
        Predictors and target the forest is fitted on.
    X_valid, y_valid
        Held-out predictors and target used for scoring.

    Returns
    -------
    The mean absolute error between ``y_valid`` and the forest's predictions
    for ``X_valid``.

    The forest always uses 100 estimators and ``random_state=0``, so scores from
    different preprocessing variants are comparable.
    """
    forest = RandomForestRegressor(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    return mean_absolute_error(y_valid, forest.predict(X_valid))

In [46]:
# Collect the name of every training column that holds at least one NaN
columns_with_missing_values = X_train.columns[X_train.isnull().any()].tolist()

# Remove those columns from both splits; the list is derived from the training split only
reduced_X_train = X_train.drop(columns=columns_with_missing_values)
reduced_X_valid = X_valid.drop(columns=columns_with_missing_values)

In [52]:
# Validation MAE when every column containing NaNs is dropped
print("MAE (Drop columns with missing values):")
mae_drop_columns = score_dataset(reduced_X_train, reduced_X_valid, y_train, y_valid)
print(mae_drop_columns)

MAE (Drop columns with missing values):
17952.591404109586


In [53]:
from sklearn.impute import SimpleImputer

# Imputation with SimpleImputer's default strategy (column mean). The imputer is
# fitted on the training split only and then applied unchanged to the validation split.
my_imputer = SimpleImputer()

# fit_transform/transform return plain arrays by default, so rebuild the frames with
# the source frame's column names AND row index. Without the index the imputed frames
# get a fresh 0..n-1 index that no longer lines up with y_train / y_valid, which
# silently misaligns any later index-based operation (concat, join, Series arithmetic).
imputed_X_train = pd.DataFrame(
    my_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
imputed_X_valid = pd.DataFrame(
    my_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

In [54]:
# Validation MAE for the default (mean) imputation
print("MAE (imputation):")
mae_imputation = score_dataset(imputed_X_train, imputed_X_valid, y_train, y_valid)
print(mae_imputation)

MAE (imputation):
18250.608013698627


In [67]:
# Alternative imputation approach: zero-fill GarageYrBlt, add missing-value indicator
# columns for LotFrontage and MasVnrArea, then impute whatever is still missing with
# SimpleImputer's default strategy.

columns_with_missing_values2 = ['LotFrontage', 'MasVnrArea']

# Work on copies so the original splits stay untouched for the other experiments
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute-accessed Series: that chained in-place form triggers a pandas warning and
# does not write through to the frame under Copy-on-Write, which would leave the NaNs
# in place for the imputer to fill with the mean instead of 0.
X_train_plus['GarageYrBlt'] = X_train_plus['GarageYrBlt'].fillna(0)
X_valid_plus['GarageYrBlt'] = X_valid_plus['GarageYrBlt'].fillna(0)

# Flag which rows were missing before the imputer overwrites the NaNs
for col in columns_with_missing_values2:
    X_train_plus[col + '_is_missing'] = X_train[col].isnull()
    X_valid_plus[col + '_is_missing'] = X_valid[col].isnull()

# Fit on the training split only, then apply the same statistics to the validation split
my_imputer = SimpleImputer()

# Rebuild the frames with the source column names and row index so they stay aligned
# with y_train / y_valid (a bare pd.DataFrame(array) would reset the index to 0..n-1).
imputed_X_train_plus = pd.DataFrame(
    my_imputer.fit_transform(X_train_plus),
    columns=X_train_plus.columns,
    index=X_train_plus.index,
)
imputed_X_valid_plus = pd.DataFrame(
    my_imputer.transform(X_valid_plus),
    columns=X_valid_plus.columns,
    index=X_valid_plus.index,
)

In [68]:
# Validation MAE for imputation combined with the missing-value indicator columns
print("MAE (imputation plus):")
mae_imputation_plus = score_dataset(imputed_X_train_plus, imputed_X_valid_plus, y_train, y_valid)
print(mae_imputation_plus)

MAE (imputation plus):
18022.736917808223


In [60]:
# Zero-fill GarageYrBlt on the augmented training frame. Written as a column assignment
# because fillna(inplace=True) on an attribute-accessed Series is a chained in-place
# call that does not update the frame under pandas Copy-on-Write.
# NOTE(review): the "alternative imputation approach" cell already fills this column,
# so this cell looks like a leftover; it is idempotent and safe to re-run or remove.
X_train_plus['GarageYrBlt'] = X_train_plus['GarageYrBlt'].fillna(0)

In [80]:
# Final imputation: replace NaNs with each column's median. The imputer is fitted on
# the training split only and then applied unchanged to the validation split.
final_imputer = SimpleImputer(strategy='median')

# Restore the column headers and the row index while rebuilding the frames. A bare
# pd.DataFrame(array) would reset the index to 0..n-1, leaving the imputed frames
# misaligned with y_train / y_valid for any index-based operation.
final_X_train = pd.DataFrame(
    final_imputer.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
final_X_valid = pd.DataFrame(
    final_imputer.transform(X_valid),
    columns=X_valid.columns,
    index=X_valid.index,
)

In [81]:
# Validation MAE for the final (median) imputation
print("MAE (imputation final):")
mae_imputation_final = score_dataset(final_X_train, final_X_valid, y_train, y_valid)
print(mae_imputation_final)

MAE (imputation final):
18103.602945205483
