In [158]:
import pandas as pd
from sklearn.model_selection import train_test_split

pd.options.display.max_columns=None
pd.options.display.max_rows=None

# load train and test data sets
X = pd.read_csv('./input/train.csv')
X_test = pd.read_csv('./input/test.csv')

# remove rows with missing target values, separate targets from predictors.
# Non-in-place calls are used throughout so every name is bound to a fresh
# frame and re-running the cell always starts from the CSV contents.
X = X.dropna(subset=['SalePrice'])
y = X['SalePrice']

# columns left out of the model entirely (removed from both train and test)
exclude_cols = ['MiscFeature','PoolQC','Alley']
X = X.drop(columns=['SalePrice'] + exclude_cols)
X_test = X_test.drop(columns=exclude_cols)

# list numeric and categorical columns
numerical_cols = X.select_dtypes(exclude='object').columns.tolist()
categorical_cols = X.select_dtypes(include='object').columns.tolist()

X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0
)

# categorical columns safe for encoding: the set of distinct values seen in
# the training split is identical to the set seen in the validation split
good_cat_columns = [
    col for col in categorical_cols
    if set(X_train[col].unique()) == set(X_valid[col].unique())
]
# comprehension (instead of a set difference) keeps the original column order,
# so the result is the same on every kernel restart
bad_cat_columns = [col for col in categorical_cols if col not in good_cat_columns]

# train_test_split returns slices of X; dropping in place on them raises
# SettingWithCopyWarning, so rebind each name to the frame returned by drop()
X_train = X_train.drop(columns=bad_cat_columns)
X_valid = X_valid.drop(columns=bad_cat_columns)
X_test = X_test.drop(columns=bad_cat_columns)

# low and high cardinality categorical columns (threshold: fewer than 10 distinct values)
low_cardinality_cols = [col for col in good_cat_columns if X[col].nunique() < 10]
high_cardinality_cols = [col for col in good_cat_columns if col not in low_cardinality_cols]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [206]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor

# preprocessing of numerical columns: fill missing values with the column median
numerical_transformer = SimpleImputer(strategy='median')

# preprocessing of low cardinality columns: constant-fill missing values, then
# one-hot encode; categories unseen during fit are encoded as all zeros
OH_transformer = Pipeline(steps=[
    ('imputation',SimpleImputer(strategy='constant')),
    ('one_hot_encoding',OneHotEncoder(handle_unknown='ignore'))
])

# preprocessing of high_cardinality columns: constant-fill missing values, then
# map each category to an integer code.
# LabelEncoder is meant for the target vector: its fit/transform accept a
# single 1-D argument, so it fails inside a Pipeline/ColumnTransformer as soon
# as high_cardinality_cols is non-empty. OrdinalEncoder is the feature-matrix
# equivalent. Categories not seen during fit are encoded as -1
# (handle_unknown='use_encoded_value' requires scikit-learn >= 0.24).
# The step name 'label_encoding' is kept so existing lookups by name still work.
Label_transformer = Pipeline(steps=[
    ('imputation',SimpleImputer(strategy='constant')),
    ('label_encoding',OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
])

# bundle preprocessing of numerical and categorical columns; columns not
# listed in any transformer are dropped (ColumnTransformer default)
preprocessor = ColumnTransformer(transformers=[
    ('num',numerical_transformer,numerical_cols),
    ('oh',OH_transformer,low_cardinality_cols),
    ('lbl',Label_transformer,high_cardinality_cols)
])

# define model
model = XGBRegressor(n_estimators=300, learning_rate=0.1, random_state=0)

# bundle preprocessing and modeling code
clf = Pipeline(steps=[
    ('preprocessing',preprocessor),
    ('model',model)
])

In [207]:
# Fit preprocessing and model together on the training split; the bare
# expression lets the notebook display the fitted pipeline.
clf.fit(X=X_train, y=y_train)

  if getattr(data, 'base', None) is not None and \




Pipeline(memory=None,
         steps=[('preprocessing',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('num',
                                                  SimpleImputer(add_indicator=False,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='median',
                                                                verbose=0),
                                                  ['Id', 'MSSubClass',
                                                   'LotFrontage', 'LotArea',
                                                   'OverallQual', 'Over

In [208]:
from sklearn.metrics import mean_absolute_error

# Predict on the hold-out split and display the mean absolute error
# as the cell's output.
preds = clf.predict(X_valid)
mean_absolute_error(y_true=y_valid, y_pred=preds)

16587.254387842466

In [86]:
# Save test predictions to file.
# `preds` holds predictions for the validation split, so the submission must
# be built from a fresh prediction on X_test. Ids are read from the test
# file's own 'Id' column rather than reconstructed from a hard-coded offset.
test_preds = clf.predict(X_test)
output = pd.DataFrame({'Id': X_test['Id'],
                       'SalePrice': test_preds})
output.to_csv('submission.csv', index=False)

In [145]:
# Collect (column name, number of missing values) for every column of X
# that contains at least one missing value, in column order.
missing_summary = []
for col in X.columns:
    n_missing = X[col].isnull().sum()
    if n_missing > 0:
        missing_summary.append((col, n_missing))
missing_summary

[('LotFrontage', 259),
 ('Alley', 1369),
 ('MasVnrType', 8),
 ('MasVnrArea', 8),
 ('BsmtQual', 37),
 ('BsmtCond', 37),
 ('BsmtExposure', 38),
 ('BsmtFinType1', 37),
 ('BsmtFinType2', 38),
 ('Electrical', 1),
 ('FireplaceQu', 690),
 ('GarageType', 81),
 ('GarageYrBlt', 81),
 ('GarageFinish', 81),
 ('GarageQual', 81),
 ('GarageCond', 81),
 ('PoolQC', 1453),
 ('Fence', 1179),
 ('MiscFeature', 1406)]