In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import Imputer
from scipy.stats import skew
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
%matplotlib inline



# Data load

In [2]:
# Read the training set from a path relative to the notebook's working directory.
train_df = pd.read_csv('../dat/train.csv')
print(train_df.shape)
# Show the first three rows as the cell's displayed output.
train_df.head(3)

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [3]:
# List every column name of the training frame.
train_df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

## You must drop the target column from the features

In [4]:
# Keep the target aside, then remove it so train_df holds features only.
y = train_df['SalePrice']
train_df = train_df.drop('SalePrice', axis=1)
# Number of training rows; used later to split the combined frame back apart.
train_idx = train_df.shape[0]

In [5]:
# Read the test set from the same relative data directory as the training set.
test_df = pd.read_csv('../dat/test.csv')
print(test_df.shape)
# Only the column counts are compared here, not the column names or their order.
len(test_df.columns) == len(train_df.columns)

(1459, 80)


True

In [6]:
# Stack train and test rows so that every preprocessing step sees the same columns
# and the same category levels. ignore_index=True gives the combined frame a unique
# 0..n-1 row index; without it the labels 0..1458 would appear twice, which makes
# any later index-aligned operation (e.g. a column-wise concat) fragile.
# Rows keep their order, so the first train_idx rows are still the training rows.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)
df.shape

(2919, 80)

# Missing Vals

In [7]:
# Number of columns with nulls
def check_nulls(df):
    """Print the null count of every column and return how many columns have nulls.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    int
        Number of columns that contain at least one null value.
    """
    # Compute the per-column counts once and reuse them for both the print and the tally.
    null_counts = df.isnull().sum()
    print(null_counts)
    # Series.nonzero() was deprecated in pandas 0.24 and removed in 1.0;
    # a boolean mask summed up gives the same number on every pandas version.
    return int((null_counts > 0).sum())

In [8]:
# Print the per-column null counts of the combined frame, then the number of affected columns.
print('Number of colomns with nulls: ', check_nulls(df))

Id                  0
MSSubClass          0
MSZoning            4
LotFrontage       486
LotArea             0
Street              0
Alley            2721
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         24
MasVnrArea         23
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         1
TotRmsAbvGrd        0
Functional          2
Fireplaces          0
FireplaceQu      1420
GarageType        157
GarageYrBlt       159
GarageFinish      159
GarageCars          1
GarageArea          1
GarageQual        159
GarageCond

## Fill in missing values for object-type columns:

In [9]:
# Keep only the object-dtype columns; they are handled separately from the numeric ones.
df_obj = df.select_dtypes(include=['object'])
print(df_obj.shape)


(2919, 43)


In [10]:
# Null counts restricted to the object-dtype columns, before any filling.
print('Number of colomns with nulls: ', check_nulls(df_obj))

MSZoning            4
Street              0
Alley            2721
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         24
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           81
BsmtCond           82
BsmtExposure       82
BsmtFinType1       79
BsmtFinType2       80
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         1
Functional          2
FireplaceQu      1420
GarageType        157
GarageFinish      159
GarageQual        159
GarageCond        159
PavedDrive          0
PoolQC           2909
Fence            2348
MiscFeature      2814
SaleType            1
SaleCondition       0
dtype: int64
Number of colomns with nulls:  23


## Filling everything with "None" is not always the right approach. Sometimes a column has its own specific default value

In [11]:
# Replace every missing value in the object-dtype columns with the literal string "None".
df_obj = df_obj.fillna("None")

In [12]:
# Re-check the object-dtype columns after filling.
print('Number of colomns with nulls: ', check_nulls(df_obj))

MSZoning         0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
FireplaceQu      0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
PoolQC           0
Fence            0
MiscFeature      0
SaleType         0
SaleCondition    0
dtype: int64
Number of colomns with nulls:  0


## Numerical missing values handling:

In [13]:
# The imputer is built with its default arguments.
# NOTE(review): sklearn.preprocessing.Imputer was removed in scikit-learn 0.22 in favour of
# sklearn.impute.SimpleImputer — confirm the installed version before re-running.
imp = Imputer()
# Everything that is not object dtype; these are the columns later passed to the imputer.
df_non_obj = df.select_dtypes(exclude=['object'])
print(df_non_obj.shape)

(2919, 37)


In [14]:
# Null counts restricted to the non-object columns, before imputation.
print('Number of colomns with nulls: ', check_nulls(df_non_obj))

Id                 0
MSSubClass         0
LotFrontage      486
LotArea            0
OverallQual        0
OverallCond        0
YearBuilt          0
YearRemodAdd       0
MasVnrArea        23
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       2
BsmtHalfBath       2
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageYrBlt      159
GarageCars         1
GarageArea         1
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
MoSold             0
YrSold             0
dtype: int64
Number of colomns with nulls:  11


In [15]:
# Fit the imputer on the non-object columns and fill their missing values.
# The result is wrapped back into a DataFrame so the original index and column labels are kept.
df_non_obj = pd.DataFrame(imp.fit_transform(df_non_obj), index=df_non_obj.index, columns=df_non_obj.columns)


In [16]:
# Re-check the non-object columns after imputation.
print('Number of colomns with nulls: ', check_nulls(df_non_obj))

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64
Number of colomns with nulls:  0


## Log-transform the skewed columns

In [17]:

# Reduce the influence of outliers by applying log1p to the skewed numeric columns.
# Idea taken from Alexandru Papiu's script:
# https://www.kaggle.com/apapiu/house-prices-advanced-regression-techniques/regularized-linear-models
# Rule of thumb: |skewness| > 0.5 counts as at least moderately skewed.
column_skew = df_non_obj.apply(skew)
skewness = column_skew[column_skew.abs() > 0.5]
skewed_features = skewness.index
print(str(len(skewness)) + " skewed numerical features to log transform")
# Overwrites the selected columns in df_non_obj with their log1p values.
df_non_obj[skewed_features] = np.log1p(df_non_obj[skewed_features])

27 skewed numerical features to log transform


## Now put the two transforms together

In [18]:
# Join the numeric and object-dtype columns back side by side (axis=1), replacing df.
df = pd.concat([df_non_obj, df_obj], axis=1)

In [19]:
# Shape of the recombined frame.
df.shape

(2919, 80)

In [20]:
# Null counts of the recombined frame.
print('Number of colomns with nulls: ', check_nulls(df))

Id               0
MSSubClass       0
LotFrontage      0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
                ..
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
FireplaceQu      0
GarageType       0
GarageFinish

# Handle categorical features via one-hot encoding (OHE)

In [21]:
# One-hot encode the frame with pandas' get_dummies, replacing df with the encoded result.
df = pd.get_dummies(df)

In [22]:
# Shape of the frame after one-hot encoding.
df.shape

(2919, 311)

# Why did the number of columns increase?

In [23]:
# Print the column names of train_df (which was not one-hot encoded), one per line.
for col in train_df.columns:
    print(col)

Id
MSSubClass
MSZoning
LotFrontage
LotArea
Street
Alley
LotShape
LandContour
Utilities
LotConfig
LandSlope
Neighborhood
Condition1
Condition2
BldgType
HouseStyle
OverallQual
OverallCond
YearBuilt
YearRemodAdd
RoofStyle
RoofMatl
Exterior1st
Exterior2nd
MasVnrType
MasVnrArea
ExterQual
ExterCond
Foundation
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinSF1
BsmtFinType2
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
Heating
HeatingQC
CentralAir
Electrical
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
KitchenQual
TotRmsAbvGrd
Functional
Fireplaces
FireplaceQu
GarageType
GarageYrBlt
GarageFinish
GarageCars
GarageArea
GarageQual
GarageCond
PavedDrive
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
PoolQC
Fence
MiscFeature
MiscVal
MoSold
YrSold
SaleType
SaleCondition


Because each categorical column is now expanded into one column per category value: PavedDrive --> PavedDrive_N, PavedDrive_P, PavedDrive_Y, each holding only 1 or 0.

Note: if you do this after filling NA with "None", you get even more columns because of the extra _None category.

Another way to do it is LabelEncoder. But it is only appropriate for ordinal variables, not nominal categorical ones; otherwise the model reads a higher code as a higher rank.

In [24]:
# Inspect the column dtypes of the encoded frame.
df.dtypes # no object-dtype columns are expected here

Id                       float64
MSSubClass               float64
LotFrontage              float64
LotArea                  float64
OverallQual              float64
OverallCond              float64
YearBuilt                float64
YearRemodAdd             float64
MasVnrArea               float64
BsmtFinSF1               float64
BsmtFinSF2               float64
BsmtUnfSF                float64
TotalBsmtSF              float64
1stFlrSF                 float64
2ndFlrSF                 float64
LowQualFinSF             float64
GrLivArea                float64
BsmtFullBath             float64
BsmtHalfBath             float64
FullBath                 float64
HalfBath                 float64
BedroomAbvGr             float64
KitchenAbvGr             float64
TotRmsAbvGrd             float64
Fireplaces               float64
GarageYrBlt              float64
GarageCars               float64
GarageArea               float64
WoodDeckSF               float64
OpenPorchSF              float64
          

# Target variable

In [25]:
# Log transform the target for official scoring
# NOTE: this overwrites y, so running the cell a second time would apply log1p twice.
y = np.log1p(y)


# Train test split

In [26]:
# The first train_idx rows of the combined frame are the training rows
# (train_df was placed first when the two sets were concatenated).
X = df[:train_idx]

### The XGBoost model is fed NumPy arrays here, so the feature frame and the target are converted to arrays before splitting

In [27]:
# Hold out 25% of the training rows for validation.
# .values replaces DataFrame/Series.as_matrix(), which was removed in pandas 1.0.
# random_state pins the shuffle so a Restart & Run All reproduces the same split.
X_train, X_val, Y_train, Y_val = train_test_split(
    X.values, y.values, test_size=0.25, random_state=0
)

In [28]:
# Print the shapes of the full set and of each split, one per line, as a sanity check.
print(
    *(part.shape for part in (X, y, X_train, Y_train, X_val, Y_val)),
    sep='\n'
)


(1460, 311)
(1460,)
(1095, 311)
(1095,)
(365, 311)
(365,)


In [29]:


# Up to 1000 boosting rounds at learning rate 0.05; early_stopping_rounds=5 in fit() stops
# training once the validation metric has not improved for 5 rounds.
XGB = XGBRegressor(n_estimators=1000, learning_rate=0.05)
# verbose=True prints the validation metric after every round; pass verbose=False to fit() to silence it.
# NOTE(review): the estimator repr already shows silent=True, so `silent` is not what controls this log.
XGB.fit(X_train, Y_train, eval_set=[(X_val, Y_val)], verbose=True, early_stopping_rounds=5)

[0]	validation_0-rmse:10.9379
Will train until validation_0-rmse hasn't improved in 5 rounds.
[1]	validation_0-rmse:10.3912
[2]	validation_0-rmse:9.87175
[3]	validation_0-rmse:9.37833
[4]	validation_0-rmse:8.90961
[5]	validation_0-rmse:8.46435
[6]	validation_0-rmse:8.04132
[7]	validation_0-rmse:7.63942
[8]	validation_0-rmse:7.25762
[9]	validation_0-rmse:6.8949
[10]	validation_0-rmse:6.55162
[11]	validation_0-rmse:6.22426
[12]	validation_0-rmse:5.91432
[13]	validation_0-rmse:5.61913
[14]	validation_0-rmse:5.3396
[15]	validation_0-rmse:5.07296
[16]	validation_0-rmse:4.81985
[17]	validation_0-rmse:4.57947
[18]	validation_0-rmse:4.35045
[19]	validation_0-rmse:4.13417
[20]	validation_0-rmse:3.92778
[21]	validation_0-rmse:3.73255
[22]	validation_0-rmse:3.54628
[23]	validation_0-rmse:3.37025
[24]	validation_0-rmse:3.20242
[25]	validation_0-rmse:3.04308
[26]	validation_0-rmse:2.89131
[27]	validation_0-rmse:2.74748
[28]	validation_0-rmse:2.61134
[29]	validation_0-rmse:2.48141
[30]	validation_0-

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.05, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [31]:
# make predictions on the held-out validation rows (targets are still on the log1p scale)
predictions = XGB.predict(X_val)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE and MSE are symmetric,
# so the printed numbers are the same as with the arguments swapped; the order is fixed
# here so the calls stay correct if an asymmetric metric is added later.
val_mse = mean_squared_error(Y_val, predictions)
print("Mean Absolute Error : " + str(mean_absolute_error(Y_val, predictions)))
print("Mean Squared Error : " + str(val_mse))
# RMSE reuses the MSE computed above instead of recomputing it.
print("Root Mean Squared Error : " + str(np.sqrt(val_mse)))

Mean Absolute Error : 0.0883746980963
Mean Squared Error : 0.0194805586622
Root Mean Squared Error : 0.139572771923


In [34]:
# Invert the log1p transform to view the validation targets on the original scale.
np.expm1(Y_val)

array([ 153000.,  139000.,  233000.,  212000.,  102000.,  325000.,
        228000.,  119000.,  165000.,  169500.,  162000.,  174000.,
        122900.,  126500.,  153500.,  345000.,  150000.,  223500.,
        241000.,  135000.,  124000.,  205000.,  134500.,  252000.,
        150750.,  147000.,  121000.,  200141.,  105000.,  135000.,
        262500.,  115000.,  154900.,  127500.,  244400.,  136905.,
        262500.,  123000.,  162000.,  164000.,  195000.,  239000.,
        210000.,  105000.,   91500.,  115000.,  165000.,  164990.,
        205000.,  129900.,  145000.,  151000.,  157000.,  235000.,
        230000.,  129000.,  367294.,  174000.,  162900.,  107500.,
        340000.,   98000.,  187500.,   40000.,   62383.,   99500.,
        185000.,  144152.,  110000.,  114500.,  301500.,  178000.,
        207500.,  263435.,  120000.,  149000.,  190000.,  155000.,
         88000.,  280000.,  169900.,  200000.,  185000.,  196000.,
        189000.,  143250.,  134900.,  374000.,  122000.,  1000

In [35]:
# Invert the log1p transform to view the validation predictions on the original scale.
np.expm1(predictions)

array([ 161406.859375  ,  124507.4765625 ,  220670.609375  ,
        218766.109375  ,  121774.8203125 ,  319656.0625    ,
        244917.53125   ,  122480.1640625 ,  170164.546875  ,
        146723.890625  ,  162745.5625    ,  182114.125     ,
        122485.7734375 ,  123490.1328125 ,  146401.15625   ,
        362012.78125   ,  155667.859375  ,  210801.203125  ,
        204291.15625   ,  169852.28125   ,  125285.875     ,
        214802.34375   ,  136618.625     ,  268526.75      ,
        145214.890625  ,  136876.984375  ,  102133.7265625 ,
        194374.5625    ,  104392.1796875 ,  148093.984375  ,
        256870.015625  ,  115379.3828125 ,  121299.828125  ,
        125828.78125   ,  199712.75      ,  151973.78125   ,
        264406.71875   ,  111539.3671875 ,  156095.703125  ,
        163756.453125  ,  193317.3125    ,  267039.6875    ,
        222028.59375   ,  102711.0078125 ,  105135.0234375 ,
        114917.6171875 ,  160734.65625   ,  192488.90625   ,
        236514.90625   ,

# Submit

In [36]:
# Every row after the first train_idx rows of the combined frame belongs to the test set.
X_test = df[train_idx:]

In [37]:
# Shape of the test feature frame.
X_test.shape

(1459, 311)

In [42]:
# Predict on the test rows. .values replaces DataFrame.as_matrix(), which was removed
# in pandas 1.0, and hands the model the same kind of NumPy array.
predictions = XGB.predict(X_test.values)

In [46]:
# The target was log1p-transformed earlier in the notebook, so expm1 maps the predictions back.
predicted_prices = np.expm1(predictions)

In [48]:
# Pair each test-set Id with its predicted price.
my_submission = pd.DataFrame({
    'Id': test_df['Id'],
    'SalePrice': predicted_prices,
})
# Any filename works; 'submission.csv' is used here. The row index is not written.
my_submission.to_csv('submission.csv', index=False)


