In [39]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_log_error

# Load the two prepared CSV files from the working directory: the labelled
# training set and the test set that is later used without a SalePrice column.
train_df_full, test_df = (
    pd.read_csv(csv_name) for csv_name in ("new_train_1h.csv", "new_test_1h.csv")
)

In [40]:
# Hold-out split: a reproducible 60% sample is used for training and the rows
# that were not sampled form the evaluation set.
train_df = train_df_full.sample(frac=0.6, random_state=200)
evaluate_df = train_df_full.drop(index=train_df.index)

# 'Id' and 'SalePrice' are removed from every feature matrix; 'SalePrice' is
# kept separately as the target.
non_feature_columns = ['Id', 'SalePrice']

X_train = train_df.drop(columns=non_feature_columns)
Y_train = train_df["SalePrice"]
X_train_full = train_df_full.drop(columns=non_feature_columns)
Y_train_full = train_df_full["SalePrice"]
X_eval = evaluate_df.drop(columns=non_feature_columns)
Y_eval = evaluate_df["SalePrice"]
X_test = test_df.drop(columns='Id')

## Drop Columns (list from House Prices - Column Selection)

In [41]:
# Columns removed from every feature matrix before modelling.
orig_drop_list = ['Neighborhood', 'QuarterMean', 'MonthMean']

# Longer candidate list kept for reference only. It used to be assigned to
# `drop_list` and was then immediately overwritten by the shorter list below,
# so it was never applied; it now has its own name to make that explicit.
candidate_drop_list = [
    'BldgType_4', 'Neighborhood_3', 'Foundation_4', 'Neighborhood_2',
    'Foundation_3', 'Foundation_2', 'Foundation_1', 'GarageType_4',
    'GarageType_3', 'GarageType_2', 'GarageType_1', 'LotConfig_2',
    'LotConfig_5', 'GarageFinish_3', 'GarageFinish_2', 'Neighborhood_23',
    'BedroomAbvGr', 'LotConfig_3', 'KitchenAbvGr', 'Neighborhood_24',
    'GarageFinish_1', 'Neighborhood_21', 'LotConfig_4', 'Neighborhood_6',
    'Neighborhood_20', 'BsmtHalfBath', 'Neighborhood_9', 'Neighborhood_18',
    'BldgType_3', 'Neighborhood_19', 'Neighborhood_22', 'HeatingQC',
    'LowQualFinSF', 'Neighborhood_15', 'Neighborhood_16', 'Neighborhood_25',
    'SaleCondition_2', 'PavedDrive', 'BsmtFinSF2', 'Neighborhood_14',
    'BldgType_5', 'Neighborhood_12', 'Neighborhood_17', 'GarageQual',
    'Neighborhood_13', 'BsmtFinType2', 'HouseStyle_2', 'BsmtCond',
    'Neighborhood_10', 'Neighborhood_11', 'Neighborhood_7', 'Neighborhood_8',
    'BldgType_2', 'Neighborhood_5', 'HouseStyle_8', 'Fireplaces',
    'HouseStyle_7', 'HouseStyle_6', 'HouseStyle_5', 'HouseStyle_4',
    'HouseStyle_3', 'ExterCond',
]

# The list that is actually applied.
drop_list = ['BldgType_4', 'Neighborhood_3', 'Foundation_4', 'Neighborhood_2',
             'Foundation_3', 'Foundation_2', 'Foundation_1', 'GarageType_4',
             'GarageType_3', 'GarageType_2', 'GarageType_1', 'LotConfig_2']

# Drop both lists from all four feature matrices. Rebinding the names instead
# of using inplace=True keeps each statement a plain "new frame from old frame"
# step; a missing column still raises KeyError, as before.
columns_to_drop = orig_drop_list + drop_list

X_train = X_train.drop(columns=columns_to_drop)
X_eval = X_eval.drop(columns=columns_to_drop)
X_train_full = X_train_full.drop(columns=columns_to_drop)
X_test = X_test.drop(columns=columns_to_drop)


# new_column_list = ['QuarterMeanNorm', 'MonthMeanNorm', 'AdjRemodAdd', \
#                    'OverallQualNCond', 'TotalSquareFeet', 'GarageCars', \
#                    'TotRmsAbvGrd', 'LotArea', 'Neighborhood', \
#                    'FullBath', 'BedroomAbvGr', 'KitchenQual', \
#                    'BsmtQual', 'ExterQual']
# X_train = X_train[new_column_list]
# X_eval = X_eval[new_column_list]
# X_train_full = X_train_full[new_column_list]
# X_test = X_test[new_column_list]

## Stacked ensemble: K-fold out-of-fold base models and an XGBoost meta-model

In [42]:
# Inputs for the ensemble: fit on the 60% training split and predict on the
# hold-out evaluation split.
X_ens_train, y_ens_train, X_ens_test = X_train, Y_train, X_eval

# X_ens_train = X_train_full
# y_ens_train = Y_train_full
# X_ens_test = X_test

In [43]:
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor

# Row counts of the ensemble inputs and the cross-validation settings.
ntrain, ntest = len(X_ens_train), len(X_ens_test)
num_folds, seed, num_est = 5, 300, 100

# Shuffled K-fold splitter shared by all base models.
kf = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

class SklearnHelper(object):
    """Thin wrapper that builds an estimator with a fixed random seed.

    Parameters
    ----------
    clf : callable
        Estimator class (or factory); it is called as ``clf(**params)``.
    seed : int, default 0
        Value passed to the estimator as ``random_state``. It overrides any
        ``random_state`` entry already present in ``params``.
    params : dict or None, default None
        Keyword arguments for the estimator. The dict is copied, so the
        caller's object is never modified; ``None`` means "no extra arguments".
    """

    def __init__(self, clf, seed=0, params=None):
        # Copy before adding the seed: the original code wrote into the
        # caller's dict and crashed with TypeError when params was None.
        estimator_kwargs = dict(params) if params is not None else {}
        estimator_kwargs['random_state'] = seed
        self.clf = clf(**estimator_kwargs)

    def train(self, x_train, y_train):
        """Fit the wrapped estimator; returns None."""
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        """Return the wrapped estimator's predictions for ``x``."""
        return self.clf.predict(x)

    def fit(self, x, y):
        """Fit the wrapped estimator and return whatever its ``fit`` returns."""
        return self.clf.fit(x, y)

    def feature_importances(self, x, y):
        """Refit the wrapped estimator on ``(x, y)`` and return its importances.

        Note that this replaces any previously fitted state of the estimator.
        """
        return self.clf.fit(x, y).feature_importances_


def get_oof(clf, x_train, y_train, x_test, folds=None):
    """Compute out-of-fold (OOF) predictions for one base model.

    For every fold the model is trained on the fold's training rows, then used
    to predict the fold's held-out rows of ``x_train`` and all of ``x_test``.

    Parameters
    ----------
    clf : object
        Model wrapper exposing ``train(x, y)`` and ``predict(x)``.
    x_train, y_train : pandas objects
        Training features and target; rows are selected positionally with
        ``.iloc``.
    x_test : array-like with ``shape``
        Rows to predict with every fold's model.
    folds : splitter, optional
        Object with ``split(x)`` and ``get_n_splits()``. Defaults to the
        notebook-level ``kf``.

    Returns
    -------
    (oof_train, oof_test) : tuple of numpy.ndarray
        ``oof_train`` has shape ``(len(x_train), 1)`` and holds each training
        row's prediction from the fold that held it out. ``oof_test`` has
        shape ``(len(x_test), 1)`` and is the mean of the per-fold test
        predictions.
    """
    splitter = kf if folds is None else folds

    # Sizes are taken from the arguments rather than from the notebook globals
    # ntrain / ntest / num_folds, which can be stale if the inputs were changed
    # without re-running the configuration cell.
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    n_splits = splitter.get_n_splits()

    oof_train = np.zeros((n_train,))
    oof_test_skf = np.empty((n_splits, n_test))

    for i, (train_index, test_index) in enumerate(splitter.split(x_train)):
        x_tr = x_train.iloc[train_index]
        y_tr = y_train.iloc[train_index]
        x_te = x_train.iloc[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

# Hyper-parameters for the four base models. The "value = number" notes are the
# author's recorded results for the values tried (presumably evaluation error;
# "na" presumably means the parameter was left unset -- confirm before relying
# on them).

# Random Forest. Not set here: warm_start, min_samples_leaf, verbose.
rf_params = dict(
    n_jobs=10,
    n_estimators=2000,  # 100 = .1449, 1000 = .1443, 2000 = .1440
    max_features=0.3,   # na = .1543, 0.3 = .1464 ('sqrt' was also considered)
    max_depth=15,       # na = .1464, 10 = 0.1463, 20 = .1461, 15 = 0.1449
)

# Extra Trees. Not set here: n_jobs, max_features, min_samples_leaf, verbose.
et_params = dict(
    n_estimators=60,
    max_depth=10,
)

# AdaBoost. learning_rate is not set (noted: na = 0.1799, 0.75 = 0.182).
ad_params = dict(
    n_estimators=100,  # 100 = 0.1799
)

# Gradient Boosting. Not set here: max_depth, min_samples_leaf, verbose.
gb_params = dict(
    n_estimators=500,  # 100 = 0.1442, 200 = 0.1415, 500 = 0.1407, 1000 = 0.1409
    max_features=0.3,  # 0.2 = 0.1371, 0.3 = 0.1344, 0.4 = 0.1358
)

# One seeded wrapper per base model.
rf = SklearnHelper(RandomForestRegressor, seed, rf_params)
et = SklearnHelper(ExtraTreesRegressor, seed, et_params)
ad = SklearnHelper(AdaBoostRegressor, seed, ad_params)
gb = SklearnHelper(GradientBoostingRegressor, seed, gb_params)

In [44]:
# et_params = {
#     #'n_jobs': -1,
#     'n_estimators':60,
#     #'max_features': 0.9,
#     'max_depth': 10,
#     #'min_samples_leaf': 4,
#     #'verbose': 1
# }

# et = SklearnHelper(clf=ExtraTreesRegressor, seed=seed, params=et_params)
# et_oof_train, et_oof_test = get_oof(et, X_ens_train, y_ens_train, X_ens_test) # Extra Trees
# acc_train = np.sqrt(mean_squared_log_error(y_ens_train, et.predict(X_ens_train)))
# acc_eval = np.sqrt(mean_squared_log_error(Y_eval, et.predict(X_ens_test)))
# print (f'ET: Train = {acc_train}  Evaluate = {acc_eval}')

In [45]:
# rf_params = {
#     'n_jobs': 10,
#     'n_estimators': 2000, #100 = .1449, 1000= .1443 , 2000= .1440
#     #'warm_start': True, 
#     'max_features': 0.3, #na = .1543, 0.3= .1464
#     'max_depth': 15, #na = .1464, 10 = 0.1463, 20 = .1461, 15 = 0.1449
#     #'min_samples_leaf': 1, #na = 0.1449
#     #'max_features' : 'sqrt',
#     #'verbose': 0
# }

# rf = SklearnHelper(clf=RandomForestRegressor, seed=seed, params=rf_params)

# rf_oof_train, rf_oof_test = get_oof(rf, X_ens_train, y_ens_train, X_ens_test) # Random Forest
# acc_train = np.sqrt(mean_squared_log_error(y_ens_train, rf.predict(X_ens_train)))
# acc_eval = np.sqrt(mean_squared_log_error(Y_eval, rf.predict(X_ens_test)))
# print (f'RF: Train = {acc_train}  Evaluate = {acc_eval}')

In [46]:
# ad_params = {
#     'n_estimators': 100, #100 = 0.1799
#     #learning_rate' : 3 #na = 0.1799, 0.75=0.182
# }

# ad = SklearnHelper(clf=AdaBoostRegressor, seed=seed, params=ad_params)

# ad_oof_train, ad_oof_test = get_oof(ad, X_ens_train, y_ens_train, X_ens_test) # AdaBoost
# acc_train = np.sqrt(mean_squared_log_error(y_ens_train, ad.predict(X_ens_train)))
# acc_eval = np.sqrt(mean_squared_log_error(Y_eval, ad.predict(X_ens_test)))
# print (f'AD: Train = {acc_train}  Evaluate = {acc_eval}')

In [47]:
# gb_params = {
#     'n_estimators': 500, #100 = 0.1442, 200 = 0.1415, 500 = 0.1407, 1000 = 0.1409
#     'max_features': 0.3, #0.2 = 0.1371, 0.3 = 0.1344, 0.4 = 0.1358
#     #'max_depth': 3,
#     #'min_samples_leaf': 5,
#     #'verbose': 0
# }

# gb = SklearnHelper(clf=GradientBoostingRegressor, seed=seed, params=gb_params)
# gb_oof_train, gb_oof_test = get_oof(gb, X_ens_train, y_ens_train, X_ens_test) # Gradient Boost
# acc_train = np.sqrt(mean_squared_log_error(y_ens_train, gb.predict(X_ens_train)))
# acc_eval = np.sqrt(mean_squared_log_error(Y_eval, gb.predict(X_ens_test)))
# print (f'GB: Train = {acc_train}  Evaluate = {acc_eval}')

In [48]:
# Out-of-fold predictions for every base model, computed in the order
# Random Forest, Extra Trees, AdaBoost, Gradient Boost.
oof_by_model = [
    get_oof(helper, X_ens_train, y_ens_train, X_ens_test)
    for helper in (rf, et, ad, gb)
]
(
    (rf_oof_train, rf_oof_test),
    (et_oof_train, et_oof_test),
    (ad_oof_train, ad_oof_test),
    (gb_oof_train, gb_oof_test),
) = oof_by_model

print("Training is complete")

Training is complete


### Evaluation of the base models

In [49]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred``."""
    return np.sqrt(mean_squared_log_error(y_true, y_pred))


# Score each base model on the training rows and on the evaluation rows.
# Caveats:
#  * After get_oof, each helper holds the estimator fitted on the last
#    cross-validation fold only, so "Train" includes rows that fold held out.
#  * Y_eval matches X_ens_test only while X_ens_test is the evaluation split.
for label, helper in (('ET', et), ('RF', rf), ('AD', ad), ('GB', gb)):
    acc_train = rmsle(y_ens_train, helper.predict(X_ens_train))
    acc_eval = rmsle(Y_eval, helper.predict(X_ens_test))
    print(f'{label}: Train = {acc_train}  Evaluate = {acc_eval}')

ET: Train = 0.06612530712862164  Evaluate = 0.13979548348912774
RF: Train = 0.0767037179452672  Evaluate = 0.1439309764319522
AD: Train = 0.15423272069653246  Evaluate = 0.18155088198510075
GB: Train = 0.06560919622504459  Evaluate = 0.1366891392196188


### Feature data

In [50]:
# Refit every base model on the complete ensemble training data and collect
# its feature importances (order: RF, ET, AdaBoost, Gradient Boost).
rf_features, et_features, ad_features, gb_features = (
    helper.feature_importances(X_ens_train, y_ens_train)
    for helper in (rf, et, ad, gb)
)

cols = X_ens_train.columns.values

# One row per feature, one importance column per model.
importance_columns = {
    'features': cols,
    'Random Forest feature importances': rf_features,
    'Extra Trees  feature importances': et_features,
    'AdaBoost feature importances': ad_features,
    'Gradient Boost feature importances': gb_features,
}
feature_dataframe = pd.DataFrame(importance_columns)

In [51]:
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

def plot_feature_importance(importance_column, title):
    """Render a scatter plot of one model's feature importances.

    Parameters
    ----------
    importance_column : str
        Name of the column in ``feature_dataframe`` that holds the importances.
    title : str
        Figure title.

    The feature names are used for the x axis and the hover text; the
    importances are used for the y axis and the marker colour.
    """
    importances = feature_dataframe[importance_column].values
    feature_names = feature_dataframe['features'].values

    trace = go.Scatter(
        y=importances,
        x=feature_names,
        mode='markers',
        marker=dict(
            sizemode='diameter',
            sizeref=1,
            size=25,
            color=importances,
            colorscale='Portland',
            showscale=True,
        ),
        text=feature_names,
    )
    layout = go.Layout(
        autosize=True,
        title=title,
        hovermode='closest',
        yaxis=dict(
            title='Feature Importance',
            ticklen=5,
            gridwidth=2,
        ),
        showlegend=False,
    )
    fig = go.Figure(data=[trace], layout=layout)
    py.iplot(fig, filename='scatter2010')


# One plot per base model, replacing four copy-pasted plotting blocks.
for importance_column, plot_title in (
    ('Random Forest feature importances', 'Random Forest Feature Importance'),
    ('Extra Trees  feature importances', 'Extra Trees Feature Importance'),
    ('AdaBoost feature importances', 'AdaBoost Feature Importance'),
    ('Gradient Boost feature importances', 'Gradient Boosting Feature Importance'),
):
    plot_feature_importance(importance_column, plot_title)

In [52]:
# Row-wise mean of the per-model importance columns. The text column 'features'
# (and a 'mean' column left over from an earlier run of this cell) is excluded
# explicitly: averaging across a frame that contains strings raises TypeError
# on pandas >= 2.0.
feature_dataframe['mean'] = (
    feature_dataframe
    .drop(columns=['features', 'mean'], errors='ignore')
    .mean(axis=1)
)

# Bar chart of the mean importance per feature; bars are coloured by the same
# mean value.
y_mean = feature_dataframe['mean'].values
x_feat = feature_dataframe['features'].values

bar_marker = dict(
    color=feature_dataframe['mean'].values,
    colorscale='Portland',
    showscale=True,
    reversescale=False,
)
mean_bar = go.Bar(x=x_feat, y=y_mean, width=0.5, marker=bar_marker, opacity=0.6)
data = [mean_bar]

y_axis_style = dict(title='Feature Importance', ticklen=5, gridwidth=2)
layout = go.Layout(
    autosize=True,
    title='Barplots of Mean Feature Importance',
    hovermode='closest',
    yaxis=y_axis_style,
    showlegend=False,
)

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='bar-direct-labels')

In [53]:
# Out-of-fold training predictions, one column per base model.
oof_columns = {
    'RandomForest': rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ad_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel(),
}
base_predictions_train = pd.DataFrame(oof_columns)
# Not the last expression of the cell, so this preview is not displayed.
base_predictions_train.head()

# Heatmap of the pairwise correlation between the models' predictions.
model_names = base_predictions_train.columns.values
prediction_corr = base_predictions_train.astype(float).corr().values
correlation_heatmap = go.Heatmap(
    z=prediction_corr,
    x=model_names,
    y=model_names,
    colorscale='Viridis',
    showscale=True,
    reversescale=True,
)
data = [correlation_heatmap]
py.iplot(data, filename='labelled-heatmap')

### Final Training with XGBoost

In [54]:
# Meta-model inputs: the base models' predictions placed side by side, in the
# column order Extra Trees, Random Forest, AdaBoost, Gradient Boost.
train_prediction_blocks = (et_oof_train, rf_oof_train, ad_oof_train, gb_oof_train)
test_prediction_blocks = (et_oof_test, rf_oof_test, ad_oof_test, gb_oof_test)

x_ensembled_train = np.concatenate(train_prediction_blocks, axis=1)
x_ensembled_test = np.concatenate(test_prediction_blocks, axis=1)

In [55]:
import xgboost as xgb

# Second-level model: XGBoost regressor fitted on the base models' predictions.
# The "value = number" notes are the author's recorded results for values tried.
xgb_settings = dict(
    learning_rate=0.2,   # 0.2=0.1364, 0.5=0.1476, 1=0.1699
    n_estimators=20,     # 20=0.1377, 50=0.1413, 100=0.1462, 200=0.1503, 500=0.1544, 5000=0.1552
    max_depth=4,
    min_child_weight=4,
    gamma=0.9,           # 0.9=0.1356 (gamma=1 was also tried)
    subsample=0.8,
    colsample_bytree=0.8,
    objective='reg:squarederror',
    nthread=-1,
    scale_pos_weight=1,
)
gbm = xgb.XGBRegressor(**xgb_settings)
gbm.fit(x_ensembled_train, y_ens_train)

predictions = gbm.predict(x_ensembled_test)

### Final evaluation

In [56]:
import xgboost as xgb

# Score the stacked model on the evaluation split. `predictions` comes from the
# XGBoost meta-model fitted in the previous cell; this cell used to build and
# fit an identical model a second time, which only repeated the same work.
# Y_eval matches `predictions` only while X_ens_test is the evaluation split.
acc_eval = np.sqrt(mean_squared_log_error(Y_eval, predictions))
print (f'Evaluate = {acc_eval}')

Evaluate = 0.13669131407055166


### Output results

In [57]:
# output = pd.DataFrame({'Id': test_df.Id, 'SalePrice': predictions})
# output.to_csv('my_submission2.csv', index=False)