In [1]:
# General
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split


# Modeling Tools
from sklearn.grid_search import GridSearchCV

# Models
from sklearn.ensemble import (ExtraTreesRegressor, RandomForestRegressor, 
                              AdaBoostRegressor, GradientBoostingRegressor)
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.neighbors import (KNeighborsRegressor, RadiusNeighborsRegressor)
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor



## Functions

In [2]:
#Create Grid Search Tool
class EstimatorSelectionHelper:
    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=1, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs, 
                              verbose=verbose, scoring=scoring, refit=refit)
            gs.fit(X,y)
            self.grid_searches[key] = gs    

    def score_summary(self, sort_by='mean_score'):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'min_score': min(scores),
                 'max_score': max(scores),
                 'mean_score': np.mean(scores),
                 'std_score': np.std(scores),
            }
            return pd.Series({**params,**d})

        rows = [row(k, gsc.cv_validation_scores, gsc.parameters) 
                     for k in self.keys
                     for gsc in self.grid_searches[k].grid_scores_]
        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

## Modelling

In [8]:
train = pd.read_csv("train_model.csv", index_col=0)
validation = pd.read_csv("test_model.csv", index_col=0)

In [32]:
#Pre-set qualifiers for the grid search
models = {
     #'RandomForestRegressor': RandomForestRegressor(),
      'GradientBoostingRegressor': GradientBoostingRegressor(),
     #'ExtraTreesRegressor': ExtraTreesRegressor(),
     #'AdaBoostRegressor': AdaBoostRegressor(),
     'XGBRegressor' : XGBRegressor()
}

params = {
    #'RandomForestRegressor': {'n_estimators': [10, 20, 40, 60, 75, 100]},
     'GradientBoostingRegressor': {'n_estimators': [10, 20, 40, 60, 75, 100],
                                   'learning_rate': [.1, .25, .5, .75, .9],
                                   'loss' : ['ls', 'lad', 'huber', 'quantile'], 
                                   'max_depth' : [1, 3, 5, 7, 10],
                                   'subsample': [.5, .75, 1]},
    #'ExtraTreesRegressor': {'n_estimators': [10, 20, 40, 60, 75, 100] },    
    #'AdaBoostRegressor': {'n_estimators': [10, 20, 40, 60, 75, 100],
    #                       'learning_rate': [.1, .25, .5, .75, .9]},
    'XGBRegressor': {'max_depth':[4 ,5, 6],
                     #'gamma':[1, 2, 3, 5, 10],
                     'min_child_weight':[.5, .75],
                     'subsample': [.5],
                     'learning_rate': [.06, .05, .04]},

}

In [33]:
train, test = train_test_split(train, test_size = 0.2)

In [34]:
# Further split training and test into dependent (y), vs independent (x) variables
train_y=train['SalePrice']
train_x=train.drop(['SalePrice', "Id"], axis = 1)

test_y=test['SalePrice']
test_x=test.drop(['SalePrice', "Id"], axis = 1)

In [35]:
#Feed each product set a bunch of models and run the grid search
helper = EstimatorSelectionHelper(models, params)
helper.fit(train_x, train_y, scoring='neg_mean_absolute_error', n_jobs=-1)
scores = helper.score_summary()


Running GridSearchCV for GradientBoostingRegressor.
Fitting 3 folds for each of 1800 candidates, totalling 5400 fits


[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done 439 tasks      | elapsed:   24.7s
[Parallel(n_jobs=-1)]: Done 689 tasks      | elapsed:   42.4s
[Parallel(n_jobs=-1)]: Done 1039 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1489 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 2039 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 2824 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 3650 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 4500 tasks      | elapsed:  4.9min


Running GridSearchCV for XGBRegressor.
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done 5400 out of 5400 | elapsed:  5.9min finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    3.5s finished


In [36]:
scores.head(20)

Unnamed: 0,estimator,min_score,mean_score,max_score,std_score,learning_rate,loss,max_depth,min_child_weight,n_estimators,subsample
124,GradientBoostingRegressor,-20523.7,-18405.8,-14922.2,2482.33,0.1,lad,3,,100.0,0.75
155,GradientBoostingRegressor,-19987.3,-18423.6,-15430.8,2116.93,0.1,lad,7,,60.0,1.0
160,GradientBoostingRegressor,-20177.5,-18477.3,-15675.7,1996.09,0.1,lad,7,,100.0,0.75
231,GradientBoostingRegressor,-20104.7,-18569.0,-16082.8,1774.16,0.1,huber,5,,100.0,0.5
249,GradientBoostingRegressor,-19542.6,-18576.5,-17305.2,938.597,0.1,huber,7,,100.0,0.5
48,GradientBoostingRegressor,-20493.9,-18589.9,-17142.0,1405.87,0.1,ls,5,,75.0,0.5
138,GradientBoostingRegressor,-20272.2,-18606.0,-16258.1,1708.2,0.1,lad,5,,75.0,0.5
142,GradientBoostingRegressor,-20276.4,-18615.5,-15794.4,2005.25,0.1,lad,5,,100.0,0.75
123,GradientBoostingRegressor,-19823.4,-18619.7,-16583.6,1447.73,0.1,lad,3,,100.0,0.5
1811,XGBRegressor,-20660.0,-18628.6,-16471.5,1712.28,0.05,,6,0.75,,0.5


In [41]:
#Fit the model
#clf = XGBRegressor(gamma = 5,  max_depth = 3, min_child_weight = .5) #best model so far
#clf = XGBRegressor(gamma = 1,  learning_rate = .06, max_depth = 5, min_child_weight = .5, subsample = .5)
clf = GradientBoostingRegressor(learning_rate = .1, loss = 'lad', max_depth = 3, n_estimators = 100,
                                subsample = 1)
model=clf.fit(train_x, train_y)

In [44]:
#Scoring
submission = pd.DataFrame(validation["Id"])
submission["SalePrice"] = clf.predict(validation.fillna(0).drop("Id", axis = 1))

In [45]:
submission.to_csv("submission_03042018_2.csv", index = False)