First things first: importing all relevant libraries and functions

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.cross_validation as cv
from sklearn.metrics import make_scorer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
import scipy.stats
from sklearn.linear_model import Ridge
import my_repository as my
from sklearn.cross_validation import KFold

def rmsle(y_true, y_pred, y_type=None):
    """Root mean squared logarithmic error between two value sequences.

    Computes ``sqrt(sum((log(1 + y_pred) - log(1 + y_true)) ** 2) / len(y_true))``.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values; must have the same shape as ``y_true``.
    y_type : object, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    float
        The RMSLE of ``y_pred`` with respect to ``y_true``.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same shape (previously a
        longer ``y_pred`` was silently truncated).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_pred.shape:
        raise ValueError('y_true and y_pred must have the same shape, got %s and %s'
                         % (y_true.shape, y_pred.shape))
    # Vectorised; log1p(x) is log(1 + x) and stays accurate for small x.
    squared_log_diff = (np.log1p(y_pred) - np.log1p(y_true)) ** 2
    return np.sqrt(np.sum(squared_log_diff) / len(y_true))

In [2]:
# Read the raw train/test CSVs once and keep them untouched; the working
# copies `data` and `test` are what the preprocessing cells modify.
original_data, original_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)
data, test = original_data.copy(), original_test.copy()

Which features have missing values, and how many?

In [3]:
# Count NaNs per column, then keep only the columns that actually have some.
data_null_counts = data.isnull().sum()
test_null_counts = test.isnull().sum()
dataNaN = data_null_counts[data_null_counts > 0]
testNaN = test_null_counts[test_null_counts > 0]

#print dataNaN
#print testNaN

In [4]:
#dataNaN

In [5]:
#testNaN

In [6]:
#data.boxplot('LotFrontage', by='Neighborhood', figsize=(16, 7), fontsize=8)
#test.boxplot('LotFrontage', by='Neighborhood', figsize=(16, 7), fontsize=8)
#data.boxplot('LotArea', by='Neighborhood', figsize=(16, 7), fontsize=8)
#test.boxplot('LotArea', by='Neighborhood', figsize=(16, 7), fontsize=8)

In [7]:
#data.plot(x='LotFrontage', y='SalePrice', kind='scatter', figsize=(3, 3), fontsize=5)
#data.plot(x='LotArea', y='SalePrice', kind='scatter', figsize=(3, 3), fontsize=5)#

In [8]:
#len(set(data['Neighborhood']))

In [9]:
#fig, axes = plt.subplots(5,5, figsize=(25,25))
#for (neigh, group), ax in zip(data.groupby('Neighborhood'), axes.flatten()):
#    group.plot(x='LotFrontage', y='SalePrice', kind='scatter', ax=ax, title=neigh, fontsize=6)

In [10]:
#neighs = list(data['Neighborhood'].unique())
#neighs.sort()

In [11]:
#data.groupby('Neighborhood')[['LotFrontage','SalePrice']].corr().ix[0::2,'SalePrice']

In [12]:
#data.groupby('Neighborhood')[['LotArea','SalePrice']].corr().ix[0::2,'SalePrice']

Now, I'm going to do two things:  

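First, treatment of missing values. For numeric variables, a missing value will receive the median value of its variable over its Neighborhood (in the test set, a value that is still missing afterwards receives the overall mean of its variable). For categorical variables, it will simply receive "N/A".  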

Second, use one-hot encoding to transform categorical variables into several binary variables. Then, drop the original categorical variables.

In [13]:
dropfeatures = ['Id', 'GarageYrBlt']#, 'YrSold', 'YearBuilt', 'YearRemodAdd'


def _fill_with_neighborhood_median(frame, columns):
    """Fill NaNs in each of `columns` with the column's median over the row's Neighborhood.

    `frame` is modified in place. A value stays NaN when the column has no
    non-missing value at all within that row's Neighborhood.
    """
    for column in columns:
        neighborhood_median = frame.groupby('Neighborhood')[column].transform('median')
        frame[column] = frame[column].fillna(neighborhood_median)


# --- Training set ---------------------------------------------------------
_fill_with_neighborhood_median(data, ['LotFrontage', 'MasVnrArea'])
# Cast MSSubClass to str so that pd.get_dummies one-hot encodes it instead of
# passing it through as a number.
data['MSSubClass'] = data['MSSubClass'].astype(str)
# Every remaining missing value becomes the literal category 'N/A'.
data = data.fillna('N/A')
dataId = data['Id']
prices = data['SalePrice']
features = data.drop(dropfeatures, axis=1)
features = features.drop('SalePrice', axis=1)
features_before_dummies = features
features = pd.get_dummies(features) #, drop_first=True)

# --- Test set -------------------------------------------------------------
# Numeric columns to impute in the test set. 'LotFrontage' used to be listed
# twice, which made test[tlist] select a duplicated column.
tlist = ['LotFrontage', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'GarageArea',
         'BsmtFullBath', 'BsmtHalfBath', 'GarageCars']
_fill_with_neighborhood_median(test, tlist)
# Fallback for values the neighborhood median could not fill: overall column mean.
# NOTE(review): the test set is imputed from its own statistics, not the
# training set's - confirm this is intended.
test[tlist] = test[tlist].fillna(test[tlist].mean())
test = test.fillna('N/A')
test_id = test['Id']
test_features = test.drop(dropfeatures, axis=1)
test_features['MSSubClass'] = test_features['MSSubClass'].astype(str)
test_features_before_dummies = test_features
test_features = pd.get_dummies(test_features) #, drop_first=True)

In [14]:
# Re-count NaNs per column after imputation, keeping only columns that still
# have some, and print the result for the training and test frames.
dataNaN = data.isnull().sum()
dataNaN = dataNaN[dataNaN > 0]
testNaN = test.isnull().sum()
testNaN = testNaN[testNaN > 0]

# Parenthesised single-argument print: same output on Python 2, valid on Python 3.
print(dataNaN)
print(testNaN)
# Shapes of the one-hot encoded train/test feature frames.
# Parenthesised single-argument print: same output on Python 2, valid on Python 3.
print(features.shape)
print(test_features.shape)

Series([], dtype: int64)
Series([], dtype: int64)
(1460, 317)
(1459, 306)


Some columns of `features` are not present in `test_features` and vice versa. This happens because some categorical values occur in only one of the two sets, so one-hot encoding produces different dummy columns. Let's drop the columns that are not present in both sets.

In [15]:
# Drop the dummy columns that exist in only one of the two frames.
dropfeat = list(set(features).difference(set(test_features)))
features = features.drop(dropfeat, axis=1)
dropfeat = list(set(test_features).difference(set(features)))
test_features = test_features.drop(dropfeat, axis=1)
# Both frames now hold the same set of columns; select the test columns in the
# training order so the two frames also line up column-by-column.
test_features = test_features[list(features.columns)]
# Parenthesised single-argument print: same output on Python 2, valid on Python 3.
print(features.shape)
print(test_features.shape)

(1460, 298)
(1459, 298)


In [16]:
# Count NaNs per column in the encoded feature frames, keeping only columns
# that have some, and print the result for both frames.
dataNaN = features.isnull().sum()
dataNaN = dataNaN[dataNaN > 0]
testNaN = test_features.isnull().sum()
testNaN = testNaN[testNaN > 0]
# Parenthesised single-argument print: same output on Python 2, valid on Python 3.
print(dataNaN)
print(testNaN)

Series([], dtype: int64)
Series([], dtype: int64)


Now, the fun begins. Let's do a log transform on our target variable, randomly partition the training set into 5 folds and prepare a data frame to contain the scores of every model which will be found.

In [17]:
# log(1 + SalePrice); the searches fitted on this target are called with
# y_type='log1p' further down.
logSalePrice = np.log1p(prices)

In [18]:
# 5 shuffled folds over the training rows; `kf` is passed to every model
# search and ensemble evaluation below, with a fixed random_state.
kf = KFold(n=len(features), n_folds=5, shuffle=True, random_state=666)

In [19]:
# Empty score tables (one per target variant) that later cells append to.
score_columns = ['model', 'params', 'RMSLE']
model_scores, model_scores_log = (
    pd.DataFrame(columns=score_columns),
    pd.DataFrame(columns=score_columns),
)

Creating/finding a Ridge Regression model, both for the original SalePrice and for the log-transformed variable...

In [20]:
# Ridge regression: randomized search over alpha, once per target variant.
ridge_def = Ridge(normalize=True, max_iter=50000, random_state=666)
params = {'alpha': scipy.stats.uniform(loc=0.3, scale=5-0.3)}
search_kwargs = dict(cv=kf, grid_type='random', n_iter=100, n_jobs=-1)
score_columns = ['model', 'params', 'RMSLE']

# The numpy seed is reset right before each search; in the logged output both
# searches list the same alpha candidates.
np.random.seed(666)
ridge_model = my.make_model(features, prices, ridge_def, params, **search_kwargs)
np.random.seed(666)
ridge_log_model = my.make_model(features, logSalePrice, ridge_def, params,
                                y_type='log1p', **search_kwargs)

# best_score_ is stored with its sign flipped in the RMSLE column.
scores = pd.DataFrame(
    [['Ridge', str(ridge_model.best_params_), -ridge_model.best_score_]],
    columns=score_columns)
model_scores = model_scores.append(scores)
scores = pd.DataFrame(
    [['Ridge', str(ridge_log_model.best_params_), -ridge_log_model.best_score_]],
    columns=score_columns)
model_scores_log = model_scores_log.append(scores)

my.fit_submit(test_features, test_id, ridge_model,
              '../data/submissions/ridge.csv')
my.fit_submit(test_features, test_id, ridge_log_model,
              '../data/submissions/ridge_log.csv', price_type='log1p')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   19.3s


mean: -0.16207, std: 0.01020, params: {'alpha': 3.5920544727318231}
mean: -0.16782, std: 0.01114, params: {'alpha': 4.2676772213970429}
mean: -0.16112, std: 0.01003, params: {'alpha': 3.4796173789068394}
mean: -0.16317, std: 0.01039, params: {'alpha': 3.7209328690659516}
mean: -0.17208, std: 0.01176, params: {'alpha': 4.7718523999977949}
mean: -0.16013, std: 0.02430, params: {'alpha': 0.35970502606340787}
mean: -0.15111, std: 0.00794, params: {'alpha': 2.2438621842966602}
mean: -0.15053, std: 0.01144, params: {'alpha': 0.52942013086000017}
mean: -0.14642, std: 0.00724, params: {'alpha': 0.76966423820969365}
mean: -0.15455, std: 0.00874, params: {'alpha': 2.6879116371049303}
mean: -0.14552, std: 0.00633, params: {'alpha': 1.2411634348720442}
mean: -0.16382, std: 0.01050, params: {'alpha': 3.7975245934262558}
mean: -0.14544, std: 0.00630, params: {'alpha': 1.2065924142907436}
mean: -0.16209, std: 0.01020, params: {'alpha': 3.5939703351801122}
mean: -0.14736, std: 0.00693, params: {'alpha

[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   21.7s finished
[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:    8.8s finished


mean: -0.16634, std: 0.00969, params: {'alpha': 3.5920544727318231}
mean: -0.17176, std: 0.01011, params: {'alpha': 4.2676772213970429}
mean: -0.16542, std: 0.00964, params: {'alpha': 3.4796173789068394}
mean: -0.16738, std: 0.00976, params: {'alpha': 3.7209328690659516}
mean: -0.17571, std: 0.01050, params: {'alpha': 4.7718523999977949}
mean: -0.14469, std: 0.01621, params: {'alpha': 0.35970502606340787}
mean: -0.15521, std: 0.00986, params: {'alpha': 2.2438621842966602}
mean: -0.14420, std: 0.01500, params: {'alpha': 0.52942013086000017}
mean: -0.14472, std: 0.01368, params: {'alpha': 0.76966423820969365}
mean: -0.15889, std: 0.00958, params: {'alpha': 2.6879116371049303}
mean: -0.14740, std: 0.01184, params: {'alpha': 1.2411634348720442}
mean: -0.16800, std: 0.00980, params: {'alpha': 3.7975245934262558}
mean: -0.14716, std: 0.01195, params: {'alpha': 1.2065924142907436}
mean: -0.16635, std: 0.00969, params: {'alpha': 3.5939703351801122}
mean: -0.15064, std: 0.01071, params: {'alpha

Creating/finding a Gradient Boosting model, both for the original SalePrice and for the log-transformed variable...

In [21]:
# Gradient boosting: randomized search, once per target variant.
gbr_def = GradientBoostingRegressor(random_state=666)
params = {
    'min_samples_split': scipy.stats.randint(low=2, high=20),
    'n_estimators': scipy.stats.randint(low=50, high=5000),
    'max_depth': [3, 4, 5],
    'loss': ['ls', 'lad'],
}
search_kwargs = dict(cv=kf, grid_type='random', n_iter=100, n_jobs=-1)
score_columns = ['model', 'params', 'RMSLE']

# The numpy seed is reset right before each search; in the logged output both
# searches list the same parameter candidates.
np.random.seed(666)
gbr_model = my.make_model(features, prices, gbr_def, params, **search_kwargs)
np.random.seed(666)
gbr_log_model = my.make_model(features, logSalePrice, gbr_def, params,
                              y_type='log1p', **search_kwargs)

# best_score_ is stored with its sign flipped in the RMSLE column.
scores = pd.DataFrame(
    [['GBR', str(gbr_model.best_params_), -gbr_model.best_score_]],
    columns=score_columns)
model_scores = model_scores.append(scores)
scores = pd.DataFrame(
    [['GBR', str(gbr_log_model.best_params_), -gbr_log_model.best_score_]],
    columns=score_columns)
model_scores_log = model_scores_log.append(scores)

my.fit_submit(test_features, test_id, gbr_model,
              '../data/submissions/gbr.csv')
my.fit_submit(test_features, test_id, gbr_log_model,
              '../data/submissions/gbr_log.csv', price_type='log1p')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 36.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 83.8min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 93.7min finished


mean: -0.13463, std: 0.01367, params: {'min_samples_split': 15, 'loss': 'ls', 'n_estimators': 2000, 'max_depth': 5}
mean: -0.13556, std: 0.01230, params: {'min_samples_split': 11, 'loss': 'ls', 'n_estimators': 2512, 'max_depth': 5}
mean: -0.13074, std: 0.01111, params: {'min_samples_split': 16, 'loss': 'ls', 'n_estimators': 2835, 'max_depth': 4}
mean: -0.12977, std: 0.00898, params: {'min_samples_split': 16, 'loss': 'lad', 'n_estimators': 4634, 'max_depth': 3}
mean: -0.13142, std: 0.00898, params: {'min_samples_split': 14, 'loss': 'lad', 'n_estimators': 4097, 'max_depth': 4}
mean: -0.12806, std: 0.01086, params: {'min_samples_split': 13, 'loss': 'lad', 'n_estimators': 4170, 'max_depth': 4}
mean: -0.13099, std: 0.01245, params: {'min_samples_split': 18, 'loss': 'ls', 'n_estimators': 726, 'max_depth': 3}
mean: -0.13266, std: 0.01332, params: {'min_samples_split': 17, 'loss': 'ls', 'n_estimators': 2006, 'max_depth': 5}
mean: -0.13138, std: 0.01249, params: {'min_samples_split': 12, 'loss'

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  8.7min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 31.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 70.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 80.3min finished


mean: -0.13323, std: 0.00944, params: {'min_samples_split': 15, 'loss': 'ls', 'n_estimators': 2000, 'max_depth': 5}
mean: -0.13412, std: 0.01001, params: {'min_samples_split': 11, 'loss': 'ls', 'n_estimators': 2512, 'max_depth': 5}
mean: -0.12837, std: 0.01303, params: {'min_samples_split': 16, 'loss': 'ls', 'n_estimators': 2835, 'max_depth': 4}
mean: -0.12345, std: 0.01050, params: {'min_samples_split': 16, 'loss': 'lad', 'n_estimators': 4634, 'max_depth': 3}
mean: -0.12725, std: 0.01099, params: {'min_samples_split': 14, 'loss': 'lad', 'n_estimators': 4097, 'max_depth': 4}
mean: -0.12396, std: 0.01200, params: {'min_samples_split': 13, 'loss': 'lad', 'n_estimators': 4170, 'max_depth': 4}
mean: -0.12708, std: 0.01301, params: {'min_samples_split': 18, 'loss': 'ls', 'n_estimators': 726, 'max_depth': 3}
mean: -0.13183, std: 0.01246, params: {'min_samples_split': 17, 'loss': 'ls', 'n_estimators': 2006, 'max_depth': 5}
mean: -0.12758, std: 0.01298, params: {'min_samples_split': 12, 'loss'

Creating/finding a Random Forest model, both for the original SalePrice and for the log-transformed variable...

In [22]:
# Random forest: randomized search, once per target variant.
# NOTE(review): both the estimator and the search ask for n_jobs=-1; confirm
# that this nested parallelism is intended.
rf_def = RandomForestRegressor(random_state=666, n_jobs=-1)
params = {
    'min_samples_split': scipy.stats.randint(low=2, high=20),
    'n_estimators': scipy.stats.randint(low=50, high=5000),
    'bootstrap': [True, False],
}
search_kwargs = dict(cv=kf, grid_type='random', n_iter=100, n_jobs=-1)
score_columns = ['model', 'params', 'RMSLE']

# The numpy seed is reset right before each search; in the logged output both
# searches list the same parameter candidates.
np.random.seed(666)
rf_model = my.make_model(features, prices, rf_def, params, **search_kwargs)
np.random.seed(666)
rf_log_model = my.make_model(features, logSalePrice, rf_def, params,
                             y_type='log1p', **search_kwargs)

# best_score_ is stored with its sign flipped in the RMSLE column.
scores = pd.DataFrame(
    [['RandomForest', str(rf_model.best_params_), -rf_model.best_score_]],
    columns=score_columns)
model_scores = model_scores.append(scores)
scores = pd.DataFrame(
    [['RandomForest', str(rf_log_model.best_params_), -rf_log_model.best_score_]],
    columns=score_columns)
model_scores_log = model_scores_log.append(scores)

my.fit_submit(test_features, test_id, rf_model,
              '../data/submissions/rf.csv')
my.fit_submit(test_features, test_id, rf_log_model,
              '../data/submissions/rf_log.csv', price_type='log1p')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 20.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 90.5min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 223.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 255.5min finished


mean: -0.14398, std: 0.00907, params: {'min_samples_split': 4, 'n_estimators': 2000, 'bootstrap': True}
mean: -0.14454, std: 0.00876, params: {'min_samples_split': 8, 'n_estimators': 2043, 'bootstrap': True}
mean: -0.14433, std: 0.00899, params: {'min_samples_split': 6, 'n_estimators': 1519, 'bootstrap': True}
mean: -0.19933, std: 0.00789, params: {'min_samples_split': 16, 'n_estimators': 2835, 'bootstrap': False}
mean: -0.19755, std: 0.00778, params: {'min_samples_split': 18, 'n_estimators': 3168, 'bootstrap': False}
mean: -0.14480, std: 0.00890, params: {'min_samples_split': 9, 'n_estimators': 3191, 'bootstrap': True}
mean: -0.14671, std: 0.00832, params: {'min_samples_split': 14, 'n_estimators': 4097, 'bootstrap': True}
mean: -0.20091, std: 0.00892, params: {'min_samples_split': 7, 'n_estimators': 935, 'bootstrap': False}
mean: -0.14630, std: 0.00846, params: {'min_samples_split': 13, 'n_estimators': 4170, 'bootstrap': True}
mean: -0.14414, std: 0.00886, params: {'min_samples_split'

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 19.5min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 86.4min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 215.3min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 247.1min finished


mean: -0.14220, std: 0.01006, params: {'min_samples_split': 4, 'n_estimators': 2000, 'bootstrap': True}
mean: -0.14281, std: 0.00990, params: {'min_samples_split': 8, 'n_estimators': 2043, 'bootstrap': True}
mean: -0.14230, std: 0.01001, params: {'min_samples_split': 6, 'n_estimators': 1519, 'bootstrap': True}
mean: -0.18883, std: 0.00994, params: {'min_samples_split': 16, 'n_estimators': 2835, 'bootstrap': False}
mean: -0.18803, std: 0.01003, params: {'min_samples_split': 18, 'n_estimators': 3168, 'bootstrap': False}
mean: -0.14282, std: 0.01004, params: {'min_samples_split': 9, 'n_estimators': 3191, 'bootstrap': True}
mean: -0.14408, std: 0.00965, params: {'min_samples_split': 14, 'n_estimators': 4097, 'bootstrap': True}
mean: -0.19177, std: 0.01176, params: {'min_samples_split': 7, 'n_estimators': 935, 'bootstrap': False}
mean: -0.14381, std: 0.00970, params: {'min_samples_split': 13, 'n_estimators': 4170, 'bootstrap': True}
mean: -0.14218, std: 0.00981, params: {'min_samples_split'

Creating/finding an Extra Trees model, both for the original SalePrice and for the log-transformed variable...

In [23]:
# Extra trees: randomized search, once per target variant.
# NOTE(review): both the estimator and the search ask for n_jobs=-1; confirm
# that this nested parallelism is intended.
extra_def = ExtraTreesRegressor(random_state=666, n_jobs=-1)
params = {
    'min_samples_split': scipy.stats.randint(low=2, high=20),
    'n_estimators': scipy.stats.randint(low=50, high=5000),
    'bootstrap': [True, False],
}
search_kwargs = dict(cv=kf, grid_type='random', n_iter=100, n_jobs=-1)
score_columns = ['model', 'params', 'RMSLE']

# The numpy seed is reset right before each search; in the logged output both
# searches list the same parameter candidates.
np.random.seed(666)
extra_model = my.make_model(features, prices, extra_def, params, **search_kwargs)
np.random.seed(666)
extra_log_model = my.make_model(features, logSalePrice, extra_def, params,
                                y_type='log1p', **search_kwargs)

# best_score_ is stored with its sign flipped in the RMSLE column.
scores = pd.DataFrame(
    [['ExtraTrees', str(extra_model.best_params_), -extra_model.best_score_]],
    columns=score_columns)
model_scores = model_scores.append(scores)
scores = pd.DataFrame(
    [['ExtraTrees', str(extra_log_model.best_params_), -extra_log_model.best_score_]],
    columns=score_columns)
model_scores_log = model_scores_log.append(scores)

my.fit_submit(test_features, test_id, extra_model,
              '../data/submissions/extra.csv')
my.fit_submit(test_features, test_id, extra_log_model,
              '../data/submissions/extra_log.csv', price_type='log1p')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 14.1min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 61.8min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 153.6min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 175.9min finished


mean: -0.14780, std: 0.00971, params: {'min_samples_split': 4, 'n_estimators': 2000, 'bootstrap': True}
mean: -0.14876, std: 0.00969, params: {'min_samples_split': 8, 'n_estimators': 2043, 'bootstrap': True}
mean: -0.14811, std: 0.01002, params: {'min_samples_split': 6, 'n_estimators': 1519, 'bootstrap': True}
mean: -0.14930, std: 0.00982, params: {'min_samples_split': 16, 'n_estimators': 2835, 'bootstrap': False}
mean: -0.15026, std: 0.00980, params: {'min_samples_split': 18, 'n_estimators': 3168, 'bootstrap': False}
mean: -0.14929, std: 0.00966, params: {'min_samples_split': 9, 'n_estimators': 3191, 'bootstrap': True}
mean: -0.15165, std: 0.00942, params: {'min_samples_split': 14, 'n_estimators': 4097, 'bootstrap': True}
mean: -0.14723, std: 0.00981, params: {'min_samples_split': 7, 'n_estimators': 935, 'bootstrap': False}
mean: -0.15113, std: 0.00948, params: {'min_samples_split': 13, 'n_estimators': 4170, 'bootstrap': True}
mean: -0.14807, std: 0.01003, params: {'min_samples_split'

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 13.9min
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed: 61.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed: 152.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 174.2min finished


mean: -0.14367, std: 0.00842, params: {'min_samples_split': 4, 'n_estimators': 2000, 'bootstrap': True}
mean: -0.14462, std: 0.00878, params: {'min_samples_split': 8, 'n_estimators': 2043, 'bootstrap': True}
mean: -0.14392, std: 0.00853, params: {'min_samples_split': 6, 'n_estimators': 1519, 'bootstrap': True}
mean: -0.14614, std: 0.00977, params: {'min_samples_split': 16, 'n_estimators': 2835, 'bootstrap': False}
mean: -0.14685, std: 0.00982, params: {'min_samples_split': 18, 'n_estimators': 3168, 'bootstrap': False}
mean: -0.14494, std: 0.00848, params: {'min_samples_split': 9, 'n_estimators': 3191, 'bootstrap': True}
mean: -0.14703, std: 0.00838, params: {'min_samples_split': 14, 'n_estimators': 4097, 'bootstrap': True}
mean: -0.14428, std: 0.00987, params: {'min_samples_split': 7, 'n_estimators': 935, 'bootstrap': False}
mean: -0.14657, std: 0.00848, params: {'min_samples_split': 13, 'n_estimators': 4170, 'bootstrap': True}
mean: -0.14396, std: 0.00814, params: {'min_samples_split'

Now, it's time to combine the best models, using one averaging approach and testing against the original target variable.

In [24]:
# Combine the four tuned SalePrice models; no w_function is passed, so
# my.test_model_array uses its default weighting.
gbr_def_, ridge_def_, rf_def_, extra_def_ = (
    search.best_estimator_
    for search in (gbr_model, ridge_model, rf_model, extra_model)
)
models, weights, score = my.test_model_array(
    [gbr_def_, ridge_def_, rf_def_, extra_def_], features, prices, kf)

scores = pd.DataFrame(
    [['Combinations', 'Weighted average', score]],
    columns=['model', 'params', 'RMSLE'])
model_scores = model_scores.append(scores)

my.fit_submit_array(test_features, test_id, models, weights,
                    '../data/submissions/combinations_normalweights.csv')

Fold no. 1
-- Model 0 --
Training table score (RMSLE): 0.04358
Validation table score (RMSLE): 0.12472
-- Model 1 --
Training table score (RMSLE): 0.12461
Validation table score (RMSLE): 0.13767
-- Model 2 --
Training table score (RMSLE): 0.06363
Validation table score (RMSLE): 0.14135
-- Model 3 --
Training table score (RMSLE): 0.01298
Validation table score (RMSLE): 0.13474
--
Weighted training table score (RMSLE): 0.05354
Weighted validation table score (RMSLE): 0.12128
Current weights: [ 0.26926097  0.24392732  0.2375762   0.2492355 ]
 
Fold no. 2
-- Model 0 --
Training table score (RMSLE): 0.04975
Validation table score (RMSLE): 0.13540
-- Model 1 --
Training table score (RMSLE): 0.12417
Validation table score (RMSLE): 0.14362
-- Model 2 --
Training table score (RMSLE): 0.06410
Validation table score (RMSLE): 0.15593
-- Model 3 --
Training table score (RMSLE): 0.01173
Validation table score (RMSLE): 0.16223
--
Weighted training table score (RMSLE): 0.05731
Weighted validation tabl

Now, it's time to combine the best models, using the second averaging approach and testing against the original target variable.

In [25]:
# Combine the four tuned SalePrice models with the 'inv_sq_sum' weighting.
gbr_def_, ridge_def_, rf_def_, extra_def_ = (
    search.best_estimator_
    for search in (gbr_model, ridge_model, rf_model, extra_model)
)
models, weights, score = my.test_model_array(
    [gbr_def_, ridge_def_, rf_def_, extra_def_], features, prices, kf,
    w_function='inv_sq_sum')

scores = pd.DataFrame(
    [['Combinations', 'Inv Sq Sum', score]],
    columns=['model', 'params', 'RMSLE'])
model_scores = model_scores.append(scores)

my.fit_submit_array(test_features, test_id, models, weights,
                    '../data/submissions/combinations_invsqsumweights.csv')

Fold no. 1
-- Model 0 --
Training table score (RMSLE): 0.04358
Validation table score (RMSLE): 0.12472
-- Model 1 --
Training table score (RMSLE): 0.12461
Validation table score (RMSLE): 0.13767
-- Model 2 --
Training table score (RMSLE): 0.06363
Validation table score (RMSLE): 0.14135
-- Model 3 --
Training table score (RMSLE): 0.01298
Validation table score (RMSLE): 0.13474
--
Weighted training table score (RMSLE): 0.05271
Weighted validation table score (RMSLE): 0.12099
Current weights: [ 0.28935448  0.23746757  0.2252627   0.24791524]
 
Fold no. 2
-- Model 0 --
Training table score (RMSLE): 0.04975
Validation table score (RMSLE): 0.13540
-- Model 1 --
Training table score (RMSLE): 0.12417
Validation table score (RMSLE): 0.14362
-- Model 2 --
Training table score (RMSLE): 0.06410
Validation table score (RMSLE): 0.15593
-- Model 3 --
Training table score (RMSLE): 0.01173
Validation table score (RMSLE): 0.16223
--
Weighted training table score (RMSLE): 0.05824
Weighted validation tabl

Now, it's time to combine the best models - except for Gradient Boosting -, using the second averaging approach and testing against the original target variable.

In [26]:
# Combine the tuned SalePrice models without gradient boosting, using the
# 'inv_sq_sum' weighting.
#gbr_def_ = gbr_model.best_estimator_
ridge_def_, rf_def_, extra_def_ = (
    search.best_estimator_
    for search in (ridge_model, rf_model, extra_model)
)
models, weights, score = my.test_model_array(
    [ridge_def_, rf_def_, extra_def_], features, prices, kf,
    w_function='inv_sq_sum')

scores = pd.DataFrame(
    [['Combinations minus GBR', 'Inv Sq Sum', score]],
    columns=['model', 'params', 'RMSLE'])
model_scores = model_scores.append(scores)

my.fit_submit_array(test_features, test_id, models, weights,
                    '../data/submissions/combinations_minusgbr_invsqsumweights.csv')

Fold no. 1
-- Model 0 --
Training table score (RMSLE): 0.12461
Validation table score (RMSLE): 0.13767
-- Model 1 --
Training table score (RMSLE): 0.06363
Validation table score (RMSLE): 0.14135
-- Model 2 --
Training table score (RMSLE): 0.01298
Validation table score (RMSLE): 0.13474
--
Weighted training table score (RMSLE): 0.06206
Weighted validation table score (RMSLE): 0.12602
Current weights: [ 0.33415756  0.31698322  0.34885922]
 
Fold no. 2
-- Model 0 --
Training table score (RMSLE): 0.12417
Validation table score (RMSLE): 0.14362
-- Model 1 --
Training table score (RMSLE): 0.06410
Validation table score (RMSLE): 0.15593
-- Model 2 --
Training table score (RMSLE): 0.01173
Validation table score (RMSLE): 0.16223
--
Weighted training table score (RMSLE): 0.06769
Weighted validation table score (RMSLE): 0.14441
Current weights: [ 0.37991976  0.32229925  0.297781  ]
 
Fold no. 3
-- Model 0 --
Training table score (RMSLE): 0.12251
Validation table score (RMSLE): 0.15574
-- Model 1 

Now, it's time to combine the best models, using one averaging approach and testing against the log-transformed target variable.

In [27]:
# Combine the four models tuned on the log1p target; no w_function is passed,
# so my.test_model_array uses its default weighting.
gbr_log_def_, ridge_log_def_, rf_log_def_, extra_log_def_ = (
    search.best_estimator_
    for search in (gbr_log_model, ridge_log_model, rf_log_model, extra_log_model)
)
models_log, weights_log, score_log = my.test_model_array(
    [gbr_log_def_, ridge_log_def_, rf_log_def_, extra_log_def_],
    features, logSalePrice, kf, y_type='log1p')#, w_function='inv_sq_sum')

scores = pd.DataFrame(
    [['Combinations', 'Weighted average', score_log]],
    columns=['model', 'params', 'RMSLE'])
model_scores_log = model_scores_log.append(scores)

my.fit_submit_array(test_features, test_id, models_log, weights_log,
                    '../data/submissions/combinations_log_normalweights.csv',
                    price_type='log1p')

Fold no. 1
-- Model 0 --
Training table score (RMSLE): 0.00394
Validation table score (RMSLE): 0.00952
Training table score (RMSLE) for correct (exp) data: 0.05023
Validation table score (RMSLE) for correct (exp) data: 0.12422
-- Model 1 --
Training table score (RMSLE): 0.00869
Validation table score (RMSLE): 0.00989
Training table score (RMSLE) for correct (exp) data: 0.11254
Validation table score (RMSLE) for correct (exp) data: 0.12857
-- Model 2 --
Training table score (RMSLE): 0.00413
Validation table score (RMSLE): 0.01073
Training table score (RMSLE) for correct (exp) data: 0.05292
Validation table score (RMSLE) for correct (exp) data: 0.14013
-- Model 3 --
Training table score (RMSLE): 0.00457
Validation table score (RMSLE): 0.01049
Training table score (RMSLE) for correct (exp) data: 0.05852
Validation table score (RMSLE) for correct (exp) data: 0.13663
--
Weighted training table score (RMSLE): 0.00475
Weighted validation table score (RMSLE): 0.00928
Weighted training table sc

Now, it's time to combine the best models, using the second averaging approach and testing against the log-transformed target variable.

In [28]:
# Combine the four models tuned on the log1p target with the 'inv_sq_sum'
# weighting.
gbr_log_def_, ridge_log_def_, rf_log_def_, extra_log_def_ = (
    search.best_estimator_
    for search in (gbr_log_model, ridge_log_model, rf_log_model, extra_log_model)
)
models_log, weights_log, score_log = my.test_model_array(
    [gbr_log_def_, ridge_log_def_, rf_log_def_, extra_log_def_],
    features, logSalePrice, kf, y_type='log1p', w_function='inv_sq_sum')

scores = pd.DataFrame(
    [['Combinations', 'Inv Sq Sum', score_log]],
    columns=['model', 'params', 'RMSLE'])
model_scores_log = model_scores_log.append(scores)

my.fit_submit_array(test_features, test_id, models_log, weights_log,
                    '../data/submissions/combinations_log_invsqsumweights.csv',
                    price_type='log1p')

Fold no. 1
-- Model 0 --
Training table score (RMSLE): 0.00394
Validation table score (RMSLE): 0.00952
Training table score (RMSLE) for correct (exp) data: 0.05023
Validation table score (RMSLE) for correct (exp) data: 0.12422
-- Model 1 --
Training table score (RMSLE): 0.00869
Validation table score (RMSLE): 0.00989
Training table score (RMSLE) for correct (exp) data: 0.11254
Validation table score (RMSLE) for correct (exp) data: 0.12857
-- Model 2 --
Training table score (RMSLE): 0.00413
Validation table score (RMSLE): 0.01073
Training table score (RMSLE) for correct (exp) data: 0.05292
Validation table score (RMSLE) for correct (exp) data: 0.14013
-- Model 3 --
Training table score (RMSLE): 0.00457
Validation table score (RMSLE): 0.01049
Training table score (RMSLE) for correct (exp) data: 0.05852
Validation table score (RMSLE) for correct (exp) data: 0.13663
--
Weighted training table score (RMSLE): 0.00476
Weighted validation table score (RMSLE): 0.00925
Weighted training table sc

Now, it's time to combine the best models - except for the Gradient Boosting model -, using the second averaging approach and testing against the log-transformed target variable.

In [29]:
# Combine the models tuned on the log1p target without gradient boosting,
# using the 'inv_sq_sum' weighting.
#gbr_log_def_ = gbr_log_model.best_estimator_
ridge_log_def_, rf_log_def_, extra_log_def_ = (
    search.best_estimator_
    for search in (ridge_log_model, rf_log_model, extra_log_model)
)
models_log, weights_log, score_log = my.test_model_array(
    [ridge_log_def_, rf_log_def_, extra_log_def_],
    features, logSalePrice, kf, y_type='log1p', w_function='inv_sq_sum')

scores = pd.DataFrame(
    [['Combinations minus GBR', 'Inv Sq Sum', score_log]],
    columns=['model', 'params', 'RMSLE'])
model_scores_log = model_scores_log.append(scores)

my.fit_submit_array(test_features, test_id, models_log, weights_log,
                    '../data/submissions/combinations_log_minusgbr_invsqsumweights.csv',
                    price_type='log1p')

Fold no. 1
-- Model 0 --
Training table score (RMSLE): 0.00869
Validation table score (RMSLE): 0.00989
Training table score (RMSLE) for correct (exp) data: 0.11254
Validation table score (RMSLE) for correct (exp) data: 0.12857
-- Model 1 --
Training table score (RMSLE): 0.00413
Validation table score (RMSLE): 0.01073
Training table score (RMSLE) for correct (exp) data: 0.05292
Validation table score (RMSLE) for correct (exp) data: 0.14013
-- Model 2 --
Training table score (RMSLE): 0.00457
Validation table score (RMSLE): 0.01049
Training table score (RMSLE) for correct (exp) data: 0.05852
Validation table score (RMSLE) for correct (exp) data: 0.13663
--
Weighted training table score (RMSLE): 0.00556
Weighted validation table score (RMSLE): 0.00955
Weighted training table score (RMSLE) for correct (exp) data: 0.07151
Weighted validation table score (RMSLE) for correct (exp) data: 0.12464
Current weights: [ 0.36665622  0.30867241  0.32467136]
 
Fold no. 2
-- Model 0 --
Training table sco

What are the validation scores for the best models?

In [30]:
# Scores collected for the models fitted on the original SalePrice target.
model_scores

Unnamed: 0,model,params,RMSLE
0,Ridge,{'alpha': 1.0885413712586016},0.145312
0,GBR,"{'min_samples_split': 19, 'loss': 'lad', 'n_es...",0.125915
0,RandomForest,"{'min_samples_split': 4, 'n_estimators': 718, ...",0.143981
0,ExtraTrees,"{'min_samples_split': 4, 'n_estimators': 3470,...",0.146739
0,Combinations,Weighted average,0.129392
0,Combinations,Inv Sq Sum,0.128717
0,Combinations minus GBR,Inv Sq Sum,0.135025


In [31]:
# Scores collected for the models fitted on the log1p-transformed target.
model_scores_log

Unnamed: 0,model,params,RMSLE
0,Ridge,{'alpha': 0.53638180552121706},0.144199
0,GBR,"{'min_samples_split': 7, 'loss': 'lad', 'n_est...",0.123418
0,RandomForest,"{'min_samples_split': 2, 'n_estimators': 2200,...",0.142067
0,ExtraTrees,"{'min_samples_split': 4, 'n_estimators': 718, ...",0.143506
0,Combinations,Weighted average,0.126775
0,Combinations,Inv Sq Sum,0.125776
0,Combinations minus GBR,Inv Sq Sum,0.133096


In [32]:
# Write both score tables next to the submission files.
for score_table, csv_path in (
        (model_scores, '../data/submissions/model_scores.csv'),
        (model_scores_log, '../data/submissions/model_scores_log.csv')):
    score_table.to_csv(csv_path)