In [1]:
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import featurize_and_to_numpy
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_regressor

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Regressors compared by the cross-validation cell; scores are collected per
# position in this list (reg_id), so the order here is the order of the report.
#
# random_state is pinned so the forest is reproducible across kernel restarts
# given the same training data (0 is arbitrary; any fixed integer works).
# When enabling one of the disabled forest variants, pass random_state as well.
regressors = [
    # DummyRegressor(strategy='constant', constant=0.0),
    # DummyRegressor(strategy='mean'),
    # RandomForestRegressor(n_estimators=20),
    RandomForestRegressor(n_estimators=20, max_features=0.4, random_state=0),
    # RandomForestRegressor(n_estimators=100),
]

In [3]:
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()

CPU times: user 7.53 s, sys: 136 ms, total: 7.66 s
Wall time: 7.85 s


In [4]:
featurizer = AllCategoricalsFeaturizer()

train_rmsles = []
test_rmsles = []
for reg in regressors:
    train_rmsles.append([])
    test_rmsles.append([])

for i, split in enumerate(generate_xv_splits(aug_train_set)):
    print "---------------------- split {}".format(i)
    %time split_np = featurize_and_to_numpy(featurizer, *split)

    for reg_id, reg in enumerate(regressors):
        %time train_rmsle, test_rmsle = eval_regressor(reg, *split_np)
        print "reg_id {}: train_rmsle {}; test_rmsle {}".format(reg_id, train_rmsle, test_rmsle)
        train_rmsles[reg_id].append(train_rmsle)
        test_rmsles[reg_id].append(test_rmsle)

print
print "------------------------------ averages".format(i)

for reg_id, reg in enumerate(regressors):
    print "{}:".format(reg)
    print "    train RMSLE avg {} std {}".format(np.mean(train_rmsles[reg_id]), np.std(train_rmsles[reg_id]))
    print "    train RMSLEs: {}".format(train_rmsles)
    print "    test RMSLE avg {} std {}".format(np.mean(test_rmsles[reg_id]), np.std(test_rmsles[reg_id]))
    print "    test RMSLEs: {}".format(test_rmsles)
    print

---------------------- split 0
CPU times: user 2.5 s, sys: 576 ms, total: 3.08 s
Wall time: 3.34 s
CPU times: user 24.2 s, sys: 56 ms, total: 24.3 s
Wall time: 24.6 s
reg_id 0: train_rmsle 0.0871158415041; test_rmsle 0.259160235699
---------------------- split 1
CPU times: user 2.44 s, sys: 504 ms, total: 2.95 s
Wall time: 3 s
CPU times: user 23.5 s, sys: 128 ms, total: 23.7 s
Wall time: 23.8 s
reg_id 0: train_rmsle 0.0865913981979; test_rmsle 0.240741781859
---------------------- split 2
CPU times: user 2.42 s, sys: 468 ms, total: 2.89 s
Wall time: 2.93 s
CPU times: user 24.5 s, sys: 116 ms, total: 24.6 s
Wall time: 25.3 s
reg_id 0: train_rmsle 0.087197583272; test_rmsle 0.255161897852
---------------------- split 3
CPU times: user 2.38 s, sys: 476 ms, total: 2.86 s
Wall time: 2.88 s
CPU times: user 24.1 s, sys: 112 ms, total: 24.2 s
Wall time: 24.4 s
reg_id 0: train_rmsle 0.0878039033572; test_rmsle 0.226122173927
---------------------- split 4
CPU times: user 2.43 s, sys: 496 ms, to

In [26]:
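# Output of the same cross-validation loop as above, run with n_estimators=100.
# The code that produced this output is not preserved in this cell, so the
# other regressor settings (e.g. max_features) are not recorded here.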

---------------------- split 0
CPU times: user 1.45 s, sys: 220 ms, total: 1.67 s
Wall time: 1.7 s
CPU times: user 3min 22s, sys: 156 ms, total: 3min 22s
Wall time: 3min 23s
reg_id 0: train_rmsle 0.0832996589187; test_rmsle 0.29448884641
---------------------- split 1
CPU times: user 1.44 s, sys: 388 ms, total: 1.83 s
Wall time: 1.85 s
CPU times: user 4min 3s, sys: 180 ms, total: 4min 3s
Wall time: 4min 6s
reg_id 0: train_rmsle 0.083701706416; test_rmsle 0.257979085022
---------------------- split 2
CPU times: user 1.44 s, sys: 324 ms, total: 1.77 s
Wall time: 1.79 s
CPU times: user 3min 25s, sys: 204 ms, total: 3min 25s
Wall time: 3min 28s
reg_id 0: train_rmsle 0.0836523980288; test_rmsle 0.267516413845
---------------------- split 3
CPU times: user 1.46 s, sys: 412 ms, total: 1.87 s
Wall time: 1.9 s
CPU times: user 4min, sys: 96 ms, total: 4min
Wall time: 4min 3s
reg_id 0: train_rmsle 0.0842293078426; test_rmsle 0.275240917052
---------------------- split 4
CPU times: user 1.4 s, sys