In [14]:
%pylab inline

from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from soln.dataset import AllCategoricalsFeaturizer
from soln.dataset import generate_xv_splits
from soln.dataset import get_augmented_train_and_test_set
from soln.utils import eval_regressor

Populating the interactive namespace from numpy and matplotlib


In [21]:
# Models compared in the cross-validation cell; a model's position in this
# list is the `reg_id` used when its scores are reported.
regressors = [
    # Baseline: always predict the constant 0.0.
    DummyRegressor(strategy='constant', constant=0.0),
    # Baseline: always predict the mean of the training targets.
    DummyRegressor(strategy='mean'),
    # Fixed random_state so the reported scores are reproducible across runs.
    RandomForestRegressor(n_estimators=20, random_state=0),
    # Larger forest, disabled here; uncomment to include it in the comparison.
    # RandomForestRegressor(n_estimators=100, random_state=0),
]

In [10]:
# Build the augmented train and test sets via
# get_augmented_train_and_test_set(), reporting how long the call takes.
# `aug_train_set` feeds the cross-validation splits below.
# NOTE(review): this cell's execution count (10) is lower than that of the
# cell above it (21), so the notebook was run out of order -- confirm it
# still works under Restart Kernel -> Run All.
%time aug_train_set, aug_test_set = get_augmented_train_and_test_set()

CPU times: user 6.53 s, sys: 12 ms, total: 6.54 s
Wall time: 6.58 s


In [22]:
featurizer = AllCategoricalsFeaturizer()

train_rmsles = []
test_rmsles = []
for reg in regressors:
    train_rmsles.append([])
    test_rmsles.append([])

for i, split in enumerate(generate_xv_splits(aug_train_set)):
    print "---------------------- split {}".format(i)
    %time split_np = featurize_and_to_numpy(featurizer, *split)

    for reg_id, reg in enumerate(regressors):
        %time train_rmsle, test_rmsle = eval_regressor(reg, *split_np)
        print "reg_id {}: train_rmsle {}; test_rmsle {}".format(reg_id, train_rmsle, test_rmsle)
        train_rmsles[reg_id].append(train_rmsle)
        test_rmsles[reg_id].append(test_rmsle)

print
print "------------------------------ averages".format(i)

for reg_id, reg in enumerate(regressors):
    print "{}:".format(reg)
    print "    train RMSLE avg {} std {}".format(np.mean(train_rmsles[reg_id]), np.std(train_rmsles[reg_id]))
    print "    train RMSLEs: {}".format(train_rmsles)
    print "    test RMSLE avg {} std {}".format(np.mean(test_rmsles[reg_id]), np.std(test_rmsles[reg_id]))
    print "    test RMSLEs: {}".format(test_rmsles)
    print

---------------------- split 0
CPU times: user 1.45 s, sys: 360 ms, total: 1.81 s
Wall time: 1.83 s
CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 38.3 ms
reg_id 0: train_rmsle 2.35046007432; test_rmsle 2.34246970634
CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 36.2 ms
reg_id 1: train_rmsle 0.822919215283; test_rmsle 0.825379493313
CPU times: user 40.3 s, sys: 20 ms, total: 40.3 s
Wall time: 40.6 s
reg_id 2: train_rmsle 0.0903853458814; test_rmsle 0.298694446024
---------------------- split 1
CPU times: user 1.43 s, sys: 404 ms, total: 1.84 s
Wall time: 1.85 s
CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 39.8 ms
reg_id 0: train_rmsle 2.35408179187; test_rmsle 2.31702226148
CPU times: user 36 ms, sys: 0 ns, total: 36 ms
Wall time: 35.7 ms
reg_id 1: train_rmsle 0.833758711427; test_rmsle 0.746476334169
CPU times: user 42.7 s, sys: 76 ms, total: 42.8 s
Wall time: 43.2 s
reg_id 2: train_rmsle 0.0930124988764; test_rmsle 0.260040993738
------------------