In [15]:
%pylab inline

import pandas as pd
from sklearn.cross_validation import KFold

Populating the interactive namespace from numpy and matplotlib


In [6]:
# A dummy model that always predicts a price of zero.

class AllZerosLearner(object):
    """Trivial baseline whose prediction is a price of zero for every row."""

    def learn(self, train_set):
        """Accept the training data and ignore it; there is nothing to fit."""
        return None

    def predict(self, test_set):
        """Return a float array holding one 0.0 per entry of ``test_set``."""
        num_rows = len(test_set)
        return np.zeros(num_rows)

In [7]:
# A dummy model that always predicts the average price from the training set.

class AveragePriceLearner(object):
    """Baseline model that predicts the mean training label for every row.

    Args:
        label_col: Name of the column of the training set whose mean is
            used as the prediction. Defaults to 'cost', so constructing the
            learner with no arguments behaves as before.
    """

    def __init__(self, label_col='cost'):
        self.label_col = label_col
        # Mean of the label column; stays None until learn() has been called.
        self.avg_price = None

    def learn(self, train_set):
        """Store the mean of ``train_set[self.label_col]`` for later use."""
        self.avg_price = train_set[self.label_col].mean()

    def predict(self, test_set):
        """Return an array with ``len(test_set)`` copies of the learned mean."""
        return np.ones(len(test_set)) * self.avg_price

In [10]:
# The loss function for this problem. Lower is better; zero is perfect.

def calc_loss(true_prices, predicted_prices):
    """Root mean squared logarithmic error between two price sequences.

    Computes sqrt(mean((log(predicted + 1) - log(true + 1)) ** 2)).

    Args:
        true_prices: Sequence of actual prices (list, numpy array or
            pandas Series).
        predicted_prices: Sequence of predicted prices with the same length.

    Returns:
        The loss as a float; 0.0 when the predictions match exactly.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(true_prices) == len(predicted_prices)
    # Work on plain float arrays so that lists are accepted and so that two
    # pandas Series are compared by position rather than by index label.
    true_arr = np.asarray(true_prices, dtype=float)
    pred_arr = np.asarray(predicted_prices, dtype=float)
    # log1p(x) is log(1 + x) computed accurately for small x.
    square_log_errs = (np.log1p(pred_arr) - np.log1p(true_arr)) ** 2
    return np.sqrt(np.mean(square_log_errs))

In [17]:
def cross_validation_eval(learner_cls, train_set, label_col, folds=10, seed=None):
    """
    Evaluate the given model by cross validation.

    The rows of `train_set` are shuffled once and split into `folds` folds.
    For every fold a fresh `learner_cls()` is trained on the remaining rows
    (label column included) and asked to predict the held-out rows, which
    are passed to it with the label column removed.

    Args:
        learner_cls: Class with a no-argument constructor whose instances
            provide `learn(train_subset)` and `predict(test_subset)`.
        train_set: DataFrame holding the features and the label column.
        label_col: Name of the column with the true values.
        folds: Number of cross-validation folds.
        seed: Optional seed for the shuffle. When None (the default) the
            global numpy random state is used, as before.

    Returns the loss value for each fold, as a numpy array.
    """
    n_rows = len(train_set)
    if seed is None:
        perm = np.random.permutation(n_rows)
    else:
        # A private RandomState keeps the shuffle repeatable without
        # touching the global numpy random state.
        perm = np.random.RandomState(seed).permutation(n_rows)
    shuffled_train_set = train_set.iloc[perm]
    losses = []
    kf = KFold(n_rows, n_folds=folds)
    for train_is, test_is in kf:
        train_subset = shuffled_train_set.iloc[train_is]
        test_subset = shuffled_train_set.iloc[test_is]
        true_vals = test_subset[label_col]
        # Hide the label from the model at prediction time. The axis is
        # passed by keyword because newer pandas rejects a positional axis.
        test_subset = test_subset.drop(label_col, axis=1)
        model = learner_cls()
        model.learn(train_subset)
        pred_vals = model.predict(test_subset)
        losses.append(calc_loss(true_vals, pred_vals))
    return np.array(losses)

In [21]:
# Evaluate baselines.

# Seed for the cross-validation shuffle, so that reruns give the same numbers.
CV_SEED = 0

learner_classes = (AllZerosLearner, AveragePriceLearner)
train_set = pd.read_csv('data/competition_data/train_set.csv')
for learner_cls in learner_classes:
    # Re-seed before every run so each learner is scored on the same folds.
    np.random.seed(CV_SEED)
    losses = cross_validation_eval(learner_cls, train_set, 'cost')
    # Single-argument print(...) gives the same output on Python 2 and 3.
    print("{}: loss avg {} std {}".format(
        learner_cls.__name__, np.mean(losses), np.std(losses)))

AllZerosLearner: loss avg 2.34938711252 std 0.014116188699
AveragePriceLearner: loss avg 0.947495320102 std 0.0073931384397
