In [28]:
%pylab inline

from common import calc_loss
from common import cross_validation_eval

import pandas as pd

Populating the interactive namespace from numpy and matplotlib


In [29]:
# A dummy model that always predicts a price of zero.

class AllZerosLearner(object):
    """Trivial baseline: the prediction for every row is a price of zero.

    Exposes the same reset/learn/predict methods as the other baseline
    learners in this notebook, but keeps no state at all.
    """

    def reset(self):
        """Do nothing; there is no fitted state to clear."""
        return None

    def learn(self, train_set):
        """Do nothing; ``train_set`` is ignored because nothing is fitted."""
        return None

    def predict(self, test_set):
        """Return a NumPy array of zeros with ``len(test_set)`` entries."""
        n_rows = len(test_set)
        return np.zeros(n_rows)

In [30]:
# A dummy model that always predicts the average price from the training set.

class AveragePriceLearner(object):
    """Baseline that predicts the mean training cost for every row.

    ``avg_price`` holds the mean of the training set's 'cost' column; it is
    ``None`` until ``learn`` has been called (and again after ``reset``).
    """

    def __init__(self):
        # Start from the same blank state that reset() produces.
        self.reset()

    def reset(self):
        """Forget any previously learned average."""
        self.avg_price = None

    def learn(self, train_set):
        """Store the mean of ``train_set['cost']`` as the constant prediction."""
        costs = train_set['cost']
        self.avg_price = costs.mean()

    def predict(self, test_set):
        """Return an array of ``len(test_set)`` copies of the learned average."""
        n_rows = len(test_set)
        unit_vector = np.ones(n_rows)
        return unit_vector * self.avg_price

In [33]:
# Like AveragePriceLearner, but using log transform on the cost column.

class LogAveragePriceLearner(object):
    """Baseline that predicts a constant price derived from the mean log cost.

    ``learn`` averages ``log(cost + 1)`` over the training set's 'cost'
    column and maps that average back with ``exp(.) - 1``; ``predict`` then
    returns that single value for every row.

    ``predicted_price`` is ``None`` until ``learn`` has been called (and
    again after ``reset``).
    """

    def __init__(self):
        # Start from the same blank state that reset() produces.
        self.reset()

    def reset(self):
        """Forget any previously learned prediction."""
        self.predicted_price = None

    def learn(self, train_set):
        """Fit the constant prediction from ``train_set['cost']``.

        Uses ``np.log1p`` / ``np.expm1`` rather than ``np.log(x + 1)`` /
        ``np.exp(y) - 1``: they compute the same transform but stay accurate
        when the values are close to zero, where forming ``x + 1`` explicitly
        would round the small part away.
        """
        avg_log_price = np.log1p(train_set['cost']).mean()
        self.predicted_price = np.expm1(avg_log_price)

    def predict(self, test_set):
        """Return an array of ``len(test_set)`` copies of the learned price."""
        return np.ones(len(test_set)) * self.predicted_price

In [34]:
# Evaluate baselines.

# Run every baseline learner through cross_validation_eval on the training
# data and report the mean and standard deviation of the returned losses.
learners = (AllZerosLearner(), AveragePriceLearner(), LogAveragePriceLearner())
train_set = pd.read_csv('data/competition_data/train_set.csv')
for learner in learners:
    # cross_validation_eval is imported from the project's `common` module;
    # presumably it returns one loss per fold -- verify against common.py.
    losses = cross_validation_eval(learner, train_set, 'cost')
    # A single-argument print(...) call emits the same text under Python 2
    # and Python 3, unlike the Python 2-only print statement.
    print("{}: loss avg {} std {}".format(
        learner, np.mean(losses), np.std(losses)))
    # print("    losses: {}".format(losses))

<__main__.AllZerosLearner object at 0x4ad9510>: loss avg 2.34932425859 std 0.0220523255703
<__main__.AveragePriceLearner object at 0x4ad9890>: loss avg 0.94750525723 std 0.00673683498461
<__main__.LogAveragePriceLearner object at 0x4ad98d0>: loss avg 0.823169110554 std 0.0131809062251
