In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn import manifold, decomposition, linear_model, ensemble, neighbors, cross_validation
%matplotlib inline
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
import xgboost
import re
import scipy
import hyperopt


from heamy.dataset import Dataset
from heamy.estimator import Regressor, Classifier
from heamy.pipeline import ModelsPipeline



In [2]:
# Read the training and test tables from the working directory.
train_df = pd.read_csv("train_extra.csv")
test_df = pd.read_csv("test_extra.csv")

# Split the regression target off the training features; `pop` returns the
# `loss` column and removes it from the frame in one step.
y = train_df.pop('loss')

# Remove the `id` column from both frames.
train_df = train_df.drop('id', axis=1)
test_df = test_df.drop('id', axis=1)

In [5]:
# Hold out 40% of the rows for validation.  The split is seeded so that
# re-running the notebook on a fresh kernel reproduces the same partition,
# which keeps the validation scores of different runs comparable.
train_data, test_data, y_train, y_test = cross_validation.train_test_split(
    train_df, y, test_size=0.4, random_state=0)

In [6]:
# Mean of the log-transformed training targets.  The targets are centred on
# this value before training, and score() adds it back before exponentiating.
shift = np.mean(np.log(y_train))

In [7]:
# Targets are log-transformed and centred on `shift` (the training-set mean of
# the log targets).  Re-using `shift` instead of recomputing the mean keeps
# this forward transform and the inverse applied in score() in lock-step.
dtrain = xgboost.DMatrix(train_data.values, label=np.log(y_train) - shift)
dtest = xgboost.DMatrix(test_data.values, label=np.log(y_test) - shift)

In [8]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [20]:
def score(params):
    """Hyperopt objective: train xgboost with ``params`` and return the MAE.

    The booster is fitted on the module-level ``dtrain`` and evaluated on
    ``dtest``.  Predictions are on the centred log scale, so they are mapped
    back with ``np.exp(prediction + shift)`` before being compared with
    ``y_test``.

    Parameters
    ----------
    params : dict
        One sample from the search space.  Must contain ``'n_estimators'``
        (used as the number of boosting rounds); every other entry is passed
        to ``xgboost.train``.  The dict is not modified.

    Returns
    -------
    dict
        ``{'loss': <mean absolute error>, 'status': STATUS_OK}`` as expected
        by ``hyperopt.fmin``.
    """
    # Local import: a bare `import sklearn` does not guarantee that the
    # `sklearn.metrics` submodule has been loaded.
    from sklearn.metrics import mean_absolute_error

    print("Training with params : ")
    print(params)

    # Work on a copy so the dict handed over by hyperopt is left untouched.
    xgb_params = dict(params)
    num_round = int(xgb_params.pop('n_estimators'))
    # hp.quniform yields floats (e.g. 9.0) but the tree depth must be an int.
    if 'max_depth' in xgb_params:
        xgb_params['max_depth'] = int(xgb_params['max_depth'])

    model = xgboost.train(xgb_params, dtrain, num_round)
    predictions = model.predict(dtest)

    # Undo the centring and the log transform before measuring the error.
    mae = mean_absolute_error(y_test, np.exp(predictions + shift))
    print("\tScore:  {0} \n\n".format(mae))
    return {'loss': mae, 'status': STATUS_OK}

In [32]:
def optimize(trials, max_evals=250):
    """Search the xgboost hyper-parameter space with hyperopt's TPE algorithm.

    Parameters
    ----------
    trials : hyperopt.Trials
        Container that ``fmin`` fills with the result of every evaluation.
    max_evals : int, optional
        Number of parameter settings to evaluate (default 250).

    Returns
    -------
    dict
        The best parameter assignment found by ``fmin``; it is also printed.
    """
    space = {
             # Sampled dimensions (label, low, high, step).
             'n_estimators' : hp.quniform('n_estimators', 100, 1000, 1),
             'eta' : hp.quniform('eta', 0.005, 0.5, 0.005),
             'max_depth' : hp.quniform('max_depth', 3, 12, 1),
             'subsample' : hp.quniform('subsample', 0.3, 1, 0.1),
             'gamma' : hp.quniform('gamma', 0.3, 1, 0.1),
             # Fixed settings forwarded unchanged to xgboost.
             'eval_metric': 'rmse',
             'nthread' : -1,
             'silent' : 1
             }

    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)

    print(best)
    # Hand the result back so the caller can reuse it instead of re-reading
    # it from the printed output.
    return best

In [None]:
# Fresh hyperopt Trials object; optimize() passes it to fmin() as `trials`.
trials = Trials()

# Launch the hyper-parameter search; score() prints the parameters and the
# resulting score for every evaluation.
optimize(trials)

Training with params : 
{'silent': 1, 'eval_metric': 'rmse', 'nthread': -1, 'n_estimators': 280.0, 'subsample': 0.8, 'eta': 0.39, 'max_depth': 9.0, 'gamma': 0.9}
	Score:  1263.7173908 


Training with params : 
{'silent': 1, 'eval_metric': 'rmse', 'nthread': -1, 'n_estimators': 280.0, 'subsample': 0.7000000000000001, 'eta': 0.48, 'max_depth': 10.0, 'gamma': 0.9}
	Score:  1364.47129642 


Training with params : 
{'silent': 1, 'eval_metric': 'rmse', 'nthread': -1, 'n_estimators': 589.0, 'subsample': 0.7000000000000001, 'eta': 0.3, 'max_depth': 10.0, 'gamma': 0.30000000000000004}
