In [30]:
import pandas as pd
import numpy as np 

# Load the training and test sets.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Stack train on top of test; the label-encoding cell further down fits its
# encoder on this combined frame.
data = pd.concat([train, test], axis=0)

# Positional marker aligned with the rows of `data`:
# 1.0 for rows that came from `train`, 0.0 for rows that came from `test`.
train_set = np.hstack([np.ones(train.shape[0]), np.zeros(test.shape[0])])

# Columns deliberately left out of the feature list.
toDrop = ['T2_V10', 'T2_V7', 'T1_V13', 'T1_V10']

# Feature columns = every train column except the id, the target and `toDrop`.
# Built with a list comprehension instead of list(set(...)) so the column order
# follows train.csv and is identical on every run: the iteration order of a set
# of strings depends on string hashing and can change between kernel restarts,
# which would silently reorder the feature matrix handed to the model.
excluded = {'Id', 'Hazard'}.union(toDrop)
cols = [col for col in train.columns if col not in excluded]


In [32]:
from sklearn import preprocessing
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer


def _fit_and_predict(params, features, targets, xgtest, offset, num_rounds):
    """Train one early-stopped booster on a hold-out split and score ``xgtest``."""
    # Rows from `offset` onwards are used for fitting; the first `offset` rows
    # are only monitored for early stopping.
    xgtrain = xgb.DMatrix(features[offset:, :], label=targets[offset:])
    xgval = xgb.DMatrix(features[:offset, :], label=targets[:offset])

    watchlist = [(xgtrain, 'train'), (xgval, 'val')]
    model = xgb.train(params, xgtrain, num_rounds, watchlist, early_stopping_rounds=120)

    # `best_iteration` is a 0-based round index (the training log starts at
    # "[0]"), whereas `ntree_limit` is a tree count. Passing the index directly
    # drops the best round, and when the best round is 0 it passes
    # ntree_limit=0, which xgboost documents as "use all trees".
    return model.predict(xgtest, ntree_limit=model.best_iteration + 1)


def xgboost_pred(train, labels, test):
    """Blend two early-stopped xgboost regressors and return scores for ``test``.

    Two boosters are trained with the same hyper-parameters:

    1. on the rows in the given order with the raw ``labels``;
    2. on the rows in reversed order with ``np.log(labels)``, so a different
       set of ``offset`` rows is held out for early stopping.

    Parameters
    ----------
    train : 2-D feature matrix that supports ``train[a:b, :]`` and
        ``train[::-1, :]`` slicing (callers in this notebook pass a numpy
        array and a DictVectorizer output).
    labels : 1-D targets aligned with the rows of ``train``; the logarithm is
        taken for the second model, so values are assumed to be positive.
    test : 2-D feature matrix with the same columns as ``train``.

    Returns
    -------
    ``preds1 * 1.4 + preds2 * 8.6`` where ``preds1`` comes from the raw-target
    model and ``preds2`` from the log-target model. The result is a score for
    ranking, not a value on the scale of ``labels``.
    """
    params = {
        "objective": "reg:linear",
        "eta": 0.005,
        "min_child_weight": 6,
        "subsample": 0.7,
        "colsample_bytree": 0.7,
        "scale_pos_weight": 1,
        "silent": 1,
        "max_depth": 9,
    }

    # The first 4000 rows are held out for early stopping.
    offset = 4000
    num_rounds = 10000
    xgtest = xgb.DMatrix(test)

    # Model 1: raw targets, rows in the given order.
    preds1 = _fit_and_predict(params, train, labels, xgtest, offset, num_rounds)

    # Model 2: reverse the rows so a different 4000 are held out, and fit on
    # log targets. This adds very little to the score but it is an option if
    # you are concerned about using all the data.
    train = train[::-1, :]
    labels = np.log(labels[::-1])
    preds2 = _fit_and_predict(params, train, labels, xgtest, offset, num_rounds)

    # Combine predictions. Since the metric only cares about relative rank we
    # don't need to average (the weights are not normalised).
    preds = preds1 * 1.4 + preds2 * 8.6
    return preds

In [33]:
# Label-encode every feature column. The encoder is refitted per column on the
# combined train+test frame, so both parts share one integer coding.
lbl = preprocessing.LabelEncoder()
for col in cols:
    data[col] = lbl.fit_transform(data[col])

data = data.astype(float)

# Split the encoded frame back into its train and test parts. `train_set` is a
# positional 1/0 marker, so it is applied as a boolean mask. `.loc` replaces
# `.ix`, which was deprecated and then removed in pandas 1.0.
train_c = data.loc[train_set == 1, cols].values
test_c = data.loc[train_set == 0, cols].values

preds1 = xgboost_pred(train_c, train.Hazard, test_c)

Will train until val error hasn't decreased in 120 rounds.
[0]	train-rmse:5.337064	val-rmse:5.284439
[1]	train-rmse:5.323959	val-rmse:5.272064
[2]	train-rmse:5.310928	val-rmse:5.259427
[3]	train-rmse:5.297853	val-rmse:5.247214
[4]	train-rmse:5.284754	val-rmse:5.234892
[5]	train-rmse:5.271778	val-rmse:5.222735
[6]	train-rmse:5.258983	val-rmse:5.210885
[7]	train-rmse:5.246544	val-rmse:5.199096
[8]	train-rmse:5.233875	val-rmse:5.186996
[9]	train-rmse:5.221542	val-rmse:5.175383
[10]	train-rmse:5.209443	val-rmse:5.163763
[11]	train-rmse:5.196890	val-rmse:5.151928
[12]	train-rmse:5.184478	val-rmse:5.140182
[13]	train-rmse:5.172633	val-rmse:5.128804
[14]	train-rmse:5.160739	val-rmse:5.117346
[15]	train-rmse:5.148759	val-rmse:5.106275
[16]	train-rmse:5.136783	val-rmse:5.095056
[17]	train-rmse:5.124830	val-rmse:5.083836
[18]	train-rmse:5.113326	val-rmse:5.073097
[19]	train-rmse:5.101777	val-rmse:5.062097
[20]	train-rmse:5.090150	val-rmse:5.051164
[21]	train-rmse:5.078759	val-rmse:5.040114
[22]	

In [34]:
# Model 2: feed the raw (un-encoded) feature columns through DictVectorizer.

# One {column: value} dict per row, in frame order. `orient='records'` returns
# a list, so the row order is guaranteed to line up with `train.Hazard`; the
# previous `.T.to_dict().values()` depended on dict iteration order and
# transposed the whole frame to get there.
train_s = train[cols].to_dict(orient='records')
test_s = test[cols].to_dict(orient='records')

# Fit the vectorizer on train only, then reuse the same mapping for test.
vec = DictVectorizer()
train_s = vec.fit_transform(train_s)
test_s = vec.transform(test_s)

preds2 = xgboost_pred(train_s, train.Hazard, test_s)

# Blend the label-encoded model (preds1) with the DictVectorizer model (preds2).
# NOTE(review): the fractional powers yield NaN for negative scores -- this
# assumes both prediction vectors are non-negative; confirm.
preds = 0.47 * (preds1**0.2) + 0.53 * (preds2**0.8)

# Build the solution frame, indexed by Id.
preds = pd.DataFrame({"Id": test.Id, "Hazard": preds})
preds = preds.set_index('Id')

Will train until val error hasn't decreased in 120 rounds.
[0]	train-rmse:5.336885	val-rmse:5.284271
[1]	train-rmse:5.323642	val-rmse:5.271492
[2]	train-rmse:5.310417	val-rmse:5.258550
[3]	train-rmse:5.297314	val-rmse:5.245942
[4]	train-rmse:5.284058	val-rmse:5.233475
[5]	train-rmse:5.271222	val-rmse:5.221348
[6]	train-rmse:5.258712	val-rmse:5.209463
[7]	train-rmse:5.245913	val-rmse:5.197263
[8]	train-rmse:5.233398	val-rmse:5.185091
[9]	train-rmse:5.221048	val-rmse:5.173157
[10]	train-rmse:5.208894	val-rmse:5.161625
[11]	train-rmse:5.196730	val-rmse:5.149955
[12]	train-rmse:5.184256	val-rmse:5.137962
[13]	train-rmse:5.172167	val-rmse:5.126470
[14]	train-rmse:5.160299	val-rmse:5.115043
[15]	train-rmse:5.148313	val-rmse:5.103481
[16]	train-rmse:5.136570	val-rmse:5.092273
[17]	train-rmse:5.124834	val-rmse:5.081026
[18]	train-rmse:5.113006	val-rmse:5.069832
[19]	train-rmse:5.101223	val-rmse:5.058745
[20]	train-rmse:5.089541	val-rmse:5.047812
[21]	train-rmse:5.078405	val-rmse:5.037177
[22]	