In [2]:
import pandas as pd
import numpy as np 
from sklearn import preprocessing
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer

In [None]:
def Gini(y_pred, dtrain):
    """Custom xgboost evaluation metric: the negated normalized Gini coefficient.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Predictions, one per row of ``dtrain``.
    dtrain : object with a ``get_label()`` method
        Data container whose ``get_label()`` returns the true targets as an
        array with the same shape as ``y_pred``.

    Returns
    -------
    tuple
        ``('Gini', value)`` with ``value = -G_pred / G_true``. A ranking that
        matches the true ordering gives ``-1.0`` and a fully inverted ranking
        gives ``+1.0``. The sign is flipped presumably so that a minimizing
        early-stopping rule favours a larger Gini -- confirm against the
        xgboost version in use.

    Raises
    ------
    AssertionError
        If labels and predictions differ in shape.
    """
    y_true = dtrain.get_label()
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]

    # Column 0 holds the true values, column 1 the predictions. Read the true
    # values once in descending order of themselves (the ideal ranking) and
    # once in descending order of the predictions (the model's ranking).
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]

    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    # Line of equality: the i-th of n samples sits at i/n, the same grid the
    # cumulative sums above live on. Starting the line at 0 instead shifts it
    # against the curves and breaks the symmetry of the normalized score.
    L_ones = np.arange(1, n_samples + 1, dtype=float) / n_samples

    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)

    # normalize to true Gini coefficient
    return 'Gini', -G_pred/G_true

In [3]:
def _fit_and_predict(plst, train, labels, xgtest, offset, num_rounds, early_stopping_rounds):
    """Train one early-stopped booster and return its predictions for ``xgtest``.

    The first ``offset`` rows of ``train``/``labels`` form the validation set
    that is watched during training; the remaining rows are used for fitting.
    """
    #create a train and validation dmatrices
    xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])

    #train using early stopping and predict
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, feval=Gini,
                      early_stopping_rounds=early_stopping_rounds)
    # NOTE(review): if best_iteration is a 0-based round index, passing it as
    # ntree_limit leaves out the best round's own tree -- confirm against the
    # installed xgboost version before changing it.
    return model.predict(xgtest,ntree_limit=model.best_iteration)


def xgboost_pred(train,labels,test, offset=4000, num_rounds=10000, early_stopping_rounds=120):
    """Blend the test predictions of two early-stopped xgboost regressors.

    The first model is fitted on the data as given, holding out the first
    ``offset`` rows for early stopping. The second is fitted on the rows in
    reversed order with log-transformed labels, so it holds out what were the
    last ``offset`` rows.

    Parameters
    ----------
    train : 2-D matrix supporting ``[a:b, :]`` and ``[::-1, :]`` slicing
        Training features.
    labels : 1-D sequence supporting slicing and ``[::-1]``
        Training targets, row-aligned with ``train``. Must be positive, since
        the second model takes their logarithm.
    test : matrix accepted by ``xgb.DMatrix``
        Features to predict for.
    offset : int, optional
        Number of leading rows held out as the early-stopping validation set.
    num_rounds : int, optional
        Upper bound on boosting rounds.
    early_stopping_rounds : int, optional
        Patience passed to ``xgb.train``.

    Returns
    -------
    Weighted sum ``1.4 * preds1 + 8.6 * preds2`` of the two prediction vectors.

    Raises
    ------
    ValueError
        If ``offset`` would leave the training or validation split empty.
    """
    if not 0 < offset < train.shape[0]:
        raise ValueError("offset must be between 1 and the number of training rows - 1")

    params = {
        "objective": "reg:linear",
        "eta": 0.001,
        "min_child_weight": 50,
        "subsample": 0.33,
        "colsample_bytree": 0.7,
        "scale_pos_weight": 1,
        "silent": 1,
        "max_depth": 9,
    }
    plst = list(params.items())

    xgtest = xgb.DMatrix(test)

    # first model: original row order, raw labels
    preds1 = _fit_and_predict(plst, train, labels, xgtest,
                              offset, num_rounds, early_stopping_rounds)

    #reverse train and labels so a different set of rows is used for early stopping,
    #and fit this second model on the log of the labels.
    # this adds very little to the score but it is an option if you are concerned about using all the data.
    train = train[::-1,:]
    labels = np.log(labels[::-1])
    preds2 = _fit_and_predict(plst, train, labels, xgtest,
                              offset, num_rounds, early_stopping_rounds)

    #combine predictions
    #since the metric only cares about relative rank we don't need to average
    # preds2 comes from the model fitted on log labels; presumably its larger
    # weight offsets the smaller log-scale magnitude -- TODO confirm.
    preds = (preds1)*1.4 + (preds2)*8.6
    return preds

In [None]:
# Read the raw data; the first CSV column becomes the index
train = pd.read_csv('./train.csv', index_col=0)
test = pd.read_csv('./test.csv', index_col=0)

# Keep the target aside, then strip it and the unused features from the frames
labels = train.Hazard
unused_features = ['T2_V10', 'T2_V7', 'T1_V13', 'T1_V10']
train = train.drop(['Hazard'] + unused_features, axis=1)
test = test.drop(unused_features, axis=1)

# Remember the remaining feature names and the test ids
columns = train.columns
test_ind = test.index

# Work on array copies so the encoding below leaves the frames untouched
train_s = np.array(train)
test_s = np.array(test)

# Integer-encode every column, fitting each encoder on the train and test
# values together so both are mapped with the same codes
for i in range(train_s.shape[1]):
    lbl = preprocessing.LabelEncoder().fit(list(train_s[:, i]) + list(test_s[:, i]))
    train_s[:, i] = lbl.transform(train_s[:, i])
    test_s[:, i] = lbl.transform(test_s[:, i])

train_s, test_s = train_s.astype(float), test_s.astype(float)

In [4]:
# model 1: trained on the label-encoded arrays
preds1 = xgboost_pred(train_s,labels,test_s)

#model_2 building

# One dict per row, in row order, so the vectorized rows stay aligned with
# `labels`. orient='records' does not depend on dict iteration order or on
# the index being unique, and avoids transposing the whole frame.
# Note: `train` and `test` are rebound to the vectorized matrices below, so
# the loading cell must be run again before this cell can be re-run.
train = train.to_dict(orient='records')
test = test.to_dict(orient='records')

vec = DictVectorizer()
train = vec.fit_transform(train)
test = vec.transform(test)

preds2 = xgboost_pred(train,labels,test)

# Blend the two models after a power transform of each.
# NOTE(review): a fractional power of a negative prediction is NaN -- confirm
# both prediction vectors are non-negative.
preds = 0.47 * (preds1**0.2) + 0.53 * (preds2**0.8)

#generate solution
preds = pd.DataFrame({"Id": test_ind, "Hazard": preds})
preds = preds.set_index('Id')
preds.to_csv('xgboost_benchmark_3.csv')

Will train until val error hasn't decreased in 120 rounds.
[0]	train-rmse:5.337100	val-rmse:5.284432
[1]	train-rmse:5.323635	val-rmse:5.271835
[2]	train-rmse:5.310438	val-rmse:5.259444
[3]	train-rmse:5.297897	val-rmse:5.247201
[4]	train-rmse:5.284804	val-rmse:5.234735
[5]	train-rmse:5.272405	val-rmse:5.222902
[6]	train-rmse:5.259734	val-rmse:5.211071
[7]	train-rmse:5.247163	val-rmse:5.199198
[8]	train-rmse:5.234341	val-rmse:5.187296
[9]	train-rmse:5.222056	val-rmse:5.175719
[10]	train-rmse:5.209643	val-rmse:5.163861
[11]	train-rmse:5.197356	val-rmse:5.152372
[12]	train-rmse:5.185115	val-rmse:5.140528
[13]	train-rmse:5.172774	val-rmse:5.128857
[14]	train-rmse:5.160886	val-rmse:5.117499
[15]	train-rmse:5.148882	val-rmse:5.106190
[16]	train-rmse:5.136794	val-rmse:5.094933
[17]	train-rmse:5.124596	val-rmse:5.083839
[18]	train-rmse:5.113253	val-rmse:5.072952
[19]	train-rmse:5.101650	val-rmse:5.062056
[20]	train-rmse:5.090237	val-rmse:5.051293
[21]	train-rmse:5.078608	val-rmse:5.040294
[22]	