In [1]:
import pandas as pd
import numpy as np 
from sklearn import preprocessing
import xgboost as xgb
from sklearn.feature_extraction import DictVectorizer

In [2]:
def Gini(y_pred, dtrain):
    y_true = dtrain.get_label()
    # check and get number of samples
    assert y_true.shape == y_pred.shape
    n_samples = y_true.shape[0]
    
    # sort rows on prediction column 
    # (from largest to smallest)
    arr = np.array([y_true, y_pred]).transpose()
    true_order = arr[arr[:,0].argsort()][::-1,0]
    pred_order = arr[arr[:,1].argsort()][::-1,0]
    
    # get Lorenz curves
    L_true = np.cumsum(true_order) / np.sum(true_order)
    L_pred = np.cumsum(pred_order) / np.sum(pred_order)
    L_ones = np.linspace(0, 1, n_samples)
    
    # get Gini coefficients (area between curves)
    G_true = np.sum(L_ones - L_true)
    G_pred = np.sum(L_ones - L_pred)
    
    # normalize to true Gini coefficient
    return 'Gini', -G_pred/G_true

In [6]:
def xgboost_pred(train,labels,test):
    params = {}
    params["objective"] = "reg:linear"
    params["eta"] = 0.005
    params["min_child_weight"] = 5
    params["subsample"] = 0.6
    params["colsample_bytree"] = 0.7
    params["scale_pos_weight"] = 1
    params["silent"] = 1
    params["max_depth"] = 9
    plst = list(params.items())
    
    #Using 5000 rows for early stopping. 
    offset = 4000
    num_rounds = 10000
    xgtest = xgb.DMatrix(test)
    
    #create a train and validation dmatrices 
    xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])
    
    #train using early stopping and predict
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, feval=Gini, early_stopping_rounds=120)
    preds1 = model.predict(xgtest,ntree_limit=model.best_iteration)
    
    #reverse train and labels and use different 5k for early stopping. 
    # this adds very little to the score but it is an option if you are concerned about using all the data. 
    train = train[::-1,:]
    labels = np.log(labels[::-1])
    
    xgtrain = xgb.DMatrix(train[offset:,:], label=labels[offset:])
    xgval = xgb.DMatrix(train[:offset,:], label=labels[:offset])
    
    watchlist = [(xgtrain, 'train'),(xgval, 'val')]
    model = xgb.train(plst, xgtrain, num_rounds, watchlist, feval=Gini, early_stopping_rounds=120)
    preds2 = model.predict(xgtest,ntree_limit=model.best_iteration)
    
    #combine predictions
    #since the metric only cares about relative rank we don't need to average
    preds = (preds1)*1.4 + (preds2)*8.6
    return preds

In [4]:
#load train and test 
train = pd.read_csv('./train.csv', index_col=0)
test  = pd.read_csv('./test.csv', index_col=0)


labels = train.Hazard
train.drop('Hazard', axis=1, inplace=True)

train_s = train
test_s = test


train_s.drop('T2_V10', axis=1, inplace=True)
train_s.drop('T2_V7',  axis=1, inplace=True)
train_s.drop('T1_V13', axis=1, inplace=True)
train_s.drop('T1_V10', axis=1, inplace=True)

test_s.drop('T2_V10', axis=1, inplace=True)
test_s.drop('T2_V7',  axis=1, inplace=True)
test_s.drop('T1_V13', axis=1, inplace=True)
test_s.drop('T1_V10', axis=1, inplace=True)

columns = train.columns
test_ind = test.index

train_s = np.array(train_s)
test_s = np.array(test_s)

# label encode the categorical variables
for i in range(train_s.shape[1]):
    lbl = preprocessing.LabelEncoder()
    lbl.fit(list(train_s[:,i]) + list(test_s[:,i]))
    train_s[:,i] = lbl.transform(train_s[:,i])
    test_s[:,i] = lbl.transform(test_s[:,i])

train_s = train_s.astype(float)
test_s = test_s.astype(float)

In [None]:
preds1 = xgboost_pred(train_s,labels,test_s)

#model_2 building

train = train.T.to_dict().values()
test = test.T.to_dict().values()

vec = DictVectorizer()
train = vec.fit_transform(train)
test = vec.transform(test)

preds2 = xgboost_pred(train,labels,test)

Will train until val error hasn't decreased in 120 rounds.
[0]	train-Gini:-0.327303	val-Gini:-0.232757
[1]	train-Gini:-0.379422	val-Gini:-0.285498
[2]	train-Gini:-0.399341	val-Gini:-0.303596
[3]	train-Gini:-0.408209	val-Gini:-0.313233
[4]	train-Gini:-0.412908	val-Gini:-0.310595
[5]	train-Gini:-0.416414	val-Gini:-0.315564
[6]	train-Gini:-0.419452	val-Gini:-0.314876
[7]	train-Gini:-0.420400	val-Gini:-0.320195
[8]	train-Gini:-0.422144	val-Gini:-0.320789
[9]	train-Gini:-0.425029	val-Gini:-0.320061
[10]	train-Gini:-0.424724	val-Gini:-0.321595
[11]	train-Gini:-0.426171	val-Gini:-0.323486
[12]	train-Gini:-0.425086	val-Gini:-0.322965
[13]	train-Gini:-0.425662	val-Gini:-0.323727
[14]	train-Gini:-0.425648	val-Gini:-0.325262
[15]	train-Gini:-0.425756	val-Gini:-0.325435
[16]	train-Gini:-0.425988	val-Gini:-0.326691
[17]	train-Gini:-0.426454	val-Gini:-0.326346
[18]	train-Gini:-0.426829	val-Gini:-0.326804
[19]	train-Gini:-0.427205	val-Gini:-0.327128
[20]	train-Gini:-0.430091	val-Gini:-0.329668
[21]	t

In [None]:
for p in np.linspace(.2,.8,4):
    preds = p * (preds1**0.2) + (1-p) * (preds2**0.8)
    preds = pd.DataFrame({"Id": test_ind, "Hazard": preds})
    preds = preds.set_index('Id')
    preds.to_csv('sub'+str(p)+'.csv')