In [1]:
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from xgboost import XGBClassifier
import cPickle as pickle
np.random.seed = 0
seed = 0

In [2]:
train = np.load('../../nn/data/train_scaled.npy')
test = np.load('../../nn/data/test_scaled.npy')

In [3]:
labels = train[:,0]
train = train[:,1:]

In [None]:
def gini(actual, pred):
    assert (len(actual) == len(pred))
    all = np.asarray(np.c_[actual, pred, np.arange(len(actual))], dtype=np.float)
    all = all[np.lexsort((all[:, 2], -1 * all[:, 1]))]
    totalLosses = all[:, 0].sum()
    giniSum = all[:, 0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)


def gini_normalized(actual, pred):
    return gini(actual, pred) / gini(actual, actual)

#gini_scorer = make_scorer(gini_normalized, greater_is_better = True)

In [None]:
#import sys
#sys.path.append('../py')
#import xgboost_optimizer

In [4]:
# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation

def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini



# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]


In [5]:
MAX_ROUNDS = 400
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50  

params = {
    'n_estimators' : MAX_ROUNDS,
    'objective' : "binary:logistic",
    'learning_rate' : LEARNING_RATE, 
    'scale_pos_weight' : 1.6,
    'subsample' : .8,
    'colsample_bytree' : .8,
    'min_child_weight' : 6,
    'max_depth' : 4,
    'gamma' : 10,
    'reg_alpha' : 8,
    'reg_lambda' : 1.3,
    'n_jobs' : 4,
    'eval_metric' : 'auc'
}
model = XGBClassifier(**params)

In [6]:
test_pred = np.zeros(test.shape[0])
K = 5
kfold = StratifiedKFold(n_splits=K, shuffle=True, random_state=seed)
for i, (train_indices, valid_indices) in enumerate(kfold.split(train,labels.reshape(-1))) :
    
    x_train, y_train = train[train_indices], labels[train_indices]
    x_valid, y_valid = train[valid_indices], labels[valid_indices]
    
    eval_set = [(x_valid, y_valid)]
    st = np.datetime64('now')
    print "\nFold ", i 
    _ = model.fit(x_train, y_train, eval_set=eval_set, eval_metric=gini_xgb, 
              early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)
    #_ = model.fit(x_train, y_train)
    
    print( "  Best N trees = ", model.best_ntree_limit )
    print( "  Best gini = ", model.best_score )
    
    pred = model.predict_proba(x_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    print "It took : ", np.datetime64('now') - st
    
    test_pred += model.predict_proba(test)[:,1]
test_pred /= K


Fold  0
('  Best N trees = ', 246)
('  Best gini = ', -0.284295)
('  Gini = ', 0.28385665619858058)
It took :  469 seconds

Fold  1
('  Best N trees = ', 272)
('  Best gini = ', -0.283461)
('  Gini = ', 0.28293167449632617)
It took :  488 seconds

Fold  2
('  Best N trees = ', 400)
('  Best gini = ', -0.284466)
('  Gini = ', 0.28446646049829594)
It took :  601 seconds

Fold  3
('  Best N trees = ', 326)
('  Best gini = ', -0.280621)
('  Gini = ', 0.28047873418360059)
It took :  565 seconds

Fold  4
('  Best N trees = ', 344)
('  Best gini = ', -0.282837)
('  Gini = ', 0.2822144268377248)
It took :  599 seconds


In [None]:
temp = test_pred

In [7]:
ids = np.load('../../nn/data/test_ids.npy')

In [8]:
with open('../data/submission.csv', 'wb') as f :
	f.write('id,target')
	for i,p in zip(ids,test_pred) :
		f.write('\n%d,%.4f'%(i,p))

In [9]:
with open('../data/model.pkl','wb') as f :
    pickle.dump(model, f)

In [None]:
temp[:10]

In [None]:
train_pred = model.predict_proba(train)[:,1]

In [None]:
eval_gini(labels, train_pred)

In [None]:
temp2 = np.zeros(temp.shape[0])
for i in range(temp.shape[0]) :
    if temp[i] >= 0.5 : temp2[i] = 1


In [None]:
sum([1 for i in test_pred if i >= 0.5])

In [None]:
test = np.load('../../nn/data/test_scaled_del.npy')

In [None]:
p = model.predict_proba(test)[:,1]

In [None]:
eval_gini(labels[:10000], p)

In [None]:
with open('../data/model.pkl', 'rb') as f:
    model = pickle.load(f)

In [None]:
test[0]

In [None]:
train[:10000][0]