## Machine Learning

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
import gc




In [2]:
print('loading files...')
train = pd.read_csv('../../data/train_prepared.csv', na_values=-1) 
test = pd.read_csv('../../data/test.csv', na_values=-1)

print('files loaded...')


loading files...
files loaded...


In [3]:
test.drop(['ps_car_03_cat', 'ps_car_05_cat'], inplace=True, axis=1)

col_to_drop = train.columns[train.columns.str.startswith('ps_calc_')]
train = train.drop(col_to_drop, axis=1)  
test = test.drop(col_to_drop, axis=1)  

for c in train.select_dtypes(include=['float64']).columns:
    train[c]=train[c].astype(np.float32)
    test[c]=test[c].astype(np.float32)
for c in train.select_dtypes(include=['int64']).columns[2:]:
    train[c]=train[c].astype(np.int8)
    test[c]=test[c].astype(np.int8)    

print(train.shape, test.shape)

(595212, 37) (892816, 36)


In [4]:
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses

    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', gini_score)]

In [5]:
# Set xgboost parameters
params = {}
params['eta'] = 0.02
params['max_depth'] = 4
params['subsample'] = 0.9
params['colsample_bytree'] = 0.9
params['silent'] = True
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'auc'
params['silent'] = True
#'maximize':True

In [6]:
X = train.drop(['id', 'target'], axis=1)
features = X.columns
X = X.values
y = train['target'].values
sub=test['id'].to_frame()
sub['target']=0

nrounds=2000  # need to change to 2000
kfold = 5  # need to change to 5
skf = StratifiedKFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' xgb kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    d_train = xgb.DMatrix(X_train, y_train) 
    d_valid = xgb.DMatrix(X_valid, y_valid) 
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]
    
    xgb_model = xgb.train(params, d_train, nrounds, watchlist, early_stopping_rounds=100, 
                          feval=gini_xgb, maximize=True, verbose_eval=100)
    
    sub['target'] += xgb_model.predict(xgb.DMatrix(test[features].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (2*kfold)
    
sub.to_csv('submission_6.csv', index=False, float_format='%.5f')
gc.collect()
sub.head(2)


 xgb kfold: 1  of  5 : 
[0]	train-gini:0.188938	valid-gini:0.186657
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-gini:0.245822	valid-gini:0.240075
[200]	train-gini:0.27332	valid-gini:0.259674
[300]	train-gini:0.295718	valid-gini:0.270722
[400]	train-gini:0.310408	valid-gini:0.276855
[500]	train-gini:0.320485	valid-gini:0.279458
[600]	train-gini:0.328776	valid-gini:0.281145
[700]	train-gini:0.335922	valid-gini:0.281638
[800]	train-gini:0.342056	valid-gini:0.281886
[900]	train-gini:0.348515	valid-gini:0.282356
[1000]	train-gini:0.354208	valid-gini:0.282383
[1100]	train-gini:0.35993	valid-gini:0.282117
Stopping. Best iteration:
[1020]	train-gini:0.35561	valid-gini:0.282691

 xgb kfold: 2  of  5 : 
[0]	train-gini:0.188435	valid-gini:0.175613
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 r

Unnamed: 0,id,target
0,0,0.01401
1,1,0.013569


In [7]:
xgb_model.best_score #before 0.276938

0.275938