In [2]:
import pandas as pd
import xgboost as xgb
import numpy as np

In [3]:
df = pd.read_csv('../../data/train.csv',engine='python')
df = df.drop(columns=['ID_code'])

In [4]:
X = df.values[:,1:201]
y = df.values[:,0]
# rescale weight to make it same as test set
weight = X[:,199] * float(X.shape[0]) / len(y)

In [5]:
sum_wpos = sum( weight[i] for i in range(len(y)) if y[i] == 1.0  )
sum_wneg = sum( weight[i] for i in range(len(y)) if y[i] == 0.0 )

In [6]:
# print weight statistics
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))

weight statistics: wpos=-50893, wneg=-614414, ratio=12.0727


Specify sufficient boosting iterations to reach a minimum

In [7]:
sum_wpos = sum( weight[i] for i in range(len(y)) if y[i] == 1.0  )
sum_wneg = sum( weight[i] for i in range(len(y)) if y[i] == 0.0 )

In [8]:
num_round = 5000

Leave most parameters as default

In [9]:
param = {'tree_method': 'gpu_hist', # Use GPU accelerated algorithm
         'objective': 'binary:logitraw', # Specify multiclass classification       
         'eta':0.01,
         'gamma':0.1,
         'max_depth':10,
         'min_child_weight':100,
         'subsample':0.8,
         'colsample_bytree':0.8,
         'scale_pos_weight':sum_wneg/sum_wpos,#Control the balance of positive and negative weights, useful for unbalanced classes.
         #'lambda':0,
         #'alpha':0
         'eval_metric':'auc',
         'verbosity':3
         }

Convert input data from numpy to XGBoost format

In [10]:
dtrain = xgb.DMatrix(X, label=y)

In [11]:
print('running cross validation, with preprocessing function')
# define the preprocessing function
# used to return the preprocessed training, test data, and parameter
# we can use this to do weight rescale, etc.
# as a example, we try to set scale_pos_weight
def fpreproc(dtrain, dtest, param):
    label = dtrain.get_label()
    ratio = float(np.sum(label == 0)) / np.sum(label == 1)
    param['scale_pos_weight'] = ratio
    return (dtrain, dtest, param)

# do cross validation, for each fold
# the dtrain, dtest, param will be passed into fpreproc
# then the return value of fpreproc will be used to generate
# results of that fold
cv = xgb.cv(param, dtrain, num_round, nfold=10,
       metrics={'auc'}, seed=0, fpreproc=fpreproc)



running cross validation, with preprocessing function


KeyboardInterrupt: 

In [None]:
cv.save_model('../../models/modelXGB5000_3.model')
#joblib.dump(modelXGB, '../../models/modelXGB5000_1.pkl', compress=9)

In [None]:
df = pd.read_csv('../../data/test2.csv')
#df = df.drop(columns=['ID_code'])
data   = df.values[:,1:202]
idx = df.values[:,0]

In [None]:
dtest = xgb.DMatrix(data)

In [None]:
ypred = cv.predict( dtest )

In [None]:
np.savetxt("../../data/submit10.csv", ypred, delimiter=",")