In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

In [2]:
df = pd.read_csv('../../data/train.csv',engine='python')
df = df.drop(columns=['ID_code'])

In [3]:
X = df.values[:,1:201]
y = df.values[:,0]
# rescale weight to make it same as test set
weight = X[:,199] * float(X.shape[0]) / len(y)

In [4]:
sum_wpos = sum( weight[i] for i in range(len(y)) if y[i] == 1.0  )
sum_wneg = sum( weight[i] for i in range(len(y)) if y[i] == 0.0 )

In [5]:
# print weight statistics
print ('weight statistics: wpos=%g, wneg=%g, ratio=%g' % ( sum_wpos, sum_wneg, sum_wneg/sum_wpos ))

weight statistics: wpos=-50893, wneg=-614414, ratio=12.0727


Specify sufficient boosting iterations to reach a minimum

In [6]:
sum_wpos = sum( weight[i] for i in range(len(y)) if y[i] == 1.0  )
sum_wneg = sum( weight[i] for i in range(len(y)) if y[i] == 0.0 )

In [7]:
num_round = 5000

Leave most parameters as default

In [12]:
param = {'tree_method': 'gpu_hist', # Use GPU accelerated algorithm
         'objective': 'binary:logitraw', # Specify multiclass classification       
         'eta':0.01,
         'gamma':0.1,
         'max_depth':10,
         'min_child_weight':100,
         'subsample':0.8,
         'colsample_bytree':0.8,
         'scale_pos_weight':sum_wneg/sum_wpos,#Control the balance of positive and negative weights, useful for unbalanced classes.
         #'lambda':0,
         #'alpha':0
         'eval_metric':'auc',
         'verbosity':3
         }

Convert input data from numpy to XGBoost format

In [13]:
dtrain = xgb.DMatrix(X, label=y)

In [None]:
modelXGB = xgb.train(param, dtrain, num_round)

In [None]:
modelXGB.save_model('../../models/modelXGB5000_3.model')
#joblib.dump(modelXGB, '../../models/modelXGB5000_1.pkl', compress=9)

In [None]:
#modelXGB = joblib.load('../../models/modelXGB5000_1.pkl')

In [None]:
df = pd.read_csv('../../data/test2.csv')
#df = df.drop(columns=['ID_code'])
data   = df.values[:,1:202]
idx = df.values[:,0]

In [None]:
dtest = xgb.DMatrix(data)

In [None]:
ypred = modelXGB.predict( dtest )

In [None]:
np.savetxt("../../data/submit9.csv", ypred, delimiter=",")

In [95]:
threshold_ratio = 0.15
outfile = '../../data/submit8.csv'

In [96]:
res  = [ ( i, ypred[i] ) for i in range(len(ypred)) ]

rorder = {}
for k, v in sorted( res, key = lambda x:-x[1] ):
    rorder[ k ] = len(rorder) + 1

# write out predictions
ntop = int( threshold_ratio * len(rorder ) )
fo = open(outfile, 'w')
nhit = 0
ntot = 0
fo.write('EventId,RankOrder,target\n')
for k, v in res:
    if rorder[k] <= ntop:
        lb = 1
        nhit += 1
    else:
        lb = 0
    # change output rank order to follow Kaggle convention
    fo.write('%s,%d,%s\n' % ( k,  len(rorder)+1-rorder[k], lb ) )
    ntot += 1
fo.close()

print ('finished writing into prediction file')

finished writing into prediction file
