In [1]:
import numpy as np
import pandas as pd

In [2]:
train = pd.read_csv('../input/sample-data/train_preprocessed.csv')
train_x = train.drop(['target'], axis=1)
train_y = train.target
test_x = pd.read_csv('../input/sample-data/test_preprocessed.csv')

In [16]:
train_x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 28 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   age                 10000 non-null  int64  
 1   sex                 10000 non-null  int64  
 2   height              10000 non-null  float64
 3   weight              10000 non-null  float64
 4   product             10000 non-null  int64  
 5   amount              10000 non-null  int64  
 6   medical_info_a1     10000 non-null  int64  
 7   medical_info_a2     10000 non-null  int64  
 8   medical_info_a3     10000 non-null  int64  
 9   medical_info_b1     10000 non-null  int64  
 10  medical_info_b2     10000 non-null  int64  
 11  medical_info_b3     10000 non-null  int64  
 12  medical_info_c1     7030 non-null   float64
 13  medical_info_c2     1998 non-null   float64
 14  medical_keyword_1   10000 non-null  int64  
 15  medical_keyword_2   10000 non-null  int64  
 16  medic

In [5]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=71)
tr_idx, ca_idx = list(kf.split(train_x))[0]

tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[tr_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[tr_idx]

In [6]:
import xgboost as xgb
from sklearn.metrics import log_loss

In [19]:
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
dtest = xgb.DMatrix(test_x)

In [10]:
params = {'objective': 'binary:logistic', 'silent': 1, 'random_state': 71}
num_round = 50

In [11]:
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watchlist
)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-error:0.12853	eval-error:0.12853
[1]	train-error:0.11533	eval-error:0.11533
[2]	train-error:0.10933	eval-error:0.10933
[3]	train-error:0.10533	eval-error:0.10533
[4]	train-error:0.09693	eval-error:0.09693
[5]	train-error:0.09467	eval-error:0.09467
[6]	train-error:0.08733	eval-error:0.08733
[7]	train-error:0.08493	eval-error:0.08493
[8]	train-error:0.07813	eval-error:0.07813
[9]	train-error:0.07373	eval-error:0.07373
[10]	train-error:0.06867	eval-error:0.06867
[11]	train-error:0.06493	eval-error:0.06493
[12]	train-error:0.06227	eval-error:0.06227
[13]	train-error:0.06053	eval-error:0.06053
[14]	train-error:0.05680	eval-error:0.05680
[15]	train-error:0.05040	eval-error:0.05040
[16]	train-error:0.04920

In [17]:
va_pred = model.predict(dvalid)
score = log_loss(va_y, va_pred)
print(f'logloss:{score:.4f}')

logloss:0.0695


In [21]:
pred = model.predict(dtest)
pred

array([0.2064009 , 0.02400707, 0.00388634, ..., 0.83468455, 0.00239013,
       0.22417206], dtype=float32)

In [22]:
params = {'objective': 'binary:logistic', 'silent':1, 'random_state':71, 'eval_metric' :'logloss'}
num_round = 500
watchlist = [(dtrain, 'train'), (dvalid, 'eval')]
model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_round, evals=watchlist, early_stopping_rounds=20)

Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-logloss:0.54088	eval-logloss:0.54088
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 20 rounds.
[1]	train-logloss:0.45269	eval-logloss:0.45269
[2]	train-logloss:0.39482	eval-logloss:0.39482
[3]	train-logloss:0.35198	eval-logloss:0.35198
[4]	train-logloss:0.32021	eval-logloss:0.32021
[5]	train-logloss:0.29673	eval-logloss:0.29673
[6]	train-logloss:0.27610	eval-logloss:0.27610
[7]	train-logloss:0.25886	eval-logloss:0.25886
[8]	train-logloss:0.24363	eval-logloss:0.24363
[9]	train-logloss:0.23153	eval-logloss:0.23153
[10]	train-logloss:0.22016	eval-logloss:0.22016
[11]	train-logloss:0.20963	eval-logloss:0.20963
[12

In [25]:
pred = model.predict(dtest, ntree_limit=model.best_ntree_limit)
print(pred)
pred.shape

[1.19892955e-01 1.90523744e-04 2.04496814e-06 ... 9.75438714e-01
 6.37821174e-08 5.99980317e-02]


(10000,)