In [1]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import ensemble
from sklearn.metrics import roc_auc_score
import optuna
import warnings

In [2]:
df_train = pd.read_csv('../input/tabularsep21-kfolddataset/train_10folds.csv')
test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')
submission_data = pd.read_csv('../input/tabular-playground-series-sep-2021/sample_solution.csv')

df_train.head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f112,f113,f114,f115,f116,f117,f118,claim,nan_count,kfold
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1,1,3
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0,0,8
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1,5,6
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,1,2,6
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,10002.0,0.23049,1,8,9


In [3]:
features = [col for col in df_train.columns if col not in ('claim', 'kfold')]
features

['f1',
 'f2',
 'f3',
 'f4',
 'f5',
 'f6',
 'f7',
 'f8',
 'f9',
 'f10',
 'f11',
 'f12',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f18',
 'f19',
 'f20',
 'f21',
 'f22',
 'f23',
 'f24',
 'f25',
 'f26',
 'f27',
 'f28',
 'f29',
 'f30',
 'f31',
 'f32',
 'f33',
 'f34',
 'f35',
 'f36',
 'f37',
 'f38',
 'f39',
 'f40',
 'f41',
 'f42',
 'f43',
 'f44',
 'f45',
 'f46',
 'f47',
 'f48',
 'f49',
 'f50',
 'f51',
 'f52',
 'f53',
 'f54',
 'f55',
 'f56',
 'f57',
 'f58',
 'f59',
 'f60',
 'f61',
 'f62',
 'f63',
 'f64',
 'f65',
 'f66',
 'f67',
 'f68',
 'f69',
 'f70',
 'f71',
 'f72',
 'f73',
 'f74',
 'f75',
 'f76',
 'f77',
 'f78',
 'f79',
 'f80',
 'f81',
 'f82',
 'f83',
 'f84',
 'f85',
 'f86',
 'f87',
 'f88',
 'f89',
 'f90',
 'f91',
 'f92',
 'f93',
 'f94',
 'f95',
 'f96',
 'f97',
 'f98',
 'f99',
 'f100',
 'f101',
 'f102',
 'f103',
 'f104',
 'f105',
 'f106',
 'f107',
 'f108',
 'f109',
 'f110',
 'f111',
 'f112',
 'f113',
 'f114',
 'f115',
 'f116',
 'f117',
 'f118',
 'nan_count']

In [4]:
test['nan_count'] = test.isnull().sum(axis=1)
xtest = test[features]
mode = df_train[features].mode().iloc[0]
xtest = xtest.fillna(mode)

In [5]:
xtrain = df_train[features]
scaler = preprocessing.StandardScaler()

test[features] = scaler.fit_transform(xtest[features])
test.head()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,nan_count
0,957919,1.747278,0.948442,-0.427147,-0.840645,0.095294,0.468778,-0.832226,-1.072016,-0.77977,...,-0.131863,-0.001763,-0.533905,-1.254592,0.194415,-0.124117,-0.629749,-0.63456,-0.250647,-0.44261
1,957920,0.909508,0.170491,-0.353351,2.503614,0.188642,-0.556304,-0.59313,0.528871,-0.52414,...,0.986689,-0.553983,-0.566362,0.986925,0.287187,-0.502827,0.022404,0.798596,-0.695993,-0.935935
2,957921,0.690577,0.661839,-0.515251,-0.731419,-0.063751,0.191682,0.818023,-0.16085,-0.609422,...,-0.708205,0.077831,-0.584945,-0.513748,-0.283915,-0.466968,-0.63014,0.552108,-1.045977,-0.44261
3,957922,-0.841062,0.324478,-0.474272,3.122008,0.426413,-0.039023,-0.979934,-0.13602,-0.795736,...,0.944134,-0.614268,-0.543467,-0.05057,-0.40266,-0.517696,2.560428,0.65422,-0.579111,-0.935935
4,957923,-0.24076,-2.435226,1.035997,0.701076,1.003687,-0.830404,1.108725,0.1619,-0.780536,...,0.061811,1.876614,-0.526856,-0.678597,-0.146556,0.355174,2.57909,-0.56231,-0.557469,-0.935935


In [6]:
df_train[features].head()

Unnamed: 0,f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,nan_count
0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,86.489,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,9953.6,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,15827.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,5
3,0.15236,0.007259,780.1,0.025179,0.51947,7.4914,112.51,259490.0,77814000000000.0,-36.837,...,-34.858,2.0694,0.79631,-16.336,4952.4,1.1784,4533000000000.0,4889.1,0.51486,2
4,0.11623,0.5029,-109.15,0.29791,0.3449,-0.40932,2538.9,65332.0,1907200000000000.0,144.12,...,-13.641,1.5298,1.1464,-0.43124,3856.5,1.483,-8991300000000.0,10002.0,0.23049,8


In [13]:
xtrain.shape, df_train.claim.shape

((957919, 119), (957919,))

In [35]:
def run(trial):
    #optimize in one fold
    fold = 0
    xtrain = df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)
    
    ytrain = xtrain.claim
    yvalid = xvalid.claim
    
    xtrain = scaler.transform(xtrain[features])
    xvalid = scaler.transform(xvalid[features])
    
    learning_rate = trial.suggest_float('learning_rate', 1e-2, 0.8, log=True)  
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.1, 0.9)
    max_depth = trial.suggest_int('max_depth', 1, 9)
    reg_lambda = trial.suggest_float('reg_lembda', 1e-5, 100.0)
    reg_alpha = trial.suggest_float('reg_alpha', 1e-5, 100.0)
    subsample = trial.suggest_float('sumsample', 0.1, 0.9)
    alpha = trial.suggest_int('alpha', 0, 100)
    #n_estimator = trial.suggest_int('n_estimator', 500, 4000)
    
    model = XGBClassifier(random_state = 0,
                          #n_estimator = n_estimator,
                          learning_rate = learning_rate,
                          colsample_bytree = colsample_bytree,
                          max_depth = max_depth,
                          reg_lambda = reg_lambda,
                          reg_alpha = reg_alpha,
                          subsample = subsample,
                          alpha = alpha, 
                          tree_method = 'gpu_hist',
                          gpu_id = 0,
                          predictor = 'gpu_predictor',
                          #boosting='gbdt',
                          #metric='multiclass',
                          eval_metric='mlogloss'
                         )
    model.fit(xtrain, ytrain)
    valid_preds = model.predict(xvalid)
    
    roc_auc = roc_auc_score(yvalid, valid_preds)
    return roc_auc

study = optuna.create_study(direction='maximize')   
study.optimize(run, n_trials=5)

[32m[I 2021-09-14 18:34:21,340][0m A new study created in memory with name: no-name-51775d13-1a48-40fa-9dbd-41afc0622da2[0m
[32m[I 2021-09-14 18:34:28,086][0m Trial 0 finished with value: 0.7730736313698521 and parameters: {'learning_rate': 0.08674869015674984, 'colsample_bytree': 0.894194483453287, 'max_depth': 7, 'reg_lembda': 96.69872679770177, 'reg_alpha': 89.60570782924681, 'sumsample': 0.2833840188066744, 'alpha': 35}. Best is trial 0 with value: 0.7730736313698521.[0m
[32m[I 2021-09-14 18:34:36,935][0m Trial 1 finished with value: 0.7725653097363466 and parameters: {'learning_rate': 0.10276011680592632, 'colsample_bytree': 0.752270480898139, 'max_depth': 9, 'reg_lembda': 92.34692521110107, 'reg_alpha': 34.23477034695513, 'sumsample': 0.30034294429550706, 'alpha': 24}. Best is trial 0 with value: 0.7730736313698521.[0m
[32m[I 2021-09-14 18:34:43,749][0m Trial 2 finished with value: 0.7699159648269965 and parameters: {'learning_rate': 0.48146552270090415, 'colsample_byt

In [36]:
best_params = study.best_params
best_params

{'learning_rate': 0.08674869015674984,
 'colsample_bytree': 0.894194483453287,
 'max_depth': 7,
 'reg_lembda': 96.69872679770177,
 'reg_alpha': 89.60570782924681,
 'sumsample': 0.2833840188066744,
 'alpha': 35}

In [37]:
final_preds = []
for fold in range(10):
    xtrain = df_train[df_train.kfold != fold].reset_index(drop=True)
    xvalid = df_train[df_train.kfold == fold].reset_index(drop=True)
    
    ytrain = xtrain.claim
    yvalid = xvalid.claim
    
    xtrain = scaler.transform(xtrain[features])
    xvalid = scaler.transform(xvalid[features])
    
    best_params = {'learning_rate': 0.08674869015674984,
                     'colsample_bytree': 0.894194483453287,
                     'max_depth': 7,
                     'reg_lembda': 96.69872679770177,
                     'reg_alpha': 89.60570782924681,
                     'sumsample': 0.2833840188066744,
                     'alpha': 35}
    
    model = XGBClassifier(random_state = 0,
                          #n_estimator = 500,
                          **best_params, 
                          tree_method = 'gpu_hist',
                          gpu_id = 0,
                          predictor = 'gpu_predictor',
                          eval_metric='mlogloss'
                         )
    model.fit(xtrain, ytrain)
    valid_preds = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_preds.append(test_preds)
    print(fold, roc_auc_score(yvalid, valid_preds))

preds = np.mean(np.column_stack(final_preds), axis=1)
    



Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


0 0.7729688808794686




Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


1 0.7721575348665891




Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


2 0.7726490523973744




Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


3 0.771271484627671




Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


4 0.7742007667775599




Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


5 0.771435915721172




Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


6 0.7753392224440004




Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


7 0.7728919237247462




Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


8 0.773525432758865




Parameters: { "reg_lembda", "sumsample" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


9 0.7733823517762851


In [38]:
preds = np.mean(np.column_stack(final_preds), axis=1)
preds

array([1., 1., 1., ..., 1., 1., 1.])

In [39]:
submission_data.claim = preds
submission_data.to_csv('submission.csv', index=False)

In [40]:
submission_data.claim.value_counts().to_frame().T

Unnamed: 0,1.0,0.9,0.8,0.7,0.6,0.5,0.4,0.3,0.2,0.1,0.0
claim,491010,1380,453,257,139,93,58,36,29,13,6
