In [1]:
import os
import gc
from tqdm import tqdm

import pandas as pd
import numpy as np
import random
import math

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as rmse_score

import xgboost as xgb

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [2]:
%%time

# from https://www.kaggle.com/valleyzw/ubiquant-lgbm-baseline
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in tqdm(df.columns):
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


train = reduce_mem_usage(pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet'))

for col in ['time_id', 'investment_id']:
    train[col] = train[col].astype(int)

Memory usage of dataframe is 3642.99 MB


100%|██████████| 304/304 [03:04<00:00,  1.65it/s]

Memory usage after optimization is: 1915.96 MB
Decreased by 47.4%
CPU times: user 1min 31s, sys: 1min 52s, total: 3min 24s
Wall time: 3min 43s





In [3]:
train.drop(['row_id', 'time_id'], axis = 1, inplace = True)
X = train.drop(['target'], axis = 1)
y = train['target']
del train

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3, random_state = 42, shuffle = False)

del X
del y

In [4]:
gc.collect()

21

# 목적
1. xgboost tuning
2. data reduction

In [5]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'subsample': 0.9,
        'seed': 42
    }

In [6]:
def objective(space):
    clf = xgb.XGBRegressor(n_estimators = space['n_estimators'],
                           subsample = space['subsample'],
                           max_depth = int(space['max_depth']),
                           gamma = space['gamma'],
                           reg_alpha = int(space['reg_alpha']),
                           reg_lambda = int(space['reg_lambda']),
                           colsample_bytree = int(space['colsample_bytree']),
                           min_child_weight = int(space['min_child_weight']),
                           random_state = space['seed'],
                           tree_method = 'gpu_hist'
                          )
    evaluation = [(X_train, y_train), (X_valid, y_valid)]
    
    clf.fit(X_train, y_train, eval_set = evaluation,
            eval_metric = 'rmse', early_stopping_rounds = 10,
            verbose = 0)
    
    pred = clf.predict(X_valid)
    rmse = rmse_score(y_valid, pred)
    print(f"Score: {rmse}")
    
    return {'loss': rmse, 'status': STATUS_OK}

In [7]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

Score: 0.8280236124992371
Score: 0.8278078436851501
Score: 0.8279695510864258
Score: 0.8271894454956055
Score: 0.8278506398200989
Score: 0.8279497623443604
Score: 0.8277723789215088
Score: 0.8279606699943542
Score: 0.8280506134033203
Score: 0.827303409576416
Score: 0.8279312252998352
Score: 0.8278653621673584
Score: 0.8278455138206482
Score: 0.8278038501739502
Score: 0.8273518085479736
Score: 0.8278404474258423
Score: 0.8279592394828796
Score: 0.8273292779922485
Score: 0.8278650045394897
Score: 0.8279038071632385
Score: 0.8271316885948181
Score: 0.8277272582054138
Score: 0.827908992767334
Score: 0.8272759318351746
Score: 0.8278388977050781
Score: 0.8272711038589478
Score: 0.8278904557228088
Score: 0.8277806639671326
Score: 0.827834963798523
Score: 0.8279440402984619
Score: 0.8277619481086731
Score: 0.8278400301933289
Score: 0.8278200626373291
Score: 0.8272761702537537
Score: 0.8278729319572449
Score: 0.8278120756149292
Score: 0.8273395299911499
Score: 0.8277384638786316
Score: 0.827914

In [8]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)

The best hyperparameters are :  

{'colsample_bytree': 0.9865590388194095, 'gamma': 2.963042414478359, 'max_depth': 3.0, 'min_child_weight': 10.0, 'reg_alpha': 42.0, 'reg_lambda': 0.8851654205895558}


In [9]:
model = xgb.XGBRegressor(best_hyperparams)

In [10]:
model.fit(X_train, y_train, early_stopping_rounds = 10,
          eval_set = [(X_valid, y_valid)], verbose = 1)

XGBoostError: [15:49:05] ../src/objective/objective.cc:26: Unknown objective function: `{'colsample_bytree': 0.9865590388194095, 'gamma': 2.963042414478359, 'max_depth': 3.0, 'min_child_weight': 10.0, 'reg_alpha': 42.0, 'reg_lambda': 0.8851654205895558}`
Objective candidate: survival:aft
Objective candidate: binary:hinge
Objective candidate: multi:softmax
Objective candidate: multi:softprob
Objective candidate: rank:pairwise
Objective candidate: rank:ndcg
Objective candidate: rank:map
Objective candidate: reg:squarederror
Objective candidate: reg:squaredlogerror
Objective candidate: reg:logistic
Objective candidate: reg:pseudohubererror
Objective candidate: binary:logistic
Objective candidate: binary:logitraw
Objective candidate: reg:linear
Objective candidate: count:poisson
Objective candidate: survival:cox
Objective candidate: reg:gamma
Objective candidate: reg:tweedie

Stack trace:
  [bt] (0) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x21da6d) [0x7fd6dceaba6d]
  [bt] (1) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x21e0c9) [0x7fd6dceac0c9]
  [bt] (2) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1b2522) [0x7fd6dce40522]
  [bt] (3) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(+0x1b9e5d) [0x7fd6dce47e5d]
  [bt] (4) /opt/conda/lib/python3.7/site-packages/xgboost/lib/libxgboost.so(XGBoosterBoostedRounds+0x31) [0x7fd6dcd266e1]
  [bt] (5) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.8(+0x6a4a) [0x7fd750483a4a]
  [bt] (6) /opt/conda/lib/python3.7/lib-dynload/../../libffi.so.8(+0x5fea) [0x7fd750482fea]
  [bt] (7) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(_ctypes_callproc+0x2f4) [0x7fd750499784]
  [bt] (8) /opt/conda/lib/python3.7/lib-dynload/_ctypes.cpython-37m-x86_64-linux-gnu.so(+0x10ff8) [0x7fd750499ff8]



In [None]:
import ubiquant
env = ubiquant.make_env()  
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    test_df.drop(['row_id'], axis=1, inplace=True)
    pred = model.predict(test_df)
    sample_prediction_df['target'] = pred
    env.predict(sample_prediction_df) 

# Reference

- [A Guide on XGBoost hyperparameters tuning](https://www.kaggle.com/prashant111/a-guide-on-xgboost-hyperparameters-tuning)
- [xgboost regressor parameter](https://xgboost.readthedocs.io/en/stable/parameter.html)