In [2]:
import os
import gc
from tqdm import tqdm

import pandas as pd
import numpy as np
import random
import math

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as rmse_score

import xgboost as xgb

from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [3]:
%%time

# from https://www.kaggle.com/valleyzw/ubiquant-lgbm-baseline
def _smallest_fitting_dtype(c_min, c_max, candidates, limits):
    """Return the first dtype in ``candidates`` whose range holds [c_min, c_max], else None.

    ``limits`` is ``np.iinfo`` or ``np.finfo``. A NaN bound never compares
    true, so an empty / all-NaN column yields None.
    """
    for candidate in candidates:
        bounds = limits(candidate)
        if c_min >= bounds.min and c_max <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned.

    * signed integer columns   -> smallest of int8 / int16 / int32 / int64 that fits
    * unsigned integer columns -> smallest of uint8 / uint16 / uint32 / uint64 that fits
    * float columns            -> float16 or float32 when the value range fits,
                                  otherwise float64 (float16 keeps only ~3
                                  significant decimal digits, so values are rounded)
    * object / string columns  -> 'category'
    * every other dtype (bool, datetime, category, nullable extension types, ...)
      is left untouched

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_ints = (np.int8, np.int16, np.int32, np.int64)
    unsigned_ints = (np.uint8, np.uint16, np.uint32, np.uint64)
    small_floats = (np.float16, np.float32)

    for col in tqdm(df.columns):
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / unsigned / float columns are downcast.
        # Dispatching on dtype.kind keeps unsigned ids (e.g. uint16) out of the
        # float branch, where float16 cannot represent integers above 2048 exactly.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            target = _smallest_fitting_dtype(c_min, c_max, small_floats, np.finfo)
            if target is None:
                # Out of float32 range, infinite, or all-NaN: fall back to float64.
                target = np.float64
        elif col_type.kind == 'i':
            target = _smallest_fitting_dtype(c_min, c_max, signed_ints, np.iinfo)
        else:
            target = _smallest_fitting_dtype(c_min, c_max, unsigned_ints, np.iinfo)

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-byte starting size.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df


# Load the training parquet and shrink its dtypes right away.
TRAIN_PATH = '../input/ubiquant-parquet/train_low_mem.parquet'
train = pd.read_parquet(TRAIN_PATH)
train = reduce_mem_usage(train)

# The id columns are cast back to plain Python-int dtype after downcasting.
for id_col in ('time_id', 'investment_id'):
    train[id_col] = train[id_col].astype(int)

In [4]:
# row_id and time_id are removed before modelling; every remaining column
# except 'target' becomes a model input.
train.drop(columns=['row_id', 'time_id'], inplace=True)
y = train['target']
X = train.drop(columns=['target'])
del train

# shuffle=False: the split follows the existing row order instead of sampling rows.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y,
    test_size=0.3,
    random_state=42,
    shuffle=False,
)

# The full frames are no longer needed once the split exists.
del X, y

In [5]:
# Force a garbage-collection pass after the `del train` / `del X` / `del y`
# statements in the previous cell, so the freed frames are reclaimed before tuning.
gc.collect()

# 목적 (Purpose)
1. XGBoost hyperparameter tuning (with hyperopt)
2. Data reduction (lower the memory footprint of the training frame)

In [6]:
# Hyperparameters that hyperopt samples anew on every trial.
tuned_params = {
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'gamma': hp.uniform('gamma', 1, 9),
    'reg_alpha': hp.quniform('reg_alpha', 40, 180, 1),
    'reg_lambda': hp.uniform('reg_lambda', 0, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 10, 1),
}

# Settings held constant across all trials.
fixed_params = {
    'n_estimators': 180,
    'subsample': 0.9,
    'seed': 42,
}

# Search space handed to fmin: sampled entries first, constants after.
space = {**tuned_params, **fixed_params}

In [7]:
def objective(space):
    """Fit one XGBoost regressor for a hyperopt trial and score it on the validation split.

    Reads ``X_train``, ``y_train``, ``X_valid`` and ``y_valid`` from the
    notebook's global namespace.

    Parameters
    ----------
    space : dict
        One sample of the search space. Must provide 'n_estimators',
        'subsample', 'max_depth', 'gamma', 'reg_alpha', 'reg_lambda',
        'colsample_bytree', 'min_child_weight' and 'seed'.

    Returns
    -------
    dict
        ``{'loss': <validation RMSE>, 'status': STATUS_OK}`` for hyperopt to minimise.
    """
    clf = xgb.XGBRegressor(n_estimators = space['n_estimators'],
                           subsample = space['subsample'],
                           # hp.quniform returns floats (e.g. 7.0); the tree depth must be an int.
                           max_depth = int(space['max_depth']),
                           gamma = space['gamma'],
                           reg_alpha = int(space['reg_alpha']),
                           # reg_lambda and colsample_bytree are continuous samples in
                           # [0, 1) and [0.5, 1); wrapping them in int() would truncate
                           # both to 0, so they are passed through unchanged.
                           reg_lambda = space['reg_lambda'],
                           colsample_bytree = space['colsample_bytree'],
                           min_child_weight = int(space['min_child_weight']),
                           random_state = space['seed'],
                           tree_method = 'gpu_hist'
                          )
    # Early stopping monitors the last entry of eval_set, i.e. the validation split.
    evaluation = [(X_train, y_train), (X_valid, y_valid)]

    # NOTE(review): eval_metric / early_stopping_rounds as fit() arguments are
    # deprecated in newer XGBoost releases -- confirm against the installed version.
    clf.fit(X_train, y_train, eval_set = evaluation,
            eval_metric = 'rmse', early_stopping_rounds = 10,
            verbose = 0)

    pred = clf.predict(X_valid)
    # `rmse_score` is an alias of sklearn's mean_squared_error, which returns the
    # MSE; take the square root so the reported loss really is an RMSE.
    rmse = float(np.sqrt(rmse_score(y_valid, pred)))
    print(f"Score: {rmse}")

    return {'loss': rmse, 'status': STATUS_OK}

In [None]:
# Keeps the per-trial results so the search history can be inspected afterwards.
trials = Trials()

# 100 evaluations of `objective` over `space`, guided by the TPE algorithm.
best_hyperparams = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials,
)

In [None]:
# Show the winning sample returned by the search.
header = "The best hyperparameters are : "
print(header, "\n")
print(best_hyperparams)

In [None]:
# Build the final model from the best trial.
# fmin returns only the sampled entries, keyed by their hp labels, with
# quniform values as floats. Integer parameters are therefore cast here, and
# the fixed settings (n_estimators, subsample, seed) are taken from `space`.
# Everything is passed as keyword arguments; the dict itself is not a valid
# positional argument.
model = xgb.XGBRegressor(
    n_estimators=space['n_estimators'],
    subsample=space['subsample'],
    max_depth=int(best_hyperparams['max_depth']),
    gamma=best_hyperparams['gamma'],
    reg_alpha=int(best_hyperparams['reg_alpha']),
    reg_lambda=best_hyperparams['reg_lambda'],
    colsample_bytree=best_hyperparams['colsample_bytree'],
    min_child_weight=int(best_hyperparams['min_child_weight']),
    random_state=space['seed'],
    tree_method='gpu_hist',
)

In [None]:
# Fit on the training split; the validation split drives early stopping
# (training halts after 10 rounds without improvement) and is logged every round.
valid_sets = [(X_valid, y_valid)]
model.fit(
    X_train,
    y_train,
    eval_set=valid_sets,
    early_stopping_rounds=10,
    verbose=1,
)

In [None]:
# Competition API module; imported here because it only exists inside the
# Kaggle competition environment.
import ubiquant

env = ubiquant.make_env()
iter_test = env.iter_test()

# Training dropped both 'row_id' and 'time_id'. Selecting the training columns
# explicitly (in training order) keeps the inference features identical to the
# fitted ones, whatever extra id columns a test batch carries.
feature_cols = list(X_train.columns)

for (test_df, sample_prediction_df) in iter_test:
    pred = model.predict(test_df[feature_cols])
    sample_prediction_df['target'] = pred
    # Submit this batch's predictions back to the API.
    env.predict(sample_prediction_df)