- reference: https://www.kaggle.com/mostafaibrahim17/xgboost-3

In [1]:
import os
import gc
from tqdm import tqdm

import pandas as pd
import numpy as np
import random
import math

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
import xgboost as xgb


In [2]:
%%time

# from https://www.kaggle.com/valleyzw/ubiquant-lgbm-baseline
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in tqdm(df.columns):
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df


train = reduce_mem_usage(pd.read_pickle('../input/ump-train-picklefile/train.pkl'))

for col in ['time_id', 'investment_id']:
    train[col] = train[col].astype(int)

Memory usage of dataframe is 3642.99 MB


100%|██████████| 304/304 [03:02<00:00,  1.66it/s]


Memory usage after optimization is: 1915.96 MB
Decreased by 47.4%
CPU times: user 1min 21s, sys: 1min 47s, total: 3min 8s
Wall time: 3min 40s


In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141410 entries, 0 to 3141409
Columns: 304 entries, row_id to f_299
dtypes: category(1), float16(301), int64(2)
memory usage: 1.9 GB


# Train, Valid Split

In [4]:
train.drop(['row_id', 'time_id'], axis = 1, inplace = True)
X = train.drop(['target'], axis = 1)
y = train['target']
del train

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3, random_state = 42, shuffle = False)

del X
del y

In [5]:
gc.collect()

42

# Xgboost

In [6]:
# model = xgb.XGBRegressor(n_estimators = 1000,
#                          learning_rate = 0.03,
#                          max_depth = 12,
#                          subsample = 0.9,
#                          colsample_bytree = 0.7,
#                          missing = -999,
#                          random_state = 1111,
#                          tree_method = 'gpu_hist')

In [7]:
# xgboost-tuning notebook parameters

model = xgb.XGBRegressor(
    colsample_bytree = 0.9865590388194095,
    gamma = 2.963042414478359,
    max_depth = 3,
    min_child_weight = 10,
    reg_alpha = 42,
    reg_lambda = 0.8851654205895558,
    n_estimators = 1000,
    learning_rate = 0.03,
    subsample = 0.9,
    random_state = 42,
    tree_method = 'gpu_hist'
)

In [8]:
model.fit(X_train, y_train, early_stopping_rounds = 10,
          eval_set = [(X_valid, y_valid)], eval_metric = 'rmse', verbose = 1)

[0]	validation_0-rmse:1.03886
[1]	validation_0-rmse:1.03150
[2]	validation_0-rmse:1.02455
[3]	validation_0-rmse:1.01796
[4]	validation_0-rmse:1.01174
[5]	validation_0-rmse:1.00585
[6]	validation_0-rmse:1.00030
[7]	validation_0-rmse:0.99504
[8]	validation_0-rmse:0.99010
[9]	validation_0-rmse:0.98541
[10]	validation_0-rmse:0.98098
[11]	validation_0-rmse:0.97686
[12]	validation_0-rmse:0.97293
[13]	validation_0-rmse:0.96922
[14]	validation_0-rmse:0.96574
[15]	validation_0-rmse:0.96245
[16]	validation_0-rmse:0.95930
[17]	validation_0-rmse:0.95637
[18]	validation_0-rmse:0.95361
[19]	validation_0-rmse:0.95100
[20]	validation_0-rmse:0.94854
[21]	validation_0-rmse:0.94624
[22]	validation_0-rmse:0.94409
[23]	validation_0-rmse:0.94205
[24]	validation_0-rmse:0.94009
[25]	validation_0-rmse:0.93827
[26]	validation_0-rmse:0.93655
[27]	validation_0-rmse:0.93495
[28]	validation_0-rmse:0.93343
[29]	validation_0-rmse:0.93201
[30]	validation_0-rmse:0.93067
[31]	validation_0-rmse:0.92940
[32]	validation_0-

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.9865590388194095,
             enable_categorical=False, gamma=2.963042414478359, gpu_id=0,
             importance_type=None, interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=3,
             min_child_weight=10, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=2, num_parallel_tree=1, predictor='auto',
             random_state=42, reg_alpha=42, reg_lambda=0.8851654205895558,
             scale_pos_weight=1, subsample=0.9, tree_method='gpu_hist',
             validate_parameters=1, verbosity=None)

# Submission

In [9]:
import ubiquant
env = ubiquant.make_env()  
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    test_df.drop(['row_id'], axis=1, inplace=True)
    pred = model.predict(test_df)
    sample_prediction_df['target'] = pred
    env.predict(sample_prediction_df) 

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
