In [None]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

from utils import *

import lightgbm as lgb

# Global configuration: RNG seed and the root folder holding all data files.
seed = 42
PATH = Path('data')
# Show the entries found directly under the data folder.
[entry for entry in PATH.iterdir()]

In [None]:
# Columns passed to the loading helpers as "columns to drop". The test cell
# drops every entry except the first one (cols_to_drop[1:]).
cols_to_drop = [
    'is_attributed',
    'ip',
    'day',
    'device',
]
# Feather file that holds the training rows.
train_filename = 'train_day8_3to16_nextclick_FE.feather'

# Sample run

In [None]:
# String paths of every entry inside the validation folder.
val_names = list(map(str, (PATH/'validation').iterdir()))
val_names

In [None]:
# Load the training features and labels used by the sample run.
# NOTE(review): the third argument is presumably a row limit / sample size;
# confirm against utils.get_train.
train_filename = 'train_day8_3to16_nextclick_FE.feather'
train_df, y_train = get_train(cols_to_drop, train_filename, 100_000)

In [None]:
# Display the dtype of each column in the frame returned by get_train.
train_df.dtypes


In [None]:
%%time

# LightGBM settings for the GPU sample run (binary objective, AUC metric).
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.2,
    'scale_pos_weight': 120,
    'verbose': 50,
    "device": "gpu",
    "max_bin": 63,
    "gpu_use_dp": False,
}

val_names = ['data/validation/val2.feather']
for val_path in val_names:
    val_df, y_val = get_val_by_name(cols_to_drop, val_path)

    # Up to 2000 boosting rounds, stopping once the validation metric has not
    # improved for 50 rounds. The first four columns are treated as categorical.
    lgb_model = lgb.train(params, lgb.Dataset(train_df, label=y_train, categorical_feature=[0, 1, 2, 3]), 2000,
                          lgb.Dataset(val_df, label=y_val),
                          verbose_eval=50,
                          early_stopping_rounds=50)

    # Pass best_iteration by keyword: on LightGBM >= 3.0 the second positional
    # argument of Booster.predict is start_iteration, not num_iteration, so a
    # positional call would skip the first trees instead of capping at the best one.
    best_ite = lgb_model.best_iteration
    train_pred = lgb_model.predict(train_df, num_iteration=best_ite)
    val_pred = lgb_model.predict(val_df, num_iteration=best_ite)

    # Despite the *_loss names these hold ROC AUC scores (higher is better).
    train_loss = roc_auc_score(y_train, train_pred)
    val_loss = roc_auc_score(y_val, val_pred)
    print(f'Train AUC: {train_loss}. Val AUC: {val_loss}. Best ite: {lgb_model.best_iteration}')

    # Release the validation data before the next file is loaded.
    del val_df, y_val
    gc.collect()
    

In [None]:
%%time

# LightGBM settings for the CPU sample run (binary objective, AUC metric).
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.2,
    'scale_pos_weight': 120,
    'verbose': 50,
    "device": "cpu",
}

val_names = ['data/validation/val2.feather']
for val_path in val_names:
    val_df, y_val = get_val_by_name(cols_to_drop, val_path)

    # Up to 2000 boosting rounds, stopping once the validation metric has not
    # improved for 50 rounds.
    lgb_model = lgb.train(params, lgb.Dataset(train_df, label=y_train), 2000,
                          lgb.Dataset(val_df, label=y_val),
                          verbose_eval=50,
                          early_stopping_rounds=50)

    # Pass best_iteration by keyword: on LightGBM >= 3.0 the second positional
    # argument of Booster.predict is start_iteration, not num_iteration, so a
    # positional call would skip the first trees instead of capping at the best one.
    best_ite = lgb_model.best_iteration
    train_pred = lgb_model.predict(train_df, num_iteration=best_ite)
    val_pred = lgb_model.predict(val_df, num_iteration=best_ite)

    # Despite the *_loss names these hold ROC AUC scores (higher is better).
    train_loss = roc_auc_score(y_train, train_pred)
    val_loss = roc_auc_score(y_val, val_pred)
    print(f'Train AUC: {train_loss}. Val AUC: {val_loss}. Best ite: {lgb_model.best_iteration}')

    # Release the validation data before the next file is loaded.
    del val_df, y_val
    gc.collect()
    

In [None]:
# Feature-importance bar chart for `lgb_model`, drawn on a tall 10x12 figure.
fig, ax = plt.subplots(figsize=(10, 12))
lgb.plot_importance(lgb_model, ax=ax, height=0.5)

# Hyperparameter tuning

In [None]:
# Reload the training data with the larger third argument used for tuning.
# NOTE(review): presumably a row limit of ten million; confirm in utils.get_train.
train_df, y_train = get_train(cols_to_drop, train_filename, 10_000_000)

In [None]:
# Sanity check: shapes of the features and of the labels, one per line.
print(train_df.shape, y_train.shape, sep='\n')

In [None]:
# Validation split that every tuning trial is scored against.
val_name = 'data/validation/val2.feather'
val_df, y_val = get_val_by_name(cols_to_drop, val_name)
print(val_df.shape)


In [None]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

# Per-trial history, appended to by score() in evaluation order:
# validation AUC and best boosting iteration of each trial.
val_losses = []
ites = []


def score(params):
    """Hyperopt objective: train LightGBM with `params` and score it on validation.

    Reads the notebook globals `train_df`, `y_train`, `val_df` and `y_val`.
    Trains for up to 2000 rounds with early stopping after 50 rounds without
    improvement, prints train/validation AUC, and appends the validation AUC
    and best iteration to `val_losses` and `ites`.

    Args:
        params: dict of LightGBM training parameters sampled by hyperopt.

    Returns:
        dict with 'loss' set to the negated validation AUC and 'status' set to
        STATUS_OK. The AUC is negated because hyperopt minimizes 'loss' while
        a higher AUC is better.
    """
    print("Training with params: ")
    print(params)

    lgb_model = lgb.train(params, lgb.Dataset(train_df, label=y_train), 2000,
                          lgb.Dataset(val_df, label=y_val),
                          verbose_eval=False,
                          early_stopping_rounds=50)

    # Keyword form on purpose: on LightGBM >= 3.0 the second positional
    # argument of Booster.predict is start_iteration, not num_iteration.
    best_ite = lgb_model.best_iteration
    train_pred = lgb_model.predict(train_df, num_iteration=best_ite)
    val_pred = lgb_model.predict(val_df, num_iteration=best_ite)

    # Despite the *_loss names these are ROC AUC scores.
    train_loss = roc_auc_score(y_train, train_pred)
    val_loss = roc_auc_score(y_val, val_pred)
    val_losses.append(val_loss)
    ites.append(best_ite)
    print(f'Train AUC: {train_loss}. Val AUC: {val_loss}. Best ite: {best_ite}')

    del lgb_model
    gc.collect()

    # Returning the raw AUC would make fmin search for the *worst* model.
    return {'loss': -val_loss, 'status': STATUS_OK}

def optimize(space, max_evals=5, trials=None):
    """Run a TPE search over `space` using `score` as the objective.

    Args:
        space: hyperopt search space (dict of fixed values and hp.* expressions).
        max_evals: number of parameter sets to evaluate.
        trials: optional hyperopt `Trials` object that collects the full
            per-trial history; when None, fmin creates its own internally.

    Returns:
        The dict returned by `fmin` for the best trial. For `hp.choice`
        dimensions hyperopt reports the index of the chosen option, not the
        option itself; pass the result through `hyperopt.space_eval` to get
        the actual parameter values.
    """
    return fmin(score, space, algo=tpe.suggest,
                trials=trials,
                max_evals=max_evals)

In [None]:

from hyperopt import space_eval

# Search space expressed with LightGBM parameter names, because score() trains
# with lgb.train. The previous XGBoost-style entries ('binary:logistic',
# 'eval_metric', 'gamma', 'tree_method') are not valid LightGBM settings; in
# particular LightGBM rejects the 'binary:logistic' objective.
# To train on GPU, add the "device"/"max_bin"/"gpu_use_dp" entries used in the
# GPU sample run.
space = {
    #'n_estimators': hp.quniform('n_estimators', 50, 500, 5),
#     'max_depth': hp.choice('max_depth', np.arange(5, 10, dtype=int)),
    'subsample': hp.quniform('subsample', 0.65, .9, 0.05),
    # LightGBM only applies row subsampling when the bagging frequency is non-zero.
    'subsample_freq': 1,
    'colsample_bytree': hp.quniform('colsample_bytree', 0.6, .75, 0.05),
    # LightGBM counterpart of XGBoost's gamma.
    'min_split_gain': hp.quniform('min_split_gain', 0, 0.7, 0.05),
    'num_leaves': hp.choice('num_leaves', np.arange(115,139, dtype=int)),
    'min_child_weight': hp.choice('min_child_weight', np.arange(150,250, dtype=int)),
    'scale_pos_weight': hp.choice('scale_pos_weight', np.arange(140,175, dtype=int)),
    'learning_rate': 0.2,
    'metric': 'auc',
    'objective': 'binary',
    'seed': seed,
}
# fmin reports hp.choice dimensions as option indices; space_eval maps the
# result back onto the real parameter values.
best_hyperparams = space_eval(space, optimize(space,max_evals=100))
print("The best hyperparameters are: ")
print(best_hyperparams)

# Test prediction

In [None]:
# String paths of every entry inside the validation folder.
val_names = list(map(str, (PATH/'validation').iterdir()))
val_names

In [None]:
# Reload the training data that will be merged with the validation split.
# NOTE(review): the third argument is presumably a row limit; confirm in utils.get_train.
train_df, y_train = get_train(cols_to_drop, train_filename, 10_000_000)

In [None]:
# Validation files that are folded into the final training set.
val_name = ['data/validation/val2.feather']

# Each helper call returns a pair indexed as (features, labels). A comprehension
# is used so that no loop variable keeps the last loaded pair alive after the
# `del` statements below.
val_pairs = [get_val_by_name(cols_to_drop, name) for name in val_name]
val_df = [pair[0] for pair in val_pairs]
y_val = [pair[1] for pair in val_pairs]
del val_pairs

# Stack train and validation rows into one frame / one label series.
final_df = pd.concat([train_df] + val_df, ignore_index=True)
final_y = pd.concat([y_train] + y_val, ignore_index=True)

# The concatenated copies are all that is needed from here on.
del train_df, y_train, val_df, y_val
gc.collect()

In [None]:
# Display the (rows, columns) of the concatenated train + validation features.
final_df.shape

In [None]:
# final_df.to_feather(PATH/'final_train.feather')
# final_y.to_csv(PATH/'final_y.csv',index=False,header=True)

In [None]:
# final_df = pd.read_feather(PATH/'final_train.feather')
# final_y = pd.read_csv(PATH/'final_y.csv').iloc[:,0]  # assumes the CSV has a header row

In [None]:
# Slow-down factor: the learning rate is divided by `ratio` and the number of
# boosting rounds is multiplied by it.
ratio = 2

# Alternative parameter set kept for reference (entries not listed are the
# same as in `params` below):
#   colsample_bytree=0.7000000000000001, gamma=0.15000000000000002,
#   max_leaf_nodes=137, min_child_weight=234, scale_pos_weight=174,
#   subsample=0.65, tree_method='gpu_hist'
params = dict(
    colsample_bytree=0.65,
    eval_metric='auc',
    gamma=0.6000000000000001,
    learning_rate=0.2 / ratio,
    # NOTE(review): XGBoost documents `max_leaves`, not `max_leaf_nodes`;
    # confirm whether this entry has any effect.
    max_leaf_nodes=123,
    min_child_weight=226,
    objective='binary:logistic',
    scale_pos_weight=153,
    seed=42,
    subsample=0.9,
    # 'gpu_hist' was used previously.
    tree_method="hist",
)
# Fixed number of boosting rounds for the final fit.
n_ite = (50 + 5) * ratio

# Wrap the full training data (features + labels) for XGBoost.
dtrain = xgb.DMatrix(final_df, final_y)

In [None]:
# Drop the pandas objects; training below only uses the DMatrix built from them.
del final_df
del final_y
gc.collect()

In [None]:
# Final fit for a fixed number of rounds on all available data. No held-out
# set exists at this point, so only the training metric is reported; the
# previous watchlist evaluated `dtrain` a second time under the label 'valid',
# which doubled evaluation work and printed a misleading "validation" score.
xgb_model = xgb.train(params, dtrain, n_ite, [(dtrain, 'train')],
                      verbose_eval=10)

In [None]:
# The training matrix is no longer needed once the model is fitted.
del dtrain
gc.collect()

In [None]:
# Feature-importance bar chart for the final XGBoost model (10x12 figure).
fig, ax = plt.subplots(figsize=(10, 12))
xgb.plot_importance(xgb_model, ax=ax, height=0.5)

In [None]:
# Persist the fitted booster, then release it. `del` drops the reference and
# lets Python finalize the object; calling __del__() by hand would leave the
# name bound to an already-finalized booster.
xgb_model.save_model(str(PATH/'xgb_FE_best_more.model'))
del xgb_model
gc.collect()

In [None]:

# Create an empty booster configured with 4 threads, then restore the saved model into it.
xgb_model = xgb.Booster(params={'nthread': 4})
xgb_model.load_model(str(PATH/'xgb_FE_best_more.model'))


In [None]:
# Load the test features and remove every entry of cols_to_drop except the
# first one, which is not dropped here.
test = get_feather('test_nextclick_FE.feather').drop(cols_to_drop[1:], axis=1)

test.shape

In [None]:
# Wrap the test features for XGBoost, then drop the pandas frame.
dtest = xgb.DMatrix(data=test)

del test
gc.collect()

In [None]:
# Score the test matrix with the reloaded booster.
pred = xgb_model.predict(dtest)

In [None]:
# Fill the sample submission with the predictions. Item assignment is used
# because attribute assignment (`sub.is_attributed = ...`) does not create a
# column when the name is missing; it only sets a Python attribute.
sub = pd.read_csv(PATH/'sample_submission.csv')
sub['is_attributed'] = pred
sub.tail()

In [None]:
# Make sure the output folder exists so the write cannot fail at the very end
# of a long run.
(PATH/'submission').mkdir(parents=True, exist_ok=True)
sub.to_csv(PATH/'submission'/'XGB_FE_day89_ratio2_10mil_moreite.csv',index=False)