# LGBMの導入から使用まで

## Install LGBM

In [None]:
# List mounted filesystems and their free space (presumably to pick a disk
# with enough room for the LightGBM install below).
! df

In [None]:
%%bash
# Install LightGBM into a side directory that a later cell adds to sys.path.
# `%%bash` must be the first line of the cell; `-p` keeps the cell re-runnable
# when the directory already exists.
mkdir -p /dev/sdb/lgbm
pip install lightgbm -t /dev/sdb/lgbm

In [None]:
# Confirm that the lightgbm package directory was created by the install step.
!ls /dev/sdb/lgbm/lightgbm

In [4]:
import sys
# Make the side-installed packages importable from this kernel.
sys.path.append('/dev/sdb/lgbm/')

## Load data

In [5]:
#import google.datalab.bigquery as bq
import pandas as pd

In [None]:
# Copy the sample submission file from Cloud Storage into the working directory.
! gsutil cp gs://path/to/sample_submission.csv ./

In [None]:
# Copy every train_test*.csv part from Cloud Storage into the working directory.
! gsutil cp gs://path/to/train_test*.csv ./

In [None]:
import os

# Collect the downloaded CSV parts. Sorting makes the row order reproducible
# (os.listdir returns entries in arbitrary order), and the extension check
# skips unrelated files whose name happens to contain "train_test".
path_list = os.listdir("./")
train_csvs = sorted(
    path for path in path_list
    if "train_test" in path and path.endswith(".csv")
)

# Read every part first and concatenate once: DataFrame.append was removed in
# pandas 2.0 and growing a frame inside a loop copies all rows on every pass.
csv_parts = [pd.read_csv(csv_part) for csv_part in train_csvs]
# ignore_index gives the combined frame a unique 0..n-1 index instead of
# repeating each part's own index.
train_test_df = pd.concat(csv_parts, ignore_index=True) if csv_parts else pd.DataFrame()

In [None]:
# Preview the first rows of the combined train/test frame.
train_test_df.head()

## Log Transforms

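Helper functions that apply a log transform to the given numeric columns in place, truncating each result to an integer.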

In [10]:
def log2_transform(df, columns):
    """Replace each listed column with int(log2(1 + value)).

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame holding the columns to transform.
        columns: iterable of column names in ``df``.

    Returns:
        ``df`` itself, with the listed columns overwritten.
    """
    for name in columns:
        # Shift by one so that a raw value of 0 maps to log2(1) == 0.
        shifted = 1 + df[name].values
        df[name] = np.log2(shifted).astype(int)
    return df

In [11]:
def log10_transform(df, columns):
    """Replace each listed column with int(log10(1 + value)).

    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame holding the columns to transform.
        columns: iterable of column names in ``df``.

    Returns:
        ``df`` itself, with the listed columns overwritten.
    """
    for name in columns:
        # Shift by one so that a raw value of 0 maps to log10(1) == 0.
        shifted = 1 + df[name].values
        df[name] = np.log10(shifted).astype(int)
    return df

## Label encoding

In [10]:
from sklearn import preprocessing
# One shared encoder instance; the encoding cell below refits it for each column.
le = preprocessing.LabelEncoder()

In [11]:
# Replace missing values in the categorical columns with a placeholder string
# before label encoding.
train_test_df[["col_1", "col_2", "col_3"]] = train_test_df[["col_1", "col_2", "col_3"]].fillna("no_description")

In [12]:
# Label-encode each categorical column over the combined train+test rows.
# `le` is refit on every column in turn, so after this cell it only holds the
# mapping of the last column.
train_test_df[["col_1", "col_2", "col_3"]] \
= train_test_df[["col_1", "col_2", "col_3"]].apply(le.fit_transform)

In [None]:
# Preview the frame again to check the encoded categorical columns.
train_test_df.head()

## train-test split

In [14]:
# Rows without a target value are the test rows; every other row is training data.
is_unlabeled = train_test_df["target_col"].isnull()
train_df = train_test_df[~is_unlabeled]
test_df = train_test_df[is_unlabeled]

## LGBM settings

In [None]:
import pandas as pd
import time
import numpy as np
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection (the module KFold is imported from below).
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import KFold

In [19]:
def lgb_modelfit(params, dtrain, dvalid, predictors, ytrain, yvalid, objective='regression', metrics='rmse',
                 feval=None, early_stopping_rounds=20, num_boost_round=3000, verbose_eval=10, categorical_features=None):
    """Train one LightGBM booster with early stopping and predict the validation rows.

    Args:
        params: dict of LightGBM parameters; its entries override the defaults
            defined in this function.
        dtrain: DataFrame holding the training rows.
        dvalid: DataFrame holding the validation rows.
        predictors: list of feature column names read from both frames.
        ytrain: training labels aligned with ``dtrain``.
        yvalid: validation labels aligned with ``dvalid``.
        objective: LightGBM objective name.
        metrics: evaluation metric name, or list of names.
        feval: optional custom evaluation function forwarded to ``lgb.train``.
        early_stopping_rounds: stop once the validation score has not improved
            for this many rounds; a falsy value disables early stopping.
        num_boost_round: maximum number of boosting rounds.
        verbose_eval: evaluation logging period (``True`` logs every round,
            a falsy value disables logging).
        categorical_features: feature names to treat as categorical, or None.

    Returns:
        Tuple ``(booster, best_iteration, valid_predictions)`` where
        ``valid_predictions`` are the predictions for ``dvalid[predictors]``
        at ``best_iteration``.
    """
    lgb_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metrics,
        'learning_rate': 0.2,
        #'is_unbalance': 'true',  # because training data is unbalanced (replaced with scale_pos_weight)
        'num_leaves': 31,  # should be smaller than 2^(max_depth)
        'max_depth': -1,  # -1 means no limit
        'min_child_samples': 20,  # minimum number of rows needed in a leaf (min_data_in_leaf)
        'max_bin': 255,  # number of buckets used for feature values
        'subsample': 0.6,  # row subsample ratio
        'subsample_freq': 0,  # subsampling frequency; <= 0 disables it
        'colsample_bytree': 0.3,  # column subsample ratio per tree
        'min_child_weight': 5,  # minimum sum of instance weight (hessian) needed in a leaf
        'subsample_for_bin': 200000,  # number of rows sampled to construct the bins
        'min_split_gain': 0,  # minimum gain required to perform a split
        'reg_alpha': 10 ** (-1),  # L1 regularization term on weights
        'reg_lambda': 10 ** (-1),  # L2 regularization term on weights
        'nthread': 4,
        'verbose': 0,
    }

    # Caller-supplied values win over the defaults above.
    lgb_params.update(params)

    print("preparing validation datasets")

    xgtrain = lgb.Dataset(dtrain[predictors].values, label=ytrain,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )
    xgvalid = lgb.Dataset(dvalid[predictors].values, label=yvalid,
                          feature_name=predictors,
                          categorical_feature=categorical_features
                          )

    evals_results = {}

    train_kwargs = {
        'valid_sets': [xgtrain, xgvalid],
        'valid_names': ['train', 'valid'],
        'num_boost_round': num_boost_round,
        'feval': feval,
    }
    if hasattr(lgb, 'log_evaluation'):
        # LightGBM >= 3.3 exposes early stopping / logging / metric recording as
        # callbacks; the equivalent keyword arguments were removed in 4.0.
        callbacks = [lgb.record_evaluation(evals_results)]
        if early_stopping_rounds:
            callbacks.append(lgb.early_stopping(early_stopping_rounds, verbose=bool(verbose_eval)))
        if verbose_eval:
            # int(True) == 1, matching the legacy meaning "log every round".
            callbacks.append(lgb.log_evaluation(period=int(verbose_eval)))
        train_kwargs['callbacks'] = callbacks
    else:
        # Older LightGBM: pass the legacy keyword arguments, now honoring the
        # caller's verbose_eval instead of a hard-coded value.
        train_kwargs['evals_result'] = evals_results
        train_kwargs['early_stopping_rounds'] = early_stopping_rounds
        train_kwargs['verbose_eval'] = verbose_eval

    bst1 = lgb.train(lgb_params, xgtrain, **train_kwargs)

    print("\nModel Report")
    print("bst1.best_iteration: ", bst1.best_iteration)

    # Out-of-fold predictions for the validation rows at the best iteration.
    oof_X_train = bst1.predict(dvalid[predictors], num_iteration=bst1.best_iteration)

    return (bst1, bst1.best_iteration, oof_X_train)

def DO(userows, train_df, test_df, sub_df, predictors, categoricals, targets, debug=0, seed=7, fold_num=4,
       outs_path="./lgbm_outs/", id_col="item_id"):
    """Run K-fold LightGBM training, save out-of-fold arrays, and write a submission CSV.

    For every fold a model is trained with ``lgb_modelfit``, its feature
    importance plot is saved, and its test predictions are accumulated. The
    averaged test predictions are clipped to [0, 1] and written as a CSV.

    Args:
        userows: number of leading training rows to keep when ``debug`` is truthy.
        train_df: labelled training rows.
        test_df: rows to predict; must contain ``id_col`` and ``predictors``.
            It is not modified.
        sub_df: sample submission frame; only its ``id_col`` column is used.
        predictors: list of feature column names.
        categoricals: feature names to treat as categorical.
        targets: name of the target column, or a one-element list holding it.
        debug: when truthy, train on the first ``userows`` rows only and skip
            writing the submission CSV.
        seed: random seed for the K-fold shuffle.
        fold_num: number of folds.
        outs_path: output directory (created if missing).
        id_col: key column shared by ``sub_df`` and ``test_df``.

    Raises:
        ValueError: if ``targets`` does not name exactly one column.

    Outputs written under ``outs_path``:
        ftr_imp_{fold}f_seed{seed}.png          feature importance per fold
        lgbm_example_{folds}f_seed{seed}.npy    out-of-fold predictions
        lgbm_example_{folds}f_seed{seed}_y.npy  labels in the same row order
        lgbm_example_{folds}f_seed{seed}.csv    submission (only when not debug)

    The out-of-fold arrays are stored in fold order (the folds are shuffled),
    not in the original row order, which is why the matching labels are saved
    next to the predictions.
    """
    # Accept either a column name or a one-element list for the target.
    if isinstance(targets, str):
        target_col = targets
    else:
        target_names = list(targets)
        if len(target_names) != 1:
            raise ValueError("targets must name exactly one column, got {!r}".format(targets))
        target_col = target_names[0]

    if not os.path.exists(outs_path):
        os.makedirs(outs_path)

    print('loading train data...')
    if debug:
        print('*** debug parameter set: this is a test run for debugging purposes ***')
        train_df = train_df[:userows]

    print('predictors', predictors)
    print('categoricals', categoricals)

    # K-fold cross validation over the (possibly truncated) training rows.
    kfold = KFold(n_splits=fold_num, shuffle=True, random_state=seed)
    # 1-D label array; indexed per fold below.
    y_train = train_df[target_col].values
    oof_pred_parts = []
    oof_label_parts = []
    sum_output = None
    run_tag = '{:d}f_seed{:d}'.format(fold_num, seed)

    # Identical for every fold, so built once outside the loop.
    params = {
        'learning_rate': 0.20,
        #'is_unbalance': 'true', # replaced with scale_pos_weight argument
        'num_leaves': 7,  # 2^max_depth - 1
        'max_depth': 3,  # -1 means no limit
        'min_child_samples': 64,  # minimum number of rows needed in a leaf (min_data_in_leaf)
        'max_bin': 63,  # number of buckets used for feature values
        'subsample': 0.7,  # row subsample ratio
        'subsample_freq': 1,  # subsampling frequency; <= 0 disables it
        'colsample_bytree': 0.9,  # column subsample ratio per tree
        'min_child_weight': 0,  # minimum sum of instance weight (hessian) needed in a leaf
        # NOTE(review): scale_pos_weight is a classification setting; it is
        # presumably a leftover here since the objective is regression - confirm.
        'scale_pos_weight': 200,
        'lambda_l1': 10 ** (-1),  # L1 regularization term on weights (alias: reg_alpha)
        'lambda_l2': 10 ** (-1)  # L2 regularization term on weights (alias: reg_lambda)
    }

    print("Training {:d}fold seed{:d}".format(fold_num, seed))
    start_time = time.time()

    for fold_idx, (train_idx, eval_idx) in enumerate(kfold.split(train_df)):
        # Train/validation frames and labels for this fold.
        val_df = train_df.iloc[eval_idx]
        train_df_kf = train_df.iloc[train_idx]
        y_train_kf = y_train[train_idx]
        y_eval_kf = y_train[eval_idx]

        print("train size: ", len(train_df_kf))
        print("valid size: ", len(val_df))
        print("test size : ", len(test_df))

        gc.collect()

        (bst, best_iteration, oof_X_train_kf) = lgb_modelfit(params,
                                                             train_df_kf,
                                                             val_df,
                                                             predictors,
                                                             y_train_kf,
                                                             y_eval_kf,
                                                             objective='regression',
                                                             metrics=['rmse'],
                                                             early_stopping_rounds=30,
                                                             verbose_eval=True,
                                                             num_boost_round=1000,
                                                             categorical_features=categoricals)
        # Keep this fold's predictions together with their labels so the two
        # arrays stay aligned row by row.
        oof_pred_parts.append(oof_X_train_kf)
        oof_label_parts.append(y_eval_kf)

        print('[{}]: model training time'.format(time.time() - start_time))
        print('Plot feature importances...')
        f, ax = plt.subplots(figsize=[7,20])
        lgb.plot_importance(bst, max_num_features=1000, ax=ax)
        plt.title(outs_path + "LGBM Feature Importance {:d}f_seed{:d}".format(fold_idx, seed))
        plt.savefig(os.path.join(outs_path, 'ftr_imp_{:d}f_seed{:d}.png'.format(fold_idx, seed)))
        # Showing too many images can make the notebook impossible to save,
        # so only the first fold's plot is displayed.
        if fold_idx == 0:
            plt.show()
        # Release the figure so repeated folds do not accumulate open figures.
        plt.close(f)

        print("Predicting...")
        fold_pred = bst.predict(test_df[predictors], num_iteration=best_iteration)
        sum_output = fold_pred if sum_output is None else sum_output + fold_pred

    print('Save out-of-fold X, y train array')
    oof_X_train = np.concatenate(oof_pred_parts, axis=0)
    oof_y_train = np.concatenate(oof_label_parts, axis=0)
    # Separate files: writing both arrays to one name would let the labels
    # overwrite the predictions.
    np.save(os.path.join(outs_path, 'lgbm_example_{}.npy'.format(run_tag)), oof_X_train)
    np.save(os.path.join(outs_path, 'lgbm_example_{}_y.npy'.format(run_tag)), oof_y_train)

    del train_df, train_df_kf, val_df
    gc.collect()

    if not debug:
        print("writing...")
        # Average the fold predictions on a private frame so the caller's
        # test_df is left untouched.
        pred_df = test_df[[id_col]].copy()
        pred_df[target_col] = sum_output / fold_num
        sub_df = pd.merge(sub_df[[id_col]], pred_df, how='left', on=id_col)
        # Clamp predictions that fall outside [0, 1].
        sub_df[target_col] = sub_df[target_col].clip(lower=0., upper=1.)
        sub_df[[id_col, target_col]].to_csv(
            os.path.join(outs_path, 'lgbm_example_{}.csv'.format(run_tag)), index=False)
    print("done...")

In [None]:
# 学習と推論に使用する変数の指定
predictors = ["col_1", "col_2", "col_3"]
# カテゴリ変数の指定
categoricals = ["col_1", "col_2", "col_3"]
# 目的変数の指定
targets = ["target_col"]

sub_df = pd.read_csv("./sample_submission.csv")

userows = 1000000
# userows分のサンプル数で試したいときは debug=1に指定
# debug=1の場合はoutputが出力されない．
debug = 0 
seed = 7
# K-foldの分割数を指定する．
fold_num = 4 

DO(userows, train_df, test_df, sub_df, predictors, categoricals, targets, debug=debug, seed=7, fold_num=4, outs_path="./lgbm_outs/")

## Check submission data

In [None]:
# Load the submission written by DO for fold_num=4 and seed=7, and display it.
lgbm_sub = pd.read_csv("./lgbm_outs/lgbm_example_4f_seed7.csv")
lgbm_sub 

## Copy the directory which contains output files to GCS

In [None]:
# Recursively copy the output directory (plots, .npy arrays, submission CSV) to Cloud Storage.
! gsutil cp -r ./lgbm_outs gs://path/to/outputs/