In [25]:
import numpy  as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn  as sns
import lightgbm as lgb
from   sklearn.linear_model    import BayesianRidge
from   sklearn.model_selection import StratifiedKFold,RepeatedKFold
from   sklearn.metrics         import mean_squared_error,precision_recall_curve,f1_score,roc_curve,auc
from   sklearn.ensemble        import RandomTreesEmbedding
from   scipy.stats             import ks_2samp
import utils
import tqdm
import warnings
warnings.filterwarnings('ignore')
#np.random.seed(0)

In [26]:
# Modeling idea
# Step 1: Two models for stacking using Stratified + Repeated folds (LGB)
# Step 2: Create the overall prediction
# Step 3: Two models for stacking using Stratified + Repeated folds (LGB), trained without outliers
# Step 4: Create the outlier model
# Step 5: For non-outliers use the step 3 model
#         For outliers use the step 1 model


# This is different compared to other approaches:
# log transformation (np.log1p)
# and its inverse, the exponential function (np.expm1)

In [27]:
# Load the prepared train / test frames (pickled) and the stored list of
# significant feature names.
df_train, df_test = map(pd.read_pickle, (
    "../data/input/train_test/train_final.pkl",
    "../data/input/train_test/test_final.pkl",
))

significant_features = utils.load_obj("significant_features")

In [28]:
# Model inputs: every column that is neither an identifier / label column
# nor missing from the significant-feature list.
train_columns = [
    c for c in df_train.columns
    if c not in ['card_id', 'first_active_month', 'target', 'outliers']
    and c in significant_features
]

X_train = df_train[train_columns]
target  = df_train['target']

In [29]:
def discard_different_features(df_train,df_test,train_columns,p_threshold=.1):
    """Return the features whose train and test distributions differ.

    For every column in ``train_columns`` whose dtype in ``df_test`` is not
    ``object``, a two-sample Kolmogorov-Smirnov test (``scipy.stats.ks_2samp``)
    compares the test values with the train values. A column is reported when
    the p-value of that test is strictly below ``p_threshold``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that both contain every column named in ``train_columns``.
    train_columns : iterable of str
        Candidate feature names.
    p_threshold : float, default 0.1
        Columns with a p-value below this value are returned.

    Returns
    -------
    list
        Names of the columns to discard, ordered by ascending p-value.
    """
    # object-typed columns cannot be fed to the KS test, so they are never reported
    tested_columns = [c for c in train_columns if df_test[c].dtypes != 'object']
    p_values       = [ks_2samp(df_test[c], df_train[c])[1] for c in tested_columns]

    # explicit float dtype keeps the comparison well-defined when nothing was tested
    Se = pd.Series(p_values, index=tested_columns, dtype=float).sort_values()

    return list(Se[Se < p_threshold].index)

In [30]:
# Drop the features flagged as distributed differently between train and test.
column_discarded = discard_different_features(df_train, df_test, train_columns)
train_columns    = list(filter(lambda c: c not in column_discarded, train_columns))

In [36]:
def model_training(df_train,df_test,target,param,train_columns,type_fold = "SKFold",n_splits=5,df_train_svd=None,df_test_svd=None,SVD=True):
    """Train one LightGBM model per cross-validation fold.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows. Must contain ``train_columns`` and an ``'outliers'``
        column, which is the label passed to ``folds.split``.
    df_test : pandas.DataFrame
        Rows to predict; must contain ``train_columns``.
    target : pandas.Series
        Label aligned positionally with ``df_train`` (accessed with ``iloc``).
    param : dict
        Parameters forwarded to ``lgb.train``.
    train_columns : list
        Feature columns used for training and prediction.
    type_fold : {"SKFold", "RepeatFold"}
        ``"SKFold"`` uses ``StratifiedKFold``; ``"RepeatFold"`` uses
        ``RepeatedKFold`` with two repeats.
    n_splits : int
        Number of splits per (repeat of the) cross-validation.
    df_train_svd, df_test_svd, SVD
        Accepted for backward compatibility only. The previous implementation
        had two byte-identical branches keyed on ``SVD``, so these arguments
        never influenced the result.

    Returns
    -------
    predictions : numpy.ndarray
        Test predictions averaged over every trained model.
    oof : numpy.ndarray
        Out-of-fold predictions for ``df_train``. With ``"RepeatFold"`` each
        row is validated once per repeat and the last repeat's value is kept.
    feature_importance_df : pandas.DataFrame
        Mean importance per feature over all trained models.
    CV_LOSS : float
        RMSE between ``oof`` and ``target``.

    Raises
    ------
    ValueError
        If ``type_fold`` is not one of the supported values.
    """
    if type_fold == "SKFold":
        folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    elif type_fold == "RepeatFold":
        folds = RepeatedKFold(n_splits=n_splits, n_repeats=2, random_state=0)
    else:
        raise ValueError("type_fold must be 'SKFold' or 'RepeatFold', got {!r}".format(type_fold))

    oof              = np.zeros(len(df_train))
    predictions      = np.zeros(len(df_test))
    fold_importances = []
    num_round        = 10000

    # Total number of models that will be trained (n_splits, times n_repeats for
    # RepeatedKFold); used so that the test predictions are a true average for
    # any n_splits instead of relying on a hard-coded divisor.
    n_models = folds.get_n_splits()

    for fold_, (train_idx, valid_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):

        print("fold {}".format(fold_))

        valid_features = df_train.iloc[valid_idx][train_columns]

        train_data     = lgb.Dataset(df_train.iloc[train_idx][train_columns], label=target.iloc[train_idx])
        valid_data     = lgb.Dataset(valid_features, label=target.iloc[valid_idx])

        clf            = lgb.train(param,train_data,num_round,valid_sets = [train_data, valid_data],verbose_eval=-1,early_stopping_rounds = 100)

        oof[valid_idx] = clf.predict(valid_features, num_iteration=clf.best_iteration)

        fold_importances.append(pd.DataFrame({
            "Feature":    train_columns,
            "importance": clf.feature_importance(),
            "fold":       fold_ + 1,
        }))

        predictions += clf.predict(df_test[train_columns], num_iteration=clf.best_iteration) / n_models

    # Concatenate once after the loop instead of growing a frame fold by fold.
    feature_importance_df = pd.concat(fold_importances, axis=0)
    feature_importance_df = feature_importance_df.groupby(['Feature'])['importance'].mean().reset_index()
    CV_LOSS = np.sqrt(mean_squared_error(oof, target))
    print("CV Loss = ",CV_LOSS)

    return predictions,oof,feature_importance_df,CV_LOSS

In [10]:
def stacking(train_stack1,train_stack2,prediction_stack1,prediction_stack2,target):
    """Blend two base models with a BayesianRidge meta-model.

    The two out-of-fold prediction vectors form the two meta-features of the
    training matrix and the two test prediction vectors form the test matrix.
    A 5-fold ``RepeatedKFold`` (one repeat) trains one ``BayesianRidge`` per
    fold; the out-of-fold RMSE against ``target`` is printed and the average
    of the five test predictions is returned.
    """
    n_folds = 5

    # one row per sample, one column per base model
    meta_train = np.vstack([train_stack1, train_stack2]).T
    meta_test  = np.vstack([prediction_stack1, prediction_stack2]).T

    splitter          = RepeatedKFold(n_splits=n_folds, n_repeats=1, random_state=0)
    oof_stack         = np.zeros(meta_train.shape[0])
    predictions_stack = np.zeros(meta_test.shape[0])

    for fold_no, (fit_rows, held_out_rows) in enumerate(splitter.split(meta_train, target)):
        print("fold n°{}".format(fold_no))
        print("-" * 10 + "Stacking " + str(fold_no) + "-" * 10)

        meta_model = BayesianRidge()
        meta_model.fit(meta_train[fit_rows], target.iloc[fit_rows].values)

        oof_stack[held_out_rows] = meta_model.predict(meta_train[held_out_rows])
        predictions_stack       += meta_model.predict(meta_test) / n_folds

    print(np.sqrt(mean_squared_error(target.values, oof_stack)))

    return predictions_stack

In [22]:
def outlier_model_training(df_train,df_test,train_columns,type_fold="SKFold",n_splits=5):
    """Train the binary outlier classifier with cross-validation.

    The label is ``df_train['outliers']`` and the work is delegated to
    ``model_training`` with a binary-logloss LightGBM parameter set.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Train / test frames containing ``train_columns``; ``df_train`` must
        also contain the ``'outliers'`` column.
    train_columns : list
        Feature columns.
    type_fold : str
        Fold strategy forwarded to ``model_training``.
    n_splits : int
        Number of folds forwarded to ``model_training``.

    Returns
    -------
    (predictions, oof)
        Test predictions and out-of-fold train predictions, as returned by
        ``model_training``.
    """
    target = df_train['outliers']

    param = {
            'objective':         'binary',
            "boosting" :         "gbdt",
            "metric"   :         'binary_logloss',
            'num_leaves':        int(round(23.139822370824557,0)),
            'max_depth':         int(round(8.58975375654246,0)),
            'min_child_weight':  2.7078843623515434,
            'subsample':         0.45759536703648757,
            'subsample_freq':    int(round(15.291177856629368,0)),
            'colsample_bytree':  0.48711283395859273,
            'reg_alpha':         0.4370945570821483,
            'reg_lambda':        1.8421506357456277,
            'n_estimators':      int(round(769.7003729230836,0)),
            'learning_rate':     0.3824725368523456,
            'min_data_in_leaf':  int(round(9.218152218008113,0)),
            'random_state':       0
    }

    # model_training returns (predictions, oof, feature_importance, CV loss);
    # index the result instead of tuple-unpacking a fixed length so that the
    # extra items do not raise "too many values to unpack".
    training_result  = model_training(df_train,df_test,target,param,train_columns,type_fold=type_fold,n_splits=n_splits)
    predictions, oof = training_result[0], training_result[1]

    return predictions,oof

In [9]:
def non_outlier_training(df_train,df_test,train_columns,type_fold="SKFold"):
    """Train the regression model on the rows that are not outliers.

    Rows with ``df_train['outliers'] == 0`` are kept, ``'target'`` of those
    rows is the label, and training is delegated to ``model_training`` with
    5 splits.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Train / test frames containing ``train_columns``; ``df_train`` must
        also contain ``'outliers'`` and ``'target'``.
    train_columns : list
        Feature columns.
    type_fold : str
        Fold strategy forwarded to ``model_training``.

    Returns
    -------
    (predictions, oof, feature_importance_df)
        ``oof`` has one entry per non-outlier training row.
    """
    df_train      = df_train[df_train['outliers'] == 0]
    target        = df_train['target']

    param = {
            'objective':          'regression',
            "boosting":           'gbdt',
            'num_leaves':         int(round(69.73996492517938,0)),
            'max_depth':          int(5.032821749635573),
            'min_child_weight':   58.72169981385353,
            'subsample':          0.7743352484347494,
            'subsample_freq':     int(round(10.531453984618025,0)),
            'colsample_bytree':   0.6079607673654336,
            'reg_alpha':          1.2516468359917554,
            'reg_lambda':         39.91422683261549,
            'n_estimators':       int(round(984.4508643679893,0)),
            'learning_rate':      0.04440594033746144,
            'min_data_in_leaf':   int(round(11.700552265707564,0)),
            "metric"           :  'rmse',
            "verbosity"        :  -1,
            'random_state'     :  0
            }

    # model_training also returns the CV loss as a fourth item; take the first
    # three by position so the call does not fail with
    # "too many values to unpack".
    training_result = model_training(df_train,df_test,target,param,train_columns,type_fold=type_fold,n_splits=5)
    predictions,oof,feature_importance_df = training_result[:3]

    return predictions,oof,feature_importance_df

In [10]:
def outlier_regression_training(df_train,df_test,train_columns,type_fold="SKFold"):
    """Train the regression model on every training row (outliers included).

    ``df_train['target']`` is the label and training is delegated to
    ``model_training`` with 5 splits.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Train / test frames containing ``train_columns``; ``df_train`` must
        also contain ``'target'``.
    train_columns : list
        Feature columns.
    type_fold : str
        Fold strategy forwarded to ``model_training``.

    Returns
    -------
    (predictions, oof, feature_importance_df)
    """
    target = df_train['target']

    param = {
              'objective':         'regression',
              "boosting":          "gbdt",
              'colsample_bytree':  0.7742624484824272,
              'learning_rate':     0.025686010236442722,
              'max_depth':         int(8.501720245641943),
              'min_child_weight':  59.6919305864503,
              'min_data_in_leaf':  int(13.190424020982828),
              'n_estimators':      int(362.55733945351284),
              'num_leaves':        int(78.15469597634271),
              'reg_alpha':         9.785115624171436,
              'reg_lambda':        2.7562911363915816,
              'subsample':         0.7835388457721937,
              'subsample_freq':    int(4.022277822542142),
              "metric"           : 'rmse',
              "verbosity"        : -1,
              'random_state'     : 0
        }

    # model_training also returns the CV loss as a fourth item; take the first
    # three by position so the call does not fail with
    # "too many values to unpack".
    training_result = model_training(df_train,df_test,target,param,train_columns,type_fold=type_fold,n_splits=5)
    predictions,oof,feature_importance_df = training_result[:3]

    return predictions,oof,feature_importance_df

In [10]:
def combine_outlier_non_outlier(best_submission,outlier_id,model_without_outliers,final_submission,path="../result/"):
    """Write a submission that mixes two models depending on the outlier flag.

    For every row of ``model_without_outliers`` the written ``target`` is

    * the ``target`` of the ``best_submission`` file when the row matches a
      row of ``outlier_id``;
    * the row's own ``target`` otherwise.

    Parameters
    ----------
    best_submission : str
        File name (without ``.csv``) of an existing submission inside ``path``.
    outlier_id : pandas.DataFrame
        Rows considered outliers. It is merged with the submission on their
        shared columns. Not modified.
    model_without_outliers : pandas.DataFrame
        Predictions of the model trained without outliers; must contain
        ``'card_id'`` and ``'target'``. Not modified.
    final_submission : str
        File name (without ``.csv``) of the submission written to ``path``.
    path : str, default "../result/"
        Directory used for both the input and the output file.

    Returns
    -------
    None
        The result is written to ``<path>/<final_submission>.csv``.
    """
    import os

    best_submission   = pd.read_csv(os.path.join(path, best_submission + ".csv"))

    # Work on new frames: the caller's outlier_id / model_without_outliers must
    # not gain or lose columns as a side effect of calling this function.
    flagged_outliers  = outlier_id.assign(outlier_flag=1)
    non_outlier_preds = model_without_outliers.rename(columns={'target': 'target_non_outlier'})

    most_likely_liers = best_submission.merge(flagged_outliers, how='right')
    merged            = non_outlier_preds.merge(most_likely_liers, how='left')

    # Rows without a match have a NaN flag, which compares unequal to 1.0 and
    # therefore falls back to the non-outlier prediction.
    combined = pd.DataFrame({
        'card_id': merged['card_id'],
        'target':  np.where(merged['outlier_flag'] == 1.0, merged['target'], merged['target_non_outlier']),
    })

    combined.to_csv(os.path.join(path, final_submission + ".csv"), index=False)

In [39]:
# Dataframe to hold cv results

#Version 1

#param = {'num_leaves': 40,
#         'min_data_in_leaf': 30, 
#         'objective':'regression',
#         'max_depth': 6,
#         'learning_rate': 0.015,
#         "min_child_samples": 20,
#         "boosting_type": "gbdt",
#         "feature_fraction": 0.6,
#         "bagging_freq": 1,
#         "bagging_fraction": 0.8 ,
#         "bagging_seed": 11,
#         "metric": 'rmse',
#         "lambda_l1": 0.1,
#         "verbosity": -1,
#         "nthread": 4,
#         "n_estimators": 400,
#         "random_state": 0}

# Version 2
# param = {
#             'objective':'regression',
#             "boosting": "gbdt",
#             'num_leaves'       : 22,
#             'max_depth'        : 8,
#             "min_child_weight" : 59.03,
#             'min_data_in_leaf' : 48, 
#             "subsample"        : 0.8392,
#             "subsample_freq"   : 19,
#             "colsample_bytree" : 0.8727,
#             "reg_alpha"        : 8.395,
#             "reg_lambda"       : 37.98,
#             "n_estimators"     : 452,
#             'learning_rate'    : 0.01,
#             "metric"           : 'rmse',            
#             "verbosity"        : -1,
#             'random_state'     : 0
#         }

# Version 3
# param = {
#          'objective':         'regression',
#          "boosting":          "gbdt",
#          'colsample_bytree':  0.7742624484824272,
#          'learning_rate':     0.025686010236442722,
#          'max_depth':         int(8.501720245641943),
#          'min_child_weight':  59.6919305864503,
#          'min_data_in_leaf':  int(13.190424020982828),
#          'n_estimators':      int(362.55733945351284),
#          'num_leaves':        int(78.15469597634271),
#          'reg_alpha':         9.785115624171436,
#          'reg_lambda':        2.7562911363915816,
#          'subsample':         0.7835388457721937,
#          'subsample_freq':    int(4.022277822542142),
#          "metric"           : 'rmse',            
#          "verbosity"        : -1,
#          'random_state'     : 1
#         }


# Version 4
# param = {
#          'objective':         'regression',
#          "boosting":          "gbdt",
#          'colsample_bytree':  0.7742624484824272,
#          'learning_rate':     0.025686010236442722,
#          'max_depth':         int(8.501720245641943),
#          'min_child_weight':  59.6919305864503,
#          'min_data_in_leaf':  int(13.190424020982828),
#          'n_estimators':      int(362.55733945351284),
#          'num_leaves':        int(78.15469597634271),
#          'reg_alpha':         9.785115624171436,
#          'reg_lambda':        2.7562911363915816,
#          'subsample':         0.7835388457721937,
#          'subsample_freq':    int(4.022277822542142),
#          "metric"           : 'rmse',            
#          "verbosity"        : -1,
#          'random_state'     : 1
#         }

#Version 5
# param = {
#          'objective':'regression',
#          "boosting": "gbdt",            
#          "metric": 'rmse',            
#          "verbosity": -1,
#          'num_leaves': int(round(78.8550248827644,0)),
#          'max_depth': int(round(4.364837032978728,0)),
#          'min_child_weight': 2.510726551117959,
#          'subsample': 0.6195251212654296,
#          'subsample_freq': int(round(1.3405825234524915,0)),
#          'colsample_bytree': 0.5400026592586313,
#          'reg_alpha': 6.834814893383772,
#          'reg_lambda': 35.35979905337201,
#          'n_estimators': int(round(985.0790468368917,0)),
#          'learning_rate': 0.19687878426596275,
#          'min_data_in_leaf': int(round(5.372504907365297,0)),
#          'random_state' : 0}


#Version 6
# param = {
#         'objective':         'regression',
#         "boosting":          'gbdt',
#         'num_leaves':        int(12.193051675829718),
#          'max_depth':        int(round(7.716985651169187,0)),
#          'min_child_weight': 29.72335065298584,
#          'subsample':        0.805638808022076,
#          'subsample_freq':   int(1.1682326885196326),
#          'colsample_bytree': 0.43199192892339977,
#          'reg_alpha':        2.4746685205728824,
#          'reg_lambda':       10.964003773164277,
#          'n_estimators':     int(round(999.9239652223674,0)),
#          'learning_rate':    0.03808021689360447,
#          'min_data_in_leaf': int(44.326079777911545),
#          "metric"           : 'rmse',            
#          "verbosity"        : -1,
#          'random_state'     : 0}

#Version 7
# param = {
#             'objective':          'regression',
#             "boosting":           'gbdt',
#             'num_leaves':         int(round(69.73996492517938,0)),
#             'max_depth':          int(5.032821749635573),
#             'min_child_weight':   58.72169981385353,
#             'subsample':          0.7743352484347494,
#             'subsample_freq':     int(round(10.531453984618025,0)),
#             'colsample_bytree':   0.6079607673654336,
#             'reg_alpha':          1.2516468359917554,
#             'reg_lambda':         39.91422683261549,
#             'n_estimators':       int(round(984.4508643679893,0)),
#             'learning_rate':      0.04440594033746144,
#             'min_data_in_leaf':   int(round(11.700552265707564,0)),
#             "metric":             'rmse',            
#             "verbosity":          -1,
#             'random_state':       0
#     }

# params optimized by optuna - 3.691
# param ={'task':        'train',
#        'boosting':     'goss',
#        'objective':    'regression',
#        'metric':       'rmse',
#        'learning_rate': 0.01,
#        'subsample':     0.9855232997390695,
#        'max_depth':     7,
#        'top_rate':      0.9064148448434349,
#        'num_leaves':    63,
#        'min_child_weight': 41.9612869171337,
#        'other_rate':    0.0721768246018207,
#        'reg_alpha':     9.677537745007898,
#        'colsample_bytree': 0.5665320670155495,
#        'min_split_gain': 9.820197773625843,
#        'reg_lambda': 8.2532317400459,
#        'min_data_in_leaf': 21,
#        'verbose': -1}

#Params optimized - 3.695
#param = {'num_leaves': 51,
#          'min_data_in_leaf': 35, 
#          'objective':'regression',
#          'max_depth': -1,
#          'learning_rate': 0.008,
#          "boosting": "gbdt",
#          "feature_fraction": 0.85,
#          "bagging_freq": 1,
#          "bagging_fraction": 0.82,
#          "bagging_seed": 42,
#          "metric": 'rmse',
#          "lambda_l1": 0.11,
#          "verbosity": -1,
#          "nthread": 4,
#          "random_state": 2019}

#Version 8##
# Active LightGBM parameter set (Version 8) for the regression on `target`.
param = dict(
    objective        = 'regression',
    boosting         = 'gbdt',
    num_leaves       = 351,
    max_depth        = 12,
    min_child_weight = 4.081862709896796,
    subsample        = 0.8099691929098813,
    subsample_freq   = 31,
    colsample_bytree = 0.09377558631763243,
    reg_alpha        = 0.0002916032212299976,
    reg_lambda       = 3.0959773794206487,
    n_estimators     = 1393,
    learning_rate    = 0.005958169006526415,
    min_data_in_leaf = 22,
    metric           = 'rmse',
    verbose          = -1,
    random_state     = 0,
)

In [40]:
# Stratified 5-fold run on the full feature list with the active `param` set.
predictions_skfold, oof_skfold, feature_importance_skfold, CV_LOSS = model_training(
    df_train=df_train,
    df_test=df_test,
    target=target,
    param=param,
    train_columns=train_columns,
    type_fold="SKFold",
    n_splits=5,
)

fold 0
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1246]	training's rmse: 2.84648	valid_1's rmse: 3.65827
fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1093]	training's rmse: 2.88617	valid_1's rmse: 3.67574
fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1061]	training's rmse: 2.90978	valid_1's rmse: 3.64267
fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1180]	training's rmse: 2.87392	valid_1's rmse: 3.64952
fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[901]	training's rmse: 2.97504	valid_1's rmse: 3.63933
CV Loss =  3.653128692204508


In [46]:
##feature_importance_skfold_summ =feature_importance_skfold.groupby(['Feature'])['importance'].sum().reset_index()
#feature_importance_skfold_summ.to_csv("feature_importance_skfold.csv")

In [68]:
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import sparse_random_matrix

# Two-component truncated SVD, fitted later on the selected training columns.
svd = TruncatedSVD(random_state=42, n_iter=7, n_components=2)
X   = df_train.loc[:, train_columns]


In [69]:
# Fit the 2-component truncated SVD on the selected training columns (X).
svd.fit(X)

TruncatedSVD(algorithm='randomized', n_components=2, n_iter=7,
       random_state=42, tol=0.0)

In [71]:
# Project the test rows onto the SVD components fitted on the training columns.
df_test_svd = svd.transform(df_test.loc[:, train_columns])

In [74]:
# Project the training rows onto the same SVD components.
df_train_svd = svd.transform(df_train.loc[:, train_columns])

In [77]:
# `folds` is built here so that this cell no longer depends on a definition
# that only appears in a later cell of the notebook (which fails on a fresh
# kernel run from top to bottom).
folds     = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
num_round = 10000  # upper bound on boosting rounds; early stopping (100 rounds) may end training sooner

for fold_, (train_idx, valid_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):

    print(train_idx)

    # Train on the SVD components instead of the raw feature columns.
    train_data     = lgb.Dataset(df_train_svd[train_idx], label=target.iloc[train_idx])#, categorical_feature=categorical_feats)
    valid_data     = lgb.Dataset(df_train_svd[valid_idx], label=target.iloc[valid_idx])#, categorical_feature=categorical_feats)
    clf            = lgb.train(param,train_data,num_round,valid_sets = [train_data, valid_data],verbose_eval=-1,early_stopping_rounds = 100)

    #oof[valid_idx] = clf.predict(df_train_svd[valid_idx], num_iteration=clf.best_iteration)
    
    

[     0      1      2 ... 201913 201914 201916]
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[526]	training's rmse: 3.83302	valid_1's rmse: 3.85504
[     1      2      3 ... 201913 201914 201915]
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[526]	training's rmse: 3.83532	valid_1's rmse: 3.84689
[     0      1      2 ... 201913 201915 201916]
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[217]	training's rmse: 3.84157	valid_1's rmse: 3.83716
[     0      2      3 ... 201914 201915 201916]
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[526]	training's rmse: 3.83685	valid_1's rmse: 3.8426
[     0      1      3 ... 201914 201915 201916]
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[620]	training's rmse: 3.8377	valid_1's rmse: 3.836

In [73]:
# Shuffled, seeded 5-fold stratified splitter.
folds = StratifiedKFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)

In [41]:
def result_append(Name,CV_LOSS,param,model_type,path='../data/output/'):
    """Append one experiment record to ``<path>/result.json``.

    The file holds a DataFrame with the columns ``CV_LOSS``, ``Name``,
    ``model_type`` and ``params``. When the file does not exist yet it is
    created with this record as its only row.

    Any error while reading an *existing* file is propagated. The earlier
    bare ``except`` treated every failure as "file missing" and replaced the
    whole history with a single row.

    Parameters
    ----------
    Name : str
        Label of the experiment.
    CV_LOSS : float
        Cross-validation loss of the experiment.
    param : dict
        Model parameters, stored as-is in the ``params`` column.
    model_type : str
        Free-form model / preprocessing tag.
    path : str, default '../data/output/'
        Directory that contains (or will contain) ``result.json``.
    """
    import os

    columns   = ['CV_LOSS','Name','model_type','params']
    file_name = os.path.join(path, "result.json")
    new_row   = pd.DataFrame(
        [{'CV_LOSS': CV_LOSS, 'Name': Name, 'model_type': model_type, 'params': param}],
        columns=columns,
    )

    if os.path.exists(file_name):
        result = pd.concat([pd.read_json(file_name), new_row], ignore_index=True)
    else:
        result = new_row

    # read and write use the same path
    result.to_json(file_name)

In [43]:
# Load the stored experiment log for inspection.
path      = '../data/output/'
file_name = ''.join((path, "result.json"))
result    = pd.read_json(file_name)

In [42]:
# Record the CV loss of the run above in the experiment log.
result_append(
    Name="Version_3",
    CV_LOSS=CV_LOSS,
    param=param,
    model_type="non-process",
)

In [56]:
def output_feature_importance(data):
    """Write ``data`` to ../data/output/feature_importance.csv via ``data.to_csv``."""
    destination = "../data/output/feature_importance.csv"
    data.to_csv(destination)

In [57]:
# Persist the per-feature importances of the stratified run.
output_feature_importance(data=feature_importance_skfold)

In [58]:
def select_important_features(data,K=10):
    """Return the features whose importance reaches the selected decile.

    ``K`` is floored to a multiple of ten (``K=10`` -> 10th percentile,
    ``K=55`` -> 50th percentile) and every feature whose importance is at
    least that percentile of all importances is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with the columns ``'Feature'`` and ``'importance'``.
        For backward compatibility with callers that passed something else
        (e.g. only the importance column), the notebook-level
        ``feature_importance_skfold`` frame is used instead in that case,
        which is what the function always did before.
    K : int, default 10
        Percentile (0-100) used as the lower cut-off.

    Returns
    -------
    pandas.Series
        The ``'Feature'`` values of the selected rows.

    Raises
    ------
    ValueError
        If ``K`` does not map to a decile between 0 and 100.
    """
    if isinstance(data, pd.DataFrame) and {'Feature', 'importance'}.issubset(data.columns):
        importance_df = data
    else:
        # legacy call style: the argument carries no feature names
        importance_df = feature_importance_skfold

    decile = int(K / 10)
    if not 0 <= decile <= 10:
        raise ValueError("K must be between 0 and 100, got {!r}".format(K))

    # Computing the single requested percentile also makes K=100 valid; the
    # earlier pre-computed decile table stopped at 90 and raised IndexError.
    selected_percentile = np.percentile(importance_df['importance'], 10 * decile)
    selected_feature    = importance_df[importance_df['importance'] >= selected_percentile]

    return selected_feature['Feature']

In [59]:
# Pass the whole importance frame: the selection needs both the 'Feature' and
# the 'importance' column, not just the importance values.
train_columns       = list(select_important_features(feature_importance_skfold,10))
#multi_y    = list(map(lambda x : 0 if x >= percentile[0] and x< percentile[1] else 1 if x >= percentile[1] and x< percentile[2] else 2 if x >= percentile[2] and x< percentile[3] else 3,y))

In [62]:
# Same stratified 5-fold run, now restricted to the selected features.
predictions_skfold, oof_skfold, feature_importance_skfold, CV_LOSS = model_training(
    df_train=df_train,
    df_test=df_test,
    target=target,
    param=param,
    train_columns=train_columns,
    type_fold="SKFold",
    n_splits=5,
)

fold 0
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1198]	training's rmse: 2.85754	valid_1's rmse: 3.65776
fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[986]	training's rmse: 2.93173	valid_1's rmse: 3.67602
fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[969]	training's rmse: 2.95054	valid_1's rmse: 3.64536
fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1231]	training's rmse: 2.85453	valid_1's rmse: 3.65361
fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[1026]	training's rmse: 2.9243	valid_1's rmse: 3.64505
CV Loss =  3.6555766051066465


In [63]:
# Record the CV loss of the feature-selected run in the experiment log.
result_append(
    Name="Version_4",
    CV_LOSS=CV_LOSS,
    param=param,
    model_type="non-process",
)

In [13]:
# In case some predictable outliers are missed, we choose the top 25000 rows
# with the highest outlier likelihood.
prediction_outlier_skfold, oof_outlier_skfold = outlier_model_training(
    df_train=df_train,
    df_test=df_test,
    train_columns=train_columns,
    type_fold="SKFold",
    n_splits=5,
)

fold 0
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[16]	training's binary_logloss: 0.0413834	valid_1's binary_logloss: 0.048096
fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[7]	training's binary_logloss: 0.0431967	valid_1's binary_logloss: 0.0489456
fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[10]	training's binary_logloss: 0.0430617	valid_1's binary_logloss: 0.0471646
fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[7]	training's binary_logloss: 0.0444169	valid_1's binary_logloss: 0.0476139
fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[7]	training's binary_logloss: 0.0445825	valid_1's binary_logloss: 0.0480626
CV Loss =  0.10231736255089778


In [14]:
def create_non_outlier_data(df_train,oof_outlier_skfold):
    """Split the training frame into predicted non-outliers and predicted outliers.

    The decision threshold is read off the ROC curve of the out-of-fold
    outlier scores: it is the threshold at which ``tpr`` is closest to
    ``1 - fpr`` (true-positive rate closest to true-negative rate).

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training frame; must contain the ``'outliers'`` label column.
    oof_outlier_skfold : array-like
        Out-of-fold outlier scores, one per row of ``df_train`` and in the
        same row order.

    Returns
    -------
    tuple
        ``(df_train_non_outlier, df_train_outlier, threshold)`` where the two
        frames are the rows of ``df_train`` predicted as 0 / 1 and
        ``threshold`` is the score cut-off that was applied.

    Notes
    -----
    Side effect (kept from the original implementation): a
    ``'predicted_outlier'`` column is added to / overwritten on ``df_train``.
    """
    # Work positionally on plain arrays so that a non-default index on
    # df_train cannot misalign scores, labels and predictions.
    outlier_scores = np.asarray(oof_outlier_skfold).ravel()
    outlier_labels = df_train['outliers'].values

    fpr, tpr, thresholds_tpr = roc_curve(outlier_labels, outlier_scores)

    # Pick the ROC point where tpr - (1 - fpr) is closest to zero.
    # (Replaces the removed DataFrame.ix lookup; on ties the first such point is used.)
    best_position = np.abs(tpr - (1 - fpr)).argmin()
    threshold = thresholds_tpr[best_position]

    # Vectorised equivalent of mapping "1 if score >= threshold else 0".
    df_train['predicted_outlier'] = (outlier_scores >= threshold).astype('int64')

    df_train_non_outlier = df_train[df_train['predicted_outlier'] == 0]
    df_train_outlier = df_train[df_train['predicted_outlier'] == 1]
    return df_train_non_outlier,df_train_outlier,threshold

In [15]:
#prediction_without_outliers_skfold,oof_without_outliers_skfold,feature_importance_without_outliers_skfold = non_outlier_training(df_train,df_test,train_columns,type_fold="SKFold")
# Fit the regression model used for non-outlier rows (type_fold="SKFold"); the
# call yields test predictions, out-of-fold predictions and feature importances.
(
    prediction_without_outliers_skfold,
    oof_without_outliers_skfold,
    feature_importance_without_outliers_skfold,
) = non_outlier_training(
    df_train,
    df_test,
    train_columns,
    type_fold="SKFold",
)

fold 0
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[637]	training's rmse: 1.48565	valid_1's rmse: 1.57717
fold 1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[551]	training's rmse: 1.49858	valid_1's rmse: 1.5589
fold 2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[776]	training's rmse: 1.48391	valid_1's rmse: 1.53959
fold 3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[619]	training's rmse: 1.4925	valid_1's rmse: 1.55626
fold 4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[590]	training's rmse: 1.49803	valid_1's rmse: 1.54398
CV Loss =  1.555236039910202


In [16]:
# Split the training rows by predicted outlier flag and keep the score
# threshold so the same cut-off can be applied to the test predictions.
(
    df_train_non_outlier,
    df_train_outlier,
    threshold,
) = create_non_outlier_data(df_train, oof_outlier_skfold)

In [16]:
#prediction_without_outliers_skfold,oof_without_outliers_skfold,feature_importance_without_outliers_skfold = non_outlier_training(df_train,df_test,train_columns,type_fold="SKFold")
#prediction_outliers_reg_skfold,oof_outliers_reg_skfold,feature_importance_outliers_reg_skfold = outlier_regression_training(df_train_outlier,df_test,train_columns,type_fold="SKFold")

In [19]:
#prediction_without_outliers_Rfold,oof_without_outliers_Rfold = non_outlier_training(df_train,df_test,train_columns,type_fold="RepeatFold")

In [17]:
# Flag the test cards whose outlier score reaches the training-derived threshold.
# Note: the "target" column here holds the outlier model's prediction.
df_test_outlier = pd.DataFrame({"card_id": df_test["card_id"].values})
df_test_outlier["target"] = prediction_outlier_skfold
df_test_outlier['predicted_outlier'] = (df_test_outlier['target'] >= threshold).astype('int64')

In [18]:
# card_id values of the test rows flagged as outliers, re-indexed from 0.
outlier_id = (
    df_test_outlier
    .loc[df_test_outlier['predicted_outlier'].eq(1), 'card_id']
    .reset_index(drop=True)
)

In [19]:
# Convert to a DataFrame first and only then name its single column:
# assigning `.columns` on a Series just sets an ad-hoc attribute and renames
# nothing. In this order the cell is also safe to re-run when outlier_id is
# already a one-column DataFrame.
outlier_id           = pd.DataFrame(outlier_id)
outlier_id.columns   = ['card_id']

In [59]:
#df_test_outlier               = pd.DataFrame({"card_id":df_test["card_id"].values})
#df_test_outlier["target"]     = prediction_outlier_skfold

#df_test_outlier['predicted_outlier'] = df_test_outlier["target"].map(lambda x: 1 if x > roc_t['threshold'].values[0] else 0 )
#df_test_outlier['target_outlier']    = prediction_outliers_reg_skfold
#df_test_outlier['target_non_outlier']    = prediction_without_outliers_skfold
#df_test_outlier['final_target'] = df_test_outlier.apply(lambda x : x['target_outlier'] if x['predicted_outlier']==1 else x['target_non_outlier'],axis=1)
#df_test_outlier.to_csv("submission_feature9.csv")

In [None]:
#outlier_id           = pd.DataFrame(df_outlier.sort_values(by='target',ascending = False).head(25000)['card_id'])

In [46]:
#target_non_outlier           = df_train.loc[df_train['outliers'] == 0,'target']
# Stack the two full-data models (SKFold and RFold variants) against the training target.
prediction_stack_complete = stacking(
    oof_skfold,
    oof_RFold,
    predictions_skfold,
    predictions_RFold,
    target,
)
#prediction_stack_non_outlier = stacking(oof_without_outliers_skfold,oof_without_outliers_Rfold,prediction_without_outliers_skfold,prediction_without_outliers_Rfold,target_non_outlier)

fold n°0
----------Stacking 0----------
fold n°1
----------Stacking 1----------
fold n°2
----------Stacking 2----------
fold n°3
----------Stacking 3----------
fold n°4
----------Stacking 4----------
3.648059573748058


In [64]:
# Write the SKFold model's test predictions as a submission file.
# NOTE(review): this saves predictions_skfold, not prediction_stack_complete — confirm that is intended.
path = "../result/"
final_name = path + "submission_feature22.csv"
output = pd.DataFrame({"card_id": df_test["card_id"].values}).assign(
    target=predictions_skfold
)
output.to_csv(final_name, index=False)

In [102]:
#path                   = "../result/"
#final_name             = path+"submission_stack_newhist_hist.csv" 
#output.to_csv(final_name, index=False)

In [20]:
# Test-set frame of card_id plus the predictions from non_outlier_training.
test_without_outlier_output = pd.DataFrame(
    {"card_id": df_test["card_id"].values}
).assign(target=prediction_without_outliers_skfold)

In [21]:
# Combine the flagged outlier ids with the non-outlier predictions.
# NOTE(review): combine_outlier_non_outlier is defined elsewhere; presumably "3.694"
# names a base submission and the last argument the output file — confirm.
combine_outlier_non_outlier(
    "3.694",
    outlier_id,
    test_without_outlier_output,
    "submission_3.694_outlier_auto",
)

In [None]:
#Feature1 - Remove insignificant features from 92

In [80]:
# Load the two previously saved submissions that will be blended.
# Each file name is built from its own directory variable; the original used
# the global `path` left over from an earlier cell, so path1/path2 were never
# read and the cell failed on a fresh kernel run in isolation.
path1                   = "../result/"
final_name1             = path1+"3.691.csv" 
path2                   = "../result/"
final_name2             = path2+"3.695.csv" 
df_base1 = pd.read_csv(final_name1)
df_base2 = pd.read_csv(final_name2)

In [81]:
# Left-join the second submission onto the first by card_id; the overlapping
# 'target' columns come out as 'target_x' (first) and 'target_y' (second).
df_base1 = df_base1.merge(df_base2, how='left', on='card_id')

In [82]:
# Weighted blend: 65% of the first submission, 35% of the second.
df_base1['target'] = 0.65 * df_base1['target_x'] + 0.35 * df_base1['target_y']

In [83]:
# Keep only the submission columns, dropping the per-model target_x / target_y.
df_base1 = df_base1.loc[:, ['card_id', 'target']]

In [84]:
# Save the blended predictions as a new submission file (no index column).
path = "../result/"
final_name = path + "submission_blend_3.691_3.695.csv"
df_base1.to_csv(path_or_buf=final_name, index=False)