# Combining your model with a model without outlier

In [1]:
import numpy as np
import pandas as pd
import time
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
import gc

from sklearn.calibration import CalibratedClassifierCV

import plotly.plotly as py
import plotly.graph_objs as go

from sklearn.metrics import f1_score,precision_recall_curve,roc_curve, recall_score,precision_score
from sklearn.metrics import confusion_matrix, roc_auc_score

%matplotlib inline
import random
import matplotlib.pyplot as plt
import seaborn as sns

pd.options.display.max_rows = 999
pd.options.display.max_columns  = 999

* # Part 1 Training Model Without Outliers

In [2]:
# import h5py
# Path = '../input/elo-rfm-and-business-feats/'
# f = h5py.File(Path+'train_preproc.hdf')
# # list(f)

In [3]:
print(np.__version__)

1.15.4


In [4]:
# print(list(f['data']))
# # print(list(f['data']['axis0']))
# print()
# print()
# print(list(f['data']['block0_values'][0:10]))

# # print(np.__version__)

In [5]:
%%time
# Path = '../input/elo-preproc-3/'
Path = '../input/elo-rfm-and-business-feats/'

df_train = pd.read_csv(Path+'train_preproc.csv',index_col=0)
df_test = pd.read_csv(Path +'test_preproc.csv',index_col=0)

CPU times: user 12.9 s, sys: 796 ms, total: 13.7 s
Wall time: 13.7 s


## filtering out outliers

In [6]:
mask_without_outlier = df_train['outliers'] == 0

In [7]:
df_train_outliers = df_train[~mask_without_outlier]
outlier_card_ids = df_train_outliers['card_id'].values
df_train_full = df_train.copy()
df_train = df_train[mask_without_outlier]
target = df_train['target']
del df_train['target']
# features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','outliers']]
# categorical_feats = [c for c in features if 'feature_' in c]

## parameters

In [8]:
# param = {'num_leaves': 31,
#          'min_data_in_leaf': 30, 
#          'objective':'regression',
#          'max_depth': -1,
#          'learning_rate': 0.01,
#          "min_child_samples": 20,
#          "boosting": "gbdt",
#          "feature_fraction": 0.9,
#          "bagging_freq": 1,
#          "bagging_fraction": 0.9 ,
#          "bagging_seed": 11,
#          "metric": 'rmse',
#          "lambda_l1": 0.1,
#          "verbosity": -1,
#          "nthread": 4,
#          "random_state": 4590}

## training model

In [9]:
enc_splits = 5

In [10]:
def getenc(mask_without_outlier=None):
    tr_encs = []
    val_encs = []
    test_encs = []
    Path='../input/elo-output/'
#     Path='../input/elo-target-encoding-100-splits/'

    for i in range(0,enc_splits):
        cur_tr_enc = pd.read_hdf(Path+'train_targetenc_feats'+str(i)+'.hdf')
        cur_val_enc = pd.read_hdf(Path+'val_targetenc_feats'+str(i)+'.hdf')

        if mask_without_outlier is not None:
            cur_tr_enc = cur_tr_enc[mask_without_outlier]
            cur_val_enc = cur_val_enc[mask_without_outlier]

        tr_encs += [cur_tr_enc]
        val_encs +=[ cur_val_enc]

        test_encs += [pd.read_hdf(Path+'test_targetenc_feats'+str(i)+'.hdf')]
        print('read complete for:',i)
        
    return tr_encs,val_encs,test_encs

In [11]:
def get_logloss_score(labels,preds):
     return log_loss(labels, preds)
def get_rmse_score(labels,preds):
     return np.sqrt(mean_squared_error(labels, preds))
def get_f1loss_score(labels,preds):
    optcutoff,f1score = get_opt_cutoff_prec(labels,preds)
    f1loss = 1 - f1score
    return f1loss
    

In [12]:
def permutation_feature_selection(model, X_val, y_val, score_function,subset_feats=None,pred_proba=False, rep=3, max_delta_score=0.0001):

    # to do: predict_proba from Booster

    """""
    Perform permutation feature importance calculation for trained LightGBM model. 
    Scorer - ROC AUC. 
    The lower score with permuted feature - the more important feature is. 
    
    Parameters
    ----------
    model : lightGBM.Booster
        Trained model to perform feature importance calculation 
        
    X_val : pandas.DataFrame
        Validation dataset
            
    y_val : pandas.Series 
        Targets for validation dataset
       
    rep : integer (default = 3)
        Number of permutations. More permutations lead to more robust results, but requires more calculation time.
        It is recommended to use values in the range [3; 10].
        
    min_delta_score : float (default = 0.0)
        Minimum delta ins score to keep feature 
        It is recommended to use values in the range [-0.0001; 0].
        
    Output
    ----------
    selected_features : list
        Important features 
        
    importance_df : pandas.DataFrame 
        Records of calculations 
    """""

    print('Permutation feature importance is calculating...')

    columns = []
    scores = []
    stds = []
    score_max = []
    score_min = []

    # calculate the score of model with no permuted features, this is our baseline
    if pred_proba:
        y_hat_nopert = model.predict_proba(X_val)[:,1]
    else:
        y_hat_nopert = model.predict(X_val,model.best_iteration_)
    score_init = score_function(y_val, y_hat_nopert)
#     score_init = np.sqrt(mean_squared_error(y_val, y_hat_nopert))
#     print('score init:',score_init)
    col_iter = 1
    
    if subset_feats!=None:
        cols = subset_feats
    else:
        cols = X_val.columns
    
    for cc in tqdm(cols):
#         if col_iter > 3:
#             break
        scores_cc = []
        for seed in range(rep):
            # shuffle single column of dataset
            data_temp = X_val.copy()
            data_temp[cc] = np.nan
#             data_temp[cc] = data_temp[cc].sample(n=X_val.shape[0], random_state=seed).reset_index(drop=True)

            # make prediction on 'shuffled' dataset and score it
            if pred_proba:
                # calibration classifier fit
#                 model.fit(data_temp, y_val)
                y_hat = model.predict_proba(data_temp)[:,1]
            else:
                y_hat = model.predict(data_temp,model.best_iteration_)
            score = score_function(y_val, y_hat)
#             print('col:',cc)
#             print('score new:',score)
#             score = np.sqrt(mean_squared_error(y_val, y_hat))
            scores_cc.append(score)

        columns.append(cc)
        scores.append(np.mean(scores_cc) - score_init)
        stds.append(np.std(scores_cc))
        score_max.append(np.max(scores_cc) - score_init)
        score_min.append(np.min(scores_cc) - score_init)
        
        col_iter+=1

    importance_df = pd.DataFrame({'delta_score_mean': scores,
                                  'delta_score_std': stds,
                                  'delta_score_max': score_max,
                                  'delta_score_min': score_min,
                                  'feature': columns
                                  })
    importance_df = importance_df.sort_values(by='delta_score_mean')
    selected_features = list(importance_df.loc[importance_df['delta_score_mean'] >= max_delta_score, 'feature'])

    print('Permutation feature importance calculation is done. Overall number of features: ', importance_df.shape[0],
          'Number of selected features:', len(selected_features))

    return selected_features, importance_df

In [13]:
def getenccolname(colname,cols_agg):
    if 'var' in cols_agg:
        colname ="targetvarenc_"+colname
    elif 'std' in cols_agg:
        colname ="targetstdenc_"+colname 
    elif 'sum' in cols_agg:   
        colname ="targetsumenc_"+colname
    elif 'min' in cols_agg:   
        colname ="targetminenc_"+colname
    elif 'max' in cols_agg:   
        colname ="targetmaxenc_"+colname
    elif 'median' in cols_agg:   
        colname ="targetmedianenc_"+colname
    elif 'count' in cols_agg:   
        colname ="targetcountenc_"+colname
    elif 'iqmean' in cols_agg:   
        colname ="targetiqmeanenc_"+colname
    else:
        colname ="targetenc_"+colname
    return colname
def computeexpcomponent(countSeries,min_samples_leaf,smoothing):
    return -((countSeries - min_samples_leaf) / smoothing)        
def getexpcomponent(countSeries,min_samples_leaf,smoothing):
    expcomponent = computeexpcomponent(countSeries,min_samples_leaf,smoothing)
    return expcomponent
def performsmoothing(averages,targetcolname,train,agg,countSeries,smoothing,min_samples_leaf,noise_level,global_agg_val=None):
        expcomponent = getexpcomponent(countSeries,min_samples_leaf,smoothing)
        smoothing_v = 1 / (1 + np.exp(expcomponent) )
        
        if global_agg_val is None:
            global_agg_val = np.nanmean(train[targetcolname].values)
            
        newcol ='newcol'
        averages[newcol] = global_agg_val * (1 - smoothing_v) + averages[agg] * smoothing_v

        np.random.seed(42)
        noise = np.random.randn(len(averages[newcol])) * noise_level
        print('noise mean:{0} std:{1} min:{2} max:{3}'.format(np.mean(noise),np.std(noise),np.min(noise),np.max(noise)))
        averages[newcol] = averages[newcol] + noise
        
        del smoothing_v,noise;gc.collect()
        return averages[newcol]
             
def targetenc(train,test,val,catcolnames,targetcolname,
             smoothing,min_samples_leaf,noise_level):
    start = time.time()
    
    for i,curcol in enumerate(catcolnames):
        enccol = getenccolname(curcol,'mean')
#         print('enccol:',enccol)
        averages = train[[curcol,targetcolname]].groupby(curcol).agg({targetcolname: "mean",
                                                                     curcol: 'size'})
        averages.columns =['mean','count']
        print('curcol:',curcol)
        print('averages count describe:',averages['count'].describe())

        #Multiplication factor for each enc col since target enc is having very low value and noise will change the actual value itself
        averages['mean'] = 1000 * averages['mean']
        #Perform smoothing
        q25 = averages['count'].quantile(0.05)
        min_samples_leaf = np.max(np.array([25,q25]))
#         if min_samples_leaf > 5000:
#             min_samples_leaf=1000
#         elif min_samples_leaf > 1000:
#             min_samples_leaf = 1000
        min_samples_leaf = np.min(np.array([1000,min_samples_leaf]))
        print('min_samples_leaf:',min_samples_leaf)
        #compute cumulative mean:
        averages.sort_values('count',inplace=True)
        agg='cum_mean'
        averages[agg] = averages['mean'].expanding().mean()
        
        averages[enccol] = performsmoothing(averages,targetcolname,train,agg,averages['count'],
                                          smoothing[i],min_samples_leaf,noise_level[i])
        averages.drop(['mean','count',agg],axis=1,inplace=True)

        print('averages enccol describe:',averages[enccol].describe())
#         print('train curcol head:',train[curcol].head())
        # Use only the null merchant id enccol records to update the curcol encoding
        train[enccol] = train[curcol].map(averages[enccol])
        val[enccol] = val[curcol].map(averages[enccol])
        test[enccol] = test[curcol].map(averages[enccol])
        
        #Fill NA
        globalmean  = train[enccol].mean()
        train[enccol].fillna(globalmean,inplace=True)
        val[enccol].fillna(globalmean,inplace=True)
        test[enccol].fillna(globalmean,inplace=True)
    
    end = time.time()
    print('Target Enc Generation exec time:',end- start)
    
    return train,test,val

In [14]:
def droptargetenccols(train, val,test):
     #remove target encoding fields if present
    enccols_train = [col for col in train.columns if ('targetenc' in col) & ('merchant_id' not in col)]
    train.drop(enccols_train,axis=1,inplace=True)
    val.drop(enccols_train,axis=1,inplace=True)
    enccols_test = [col for col in test.columns if ('targetenc' in col) & ('merchant_id' not in col)]
    test.drop(enccols_test,axis=1,inplace=True)
    
    return train, val,test
def targetencprocess(tr,val,test,catcolnames,fold_):
    #drop any existing target enc cols
    tr,val,test = droptargetenccols(tr,val,test)
    targetcolname='outliers'
    
    smoothing=[]
    min_samples_leaf=[]
    noise_level=[]
    for col in catcolnames:
        smoothing +=  [50] 
        min_samples_leaf +=  [100] 
        noise_level +=  [0.1] 
    #target encoding on transaction merchant id
    tr,val,test = targetenc(tr, val,test,catcolnames,targetcolname,
                           smoothing,min_samples_leaf,noise_level)

    enc_cols_for_sum = [col for col in tr.columns if ('targetenc' in col) and ('merchant_id' not in col)]
    #weighted sum
#     weights = [ 0.042229,0.058662,0.055087,0.030753,0.015725,0.047265,0.047882,0.063172,0.005299,0.052449,0.041647,0.05688,
#                 0.045107,0.040973,0.054825,0.026013,0.039557,0.018088,0.012036,0.0354,0.061585,0.035343,0.039157]
    
#     enc_cols_for_sum.sort()
    
#     for df in [tr,val,test]:
#         df['sum_targetenc'] = 0
#         for i,col in enumerate(enc_cols_for_sum):
#             print('col:',col)
#             print('dtype:',df[col].dtype)
# #             print('df[col].describe:',df[col].describe)
#             df['sum_targetenc'] += df[col] * weights[i]
#         df['sum_targetenc'] = 100 * df['sum_targetenc'] / np.sum(np.array(weights))
        
    for df in [tr,val,test]:
        df['sum_targetenc'] = df[enc_cols_for_sum].sum(axis=1)
#         df['mean_targetenc'] = df[enc_cols_for_sum].mean(axis=1)
#         df['std_targetenc'] = df[enc_cols_for_sum].std(axis=1)
#         df['min_targetenc'] = df[enc_cols_for_sum].min(axis=1)
#         df['max_targetenc'] = df[enc_cols_for_sum].max(axis=1)
    enc_cols = [col for col in tr.columns if ('targetenc' in col) and ('merchant_id' not in col)]

#     print('enc cols:',enc_cols)
    print('save encoding feats...')
    #save target encoding features in separate file
    
#     print(list(tr[['card_id']+enc_cols].columns))
#     print(len(list(tr[['card_id']+enc_cols].columns)))

    
    tr[['card_id']+enc_cols].to_hdf('train_targetenc_feats'+str(fold_)+'.hdf',key='data')
    val[['card_id']+enc_cols].to_hdf('val_targetenc_feats'+str(fold_)+'.hdf',key='data')
    test[['card_id']+enc_cols].to_hdf('test_targetenc_feats'+str(fold_)+'.hdf',key='data')

    return tr,val,test

In [15]:
def computef1scoreandconfmatrix(y_true,y_preds):    
    opt_cutoff, f1score = get_opt_cutoff_prec(y_true,y_preds)
    print('opt_cutoff:',opt_cutoff)
    print('f1 score:',f1score)
    pred_labels = convert_probtolabels(y_preds,cutoff=opt_cutoff)
    print('conf matrix:',confusion_matrix(y_true,pred_labels))
    return opt_cutoff,f1score,pred_labels

In [16]:
def lgb_fit(regression,param,tr,y_tr,val,y_val,cur_feval):
    if regression:
        model = lgb.LGBMRegressor(**param)
    else:
        model = lgb.LGBMClassifier(**param)
        
    model.fit(tr,y_tr,eval_set=[(val, y_val)],
                    early_stopping_rounds=200,
                    verbose=100,
                    eval_metric=cur_feval,
             )
    return model
def lgb_predict(model,n_estimators,test):
    preds= model.predict(test,num_iteration = getbestiteration(model,n_estimators))
    return preds
# def xgb_fit(regression,param,tr,y_tr,val,y_val,cur_feval):
#     num_round= param['n_estimators']
#     trn_data = xgb.DMatrix(data=tr, label=y_tr)
#     val_data = xgb.DMatrix(data=val, label=y_val)
#     watchlist = [(trn_data, 'train'), (val_data, 'valid')]

#     model = xgb.train(param, trn_data, num_round, watchlist, 
#                       early_stopping_rounds=200, verbose_eval=100)

#     return model

def xgb_fit(regression,param,tr,y_tr,val,y_val,cur_feval):
    if regression:
        model = xgb.XGBRegressor(**param)
 
    else:
        model = xgb.XGBClassifier(**param)

    model.fit(tr,y_tr,eval_set=[(val, y_val)],
                    early_stopping_rounds=200,
                    verbose=50,
                    eval_metric=cur_feval,
             )
    return model
def xgb_predict(model,n_estimators,test):
    preds= model.predict(test)
#     preds = model.predict(xgb.DMatrix(test), ntree_limit=model.best_ntree_limit)
    return preds
def getbestiteration(model,n_estimators):
    if hasattr(model, 'best_iteration'):
        if model.best_iteration is None:
            return n_estimators
        else:
            return model.best_iteration
    elif hasattr(model, 'best_iteration_'):
        if model.best_iteration_ is None:
            return n_estimators
        else:
            return model.best_iteration_
    
    return -1

def xgb_getbestscore(model,cur_feval):
    if isinstance(model,xgb.XGBRegressor) :
#         print('model best_score object:',model.get_booster().best_score)
        score= model.get_booster().best_score
    elif isinstance(model,xgb.XGBClassifier):
        print('model best_score object:',model.get_booster().best_score)
        score= model.get_booster().best_score
       
    return score

def lgb_getbestscore(model,cur_feval):
    if  isinstance(model,lgb.LGBMRegressor) :
        score= model.best_score_['valid_0'][param['metric']]
    else: #isinstance(model,lgb.LGBMClassifier) :
        if (cur_feval is None):
            score= model.best_score_['valid_0'][param['metric']]
        else:
            score= model.best_score_['valid_0']['f1_score']
    return score
# def xgb_getbestscore(model,cur_feval):
#     return model.best_score

In [17]:
import time
def runlgb(random_state,ispermutefeats,train,test,param,features,
           y_train, fit_function,predict_function,bestscore_function,
           score_function=None,regression=True,
           train_outlier=None,likely_card_ids=None,targetenc=False,to_enc_cols=None,
          subset_feats = None, fold_feats=False, feval=None,
           fold_to_start=None,
          fold_to_stop = None):

#     global oof_rank,oof_rank_test,oof_rank_prob,oof_rank_p_labels, oof_rank_p_labels_test
#     global oof_labels, oof_labels_test
    global oof_rank,oof_rank_prob,oof_rank_p_labels
    global oof_labels
    global oof_rank_p_prob,oof_rank_p_prob_prop_high, oof_rank_p_prob_prop_low
    
    n_estimators = param['n_estimators']
    overall_sel_feats =[]
    overall_imp_df = pd.DataFrame()
    
    if subset_feats == None:
        overall_imp_df['feature']= np.array(features)
    else:
        overall_imp_df['feature']= np.array(subset_feats)
        
    overall_imp_df['overall_score_mean'] =0 
    overall_imp_df['overall_score_max'] =-9999 
    overall_imp_df['overall_score_min'] =9999 
    
    oof = np.zeros(len(train))
    oof_rank = np.zeros(len(train))
    oof_rank_prob = np.zeros(len(train))
    oof_rank_p_labels = np.zeros(len(train))    
    oof_labels = np.zeros(len(train))
    oof_rank_p_prob = np.zeros(len(train))
    oof_rank_p_prob_prop_high = np.zeros(len(train))
    oof_rank_p_prob_prop_low = np.zeros(len(train))
    
    
    predictions_train= np.zeros(len(train))
    predictions_train_w= np.zeros(len(train))
    predictions_train_mul_prob= np.ones(len(train))
    predictions_train_mul_prob_inv= np.ones(len(train))
    goodfolds =0
    train_outlier_preds = None
    if train_outlier is not None:
        train_outlier_preds= np.zeros(len(train_outlier)) 
    predictions = np.zeros(len(test))
    start = time.time()
    valid_scores =[]
    valid_auc_scores=[]
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] =0    
    fold_importance_df["gain"] =0
    val_preds_cum = None
    y_val_cum = None    

    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    if regression:
        indices =  folds.split(train.values,train['outliers'].values)   
    else:
        indices = folds.split(train.values, y_train.values)
        
#     combined_encs = pd.concat([tr_encs[0],val_encs[0]])
#     train_with_enc = pd.concat([train,combined_encs[sel_enc_cols[0]]],axis=1)
#     train_with_enc = train_with_enc[features]
#     del combined_encs;gc.collect()
    for fold_, (trn_idx, val_idx) in enumerate(indices):
        if (fold_to_stop is not None):
            if (fold_ >=fold_to_stop):
                break
                
        if (fold_to_start is not None):
            if (fold_ < fold_to_start):
                continue
        
        print()
        print("fold n°{}".format(fold_))
        
        if fold_feats:
            cur_features = features[fold_].copy()
        else:
            cur_features = features.copy()
            
        tr = train.iloc[trn_idx]
        val = train.iloc[val_idx]
        print('y_train shape:',y_train.shape)
        y_val = y_train.iloc[val_idx]
        print('val shape:',val.shape)
        print('y_val shape:',y_val.shape)
        y_tr = y_train.iloc[trn_idx]
        
        
        #Concat current encoding train, valid and test files
        if fold_ < enc_splits:
            cur_tr_encs = tr_encs[fold_]
            cur_val_encs= val_encs[fold_]
            cur_test_encs= test_encs[fold_]
        
        if train_outlier is not None:
            cur_tr_encs = cur_tr_encs[~cur_tr_encs['card_id'].isin(outlier_card_ids)]
            cur_val_encs= cur_val_encs[~cur_val_encs['card_id'].isin(outlier_card_ids)]
        elif likely_card_ids is not None:
               
            cur_tr_encs = pd.concat([tr_encs[0],val_encs[0]])
            cur_val_encs = val_encs[0]
            for i in range(1,5):
                cur_val_encs = pd.concat([cur_val_encs,val_encs[i]])
        
            tr_orig_card_ids = tr['card_id']
            val_orig_card_ids = val['card_id']
            test_orig_card_ids = test['card_id']
            
        tr=pd.concat([tr,cur_tr_encs[sel_enc_cols[0]]],axis=1)
        val=pd.concat([val,cur_val_encs[sel_enc_cols[0]]],axis=1)
        test_cur=pd.concat([test,cur_test_encs[sel_enc_cols[0]]],axis=1)
        
        #remove extra card ids
        if likely_card_ids is not None:
            tr_mask = tr['card_id'].isin(tr_orig_card_ids)
            tr = tr[tr_mask]
            val_mask = val['card_id'].isin(val_orig_card_ids)
            val = val[val_mask]
            test_mask = test_cur['card_id'].isin(test_orig_card_ids)
            test_cur = test_cur[test_mask]
            
        print('tr shape:',tr.shape)
        print('val shape:',val.shape)
        if regression==False:
            print('yval 1 shape:',y_val[y_val==1].shape)
            print('yval 0 shape:',y_val[y_val==0].shape)
            print('yval 1 ratio:',y_val[y_val==1].shape[0] / (y_val[y_val==1].shape[0]+y_val[y_val==0].shape[0]))
#         print('tr columns:',list(tr.columns))
        #Target encoding
        if targetenc:
            tr,val,test_cur= targetencprocess(tr,val,test_cur,to_enc_cols,fold_)
#             enccols = [col for col in tr.columns if ('targetenc' in col) & ('merchant_id' not in col)]
            enccols = ['sum_targetenc']
            print('enccols:',enccols)
            cur_features += enccols
            
#         print('cur_features',cur_features)
        
        trn_data = lgb.Dataset(tr[cur_features], label=y_tr)#,, categorical_feature=categorical_feats)
        val_data = lgb.Dataset(val[cur_features], label=y_val)#,, categorical_feature=categorical_feats)
        
        cur_feval = feval
#         if fold_ in [4]:
#             cur_feval=None
            
        base_model =fit_function(regression,param,tr[cur_features],y_tr,val[cur_features],y_val,cur_feval)
        if regression:
            clf = base_model
        else:
            clf = CalibratedClassifierCV(base_model,cv='prefit',method='sigmoid') #isotonic
            clf.fit(val[cur_features], y_val)
#             if ispermutefeats:
#                 clf.fit(tr[cur_features], y_tr)
#             else:
#                 clf.fit(val[cur_features], y_val)

        

        #Prediction based on current fold selected features
        if ispermutefeats:
            
            selected_features, importance_df = permutation_feature_selection(clf, val[cur_features], 
                                                                             y_val,score_function,
                                                                             subset_feats=subset_feats,
                                                                             rep=4,max_delta_score=max_delta_score,
                                                                            pred_proba = not regression)
            overall_sel_feats += [selected_features]
            print(selected_features)

#             print('overal imp shape:{0} importance_df shape:{1}'.format(overall_imp_df.shape,importance_df.shape))
            
            overall_imp_df['fold_'+str(fold_)+'score_mean'] = importance_df['delta_score_mean']
            overall_imp_df['fold_'+str(fold_)+'score_max'] = importance_df['delta_score_max']
            overall_imp_df['fold_'+str(fold_)+'score_min'] = importance_df['delta_score_min']
        else:
            
            if regression:
                val_preds =predict_function(clf,n_estimators,val[cur_features])
#                 val_preds = clf.predict(val[cur_features], num_iteration=getbestiteration(clf,n_estimators))
                oof[val_idx] = val_preds
            else:
                val_preds = clf.predict_proba(val[cur_features])[:,1]
                oof[val_idx] = val_preds
                opt_cutoff, f1score = get_opt_cutoff_prec(y_val,val_preds)
                print('opt_cutoff:',opt_cutoff)
                print('f1 score after calib:',f1score)
                auc_score = roc_auc_score(y_val,val_preds)
                print('auc score after calib:',auc_score)
                print("log loss score after calib: {:<8.5f}".format(log_loss(y_val,val_preds)))
    

                oof_labels[val_idx] = convert_probtolabels(val_preds,cutoff=opt_cutoff)
    
                rank_df = pd.DataFrame()
                rank_df['preds']= val_preds
                rank_df['rank']= rank_df['preds'].rank(pct=True)
                oof_rank[val_idx] = rank_df['rank'].values
                oof_rank_p_labels[val_idx] = rank_df['rank'].values + oof_labels[val_idx]
                oof_rank_prob[val_idx] = rank_df['rank'].values * val_preds
                oof_rank_p_prob[val_idx] = rank_df['rank'].values + val_preds
                oof_rank_p_prob_prop_high[val_idx] = rank_df['rank'].values + 10 * val_preds
                oof_rank_p_prob_prop_low[val_idx] = rank_df['rank'].values + 0.25 * val_preds
                
                if val_preds_cum is None:
                    val_preds_cum = val_preds
                    y_val_cum = y_val
                else:
                    val_preds_cum = np.concatenate((val_preds_cum, val_preds), axis=0)
                    y_val_cum = np.concatenate((y_val_cum, y_val), axis=0)
                
                print('CUMULATIVE SCORE')
                computef1scoreandconfmatrix(y_val_cum,val_preds_cum) 

    #             lgb_prediction = clf.predict_proba(val[cur_features], num_iteration=clf.best_iteration)
#             print('lgb pred head:',lgb_prediction[0:10])
#             lgb_prediction = lgb_prediction.reshape(-1, 2)
#             oof[val_idx]   =  lgb_prediction.argmax(axis = 1)

            if train_outlier is not None:
                outlier_tr_encs = tr_encs[fold_][tr_encs[fold_]['card_id'].isin(outlier_card_ids)]
                trcardids = outlier_tr_encs['card_id']
                train_outlier.loc[train_outlier['card_id'].isin(trcardids),sel_enc_cols[0]]\
                    = outlier_tr_encs[sel_enc_cols[0]]
                outlier_val_encs = val_encs[fold_][val_encs[fold_]['card_id'].isin(outlier_card_ids)]
                valcardids = outlier_val_encs['card_id']
                train_outlier.loc[train_outlier['card_id'].isin(valcardids),sel_enc_cols[0]]\
                    = outlier_val_encs[sel_enc_cols[0]]
            #             print('train_outlier[cur_features].shape',train_outlier[cur_features].shape)
                if regression:
                    train_outlier_preds += predict_function(clf,n_estimators,train_outlier[cur_features]) / folds.n_splits
                else:
                    train_outlier_preds += clf.predict_proba(train_outlier[cur_features])[:,1]
                

#             if fold_==0:
#                 fold_importance_df["feature"] = cur_features
#                 fold_importance_df["importance"] =0

#             fold_importance_df["importance"] += base_model.booster_.feature_importance(importance_type='split') / folds.n_splits
#             fold_importance_df["gain"] += base_model.booster_.feature_importance(importance_type='gain') / folds.n_splits
            if regression:
                valid_scores+=[bestscore_function(clf,cur_feval)]
#                 valid_scores+=[clf.best_score_['valid_0'][param['metric']]]
                predictions += predict_function(clf,n_estimators,test_cur[cur_features]) / folds.n_splits
            else:
            
                valid_score =bestscore_function(base_model,cur_feval)
#                 if cur_feval is not None:
#                     valid_score = base_model.best_score_['valid_0']['f1_score']
#                 else:
#                     valid_score=base_model.best_score_['valid_0'][param['metric']]
                    
                valid_scores+=[valid_score]
                valid_auc_scores+=[auc_score]
#                 predictions += clf.predict_proba(test_cur[cur_features])[:,1] / folds.n_splits
                cur_tr_encs = pd.concat([tr_encs[0],val_encs[0]])
                
#                 cur_train_preds = clf.predict_proba(train_with_enc)[:,1]
#                 predictions_train += cur_train_preds / folds.n_splits
                
#                 predictions_train_w += valid_score * cur_train_preds
#                 predictions_train_mul_prob  *= cur_train_preds             
#                 predictions_train_mul_prob_inv  *= (1-cur_train_preds)

                predictions += clf.predict_proba(test_cur[cur_features])[:,1] / folds.n_splits


    if ispermutefeats:
        fold_mean_cols = [col for col in overall_imp_df.columns if ('score_mean' in col) and ('fold_' in col) ]
        fold_max_cols = [col for col in overall_imp_df.columns if ('score_max' in col) and ('fold_' in col) ]
        fold_min_cols = [col for col in overall_imp_df.columns if ('score_min' in col) and ('fold_' in col) ]
        overall_imp_df['overall_score_mean'] = overall_imp_df[fold_mean_cols].mean(axis=1)
        overall_imp_df['overall_score_max'] = overall_imp_df[fold_max_cols].max(axis=1)
        overall_imp_df['overall_score_min'] = overall_imp_df[fold_min_cols].min(axis=1)
    else:
        print('valid scores:',valid_scores)
        if regression:
            print("CV score: {:<8.5f}".format(mean_squared_error(oof, y_train)**0.5))
        else:
            print('CUMULATIVE SCORE')
            computef1scoreandconfmatrix(y_val_cum,val_preds_cum) 
            print()
            print("CV score log loss: {:<8.5f}".format(log_loss(y_train, oof)))
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof)
            print("CV score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            auc_score = roc_auc_score(y_train, oof)
            print("CV score AUC score: {0}".format(auc_score))
                
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof_rank)
            print("Rank CV score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof_rank_prob)
            print("Rank Prob CV score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof_rank_p_prob)
            print("Rank Plus Prob CV score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof_rank_p_prob_prop_high)
            print("Rank Plus Prob Prop High CV score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof_rank_p_prob_prop_low)
            print("Rank Plus Prob Prop Low CV score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof_labels)
            print("Labels CV score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof_rank_p_labels)
            print('Labels plus rank head:',oof_rank_p_labels[0:10])
            print("Labels Plus Rank f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            print("Labels Plus Rank AUC score: {0} ".format(roc_auc_score(y_train,  oof_rank_p_labels)))
    
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, (oof_rank + oof) / 2)
            print("Normal and Rank Average score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            oof_labels_p_prob = (oof_labels + oof) / 2
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof_labels_p_prob)
            print("Normal and Labels Average score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            print("Normal and Labels Average score AUC score: {0}".format( roc_auc_score(y_train,  oof_labels_p_prob)))
            oof_rank_p_labels_p_prob = (oof_labels + oof + oof_rank) / 2
            opt_cutoff, f1score = get_opt_cutoff_prec(y_train, oof_rank_p_labels_p_prob)
            print("Normal and Labels and Rank Average score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            print("Normal and Labels and Rank Average score AUC score: {0} ".format(roc_auc_score(y_train,  oof_rank_p_labels_p_prob)))
            
#             opt_cutoff, f1score = get_opt_cutoff_prec(y_train, predictions_train )
#             print("Train CV Avg score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
            
#             opt_cutoff, f1score = get_opt_cutoff_prec(y_train, predictions_train_w)
#             print("Train CV Weighted Avg score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
#             predictions_train_bayesian = predictions_train_mul_prob / (predictions_train_mul_prob + predictions_train_mul_prob_inv)
#             opt_cutoff, f1score = get_opt_cutoff_prec(y_train, predictions_train_bayesian)
#             print("Train CV Bayesian score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
   
    return fold_importance_df,predictions,oof,\
        train_outlier_preds,overall_imp_df,overall_sel_feats

In [18]:
sel_feats = ['feature_1',
 'feature_2',
 'feature_3',
 'hist_authorized_flag_mean',
 'hist_authorized_flag_sum',
 'hist_card_id_size',
 'hist_category_1_mean',
 'hist_category_3_mean_mean',
 'hist_category_4_mean',
 'hist_dayofweek_nunique',
 'hist_hour_nunique',
 'hist_installments_mean',
 'hist_installments_min',
 'hist_installments_sum',
 'hist_installments_var',
 'hist_merchant_category_id_nunique',
 'hist_merchant_group_id_nunique',
 'hist_merchant_id_nunique',
 'hist_month_diff_mean',
 'hist_month_lag_max',
 'hist_month_lag_mean',
 'hist_month_lag_min',
 'hist_month_lag_var',
 'hist_month_nunique',
 'hist_most_recent_purchases_range_max',
 'hist_most_recent_purchases_range_mean',
 'hist_most_recent_sales_range_max',
 'hist_most_recent_sales_range_mean',
 'hist_most_recent_sales_range_std',
 'hist_purchase_amount_max',
 'hist_purchase_amount_mean',
 'hist_purchase_amount_min',
 'hist_purchase_amount_sum',
 'hist_purchase_amount_var',
 'hist_purchase_date_average',
 'hist_purchase_date_count_mean',
 'hist_purchase_date_count_std',
 'hist_purchase_date_diff',
 'hist_purchase_date_max',
 'hist_purchase_date_min',
 'hist_purchase_date_uptonow',
 'hist_purchase_duration_max_max',
 'hist_purchase_duration_max_mean',
 'hist_purchase_duration_max_std',
 'hist_repeat_purchase_amount_sum_max',
 'hist_repeat_purchase_amount_sum_mean',
 'hist_repeat_purchase_amount_sum_min',
 'hist_sum_purchases_lag_max',
 'hist_sum_purchases_lag_mean',
 'hist_sum_purchases_lag_min',
 'hist_sum_purchases_lag_std',
 'hist_sum_purchases_lag_sum',
 'hist_sum_sales_lag_max',
 'hist_sum_sales_lag_mean',
 'hist_sum_sales_lag_min',
 'hist_sum_sales_lag_std',
 'hist_sum_sales_lag_sum',
 'hist_sum_sales_p_purchases_lag_max',
 'hist_sum_sales_p_purchases_lag_mean',
 'hist_sum_sales_p_purchases_lag_min',
 'hist_sum_sales_p_purchases_lag_sum',
 'hist_weekend_mean',
 'hist_weekofyear_nunique',
 'new_hist_card_id_size',
 'new_hist_category_1_mean',
 'new_hist_category_3_mean_mean',
 'new_hist_first_buy',
 'new_hist_installments_max',
 'new_hist_installments_mean',
 'new_hist_installments_min',
 'new_hist_installments_sum',
 'new_hist_installments_var',
 'new_hist_merchant_category_id_nunique',
 'new_hist_merchant_id_nunique',
 'new_hist_month_diff_mean',
 'new_hist_month_lag_mean',
 'new_hist_month_lag_var',
 'new_hist_purchase_amount_max',
 'new_hist_purchase_amount_mean',
 'new_hist_purchase_amount_min',
 'new_hist_purchase_amount_sum',
 'new_hist_purchase_amount_var',
 'new_hist_purchase_date_average',
 'new_hist_purchase_date_diff',
 'new_hist_purchase_date_max',
 'new_hist_purchase_date_min',
 'new_hist_purchase_date_uptonow',
 'new_hist_weekofyear_nunique',
 'new_hist_year_nunique',
 'new_most_recent_purchases_range_std',
#  'new_sum_purchases_lag_std',
#  'new_sum_purchases_lag_sum',
#  'new_sum_sales_lag_sum',
#  'new_sum_sales_p_purchases_lag_sum',
 'purchase_amount_total',
 'weekofyear']

In [19]:
# tr_encs, val_encs,test_encs = getenc(mask_without_outlier=mask_without_outlier)
tr_encs, val_encs,test_encs = getenc()
print(tr_encs[0].shape)
print(val_encs[0].shape)
print(test_encs[0].shape)

read complete for: 0
read complete for: 1
read complete for: 2
read complete for: 3
read complete for: 4
(161533, 2)
(40384, 2)
(123623, 2)


In [20]:
sel_enc_cols =['trans_merged_targetenc_merchant_id_mean']

In [21]:
hist_monthlagmeancols=[]
hist_monthlagsumcols=[]
for i in range(-13,1):
    hist_monthlagmeancols+=['hist_month_lag_'+str(i) + '_mean']
    hist_monthlagsumcols+=['hist_month_lag_'+str(i) + '_sum']
new_hist_monthlagmeancols=[]
new_hist_monthlagsumcols=[]
for i in range(1,3):
    new_hist_monthlagmeancols+=['new_hist_month_lag_'+str(i) + '_mean']
    new_hist_monthlagsumcols+=['new_hist_month_lag_'+str(i) + '_sum']

new_feats_reg =[
    'hist_category_3_A_sum',
    'hist_category_3_A_mean',
    'new_hist_category_3_A_sum',
    'new_hist_category_3_A_mean',
    'hist_city_id_nunique', 'new_hist_city_id_nunique',
    'hist_category_2_raw_mean',  'new_hist_category_2_raw_mean',
    'hist_category_3_raw_mean',  'new_hist_category_3_raw_mean',
    'hist_purchase_amount_per_install_sum', 'hist_purchase_amount_per_install_mean',
    'hist_purchase_amount_per_install_var',
    'new_hist_purchase_amount_per_install_sum', 'new_hist_purchase_amount_per_install_mean',
    'new_hist_purchase_amount_per_install_var',
    'hist_purchase_amount_scaled_sum', 'hist_purchase_amount_scaled_mean',
    'hist_purchase_amount_scaled_var',
    'new_hist_purchase_amount_scaled_sum', 'new_hist_purchase_amount_scaled_mean',
    'new_hist_purchase_amount_scaled_var',    
    ]

new_feats_reg += hist_monthlagsumcols + hist_monthlagmeancols + new_hist_monthlagsumcols + new_hist_monthlagmeancols


In [22]:
num_round = 10000
n_splits =5
max_delta_score =0.0001

In [23]:
# %%time
# #Run permute features
# features = list(df_train.columns) + sel_enc_cols
# exclude_cols = ['card_id','first_active_month','target','outliers']
# features = [col for col in features if col not in exclude_cols]

# # new_feats_reg=new_feats_reg[0:5]
# features = sel_feats + sel_enc_cols + new_feats_reg

# # print(features)
# # features = sel_feats[0:5]
# fold_importance_df,predictions,oof,train_outlier_preds,overall_imp_df,overall_sel_feats\
#     = runlgb(True,df_train,df_test,param,features,get_rmse_score,train_outlier=df_train_outliers,
#             subset_feats=new_feats_reg)

In [24]:
# selected_features = list(overall_imp_df.loc[overall_imp_df['overall_score_mean'] >= max_delta_score, 'feature'])
# selected_features.sort()
# df = pd.DataFrame( np.array(selected_features))
# df.to_csv("overall_selected_features.csv")
# selected_features

In [25]:
# overall_imp_df.sort_values(by='overall_score_mean',ascending=False,inplace=True)
# overall_imp_df.to_csv('overall_feats_allfolds.csv')
# overall_imp_df

In [26]:
# overall_sel_feats[0].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[0]))
# df.to_csv("sel_feats_0.csv")
# overall_sel_feats[0]

In [27]:
# overall_sel_feats[1].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[1]))
# df.to_csv("sel_feats_1.csv")
# overall_sel_feats[1]

In [28]:
# overall_sel_feats[2].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[2]))
# df.to_csv("sel_feats_2.csv")
# overall_sel_feats[2]

In [29]:
# overall_sel_feats[3].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[3]))
# df.to_csv("sel_feats_3.csv")
# overall_sel_feats[3]

In [30]:
# overall_sel_feats[4].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[4]))
# df.to_csv("sel_feats_4.csv")
# overall_sel_feats[4]

In [31]:
overall_sel_feats =[1,2,3,4,5]

In [32]:
overall_sel_feats[0] = ['hist_category_3_A_sum', 'hist_city_id_nunique',
 'hist_month_lag_-1_mean', 'hist_month_lag_-2_mean', 'hist_month_lag_-2_sum',
 'hist_month_lag_0_mean', 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
 'hist_purchase_amount_per_install_var', 
'hist_purchase_amount_scaled_mean', 'hist_purchase_amount_scaled_sum',
 'new_hist_category_3_A_sum', 'new_hist_month_lag_1_mean', 'new_hist_month_lag_1_sum',
 'new_hist_purchase_amount_per_install_mean', 'new_hist_purchase_amount_per_install_sum',
 'new_hist_purchase_amount_per_install_var', 
'new_hist_purchase_amount_scaled_mean',
 'new_hist_purchase_amount_scaled_sum', 
                        'new_hist_purchase_amount_scaled_var'
                       ]

In [33]:
overall_sel_feats[1] = ['hist_city_id_nunique', 'hist_month_lag_-13_mean', 'hist_month_lag_-1_mean',
 'hist_month_lag_-1_sum', 'hist_month_lag_-2_mean', 'hist_month_lag_-2_sum',
 'hist_month_lag_-3_mean', 'hist_month_lag_-4_mean', 'hist_month_lag_0_mean', 'hist_month_lag_0_sum',
 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
'hist_purchase_amount_scaled_mean', 'hist_purchase_amount_scaled_sum', 
#                         'hist_purchase_amount_scaled_var',
 'new_hist_category_3_A_sum', 'new_hist_month_lag_1_sum',
 'new_hist_purchase_amount_per_install_mean', 'new_hist_purchase_amount_per_install_sum',
 'new_hist_purchase_amount_per_install_var', 
'new_hist_purchase_amount_scaled_mean',
 'new_hist_purchase_amount_scaled_sum',
#                         'new_hist_purchase_amount_scaled_var'
                       ]

In [34]:
overall_sel_feats[2] =['hist_category_3_A_sum', 'hist_city_id_nunique',
 'hist_month_lag_-11_mean', 'hist_month_lag_-1_mean', 'hist_month_lag_-2_sum',
 'hist_month_lag_-3_mean', 'hist_month_lag_0_mean', 'hist_month_lag_0_sum',
 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_var',
 'hist_purchase_amount_scaled_mean', 'hist_purchase_amount_scaled_sum',
 'new_hist_category_2_raw_mean', 'new_hist_category_3_A_sum',
 'new_hist_month_lag_1_mean', 'new_hist_month_lag_1_sum',
 'new_hist_purchase_amount_per_install_mean', 'new_hist_purchase_amount_per_install_sum',
 'new_hist_purchase_amount_per_install_var', 
'new_hist_purchase_amount_scaled_mean',
 'new_hist_purchase_amount_scaled_sum'
                      ]

In [35]:
overall_sel_feats[3] =['hist_category_2_raw_mean', 'hist_category_3_A_sum', 'hist_category_3_raw_mean',
 'hist_city_id_nunique', 'hist_month_lag_-1_mean', 'hist_month_lag_-1_sum',
 'hist_month_lag_-2_mean', 'hist_month_lag_-2_sum', 'hist_month_lag_-3_mean',
 'hist_month_lag_0_mean', 'hist_month_lag_0_sum',
 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
 'hist_purchase_amount_per_install_var',
                       'hist_purchase_amount_scaled_mean',
 'hist_purchase_amount_scaled_sum', 'new_hist_category_2_raw_mean',
 'new_hist_category_3_A_mean', 'new_hist_category_3_A_sum', 'new_hist_month_lag_1_mean',
 'new_hist_month_lag_1_sum',
 'new_hist_purchase_amount_per_install_mean', 'new_hist_purchase_amount_per_install_sum',
 'new_hist_purchase_amount_per_install_var', 
                       'new_hist_purchase_amount_scaled_mean',
 'new_hist_purchase_amount_scaled_sum',
                       'new_hist_purchase_amount_scaled_var'
                      ]

In [36]:
overall_sel_feats[4] =['hist_category_2_raw_mean', 'hist_category_3_A_sum', 'hist_category_3_raw_mean',
 'hist_city_id_nunique', 'hist_month_lag_-1_mean', 'hist_month_lag_-1_sum', 'hist_month_lag_-2_sum',
 'hist_month_lag_-3_sum', 'hist_month_lag_0_mean', 'hist_month_lag_0_sum',
 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
 'hist_purchase_amount_per_install_var', 
'hist_purchase_amount_scaled_mean', 'hist_purchase_amount_scaled_sum', 
                       'new_hist_category_3_A_sum',
 'new_hist_month_lag_1_mean', 'new_hist_month_lag_1_sum', 'new_hist_month_lag_2_mean',
 'new_hist_purchase_amount_per_install_mean', 'new_hist_purchase_amount_per_install_sum',
 'new_hist_purchase_amount_per_install_var', 
                       'new_hist_purchase_amount_scaled_mean',
 'new_hist_purchase_amount_scaled_sum',
#                        'new_hist_purchase_amount_scaled_var'
                      ]

In [37]:
num_round=10000

In [38]:

#Run model with selected features
# features = selected_features
# features = sel_feats

for i,val in enumerate(overall_sel_feats):
    overall_sel_feats[i] += sel_feats + sel_enc_cols #sel_feats
print(overall_sel_feats[0])

['hist_category_3_A_sum', 'hist_city_id_nunique', 'hist_month_lag_-1_mean', 'hist_month_lag_-2_mean', 'hist_month_lag_-2_sum', 'hist_month_lag_0_mean', 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum', 'hist_purchase_amount_per_install_var', 'hist_purchase_amount_scaled_mean', 'hist_purchase_amount_scaled_sum', 'new_hist_category_3_A_sum', 'new_hist_month_lag_1_mean', 'new_hist_month_lag_1_sum', 'new_hist_purchase_amount_per_install_mean', 'new_hist_purchase_amount_per_install_sum', 'new_hist_purchase_amount_per_install_var', 'new_hist_purchase_amount_scaled_mean', 'new_hist_purchase_amount_scaled_sum', 'new_hist_purchase_amount_scaled_var', 'feature_1', 'feature_2', 'feature_3', 'hist_authorized_flag_mean', 'hist_authorized_flag_sum', 'hist_card_id_size', 'hist_category_1_mean', 'hist_category_3_mean_mean', 'hist_category_4_mean', 'hist_dayofweek_nunique', 'hist_hour_nunique', 'hist_installments_mean', 'hist_installments_min', 'hist_installments_sum', 'h

In [39]:
# param = {'objective':'regression',
#          'num_leaves': 31,
#          'min_data_in_leaf': 25,
#          'max_depth': 7,
#          'learning_rate': 0.01,
#          'lambda_l1':0.13,
#          "boosting": "gbdt",
#          "feature_fraction":0.85,
#          'bagging_freq':8,
#          "bagging_fraction": 0.9 ,
#          "metric": 'rmse',
#          "verbosity": -1,
#          "random_state": 2333,
#          'n_estimators': 10000,
#          'n_jobs' :-1
#         }

In [40]:
# param = {'colsample_bytree': 0.6090229042575085,
#    'min_child_samples': 215,
#    'num_leaves': 77,
#    'reg_alpha': 0.830466583246783,
#    'reg_lambda': 0.24303972756965078,
#    'subsample': 0.717525538993901,
#    'subsample_for_bin': 160000,
#    'learning_rate': 0.01,
#    'boosting': 'gbdt',
#    'bagging_seed': 2018,
#    'bagging_freq': 8,
#    'n_estimators': 3000,
#    'objective': 'regression',
#    'metric': 'rmse',
#    'random_state': 2333,
#    'max_depth': 7}

In [41]:
#Optimized after 200 iterations on bagging freq = 2
param =  {'colsample_bytree': 0.6461416260298948,
   'min_child_samples': 100,
   'num_leaves': 56,
   'reg_alpha': 0.924728048030499,
   'reg_lambda': 0.7379608279900612,
   'subsample': 0.542352168720072,
   'subsample_for_bin': 120000,
   'learning_rate': 0.01,
   'boosting': 'gbdt',
   'bagging_seed': 2018,
   'min_data_in_bin': 100,
   'bagging_freq': 2,
   'n_estimators': 10000,
   'objective': 'regression',
   'metric': 'rmse',
   'random_state': 2333,
   'max_depth': 7}

In [42]:
%%time
fold_importance_df,predictions,oof,train_outlier_preds,dummy_overall_imp_df,dummy_overall_sel_feats\
    = runlgb(4590,False,df_train,df_test,param,overall_sel_feats,target,
             lgb_fit, lgb_predict,lgb_getbestscore,train_outlier=df_train_outliers,
            fold_feats=True)


fold n°0
y_train shape: (199710,)
val shape: (39942, 205)
y_val shape: (39942,)
tr shape: (159768, 206)
val shape: (39942, 206)
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.62086
[200]	valid_0's rmse: 1.59303
[300]	valid_0's rmse: 1.58324
[400]	valid_0's rmse: 1.57861
[500]	valid_0's rmse: 1.57633
[600]	valid_0's rmse: 1.57495
[700]	valid_0's rmse: 1.57408
[800]	valid_0's rmse: 1.57352
[900]	valid_0's rmse: 1.57306
[1000]	valid_0's rmse: 1.5727
[1100]	valid_0's rmse: 1.57264
[1200]	valid_0's rmse: 1.57251
[1300]	valid_0's rmse: 1.5724
[1400]	valid_0's rmse: 1.57227
[1500]	valid_0's rmse: 1.57234
[1600]	valid_0's rmse: 1.57257
Early stopping, best iteration is:
[1400]	valid_0's rmse: 1.57227

fold n°1
y_train shape: (199710,)
val shape: (39942, 205)
y_val shape: (39942,)
tr shape: (159768, 206)
val shape: (39942, 206)
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.59451
[200]	valid_0's rmse: 1.56775
[300]

In [43]:
np.savetxt('oof_lgb.csv',oof,delimiter=',')
np.savetxt('predictions_lgb.csv',predictions,delimiter=',')
np.savetxt('train_outlier_preds_lgb.csv',train_outlier_preds,delimiter=',')

In [44]:
xgbparam = {'objective':'reg:linear',
         'max_leaf_nodes': 31,
         'min_child_weight': 25,
         'max_depth': 5,
         'learning_rate': 0.01,
         'reg_lambda':0.13,
         "booster": "gbtree",
         "colsample_bytree":0.85,
         'colsample_bylevel':0.9,
         "subsample": 0.9 ,
         "eval_metric": 'rmse',
#          "verbosity": 0,
#          'silent': True,
         "random_state": 2333,
         'n_estimators': 5000,
         'n_jobs' :4
        }

In [45]:
# %%time
# #XGBoost Predictions
# fold_importance_df,predictions_xgb,oof_xgb,train_outlier_preds_xgb,dummy_overall_imp_df,dummy_overall_sel_feats\
#     = runlgb(4590,False,df_train,df_test,xgbparam,overall_sel_feats,target,
#              xgb_fit,xgb_predict,xgb_getbestscore,train_outlier=df_train_outliers,
#             fold_feats=True)

In [46]:
Path = '../input/elo-combine-reg-and-best-subm-and-classification/'
oof_xgb=np.loadtxt(Path+'oof_xgb.csv',delimiter=',')
predictions_xgb=np.loadtxt(Path+'predictions_xgb.csv',delimiter=',')
train_outlier_preds_xgb=np.loadtxt(Path+'train_outlier_preds_xgb.csv',delimiter=',')

In [47]:
np.savetxt('oof_xgb.csv',oof_xgb,delimiter=',')
np.savetxt('predictions_xgb.csv',predictions_xgb,delimiter=',')
np.savetxt('train_outlier_preds_xgb.csv',train_outlier_preds_xgb,delimiter=',')

In [48]:
def getrmsescore(target,preds):
    score = mean_squared_error(target,preds)**0.5
    return score

In [49]:
getscore  = getrmsescore

In [50]:
train_ens_arr= np.vstack([oof, oof_xgb]).transpose()
test_ens_arr= np.vstack([predictions, predictions_xgb]).transpose()
train_outlier_ens_arr = np.vstack([train_outlier_preds, train_outlier_preds_xgb]).transpose()

In [51]:
#weighted average ensemble
#score weighted average
score_weight = [0.8,0.2]

train_mean_w = score_weight[0] * train_ens_arr[:,0] + score_weight[1] * train_ens_arr[:,1] 
test_mean_w = score_weight[0] * test_ens_arr[:,0] + score_weight[1] * test_ens_arr[:,1] 
train_outlier_mean_w = score_weight[0] * train_outlier_ens_arr[:,0] + score_weight[1] * train_outlier_ens_arr[:,1] 
score = getscore(target,train_mean_w)
print('Ensemble score weighted mean score:',score)

Ensemble score weighted mean score: 1.5521684071642086


In [52]:
from sklearn.linear_model import Ridge

In [53]:
# #STacking ensemble of xgb and lgb
# folds = KFold(n_splits=5, shuffle=True, random_state=15)
# oof_stack = np.zeros(train_ens_arr.shape[0])
# predictions_stack = np.zeros(test_ens_arr.shape[0])

# for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_ens_arr, target)):
#     print("fold n°{}".format(fold_))
#     trn_data, trn_y = train_ens_arr[trn_idx], target.iloc[trn_idx].values
#     val_data, val_y = train_ens_arr[val_idx], target.iloc[val_idx].values

#     print("-" * 10 + "Regression " + str(fold_) + "-" * 10)
    
#     clf = Ridge(alpha=0.01)#,solver='saga')
#     clf.fit(trn_data,trn_y)
    
#     oof_stack[val_idx] = clf.predict(val_data)
#     print("current score: {0} ".format(getscore(val_y, oof_stack[val_idx])))    
# #     train_outlier_preds_stack += clf.predict(test_ens_arr) / 5
#     predictions_stack += clf.predict(test_ens_arr) / 5
    
# print("Stacked score: {0} ".format(getscore(target, oof_stack)))

In [54]:
#Ensemble of xgb and lgbm
oof = train_mean_w
predictions = test_mean_w
train_outlier_preds = train_outlier_mean_w

In [55]:
# n_ensembles=100

In [56]:
# predictions_list=[]
# oof_list =[]
# for i in range(0,n_ensembles):
# #     randomno = 1234
#     randomno = 888 + i
#     param['random_state'] = randomno
#     fold_importance_df,predictions,oof,train_outlier_preds,dummy_overall_imp_df,dummy_overall_sel_feats\
#         = runlgb(4590,False,df_train,df_test,param,overall_sel_feats,target,train_outlier=df_train_outliers,
#                 fold_feats=True)

#     oof_list +=[oof]
#     predictions_list +=[predictions]
#     np.savetxt('test_reg_'+str(randomno)+'.csv',predictions,delimiter=',')
#     np.savetxt('oof_reg_'+str(randomno)+'.csv',oof,delimiter=',')

In [57]:
model_without_outliers = pd.DataFrame({"card_id":df_test["card_id"].values})
model_without_outliers["target"] = predictions

In [58]:
#OOF
model_without_outliers_oof = pd.DataFrame({"card_id":df_train["card_id"].values})
model_without_outliers_oof["target"] = oof
model_without_outliers_oof.index = df_train[mask_without_outlier].index


Boolean Series key will be reindexed to match DataFrame index.



In [59]:
train_outlier_preds_df = pd.DataFrame()
train_outlier_preds_df['card_id'] = df_train_outliers['card_id']
train_outlier_preds_df['target'] = train_outlier_preds
train_outlier_preds_df.index = df_train_outliers.index

In [60]:
model_full_oof = pd.concat([model_without_outliers_oof,train_outlier_preds_df])
print(model_full_oof.shape)

(201917, 2)


In [61]:
# Path='../input/elo-combine-permute-feats-likely-liers/'
# model_full_oof = pd.read_csv(Path+'model_full_oof.csv',index_col=0)
# model_without_outliers = pd.read_csv(Path+'model_without_outliers.csv',index_col=0)

In [62]:
model_full_oof.to_csv('model_full_oof.csv')
model_without_outliers_oof.to_csv('model_without_outliers_oof.csv')
model_without_outliers.to_csv('model_without_outliers.csv')
train_outlier_preds_df.to_csv('train_outlier_preds.csv')

# Part 2 Training Model For Outliers Classification

In [63]:
# %%time
# # Path = '../input/elo-preproc-3/'
# Path = '../input/elo-rfm-and-business-feats/'

# df_train = pd.read_csv(Path+'train_preproc.csv',index_col=0)
# df_test = pd.read_csv(Path +'test_preproc.csv',index_col=0)

In [64]:
df_train = df_train_full

## using outliers column as labels instead of target column

In [65]:
target_reg = df_train['target']
target = df_train['outliers']
# del df_train['outliers']
# del df_train['target']

In [66]:
# features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month']]
# categorical_feats = [c for c in features if 'feature_' in c]

## parameters

In [67]:
tr_encs, val_encs,test_encs = getenc()
print(tr_encs[0].shape)
print(val_encs[0].shape)
print(test_encs[0].shape)

read complete for: 0
read complete for: 1
read complete for: 2
read complete for: 3
read complete for: 4
(161533, 2)
(40384, 2)
(123623, 2)


In [68]:
def rank_preproc(combined):
    nullzerocols = ['new_hist_purchase_amount_scaled_max','new_hist_purchase_amount_scaled_min',
                   'new_hist_purchase_amount_scaled_sum','new_hist_purchase_amount_scaled_mean',
                'new_hist_purchase_amount_per_install_max','new_hist_purchase_amount_per_install_min',
                'new_hist_purchase_amount_per_install_sum','new_hist_purchase_amount_per_install_mean']
    nullmaxcols = ['new_hist_purchase_date_uptonow','new_hist_first_buy']
    for df in [combined]:
        df['purchase_amount_scaled_total'] = df['new_hist_purchase_amount_scaled_sum']+df['hist_purchase_amount_scaled_sum']
        df['purchase_amount_per_install_total'] = df['new_hist_purchase_amount_per_install_sum']+df['hist_purchase_amount_per_install_sum']
        for col in nullmaxcols:
            maxval = df[col].max() + 30
            df[col].fillna(maxval,inplace=True)
        for col in nullzerocols:
            df[col].fillna(0,inplace=True)
            
    return combined

In [69]:
def gen_newfeats_outlier(combined):
    for df in [combined]:
        df['hist_size_per_merchant'] = df['hist_card_id_size'] / df['hist_merchant_id_nunique']
        df['hist_size_per_merchant_category'] = df['hist_card_id_size'] / df['hist_merchant_category_id_nunique']
        df['hist_size_per_subsector'] = df['hist_card_id_size'] / df['hist_subsector_id_nunique']
        df['hist_size_per_merchant_group'] = df['hist_card_id_size'] / df['hist_merchant_group_id_nunique']
        df['hist_size_per_city'] = df['hist_card_id_size'] / df['hist_city_id_nunique']
    
    return combined
        
def generate_rank_outlier(combined):


        rankdesccols = [
'elapsed_time',
'hist_size_per_merchant',
'new_hist_first_buy',
'new_hist_card_id_size',
'hist_first_buy',
'hist_category_1_mean',
'hist_installments_mean',
'hist_purchase_date_count_mean',
'new_hist_purchase_date_uptonow',
'hist_purchase_date_count_max',
'hist_purchase_date_diff',
'hist_month_lag_-4_sum',
'hist_month_lag_-5_sum',
'hist_month_lag_-6_sum',            
]
    
        rankasccols = [
        'hist_category_2_raw_mean',
        'hist_purchase_amount_per_install_sum',
        'hist_purchase_amount_per_install_mean',
        'hist_card_id_size',
        'new_hist_purchase_date_min',
        'hist_purchase_date_min',
        'hist_month_lag_0_sum',
        'hist_month_lag_-1_sum',
        'hist_month_lag_-2_sum',
        'hist_month_lag_-3_sum',
#         'hist_month_lag_-4_sum',
#         'hist_month_lag_-5_sum',
#         'hist_month_lag_-6_sum',
        'hist_purchase_date_diff',
       # 'hist_most_recent_sales_range_std',
                       
        'hist_authorized_flag_mean',
        'hist_authorized_flag_sum',
        'new_hist_card_id_size',
        'new_hist_purchase_amount_per_install_sum',
        'hist_category_3_A_mean',
        'hist_month_lag_-12_mean',
        'hist_month_lag_-13_sum',
        'new_hist_purchase_date_diff',
        'hist_purchase_amount_per_install_mean',

        ]
        
        rankcols =[col for col in combined.columns if ('rank_' in col)]
        if len(rankcols)!=0:
            combined.drop(rankcols,axis=1,inplace=True)
#         rankcols_train =[col for col in train.columns if ('rank_' in col)]
#         if len(rankcols_train)!=0:
#             train.drop(rankcols_train,axis=1,inplace=True)
#         rankcols_test =[col for col in test.columns if ('rank_' in col)]
#         if len(rankcols_test)!=0:
#             test.drop(rankcols_test,axis=1,inplace=True)
        for i,df in enumerate([combined]):
            print('df :',i)
            for col in rankasccols:
                print('col:',col)
                df['rank_'+col] =  df[col].rank()
    #             df['rank_'+col] = weightasccols[i] * df[col].rank()

            for col in rankdesccols:
                print('col:',col)
                df['rank_'+col] = df[col].rank(ascending=False)
    #             df['rank_'+col] = weightdesccols[i] * df[col].rank(ascending=False)

            rankcols =[col for col in df.columns if ('rank_' in col)
                       & ('hist_purchase_amount_per_install_sum' !=col)
#                        & ('hist_card_id_size' not in col)
                        
                      ]
            df['rank_sum'] = df[rankcols].sum(axis=1) / 1E+5
            df['rank_mean'] = df[rankcols].mean(axis=1) / 1E+5
    #         weight_sum = (np.sum(np.array(weightasccols)) + np.sum(np.array(weightdesccols)))
    #         df['rank_mean'] = df[rankcols].sum(axis=1) / (1E+5 * weight_sum)

    #         df['rank_sum'] = df[rankcols].product(axis=1) / pow(10,len(rankdesccols) + len (rankasccols))

        return combined

In [70]:
def rank_postproc(combined):
    for df in [combined]:
        df['rank_rank_sum']= df['rank_sum'].rank()
        col = 'rank_hist_purchase_amount_per_install_sum'
        df['rank_diff_' + col]= df[col] - df['rank_rank_sum']
    return combined

## training model

In [71]:
sel_enc_cols =['trans_merged_targetenc_merchant_id_mean']

In [72]:
sel_feats  = ['elapsed_time',
 'hist_authorized_flag_mean',
 'hist_authorized_flag_sum',
 'hist_category_1_mean',
 'hist_category_1_sum',
 'hist_installments_sum',
 'hist_month_diff_mean',
 'hist_month_lag_mean',
 'hist_month_lag_min',
 'hist_month_lag_var',
 'hist_month_nunique',
 'hist_most_recent_sales_range_std',
 'hist_purchase_date_diff',
 'hist_purchase_date_max',
 'hist_purchase_date_min',
 'hist_purchase_date_uptonow',
 'hist_weekofyear_nunique',
 'new_hist_category_1_mean',
 'new_hist_category_1_sum',
 'new_hist_month_lag_mean',
 'new_hist_purchase_date_diff',
 'new_hist_purchase_date_max',
 'new_hist_purchase_date_uptonow' ]

In [73]:
df_train['istrain']=1
df_test['istrain']=0
combined = pd.concat([df_train,df_test])

combined = rank_preproc(combined)
print()
print('************ Pre proc complete ********************************')
combined = gen_newfeats_outlier(combined)

combined = generate_rank_outlier(combined)
print()
print('************ Generate Rank complete ********************************')
combined = rank_postproc(combined)
print()
print('************ Post proc complete ********************************')

#separate back combined to train and test
df_train=combined[combined['istrain']==1]
df_test=combined[combined['istrain']==0]
del combined;gc.collect()


Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.






************ Pre proc complete ********************************
df : 0
col: hist_category_2_raw_mean
col: hist_purchase_amount_per_install_sum
col: hist_purchase_amount_per_install_mean
col: hist_card_id_size
col: new_hist_purchase_date_min
col: hist_purchase_date_min
col: hist_month_lag_0_sum
col: hist_month_lag_-1_sum
col: hist_month_lag_-2_sum
col: hist_month_lag_-3_sum
col: hist_purchase_date_diff
col: hist_authorized_flag_mean
col: hist_authorized_flag_sum
col: new_hist_card_id_size
col: new_hist_purchase_amount_per_install_sum
col: hist_category_3_A_mean
col: hist_month_lag_-12_mean
col: hist_month_lag_-13_sum
col: new_hist_purchase_date_diff
col: hist_purchase_amount_per_install_mean
col: elapsed_time
col: hist_size_per_merchant
col: new_hist_first_buy
col: new_hist_card_id_size
col: hist_first_buy
col: hist_category_1_mean
col: hist_installments_mean
col: hist_purchase_date_count_mean
col: new_hist_purchase_date_uptonow
col: hist_purchase_date_count_max
col: hist_purchase_date

512

In [74]:
new_feats_cls = new_feats_reg.copy()

rankcols = [col for col in df_train.columns if ('rank_' in col)
           & ('rank_sum' not in col )
           & ('rank_diff' not in col)
           & ('rank_mean' not in col)]

new_feats_cls += rankcols + ['rank_diff_rank_hist_purchase_amount_per_install_sum']


In [75]:
from sklearn.metrics import brier_score_loss
def lgb_sk_f1(labels,preds):
    optcutoff,f1score = get_opt_cutoff_prec(labels,preds)
    return 'f1_score', f1score, True

def lgb_f1(preds, train_data):
    labels = train_data.get_label()
#     preds = preds.reshape(-1, 2)
#     print('preds before :',preds[0:10])
#     preds = preds.argmax(axis = 1)
#     preds = preds.min(axis = 1)
#     f1score = f1_score(preds, labels, average = 'micro')
    optcutoff,f1score = get_opt_cutoff_prec(labels,preds)
    
#     bscore = brier_score_loss(labels,preds) 
#     bss = 1- (bscore/0.010944928684627575) 
    return 'f1_score', f1score, True

def get_opt_cutoff_prec(labels,preds):
    precision, recall, thresholds  = precision_recall_curve(labels, preds)
    f1_score= 2*((precision*recall)/(precision+recall))
    optimal_idx = np.nanargmax(f1_score)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold, f1_score[optimal_idx]

def get_opt_cutoff_recall(labels,preds):
    precision, recall, thresholds  = precision_recall_curve(labels, preds)
    optimal_idx = np.nanargmax(recall)
    optimal_threshold = thresholds[optimal_idx]
    return optimal_threshold, recall[optimal_idx]

def convert_probtolabels(preds,cutoff=0.5):
    y_bin= preds.copy()
    y_bin[preds>cutoff] = 1
    y_bin[preds<=cutoff] = 0
    y_bin=y_bin.astype(int)

    return y_bin

In [76]:
num_round = 10000
n_splits =5

In [77]:
# param = {'num_leaves': 31,
#          'min_data_in_leaf': 30, 
#          'objective':'binary',
# #          'num_class':2,
#          'max_depth': -1,
#          'learning_rate': 0.01,
#          "min_child_samples": 20,
#          "boosting": "gbdt",
#          "feature_fraction": 0.9,
#          "bagging_freq": 1,
#          "bagging_fraction": 0.9 ,
#          "bagging_seed": 11,
#          "metric": 'binary_logloss',
# #          "metric": 'auc',
# #          "metric": 'multi_logloss',
#          "lambda_l1": 0.1,
#          "verbosity": -1,
#          "nthread": 4,
#          'n_estimators' : 10000,
#          "random_state": 4590}

In [78]:

# features = list(df_train.columns) + sel_enc_cols
# exclude_cols = ['card_id','first_active_month','target','outliers']
# features = [col for col in features if col not in exclude_cols]

# new_feats_cls =new_feats_cls[0:5]
# param['n_estimators'] =10
features = sel_feats + sel_enc_cols + new_feats_cls

In [79]:
# %%time
# #Run permute features
# fold_importance_df,predictions,oof,train_outlier_preds,overall_imp_df,overall_sel_feats\
#     = runlgb(True,df_train,df_test,param,features,score_function=get_f1loss_score,regression=False,
#             subset_feats = new_feats_cls,
#             feval=lgb_sk_f1,
#                 )

In [80]:
# selected_features_cls = list(overall_imp_df.loc[overall_imp_df['overall_score_mean'] >= max_delta_score, 'feature'])
# selected_features_cls.sort()
# df = pd.DataFrame( np.array(selected_features_cls))
# df.to_csv("overall_selected_features_cls.csv")
# selected_features_cls

In [81]:
# overall_imp_df.sort_values(by='overall_score_mean',ascending=False,inplace=True)
# overall_imp_df.to_csv('overall_feats_allfolds_cls.csv')
# overall_imp_df

In [82]:
# overall_sel_feats[0].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[0]))
# df.to_csv("sel_feats_0_cls.csv")
# overall_sel_feats[0]

In [83]:
# overall_sel_feats[1].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[1]))
# df.to_csv("sel_feats_1_cls.csv")
# overall_sel_feats[1]

In [84]:
# overall_sel_feats[2].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[2]))
# df.to_csv("sel_feats_2_cls.csv")
# overall_sel_feats[2]

In [85]:
# overall_sel_feats[3].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[3]))
# df.to_csv("sel_feats_3_cls.csv")
# overall_sel_feats[3]

In [86]:
# overall_sel_feats[4].sort()
# df = pd.DataFrame( np.array(overall_sel_feats[4]))
# df.to_csv("sel_feats_4_cls.csv")
# overall_sel_feats[4]

In [87]:
selected_features_cls =['hist_category_2_raw_mean', 'hist_category_3_A_mean', 'hist_category_3_A_sum',
 'hist_city_id_nunique', 'hist_month_lag_-1_mean', 'hist_month_lag_-2_mean', 'hist_month_lag_-5_mean',
 'hist_month_lag_-6_mean', 'hist_month_lag_-7_mean', 'hist_month_lag_-8_mean', 'hist_month_lag_-8_sum',
 'hist_month_lag_-9_mean', 'hist_month_lag_-9_sum', 'hist_month_lag_0_mean', 'hist_month_lag_0_sum',
 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
 'hist_purchase_amount_per_install_var', 'hist_purchase_amount_scaled_sum', 'new_hist_category_2_raw_mean',
 'new_hist_category_3_A_sum', 'new_hist_category_3_raw_mean', 'new_hist_month_lag_1_mean',
 'new_hist_month_lag_1_sum', 'new_hist_purchase_amount_scaled_sum', 'new_hist_purchase_amount_scaled_var',
 'rank_diff_rank_hist_purchase_amount_per_install_sum', 'rank_elapsed_time',
 'rank_hist_authorized_flag_sum', 'rank_hist_category_2_raw_mean', 'rank_hist_first_buy',
 'rank_hist_installments_mean', 'rank_hist_month_lag_-1_sum', 'rank_hist_month_lag_-4_sum',
 'rank_hist_month_lag_-5_sum', 'rank_hist_month_lag_-6_sum', 'rank_hist_purchase_date_count_mean',
 'rank_hist_purchase_date_diff', 'rank_hist_size_per_merchant',
 'rank_new_hist_purchase_date_min', 'rank_new_hist_purchase_date_uptonow']

In [88]:
overall_sel_feats_cls =[0,1,2,3,4]

In [89]:
overall_sel_feats_cls[0]=['hist_category_2_raw_mean', 'hist_category_3_A_mean',
 'hist_city_id_nunique', 'hist_month_lag_-2_mean', 'hist_month_lag_-6_mean',
 'hist_month_lag_-8_sum', 'hist_month_lag_-9_mean', 'hist_month_lag_0_sum',
 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
 'hist_purchase_amount_per_install_var', 'hist_purchase_amount_scaled_mean',
 'new_hist_category_3_A_mean', 'new_hist_category_3_A_sum',
 'new_hist_city_id_nunique', 'new_hist_purchase_amount_per_install_sum',
 'new_hist_purchase_amount_per_install_var', 'new_hist_purchase_amount_scaled_var',
 'rank_diff_rank_hist_purchase_amount_per_install_sum', 'rank_elapsed_time',
 'rank_hist_authorized_flag_mean', 'rank_hist_category_1_mean',
 'rank_hist_category_2_raw_mean', 'rank_hist_first_buy',
 'rank_hist_installments_mean', 'rank_hist_month_lag_-1_sum',
 'rank_hist_month_lag_-6_sum', 'rank_hist_purchase_amount_per_install_mean',
 'rank_new_hist_first_buy', 'rank_new_hist_purchase_amount_per_install_sum',
 'rank_new_hist_purchase_date_diff', 'rank_new_hist_purchase_date_min',
 'rank_new_hist_purchase_date_uptonow']

In [90]:
overall_sel_feats_cls[1]=['hist_category_2_raw_mean', 'hist_category_3_A_sum', 'hist_category_3_raw_mean',
 'hist_city_id_nunique', 'hist_month_lag_-1_mean', 'hist_month_lag_-1_sum',
 'hist_month_lag_-2_mean', 'hist_month_lag_-2_sum', 'hist_month_lag_-4_mean',
 'hist_month_lag_-6_mean', 'hist_month_lag_-7_mean', 'hist_month_lag_-9_sum',
 'hist_month_lag_0_mean', 'hist_month_lag_0_sum', 'hist_purchase_amount_per_install_mean',
 'hist_purchase_amount_per_install_sum', 'hist_purchase_amount_per_install_var',
 'new_hist_category_3_A_sum', 'new_hist_category_3_raw_mean', 'new_hist_month_lag_1_mean',
 'new_hist_purchase_amount_scaled_mean', 'rank_elapsed_time', 'rank_hist_authorized_flag_sum',
 'rank_hist_category_2_raw_mean', 'rank_hist_installments_mean', 'rank_hist_month_lag_-1_sum',
 'rank_hist_month_lag_-3_sum', 'rank_hist_purchase_amount_per_install_sum', 'rank_hist_purchase_date_count_mean',
 'rank_hist_purchase_date_diff', 'rank_hist_size_per_merchant', 'rank_new_hist_first_buy',
 'rank_new_hist_purchase_amount_per_install_sum', 'rank_new_hist_purchase_date_diff',
 'rank_new_hist_purchase_date_min', 'rank_new_hist_purchase_date_uptonow']

In [91]:
overall_sel_feats_cls[2]=['hist_category_2_raw_mean', 'hist_category_3_raw_mean', 'hist_city_id_nunique',
 'hist_month_lag_-11_mean', 'hist_month_lag_-1_mean', 'hist_month_lag_-2_mean',
 'hist_month_lag_-3_mean', 'hist_month_lag_-4_mean', 'hist_month_lag_-5_mean',
 'hist_month_lag_-5_sum', 'hist_month_lag_-6_mean', 'hist_month_lag_-7_mean',
 'hist_month_lag_-8_mean', 'hist_month_lag_-8_sum', 'hist_month_lag_-9_mean',
 'hist_month_lag_0_mean', 'hist_month_lag_0_sum',
 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
 'hist_purchase_amount_per_install_var', 'hist_purchase_amount_scaled_mean',
 'hist_purchase_amount_scaled_sum', 'hist_purchase_amount_scaled_var', 'new_hist_category_2_raw_mean',
 'new_hist_category_3_A_sum', 'new_hist_category_3_raw_mean', 'new_hist_month_lag_1_mean',
 'new_hist_month_lag_1_sum', 'new_hist_month_lag_2_sum', 'new_hist_purchase_amount_per_install_mean',
 'new_hist_purchase_amount_per_install_sum', 'new_hist_purchase_amount_scaled_sum',
 'new_hist_purchase_amount_scaled_var', 'rank_diff_rank_hist_purchase_amount_per_install_sum',
 'rank_hist_authorized_flag_sum', 'rank_hist_first_buy', 'rank_hist_installments_mean',
 'rank_hist_month_lag_-4_sum', 'rank_hist_month_lag_-5_sum', 'rank_hist_month_lag_-6_sum',
 'rank_hist_purchase_date_count_max', 'rank_hist_purchase_date_count_mean', 'rank_hist_purchase_date_diff',
 'rank_hist_size_per_merchant', 'rank_new_hist_card_id_size', 'rank_new_hist_purchase_date_min',
 'rank_new_hist_purchase_date_uptonow']

In [92]:
overall_sel_feats_cls[3]=['hist_category_2_raw_mean', 'hist_category_3_A_sum', 'hist_city_id_nunique',
 'hist_month_lag_-10_mean', 'hist_month_lag_-3_mean', 'hist_month_lag_-3_sum', 'hist_month_lag_-4_sum',
 'hist_month_lag_-5_sum', 'hist_month_lag_-6_mean', 'hist_month_lag_-9_mean', 'hist_month_lag_-9_sum',
 'hist_month_lag_0_sum', 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
 'hist_purchase_amount_per_install_var', 'hist_purchase_amount_scaled_sum', 'new_hist_category_3_A_sum',
 'new_hist_city_id_nunique', 'new_hist_month_lag_1_mean', 'new_hist_month_lag_1_sum', 'new_hist_month_lag_2_sum',
 'new_hist_purchase_amount_per_install_mean', 'new_hist_purchase_amount_per_install_sum',
 'new_hist_purchase_amount_per_install_var', 'rank_diff_rank_hist_purchase_amount_per_install_sum',
 'rank_hist_authorized_flag_sum', 'rank_hist_category_2_raw_mean', 'rank_hist_month_lag_-5_sum',
 'rank_hist_month_lag_-6_sum', 'rank_hist_purchase_amount_per_install_mean', 'rank_hist_purchase_date_count_max',
 'rank_hist_purchase_date_min', 'rank_hist_size_per_merchant', 'rank_new_hist_first_buy',
 'rank_new_hist_purchase_amount_per_install_sum', 'rank_new_hist_purchase_date_min']

In [93]:
overall_sel_feats_cls[4]=['hist_category_2_raw_mean', 'hist_category_3_A_mean', 'hist_city_id_nunique',
 'hist_month_lag_-1_mean', 'hist_month_lag_-1_sum', 'hist_month_lag_-2_sum', 'hist_month_lag_-5_mean',
 'hist_month_lag_-7_mean', 'hist_month_lag_-9_mean', 'hist_month_lag_0_mean', 'hist_month_lag_0_sum',
 'hist_purchase_amount_per_install_var', 'hist_purchase_amount_scaled_mean', 'new_hist_category_3_raw_mean',
 'new_hist_month_lag_1_mean', 'new_hist_month_lag_1_sum', 'new_hist_purchase_amount_per_install_mean',
 'new_hist_purchase_amount_scaled_sum', 'new_hist_purchase_amount_scaled_var',
 'rank_diff_rank_hist_purchase_amount_per_install_sum', 'rank_elapsed_time', 'rank_hist_first_buy',
 'rank_hist_installments_mean', 'rank_hist_month_lag_-1_sum', 'rank_hist_month_lag_-4_sum',
 'rank_hist_month_lag_-5_sum', 'rank_hist_month_lag_-6_sum', 'rank_hist_month_lag_0_sum',
 'rank_hist_purchase_date_diff', 'rank_hist_purchase_date_min', 'rank_hist_size_per_merchant',
 'rank_new_hist_card_id_size', 'rank_new_hist_first_buy', 'rank_new_hist_purchase_date_diff']

In [94]:
# overall_sel_feats=[0,1,2,3,4]

In [95]:
# overall_sel_feats[0] =['hist_month_lag_-2_mean', 'hist_month_lag_-3_mean',
#  'hist_purchase_amount_scaled_var', 'new_hist_month_lag_1_mean',
#  'new_hist_purchase_amount_per_install_mean', 'new_hist_purchase_amount_per_install_var',
#  'new_hist_purchase_amount_scaled_mean', 'rank_hist_authorized_flag_mean',
#  'rank_hist_card_id_size', 'rank_hist_category_1_mean', 'rank_hist_first_buy',
#  'rank_hist_installments_mean', 'rank_hist_month_lag_-6_sum', 'rank_hist_purchase_date_count_mean',
#  'rank_hist_purchase_date_diff', 'rank_hist_purchase_date_min', 'rank_hist_size_per_merchant',
#                       'rank_diff_rank_hist_purchase_amount_per_install_sum'
#                       ]

In [96]:
# overall_sel_feats[1] =['hist_category_3_raw_mean', 'hist_month_lag_-1_mean', 'hist_month_lag_-2_sum',
#  'hist_month_lag_-3_mean', 'hist_month_lag_-4_mean', 'hist_month_lag_-4_sum', 'hist_month_lag_-5_sum',
#  'hist_month_lag_-6_sum', 'hist_month_lag_0_mean', 'hist_month_lag_0_sum',
#  'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
#  'hist_purchase_amount_scaled_mean', 'new_hist_category_2_raw_mean',
#  'new_hist_purchase_amount_per_install_sum', 'new_hist_purchase_amount_scaled_mean',
#  'rank_diff_rank_hist_purchase_amount_per_install_sum', 'rank_elapsed_time',
#  'rank_hist_authorized_flag_mean', 'rank_hist_category_1_mean', 'rank_hist_category_2_raw_mean', 
#  'rank_hist_first_buy', 'rank_hist_installments_mean', 'rank_hist_month_lag_-4_sum',
#  'rank_hist_purchase_date_count_max', 'rank_hist_purchase_date_count_mean', 'rank_hist_purchase_date_min',
#  'rank_hist_size_per_merchant', 'rank_new_hist_purchase_amount_per_install_sum',
#  'rank_new_hist_purchase_date_diff', 'rank_new_hist_purchase_date_min',
#  'rank_new_hist_purchase_date_uptonow']

In [97]:
# overall_sel_feats[2] =['hist_month_lag_-1_mean', 'hist_month_lag_0_sum',
#  'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_var',
#  'new_hist_month_lag_1_mean', 'new_hist_purchase_amount_per_install_mean',
#  'new_hist_purchase_amount_scaled_mean', 'rank_hist_category_2_raw_mean',
#  'rank_hist_purchase_amount_per_install_mean', 'rank_hist_purchase_amount_per_install_sum',
#  'rank_hist_purchase_date_count_max', 'rank_new_hist_purchase_date_uptonow',
#                       'rank_diff_rank_hist_purchase_amount_per_install_sum']

In [98]:
# overall_sel_feats[3] =['hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
#  'hist_purchase_amount_per_install_var', 'new_hist_purchase_amount_per_install_sum',
#  'new_hist_purchase_amount_scaled_mean', 'new_hist_purchase_amount_scaled_sum',
#  'rank_elapsed_time', 'rank_hist_category_1_mean', 'rank_hist_first_buy',
#  'rank_hist_month_lag_-6_sum', 'rank_hist_month_lag_0_sum',
#  'rank_hist_purchase_date_count_max', 'rank_hist_purchase_date_count_mean', 'rank_hist_purchase_date_min',
#  'rank_hist_size_per_merchant', 'rank_new_hist_purchase_date_min', 'rank_new_hist_purchase_date_uptonow',
#                       'rank_diff_rank_hist_purchase_amount_per_install_sum']

In [99]:
# overall_sel_feats[4] =['hist_month_lag_-1_mean',
#  'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum',
#  'hist_purchase_amount_per_install_var', 'hist_purchase_amount_scaled_sum',
#  'new_hist_category_2_raw_mean', 'new_hist_month_lag_1_mean',
#  'new_hist_purchase_amount_scaled_mean', 'new_hist_purchase_amount_scaled_sum',
#  'rank_elapsed_time', 'rank_hist_authorized_flag_mean', 'rank_hist_category_1_mean',
#  'rank_hist_first_buy', 'rank_hist_installments_mean', 'rank_hist_purchase_date_count_max',
#  'rank_hist_purchase_date_count_mean', 'rank_hist_size_per_merchant', 'rank_new_hist_first_buy',
#  'rank_new_hist_purchase_date_min', 'rank_new_hist_purchase_date_uptonow',
#                       'rank_diff_rank_hist_purchase_amount_per_install_sum']

In [100]:
num_round = 10000

In [101]:
#Run for fold-wise selected feature model training and predictions
for i,val in enumerate(overall_sel_feats_cls):
    if i==4:
        overall_sel_feats_cls[i] = sel_feats + sel_enc_cols + new_feats_cls
    else:
        overall_sel_feats_cls[i] += sel_feats + sel_enc_cols

In [102]:
# features = sel_feats + sel_enc_cols + new_feats_cls

In [103]:
# print(df_train['hist_card_id_size'].describe())
# print(df_train[df_train['hist_card_id_size']<=10].shape)

# cardsizes = [20,35,50,75,100,150,10000]
# prevval =0
# for val in cardsizes:
#     mask = (df_train['hist_card_id_size']<=val) & (df_train['hist_card_id_size']>prevval)
#     mask_outlier  = df_train['outliers']==1
#     print()
#     totalsize = df_train[mask].shape[0]
#     print('card size:',val)
#     print('total:',totalsize)
#     outliersize = df_train[mask & mask_outlier].shape[0]
#     print('outliers:',outliersize)
#     print('non outliers:',df_train[(mask) & (~mask_outlier)].shape[0])
#     prevval = val
#     outlierratio = outliersize / totalsize
#     print('ratio:',outlierratio)
# # print(df_train[(df_train['hist_card_id_size']<=20) & (df_train['hist_card_id_size']>10)].shape)
# # print(df_train[(df_train['hist_card_id_size']<=50) & (df_train['hist_card_id_size']>20)].shape)
# # print(df_train[(df_train['hist_card_id_size']<=100) & (df_train['hist_card_id_size']>50)].shape)
# # print(df_train[(df_train['hist_card_id_size']<=500) & (df_train['hist_card_id_size']>100)].shape)
# # print(df_train[(df_train['hist_card_id_size']<=1000) & (df_train['hist_card_id_size']>500)].shape)
# # print(df_train[(df_train['hist_card_id_size']<=5000) & (df_train['hist_card_id_size']>1000)].shape)
# # print(df_train[(df_train['hist_card_id_size']<=10000) & (df_train['hist_card_id_size']>5000)].shape)


In [104]:
# # df_train_filtered = df_train.copy()
# # df_train_filtered.sort_values('hist_card_id_size',inplace=True)
# df_train_filtered = df_train.copy()
# df_train_filtered['target'] = target
# # df_train_filtered = df_train_filtered[df_train_filtered['hist_card_id_size'] <=10]
# mask  = (df_train_filtered['hist_card_id_size'] >10) & (df_train_filtered['hist_card_id_size'] <20)
# test_mask  = (df_test['hist_card_id_size'] >10) & (df_test['hist_card_id_size'] <20)
# df_train_filtered = df_train_filtered[mask]
# target_filtered = df_train_filtered['target']
# del df_train_filtered['target']
# df_test_filtered = df_test[test_mask]

In [105]:
# target_filtered.tail()

In [106]:
# df_train_filtered.tail()

In [107]:
oof_labels_p_prob = np.zeros(len(df_train))
oof_rank_p_labels_p_prob = np.zeros(len(df_train))

oof_rank = np.zeros(len(df_train))
oof_rank_prob = np.zeros(len(df_train))
oof_rank_p_labels = np.zeros(len(df_train))    
oof_labels = np.zeros(len(df_train))
oof_rank_p_prob = np.zeros(len(df_train))
oof_rank_p_prob_prop_high = np.zeros(len(df_train))
oof_rank_p_prob_prop_low = np.zeros(len(df_train))


In [108]:
n_splits = 5
fold_to_start = None
fold_to_stop= None

In [109]:
param ={}

In [110]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
#          'num_class':2,
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
#          "metric": 'binary_logloss',
         "metric": 'auc',
#          "metric": 'multi_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         'n_estimators' : 10000,
         "random_state": 4590}

In [111]:
randomno = 4590
fold_importance_df,predictions,oof,train_outlier_preds,dummy_overall_imp_df,dummy_overall_sel_feats\
    = runlgb(randomno,
            False,df_train,df_test,param,overall_sel_feats_cls,
             target,lgb_fit,lgb_predict,lgb_getbestscore,
             score_function=get_logloss_score,regression=False,
             fold_feats=True, 
#                  feval=lgb_sk_f1,
             fold_to_start=fold_to_start,
             fold_to_stop=fold_to_stop,
#              targetenc=True,
#              to_enc_cols= toenccols,
#              likely_card_ids=[]
            )



fold n°0
y_train shape: (201917,)
val shape: (40384, 249)
y_val shape: (40384,)
tr shape: (161533, 250)
val shape: (40384, 250)
yval 1 shape: (442,)
yval 0 shape: (39942,)
yval 1 ratio: 0.010944928684627575
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.900823
[200]	valid_0's auc: 0.905773
[300]	valid_0's auc: 0.907993
[400]	valid_0's auc: 0.908254
[500]	valid_0's auc: 0.908363
[600]	valid_0's auc: 0.908469
[700]	valid_0's auc: 0.908479
Early stopping, best iteration is:
[555]	valid_0's auc: 0.908604



invalid value encountered in true_divide


invalid value encountered in true_divide



opt_cutoff: 0.04480101898516457
f1 score after calib: 0.2655935613682092
auc score after calib: 0.9086044674279968
log loss score after calib: 0.04984 
CUMULATIVE SCORE
opt_cutoff: 0.04480101898516457
f1 score: 0.2655935613682092
conf matrix: [[39522   420]
 [  311   131]]

fold n°1
y_train shape: (201917,)
val shape: (40384, 249)
y_val shape: (40384,)
tr shape: (161533, 250)
val shape: (40384, 250)
yval 1 shape: (442,)
yval 0 shape: (39942,)
yval 1 ratio: 0.010944928684627575
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.901992
[200]	valid_0's auc: 0.906092
[300]	valid_0's auc: 0.906668
[400]	valid_0's auc: 0.906661
Early stopping, best iteration is:
[246]	valid_0's auc: 0.907103



invalid value encountered in true_divide


invalid value encountered in true_divide



opt_cutoff: 0.043167104040228284
f1 score after calib: 0.2606284658040665
auc score after calib: 0.9071032522043841
log loss score after calib: 0.05004 
CUMULATIVE SCORE
opt_cutoff: 0.04271501550498901
f1 score: 0.26248216833095583
conf matrix: [[78941   943]
 [  609   275]]

fold n°2
y_train shape: (201917,)
val shape: (40383, 249)
y_val shape: (40383,)
tr shape: (161534, 250)
val shape: (40383, 250)
yval 1 shape: (441,)
yval 0 shape: (39942,)
yval 1 ratio: 0.0109204368174727
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.904034
[200]	valid_0's auc: 0.907104
[300]	valid_0's auc: 0.908685
[400]	valid_0's auc: 0.909272
[500]	valid_0's auc: 0.909495
[600]	valid_0's auc: 0.909393
[700]	valid_0's auc: 0.909503
Early stopping, best iteration is:
[578]	valid_0's auc: 0.909581



invalid value encountered in true_divide


invalid value encountered in true_divide



opt_cutoff: 0.04454049142654412
f1 score after calib: 0.2474012474012474
auc score after calib: 0.9095807969174351
log loss score after calib: 0.05078 
CUMULATIVE SCORE
opt_cutoff: 0.04271501550498901
f1 score: 0.257207644962747
conf matrix: [[118461   1365]
 [   929    396]]

fold n°3
y_train shape: (201917,)
val shape: (40383, 249)
y_val shape: (40383,)
tr shape: (161534, 250)
val shape: (40383, 250)
yval 1 shape: (441,)
yval 0 shape: (39942,)
yval 1 ratio: 0.0109204368174727
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.898824
[200]	valid_0's auc: 0.901854
[300]	valid_0's auc: 0.903838
[400]	valid_0's auc: 0.904503
[500]	valid_0's auc: 0.904946
[600]	valid_0's auc: 0.905175
[700]	valid_0's auc: 0.905405
[800]	valid_0's auc: 0.905536
[900]	valid_0's auc: 0.90516
Early stopping, best iteration is:
[796]	valid_0's auc: 0.905598



invalid value encountered in true_divide


invalid value encountered in true_divide



opt_cutoff: 0.033756842106711066
f1 score after calib: 0.2522686025408349
auc score after calib: 0.9055975836164253
log loss score after calib: 0.05106 
CUMULATIVE SCORE
opt_cutoff: 0.04271501550498901
f1 score: 0.25227888642522794
conf matrix: [[157987   1781]
 [  1255    511]]

fold n°4
y_train shape: (201917,)
val shape: (40383, 249)
y_val shape: (40383,)
tr shape: (161534, 250)
val shape: (40383, 250)
yval 1 shape: (441,)
yval 0 shape: (39942,)
yval 1 ratio: 0.0109204368174727
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.877625
[200]	valid_0's auc: 0.883928
[300]	valid_0's auc: 0.886886
[400]	valid_0's auc: 0.887704
[500]	valid_0's auc: 0.888318
[600]	valid_0's auc: 0.888252
[700]	valid_0's auc: 0.888417
[800]	valid_0's auc: 0.88812
[900]	valid_0's auc: 0.887992
Early stopping, best iteration is:
[702]	valid_0's auc: 0.88844
opt_cutoff: 0.06715065423244596
f1 score after calib: 0.22439024390243903
auc score after calib: 0.8884397682762454
lo


invalid value encountered in true_divide



conf matrix: [[197555   2155]
 [  1598    609]]
valid scores: [0.9086044674279968, 0.9071032522043841, 0.9095807969174351, 0.9055975836164252, 0.8884397682762455]
CUMULATIVE SCORE
opt_cutoff: 0.04430684055553044
f1 score: 0.24537409493161705



invalid value encountered in true_divide



conf matrix: [[197555   2155]
 [  1598    609]]

CV score log loss: 0.05064 
CV score f1 score: 0.24537409493161705 cutoff:0.04430684055553044
CV score AUC score: 0.8954637532078967
Rank CV score f1 score: 0.24492017416545722 cutoff:0.9836564891167076



invalid value encountered in true_divide


invalid value encountered in true_divide



Rank Prob CV score f1 score: 0.24541607898448517 cutoff:0.04396695663364109
Rank Plus Prob CV score f1 score: 0.24578866768759572 cutoff:1.0251491616753337
Rank Plus Prob Prop High CV score f1 score: 0.24541607898448517 cutoff:1.4319536930308123
Rank Plus Prob Prop Low CV score f1 score: 0.24561403508771934 cutoff:0.9954264363315664
Labels CV score f1 score: 0.24944500504540867 cutoff:1.0
Labels plus rank head: [0.16905629 0.48134116 0.55049029 0.3320209  0.05507132 0.93410757
 0.27850828 0.4427358  0.80398177 0.44188292]
Labels Plus Rank f1 score: 0.24979822437449556 cutoff:0.9906396255850234
Labels Plus Rank AUC score: 0.903892347574123 
Normal and Rank Average score f1 score: 0.24578866768759572 cutoff:0.5125745808376668
Normal and Labels Average score f1 score: 0.24979822437449556 cutoff:0.03357532711622298
Normal and Labels Average score AUC score: 0.8954868700984802
Normal and Labels and Rank Average score f1 score: 0.24979822437449556 cutoff:0.5288951399087347



invalid value encountered in true_divide


invalid value encountered in true_divide



Normal and Labels and Rank Average score AUC score: 0.9038931416571246 


In [112]:
# oof_rank_p_labels_1 = oof_rank_p_labels.copy()

In [113]:
np.savetxt('oof_cls_fold_feats.csv',oof,delimiter=',')
np.savetxt('test_preds_cls_fold_feats.csv',predictions,delimiter=',')

In [114]:
features = selected_features_cls +  sel_feats + sel_enc_cols
# features.remove('rank_diff_rank_hist_purchase_amount_per_install_sum')
print(features)

['hist_category_2_raw_mean', 'hist_category_3_A_mean', 'hist_category_3_A_sum', 'hist_city_id_nunique', 'hist_month_lag_-1_mean', 'hist_month_lag_-2_mean', 'hist_month_lag_-5_mean', 'hist_month_lag_-6_mean', 'hist_month_lag_-7_mean', 'hist_month_lag_-8_mean', 'hist_month_lag_-8_sum', 'hist_month_lag_-9_mean', 'hist_month_lag_-9_sum', 'hist_month_lag_0_mean', 'hist_month_lag_0_sum', 'hist_purchase_amount_per_install_mean', 'hist_purchase_amount_per_install_sum', 'hist_purchase_amount_per_install_var', 'hist_purchase_amount_scaled_sum', 'new_hist_category_2_raw_mean', 'new_hist_category_3_A_sum', 'new_hist_category_3_raw_mean', 'new_hist_month_lag_1_mean', 'new_hist_month_lag_1_sum', 'new_hist_purchase_amount_scaled_sum', 'new_hist_purchase_amount_scaled_var', 'rank_diff_rank_hist_purchase_amount_per_install_sum', 'rank_elapsed_time', 'rank_hist_authorized_flag_sum', 'rank_hist_category_2_raw_mean', 'rank_hist_first_buy', 'rank_hist_installments_mean', 'rank_hist_month_lag_-1_sum', 'rank

In [115]:
param = {'colsample_bytree': 0.933637062627785,
   'min_child_samples': 465,
   'num_leaves': 75,
   'reg_alpha': 0.616207848670193,
   'reg_lambda': 0.829210506269319,
   'subsample': 0.936495757097884,
   'subsample_for_bin': 80000,
   'learning_rate': 0.01,
   'boosting': 'gbdt',
   'bagging_seed': 2018,
   'bagging_frequency': 2,
   'min_data_in_bin': 100,
   'n_estimators': 10000,
   'objective': 'binary',
   'metric': 'auc',
   'random_state': 2333,
   'max_depth': 7}

In [116]:
param['scale_pos_weight'] = 2

In [117]:
# param = {'colsample_bytree': 0.8897311029338786,
#    'min_child_samples': 55,
#    'num_leaves': 75,
#    'reg_alpha': 0.7190472014738334,
#    'reg_lambda': 0.4675245415536591,
#    'subsample': 0.9562661113791268,
#    'subsample_for_bin': 90000,
#    'learning_rate': 0.01,
#    'boosting': 'gbdt',
#    'bagging_seed': 2018,
#    'bagging_frequency': 2,
#    'min_data_in_bin': 100,
#    'n_estimators': 10000,
#    'objective': 'binary',
#    'metric': 'auc',
#    'random_state': 2333,
#    'max_depth': 8}

In [118]:
randomno = 4590
fold_importance_df,predictions_2,oof_2,train_outlier_preds,dummy_overall_imp_df,dummy_overall_sel_feats\
    = runlgb(randomno,
            False,df_train,df_test,param,features,
             target,lgb_fit,lgb_predict,lgb_getbestscore,
             score_function=get_logloss_score,regression=False,
#              fold_feats=True, 
#                  feval=lgb_sk_f1,
             fold_to_start=fold_to_start,
             fold_to_stop=fold_to_stop,
#              targetenc=True,
#              to_enc_cols= toenccols,
#              likely_card_ids=[]
            )



fold n°0
y_train shape: (201917,)
val shape: (40384, 249)
y_val shape: (40384,)
tr shape: (161533, 250)
val shape: (40384, 250)
yval 1 shape: (442,)
yval 0 shape: (39942,)
yval 1 ratio: 0.010944928684627575
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.908486
[200]	valid_0's auc: 0.909602
[300]	valid_0's auc: 0.909742
[400]	valid_0's auc: 0.909947
[500]	valid_0's auc: 0.909747
Early stopping, best iteration is:
[376]	valid_0's auc: 0.910018



invalid value encountered in true_divide


invalid value encountered in true_divide



opt_cutoff: 0.05603475033367883
f1 score after calib: 0.26916221033868093
auc score after calib: 0.9100184520949042
log loss score after calib: 0.04766 
CUMULATIVE SCORE
opt_cutoff: 0.05603475033367883
f1 score: 0.26916221033868093
conf matrix: [[39413   529]
 [  292   150]]

fold n°1
y_train shape: (201917,)
val shape: (40384, 249)
y_val shape: (40384,)
tr shape: (161533, 250)
val shape: (40384, 250)
yval 1 shape: (442,)
yval 0 shape: (39942,)
yval 1 ratio: 0.010944928684627575
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.904916
[200]	valid_0's auc: 0.905637
[300]	valid_0's auc: 0.905398
Early stopping, best iteration is:
[184]	valid_0's auc: 0.905946
opt_cutoff: 0.0902151800117532
f1 score after calib: 0.2508250825082508
auc score after calib: 0.9059459746043528
log loss score after calib: 0.04851 
CUMULATIVE SCORE
opt_cutoff: 0.07574326822562058
f1 score: 0.2584269662921348
conf matrix: [[79063   821]
 [  632   252]]



invalid value encountered in true_divide




fold n°2
y_train shape: (201917,)
val shape: (40383, 249)
y_val shape: (40383,)
tr shape: (161534, 250)
val shape: (40383, 250)
yval 1 shape: (441,)
yval 0 shape: (39942,)
yval 1 ratio: 0.0109204368174727
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.90776
[200]	valid_0's auc: 0.910849
[300]	valid_0's auc: 0.911331
[400]	valid_0's auc: 0.911544
[500]	valid_0's auc: 0.911962
[600]	valid_0's auc: 0.911841
[700]	valid_0's auc: 0.911679
Early stopping, best iteration is:
[527]	valid_0's auc: 0.912045
opt_cutoff: 0.08966568710701275
f1 score after calib: 0.25
auc score after calib: 0.9120445167034149
log loss score after calib: 0.04876 
CUMULATIVE SCORE
opt_cutoff: 0.07841993315700052
f1 score: 0.25450762829403606
conf matrix: [[118634   1192]
 [   959    366]]



invalid value encountered in true_divide




fold n°3
y_train shape: (201917,)
val shape: (40383, 249)
y_val shape: (40383,)
tr shape: (161534, 250)
val shape: (40383, 250)
yval 1 shape: (441,)
yval 0 shape: (39942,)
yval 1 ratio: 0.0109204368174727
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.905227
[200]	valid_0's auc: 0.905774
[300]	valid_0's auc: 0.906464
[400]	valid_0's auc: 0.906373
[500]	valid_0's auc: 0.906495
[600]	valid_0's auc: 0.90634
[700]	valid_0's auc: 0.906015
Early stopping, best iteration is:
[570]	valid_0's auc: 0.906548



invalid value encountered in true_divide


invalid value encountered in true_divide



opt_cutoff: 0.04929449156995747
f1 score after calib: 0.24137931034482757
auc score after calib: 0.9065476005968292
log loss score after calib: 0.04963 
CUMULATIVE SCORE
opt_cutoff: 0.058474909797922976
f1 score: 0.24924576467857973
conf matrix: [[157762   2006]
 [  1230    536]]

fold n°4
y_train shape: (201917,)
val shape: (40383, 249)
y_val shape: (40383,)
tr shape: (161534, 250)
val shape: (40383, 250)
yval 1 shape: (441,)
yval 0 shape: (39942,)
yval 1 ratio: 0.0109204368174727
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's auc: 0.888675
[200]	valid_0's auc: 0.889734
[300]	valid_0's auc: 0.890594
[400]	valid_0's auc: 0.890992
[500]	valid_0's auc: 0.890959
[600]	valid_0's auc: 0.890835
Early stopping, best iteration is:
[435]	valid_0's auc: 0.891068
opt_cutoff: 0.11699396040682118
f1 score after calib: 0.2425
auc score after calib: 0.8910733488728725
log loss score after calib: 0.04928 
CUMULATIVE SCORE
opt_cutoff: 0.058429508087668405
f1 score: 0.245


invalid value encountered in true_divide



conf matrix: [[197187   2523]
 [  1545    662]]
valid scores: [0.9100184520949042, 0.9059459746043528, 0.9120445167034149, 0.9065476005968292, 0.8910678987933864]
CUMULATIVE SCORE
opt_cutoff: 0.058429508087668405
f1 score: 0.2458742814759874



invalid value encountered in true_divide



conf matrix: [[197187   2523]
 [  1545    662]]

CV score log loss: 0.04877 
CV score f1 score: 0.2458742814759874 cutoff:0.058429508087668405
CV score AUC score: 0.8980308397788483
Rank CV score f1 score: 0.2466555183946488 cutoff:0.9872474247226625



invalid value encountered in true_divide


invalid value encountered in true_divide



Rank Prob CV score f1 score: 0.2458742814759874 cutoff:0.05750060869455043
Rank Plus Prob CV score f1 score: 0.24537379718726868 cutoff:1.0423798578132886
Rank Plus Prob Prop High CV score f1 score: 0.24573758339510748 cutoff:1.5683973021083903
Rank Plus Prob Prop Low CV score f1 score: 0.24591857821864022 cutoff:1.0055394224446779
Labels CV score f1 score: 0.2495860927152318 cutoff:1.0
Labels plus rank head: [0.16425228 0.37547978 0.49616185 0.39073372 0.04204635 0.93111133
 0.38231434 0.55382215 0.73841125 0.45186212]
Labels Plus Rank f1 score: 0.24994827229464098 cutoff:0.9911348834905778
Labels Plus Rank AUC score: 0.9051494660461112 
Normal and Rank Average score f1 score: 0.24537379718726868 cutoff:0.5211899289066443
Normal and Labels Average score f1 score: 0.24994827229464098 cutoff:0.05849698020341059
Normal and Labels Average score AUC score: 0.8980560099411932
Normal and Labels and Rank Average score f1 score: 0.24994827229464098 cutoff:0.5540644219486994



invalid value encountered in true_divide


invalid value encountered in true_divide



Normal and Labels and Rank Average score AUC score: 0.905147756498849 


In [119]:
# oof_rank_p_labels_2 = oof_rank_p_labels.copy()

In [120]:
# #rank + labels returned high cum score

# # oof_ens = (oof_rank_p_labels + oof_labels_p_prob ) / 2
# oof_ens = oof_rank_p_labels 
# opt_cutoff, f1score = get_opt_cutoff_prec(target, oof_ens )
# print("Mean  oof score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
# print("Mean  oof AUC score: {0} ".format(roc_auc_score(target, oof_ens )))

In [121]:
# oof = oof_rank_p_labels.copy()

In [122]:
# oof_sc_1 = oof_rank_p_labels.copy()
# oof_raw_sc_1 = oof_2.copy()
# oof_rank_sc_1 = oof_rank.copy()
# oof_label_sc_1 = oof_label.copy()
# # oof_sc_3, oof_sc_5

In [123]:
np.savetxt('oof_cls_overall_sel.csv',oof_2,delimiter=',')
np.savetxt('test_preds_cls_overall_sel.csv',predictions_2,delimiter=',')

In [124]:
# # oof_ens = (0.333*oof_sc_5 + 0.333*oof_sc_3 + 0.333*oof_sc_4 + ) 
# oof_ens = oof_sc_1 + oof_sc_2
# # oof_ens = (oof_sc_5 + oof_sc_3 + oof_sc_4 + oof_sc_10 ) / 4 
# # predictions_ens =(predictions + predictions_2) / 2
# opt_cutoff, f1score = get_opt_cutoff_prec(target, oof_ens )
# print("Mean  oof score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
# print("Mean  oof AUC score: {0} ".format(roc_auc_score(target, oof_ens )))

In [125]:
# oof_ens = (oof + oof_2) / 2
w=[0.6,0.4]
oof_ens = (w[0]*oof + w[1]*oof_2) 

# predictions_ens =(predictions + predictions_2) / 2
predictions_ens =(w[0]*predictions + w[1]*predictions_2) 
opt_cutoff, f1score = get_opt_cutoff_prec(target, oof_ens )
print("Mean  oof score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
print("Mean  oof AUC score: {0} ".format(roc_auc_score(target, oof_ens )))

Mean  oof score f1 score: 0.25049780963759455 cutoff:0.05477038535050867
Mean  oof AUC score: 0.8982393455558135 



invalid value encountered in true_divide



In [126]:
oof = oof_ens
predictions = predictions_ens

In [127]:
np.savetxt('oof_cls_ens.csv',oof,delimiter=',')
np.savetxt('test_preds_ens.csv',predictions,delimiter=',')

In [128]:
# n_ensembles = 100

In [129]:
# predictions_list=[]
# oof_list =[]
# for i in range(0,n_ensembles):
# #     randomno = 1234
#     randomno = 888 + i
#     param['random_state'] = randomno
#     fold_importance_df,predictions,oof,oof_labels,oof_rank,\
#     train_outlier_preds,dummy_overall_imp_df,dummy_overall_sel_feats\
#         = runlgb(4590,
#                 False,df_train,df_test,param,features,
#                  target,
#                  score_function=get_logloss_score,regression=False,
#     #              fold_feats=True, 
#                  feval=lgb_sk_f1,
#                  fold_to_start=fold_to_start,
#                  fold_to_stop=fold_to_stop,
#     #              targetenc=True,
#     #              to_enc_cols= toenccols,
# #                  likely_card_ids=[]
#                 )
#     oof_list +=[oof]
#     predictions_list +=[predictions]
#     np.savetxt('test_'+str(randomno)+'.csv',predictions,delimiter=',')
#     np.savetxt('oof_'+str(randomno)+'.csv',oof,delimiter=',')

In [130]:
# for i,oof_cur in enumerate(oof_list):
#     opt_cutoff, f1score = get_opt_cutoff_prec(target, oof_cur)
#     print()
#     print('Ensemble:',i)
#     opt_cutoff, f1score = get_opt_cutoff_prec(target, oof_cur)
#     print("Current Ensemble oof score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
#     print("Current Ensemble oof AUC score: {0} ".format(roc_auc_score(target, oof_cur)))

In [131]:
# predictions = np.mean(np.array(predictions_list),axis=0)
# oof = np.mean(np.array(oof_list),axis=0)

# opt_cutoff, f1score = get_opt_cutoff_prec(target, oof)
# print("Ensemble oof score f1 score: {0} cutoff:{1}".format(f1score,opt_cutoff))
# print("Ensemble oof AUC score: {0} ".format(roc_auc_score(target, oof)))

In [132]:
# fold_importance_df['importance_ratio'] = 100 * fold_importance_df['importance'] / fold_importance_df['importance'].sum()
# fold_importance_df['gain_ratio'] = 100 * fold_importance_df['gain'] / fold_importance_df['gain'].sum()
# fold_importance_df=fold_importance_df.sort_values(by="importance", ascending=False)
# fold_importance_df.to_csv('feature_importance.csv')
# fold_importance_df

In [133]:
# fold_importance_df=fold_importance_df.sort_values(by="gain", ascending=False)
# fold_importance_df

In [134]:
# import os
# os.listdir( '../input/elo-output/')

In [135]:
# Path = '../input/elo-output/'
# ooflist=[]
# testlist=[]
# # endpoints =[2,5,7]
# endpoints =[552,1104,1656,2206]

# oof = np.full(df_train.shape[0],0,dtype='float')
# predictions = np.full(df_test.shape[0],0,dtype='float')

# for val in endpoints:
#     print('cur end point:',val)
# #     cur_oof=np.loadtxt(Path + 'oof_'+ str(val) +'.csv',delimiter=',')
#     cur_oof=np.loadtxt(Path + 'oof_hs_'+ str(val) +'.csv',delimiter=',')
#     print(cur_oof[cur_oof!=0].shape)
# #     ooflist += [cur_oof]
#     oof += cur_oof
# #     cur_test_preds=np.loadtxt(Path + 'test_preds_'+ str(val) +'.csv',delimiter=',')
#     cur_test_preds=np.loadtxt(Path + 'test_preds_hs_'+ str(val) +'.csv',delimiter=',')
# #     testlist += [cur_test_preds]
#     print(cur_test_preds[cur_test_preds!=0].shape)
#     predictions +=cur_test_preds

In [136]:
# print(oof[oof>1].shape)
# print(predictions[predictions>1].shape)

In [137]:
# print(oof.mean())
# print(predictions.mean())

In [138]:
# print(target[target==0].shape[0] / (target[target==0].shape[0] + target[target==1].shape[0]))
# target[target==1].shape[0] / (target[target==0].shape[0] + target[target==1].shape[0])


In [139]:
# print(f1_score(oof,target))
# print('conf matrix:',confusion_matrix(target,oof))

In [140]:
computef1scoreandconfmatrix(target,oof)

# permute sel feats
# opt_cutoff: 0.11941022019862527
# f1 score: 0.24567541302235177
# conf matrix: [[197404   2306]
#  [  1576    631]]
# (0.11941022019862527, 0.24567541302235177, array([0, 0, 0, ..., 0, 0, 0]))


# with new feats sum and mean, cat raw
# cur opt_cutoff: 0.11131955523682135
# cur f1 score: 0.23903508771929824
# cur conf matrix: [[197099   2611]
#  [  1554    653]]

# cur opt_cutoff: 0.09891675917584493
# cur f1 score: 0.2400388726919339
# cur conf matrix: [[196484   3226]
#  [  1467    740]]

# with new feats combined rank_diff, rank indl cols, category_3_A, month_lag dummies sum
# opt_cutoff: 0.11965149342035582
# f1 score: 0.24696197569580555
# conf matrix: [[197445   2265]
#  [  1578    629]]

# with new feats rank_diff, rank indl cols, category_3_A, month_lag dummies sum
# opt_cutoff: 0.11363498063696807
# f1 score: 0.24554158062699455
# conf matrix: [[197244   2466]
#  [  1554    653]]

# with new feats rank_diff, category_3_A, month_lag dummies sum
# valid scores: [0.043260558903229475, 0.044006556599281085, 0.04365489765477539, 0.04395118265001193, 0.04531144654168368]
# CV score: 0.04404 
# opt_cutoff: 0.10800507734665102
# f1 score: 0.2432909604519774
# conf matrix: [[196942   2768]
#  [  1519    688]]

# with new feats category_3_A, month_lag dummies sum
# opt_cutoff: 0.10907925221938841
# f1 score: 0.2447227191413238
# conf matrix: [[197011   2699]
#  [  1524    683]]

# with new feats category_3_A, month_lag dummies mean
# opt_cutoff: 0.12271163161550341
# f1 score: 0.24064386317907444
# conf matrix: [[197545   2165]
#  [  1610    597]]

opt_cutoff: 0.05477038535050867
f1 score: 0.25049780963759455
conf matrix: [[197524   2186]
 [  1579    628]]



invalid value encountered in true_divide



(0.05477038535050867, 0.25049780963759455, array([0, 0, 0, ..., 0, 0, 0]))

In [141]:
# from sklearn.metrics import f1_score
# f1_score(target,pred_labels)

In [142]:
# cutoff=0.0175
# conf matrix: [[175815  23895]
#  [   553   1654]]

# cutoff=0.0162
# conf matrix: [[174197  25513]
#  [   528   1679]]

# cutoff=0.0437
# conf matrix: [[189851   9859]
#  [   972   1235]]

# cutoff=0.0003
# conf matrix: [[ 26966 172744]
#  [     0   2207]]

# cutoff=0.001
# conf matrix: [[ 72168 127542]
#  [    20   2187]]


In [143]:
pred_labels = convert_probtolabels(oof,cutoff=0.0165) #0.0462
print('conf matrix:',confusion_matrix(target,pred_labels))

conf matrix: [[191376   8334]
 [  1073   1134]]


In [144]:
# factored_cutoff = 0.0062
# pred_labels = convert_probtolabels(oof,cutoff=factored_cutoff) #0.01637
# print('conf matrix:',confusion_matrix(target,pred_labels))

In [145]:
# # In case missing some predictable outlier, we choose top 25000 with highest outliers likelyhood.
# df_outlier_prob_oof[df_outlier_prob_oof['target']>factored_cutoff].shape
# outlier_id_oof = pd.DataFrame(df_outlier_prob_oof[df_outlier_prob_oof['target']>=factored_cutoff]['card_id'])
# print(outlier_id_oof.shape)

# model_full_oof = pd.concat([model_without_outliers_oof,train_outlier_preds_df])
# print(model_full_oof.shape)

# model_full_oof.loc[model_full_oof["card_id"].isin(outlier_id_oof["card_id"].values), "target"]\
#     = best_oof[best_oof["card_id"].isin(outlier_id_oof["card_id"].values)]["target"]

# # mask = model_full_oof['card_id'].isin(df_train['card_id'])
# model_full_oof.sort_index(inplace=True)
# print("CV score: {:<8.5f}".format(mean_squared_error(model_full_oof['target'], target_reg)**0.5))

In [146]:
# cols = (fold_importance_df[["feature", "importance"]]
#         .groupby("feature")
#         .mean()
#         .sort_values(by="importance", ascending=False)[:1000].index)

# best_features = fold_importance_df.loc[fold_importance_df.feature.isin(cols)]
# best_features=best_features.sort_values(by="importance", ascending=False)
# best_features.to_csv('best_features.csv')
# print(best_features['importance'].sum())
# best_features['importance_ratio'] = best_features['importance'] / best_features['importance'].sum()
# print(best_features[:100])


In [147]:
### 'target' is the probability of whether an observation is an outlier
df_outlier_prob = pd.DataFrame({"card_id":df_test["card_id"].values})
df_outlier_prob["target"] = predictions
df_outlier_prob.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.018787
1,C_ID_130fd0cbdd,0.006526
2,C_ID_b709037bc5,0.00756
3,C_ID_d27d835a9f,0.006387
4,C_ID_2b5e3df5c2,0.006575


In [148]:
### OOF 'target' is the probability of whether an observation is an outlier
df_outlier_prob_oof = pd.DataFrame({"card_id":df_train["card_id"].values})
df_outlier_prob_oof["target"] = oof
df_outlier_prob_oof.head()

Unnamed: 0,card_id,target
0,C_ID_92a2005557,0.006702
1,C_ID_3d0044924f,0.006826
2,C_ID_d639edf6cd,0.006384
3,C_ID_186d6a6901,0.006778
4,C_ID_cdbd2c0db2,0.00611


# Part 3 Combining Submission:
So far so good !
We now have three dataset:

1. Best Submission
2. Prediction Using Model Without Outliers
3. Probability of Outliers In Test set


In [149]:
# if the test set has the same ratio of outliers as training set, 
# then the numbuer of outliers in test is about: (1.06% outliers in training set)
123623*0.0106

1310.4038

In [150]:
opt_cutoff, f1_score = get_opt_cutoff_prec(target,df_outlier_prob_oof['target'])
print('opt_cutoff:',opt_cutoff)
print('f1 score:',f1_score)

opt_cutoff: 0.05477038535050867
f1 score: 0.25049780963759455



invalid value encountered in true_divide



In [151]:
# factored_cutoff = opt_cutoff/8
# factored_cutoff = 0.01 # 0.000301 #0.001
# factored_cutoff = 0.0071

In [152]:
# factor = 8
factorrange= np.linspace(1,20, num=2000, endpoint=False)
for factor in factorrange:
    factored_cutoff = opt_cutoff/factor
    predicted_card_ids = df_outlier_prob_oof[df_outlier_prob_oof['target']>=factored_cutoff]['card_id'].values
    outlier_card_ids = df_train[~mask_without_outlier]['card_id'].values
    leftout_outliers = len(set(outlier_card_ids).difference(set(predicted_card_ids)))
    if leftout_outliers <=528:
        break
        
print('factored cutoff:',factored_cutoff)
print(df_outlier_prob_oof[df_outlier_prob_oof['target']>=factored_cutoff]['card_id'].shape)
print(leftout_outliers)

factored cutoff: 0.00871862230985493
(26584,)
526


In [153]:
# if leftout_outliers >400:
#     factorrange= np.linspace(20,30, num=1000, endpoint=False)
#     for factor in factorrange:
#         factored_cutoff = opt_cutoff/factor
#         predicted_card_ids = df_outlier_prob_oof[df_outlier_prob_oof['target']>=factored_cutoff]['card_id'].values
#         outlier_card_ids = df_train[~mask_without_outlier]['card_id'].values
#         leftout_outliers = len(set(outlier_card_ids).difference(set(predicted_card_ids)))
#         if leftout_outliers <=528:
#             break

#     print('factored cutoff:',factored_cutoff)
#     print(df_outlier_prob_oof[df_outlier_prob_oof['target']>=factored_cutoff]['card_id'].shape)
#     print(leftout_outliers)

OOF CV Check

In [154]:
# In case missing some predictable outlier, we choose top 25000 with highest outliers likelyhood.
# df_outlier_prob_oof[df_outlier_prob_oof['target']>factored_cutoff].shape
outlier_id_oof = pd.DataFrame(df_outlier_prob_oof[df_outlier_prob_oof['target']>=factored_cutoff]['card_id'])

# df_outlier_prob_oof.sort_values('target',ascending=False,inplace=True)
# outlier_id_oof=pd.DataFrame(df_outlier_prob_oof['card_id'][0:25000])
print(outlier_id_oof.shape)

(26584, 1)


Run Regression of Best Submission with full train set

In [155]:
# param = {'objective':'regression',
#          'num_leaves': 31,
#          'min_data_in_leaf': 25,
#          'max_depth': 7,
#          'learning_rate': 0.01,
#          'lambda_l1':0.13,
#          "boosting": "gbdt",
#          "feature_fraction":0.85,
#          'bagging_freq':8,
#          "bagging_fraction": 0.9 ,
#          "metric": 'rmse',
#          "verbosity": -1,
#          "random_state": 2333,
#          'n_estimators': 10000,
#          'n_jobs' :-1
#         }

In [156]:
param = {'colsample_bytree': 0.7370323842138731,
   'min_child_samples': 30,
   'num_leaves': 73,
   'reg_alpha': 0.3016168759062065,
   'reg_lambda': 0.339362990388993,
   'subsample': 0.9072420448274023,
   'subsample_for_bin': 80000,
   'learning_rate': 0.01,
   'boosting': 'gbdt',
   'bagging_seed': 2018,
   'min_data_in_bin': 100,
   'bagging_freq': 2,
   'n_estimators': 10000,
   'objective': 'regression',
   'metric': 'rmse',
   'random_state': 2333,
   'max_depth': 7}

In [157]:
%%time
fold_importance_df,best_predictions,best_oof_preds,train_outlier_preds,dummy_overall_imp_df,dummy_overall_sel_feats\
    = runlgb(4590,False,df_train,df_test,param,overall_sel_feats,target_reg,
             lgb_fit, lgb_predict,lgb_getbestscore,
            fold_feats=True)


fold n°0
y_train shape: (201917,)
val shape: (40384, 249)
y_val shape: (40384,)
tr shape: (161533, 250)
val shape: (40384, 250)
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 3.70533
[200]	valid_0's rmse: 3.66476
[300]	valid_0's rmse: 3.65079
[400]	valid_0's rmse: 3.64598
[500]	valid_0's rmse: 3.64366
[600]	valid_0's rmse: 3.64189
[700]	valid_0's rmse: 3.64134
[800]	valid_0's rmse: 3.64062
[900]	valid_0's rmse: 3.64045
[1000]	valid_0's rmse: 3.64083
Early stopping, best iteration is:
[840]	valid_0's rmse: 3.64033

fold n°1
y_train shape: (201917,)
val shape: (40384, 249)
y_val shape: (40384,)
tr shape: (161533, 250)
val shape: (40384, 250)
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 3.69924
[200]	valid_0's rmse: 3.66356
[300]	valid_0's rmse: 3.65342
[400]	valid_0's rmse: 3.64902
[500]	valid_0's rmse: 3.64704
[600]	valid_0's rmse: 3.64626
[700]	valid_0's rmse: 3.64536
[800]	valid_0's rmse: 3.64473
[900]	vali

In [158]:
np.savetxt('oof_fulltrain_reg.csv',best_oof_preds,delimiter=',')
np.savetxt('predictions_fulltrain_reg.csv',best_predictions,delimiter=',')

In [159]:
best_oof_preds_prev = np.loadtxt('../input/elo-output/LGB_targetenc_card_smoothing.npy')
best_submission_prev = pd.read_csv('../input/elo-output/submit_targetenc_nullreplacecategory.csv')

In [160]:
best_oof_ens = (best_oof_preds_prev + best_oof_preds) / 2
best_predictions_ens =(best_submission_prev['target'].values + best_predictions) / 2
score = mean_squared_error(target_reg,best_oof_ens)**0.5
print('ens mean score:',score)

ens mean score: 3.642772218408375


In [161]:
np.savetxt('oof_fulltrain_reg_ens.csv',best_oof_ens,delimiter=',')
np.savetxt('predictions_fulltrain_reg_ens.csv',best_predictions_ens,delimiter=',')

In [162]:
best_oof = pd.DataFrame()
best_oof['card_id'] = df_train['card_id']
best_oof['target'] = best_oof_ens

best_submission = pd.DataFrame()
best_submission['card_id'] = df_test['card_id']
best_submission['target'] = best_predictions_ens

In [163]:
# best_oof_preds = np.loadtxt('../input/elo-output/LGB_targetenc_card_smoothing.npy')
# best_oof = pd.DataFrame()
# best_oof['card_id'] = df_train['card_id']
# best_oof['target'] = best_oof_preds
# print(best_oof.shape)

In [164]:
# model_full_oof = pd.concat([model_without_outliers_oof,train_outlier_preds_df])
# print(model_full_oof.shape)

In [165]:
model_full_oof.loc[model_full_oof["card_id"].isin(outlier_id_oof["card_id"].values), "target"]\
    = best_oof[best_oof["card_id"].isin(outlier_id_oof["card_id"].values)]["target"]

In [166]:
model_full_oof['target'].head()

0   -0.202817
1    0.273385
2    0.571852
3    0.250635
4   -0.076238
Name: target, dtype: float64

In [167]:
# mask = model_full_oof['card_id'].isin(df_train['card_id'])
model_full_oof.sort_index(inplace=True)
print("CV score: {:<8.5f}".format(mean_squared_error(model_full_oof['target'], target_reg)**0.5))

CV score: 3.64096 


In [168]:
model_full_oof.to_csv('Meta_Ensembled_Tuned.csv')

In [169]:
print("CV score: {:<8.5f}".format(mean_squared_error(best_oof_preds, target_reg)**0.5))

CV score: 3.64479 


Test new submission

In [170]:
# In case missing some predictable outlier, we choose top 25000 with highest outliers likelyhood.
# df_outlier_prob[df_outlier_prob['target']>factored_cutoff].shape
# df_outlier_prob.sort_values('target',ascending=False,inplace=True)
# outlier_id=pd.DataFrame(df_outlier_prob['card_id'][0:25000])
outlier_id = pd.DataFrame(df_outlier_prob[df_outlier_prob['target']>=factored_cutoff]['card_id'])
print(outlier_id.shape)

(16600, 1)


In [171]:
# best_submission = pd.read_csv('../input/elo-output/submit_targetenc_nullreplacecategory.csv')

In [172]:
model_without_outliers.loc[model_without_outliers["card_id"].isin(outlier_id["card_id"].values), "target"]\
    = best_submission[best_submission["card_id"].isin(outlier_id["card_id"].values)]["target"]

In [173]:
model_without_outliers.to_csv("submission_ensembled_Tuned.csv", index=False)