**Magic Explained**

- Remove Fake Samples from test cases
- combined train and test cases and Create count features for train and test
- Create Interaction features between original and count.

- Run augment LightGBM 

In [4]:
import numpy  as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from   tqdm   import tqdm_notebook as tqdm
import random
import os,sys,math
from   sklearn.model_selection import KFold, StratifiedKFold
from   sklearn.metrics import roc_auc_score
import lightgbm as lgb
sys.path.append('../')
import utils

In [None]:
def set_seed(number):
    random.seed(number)
    np.random.seed(number)
    return

In [None]:
set_seed(2019)

In [None]:
def load_data():
    
    train_path        = '../../data/input/input_pkl/train/'
    train             = utils.read_pickles(train_path)
    test_path         = '../../data/input/input_pkl/test/'
    test              = utils.read_pickles(test_path)
    
    return train,test

In [None]:
train,test = load_data()

100%|██████████| 5/5 [00:00<00:00, 15.83it/s]
100%|██████████| 5/5 [00:00<00:00, 21.23it/s]


In [None]:
def getUniquesamples(test):
    
    test.drop(['ID_code'], axis=1, inplace=True)
    test           = test.values
    
    unique_samples = []
    unique_count   = np.zeros_like(test)
    
    for feature in tqdm(range(test.shape[1])):
        _, index_, count_ = np.unique(test[:, feature], return_counts=True, return_index=True)
        unique_count[index_[count_ == 1], feature] += 1
        
    # Samples which have unique values are real the others are fake
    real_samples_indexes      = np.argwhere(np.sum(unique_count, axis=1) > 0)[:, 0]
    synthetic_samples_indexes = np.argwhere(np.sum(unique_count, axis=1) == 0)[:, 0]
    
    return real_samples_indexes,synthetic_samples_indexes

In [None]:
real_samples_indexes,synthetic_samples_indexes = getUniquesamples(test.copy())

HBox(children=(IntProgress(value=0, max=200), HTML(value='')))




In [None]:
def findmeancounts(bins,values,x):
    
    for i in range(len(bins)):
        
        if x < bins[i]:
            return values[i]
        
    return values[-1]

In [None]:
def FE1(train,test): 
    
    feats = ["var_"+str(i) for i in range(200)] 
    df    = pd.concat([train,test.ix[real_samples_indexes]])
    
    for feat in feats:
        
        temp                = df[feat].value_counts(dropna = True)
        
        #Clamping the unique counts 
        train[feat+"vc"] = train[feat].map(temp).map(lambda x:min(10,x)).astype(np.uint8)
        test[feat+"vc"]  = test[feat].map(temp).map(lambda x:min(10,x)).astype(np.uint8)
        
        #Create interaction between Feature and Count Feature based on Count 1
        train[feat+"sum"]   = ((train[feat] - df[feat].mean())*train[feat+"vc"].map(lambda x:int(x>1))).astype(np.float32)
        test[feat+"sum"]    = ((test[feat] - df[feat].mean())*test[feat+"vc"].map(lambda x:int(x>1))).astype(np.float32)
        
        #Create interaction between Feature and Count Feature based on Count 2
        train[feat+"sum2"]  = ((train[feat])*train[feat+"vc"].map(lambda x:int(x>2))).astype(np.float32)
        test[feat+"sum2"]   = ((test[feat])*test[feat+"vc"].map(lambda x:int(x>2))).astype(np.float32)
        
        #Create interaction between Feature and Count Feature based on Count 4
        train[feat+"sum3"] = ((train[feat])*train[feat+"vc"].map(lambda x:int(x>4))).astype(np.float32) 
        test[feat+"sum3"] = ((test[feat])*test[feat+"vc"].map(lambda x:int(x>4))).astype(np.float32) 
        
    return train,test

In [None]:
train,test = FE1(train,test)

In [None]:
def augment(x,y,t=2):
    
    xs,xn = [],[]
    
    for i in range(t//2):
        mask = y==0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        featnum = x1.shape[1]//200 - 1
        for c in range(200):
            np.random.shuffle(ids)
            x1[:,[c] + [200 + featnum * c + idc for idc in range(featnum)]] = x1[ids][:,[c] + [200 + featnum * c + idc for idc in range(featnum)]]
        xn.append(x1)

    for i in range(t):
        mask = y>0
        x1 = x[mask].copy()
        ids = np.arange(x1.shape[0])
        featnum = x1.shape[1]//200 - 1
        for c in range(200):
            np.random.shuffle(ids)
            x1[:,[c] + [200 + featnum * c + idc for idc in range(1)]] = x1[ids][:,[c] + [200 + featnum * c + idc for idc in range(1)]]
        xs.append(x1)

    xs = np.vstack(xs)
    xn = np.vstack(xn)
    ys = np.ones(xs.shape[0])
    yn = np.zeros(xn.shape[0])
    x  = np.vstack([x,xs,xn])
    y  = np.concatenate([y,ys,yn])
    
    return x,y

In [None]:
features = [col for col in train.columns if col not in ['target', 'ID_code']]

In [None]:
lgb_params = {
                "objective" : "binary",
                "metric" : "auc",
                "boosting": 'gbdt',
                "max_depth" : -1,
                "num_leaves" : 15,
                "learning_rate" : 0.01,
                "bagging_freq": 5,
                "bagging_fraction" : 0.6,
                "feature_fraction" : 0.05,
                "min_data_in_leaf": 50,
                "min_sum_heassian_in_leaf": 10,
                "tree_learner": "serial",
                "boost_from_average": "false",
                "lambda_l1" : 1.,
                #     "lambda_l2" : 0.5,
                "bagging_seed" : 2007,
                "verbosity" : 1,
                "seed": 2007
}

In [None]:
skf                   = StratifiedKFold(n_splits=5, shuffle=True, random_state=2007)
oof                   = train[['ID_code', 'target']]
oof['predict']        = 0
predictions           = np.zeros((test.shape[0],5))
val_aucs              = []
feature_importance_df = pd.DataFrame()
features              = [col for col in train.columns if col not in ['target', 'ID_code']]
X_test                = test[features].values

In [None]:
for fold, (trn_idx, val_idx) in enumerate(skf.split(train, train['target'])):
    
    X_train, y_train = train.iloc[trn_idx][features], train.iloc[trn_idx]['target']
    X_valid, y_valid = train.iloc[val_idx][features], train.iloc[val_idx]['target']
    
    N                = 1
    p_valid,yp       = 0,0
    
    for i in range(N):
        
        X_t, y_t                   = augment(X_train.values, y_train.values)
        weights                    = np.array([0.8] * X_t.shape[0])
        weights[:X_train.shape[0]] = 1.0
        trn_data                   = lgb.Dataset(X_t, label=y_t, weight = weights)
        val_data                   = lgb.Dataset(X_valid, label=y_valid)
        evals_result               = {}
        lgb_clf                    = lgb.train(lgb_params,
                                               trn_data,
                                               100000,
                                               valid_sets            = [trn_data, val_data],
                                               early_stopping_rounds = 5000,
                                               verbose_eval          = 1000,
                                               evals_result          = evals_result)
        
        p_valid                    += lgb_clf.predict(X_valid)
        yp                         += lgb_clf.predict(X_test)
        
    fold_importance_df                  = pd.DataFrame()
    fold_importance_df["feature"]       = features
    fold_importance_df["importance"]    = lgb_clf.feature_importance()
    fold_importance_df["fold"]          = fold + 1
    feature_importance_df               = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    oof['predict'][val_idx]             = p_valid/N
    
    val_score = roc_auc_score(y_valid, p_valid)
    val_aucs.append(val_score)
    
    predictions[:,fold] = yp/N

Training until validation scores don't improve for 5000 rounds.
[1000]	training's auc: 0.952668	valid_1's auc: 0.856333
[2000]	training's auc: 0.963747	valid_1's auc: 0.878537
[3000]	training's auc: 0.96875	valid_1's auc: 0.887747
[4000]	training's auc: 0.971947	valid_1's auc: 0.892256
[5000]	training's auc: 0.974386	valid_1's auc: 0.894737
[6000]	training's auc: 0.97655	valid_1's auc: 0.89612
[7000]	training's auc: 0.978601	valid_1's auc: 0.896826
[8000]	training's auc: 0.980573	valid_1's auc: 0.897369
[9000]	training's auc: 0.982471	valid_1's auc: 0.897492
[10000]	training's auc: 0.984256	valid_1's auc: 0.897561
[11000]	training's auc: 0.985935	valid_1's auc: 0.897637
[12000]	training's auc: 0.987467	valid_1's auc: 0.897621
[13000]	training's auc: 0.988885	valid_1's auc: 0.897577
[14000]	training's auc: 0.990176	valid_1's auc: 0.897495
[15000]	training's auc: 0.991355	valid_1's auc: 0.897447
Early stopping, best iteration is:
[10925]	training's auc: 0.985813	valid_1's auc: 0.897675
T

In [None]:
mean_auc = np.mean(val_aucs)
std_auc = np.std(val_aucs)
all_auc = roc_auc_score(oof['target'], oof['predict'])

In [None]:
print("Mean auc: %.9f, std: %.9f. All auc: %.9f." % (mean_auc, std_auc, all_auc))

In [None]:
cols          = (feature_importance_df[["feature", "importance"]]
                 .groupby("feature")
                 .mean()
                 .sort_values(by="importance", ascending=False)[:1000].index)


best_features = feature_importance_df.loc[feature_importance_df.feature.isin(cols)]

In [None]:
##submission##
sub_df = pd.DataFrame({"ID_code":df_test["ID_code"].values})
sub_df["target"] = np.mean(predictions,axis = 1)
sub_df.to_csv("lgb_submission.csv", index=False)