Feature Engineering (FE)
===
The "magic" is the hidden structure behind the data; exploiting it helps to tell real samples from fake ones and improves the classification.

1. [Classify which samples are real and which are fake](#Magic-1,-divide-real-and-fake-test-samples)
- [Calculate Counts for each Unique Value](#Magic-frequency-counts)
- [Concatenate outputs](#Blend-Models)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm,tqdm_notebook

import seaborn as sns
%matplotlib inline

In [None]:
import random,gc
import lightgbm as lgb
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold,train_test_split,cross_validate
import os,datetime

import warnings,random
warnings.filterwarnings('ignore')

In [None]:
# ANSI escape sequences used to colour text printed by later cells.
# Original note: "Reference the bottom" (presumably points to a reference
# list at the end of the notebook; none is visible in this part).

W  = '\033[0m'  # reset all attributes (back to the terminal default)
R  = '\033[31m' # red foreground
G  = '\033[32m' # green foreground
O  = '\033[1;33m' # bold yellow foreground (used here as "orange")
B  = '\033[34m' # blue foreground
P  = '\033[35m' # magenta foreground (used here as "purple")

T =  '\033[1;33;47m' # title style: bold yellow text on a white background

In [None]:
# Load the train and test tables from the local input/ directory.
train_df = pd.read_csv('input/train.csv')
test_df = pd.read_csv('input/test.csv')

# Every column from the third one onward is treated as a feature
# (presumably the first two are ID_code and target, the columns dropped later on).
features=train_df.columns[2:]

In [None]:
# Run a garbage-collection pass (presumably to release memory after loading the CSV files).
import gc 
gc.collect()

Magics
===

Magic 1, divide real and fake test samples
---

```
features   f0, f1, f2, ...
  data
   d0       
   d1      ⬇︎
   d2
   :
``` 
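1. Only rows that own at least one unique value (a value that occurs exactly once in its feature column) can be real; every other row was, with high probability, built from duplicated values for the test set. Find these values per column with ```np.unique()``` and mark them in ```unique_count = [row × column]```.
- `U = np.sum(unique_count, axis=1)` gives the number of unique values owned by each row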
```
for instance, unique_count = [ 0, 1, 0, 1             [ 2
                               0, 0, 0, 0    ➠   U =    0
                               0, 1, 1, 1               3
                               0, 0, 0, 0               0
                               ...]                   ...]
```
- `np.argwhere(U > 0)[:, 0]` gives the indices of all rows that own at least one unique value.
```
   [ 2,
     0,     ➠ [0,2,...]
     3,
     0,
     ...]    
```


In [None]:
# Scratch check of the row-sum / argwhere idiom used by unique_val_found:
# the row sums are [6, 12], so only row index 1 passes the "> 6" test.
np.argwhere(np.sum([[1,2,3],[3,4,5]],axis=1)>6)
np.sum([[1,2,3],[3,4,5]],axis=1)

In [None]:
def unique_val_found(data_val):
    """
    Split the rows of ``data_val`` into "real" and "fake" samples.

    A row is reported as real when at least one of its values occurs exactly
    once in the corresponding column; every other row is reported as fake.

    input numpy array of data: data_val (2-D, rows = samples, columns = features)
    output: real_ind, fake_ind (1-D arrays of row indices)
    """
    n_rows = data_val.shape[0]
    # One flag per row is enough: we only need to know whether a row owns
    # *any* unique value, not in which column it was found.
    has_unique = np.zeros(n_rows, dtype=bool)
    for feature in tqdm_notebook(range(data_val.shape[1])):
        # index_ holds the row of the first occurrence of each distinct value;
        # for values seen exactly once that is the only row carrying the value.
        _, index_, count_ = np.unique(data_val[:, feature], return_counts=True, return_index=True)
        has_unique[index_[count_ == 1]] = True

    # Samples which have unique values are real, the others are fake
    real_ind = np.flatnonzero(has_unique)
    fake_ind = np.flatnonzero(~has_unique)
    r_n=len(real_ind)
    f_n=len(fake_ind)
    # guard the ratio so an empty input does not raise ZeroDivisionError
    r_ratio=r_n/n_rows if n_rows else 0.0
    print("no. of real samples: %s\nno. of fake samples: %s\nratio of real samples: %s" %(r_n,f_n,r_ratio))
    return real_ind,fake_ind
    

In [None]:
# Feature matrix of the training set as a numpy array (ID_code and target columns removed).
trn_=train_df.drop(['ID_code','target'], axis=1).values

# Row indices of the training samples reported as real / fake by unique_val_found.
trn_real_ind,trn_fake_ind=unique_val_found(trn_)

In [None]:
# Feature matrix of the test set as a numpy array (ID_code column removed).
te_=test_df.drop(['ID_code'], axis=1).values

# Row indices of the test samples reported as real / fake by unique_val_found.
te_real_ind,te_fake_ind=unique_val_found(te_)

Magic frequency counts
---
After identifying the real samples in the test set as above, drop the fake ones and count how often each value occurs over the training set plus the real test samples only.

In [None]:
# Feature columns: everything in train_df from the third column onward.
features = train_df.columns[2:]

# Independent copies, so that columns added later do not touch train_df / test_df.
X_train=train_df[features].copy()
y_train=train_df.target.values
X_test = test_df[features].copy()
# Only the test rows whose positions are listed in te_real_ind (the "real" samples).
X_real_test = X_test.iloc[te_real_ind]

In [None]:
# Sanity check: shapes of the training features, the labels, the full test set and its real-only subset.
X_train.shape, y_train.shape, X_test.shape, X_real_test.shape

1. `df.value_counts()` gives the value (tmp) with frequency as follows:
```
  feature     frequency
     a           n1
     b           n2
     ...
``` 
2. `df.map(tmp)` maps each value to its frequency; the resulting counts are clipped to a maximum of 6.

In [None]:
# Names of the frequency columns created below, in feature order.
count_cols = []

# For each of the first 200 feature columns, add a "<feature>_count" column
# to X_train and X_test in place.
for c in tqdm_notebook(features[:200]):
    count_col = c+'_count'
    # Frequency of every value, counted over the training rows plus the real test rows only.
    tmp = pd.concat((X_train[c], X_real_test[c])).value_counts()
    # mean of target is small 
    X_train[count_col] = X_train[c].map(tmp).clip(0, 6)
    # Values missing from the frequency table map to NaN and are filled with 1.
    X_test[count_col] = X_test[c].map(tmp).clip(0, 6).fillna(1)
    count_cols.append(count_col)

In [None]:
def freq_count(train,test,real_test,cut=6):
    """
    Append a clipped value-frequency column for every column of ``train``.

    input: train, test, real_test, cut (optional upper bound for the counts)
    output: train, test (both modified in place, each gaining one
            "<column>_count" column per original column of train),
            list with the names of the new columns

    Frequencies are counted over ``train`` and ``real_test`` together.
    Values of ``test`` that are absent from that table receive a count of 1.
    """
    base_columns = train.columns
    new_columns = []
    for name in tqdm_notebook(base_columns):
        freq_name = name + '_count'
        pooled = pd.concat((train[name], real_test[name]))
        frequency = pooled.value_counts()

        train_freq = train[name].map(frequency)
        train[freq_name] = train_freq.clip(0, cut)

        test_freq = test[name].map(frequency).clip(0, cut)
        test[freq_name] = test_freq.fillna(1)

        new_columns.append(freq_name)

    n_before = len(base_columns)
    n_after = len(train.columns)
    print("%s features before, %s features after..." % (n_before, n_after))
    return train, test, new_columns

In [None]:
# freq_count adds one "<col>_count" column for every column of the frame it
# receives and looks each column up in real_test. Hand it the raw features
# only: any "*_count" column already present on X_train does not exist in
# X_real_test and would raise a KeyError.
XX_train,XX_test,cols_new=freq_count(X_train[features].copy(),X_test[features].copy(),X_real_test)
# Keep X_train / X_test pointing at the augmented frames, which is what the
# in-place call used to leave behind for the cells below.
X_train,X_test=XX_train,XX_test

Test One
---
Revisit LightGBM training:

In [None]:
# LightGBM settings used by the training cells that pass `param`.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.23+0.77,
    'bootstrap' :  True,
    'bagging_with_replacement' : True,
    'boost_from_average':'false',
    'boost': 'gbdt',#'dart',
    # The var_count have to be considered together, turn the fraction BE 1!
    'feature_fraction': 0.04+0.06+0.9,
    'learning_rate': 0.006,
    'max_depth': -1,
    # 'num_leaves' used to appear twice in this literal (64 here, 13-10 further
    # down). A dict literal keeps only the last value, so 3 is what was
    # actually in effect; the key is now listed once with that value.
    'num_leaves': 13-10,
    'metric':'auc',#'binary_logloss',
    'lambda_l1':0.01,
    'lambda_l2':0.01*100,
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_threads': 8,
    'tree_learner': 'serial',
    #'max_bin': 40,
    'objective': 'binary',
    'is_unbalance': 'true',
    'verbosity': 1
}


In [None]:
# with Magics
num_round = 1000000
# every column of X_train: the raw features and whatever count columns were added to it
features_m=X_train.columns.values
target=train_df['target']
# random.randint needs integer bounds; a float such as 1e5 is rejected by Python 3.12+
train, test = train_test_split(X_train, test_size=0.2,random_state=random.randint(1,10**5))

# Train on features_m, not on `features`: the latter would leave out the
# count columns and make this run identical to one without the magic features.
# target.iloc relies on X_train keeping its default 0..n-1 row index.
trn_data=lgb.Dataset(train[features_m],label=target.iloc[train.index])
val_data=lgb.Dataset(test[features_m],label=target.iloc[test.index])

clf=lgb.train(param, trn_data,num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, 
              early_stopping_rounds = 500)

Blend Models
---

In [None]:
def fast_auc(y_true, y_prob):
    """
    Compute the ROC AUC of binary labels ranked by predicted score.

    The result is the fraction of (negative, positive) pairs in which the
    positive sample is ranked after the negative one when sorting by
    ``y_prob`` in ascending order.

    y_true: array-like of 0/1 labels.
    y_prob: array-like of scores, same length as ``y_true``.
    returns: float in [0, 1], or NaN when ``y_true`` is empty or contains a
             single class (the AUC is undefined in that case).

    Tied scores are not averaged: samples with equal scores keep their input
    order (stable sort), so the result is deterministic.
    """
    labels = np.asarray(y_true, dtype=float)
    order = np.argsort(y_prob, kind="stable")
    labels = labels[order]

    n = len(labels)
    # negatives_seen[i] = number of negatives ranked at or before position i
    negatives_seen = np.cumsum(1.0 - labels)
    n_neg = negatives_seen[-1] if n else 0.0
    n_pos = n - n_neg
    if n_neg == 0 or n_pos == 0:
        return float("nan")

    # each positive contributes the number of negatives ranked before it
    correctly_ordered_pairs = np.dot(labels, negatives_seen)
    return float(correctly_ordered_pairs / (n_neg * n_pos))

In [None]:
def individual_ml_model(train,target,test,param="",fname='model'):
    """
    Train one model with ``model_lgb`` and return its predictions for ``test``.

    train:  DataFrame with the training features.
    target: labels for the rows of ``train``.
    test:   DataFrame with the same feature columns as ``train``.
    param:  LightGBM parameters; an empty value lets ``model_lgb`` use its defaults.
    fname:  sub-directory of ``submissions/grouped/`` that receives the output.

    returns: whatever ``model_lgb`` returns (the test predictions).
    """
    # make sure the output directory exists; exist_ok avoids the
    # check-then-create race of os.path.exists followed by os.makedirs
    directory="submissions/grouped/"+fname
    os.makedirs(directory, exist_ok=True)

    # log when the run started and what it is working on
    now = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    print('started at:', now)
    print("train set shape: ",train.shape, ", test set shape: ",test.shape)
    print("Features: ", train.columns.values)

    # hand copies to the trainer so the caller's frames stay untouched
    x_train = train.copy()
    x_test = test.copy()
    return model_lgb(x_train,target,x_test,param=param,directory=directory)

In [None]:
def model_lgb(train,target,test,param="",directory="submissions/grouped",name=None):
    """
    Train a LightGBM binary classifier on ``train`` and predict ``test``.

    The rows of ``train`` are split 80/20 at random into a fitting part and a
    validation part used for early stopping. The predictions for ``test`` are
    written to ``<directory>/<name>.csv`` in the layout of
    ``input/sample_submission.csv`` and also returned.

    train:     DataFrame of training features with a default 0..n-1 row index
               (``target.iloc`` is indexed with the row labels of the split).
    target:    pandas Series with the labels of ``train``.
    test:      DataFrame with the same feature columns as ``train``.
    param:     LightGBM parameter dict; an empty value selects the defaults below.
    directory: directory that receives the submission file.
    name:      base name of the submission file. Defaults to the first column
               of ``train``. The name used to be read from a global loop
               variable ``c``, which made the function fail or mislabel its
               output when called from anywhere else.

    returns: array with the predictions for ``test``.
    """
    if not param:
        param = {
            'bagging_freq': 5,
            'bagging_fraction': 1,
            'bootstrap' :  True,
            'bagging_with_replacement' : True,
            'boost_from_average':'false',
            'boost': 'gbdt',
            # The var_count have to be considered together, turn the fraction BE 1!
            'feature_fraction': 1,
            'learning_rate': 0.01,
            'max_depth': 2,
            'num_leaves': 3,
            'metric':'binary_logloss',
            'lambda_l1':0.01,
            'lambda_l2':2,
            'min_data_in_leaf': 80,
            'min_sum_hessian_in_leaf': 10.0,
            'num_threads': 8,
            'tree_learner': 'serial',
            #'max_bin': 40,
            'objective': 'binary',
            'verbosity': 1,
        }
    if name is None:
        name = str(train.columns[0])

    num_round = 1000000
    features=train.columns.values

    # random.randint needs integer bounds; a float such as 1e5 is rejected by Python 3.12+
    trn, val = train_test_split(train, test_size=0.2,random_state=random.randint(1,10**5))

    trn_data=lgb.Dataset(trn[features],label=target.iloc[trn.index])
    val_data=lgb.Dataset(val[features],label=target.iloc[val.index])

    clf=lgb.train(param, trn_data,num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, 
                 early_stopping_rounds = 500)

    # AUC over the full training frame (fitting and validation rows together)
    train_pred=clf.predict(train[features])
    print('%s train auc: %0.5f' % (features,fast_auc(target, train_pred)), end=' ')

    test_pred=clf.predict(test[features])
    sub=pd.read_csv("input/sample_submission.csv")
    sub['target']=test_pred
    # direct callers may not have created the directory yet
    os.makedirs(directory, exist_ok=True)
    saved_fold=directory+"/"+name+".csv"
    sub.to_csv(saved_fold,index=False)

    print("\n\n   *** train/prediction end, output saved as ",saved_fold,' ...' )
    return test_pred

In [None]:
# Train one model per (raw feature, count feature) pair and collect the test
# predictions of all of them, one after the other, in the flat array `final`.
final=np.array([])
size=len(features)
# NOTE(review): keep the loop variable named `c` - model_lgb may read it as a
# global when it builds the name of its output file.
for c, count in zip(tqdm_notebook(features[:size]), count_cols[:size]):
    # coloured banner announcing the pair being trained
    print(T+R+"(%s,%s) Train/Test\n" %(c,count))
    print(W+"===")
    # two-column frames: the raw feature and its frequency count
    x_train_1 = X_train[[c, count]].copy()
    y_train = target
    x_test_1 = X_test[[c, count]].copy()
    f=individual_ml_model(x_train_1,y_train,x_test_1)
    # append this model's predictions to the end of the flat array
    final=np.append(final,f)
    print("\n ===\n" )

In [None]:
# Prepare the submit data
# One row per model, one column per test sample. -1 lets numpy derive the
# number of test samples instead of hard-coding 200000.
final=final.reshape(size,-1)
sub=pd.read_csv("input/sample_submission.csv")
# blend: plain average of the per-model predictions for each test sample
sub['target']=final.mean(axis=0)

sub.head()

In [None]:
# to_csv does not create missing directories, so make sure output/ exists first
os.makedirs("output", exist_ok=True)
sub.to_csv("output/late.csv",index=False)

In [None]:
# Baseline: train on the raw columns of train_df only.
num_round = 1000000
# Take the columns and labels from train_df. `train` is only assigned by the
# split below, so reading it here would pick up a stale frame from an earlier
# cell (or fail with NameError on a fresh kernel).
features=train_df.columns[2:]
target=train_df['target']
# random.randint needs integer bounds; a float such as 1e5 is rejected by Python 3.12+
train, test = train_test_split(train_df, test_size=0.33,random_state=random.randint(1,10**5))

trn_data=lgb.Dataset(train[features],label=train['target'])
val_data=lgb.Dataset(test[features],label=test['target'])

clf=lgb.train(param, trn_data,num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, 
              early_stopping_rounds = 1500)

In [None]:
# Alternative LightGBM settings, used by the training cell that passes `param_m`.
param_m = dict(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.01,
    num_leaves=3,
    feature_fraction=1,
    bagging_fraction=0.8,
    bagging_freq=1,
    # disabled: min_data_in_leaf=80
    # disabled: min_sum_hessian_in_leaf=10.0
    nthread=20,
    bin_construct_sample_cnt=1000000,
    max_depth=2,
    lambda_l2=2,
    metric='auc',
)

In [None]:
# with Magics
num_round = 1000000
# every column of X_train: the raw features and whatever count columns were added to it
features_m=X_train.columns.values
target=train_df['target']
# random.randint needs integer bounds; a float such as 1e5 is rejected by Python 3.12+
train, test = train_test_split(X_train, test_size=0.2,random_state=random.randint(1,10**5))

# target.iloc relies on X_train keeping its default 0..n-1 row index
trn_data=lgb.Dataset(train[features_m],label=target.iloc[train.index])
val_data=lgb.Dataset(test[features_m],label=target.iloc[test.index])

clf=lgb.train(param, trn_data,num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, 
              early_stopping_rounds = 500)

In [None]:
# with Magics, using the param_m settings and a 67/33 split
num_round = 1000000
# every column of X_train: the raw features and whatever count columns were added to it
features_m=X_train.columns.values
target=train_df['target']
# random.randint needs integer bounds; a float such as 1e5 is rejected by Python 3.12+
train, test = train_test_split(X_train, test_size=0.33,random_state=random.randint(1,10**5))

# target.iloc relies on X_train keeping its default 0..n-1 row index
trn_data=lgb.Dataset(train[features_m],label=target.iloc[train.index])
val_data=lgb.Dataset(test[features_m],label=target.iloc[test.index])

clf=lgb.train(param_m, trn_data,num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, 
              early_stopping_rounds = 500)

In [None]:
# Score every row of X_train with the most recently trained booster (`clf`).
train_pred=clf.predict(X_train[features_m])

In [None]:
# Plot at most 30 features of `clf`, using the 'split' importance type.
lgb.plot_importance(clf, max_num_features=30, importance_type='split',figsize=(12,10))