# <a id='2'>Prepare for data analysis</a>  


## Load packages


In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

## Load data   

Let's set the path to the directory that contains the data files.

In [2]:
# Directory (relative to the notebook) that holds train.csv and test.csv.
# The trailing slash is required: file names are appended by plain concatenation.
PATH = "../../data/"

Let's load the train and test data files.

In [31]:

# Read both competition files from PATH; the order of the names matches the
# order of the variables on the left-hand side.
train_df, test_df = (
    pd.read_csv(PATH + file_name) for file_name in ("train.csv", "test.csv")
)

# <a id='3'>Data exploration</a>  

## <a id='31'>Check the data</a>  

Let's check the train and test set.

In [32]:
# (rows, columns) of the train frame followed by the test frame.
tuple(frame.shape for frame in (train_df, test_df))

((200000, 202), (200000, 201))

Both train and test data have 200,000 entries, with 202 and 201 columns respectively. 

Let's transform the data: every feature column in both the train and test sets is replaced by its square.

In [43]:

# Feature columns are everything except the row identifier and the label.
excluded_columns = ('ID_code', 'target')
features = [col for col in train_df.columns if col not in excluded_columns]

# Replace every feature by its square in both frames.
# NOTE: this overwrites train_df/test_df in place, so running this cell a
# second time squares the already-squared values again; reload the data first.
# Other transforms were sketched here as extra prefixed columns but are
# disabled: log ('l_'), square ('p2_'), and rounding to 2 or 1 decimals
# ('r2_', 'r1_').
for frame in (train_df, test_df):
    frame[features] = np.power(frame[features], 2)


Let's check how many features we have now.

In [44]:
# Report the number of columns in each frame.
n_train_columns = train_df.shape[1]
n_test_columns = test_df.shape[1]
print('Train and test columns: {} {}'.format(n_train_columns, n_test_columns))

Train and test columns: 202 201


# <a id='5'>Model</a>  

From the train columns list, we drop the ID and target to form the features list.

In [45]:
# Columns that must not be fed to the model: the row identifier and the label.
non_feature_columns = {'ID_code', 'target'}

# Model inputs, in the same order as they appear in the train frame.
features = [col for col in train_df.columns if col not in non_feature_columns]

# Binary label the model is trained to predict.
target = train_df['target']

We define the hyperparameters for the model.

In [46]:
# LightGBM training configuration, passed as-is to lgb.train().
param = dict(
    # Row subsampling: a fresh 40% bag of the rows every 5 iterations.
    bagging_freq=5,
    bagging_fraction=0.4,
    boost_from_average='false',
    boost='gbdt',              # 'boost' is LightGBM's alias for 'boosting'
    # Column subsampling: each tree sees 5% of the features.
    feature_fraction=0.05,
    learning_rate=0.01,
    max_depth=-1,              # -1 means no explicit depth limit
    metric='auc',
    # Leaf-level regularisation for small, conservative trees.
    min_data_in_leaf=80,
    min_sum_hessian_in_leaf=20.0,
    num_leaves=13,
    num_threads=8,
    tree_learner='serial',
    objective='binary',
    verbosity=1,
)

We run the model.

In [None]:
# 10-fold stratified cross-validation with LightGBM.
#
# Produces:
#   oof                   - out-of-fold prediction for every training row
#   predictions           - test-set prediction averaged over all folds
#   feature_importance_df - per-fold feature importances (long format)

# A random_state combined with shuffle=False is rejected with a ValueError by
# current scikit-learn releases (older ones silently ignored the seed), so the
# seed is dropped. The folds remain the same deterministic, unshuffled splits.
folds = StratifiedKFold(n_splits=10, shuffle=False)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
fold_importances = []

# Upper bound on boosting rounds; early stopping ends training long before it.
num_round = 1000000

# split() only needs the number of samples from X; a placeholder avoids
# materialising the whole mixed-dtype frame through train_df.values.
split_placeholder = np.zeros(len(train_df))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(split_placeholder, target.values)):
    print("Fold {}".format(fold_))
    val_features = train_df.iloc[val_idx][features]
    trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=target.iloc[val_idx])

    # The verbose_eval= and early_stopping_rounds= keyword arguments were
    # removed from lgb.train() in LightGBM 4.0; the callbacks below are the
    # supported equivalents. They are created fresh for every fold so no
    # early-stopping state is shared between folds.
    clf = lgb.train(
        param,
        trn_data,
        num_round,
        valid_sets=[trn_data, val_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=3000),
            lgb.log_evaluation(period=1000),
        ],
    )
    oof[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)

    # Collect per-fold importances and concatenate once after the loop
    # instead of growing a DataFrame on every iteration.
    fold_importances.append(pd.DataFrame({
        "Feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))

    # Each fold contributes an equal share to the averaged test prediction.
    predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

Fold 0
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.883832	valid_1's auc: 0.865008
[2000]	training's auc: 0.896072	valid_1's auc: 0.872486
[3000]	training's auc: 0.904088	valid_1's auc: 0.876518
[4000]	training's auc: 0.910344	valid_1's auc: 0.879212
[5000]	training's auc: 0.915609	valid_1's auc: 0.880612
[6000]	training's auc: 0.920348	valid_1's auc: 0.881699
[7000]	training's auc: 0.924816	valid_1's auc: 0.882412
[8000]	training's auc: 0.928949	valid_1's auc: 0.882806
[9000]	training's auc: 0.932845	valid_1's auc: 0.882991
[10000]	training's auc: 0.936564	valid_1's auc: 0.882923
[11000]	training's auc: 0.94014	valid_1's auc: 0.883038
[12000]	training's auc: 0.94352	valid_1's auc: 0.882993
[13000]	training's auc: 0.946742	valid_1's auc: 0.882694
[14000]	training's auc: 0.949763	valid_1's auc: 0.882579
Early stopping, best iteration is:
[11101]	training's auc: 0.940498	valid_1's auc: 0.883096


Let's check the feature importance.

In [None]:
# Rank features by their importance averaged over the folds and keep the
# 150 highest-ranked names.
mean_importance = (
    feature_importance_df[["Feature", "importance"]]
    .groupby("Feature")
    .mean()
)
ranked_importance = mean_importance.sort_values(by="importance", ascending=False)
cols = ranked_importance.head(150).index

# Per-fold rows for the selected features; the bar plot aggregates them.
is_top_feature = feature_importance_df["Feature"].isin(cols)
best_features = feature_importance_df.loc[is_top_feature]
plot_data = best_features.sort_values(by="importance", ascending=False)

fig, ax = plt.subplots(figsize=(14, 28))
sns.barplot(x="importance", y="Feature", data=plot_data, ax=ax)
ax.set_title('Features importance (averaged/folds)')
fig.tight_layout()
fig.savefig('FI.png')

# <a id='6'>Submission</a>  

We submit the solution.

In [None]:
# Pair every test ID with its fold-averaged prediction and write the
# submission file without the index column.
sub_df = pd.DataFrame({
    "ID_code": test_df["ID_code"].values,
    "target": predictions,
})
sub_df.to_csv("submission.csv", index=False)

# <a id='7'>References</a>    

[1] https://www.kaggle.com/gpreda/elo-world-high-score-without-blending  
[2] https://www.kaggle.com/chocozzz/santander-lightgbm-baseline-lb-0-897  
[3] https://www.kaggle.com/brandenkmurray/nothing-works

