In [4]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
 
    

In [8]:
def process_data(train_df, test_df):
    """Add a rounded, mean-centred copy of every raw feature to both frames.

    For each column of ``train_df`` other than ``ID_code`` / ``target`` a new
    column named ``'r2mm_' + <feature>`` is written to *both* frames. Its value
    is the feature rounded to two decimals minus the (unrounded) mean of that
    feature computed on the frame being processed, so train and test are each
    centred on their own mean.

    Columns that already carry the ``r2mm_`` prefix are not treated as input
    features. This keeps the function idempotent: calling it again on frames
    it has already processed recomputes the same columns instead of stacking
    ``r2mm_r2mm_*`` columns on top.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; its columns define the feature list.
    test_df : pandas.DataFrame
        Test frame; must contain every feature column found in ``train_df``.

    Returns
    -------
    tuple
        ``(train_df, test_df)`` -- the same objects that were passed in, which
        have been modified in place.
    """
    prefix = 'r2mm_'
    idx = [
        c for c in train_df.columns
        if c not in ['ID_code', 'target'] and not str(c).startswith(prefix)
    ]
    for df in [test_df, train_df]:
        for feat in idx:
            # The mean is taken on the unrounded values of this frame only.
            mean = df.loc[:, feat].mean()
            df[prefix + feat] = np.round(df[feat], 2) - mean
    print('Train and test shape:',train_df.shape, test_df.shape)
    return train_df, test_df

In [15]:
def run_model(train_df, test_df):
    """Train a 5-fold LightGBM binary classifier and return averaged predictions.

    Every column of ``train_df`` except ``ID_code`` and ``target`` is used as a
    feature. For each stratified fold a booster is trained with early stopping
    on the held-out part, and its best iteration is used to predict the
    held-out rows, the whole test frame and the whole training frame. The
    out-of-fold AUC is printed at the end.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame containing a ``target`` column and the feature columns.
    test_df : pandas.DataFrame
        Test frame containing the same feature columns.

    Returns
    -------
    tuple
        ``(predictions, train_preds)`` where ``predictions`` is the mean of the
        per-fold predictions on ``test_df`` and ``train_preds`` is the mean of
        the per-fold predictions on *all* rows of ``train_df``. Note that
        ``train_preds`` is not out-of-fold: each row is also scored by the
        models that were trained on it. The out-of-fold vector is only used
        for the printed CV score and is not returned.
    """
    features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
    target = train_df['target']
    param = {
            'num_leaves': 15, #was 10
            'max_bin': 119,
            'min_data_in_leaf': 11,
            'learning_rate': 0.02,
            'min_sum_hessian_in_leaf': 0.00245,
            'bagging_fraction': 1.0, 
            'bagging_freq': 5, 
            'feature_fraction': 0.05,
            'lambda_l1': 4.972,
            'lambda_l2': 10.0,    #2.276,
            'min_gain_to_split': 0.65,
            'max_depth': 15, #was 14
            'save_binary': True,
            'seed': 1337,
            'feature_fraction_seed': 1337,
            'bagging_seed': 1337,
            'drop_seed': 1337,
            'data_random_seed': 1337,
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'verbose': 1,
            'metric': 'auc',
            'is_unbalance': True,
            'boost_from_average': True, #was false
        }
    num_round = 15000
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_df))
    train_preds = np.zeros(len(train_df))
    # StratifiedKFold derives the folds from the labels and uses X only for its
    # length. A zero placeholder yields the same indices while avoiding
    # `train_df.values`, which would copy the whole (possibly mixed-dtype)
    # frame into one ndarray that the split generator keeps alive for the
    # entire training loop.
    split_placeholder = np.zeros(len(train_df))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(split_placeholder, target.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
        # NOTE(review): the `verbose_eval` / `early_stopping_rounds` keywords
        # were removed from lgb.train in LightGBM 4.0 (replaced by callbacks);
        # confirm the installed version before upgrading.
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 200)
        # Held-out rows only: these values feed the CV score below.
        oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
        # Each fold contributes 1/n_splits of the final averaged prediction.
        predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
        train_preds += clf.predict(train_df[features], num_iteration=clf.best_iteration) / folds.n_splits
    print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
    return (predictions, train_preds)

In [16]:
# Show which files are available in the input directory.
print(os.listdir("../input"))

# Load the training data and report its schema and memory footprint.
train = pd.read_csv("../input/train.csv")
print("Training dataset\n----------------")
train.info()

# Same for the test data.
test = pd.read_csv("../input/test.csv")
print("Test dataset\n----------------")
test.info()

# Raw feature names and labels, captured before any derived columns are added.
features = [col for col in train.columns if col not in ('ID_code', 'target')]
target = train['target']

# Feature engineering followed by cross-validated training.
train_df, test_df = process_data(train, test)
predictions, train_preds = run_model(train_df, test_df)

# Test-set predictions, and the model's averaged predictions on the training rows.
sub = pd.DataFrame({"ID_code": test_df.ID_code.values, "target": predictions})
sub_lgb = pd.DataFrame({"ID_code": train_df.ID_code.values, "output": train_preds})

sub.to_csv("submission.csv", index=False)
# Written next to the input files; the row index is kept in this file.
sub_lgb.to_csv("../input/submission_lgb.csv")

['catboost_output.csv', 'sample_submission.csv', 'test.csv', 'train.csv']
Training dataset
----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB
Test dataset
----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 201 entries, ID_code to var_199
dtypes: float64(200), object(1)
memory usage: 306.7+ MB
Train and test shape: (200000, 402) (200000, 401)
Fold 0
Training until validation scores don't improve for 200 rounds.
[1000]	training's auc: 0.916987	valid_1's auc: 0.886389
[2000]	training's auc: 0.938302	valid_1's auc: 0.896428
[3000]	training's auc: 0.950128	valid_1's auc: 0.898575
Early stopping, best iteration is:
[3301]	training's auc: 0.953154	valid_1's auc: 0.898878
Fold 1
Training until validation scores don't improve for 200 rounds.
[1000]	training's auc: 0.917235	valid_1's auc: 0.8820