In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
 
    

In [59]:
def process_data(train_df, test_df):    
    idx = [c for c in train_df.columns if c not in ['ID_code', 'target']]
    for df in [test_df, train_df]:
        for feat in idx:
            mean = df.loc[:,feat].mean()
            mode = df.loc[:,feat].astype(int).mode()
            print("Mean feat {0} = {1}".format(feat, mean))
            print("Mode feat {0} = {1}".format(feat, mode.iloc[0]))
            #df['r2mm_'+feat] = np.round(df[feat], 2) -  mean 
            if mean > 0.0:
                df['r2mm_' +feat] = np.round(df[feat] - mode.iloc[0], 2)
            else:
                df['r2mm_' +feat] = np.round(df[feat] - mean , 2)
        df['sum'] = df[idx].sum(axis=1)  
        df['min'] = df[idx].min(axis=1)
        df['max'] = df[idx].max(axis=1)
        df['mean'] = df[idx].mean(axis=1)
        df['std'] = df[idx].std(axis=1)
        df['skew'] = df[idx].skew(axis=1)
        df['kurt'] = df[idx].kurtosis(axis=1)
        df['med'] = df[idx].median(axis=1)
    print('Train and test shape:',train_df.shape, test_df.shape)
    train_df.head(10)
    return train_df, test_df

In [64]:
def run_model(train_df, test_df):
    features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
    target = train_df['target']
    param = {
            'num_leaves': 15, #was 10
            'max_bin': 119,
            'min_data_in_leaf': 11,
            'learning_rate': 0.02,
            'min_sum_hessian_in_leaf': 0.00245,
            'bagging_fraction': 1.0, 
            'bagging_freq': 5, 
            'feature_fraction': 0.05,
            'lambda_l1': 4.972,
            'lambda_l2': 2.276,
            'min_gain_to_split': 0.65,
            'max_depth': 20, #was 14
            'save_binary': True,
            'seed': 1337,
            'feature_fraction_seed': 1337,
            'bagging_seed': 1337,
            'drop_seed': 1337,
            'data_random_seed': 1337,
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'verbose': 1,
            'metric': 'auc',
            'is_unbalance': True,
            'boost_from_average': True, #was false
        }
    num_round = 15000
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_df))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
        print("Fold {}".format(fold_))
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 100)
        oof[val_idx] = clf.predict(train_df.iloc[val_idx][features], num_iteration=clf.best_iteration)
        predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
    print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
    return predictions

In [65]:
print(os.listdir("../input"))
train=pd.read_csv("../input/train.csv")
print("Training dataset")
print("----------------")
train.info()

test=pd.read_csv("../input/test.csv")
print("Test dataset")
print("----------------")
test.info()

features = [c for c in train.columns if c not in ['ID_code', 'target']]
target = train['target']

train_df, test_df = process_data(train, test)
predictions = run_model(train_df, test_df)
sub = pd.DataFrame({"ID_code": test_df.ID_code.values})
sub["target"] = predictions
sub.to_csv("submission.csv", index=False)

['sample_submission.csv', 'test.csv', 'train.csv']
Training dataset
----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB
Test dataset
----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 201 entries, ID_code to var_199
dtypes: float64(200), object(1)
memory usage: 306.7+ MB
Mean feat var_0 = 10.658737106499999
Mode feat var_0 = 9
Mean feat var_1 = -1.62424378
Mode feat var_1 = 0
Mean feat var_2 = 10.707451979500002
Mode feat var_2 = 9
Mean feat var_3 = 6.7882139085
Mode feat var_3 = 7
Mean feat var_4 = 11.076399133
Mode feat var_4 = 11
Mean feat var_5 = -5.050557728999999
Mode feat var_5 = 0
Mean feat var_6 = 5.4151641124999985
Mode feat var_6 = 5
Mean feat var_7 = 16.5291427375
Mode feat var_7 = 14
Mean feat var_8 = 0.27713482600000006
Mode feat var_8 = 0
Mean feat var_9 = 7.5694074580000

Mean feat var_143 = 12.335212781
Mode feat var_143 = 11
Mean feat var_144 = 8.644855877999998
Mode feat var_144 = 8
Mean feat var_145 = 4.8603530180000005
Mode feat var_145 = 0
Mean feat var_146 = 10.3622142035
Mode feat var_146 = 10
Mean feat var_147 = -3.2382060199999994
Mode feat var_147 = 0
Mean feat var_148 = 3.991192569999999
Mode feat var_148 = 3
Mean feat var_149 = 5.2762102095
Mode feat var_149 = 0
Mean feat var_150 = 16.820051007
Mode feat var_150 = 15
Mean feat var_151 = 10.156002283
Mode feat var_151 = 9
Mean feat var_152 = 7.628823997500001
Mode feat var_152 = 7
Mean feat var_153 = 16.721077255
Mode feat var_153 = 15
Mean feat var_154 = 6.971187818500002
Mode feat var_154 = 7
Mean feat var_155 = -2.043746718
Mode feat var_155 = 0
Mean feat var_156 = 13.208797351500001
Mode feat var_156 = 13
Mean feat var_157 = -4.836848327000001
Mode feat var_157 = 0
Mean feat var_158 = 17.924734242
Mode feat var_158 = 15
Mean feat var_159 = 10.239228836999999
Mode feat var_159 = 10
Mean f

Mean feat var_85 = 18.362720644499994
Mode feat var_85 = 16
Mean feat var_86 = 5.6210584225
Mode feat var_86 = 0
Mean feat var_87 = 11.351483195500002
Mode feat var_87 = 11
Mean feat var_88 = 8.702923521499999
Mode feat var_88 = 7
Mean feat var_89 = 3.7252079274999996
Mode feat var_89 = 0
Mean feat var_90 = -16.548147024000002
Mode feat var_90 = 0
Mean feat var_91 = 6.9875414695
Mode feat var_91 = 6
Mean feat var_92 = 12.739578443
Mode feat var_92 = 12
Mean feat var_93 = 10.556739837000002
Mode feat var_93 = 10
Mean feat var_94 = 10.999161667
Mode feat var_94 = 10
Mean feat var_95 = -0.084344253
Mode feat var_95 = 0
Mean feat var_96 = 14.400433117
Mode feat var_96 = 14
Mean feat var_97 = 18.5396446215
Mode feat var_97 = 0
Mean feat var_98 = 1.7520119760000004
Mode feat var_98 = 1
Mean feat var_99 = -0.7462960345000001
Mode feat var_99 = 0
Mean feat var_100 = -6.600518324500001
Mode feat var_100 = 0
Mean feat var_101 = 13.4135261765
Mode feat var_101 = 12
Mean feat var_102 = 22.29490815

KeyboardInterrupt: 