In [1]:
import gc
import math
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
%matplotlib inline
 
    

In [2]:
def process_data(train_df, test_df):
    """Build the engineered feature frames used for modelling.

    Every column of ``train_df`` other than ``ID_code`` and ``target`` is
    treated as a feature. For each feature ``f`` three columns are emitted,
    in this order:

    * ``f``          -- the raw value rounded to 2 decimals
    * ``mm_f``       -- the value minus the column mean, rounded to 3 decimals
    * ``sq_log_f``   -- ``log((value - mean) ** 2)``

    The mean is taken from the frame being transformed, i.e. the test frame
    is centred on its own column means, not on the training means.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Raw training data; must contain a ``target`` column.
    test_df : pandas.DataFrame
        Raw test data; must contain every feature column of ``train_df``.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The engineered train frame (``target`` first, then the feature
        columns) and the engineered test frame (feature columns only).
        ``ID_code`` is not carried over into either output frame.

    Side effects
    ------------
    Prints the shapes of the two resulting frames.
    """
    feature_names = [c for c in train_df.columns if c not in ['ID_code', 'target']]

    def _engineer(frame):
        # Collect every derived column first and build the DataFrame once;
        # inserting hundreds of columns one at a time fragments the frame.
        columns = {}
        for feat in feature_names:
            centered = frame[feat] - frame.loc[:, feat].mean()
            columns[feat] = np.round(frame[feat], 2)
            columns["mm_" + feat] = np.round(centered, 3)
            # NOTE: a value exactly equal to the column mean yields log(0) = -inf.
            columns['sq_log_' + feat] = np.log(np.square(centered))
        return columns

    train_columns = {'target': train_df['target']}
    train_columns.update(_engineer(train_df))
    new_train_df = pd.DataFrame(train_columns)
    new_test_df = pd.DataFrame(_engineer(test_df))

    print('Train and test shape:', new_train_df.shape, new_test_df.shape)
    return new_train_df, new_test_df

In [3]:
def run_model(train_df, test_df):
    """Train a 5-fold LightGBM binary classifier and score the test frame.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training data; must contain a ``target`` column. Every column other
        than ``ID_code`` and ``target`` is used as a feature.
    test_df : pandas.DataFrame
        Data to score; must contain every feature column of ``train_df``.

    Returns
    -------
    numpy.ndarray
        One prediction per row of ``test_df``: the mean over the folds of
        each fold model's prediction at its best iteration.

    Side effects
    ------------
    Prints the fold number before each fit and the out-of-fold ROC AUC at
    the end.
    """
    features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
    target = train_df['target']
    param = {
            'num_leaves': 15, #was 10
            'max_bin': 119,
            'min_data_in_leaf': 11,
            'learning_rate': 0.02,
            'min_sum_hessian_in_leaf': 0.00245,
            'bagging_fraction': 1.0, 
            'bagging_freq': 5, 
            'feature_fraction': 0.05,
            'lambda_l1': 4.972,
            'lambda_l2': 2.276,
            'min_gain_to_split': 0.65,
            'max_depth': 20, #was 14
            'save_binary': True,
            'seed': 1337,
            'feature_fraction_seed': 1337,
            'bagging_seed': 1337,
            'drop_seed': 1337,
            'data_random_seed': 1337,
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'verbose': 1,
            'metric': 'auc',
            'is_unbalance': True,
            'boost_from_average': True, #was false
        }
    num_round = 15000
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
    oof = np.zeros(len(train_df))
    predictions = np.zeros(len(test_df))
    # The stratified split is driven by the labels; X only supplies the row
    # count, so a placeholder avoids materialising the whole frame as a
    # dense array via ``train_df.values``.
    split_placeholder = np.zeros(len(train_df))
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(split_placeholder, target.values)):
        print("Fold {}".format(fold_))
        # Slice the validation rows once; they feed both the eval set and
        # the out-of-fold prediction below.
        val_features = train_df.iloc[val_idx][features]
        trn_data = lgb.Dataset(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
        val_data = lgb.Dataset(val_features, label=target.iloc[val_idx])
        # NOTE(review): newer LightGBM releases replace ``verbose_eval`` and
        # ``early_stopping_rounds`` with callbacks -- confirm against the
        # installed version before upgrading.
        clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 800)
        oof[val_idx] = clf.predict(val_features, num_iteration=clf.best_iteration)
        predictions += clf.predict(test_df[features], num_iteration=clf.best_iteration) / folds.n_splits
    print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))
    return predictions

In [4]:
# Load the raw competition files and show their schema.
print(os.listdir("../input"))
train=pd.read_csv("../input/train.csv")
print("Training dataset")
print("----------------")
train.info()

test=pd.read_csv("../input/test.csv")
print("Test dataset")
print("----------------")
test.info()

features = [c for c in train.columns if c not in ['ID_code', 'target']]
target = train['target']

# Engineer features, fit the cross-validated model, and write the submission.
train_df, test_df = process_data(train, test)
predictions = run_model(train_df, test_df)
# The frames returned by process_data() do not contain ID_code, so the
# submission IDs must be read from the raw test frame (same row order).
sub = pd.DataFrame({"ID_code": test["ID_code"].values})
sub["target"] = predictions
sub.to_csv("submission.csv", index=False)

['sample_submission.csv', 'test.csv', 'train.csv']
Training dataset
----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 202 entries, ID_code to var_199
dtypes: float64(200), int64(1), object(1)
memory usage: 308.2+ MB
Test dataset
----------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Columns: 201 entries, ID_code to var_199
dtypes: float64(200), object(1)
memory usage: 306.7+ MB
Train and test shape: (200000, 601) (200000, 600)
Fold 0
Training until validation scores don't improve for 800 rounds.
[1000]	training's auc: 0.917975	valid_1's auc: 0.885377
[2000]	training's auc: 0.940349	valid_1's auc: 0.896041
[3000]	training's auc: 0.953559	valid_1's auc: 0.898302
[4000]	training's auc: 0.963947	valid_1's auc: 0.898857
Early stopping, best iteration is:
[4032]	training's auc: 0.964256	valid_1's auc: 0.898864
Fold 1
Training until validation scores don't improve for 800 rounds.
[1000]	training's au

KeyboardInterrupt: 