In [12]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt 
plt.rc("font", size=14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)      

In [13]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
warnings.filterwarnings('ignore')

## Load data   

Let's check what data files are available.

In [14]:
PATH="../../data/"

Let's load the train and test data files.

In [15]:

df = pd.read_csv(PATH+"train.csv")
#test_df = pd.read_csv(PATH+"test.csv")

In [17]:
train_df, test_df = train_test_split( df, test_size=0.33, random_state=0)

# <a id='5'>Model</a>  

From the train columns list, we drop the ID and target to form the features list.

In [5]:
# TESTING Do not use!
features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
for feature in features:
    train_df['p2_'+feature] = np.power(train_df[feature], 2)
    test_df['p2_'+feature] = np.power(test_df[feature], 2)
    train_df['r2_'+feature] = np.round(train_df[feature], 2)
    test_df['r2_'+feature] = np.round(test_df[feature], 2)
    train_df['r1_'+feature] = np.round(train_df[feature], 1)
    test_df['r1_'+feature] = np.round(test_df[feature], 1)

In [6]:
features = [c for c in train_df.columns if c not in ['ID_code', 'target']]
target = train_df['target']

We define the hyperparameters for the model.

In [8]:
param = {
    #'tree_method': 'gpu_hist', # Use GPU accelerated algorithm
    'objective': 'binary:logitraw', # Specify multiclass classification       
    'eta':0.001,
    'gamma':0.01,
    'max_depth':10,
    'min_child_weight':100,
    'subsample':0.05,
    'max_leaves':20,
    'eval_metric':'auc',
    'verbosity':1
}

We run the model.

In [10]:
folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=44000)
oof = np.zeros(len(train_df))
predictions = np.zeros(len(test_df))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_df.values, target.values)):
    print("Fold {}".format(fold_))
    trn_data = xgb.DMatrix(train_df.iloc[trn_idx][features], label=target.iloc[trn_idx])
    val_data = xgb.DMatrix(train_df.iloc[val_idx][features], label=target.iloc[val_idx])
    
    num_round = 1000000
    clf = xgb.train(param, trn_data, num_round, evals = [(trn_data, 'train'), (val_data, 'valid')], early_stopping_rounds=3000, verbose_eval=10)
    train_data =xgb.DMatrix(train_df.iloc[val_idx][features])
    oof[val_idx] = clf.predict(train_data, ntree_limit=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([fold_importance_df], axis=0)
    test_data = xgb.DMatrix(test_df[features])
    predictions += clf.predict(test_data, ntree_limit=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))


Fold 0
[0]	train-auc:0.634176	valid-auc:0.632511
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 3000 rounds.
[10]	train-auc:0.731818	valid-auc:0.726057


KeyboardInterrupt: 

Let's check the feature importance.

# <a id='6'>Submission</a>  

We submit the solution.

In [59]:
sub_df = pd.DataFrame({"ID_code":test_df["ID_code"].values})
sub_df["target"] = predictions
sub_df.to_csv("submission.csv", index=False)

# <a id='7'>Reference</a>    

[1] https://www.kaggle.com/gpreda/santander-eda-and-prediction

