In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
import numpy as np 
import pandas as pd
from scipy.stats import norm
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import gc
import time
from sklearn.preprocessing import MinMaxScaler
from scipy.stats import rankdata

In [2]:
# Notebook-wide settings.
num_folds = 5          # passed as n_splits to the cross-validation splitter
windows_flag = False   # in this cell it only selects which banner is printed

# Announce the platform the notebook believes it is running on.
if windows_flag:
    print("Running on Windows!\n")
else:
    print("Running on Linux!\n")

# Make sure automatic garbage collection is switched on.
gc.enable()

Running on Linux!



In [3]:
def prob_preds(df, stats_df):
    """Score each row by a ratio of per-class two-sided normal tail probabilities.

    For every feature the value is standardised against a class mean/sd and
    turned into the two-sided tail probability ``2 * (1 - cdf(|z|))``. These
    are multiplied across features separately for the positive and the
    negative class, and the function returns ``pos_prob / neg_prob`` per row.

    The products are accumulated in log space (``norm.logsf``) and only the
    final log-ratio is exponentiated. Computing ``1 - cdf(|z|)`` directly
    rounds to exactly 0 for large ``|z|`` and the product over many features
    can underflow, which turned the ratio into ``0/0 = NaN`` or ``x/0 = inf``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature matrix, one row per sample. Columns are matched to the rows
        of ``stats_df`` by position (``.values`` is used), not by name.
    stats_df : pandas.DataFrame
        One row per feature with columns ``pos_mean``, ``pos_sd``,
        ``neg_mean`` and ``neg_sd``.

    Returns
    -------
    numpy.ndarray
        1-D array with one ratio per row of ``df``. A very large log-ratio
        can still overflow to ``inf`` when exponentiated.
    """
    values = df.values

    def _log_tail_product(mean, sd):
        # Sum over features of log(2 * sf(|z|)); logsf stays accurate in the
        # far tails where (1 - cdf) would already have rounded to zero.
        z = (values - mean) / sd
        return (np.log(2.0) + norm.logsf(np.abs(z))).sum(axis=1)

    pos_log = _log_tail_product(stats_df.pos_mean.values, stats_df.pos_sd.values)
    neg_log = _log_tail_product(stats_df.neg_mean.values, stats_df.neg_sd.values)

    # exp(log(pos) - log(neg)) == pos / neg, without the intermediate underflow.
    return np.exp(pos_log - neg_log)

In [4]:
# Load the train/test CSVs (paths are relative to the notebook's working directory).
print('Load Train Data.')
train_df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')
print('\nShape of Train Data: {}\t Shape of Test Data: {}'
    .format(train_df.shape, test_df.shape))

# Keep the labels and the original row index before the frames are reduced
# to feature columns only.
train_labels = train_df['target']
train_index = np.array(train_df.index)

train_df.drop(['ID_code', 'target'], axis=1, inplace=True)
test_df.drop(['ID_code'], axis=1, inplace=True)

# Out-of-fold scores for the training rows and fold-averaged scores for the test rows.
oof_preds = np.zeros(train_df.shape[0])
test_preds = np.zeros(test_df.shape[0])

skf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=42)

print('\nTraining model')
for counter, (train_ids, val_ids) in enumerate(skf.split(train_index, train_labels)):
    print('\nFold {}'.format(counter+1))
    # split() yields positional indices, so select rows with .iloc; this stays
    # correct even if train_df's index is not a default 0..n-1 RangeIndex.
    X_train, y_train = train_df.iloc[train_ids], train_labels.values[train_ids]
    X_val, y_val = train_df.iloc[val_ids], train_labels.values[val_ids]

    # Per-class mean / standard deviation of every feature, estimated on the
    # training part of the fold only (vectorised over all columns at once).
    pos_idx = (y_train == 1)
    neg_idx = (y_train == 0)
    X_pos, X_neg = X_train[pos_idx], X_train[neg_idx]
    stats_df = pd.DataFrame({
        'pos_mean': X_pos.mean(),
        'pos_sd': X_pos.std(),
        'neg_mean': X_neg.mean(),
        'neg_sd': X_neg.std(),
    }).reset_index(drop=True)  # rows stay in column order with a 0..n-1 index

    fold_val_preds = prob_preds(X_val, stats_df)
    # Each fold contributes an equal share to the test-set score.
    test_preds += prob_preds(test_df, stats_df)/num_folds

    print("AUC score: {}".format(roc_auc_score(y_val, fold_val_preds)))
    oof_preds[val_ids] += fold_val_preds

    del X_train, X_val, y_train, y_val, X_pos, X_neg
    gc.collect()

# AUC over all out-of-fold scores.
auc  = roc_auc_score(train_labels, oof_preds)

Load Train Data.

Shape of Train Data: (200000, 202)	 Shape of Test Data: (200000, 201)

Training model

Fold 1
AUC score: 0.8720345840348852

Fold 2
AUC score: 0.8699814620236965

Fold 3
AUC score: 0.8768338684564946

Fold 4
AUC score: 0.8716125018170361

Fold 5
AUC score: 0.8720893156063451


In [5]:
# Report the AUC computed over all out-of-fold predictions.
print(f'\nValidation AUC: {auc}')


Validation AUC: 0.8724834430400332


In [20]:
# Convert the out-of-fold scores to ranks (scipy's rankdata; default tie handling).
temp = rankdata(oof_preds)

In [11]:
# Out-of-fold scores as a one-column frame keyed by the original train index,
# with the scores replaced by their ranks.
oof_csv = (
    pd.DataFrame({'target': oof_preds}, index=train_index)
    .rank()
)

In [19]:
# Preview the first rows of the ranked out-of-fold frame.
oof_csv.head()

Unnamed: 0,target
0,88465.0
1,196964.0
2,29367.0
3,185377.0
4,144993.0


In [21]:
# Display the rank array.
temp

array([ 88465., 196964.,  29367., ..., 136326.,  95919.,   3841.])

In [22]:
# Reshape the ranks into a single-column 2-D array, the form later passed to
# ss.fit_transform. reshape(-1, 1) is safe to re-run on the already-reshaped array.
temp = temp.reshape(-1, 1)

In [23]:
# Display the rank array again to check its shape.
temp

array([[ 88465.],
       [196964.],
       [ 29367.],
       ...,
       [136326.],
       [ 95919.],
       [  3841.]])

In [33]:
# Scaler applied to the rank arrays; note that `ss` is a MinMaxScaler
# (default settings), not a StandardScaler as the name might suggest.
ss = MinMaxScaler()

In [29]:
# Fit the scaler on the out-of-fold ranks and show the scaled values as a flat array.
ss.fit_transform(temp).ravel()

array([0.44232221, 0.98481992, 0.14683073, ..., 0.68162841, 0.4795924 ,
       0.0192001 ])

In [36]:
# Replace the fold-averaged test scores with their ranks (this overwrites
# test_preds), then refit the scaler on those ranks and show the flat result.
test_preds = rankdata(test_preds)
ss.fit_transform(test_preds[:, None]).ravel()

array([0.78156891, 0.87596438, 0.77301387, ..., 0.06167531, 0.64499322,
       0.70266851])