In [1]:
# -*- coding: utf-8 -*-
"""
@author: Faron
"""
import pandas as pd
import numpy as np
import xgboost as xgb

DATA_DIR = "input_orig"

ID_COLUMN = 'Id'
TARGET_COLUMN = 'Response'

SEED = 0

TRAIN_NUMERIC = "{0}/train_numeric.csv".format(DATA_DIR)
TRAIN_DATE = "{0}/train_date.csv".format(DATA_DIR)

TEST_NUMERIC = "{0}/test_numeric.csv".format(DATA_DIR)
TEST_DATE = "{0}/test_date.csv".format(DATA_DIR)

FILENAME = "etimelhoods"

train = pd.read_csv(TRAIN_NUMERIC, usecols=[ID_COLUMN, TARGET_COLUMN])
test = pd.read_csv(TEST_NUMERIC, usecols=[ID_COLUMN])

train["StartTime"] = -1
test["StartTime"] = -1

In [2]:
train.shape, test.shape

((1183747, 3), (1183748, 2))

In [None]:
'''
tr = pd.read_csv(TRAIN_DATE, dtype=np.float32) 
te = pd.read_csv(TEST_DATE, dtype=np.float32)

feats = np.setdiff1d(tr.columns, [ID_COLUMN])

stime_tr = tr[feats].min(axis=1).values
stime_te = te[feats].min(axis=1).values

train.loc[train.Id.isin(tr.Id), 'StartTime'] = stime_tr
test.loc[test.Id.isin(te.Id), 'StartTime'] = stime_te

In [3]:
CHUNKSIZE = 250000
NROWS_TR = train.shape[0]
NROWS_TE = test.shape[0]
nrows_tr = 0
nrows_te = 0

for tr in (pd.read_csv(TRAIN_DATE, chunksize=CHUNKSIZE, dtype=np.float32)):
    feats = np.setdiff1d(tr.columns, [ID_COLUMN])

    stime_tr = tr[feats].min(axis=1).values

    train.loc[train.Id.isin(tr.Id), 'StartTime'] = stime_tr

    nrows_tr += CHUNKSIZE
    if nrows_tr >= NROWS_TR:
        break

for te in (pd.read_csv(TEST_DATE, chunksize=CHUNKSIZE, dtype=np.float32)):
    feats = np.setdiff1d(te.columns, [ID_COLUMN])

    stime_te = te[feats].min(axis=1).values

    test.loc[test.Id.isin(te.Id), 'StartTime'] = stime_te

    nrows_te += CHUNKSIZE
    if nrows_te >= NROWS_TE:
        break

In [7]:
test.tail()

Unnamed: 0,Id,StartTime
1183743,2367483,653.849976
1183744,2367485,907.340027
1183745,2367486,185.919998
1183746,2367489,570.849976
1183747,2367494,1412.800049


In [8]:
ntrain = train.shape[0]
train_test = pd.concat((train, test), ignore_index=True).reset_index(drop=False)

In [11]:
train_test.head()

Unnamed: 0,index,Id,Response,StartTime
0,0,4,0.0,82.239998
1,1,6,0.0,1313.119995
2,2,7,0.0,1618.699951
3,3,9,0.0,1149.199951
4,4,11,0.0,602.640015


In [12]:
train_test['magic1'] = train_test[ID_COLUMN].diff().fillna(9999999).astype(int)
train_test['magic2'] = train_test[ID_COLUMN].iloc[::-1].diff().fillna(9999999).astype(int)
train_test = train_test.sort_values(by=['StartTime', 'Id'], ascending=True)

train_test['magic3'] = train_test[ID_COLUMN].diff().fillna(9999999).astype(int)
train_test['magic4'] = train_test[ID_COLUMN].iloc[::-1].diff().fillna(9999999).astype(int)
train_test = train_test.sort_values(by=['index']).drop(['index'], axis=1)

In [18]:
train_test.tail()

Unnamed: 0,Id,Response,StartTime,magic1,magic2,magic3,magic4
2367490,2367483,,653.849976,1,-2,7590,2042322
2367491,2367485,,907.340027,2,-1,3483,2318983
2367492,2367486,,185.919998,1,-3,4732,2316034
2367493,2367489,,570.849976,3,-5,10503,2351203
2367494,2367494,,1412.800049,5,9999999,35783,2362123


In [19]:
train = train_test.iloc[:ntrain, :]
test = train_test.iloc[ntrain:, :]
features = np.setdiff1d(list(train.columns), [TARGET_COLUMN, ID_COLUMN])

In [23]:
train.drop(['Response'], axis=1, inplace=True)
test.drop(['Response'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [28]:
train.to_csv('faron_train.csv', index=False)
test.to_csv('faron_test.csv', index=False)

In [None]:
#train.replace(9999999, np.nan, inplace=True)
#train.dropna(inplace=True)

In [None]:
y = train.Response.ravel()
train = np.array(train[features])
print('train: {0}'.format(train.shape))

In [None]:
%matplotlib inline

from sklearn.metrics import matthews_corrcoef
import matplotlib.pyplot as plt
from numba import jit

@jit
def mcc(tp, tn, fp, fn):
    sup = tp * tn - fp * fn
    inf = (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)
    if inf==0:
        return 0
    else:
        return sup / np.sqrt(inf)

@jit
def eval_mcc(y_true, y_prob, show=False):
    idx = np.argsort(y_prob)
    y_true_sort = y_true[idx]
    n = y_true.shape[0]
    nump = 1.0 * np.sum(y_true) # number of positive
    numn = n - nump # number of negative
    tp = nump
    tn = 0.0
    fp = numn
    fn = 0.0
    best_mcc = 0.0
    best_id = -1
    prev_proba = -1
    best_proba = -1
    mccs = np.zeros(n)
    for i in range(n):
        # all items with idx < i are predicted negative while others are predicted positive
        # only evaluate mcc when probability changes
        proba = y_prob[idx[i]]
        if proba != prev_proba:
            prev_proba = proba
            new_mcc = mcc(tp, tn, fp, fn)
            if new_mcc >= best_mcc:
                best_mcc = new_mcc
                best_id = i
                best_proba = proba
        mccs[i] = new_mcc
        if y_true_sort[i] == 1:
            tp -= 1.0
            fn += 1.0
        else:
            fp -= 1.0
            tn += 1.0
    if show:
        y_pred = (y_prob >= best_proba).astype(int)
        score = matthews_corrcoef(y_true, y_pred)
        print(score, best_mcc)
        plt.plot(mccs)
        return best_proba, best_mcc, y_pred
    else:
        return best_mcc

def mcc_eval(y_prob, dtrain):
    y_true = dtrain.get_label()
    best_mcc = eval_mcc(y_true, y_prob)
    return 'MCC', best_mcc

In [None]:
prior = np.sum(y) / (1.*len(y))

xgb_params = {
    'seed': 0,
    'colsample_bytree': 1,
    'silent': 0,
    'subsample': 1,
    'learning_rate': 0.02,
    'objective': 'binary:logistic',
    'max_depth': 5,
    'min_child_weight': 1,
    'eval_metric': 'auc',
    'base_score': prior
}

dtrain = xgb.DMatrix(train, label=y)
res = xgb.cv(xgb_params, dtrain, num_boost_round=500, nfold=5, seed=0, stratified=True,
             early_stopping_rounds=10, verbose_eval=1, show_stdv=True, feval=mcc_eval, maximize=True)

In [None]:
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]

print('CV-Mean: {0}+{1}'.format(cv_mean, cv_std))