#### Import packages

In [1]:
import pandas as pd
import numpy as np
import itertools
from random import sample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler, StandardScaler

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.core import Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard
import keras

Using TensorFlow backend.


In [2]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

# Create an XGBoost-compatible metric from Gini

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

def gini_lgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', 1-gini_score, False

In [3]:
def trainAndVal (model, splits=5):
    
    ###Splitting data
    
    kf = KFold(n_splits=splits)
    kf.get_n_splits(train)
    
    ###List of results in terms of gini coeff
    
    results = []
    
    for train_index, test_index in kf.split(train):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # model.fit(X_train, y_train)
        # x_train and y_train are Numpy arrays
        model.fit(X_train, y_train, epochs=20, batch_size=100)
        y_predict = model.predict_proba(X_test)
        gini = gini_normalized(y_test, y_predict)
        results.append(gini)
        
        print(gini)
        
    return(results)

#### Read Data 

In [4]:
# Read in our input data
train = pd.read_csv('../Dataset/train/train.csv')
test = pd.read_csv('../Dataset/test/test.csv')


# This prints out (rows, columns) in each dataframe
print('Train shape:', train.shape)
print('Test shape:', test.shape)


id_train = train['id'].values
y = train.target.values
id_test = test['id'].values

Train shape: (595212, 59)
Test shape: (892816, 58)


In [5]:
# We drop these variables as we don't want to train on them
# The other 57 columns are all numerical and can be trained on without preprocessing

train = train.drop(['id','target'], axis=1)
test = test.drop(['id'], axis=1)

print('Train shape:', train.shape)
print('Test shape:', test.shape)


Train shape: (595212, 57)
Test shape: (892816, 57)


#### Feature Engineering

In [6]:
# Add count of NAs field
train['countNAs'] = train.isin(['-1']).sum(axis=1)
test['countNAs'] = test.isin(['-1']).sum(axis=1)

In [None]:
train.describe()

In [7]:
X = train.values

#### Creating generator functions for ranking data 

In [8]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
pos_indices = np.where(y_train==1)[0]
neg_indices = np.where(y_train==0)[0]
np.random.shuffle(pos_indices)
np.random.shuffle(neg_indices)

index_pairs_train = []
for i in pos_indices:
    neg_iter = neg_indices[np.random.choice(len(neg_indices), size=100, replace=False)]
    index_pairs_train += [(i, j) for j in neg_iter]

index_pairs_train = sample(index_pairs_train, len(index_pairs_train))

In [11]:
batch_size=1

In [12]:
def train_generator():
    while True:
        x_batch = []
        y_batch = []
        for start in range(0, len(index_pairs_train), batch_size):    
            end = min(start + batch_size, len(index_pairs_train))
            index_pairs_batch = index_pairs_train[start:end]
            for id_ in index_pairs_batch:
                first_claim = np.random.randint(2)
                if first_claim:
                    x_batch.append(np.concatenate((X_train[id_[0]], X_train[id_[1]]), axis=0))
                    y_batch.append(1.0)
                else:
                    x_batch.append(np.concatenate((X_train[id_[1]], X_train[id_[0]]), axis=0))
                    y_batch.append(0.0)
        x_batch = np.array(x_batch, np.float32)
        y_batch = np.array(y_batch, np.float32)
        return x_batch, y_batch

In [13]:
pos_indices = np.where(y_val==1)[0]
neg_indices = np.where(y_val==0)[0]

index_pairs_val = []
for i in pos_indices:
    neg_iter = neg_indices[np.random.choice(len(neg_indices), size=100, replace=False)]
    index_pairs_val += [(i, j) for j in neg_iter]

In [14]:
def valid_generator():
    while True:
        x_batch = []
        y_batch = []
        for start in range(0, len(index_pairs_val), batch_size):
            
            end = min(start + batch_size, len(index_pairs_val))
            index_pairs_batch = index_pairs_val[start:end]
            for id_ in index_pairs_batch:
                first_claim = np.random.randint(2)
                if first_claim:
                    x_batch.append(np.concatenate((X_val[id_[0]], X_val[id_[1]]), axis=0))
                    y_batch.append(1.0)
                else:
                    x_batch.append(np.concatenate((X_val[id_[1]], X_val[id_[0]]), axis=0))
                    y_batch.append(0.0)
        x_batch = np.array(x_batch, np.float32)
        y_batch = np.array(y_batch, np.float32)
        return x_batch, y_batch

#### Keras Model 

In [15]:
rf_X_train, rf_y_train = train_generator()
rf_X_val, rf_y_val = valid_generator()

In [16]:
mod_rf = RandomForestClassifier(n_estimators=300, n_jobs=-1)

In [17]:
fitted_mod = mod_rf.fit(rf_X_train, rf_y_train)

KeyboardInterrupt: 

In [40]:
fitted_mod.score(rf_X_val, rf_y_val)

0.6246522234891676

In [24]:
mod_svm = SVC()

In [None]:
fitted_mod = mod_svm.fit(rf_X_train, rf_y_train)

In [17]:
fitted_mod.score(rf_X_val, rf_y_val)

0.54928164196123153

##### XGBoost

In [18]:
import xgboost as xgb



In [19]:
# x_test = test.values
#x_test = get_features([test.values])[0]
xgbscores = []

# Set xgb parameters
params = {}
params['objective'] = 'binary:logistic'
params['eta'] = 0.02
params['silent'] = True
params['max_depth'] = 6
params['subsample'] = 0.9
params['colsample_bytree'] = 0.6
params['colsample_bylevel'] = 0.9
params['seed'] = 99
params['lambda'] = 10
params['alpha'] = 1
params['eval_metric'] = 'error'
# params['scale_pos_weight'] = 26.43671061122891
# params['tree-method'] = 'gpu_exact'
# params['grow-policy'] = 'lossguide'

In [20]:
# Create a submission file
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = np.zeros_like(id_test)

kfold = 5
weights = []
# sss = StratifiedKFold(n_splits=kfold, random_state=0)
# for i, (train_index, test_index) in enumerate(sss.split(, y)):
#     print('[Fold %d/%d]' % (i + 1, kfold))
#     X_train, X_valid = X[train_index], X[test_index]
#     y_train, y_valid = y[train_index], y[test_index]
#     # Convert our data into LGBoost format
d_train = xgb.DMatrix(rf_X_train, rf_y_train, missing=-1)
d_valid = xgb.DMatrix(rf_X_val, rf_y_val, missing=-1)
# d_test = xgb.DMatrix(x_test, missing=-1)
watchlist = [(d_train, 'train'), (d_valid, 'valid')]

# Train the model! We pass in a max of 2,000 rounds (with early stopping after 100)
# and the custom metric (maximize=True tells xgb that higher metric is better)
mdl = xgb.train(params, d_train, 2000, watchlist, early_stopping_rounds=170, verbose_eval=100)

# # print('[Fold %d/%d Prediction:]' % (i + 1, kfold))
# # Predict on our test data
# p_val = mdl.(d_valid)
# print(mdl.best_score)
# weight = mdl.best_score
# weights.append(weight)
# sub['target'] += p_test*weight

# sub['target'] = sub['target']/np.sum(weights)

[0]	train-error:0.406791	valid-error:0.407977
Multiple eval metrics have been passed: 'valid-error' will be used for early stopping.

Will train until valid-error hasn't improved in 170 rounds.
[100]	train-error:0.360792	valid-error:0.373044
[200]	train-error:0.343708	valid-error:0.366452
[300]	train-error:0.330861	valid-error:0.363719
[400]	train-error:0.319947	valid-error:0.362306
[500]	train-error:0.309795	valid-error:0.361384
[600]	train-error:0.301556	valid-error:0.361902
Stopping. Best iteration:
[523]	train-error:0.307724	valid-error:0.361361



In [99]:
half_val = X_val[:int(len(X_val)//20)]

In [105]:
positions = []
for j, value in enumerate(half_val):
    
    val_Xs = []
    temp_val = half_val
    temp_val = np.delete(temp_val, (j), axis=0)
    for i in temp_val:
        val_Xs.append(np.concatenate((value, i), axis=0))
    positions.append(len(np.where(mdl.predict(xgb.DMatrix(np.array(val_Xs), missing=-1))>0.6)[0]))
    if len(positions)%1000==0:
        print(len(positions))

1000
2000
3000
4000
5000


In [106]:
pred_probs = [(i+1)/len(half_val) for i in positions]

In [93]:
temp_list = []
temp_list.append(np.concatenate((half_val[17204], half_val[2377]), axis=0))
temp_list.append(np.concatenate((half_val[17204], half_val[14945]), axis=0))
temp_list.append(np.concatenate((half_val[2377], half_val[14945]), axis=0))
temp_list.append(np.concatenate((half_val[14945], half_val[2377]), axis=0))
temp_pred = mdl.predict(xgb.DMatrix(np.array(temp_list), missing=-1))

In [94]:
temp_pred

array([ 0.44076705,  0.46323767,  0.43058982,  0.44090021], dtype=float32)

In [96]:
len(positions)

2400

In [103]:
half_val_y = y_val[:int(len(y_val)//20)]

In [107]:
gini_normalized(half_val_y, pred_probs)

0.26992912247513134

In [20]:
model = Sequential()

model.add(Dense(units=500, input_dim=116))
model.add(Activation('relu'))
model.add(Dropout(0.4))
# model.add(Dense(units=400, input_dim=116))
# model.add(Activation('relu'))
# model.add(Dropout(0.4))
# model.add(Dense(units=200, input_dim=116))
# model.add(Activation('relu'))
# model.add(Dropout(0.2))
model.add(Dense(units=100))
model.add(Activation('relu'))
model.add(Dense(units=1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [21]:
callbacks = [EarlyStopping(monitor='val_loss',
                           patience=8,
                           verbose=1,
                           min_delta=1e-4),
             ReduceLROnPlateau(monitor='val_loss',
                               factor=0.1,
                               patience=4,
                               verbose=1,
                               epsilon=1e-4),
             ModelCheckpoint(monitor='val_loss',
                             filepath='weights/best_weights.hdf5',
                             save_best_only=True,
                             save_weights_only=True),
             TensorBoard(log_dir='logs')]

# Parameters
epochs = 20
batch_size = 200

# Train
model.fit_generator(generator=train_generator(),
                    steps_per_epoch=np.ceil(float(1000*len(np.where(y_train==1)[0])) / float(batch_size)),
                    epochs=epochs,
                    verbose=2,
                    callbacks=callbacks,
                    max_queue_size=1000000,
                    validation_data=valid_generator(),
                    validation_steps=np.ceil(float(1000*len(np.where(y_val==1)[0])) / float(batch_size)))

Epoch 1/20
318s - loss: 0.6380 - acc: 0.6343 - val_loss: 0.6426 - val_acc: 0.6285
Epoch 2/20


KeyboardInterrupt: 

In [None]:
# Create a submission file
sub.to_csv('./submit/xgb_v7_missing_vals_specified.csv.gz', 
           index=False, compression='gzip')

In [None]:
val_scores = [0.279973, 0.283588, 0.282138, 0.291781, 0.277837]
np.mean(val_scores)

In [None]:
val_scores = [0.278554, 0.283918, 0.283532, 0.290948, 0.276037]
np.mean(val_scores)

* scale_pos_weight
* logitraw

#### Error Analysis

In [None]:
import matplotlib.pyplot as plt
from scipy.stats import ttest_rel
pd.set_option('display.max_columns', 500)
%matplotlib inline

In [None]:
kfold = 5
sss = KFold(n_splits=kfold, random_state=0)
for i, (train_index, test_index) in enumerate(sss.split(X, y)):
    print('[Fold %d/%d]' % (i + 1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]
    
    if (i + 1)==5:
        X_difficult, y_difficult = X_valid, y_valid
    if (i + 1)==4:
        X_easy, y_easy = X_valid, y_valid
    print(100*y_valid.sum()/len(y_valid) - (100*y_train.sum()/len(y_train)))

In [None]:
X_difficult = pd.DataFrame(X_difficult, columns=train.columns)
X_difficult.describe()

In [None]:
X_easy = pd.DataFrame(X_easy, columns=train.columns)
X_easy.describe()

In [None]:
for col_name in train.columns:
    #bins = np.linspace(0, 10, 100)
    plt.figure(figsize=(20,3))
    ax = plt.subplot(11)
    ax.bar(x-0.2, y,width=0.2,color='b',align='center')
    ax.bar(x, z,width=0.2,color='g',align='center')
    plt.hist(X_difficult.loc[:, col_name], bins, color='red', alpha=0.5, label='Difficult')
    plt.hist(X_easy.loc[:, col_name], bins, color='green', alpha=0.5, label='Easy')
    #plt.boxplot([X_difficult.loc[:, col_name], X_easy.loc[:, col_name]], vert=0)
    plt.title(col_name)
    plt.legend(loc='upper right')
    plt.show()

#### Let's go for a t-test (https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html#scipy.stats.ttest_rel)

In [None]:
for col in train.columns:
    if ttest_rel(X_easy.loc[:, col_name], X_difficult.loc[:, col_name])[1] <= 0.05:
        print(col)

Nothing :-(

#### Feature Extraction using Keras 

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers.core import Dropout
import keras

In [None]:
model = Sequential()

model.add(Dense(units=150, input_dim=51))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(units=1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
model.fit(X, y, epochs=20, batch_size=100)
y_predict = model.predict_proba(X)
gini_normalized(y, y_predict)

In [None]:
from keras import backend as K

In [None]:
# with a Sequential model
get_features = K.function([model.layers[0].input],
                                  [model.layers[1].output])
layer_output = get_features([X])[0]

In [None]:
layer_output.shape

In [None]:
x_test.shape