In [75]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc
from scipy.stats import randint as sp_randint
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.utils import shuffle

import pickle

from datetime import datetime

In [2]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]

def gini_normalized(a, p):
    return eval_gini(a, p) / eval_gini(a, a)

gini_scorer = make_scorer(gini_normalized, greater_is_better = True)

In [3]:
df = pd.read_csv('../cache/train_labels.csv')
target = df['y'].values

In [4]:
trn_df = feather.read_dataframe('../cache/trn_df.feather')
sub_df = feather.read_dataframe('../cache/sub_df.feather')

In [5]:
trn_df.shape

(595212, 227)

In [6]:
sub_df.shape

(892816, 432)

In [15]:
sub_df = sub_df[trn_df.columns]

In [16]:
sub_df.shape

(595212, 227)

In [10]:
np.bincount(target)

array([573518,  21694])

In [12]:
595212 / (2 * np.bincount(target))

array([  0.51891309,  13.71835531])

In [13]:
param_grid = { 
    'n_estimators': [10, 50, 200, 700, 1000],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0, class_weight='balanced')
clf = RandomizedSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] ..... n_estimators=10, max_features=auto, score=0.004143 - 1.1min
[CV] n_estimators=10, max_features=auto ..............................
[CV] ..... n_estimators=10, max_features=auto, score=0.024449 - 1.1min
[CV] ..... n_estimators=10, max_features=auto, score=0.011207 - 1.1min
[CV] n_estimators=700, max_features=log2 .............................
[CV] .... n_estimators=10, max_features=auto, score=-0.005964 - 1.1min
[CV] n_estimators=700, max_features=log2 .............................
[CV] n_estimators=700, max_features=log2 .............................
[CV] .... n_estimators=10, max_features=auto, score=-0.002667 - 1.1min
[CV] n_estimator

[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed: 70.1min


[CV] ... n_estimators=700, max_features=log2, score=-0.002486 -39.5min
[CV] n_estimators=1000, max_features=sqrt ............................
[CV] ... n_estimators=1000, max_features=sqrt, score=0.003815 -92.3min
[CV] n_estimators=1000, max_features=sqrt ............................
[CV] ... n_estimators=1000, max_features=sqrt, score=0.023911 -92.6min
[CV] n_estimators=50, max_features=sqrt ..............................
[CV] ... n_estimators=1000, max_features=sqrt, score=0.011677 -92.5min
[CV] n_estimators=50, max_features=sqrt ..............................
[CV] ..... n_estimators=50, max_features=sqrt, score=0.004001 - 4.7min
[CV] n_estimators=50, max_features=sqrt ..............................
[CV] ..... n_estimators=50, max_features=sqrt, score=0.023782 - 4.7min
[CV] n_estimators=50, max_features=sqrt ..............................
[CV] ..... n_estimators=50, max_features=sqrt, score=0.011671 - 4.5min
[CV] n_estimators=50, max_features=sqrt ..............................
[CV] .

[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed: 349.7min finished


RandomizedSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=5, shuffle=False, random_state=None),
          error_score='raise',
          estimator=ExtraTreesClassifier(bootstrap=False, class_weight='balanced',
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=4,
          param_distributions={'n_estimators': [10, 50, 200, 700, 1000], 'max_features': ['auto', 'sqrt', 'log2']},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=make_scorer(gini_normalized), verbose=3)

In [14]:
print(clf.best_score_)
print(clf.best_params_)

0.006314096941983021
{'n_estimators': 50, 'max_features': 'log2'}


In [19]:
target

array([0, 0, 0, ..., 0, 0, 0])

In [18]:
param_grid = { 
    'n_estimators': [50, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0, class_weight={0:1, 1:26.43671})
clf = GridSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.003815 - 4.7min
[CV] n_estimators=50, max_features=auto ..............................
[CV] .... n_estimators=50, max_features=auto, score=-0.005721 - 4.6min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.023911 - 4.7min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.011666 - 4.8min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.002653 - 5.0min
[CV] n_estimators

[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 77.0min finished


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=5, shuffle=False, random_state=None),
       error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight={0: 1, 1: 26.43671},
           criterion='gini', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [50, 200], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(gini_normalized), verbose=3)

In [20]:
param_grid = { 
    'n_estimators': [50, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0)
clf = GridSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.004013 - 4.8min
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.011637 - 4.8min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.024151 - 4.9min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.005482 - 5.0min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.001698 - 4.7min
[CV] n_estimators

KeyboardInterrupt: 

In [21]:
param_grid = { 
    'n_estimators': [50, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0, class_weight={0: 0.03644752, 1:0.947635485})
clf = GridSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] .... n_estimators=50, max_features=auto, score=-0.005687 - 4.5min
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.011677 - 4.6min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.003818 - 4.8min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.023911 - 4.8min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.002653 - 4.6min
[CV] n_estimators

KeyboardInterrupt: 

In [23]:
wts = [0.947635485 if x==1 else 0.03644752 for x in target]

In [35]:
param_grid = { 
    'n_estimators': [50],
    'class_weight': [{1: 0.03644752, 0:0.947635485}, {1:1, 0:26.43671}],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0,  bootstrap=True, oob_score=True)
clf = GridSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5, shuffle=True),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV]  n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto, score=0.010018 - 4.3min
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV]  n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto, score=-0.004817 - 4.5min
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=sqrt 
[CV]  n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto, score=-0.009680 - 4.6min
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features

KeyboardInterrupt: 

In [40]:
with open('../cache/idx_info.pkl', 'rb') as f:
    idx_info= pickle.load(f)

In [41]:
idx_info

{0: [array([     0,      1,      2, ..., 595209, 595210, 595211]),
  array([     4,     10,     20, ..., 595198, 595203, 595205])],
 1: [array([     0,      2,      3, ..., 595209, 595210, 595211]),
  array([     1,     15,     28, ..., 595201, 595204, 595208])],
 2: [array([     0,      1,      2, ..., 595208, 595210, 595211]),
  array([     5,     11,     13, ..., 595206, 595207, 595209])],
 3: [array([     1,      2,      4, ..., 595209, 595210, 595211]),
  array([     0,      3,      6, ..., 595186, 595196, 595202])],
 4: [array([     0,      1,      3, ..., 595207, 595208, 595209]),
  array([     2,      9,     14, ..., 595199, 595210, 595211])]}

In [43]:
idx_info[0][0]

array([     0,      1,      2, ..., 595209, 595210, 595211])

In [86]:
sub_df = feather.read_dataframe('../cache/sub_df.feather')
sub_df = sub_df[trn_df.columns]

In [87]:
sub_df.shape

(892816, 227)

In [92]:
sub_df.isnull().sum()

ps_car_13                          0
ps_reg_03                          0
ps_ind_03                          0
ps_ind_15                          0
ps_reg_02                          0
ps_car_14                          0
ps_car_12                          0
ps_ind_17_bin                      0
ps_reg_01                          0
ps_car_15                          0
ps_ind_01                          0
ps_ind_16_bin                      0
ps_ind_07_bin                      0
ps_ind_06_bin                      0
ps_car_11                          0
ps_calc_09                         0
ps_calc_05                         0
ps_ind_08_bin                      0
ps_ind_09_bin                      0
ps_ind_18_bin                      0
ps_ind_12_bin                      0
ps_ind_14                          0
loo_ps_ind_05_cat                  0
loo_ps_car_01_cat                  0
loo_ps_car_07_cat                  0
loo_ps_car_03_cat                  0
loo_ps_car_06_cat                  0
l

In [93]:
sub_df.fillna(-1, inplace=True)

In [94]:
n_splits = 5
n_estimators = 200
#folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
et_evals = np.zeros((n_estimators, n_splits))
oof_et = np.empty(len(trn_df))
sub_preds = np.zeros(len(sub_df))
increase = True
np.random.seed(0)

In [95]:
tgt = df['y']

In [96]:
wts = [0.947635485 if x==1 else 0.03644752 for x in target]

In [98]:
for fold_ in range(len(idx_info)):
    l = idx_info[fold_]
    trn_idx = l[0]
    val_idx = l[1]
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], tgt.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], tgt.iloc[val_idx]
    
    #wts = [0.947635485 if x==1 else 0.03644752 for x in trn_tgt]
    pos = sum(trn_tgt==0)/sum(trn_tgt==1)
    wts = [pos if x==1 else 1 for x in trn_tgt]
    print(pos)
    
    clf = ExtraTreesClassifier(random_state=1337, n_estimators=200,
                              max_features='auto', criterion='gini')
    clf.fit(trn_dat, trn_tgt, sample_weight=wts)
    
    # Predict OOF and submission probas with the best round
    oof_et[val_idx] = clf.predict_proba(val_dat)[:, 1]
    # Update submission
    
    sub_preds += clf.predict_proba(sub_df)[:, 1] / n_splits
    
    # Display results
    print("Fold %2d : %.6f @%4d "
          % (fold_ + 1,
             eval_gini(val_tgt, oof_et[val_idx]),
             n_estimators
             ))

26.436992221261885
Fold  1 : 0.251975 @ 200 
26.436992221261885
Fold  2 : 0.255520 @ 200 
26.436992221261885
Fold  3 : 0.259327 @ 200 
26.437049841544223
Fold  4 : 0.258872 @ 200 
26.435526619036644
Fold  5 : 0.252099 @ 200 


0.2555586

In [99]:
np.save('../cache/oof_et', oof_et)

In [100]:
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../cache/model.et.{}'.format(now)
with open(fn, 'wb') as f:
    pickle.dump(clf, f)

In [101]:
sub_preds = np.clip(sub_preds, a_min=0.05, a_max=0.95)
sub_df["target"] = sub_preds
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.et.{}'.format(now)
sub_df[["target"]].to_csv(fn, index=True, float_format="%.9f")

In [103]:
# take averages of val scores here
(0.251975 + 
0.255520 +
0.259327 + 
0.258872 +
0.252099)/5.

0.2555586

In [104]:
n_splits = 5
n_estimators = 200
#folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
et_evals = np.zeros((n_estimators, n_splits))
oof_et = np.empty(len(trn_df))
sub_preds = np.zeros(len(sub_df))
np.random.seed(0)
sub_df = feather.read_dataframe('../cache/sub_df.feather')
sub_df = sub_df[trn_df.columns]
sub_df.fillna(-1, inplace=True)
print(sub_df.shape)

(892816, 227)


In [105]:
for fold_ in range(len(idx_info)):
    l = idx_info[fold_]
    trn_idx = l[0]
    val_idx = l[1]
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], tgt.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], tgt.iloc[val_idx]
    
    pos = sum(trn_tgt==0)/sum(trn_tgt==1)
    wts = [pos if x==1 else 1 for x in trn_tgt]
    clf = ExtraTreesClassifier(random_state=1337, n_estimators=200, class_weight={0:1, 1:pos},
                              max_features='auto', criterion='gini')
    clf.fit(trn_dat, trn_tgt, sample_weight=wts)
    
    # Predict OOF and submission probas with the best round
    oof_et[val_idx] = clf.predict_proba(val_dat)[:, 1]
    # Update submission
    
    sub_preds += clf.predict_proba(sub_df)[:, 1] / n_splits
    
    # Display results
    print("Fold %2d : %.6f @%4d "
          % (fold_ + 1,
             eval_gini(val_tgt, oof_et[val_idx]),
             n_estimators
             ))

Fold  1 : 0.227593 @ 200 
Fold  2 : 0.240041 @ 200 
Fold  3 : 0.242561 @ 200 
Fold  4 : 0.242370 @ 200 
Fold  5 : 0.227618 @ 200 


In [106]:
n_splits = 5
n_estimators = 200
#folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
et_evals = np.zeros((n_estimators, n_splits))
oof_et = np.empty(len(trn_df))
sub_preds = np.zeros(len(sub_df))
np.random.seed(0)
sub_df = feather.read_dataframe('../cache/sub_df.feather')
sub_df = sub_df[trn_df.columns]
sub_df.fillna(-1, inplace=True)
print(sub_df.shape)

(892816, 227)


In [107]:
for fold_ in range(len(idx_info)):
    l = idx_info[fold_]
    trn_idx = l[0]
    val_idx = l[1]
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], tgt.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], tgt.iloc[val_idx]
    
    pos = sum(trn_tgt==0)/sum(trn_tgt==1)
    wts = [pos if x==1 else 1 for x in trn_tgt]
    clf = ExtraTreesClassifier(random_state=1337, n_estimators=200, class_weight={0:1, 1:pos},
                              max_features='auto', criterion='gini')
    clf.fit(trn_dat, trn_tgt)
    
    # Predict OOF and submission probas with the best round
    oof_et[val_idx] = clf.predict_proba(val_dat)[:, 1]
    # Update submission
    
    sub_preds += clf.predict_proba(sub_df)[:, 1] / n_splits
    
    # Display results
    print("Fold %2d : %.6f @%4d "
          % (fold_ + 1,
             eval_gini(val_tgt, oof_et[val_idx]),
             n_estimators
             ))

Fold  1 : 0.251975 @ 200 
Fold  2 : 0.255520 @ 200 
Fold  3 : 0.259327 @ 200 
Fold  4 : 0.258872 @ 200 
Fold  5 : 0.252099 @ 200 


In [113]:
np.save('../cache/oof_et', oof_et)
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../cache/model.et.{}'.format(now)
with open(fn, 'wb') as f:
    pickle.dump(clf, f)

In [109]:
df = pd.read_csv('../cache/test_id.csv')

In [110]:
df.head()

Unnamed: 0,id
0,0
1,1
2,2
3,3
4,4


In [112]:
sub_preds = np.clip(sub_preds, a_min=0.05, a_max=0.95)
sub_df["target"] = sub_preds
sub_df['id'] = df['id'].values
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.et.{}'.format(now)
sub_df[['id', 'target']].to_csv(fn, index=False, float_format="%.9f")