In [19]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc
from scipy.stats import randint as sp_randint
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.utils import shuffle

import pickle



In [2]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]

def gini_normalized(a, p):
    return eval_gini(a, p) / eval_gini(a, a)

gini_scorer = make_scorer(gini_normalized, greater_is_better = True)

In [3]:
df = pd.read_csv('../cache/train_labels.csv')
target = df['y'].values

In [4]:
trn_df = feather.read_dataframe('../cache/trn_df.feather')
sub_df = feather.read_dataframe('../cache/sub_df.feather')

In [5]:
trn_df.shape

(595212, 227)

In [6]:
sub_df.shape

(892816, 432)

In [7]:
sub_df = sub_df[trn_df.columns]

In [8]:
sub_df.shape

(892816, 227)

In [9]:
np.bincount(target)

array([573518,  21694])

In [10]:
595212 / (2 * np.bincount(target))

array([  0.51891309,  13.71835531])

In [29]:
param_grid = { 
    'n_estimators': [10, 50, 200, 700, 1000],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0, class_weight='balanced')
clf = RandomizedSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=5,n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] n_estimators=10, max_features=auto ..............................
[CV] ..... n_estimators=10, max_features=auto, score=0.004143 - 1.0min
[CV] n_estimators=10, max_features=auto ..............................
[CV] ..... n_estimators=10, max_features=auto, score=0.011207 - 1.0min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=10, max_features=auto, score=-0.005964 -  59.6s
[CV] ..... n_estimators=10, max_features=auto, score=0.024449 - 1.1min
[CV] n_estimators=200, max_features=auto .............................
[CV] n_estimators=200, max_features=auto .............................


KeyboardInterrupt: 

In [14]:
print(clf.best_score_)
print(clf.best_params_)

0.006314096941983021
{'n_estimators': 50, 'max_features': 'log2'}


In [19]:
target

array([0, 0, 0, ..., 0, 0, 0])

In [28]:
param_grid = { 
    'n_estimators': [50, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0, class_weight='{0:1, 1:26.43671})'
clf = GridSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=5,n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.011666 - 4.6min
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.003815 - 4.7min
[CV] ..... n_estimators=50, max_features=auto, score=0.023911 - 4.6min
[CV] n_estimators=200, max_features=auto .............................
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.005721 - 4.7min
[CV] n_estimators=200, max_features=auto .............................


KeyboardInterrupt: 

In [20]:
param_grid = { 
    'n_estimators': [50, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0)
clf = GridSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.004013 - 4.8min
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.011637 - 4.8min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.024151 - 4.9min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.005482 - 5.0min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.001698 - 4.7min
[CV] n_estimators

KeyboardInterrupt: 

In [21]:
param_grid = { 
    'n_estimators': [50, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0, class_weight={0: 0.03644752, 1:0.947635485})
clf = GridSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] .... n_estimators=50, max_features=auto, score=-0.005687 - 4.5min
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.011677 - 4.6min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.003818 - 4.8min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.023911 - 4.8min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.002653 - 4.6min
[CV] n_estimators

KeyboardInterrupt: 

In [23]:
wts = [0.947635485 if x==1 else 0.03644752 for x in target]

In [17]:
param_grid = { 
    'n_estimators': [50],
    'class_weight': [{1: 0.03644752, 0:0.947635485}, {1:1, 0:26.43671}],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_ET = ExtraTreesClassifier(random_state=0,  bootstrap=True,
                                oob_score=True, criterion='entropy')
clf = GridSearchCV(model_ET, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5, shuffle=True),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV]  n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto, score=0.003828 - 4.3min
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto 
[CV]  n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto, score=0.003121 - 4.6min
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=sqrt 
[CV]  n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=auto, score=-0.001988 - 4.6min
[CV] n_estimators=50, class_weight={0: 0.947635485, 1: 0.03644752}, max_features=

KeyboardInterrupt: 