In [15]:
import re
import numpy as np
import pandas as pd
import feather
import xgboost as xgb
import feather

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc
from scipy.stats import randint as sp_randint
from sklearn.metrics import make_scorer
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
from sklearn.cross_validation import StratifiedKFold

from sklearn.utils import shuffle

import pickle



In [16]:
# Compute gini

# from CPMP's kernel https://www.kaggle.com/cpmpml/extremely-fast-gini-computation
@jit
def eval_gini(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_true)
    for i in range(n-1, -1, -1):
        y_i = y_true[i]
        ntrue += y_i
        gini += y_i * delta
        delta += 1 - y_i
    gini = 1 - 2 * gini / (ntrue * (n - ntrue))
    return gini

# Funcitons from olivier's kernel
# https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = -eval_gini(labels, preds)
    return [('gini', gini_score)]

def gini_normalized(a, p):
    return eval_gini(a, p) / eval_gini(a, a)

gini_scorer = make_scorer(gini_normalized, greater_is_better = True)

In [4]:
df = pd.read_csv('../cache/train_labels.csv')
target = df['y'].values

In [6]:
trn_df = feather.read_dataframe('../cache/trn_df.feather')
sub_df = feather.read_dataframe('../cache/sub_df.feather')

In [8]:
trn_df.shape

(595212, 227)

In [9]:
sub_df.shape

(892816, 432)

In [11]:
sub_df = trn_df[trn_df.columns]

In [12]:
sub_df.shape

(595212, 227)

In [22]:
param_grid = { 
    'n_estimators': [50, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_RF = RandomForestClassifier(random_state=0, class_weight='balanced')
clf = GridSearchCV(model_RF, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.024264 -19.9min
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.013581 -20.1min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.003667 -20.5min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.006572 -20.6min
[CV] n_estimators=200, max_features=auto .............................


KeyboardInterrupt: 

In [23]:
model_RF = RandomForestClassifier(random_state=0, class_weight='balanced_subsample')
clf = GridSearchCV(model_RF, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.006716 -20.2min
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.014143 -20.2min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.024217 -20.3min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.002242 -20.5min
[CV] n_estimators=200, max_features=auto .............................


KeyboardInterrupt: 

In [25]:
1/27.43671 * 26

0.9476354854499682

In [26]:
param_grid = { 
    'n_estimators': [50, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_RF = RandomForestClassifier(random_state=0, class_weight={0: 0.03644752, 1:0.947635485})
clf = GridSearchCV(model_RF, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.024303 -18.4min
[CV] n_estimators=50, max_features=auto ..............................
[CV] ..... n_estimators=50, max_features=auto, score=0.015010 -18.7min
[CV] n_estimators=200, max_features=auto .............................
[CV] ..... n_estimators=50, max_features=auto, score=0.005458 -18.9min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=-0.002791 -18.7min
[CV] n_estimators=200, max_features=auto .............................


KeyboardInterrupt: 

In [29]:
wts = [0.947635485 if x==1 else 0.03644752 for x in target]

In [39]:
param_grid = { 
    'n_estimators': [50, 200],
    'max_features': ['auto', 'sqrt', 'log2']
}

model_RF = RandomForestClassifier(random_state=0)
clf = GridSearchCV(model_RF, param_grid, scoring=gini_scorer,
                         cv=StratifiedKFold(target, 5, shuffle=True),n_jobs=4, verbose=3, refit=True)
clf.fit(trn_df, target)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] n_estimators=50, max_features=auto ..............................
[CV] .... n_estimators=50, max_features=auto, score=0.295260 -161.6min
[CV] n_estimators=50, max_features=auto ..............................
[CV] .... n_estimators=50, max_features=auto, score=0.287638 -161.8min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=0.269623 -162.2min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=0.309283 -162.5min
[CV] n_estimators=200, max_features=auto .............................
[CV] .... n_estimators=50, max_features=auto, score=0.289041 -184.2min
[CV] n_estimators

[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 1388.4min finished


GridSearchCV(cv=sklearn.cross_validation.StratifiedKFold(labels=[0 0 ..., 0 0], n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=4,
       param_grid={'n_estimators': [50, 200], 'max_features': ['auto', 'sqrt', 'log2']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=make_scorer(gini_normalized), verbose=3)

In [40]:
print(clf.best_score_)
print(clf.best_params_)

0.29140959291613056
{'n_estimators': 200, 'max_features': 'auto'}


In [45]:
with open('../cache/idx_info.pkl', 'rb') as f:
    idx_info= pickle.load(f)
n_splits = 5
n_estimators = 200
#folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=15) 
imp_df = np.zeros((len(trn_df.columns), n_splits))
rf_evals = np.zeros((n_estimators, n_splits))
oof_rf = np.empty(len(trn_df))
np.random.seed(0)
sub_df = feather.read_dataframe('../cache/sub_df.feather')
sub_df = sub_df[trn_df.columns]
sub_df.fillna(-1, inplace=True)
print(sub_df.shape)
sub_preds = np.zeros(len(sub_df))
tgt = df['y']

(892816, 227)


In [46]:
for fold_ in range(len(idx_info)):
    l = idx_info[fold_]
    trn_idx = l[0]
    val_idx = l[1]
    trn_dat, trn_tgt = trn_df.iloc[trn_idx], tgt.iloc[trn_idx]
    val_dat, val_tgt = trn_df.iloc[val_idx], tgt.iloc[val_idx]
    
    pos = sum(trn_tgt==0)/sum(trn_tgt==1)
    wts = [pos if x==1 else 1 for x in trn_tgt]
    clf = RandomForestClassifier(random_state=1337, n_estimators=200, class_weight={0:1, 1:pos},
                              max_features='auto')
    clf.fit(trn_dat, trn_tgt, sample_weight=wts)
    
    # Predict OOF and submission probas with the best round
    oof_rf[val_idx] = clf.predict_proba(val_dat)[:, 1]
    # Update submission
    
    sub_preds += clf.predict_proba(sub_df)[:, 1] / n_splits
    
    # Display results
    print("Fold %2d : %.6f @%4d "
          % (fold_ + 1,
             eval_gini(val_tgt, oof_rf[val_idx]),
             n_estimators
             ))

Fold  1 : 0.352176 @ 200 
Fold  2 : 0.338913 @ 200 
Fold  3 : 0.362738 @ 200 
Fold  4 : 0.344359 @ 200 
Fold  5 : 0.356827 @ 200 


In [47]:
sub_preds = np.clip(sub_preds, a_min=0.05, a_max=0.95)

In [50]:
from datetime import datetime
np.save('../cache/oof_rf', oof_rf)
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../cache/model.rf.{}'.format(now)
with open(fn, 'wb') as f:
    pickle.dump(clf, f)

In [51]:
df = pd.read_csv('../cache/test_id.csv')

In [52]:
sub_df["target"] = sub_preds
sub_df['id'] = df['id'].values
now = datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
fn = '../submissions/sub.rf.{}'.format(now)
sub_df[['id', "target"]].to_csv(fn, index=False, float_format="%.9f")