In [None]:
"""
01, raw xgb + raw RF, simple average, 0.378522
02, raw xgb + raw RF, rank average, 0.377719
03, raw xgb, 0.381522 (works fine!)
04, raw RF, 0.365994
05, raw xgb + RF, rank multiplication, 0.380349 (works better than I thought)
06, should i try rank min, 0.380458
07, stack RF, Extra, XGB, 0.382054
08, xgb (subsample 0.8), 0.382557
09, (RF, XGB, ET, KNN, L1), 0.3823,  
"""

In [None]:
"""
categorical encoding + knn
xgb features + knn
categorical only + various estimator
numerical only + various estimator
variable selection (xgb, RF) + various estimator
numerical as categorical
second order features
standardize kfold transformer
numerical as categorical + 2nd order + linear
"""

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import xgboost as xgb
import vw_utils as vw
import gini
%matplotlib inline

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

train['train_set'] = 1
test['train_set'] = 0

data = pd.concat([train, test], axis=0).reset_index()

## --------------------- Feature Start -----------------------

In [3]:
# start
categorical = ['T1_V' + str(i) for i in list(range(4, 10)) + [11, 12, 15, 16, 17]] + \
                ['T2_V' + str(i) for i in [3, 5, 11, 12, 13]]

numerical = set(train.columns).difference(categorical + ['Hazard', 'Id'])
numerical = list(numerical)

In [4]:
# raw feature
from sklearn.preprocessing import StandardScaler
import re

SS = StandardScaler()
feat_raw = data[numerical].applymap(float)
feat_raw[numerical] = SS.fit_transform(feat_raw[numerical])


toDrop = re.sub('[ ]+', '_', data[categorical].ix[0, :].to_string()).split('\n')
feat_raw_cat = pd.get_dummies(data[categorical]).drop(toDrop, axis=1)

feat_raw = pd.concat([feat_raw, feat_raw_cat], axis=1)
feat_raw.shape

(101999, 96)

In [5]:
# assemble features

tmp = pd.concat([feat_raw], axis=1)
train_c = tmp[data.train_set==1]
test_c = tmp[data.train_set==0]

print(train_c.shape)
print(test_c.shape)

(50999, 96)
(51000, 96)


## --------------------- Modeling -----------------------

In [None]:
%%time
# stacking
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline, FeatureUnion
from wrappers import Xgbooster, KFoldTransformer, cross_validate
import gini
import copy

RF = KFoldTransformer('reg:RF', K=5, n_estimators=1000, max_features='sqrt', min_samples_leaf=5, n_jobs=-1)

ET = KFoldTransformer('reg:ET', K=5, n_estimators=1000, max_features='sqrt', min_samples_leaf=5, n_jobs=-1)

XGB = KFoldTransformer('reg:XGB', K=5, n_trees=786, objective='reg:linear', max_depth=7, eta=0.01, gamma=0,
                             min_child_weight=30, subsample=1, colsample_bytree=0.5)

KNN1 = KFoldTransformer('reg:KNN', K=5, n_neighbors=1)

KNN16 = KFoldTransformer('reg:KNN', K=5, n_neighbors=16)

KNN64 = KFoldTransformer('reg:KNN', K=5, n_neighbors=64)

L1 = KFoldTransformer('reg:L1', K=5, alpha=1E-5, normalize=True)

features = FeatureUnion([('RF', RF), 
                         ('ET', ET), 
                         ('XGB', XGB), 
                         ('KNN1', KNN1), 
                         ('KNN16', KNN16), 
                         ('KNN64', KNN64),
                         ('L1', L1)])

stacker = Pipeline([('features', features),
                    ('model', Ridge(alpha=0.001, normalize=True))])


# score = cross_validate(stacker, train_c.values, train.Hazard, nfold=5)
# print(score)
stacker.fit(train_c.values, train.Hazard)
yhat = stacker.predict(test_c.values)
pd.DataFrame({'Id': test.Id, 'Hazard': yhat}).reindex_axis(['Id', 'Hazard'], 1).to_csv('../output/ensemble_09.csv', index=0)

In [62]:
%%time
# rank min

import gini
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from scipy.stats import rankdata

param = {'max_depth':7, 'eta':0.005, 'objective':'reg:linear',
         'gamma':0, 'min_child_weight': 30, 'subsample': 0.8,
         'colsample_bytree': 0.5}

dTrain = xgb.DMatrix(train_c, label=train.Hazard)
dTest = xgb.DMatrix(test_c)
# model = xgb.train(param, dTrain, 1901)
yhat_xgb = model.predict(dTest)


RFR = RandomForestRegressor(n_estimators=5000, max_features='sqrt', min_samples_leaf=5, n_jobs=-1)
RFR.fit(train_c, train.Hazard)
yhat_RF = RFR.predict(test_c)

yhat = np.minimum(rankdata(yhat_xgb), rankdata(yhat_RF))
pd.DataFrame({'Id': test.Id, 'Hazard': yhat}).reindex_axis(['Id', 'Hazard'], 1).to_csv('../output/ensemble_06.csv', index=0)

CPU times: user 9min 22s, sys: 16.6 s, total: 9min 39s
Wall time: 2min 29s


In [14]:
%%time
# k fold with xgb

from wrappers import KFoldEstimator, Xgbooster

XGB = Xgbooster(n_trees=786, objective='reg:linear', max_depth=7, eta=0.01, gamma=0,
                             min_child_weight=30, subsample=1, colsample_bytree=0.5)

XGB.fit(train_c ,train.Hazard)
yhat = XGB.predict(test_c)
pd.DataFrame({'Id': test.Id, 'Hazard': yhat}).reindex_axis(['Id', 'Hazard'], 1).to_csv('../output/test_xgbooster.csv', index=0)

CPU times: user 46.3 s, sys: 148 ms, total: 46.5 s
Wall time: 12.9 s


In [44]:
pd.DataFrame({'Id': test.Id, 'Hazard': yhat}).reindex_axis(['Id', 'Hazard'], 1).to_csv('../output/ensemble_07.csv', index=0)

## --------------------- Test Ground -----------------------

In [20]:
%%time
# k fold with xgb

from wrappers import KFoldEstimator
from wrappers import cross_validate

XGB = KFoldEstimator('reg:XGB', K=5, n_trees=786, objective='reg:linear', max_depth=7, eta=0.01, gamma=0,
                             min_child_weight=30, subsample=1, colsample_bytree=0.5)

print(cross_validate(XGB, train_c, train.Hazard, nfold=10))

140480130779848
0.385765017507
CPU times: user 24min 5s, sys: 3.22 s, total: 24min 8s
Wall time: 6min 4s


In [22]:
%%time
# k fold with xgb

from wrappers import KFoldEstimator, Xgbooster
from sklearn.cross_validation import cross_val_score

XGB = Xgbooster(n_trees=786, objective='reg:linear', max_depth=7, eta=0.01, gamma=0,
                             min_child_weight=30, subsample=1, colsample_bytree=0.5)

scores = cross_val_score(XGB, train_c.values, train.Hazard, cv=5, scoring=gini.normalized_gini_score)
print(scores.mean())

KeyError: -1

In [150]:
%%time
# 0.36597
# with sofia: 0.36420
# base: 0.36906
# min sample leaf: 0.36333 (20), 0.36576(10), 0.36464(2)
# max features: sqrt
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score

RFR = RandomForestRegressor(n_estimators=5000, max_features='sqrt', min_samples_leaf=5, n_jobs=-1)
RFR.fit(train_c, train.Hazard)
yhat_RF = RFR.predict(test_c)

# scores = cross_val_score(RFR, train_c, train.Hazard.values, cv=5, scoring=gini.normalized_gini_score, n_jobs=-1)
# print(scores.mean())

CPU times: user 7min 48s, sys: 15.1 s, total: 8min 3s
Wall time: 2min 3s


In [55]:
%%time
# eta 0.01 -> 0.005, not much effect
# subsample 1 -> 0.8, seems to help
# subsample 0.8 -> 0.5, didn't help
# child weight 30 -> 10, not much effect 
# max_depth 7 -> 5, slow
import gini
import xgboost as xgb

param = {'max_depth':7, 'eta':0.005, 'objective':'reg:linear',
         'gamma':0, 'min_child_weight': 30, 'subsample': 0.5,
         'colsample_bytree': 0.5}

dTrain = xgb.DMatrix(train_c, label=train.Hazard)
result = xgb.cv(param, dTrain, 3000, nfold=5, feval=gini.xgb_gini)

[0]	cv-test-gini:0.330612+0.034737	cv-train-gini:0.349319+0.034235
[1]	cv-test-gini:0.298928+0.021097	cv-train-gini:0.323502+0.006383
[2]	cv-test-gini:0.303476+0.012308	cv-train-gini:0.329273+0.008916
[3]	cv-test-gini:0.309294+0.015573	cv-train-gini:0.338216+0.003783
[4]	cv-test-gini:0.313880+0.014661	cv-train-gini:0.343886+0.005282
[5]	cv-test-gini:0.315783+0.015421	cv-train-gini:0.348472+0.005335
[6]	cv-test-gini:0.320639+0.013855	cv-train-gini:0.352414+0.004333
[7]	cv-test-gini:0.319901+0.013254	cv-train-gini:0.353097+0.005938
[8]	cv-test-gini:0.322495+0.012536	cv-train-gini:0.354909+0.005623
[9]	cv-test-gini:0.322875+0.013084	cv-train-gini:0.355418+0.005999
[10]	cv-test-gini:0.326441+0.014230	cv-train-gini:0.358663+0.005244
[11]	cv-test-gini:0.328941+0.014704	cv-train-gini:0.362267+0.005359
[12]	cv-test-gini:0.329353+0.014841	cv-train-gini:0.363368+0.005330
[13]	cv-test-gini:0.330903+0.014886	cv-train-gini:0.365197+0.005738
[14]	cv-test-gini:0.330400+0.015170	cv-train-gini:0.364874

CPU times: user 18min 36s, sys: 16.5 s, total: 18min 52s
Wall time: 6min 8s


In [8]:
%%time
# 0.347507(5), 0.33994(20)
# 
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import cross_val_score

RFR = ExtraTreesRegressor(n_estimators=2000, max_features='sqrt', min_samples_leaf=5, n_jobs=-1)
scores = cross_val_score(RFR, train_c, train.Hazard.values, cv=5, scoring=gini.normalized_gini_score, n_jobs=-1)
print(scores.mean())

0.347638400841
CPU times: user 452 ms, sys: 300 ms, total: 752 ms
Wall time: 4min 56s


## --------------------- Factory -----------------------

In [151]:
from scipy.stats import rankdata

yhat = (rankdata(yhat_xgb)*rankdata(yhat_RF))
pd.DataFrame({'Id': test.Id, 'Hazard': yhat}).reindex_axis(['Id', 'Hazard'], 1).to_csv('../output/ensemble_05.csv', index=0)

In [18]:
%%time
# sofia feature
from sofia_utils import sofia_kmeans

n_clusters = 100

sofia = sofia_kmeans(n_clusters=n_clusters, iterations=1000, mapping_threshold=0.0001)

feat_sofia = sofia.fit_transform(data[numerical])

feature_names = ['sofia_'+str(i+1) for i in range(n_clusters)]
feat_sofia = pd.DataFrame(feat_sofia, columns=feature_names)
print(feat_sofia.shape)

from sklearn.preprocessing import MinMaxScaler
MMS = MinMaxScaler()
feat_sofia.ix[:, :] = MMS.fit_transform(feat_sofia.values)

extract percent:  853192.0
sparcity:  0.695033872881
(101999, 100)
CPU times: user 6.6 s, sys: 1.11 s, total: 7.71 s
Wall time: 14.5 s


In [6]:
%%time
from sklearn.preprocessing import PolynomialFeatures

PF = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
train_c = PF.fit_transform(train_c)
print(train_c.shape)

(50999, 6216)
CPU times: user 6.03 s, sys: 2.56 s, total: 8.59 s
Wall time: 8.59 s


In [75]:
%%time
# kmeans features

from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

SS = StandardScaler()

kmeans = KMeans(80, random_state=12345, n_init=1)
feat_kmeans_train = kmeans.fit_transform(SS.fit_transform(train[numerical]))
feat_kmeans_test = kmeans.transform(SS.transform(test[numerical]))

# print('variance lost:')
# print(kmeans.inertia_ / KMeans(1, random_state=12345, n_init=1).fit(train[numerical]).inertia_)

feat_kmeans_train = pd.DataFrame(feat_kmeans_train).add_prefix('kmeans_')
feat_kmeans_test = pd.DataFrame(feat_kmeans_test).add_prefix('kmeans_')

numerical += feat_kmeans_train.columns.values.tolist()
train_c = pd.concat([train_c, feat_kmeans_train], axis=1)
test_c = pd.concat([test_c, feat_kmeans_test], axis=1)

CPU times: user 14.5 s, sys: 4 ms, total: 14.5 s
Wall time: 14.5 s


  "got %s" % (estimator, X.dtype))


In [49]:
# count features

feat_count_train = train[categorical].copy()
feat_count_test = test[categorical].copy()

for cat in categorical:
    numerical.append('cnt_'+cat)
    hotDeck = train[cat].value_counts().reset_index().rename(columns={'index': cat, 0: 'cnt_'+cat})
    feat_count_train = pd.merge(feat_count_train, hotDeck, on=cat)
    del feat_count_train[cat]
    
    feat_count_test = pd.merge(feat_count_test, hotDeck, on=cat)
    del feat_count_test[cat]
    
train_c = pd.concat([train_c, feat_count_train], axis=1)
test_c = pd.concat([test_c, feat_count_test], axis=1)

In [53]:
# categorical encoding
from sklearn.preprocessing import StandardScaler

feat_encode_train = train[categorical].copy()
feat_encode_test = test[categorical].copy()

for cat in categorical:
    hot_deck = train.groupby(cat).Hazard.mean().reset_index().rename(columns={'Hazard': 'encode_'+cat})
    feat_encode_train = pd.merge(feat_encode_train, hot_deck, on=cat)
    del feat_encode_train[cat]
    
    feat_encode_test = pd.merge(feat_encode_test, hot_deck, on=cat)
    del feat_encode_test[cat]
    
names = feat_encode_train.columns
ss = StandardScaler()
feat_encode_train[names] = ss.fit_transform(feat_encode_train.values)
feat_encode_test[names] = ss.transform(feat_encode_test.values)

encode_T1_V4     1.011131
encode_T1_V5     1.003089
encode_T1_V6     0.999460
encode_T1_V7     1.025445
encode_T1_V8     0.985815
encode_T1_V9     1.000703
encode_T1_V11    1.001235
encode_T1_V12    0.987670
encode_T1_V15    1.007230
encode_T1_V16    1.005126
encode_T1_V17    1.003132
encode_T2_V3     0.998128
encode_T2_V5     1.001219
encode_T2_V11    1.002282
encode_T2_V12    1.004119
encode_T2_V13    0.994556
dtype: float64

In [81]:
%%time
# reduce
from xgboost import XGBRegressor
import xgboost as xgb

feat_reduce_train = pd.DataFrame()
feat_reduce_test = pd.DataFrame()

for num in numerical:
    dTmp = xgb.DMatrix(train[[num]], label=train.Hazard)
    reg = xgb.train({'max_depth':8, 'min_child_weight':30, 'objective':'reg:linear'}, dTmp, 1)
    
    feat_reduce_train['discrete_'+num] = reg.predict(dTmp, pred_leaf=True)
    feat_reduce_train['discrete_'+num] = feat_reduce_train['discrete_'+num].map(str)
    
    feat_reduce_test['discrete_'+num] = reg.predict(xgb.DMatrix(test[[num]]), pred_leaf=True)
    feat_reduce_test['discrete_'+num] = feat_reduce_test['discrete_'+num].map(str)

Wall time: 6.86 s


In [73]:
# discrete

feat_discrete_train = train[numerical].applymap(str)
feat_discrete_test = test[numerical].applymap(str)

In [88]:
# combine all features
cTrain = train[numerical+categorical]
cTest = test[numerical+categorical]

# cTrain = pd.concat([train[numerical+categorical], feat_reduce_train], axis=1)
# cTest = pd.concat([test[numerical+categorical], feat_reduce_test], axis=1)