In [None]:
"""
00, base, 0.33197
01, cap .99, 0.33311
02, cap .95, 0.33367
03, cap .95 + AB, 0.34576, 0.335084
04, cap .99 + AB + count, 0.34279
05, cap .95 + AB + count, 0.34279, 0.331620
"""

In [None]:
"""
categorical only + various estimators
numerical only + various estimators
variable selection (xgb, RF) + various estimators
numerical as categorical
second order features
standardize kfold transformer
numerical as categorical + 2nd order + linear
"""

In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import xgboost as xgb
import vw_utils as vw
import gini
%matplotlib inline

In [1200]:
# Load the raw train/test tables (paths are relative to the notebook's directory).
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

# Quantile of the target used as an upper cap; kept as a named constant so it is easy to vary.
CAP_QUANTILE = 0.95
cap = train.Hazard.quantile(CAP_QUANTILE)
print('Cap:', cap)

# Clip the target from above at `cap`; the raw `Hazard` column is left untouched.
train['Hazard_cap'] = np.minimum(train.Hazard, cap)

# Stack train on top of test so feature transforms are fitted on both sets together.
# drop=True: a bare reset_index() would keep the old row labels as a spurious 'index' column.
data = pd.concat([train, test], axis=0).reset_index(drop=True)
# Row marker aligned with `data`: 1.0 for train rows, 0.0 for test rows.
train_set = np.hstack([np.ones(train.shape[0]), np.zeros(test.shape[0])])

Cap: 12.0


## --------------------- Feature Start -----------------------

In [43]:
# start
# Names of the columns treated as categorical in the T1_* and T2_* families.
categorical = ['T1_V' + str(i) for i in list(range(4, 10)) + [11, 12, 15, 16, 17]] + \
              ['T2_V' + str(i) for i in [3, 5, 11, 12, 13]]

# Every remaining train column that is not a target or the row id is numerical.
# Filter in column order instead of taking a set difference: iteration order of a
# set of strings changes between kernel restarts (hash randomization), which made
# the feature order non-reproducible.
excluded = set(categorical + ['Hazard', 'Hazard_cap', 'Id'])
numerical = [col for col in train.columns if col not in excluded]

In [1012]:
# raw feature
from sklearn.preprocessing import StandardScaler
import re

# Standardize the numerical columns, fitting the scaler on train and test rows together.
SS = StandardScaler()
# astype(float) is the vectorized equivalent of the element-wise applymap(float).
feat_raw = data[numerical].astype(float)
feat_raw[numerical] = SS.fit_transform(feat_raw[numerical])


# One '<column>_<value>' label per categorical column, built from the first row of `data`.
# Only referenced by the commented-out get_dummies line below.
# .iloc replaces .ix, which was removed in pandas 1.0; row 0 is the same row either way
# because `data` carries a fresh 0..n-1 index.
toDrop = re.sub('[ ]+', '_', data[categorical].iloc[0].to_string()).split('\n')
# feat_raw_cat = pd.get_dummies(data[categorical]).drop(toDrop, axis=1)

# Scaled numerical columns followed by the untouched categorical columns.
feat_raw = pd.concat([feat_raw, data[categorical]], axis=1)
feat_raw.shape

(101999, 32)

In [1180]:
%%time
# count features
from wrappers import generatePrimes, calPowerCount
from sklearn.preprocessing import StandardScaler

feat_count = pd.concat([calPowerCount(data[categorical], 1), 
                        calPowerCount(data[categorical], 2)], axis=1)

cols = feat_count.columns
feat_count = pd.DataFrame(StandardScaler().fit_transform(feat_count.values), columns=cols)

CPU times: user 1.9 s, sys: 140 ms, total: 2.04 s
Wall time: 2.04 s


  "got %s" % (estimator, X.dtype))


In [1207]:
# assemble features

# Join raw and count features column-wise, then split the rows back into train / test
# using the 1/0 marker built when the two tables were stacked.
tmp = pd.concat([feat_raw, feat_count], axis=1)
is_train = train_set == 1
train_c, test_c = tmp[is_train], tmp[~is_train]

for part in (train_c, test_c):
    print(part.shape)

(50999, 184)
(51000, 184)


## --------------------- Modeling -----------------------

In [1206]:
%%time
from wrappers import VWRegressor
from sklearn.cross_validation import KFold

# namespaces = {'A': [x for x in train_c.columns if x.startswith('T1')],
#               'B': [x for x in train_c.columns if x.startswith('T2')]}

# namespaces = {'A': [col for col, dtype in zip(train_c.columns, train_c[train_c.columns].dtypes) 
#                         if col.startswith('T') and dtype=='object'],
#               'B': [col for col, dtype in zip(train_c.columns, train_c[train_c.columns].dtypes) 
#                         if col.startswith('T') and dtype!='object'],
#               'C': [col for col in train_c.columns if col.startswith('count')]}

namespaces = {'A': [col for col, dtype in zip(train_c.columns, train_c[train_c.columns].dtypes) 
                        if col.startswith('T') and dtype=='object'],
              'B': [col for col, dtype in zip(train_c.columns, train_c[train_c.columns].dtypes)
                        if col.startswith('T') and dtype!='object']}


scores = []
vw = VWRegressor(passes=10, l2=1E-6, l1=1E-6, bit=20, interaction='AB', namespaces=namespaces)
for i, (idx_train, idx_test) in enumerate(KFold(train.shape[0], 10)):
    print(i)
    vw.fit(train_c.ix[idx_train, :], train.Hazard[idx_train])
    pred = vw.predict(train_c.ix[idx_test, :])
    scores.append(gini.normalized_gini(train.Hazard[idx_test], pred))
    
print(np.mean(scores))

0
1
2
3
4
5
6
7
8
9
0.344305706189
CPU times: user 25.2 s, sys: 996 ms, total: 26.2 s
Wall time: 45.6 s


In [1208]:
from wrappers import VWRegressor

# Namespaces for the final model:
#   'A' = object-dtype 'T*' columns, 'B' = other 'T*' columns, 'C' = 'count*' columns.
namespaces = {'A': [col for col, dtype in zip(train_c.columns, train_c.dtypes)
                        if col.startswith('T') and dtype == 'object'],
              'B': [col for col, dtype in zip(train_c.columns, train_c.dtypes)
                        if col.startswith('T') and dtype != 'object'],
              'C': [col for col in train_c.columns if col.startswith('count')]}

# Fit on all training rows and predict the test rows.
# NOTE(review): this configuration (passes=20, with namespace 'C') differs from the
# cross-validation cell (passes=10, no 'C'), so that CV score does not describe this model.
# NOTE(review): the target is the raw `Hazard`, not `Hazard_cap` — confirm this is intended.
vw = VWRegressor(passes=20, l2=1E-6, l1=1E-6, bit=20, interaction='AB', namespaces=namespaces)
vw.fit(train_c, train.Hazard)
yhat = vw.predict(test_c)

# Write the submission with columns in (Id, Hazard) order and no index column.
# Plain column selection replaces reindex_axis, which was removed in pandas 1.0.
submission = pd.DataFrame({'Id': test.Id, 'Hazard': yhat})[['Id', 'Hazard']]
submission.to_csv('../output/linear_v2_05.csv', index=False)

In [1209]:
# Re-write the submission file from the current `yhat` (same target path as the fit cell).
# Plain column selection replaces reindex_axis, which was removed in pandas 1.0.
pd.DataFrame({'Id': test.Id, 'Hazard': yhat})[['Id', 'Hazard']].to_csv('../output/linear_v2_05.csv', index=False)

## factory

In [7]:
%%time
from sklearn.linear_model import ElasticNet
from sklearn.cross_validation import KFold

model = ElasticNet(alpha=1E-3, l1_ratio=0.5, max_iter=2000)
scores = []
time = 1
for idx_train, idx_test in KFold(train.shape[0], 5):
    print(time)
    time += 1    
    model.fit(train_c.ix[idx_train, :], train.Hazard[idx_train])
    pred = model.predict(train_c.ix[idx_test, :])
    scores.append(gini.normalized_gini(train.Hazard[idx_test], pred))

print(np.mean(scores))

1
2
3
4
5
0.330865678696
CPU times: user 15min 54s, sys: 636 ms, total: 15min 55s
Wall time: 15min 55s


