In [1]:
import pandas as pd
# import xgboost as xgb
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
import random
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, make_scorer
from xgboost import XGBClassifier



In [2]:
df = pd.read_csv('train.csv', sep=';')
bins = [0,90,100,110, 120, 130, 140, 150, 160, 170, 180, 250, 100000]
df['ap_hi'] = pd.cut(df['ap_hi'].values, bins, labels=False)
lo_bins = [0,40,50,60,70,80,90,100,110,120,100000]
df['ap_lo'] = pd.cut(df['ap_lo'].values, lo_bins, labels=False)
df['ap_hi'].fillna(3, inplace = True)
df['ap_lo'].fillna(4, inplace = True)

In [37]:
# age - as it is
# gender - one hot
# height - as it is
# weight - as is
# ap_hi - category by 10
# ap_lo - category by 10
df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [3]:
def preprocess(X, squares = False):
    xlen = X.shape[0]
    imb = X[:,3] / ((X[:,2]/100.0)**2)
    imb = imb.reshape((len(imb),1))
    X = np.hstack((X, imb))
    X[:,0] = (X[:,0].astype(int) / int(365)).astype(int) #number of years instead of days
    gender = X[:,1].astype(int) - 1
    cholesterol = X[:,6].astype(int) - 1
    gluc = X[:,7].astype(int) - 1
    ap_hi = X[:,4].astype(np.int64)
    ap_lo = X[:,5].astype(np.int64)
    gender_oh = np.zeros((xlen, 2)).astype(int)
    cholesterol_oh = np.zeros((xlen, 3)).astype(int)
    gluc_oh = np.zeros((xlen, 3)).astype(np.int64)
    ap_hi_oh = np.zeros((xlen,12)).astype(np.int64)
    ap_lo_oh = np.zeros((xlen,10)).astype(np.int64)
    gender_oh[np.arange(xlen), gender[np.arange(xlen)] - 1] = 1
    cholesterol_oh[np.arange(xlen), cholesterol[np.arange(xlen)] - 1] = 1
    gluc_oh[np.arange(xlen), gluc[np.arange(xlen)] - 1] = 1
    ap_hi_oh[np.arange(xlen), ap_hi[np.arange(xlen)]] = 1
    ap_lo_oh[np.arange(xlen), ap_lo[np.arange(xlen)]] = 1
    X = np.hstack((X, gender_oh))
    X = np.hstack((X, cholesterol_oh))
    X = np.hstack((X, gluc_oh))
    X = np.hstack((X, ap_hi_oh))
    X = np.hstack((X, ap_lo_oh))
    if (squares):
        counter = 0
        XX = np.zeros((X.shape[0], int((X.shape[1] * (X.shape[1] + 1)) / 2) + X.shape[1]))
        for i in range(X.shape[1]):
            for j in range(i + 1):
                XX[:,counter] = X[:,i] * X[:,j]
                counter = counter + 1
        XX[:,counter:] = X
        X = XX;
    return X

In [4]:
X_short = df.values[:,1:-4]
X_all = df.values[:,1:-1]
y = df.values[:,-1]
indexes = list(range(len(y)))
random.shuffle(indexes)
X_short = X_short[indexes]
X_all = X_all[indexes]
y = y[indexes]
X_short = preprocess(X_short, squares=False)
X_all = preprocess(X_all, squares=False)

In [5]:
X_short_train, X_short_test, y_short_train, y_short_test = train_test_split(X_short, y, test_size=0.3)
X_all_train, X_all_test, y_all_train, y_all_stest = train_test_split(X_all, y, test_size=0.3)

In [76]:
clf_short = XGBClassifier(max_depth = 8, nthread = 36,
                    n_estimators = 10000, learning_rate =0.001,
                    colsample_bytree = 0.8, colsample_bylevel = 0.8)
clf_short.fit(X_short_train, y_short_train,
        eval_set = [(X_short_train, y_short_train), (X_short_test, y_short_test)], eval_metric = 'auc',
        early_stopping_rounds= 10, verbose = True)

[0]	validation_0-auc:0.799293	validation_1-auc:0.793218
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.803258	validation_1-auc:0.795023
[2]	validation_0-auc:0.807494	validation_1-auc:0.797366
[3]	validation_0-auc:0.806351	validation_1-auc:0.795587
[4]	validation_0-auc:0.808311	validation_1-auc:0.796968
[5]	validation_0-auc:0.809023	validation_1-auc:0.797732
[6]	validation_0-auc:0.809421	validation_1-auc:0.798278
[7]	validation_0-auc:0.809342	validation_1-auc:0.797955
[8]	validation_0-auc:0.809713	validation_1-auc:0.798424
[9]	validation_0-auc:0.810152	validation_1-auc:0.798849
[10]	validation_0-auc:0.809919	validation_1-auc:0.798933
[11]	validation_0-auc:0.810092	validation_1-auc:0.798991
[12]	validation_0-auc:0.81036	validation_1-auc:0.799001
[13]	validation_0-auc:0.810327	validation_1-auc:0.799014
[14]	validation_0-auc:0.810392	validation_1-auc:0.799084

XGBClassifier(base_score=0.5, colsample_bylevel=0.8, colsample_bytree=0.8,
       gamma=0, learning_rate=0.001, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=36,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [96]:
clf_all = XGBClassifier(max_depth = 4, nthread = 36,
                    n_estimators = 10000, learning_rate =0.05)
clf_all.fit(X_all_train, y_all_train,
        eval_set = [(X_all_train, y_all_train), (X_all_test, y_all_stest)], eval_metric = 'auc',
        early_stopping_rounds= 10, verbose = True)

[0]	validation_0-auc:0.787858	validation_1-auc:0.790062
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.787858	validation_1-auc:0.790062
[2]	validation_0-auc:0.787858	validation_1-auc:0.790062
[3]	validation_0-auc:0.788679	validation_1-auc:0.790961
[4]	validation_0-auc:0.791615	validation_1-auc:0.792703
[5]	validation_0-auc:0.792688	validation_1-auc:0.794167
[6]	validation_0-auc:0.793737	validation_1-auc:0.795847
[7]	validation_0-auc:0.793965	validation_1-auc:0.796014
[8]	validation_0-auc:0.793992	validation_1-auc:0.796062
[9]	validation_0-auc:0.794595	validation_1-auc:0.796745
[10]	validation_0-auc:0.79485	validation_1-auc:0.796594
[11]	validation_0-auc:0.795552	validation_1-auc:0.797472
[12]	validation_0-auc:0.795733	validation_1-auc:0.797598
[13]	validation_0-auc:0.795994	validation_1-auc:0.797883
[14]	validation_0-auc:0.795835	validation_1-auc:0.797859

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=36,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [69]:
clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, max_depth=4, max_features='sqrt', criterion='entropy')
# scores = cross_val_score(clf, X, y, cv=5)
clf.fit(X_train, y_train)
y_pred = clf.predict_proba(X_test)[:,1]
y_tpred = clf.predict_proba(X_train)[:,1]
print(roc_auc_score(y_train, y_tpred))
print(roc_auc_score(y_test, y_pred))

0.800134615551
0.796223156965


In [36]:
scoring = make_scorer(roc_auc_score, average='weighted')
scoring(clf, X_test, y_test)

0.73696370004568934

In [28]:
scores = cross_val_score(clf, X, y, cv=5)
print(scores)

[ 0.73566174  0.73392857  0.73485714  0.7325      0.73369526]


In [59]:
clf = RandomForestClassifier(n_estimators=100, n_jobs=-1)
scores = cross_val_score(clf, XX, y, cv=5)
print(scores)

In [70]:
cv = KFold(3)
grid = GridSearchCV(RandomForestClassifier(), param_grid={'n_estimators': [100,200,500],
                                                          'max_depth': [4,6,8,12],
                                                          'max_features': ['auto', 'log2', 'sqrt', None],
                                                          'criterion': ['entropy', 'gini'],
                                                          'n_jobs': [4]}, 
                    n_jobs=9, 
                    scoring='roc_auc',
                    cv=cv,
                   verbose = 1)

In [71]:
grid.fit(X, y)

Fitting 3 folds for each of 96 candidates, totalling 288 fits


[Parallel(n_jobs=9)]: Done  32 tasks      | elapsed:   47.8s
[Parallel(n_jobs=9)]: Done 182 tasks      | elapsed: 10.4min
[Parallel(n_jobs=9)]: Done 288 out of 288 | elapsed: 19.3min finished


GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=9,
       param_grid={'criterion': ['entropy', 'gini'], 'max_features': ['auto', 'log2', 'sqrt', None], 'n_jobs': [4], 'n_estimators': [100, 200, 500], 'max_depth': [4, 6, 8, 12]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='roc_auc', verbose=1)

In [72]:
print(grid.best_params_)
print(grid.best_score_)

{'criterion': 'entropy', 'max_features': 'sqrt', 'max_depth': 8, 'n_estimators': 200, 'n_jobs': 4}
0.801359773449


In [6]:
cv = KFold(3)
param_grid={'n_estimators':[10000], 'max_depth': [4,6,8,10,12], 
            'learning_rate': [0.01, 0.05, 0.001, 0.005], 'nthread': [6],
            'colsample_bytree': [0.2,0.4,0.6,0.8], 'colsample_bylevel': [0.2,0.4,0.6,0.8],
            'subsample': [0.2,0.4,0.6,0.8]}
fit_params = {'eval_set': [(X_short_test, y_short_test)], 'eval_metric': 'auc',
              'early_stopping_rounds': 10, 'verbose': False}
xgb_grid = GridSearchCV(XGBClassifier(), 
                        param_grid=param_grid,
                        fit_params = fit_params,
                        n_jobs=6,
                        scoring='roc_auc',
                        cv=cv,
                        verbose = 1)

In [7]:
xgb_grid.fit(X_short_train, y_short_train)

Fitting 3 folds for each of 1280 candidates, totalling 3840 fits


[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:    3.1s
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:   13.8s
[Parallel(n_jobs=6)]: Done 438 tasks      | elapsed:   39.8s
[Parallel(n_jobs=6)]: Done 788 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done 1238 tasks      | elapsed:  2.1min
[Parallel(n_jobs=6)]: Done 1788 tasks      | elapsed:  3.4min
[Parallel(n_jobs=6)]: Done 2438 tasks      | elapsed:  4.6min
[Parallel(n_jobs=6)]: Done 3188 tasks      | elapsed:  6.5min
[Parallel(n_jobs=6)]: Done 3840 out of 3840 | elapsed:  8.4min finished


GridSearchCV(cv=KFold(n_splits=3, random_state=None, shuffle=False),
       error_score='raise',
       estimator=XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={'eval_set': [(array([[ 55.,   1., ...,   0.,   0.],
       [ 53.,   2., ...,   0.,   0.],
       ...,
       [ 62.,   1., ...,   0.,   0.],
       [ 41.,   1., ...,   0.,   0.]]), array([ 0.,  1., ...,  1.,  0.]))], 'verbose': False, 'early_stopping_rounds': 10, 'eval_metric': 'auc'},
       iid=True, n_jobs=6,
       param_grid={'nthread': [6], 'colsample_bylevel': [0.2, 0.4, 0.6, 0.8], 'max_depth': [4, 6, 8, 10, 12], 'colsample_bytree': [0.2, 0.4, 0.6, 0.8], 'learning_rate': [0.01, 0.05, 0.001, 0.005], 'n_estimators': [10000],

In [8]:
print(xgb_grid.best_params_)
print(xgb_grid.best_score_)

{'nthread': 6, 'colsample_bylevel': 0.6, 'max_depth': 4, 'colsample_bytree': 0.8, 'learning_rate': 0.05, 'n_estimators': 10000, 'subsample': 0.6}
0.801628065561


In [42]:
# make age, holesterol, gluc one-hot
# use xgboost!!!
# add smoke, alco, active processing
df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [27]:
def make_submission(test, clf):
    Z = test[['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc']].values
    Z = preprocess(Z, squares=True)
    return clf.predict_proba(Z)[:,1]

In [32]:
df_test = pd.read_csv('test.csv', sep=';')
# clf = RandomForestClassifier(**grid.best_params_)
# clf.fit(X, y)
subm = make_submission(df_test, clf)
np.savetxt("submission.csv", subm)

In [29]:
y_pred = clf.predict_proba(X_test)[:,1]
roc_auc_score(y_test, y_pred)

0.802495060758458

In [47]:
np_test = np.genfromtxt('test.csv', delimiter=';')

In [51]:
np_test[2,:]

array([  6.00000000e+00,   1.90420000e+04,   2.00000000e+00,
         1.70000000e+02,   6.90000000e+01,   1.30000000e+02,
         9.00000000e+01,   1.00000000e+00,   1.00000000e+00,
                    nan,   0.00000000e+00,   1.00000000e+00])

In [30]:
clf.fit(X, y)

XGBClassifier(base_score=0.5, colsample_bylevel=0.5, colsample_bytree=0.2,
       gamma=0, learning_rate=0.05, max_delta_step=0, max_depth=4,
       min_child_weight=1, missing=None, n_estimators=10000, nthread=36,
       objective='binary:logitraw', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.2)

In [36]:
test_vals = df_test.values

In [41]:
test_vals = test_vals.astype(int)

ValueError: invalid literal for int() with base 10: 'None'

In [60]:
test_vals_nn2[1:,:]
    

array([[  6.00000000e+00,   1.90420000e+04,   2.00000000e+00, ...,
                     nan,   0.00000000e+00,   1.00000000e+00],
       [  1.00000000e+01,   1.81330000e+04,   2.00000000e+00, ...,
                     nan,   0.00000000e+00,   1.00000000e+00],
       [  1.90000000e+01,   1.58730000e+04,   2.00000000e+00, ...,
          0.00000000e+00,              nan,              nan],
       ..., 
       [  9.99820000e+04,   1.89810000e+04,   1.00000000e+00, ...,
          0.00000000e+00,              nan,   0.00000000e+00],
       [  9.99840000e+04,   2.17220000e+04,   2.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,              nan],
       [  9.99970000e+04,   1.60350000e+04,   2.00000000e+00, ...,
                     nan,   0.00000000e+00,   1.00000000e+00]])

In [98]:
test_all = np_test[~np.isnan(np_test).any(axis=1)]
test_short = (np_test[np.isnan(np_test).any(axis=1)])
ids_all = test_all[:,0]
ids_short = test_short[1:,0]
test_all = preprocess(test_all[:,1:])
test_short = preprocess(test_short[1:,1:])

In [99]:
y_pred_short = clf_short.predict_proba(test_short[:,:-3])[:,1]
y_pred_all = clf_all.predict_proba(test_all)[:,1]

In [100]:
y_pred = list(zip(ids_all, y_pred_all)) + list(zip(ids_short, y_pred_short))
y_pred = sorted(y_pred, key=lambda x: x[0])
y_pred = [x[1] for x in y_pred]
np.savetxt("submission.csv", y_pred)

In [106]:
clf.feature_importances_

array([ 0.01596517,  0.00798258,  0.00145138,  0.02830189,  0.00362845,
        0.00943396,  0.02177068,  0.01596517,  0.01088534,  0.01161103,
        0.04499274,  0.01596517,  0.02902758,  0.01161103,  0.01523948,
        0.02539913,  0.00435414,  0.01886792,  0.00870827,  0.01814223,
        0.00798258,  0.00943396,  0.00217707,  0.01015965,  0.01306241,
        0.01959361,  0.00870827,  0.00362845,  0.01959361,  0.00145138,
        0.01451379,  0.0065312 ,  0.01669086,  0.00507983,  0.00290276,
        0.00290276,  0.01306241,  0.01233672,  0.02104499,  0.01959361,
        0.02612482,  0.02177068,  0.02975327,  0.01161103,  0.0203193 ,
        0.00580552,  0.        ,  0.00507983,  0.00725689,  0.00145138,
        0.00362845,  0.00072569,  0.        ,  0.01161103,  0.        ,
        0.01233672,  0.        ,  0.01015965,  0.00580552,  0.01306241,
        0.00798258,  0.00072569,  0.        ,  0.00798258,  0.        ,
        0.        ,  0.00217707,  0.00145138,  0.00290276,  0.00

In [6]:
clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, max_depth=4, max_features='sqrt', criterion='entropy')
# scores = cross_val_score(clf, X, y, cv=5)
clf.fit(X_all_train, y_all_train)
y_pred = clf.predict_proba(X_all_test)[:,1]
y_tpred = clf.predict_proba(X_all_train)[:,1]
print(roc_auc_score(y_all_train, y_tpred))
print(roc_auc_score(y_all_stest, y_pred))

0.793125754287
0.789208830256


In [9]:
X_all_train.shape

(49000, 20)

In [130]:
df.columns

Index(['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
       'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

In [150]:
len(df[df['ap_hi'] < 20])

183

array([ 51.,  45.,  42., ...,  58.,  52.,  44.])