In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
#from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from math import sqrt
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
import gc

In [2]:
train = pd.read_csv('./train-new.csv')
test = pd.read_csv('./test-new.csv')

In [3]:
def dummies(train, test):
    columns = ['ethnicity', 'gender','icu_stay_type','hospital_admit_source', 'icu_admit_source', 'icu_type', 'apache_3j_bodysystem', 'apache_2_bodysystem', 'cancer', 'liver_disease', 'other']
    for column in columns:
        train[column] = train[column].apply(lambda x: str(x))
        test[column] = test[column].apply(lambda x: str(x))
        good_cols = [column+'_'+i for i in train[column].unique() if i in test[column].unique()]
        train = pd.concat((train, pd.get_dummies(train[column], prefix = column)[good_cols]), axis = 1)
        test = pd.concat((test, pd.get_dummies(test[column], prefix = column)[good_cols]), axis = 1)
        del train[column]
        del test[column]
    return train, test

In [4]:
train, test = dummies(train, test)

In [5]:
y_train = train['hospital_death'].ravel()
train1 = train.drop(['hospital_death'], axis=1)
test1 = test.drop(['hospital_death'], axis=1)
x_train = train1.values # Creates an array of the train data
x_test = test1.values # Creats an array of the test data
ntrain = x_train.shape[0]
ntest = x_test.shape[0]

In [6]:
NFOLDS = 5
SEED = 0
NROWS = None
kf = KFold(n_splits = NFOLDS, shuffle=True, random_state=SEED)

In [7]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]

class CatboostWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]
        
class LightGBMWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['feature_fraction_seed'] = seed
        params['bagging_seed'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:,1]


class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))


In [8]:
def get_oof(clf):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)



In [9]:
et_params = {
    'n_jobs': 16,
    'n_estimators': 250,
    'max_features': 0.75,
    'max_depth': 6,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 250,
    'max_features': 0.75,
    'max_depth': 6,
    'min_samples_leaf': 2,
}

xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.3,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.2,
    'objective': 'binary:logistic',
    'max_depth': 6,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 200
}

catboost_params = {
    'iterations': 200,
    'learning_rate': 0.2,
    'depth': 6 ,
    'l2_leaf_reg': 40,
    'bootstrap_type': 'MVS',
    'subsample': 0.7,
    'scale_pos_weight': 5,
    'eval_metric': 'AUC',
    'od_type': 'Iter',
    'allow_writing_files': False
}

lightgbm_params = {
    'n_estimators':250,
    'learning_rate':0.2,
    'num_leaves':1400,
    'colsample_bytree':0.5,
    'subsample':0.9,
    'max_depth':6,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'min_split_gain':0.01,
    'min_child_weight':1    
}


In [10]:
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
cb = CatboostWrapper(clf= CatBoostClassifier, seed = SEED, params=catboost_params)
lg = LightGBMWrapper(clf = LGBMClassifier, seed = SEED, params = lightgbm_params)

In [11]:
xg_oof_train, xg_oof_test = get_oof(xg)
print("testing2")
et_oof_train, et_oof_test = get_oof(et)
print("testing3")
rf_oof_train, rf_oof_test = get_oof(rf)
print("testing4")
cb_oof_train, cb_oof_test = get_oof(cb)
print("Training completed")


testing2
testing3
testing4
0:	total: 174ms	remaining: 34.6s
1:	total: 202ms	remaining: 20s
2:	total: 230ms	remaining: 15.1s
3:	total: 258ms	remaining: 12.6s
4:	total: 285ms	remaining: 11.1s
5:	total: 312ms	remaining: 10.1s
6:	total: 340ms	remaining: 9.36s
7:	total: 367ms	remaining: 8.82s
8:	total: 411ms	remaining: 8.73s
9:	total: 442ms	remaining: 8.4s
10:	total: 473ms	remaining: 8.13s
11:	total: 504ms	remaining: 7.89s
12:	total: 534ms	remaining: 7.68s
13:	total: 581ms	remaining: 7.72s
14:	total: 613ms	remaining: 7.56s
15:	total: 647ms	remaining: 7.43s
16:	total: 679ms	remaining: 7.3s
17:	total: 713ms	remaining: 7.21s
18:	total: 747ms	remaining: 7.12s
19:	total: 791ms	remaining: 7.12s
20:	total: 826ms	remaining: 7.04s
21:	total: 859ms	remaining: 6.95s
22:	total: 892ms	remaining: 6.86s
23:	total: 926ms	remaining: 6.79s
24:	total: 962ms	remaining: 6.73s
25:	total: 1.01s	remaining: 6.74s
26:	total: 1.05s	remaining: 6.73s
27:	total: 1.09s	remaining: 6.7s
28:	total: 1.13s	remaining: 6.64s
29

41:	total: 1.48s	remaining: 5.55s
42:	total: 1.51s	remaining: 5.51s
43:	total: 1.54s	remaining: 5.47s
44:	total: 1.58s	remaining: 5.43s
45:	total: 1.61s	remaining: 5.39s
46:	total: 1.65s	remaining: 5.38s
47:	total: 1.69s	remaining: 5.35s
48:	total: 1.72s	remaining: 5.31s
49:	total: 1.76s	remaining: 5.27s
50:	total: 1.79s	remaining: 5.24s
51:	total: 1.84s	remaining: 5.22s
52:	total: 1.87s	remaining: 5.2s
53:	total: 1.91s	remaining: 5.16s
54:	total: 1.94s	remaining: 5.12s
55:	total: 1.98s	remaining: 5.08s
56:	total: 2.02s	remaining: 5.07s
57:	total: 2.06s	remaining: 5.04s
58:	total: 2.09s	remaining: 5s
59:	total: 2.13s	remaining: 4.96s
60:	total: 2.16s	remaining: 4.93s
61:	total: 2.2s	remaining: 4.91s
62:	total: 2.24s	remaining: 4.88s
63:	total: 2.28s	remaining: 4.84s
64:	total: 2.31s	remaining: 4.8s
65:	total: 2.35s	remaining: 4.77s
66:	total: 2.38s	remaining: 4.73s
67:	total: 2.42s	remaining: 4.71s
68:	total: 2.46s	remaining: 4.67s
69:	total: 2.49s	remaining: 4.63s
70:	total: 2.53s	rem

81:	total: 3s	remaining: 4.32s
82:	total: 3.05s	remaining: 4.3s
83:	total: 3.09s	remaining: 4.26s
84:	total: 3.13s	remaining: 4.23s
85:	total: 3.16s	remaining: 4.19s
86:	total: 3.2s	remaining: 4.15s
87:	total: 3.25s	remaining: 4.13s
88:	total: 3.29s	remaining: 4.11s
89:	total: 3.33s	remaining: 4.07s
90:	total: 3.37s	remaining: 4.03s
91:	total: 3.41s	remaining: 4s
92:	total: 3.45s	remaining: 3.96s
93:	total: 3.48s	remaining: 3.93s
94:	total: 3.52s	remaining: 3.89s
95:	total: 3.56s	remaining: 3.85s
96:	total: 3.59s	remaining: 3.82s
97:	total: 3.64s	remaining: 3.79s
98:	total: 3.68s	remaining: 3.75s
99:	total: 3.72s	remaining: 3.72s
100:	total: 3.75s	remaining: 3.68s
101:	total: 3.79s	remaining: 3.64s
102:	total: 3.83s	remaining: 3.61s
103:	total: 3.89s	remaining: 3.59s
104:	total: 3.93s	remaining: 3.56s
105:	total: 3.97s	remaining: 3.52s
106:	total: 4.01s	remaining: 3.49s
107:	total: 4.05s	remaining: 3.45s
108:	total: 4.09s	remaining: 3.41s
109:	total: 4.12s	remaining: 3.37s
110:	total: 

125:	total: 4.72s	remaining: 2.77s
126:	total: 4.76s	remaining: 2.74s
127:	total: 4.8s	remaining: 2.7s
128:	total: 4.83s	remaining: 2.66s
129:	total: 4.87s	remaining: 2.62s
130:	total: 4.91s	remaining: 2.58s
131:	total: 4.94s	remaining: 2.55s
132:	total: 4.98s	remaining: 2.51s
133:	total: 5.02s	remaining: 2.47s
134:	total: 5.05s	remaining: 2.43s
135:	total: 5.1s	remaining: 2.4s
136:	total: 5.13s	remaining: 2.36s
137:	total: 5.17s	remaining: 2.32s
138:	total: 5.2s	remaining: 2.28s
139:	total: 5.24s	remaining: 2.24s
140:	total: 5.27s	remaining: 2.21s
141:	total: 5.31s	remaining: 2.17s
142:	total: 5.35s	remaining: 2.13s
143:	total: 5.4s	remaining: 2.1s
144:	total: 5.44s	remaining: 2.06s
145:	total: 5.48s	remaining: 2.03s
146:	total: 5.51s	remaining: 1.99s
147:	total: 5.55s	remaining: 1.95s
148:	total: 5.59s	remaining: 1.91s
149:	total: 5.63s	remaining: 1.88s
150:	total: 5.67s	remaining: 1.84s
151:	total: 5.71s	remaining: 1.8s
152:	total: 5.74s	remaining: 1.76s
153:	total: 5.78s	remaining:

166:	total: 6.32s	remaining: 1.25s
167:	total: 6.36s	remaining: 1.21s
168:	total: 6.4s	remaining: 1.17s
169:	total: 6.44s	remaining: 1.14s
170:	total: 6.47s	remaining: 1.1s
171:	total: 6.51s	remaining: 1.06s
172:	total: 6.54s	remaining: 1.02s
173:	total: 6.58s	remaining: 983ms
174:	total: 6.61s	remaining: 944ms
175:	total: 6.65s	remaining: 907ms
176:	total: 6.69s	remaining: 869ms
177:	total: 6.72s	remaining: 831ms
178:	total: 6.76s	remaining: 793ms
179:	total: 6.8s	remaining: 755ms
180:	total: 6.84s	remaining: 718ms
181:	total: 6.88s	remaining: 680ms
182:	total: 6.92s	remaining: 643ms
183:	total: 6.95s	remaining: 605ms
184:	total: 6.99s	remaining: 567ms
185:	total: 7.03s	remaining: 529ms
186:	total: 7.07s	remaining: 491ms
187:	total: 7.1s	remaining: 453ms
188:	total: 7.14s	remaining: 416ms
189:	total: 7.18s	remaining: 378ms
190:	total: 7.21s	remaining: 340ms
191:	total: 7.25s	remaining: 302ms
192:	total: 7.29s	remaining: 265ms
193:	total: 7.33s	remaining: 227ms
194:	total: 7.36s	remain

In [12]:
print("XG-CV: {}".format(sqrt(mean_squared_error(y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, rf_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(y_train, cb_oof_train))))

XG-CV: 0.2344365239734466
ET-CV: 0.24219880076447614
RF-CV: 0.24050911085038848
RF-CV: 0.2740546455949492


In [13]:
x_train = np.concatenate((xg_oof_train, et_oof_train, rf_oof_train, cb_oof_train), axis=1)
x_test = np.concatenate((xg_oof_test, et_oof_test, rf_oof_test, cb_oof_test), axis=1)

print("{},{}".format(x_train.shape, x_test.shape))

logistic_regression = LogisticRegression()
logistic_regression.fit(x_train,y_train)

(91713, 4),(39308, 4)




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
test['hospital_death'] = 1 - logistic_regression.predict_proba(x_test)
tackingSubmission = test[["encounter_id","hospital_death"]].to_csv("submission_stacking_5.csv",index=False)