In [21]:
import numpy as np
import pandas as pd 
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
import warnings
warnings.simplefilter('ignore')

In [22]:
# Directory prefixes (trailing slash included) that later cells join to file
# names by string concatenation: inputs under Data/, model artefacts under Output/.
DATA_PATH, OUTPUT_PATH = 'Data/', 'Output/'

In [23]:
# Base frames that the per-model columns are attached to in the next cells:
#   preds_df - the sample-submission file (test rows)
#   oofs_df  - the training labels file (train rows)
preds_df = pd.read_csv(f"{DATA_PATH}Sample Submission.csv")
oofs_df = pd.read_csv(f"{DATA_PATH}train_labels.csv")

In [24]:
# Attach each base model's out-of-fold predictions to oofs_df, one column per model.
# NOTE(review): column assignment aligns on the row index, so this assumes every
# OOF file lists rows in the same order as train_labels.csv - confirm.
oof_sources = (
    ('lgb', 'oof_lgbm_2.csv'),
    ('lgb_180', 'oof_lgbm_3.csv'),
    ('xgb', 'oofs_xgb.csv'),
    ('xgb_180', 'oofs_xgb_2.csv'),
)
for model_name, file_name in oof_sources:
    oofs_df[model_name] = pd.read_csv(OUTPUT_PATH + file_name)['outcome_flag']

In [25]:
# Attach each base model's test-set predictions to preds_df, using the same
# column names as the out-of-fold frame so both share one feature list.
# NOTE(review): assumes every workbook lists rows in the same order as the
# sample submission - confirm.
pred_sources = (
    ('lgb', 'preds_lgbm_2.xlsx'),
    ('lgb_180', 'preds_lgbm_3.xlsx'),
    ('xgb', 'preds_xgb.xlsx'),
    ('xgb_180', 'preds_xgb_2.xlsx'),
)
for model_name, file_name in pred_sources:
    preds_df[model_name] = pd.read_excel(OUTPUT_PATH + file_name)['outcome_flag']

In [26]:
# Sanity check: show the columns now present on oofs_df after the base-model
# out-of-fold predictions were attached.
oofs_df.columns

Index(['patient_id', 'outcome_flag', 'lgb', 'lgb_180', 'xgb', 'xgb_180'], dtype='object')

In [27]:
# Column roles: row identifier and ground-truth label.
ID_COL = 'patient_id'
TARGET_COL = 'outcome_flag'

# Every remaining column is a base-model prediction and becomes a stacker
# input; the order follows oofs_df.columns.
non_feature_cols = (ID_COL, TARGET_COL)
features = [col for col in oofs_df.columns if col not in non_feature_cols]

In [28]:
# Aliases only (no copies): train/test refer to the same objects as
# oofs_df/preds_df. target is the label column of the training frame.
train = oofs_df
test = preds_df
target = train[TARGET_COL]

In [29]:
# Level-2 stacker: fit an L1-penalised logistic regression on the base-model
# prediction columns (`features`) using stratified 10-fold cross-validation.
#   oof         - held-out stacker probability for every training row
#   predictions - test-set probability averaged over the per-fold models
N_SPLITS = 10

# shuffle is left at its default (False), so the split is deterministic and a
# random_state would have no effect; scikit-learn >= 0.24 raises ValueError
# when random_state is passed without shuffle=True, hence it is omitted.
folds = StratifiedKFold(n_splits=N_SPLITS)
oof = np.zeros(len(train))
predictions = np.zeros(len(test))
X_test = test[features]  # identical for every fold, so it is built once

for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, target.values)):
    print("fold n°{}".format(fold_))
    X_trn, y_trn = train.iloc[trn_idx][features], target.iloc[trn_idx]
    X_val, y_val = train.iloc[val_idx][features], target.iloc[val_idx]

    # The solver is pinned explicitly: the L1 penalty is not supported by
    # 'lbfgs' (the default solver since scikit-learn 0.22), while 'liblinear'
    # was the default before that and supports it.
    clf = LogisticRegression(penalty='l1', solver='liblinear', C=20,
                             class_weight={0: 1, 1: 0.01}, random_state=41)
    clf.fit(X_trn, y_trn)

    # Column 1 of predict_proba is the probability of the positive class.
    oof[val_idx] = clf.predict_proba(X_val)[:, 1]
    print(f"AUC: {roc_auc_score(y_val, oof[val_idx])}")

    # Every fold model is accumulated, so the average must divide by the
    # number of folds actually run.
    current_pred = clf.predict_proba(X_test)[:, 1]
    predictions += current_pred / folds.n_splits

print("CV score: {:<8.5f}".format(roc_auc_score(target, oof)))

fold n°0
AUC: 0.8959155010318733
fold n°1
AUC: 0.8968155239623941
fold n°2
AUC: 0.868803026828709
fold n°3
AUC: 0.8692702361843614
fold n°4
AUC: 0.8659653749140106
fold n°5
AUC: 0.8591292134831461
fold n°6
AUC: 0.8713669023477277
fold n°7
AUC: 0.8716547815167679
fold n°8
AUC: 0.8898252364549611
fold n°9
AUC: 0.8618048915360068
CV score: 0.87408 


In [30]:
# Fixed-weight blend of the base models' out-of-fold predictions:
# first within each model family, then across the two families.
lgb_family_oof = oofs_df['lgb'] * 0.8 + oofs_df['lgb_180'] * 0.2
xgb_family_oof = oofs_df['xgb'] * 0.7 + oofs_df['xgb_180'] * 0.3
blend_oof = lgb_family_oof * 0.2 + xgb_family_oof * 0.8

blend_auc = roc_auc_score(target, blend_oof)
print(f"Blend AUC: {blend_auc}")

Blend AUC: 0.8748597551026407


In [31]:
# Final out-of-fold score: 86% hand-weighted blend + 14% logistic-regression stacker.
weighted_blend_oof = blend_oof * 0.86
weighted_stack_oof = oof * 0.14
net_oof = weighted_blend_oof + weighted_stack_oof

net_auc = roc_auc_score(target, net_oof)
print(f"Net AUC: {net_auc}")

Net AUC: 0.8748590661501208


In [32]:
# Apply to the test predictions the same weights that were used for the
# out-of-fold blend: within each model family, across families, then the
# 86/14 mix with the stacker's averaged test predictions.
lgb_family_preds = preds_df['lgb'] * 0.8 + preds_df['lgb_180'] * 0.2
xgb_family_preds = preds_df['xgb'] * 0.7 + preds_df['xgb_180'] * 0.3
blend_preds = lgb_family_preds * 0.2 + xgb_family_preds * 0.8
net_preds = blend_preds * 0.86 + predictions * 0.14

In [33]:
# Persist the ensemble's out-of-fold predictions (id + blended score).
# The id column is referenced through ID_COL, matching the submission cell,
# instead of repeating the literal column name.
oof_df = pd.DataFrame({ID_COL: train[ID_COL], TARGET_COL: net_oof})
oof_df.to_csv(OUTPUT_PATH + 'oof_ensemble_6.csv', index=False)

In [34]:
# Assemble the submission frame (id + blended test prediction).
sub_df = pd.DataFrame({ID_COL: test[ID_COL], TARGET_COL: net_preds})

# Only the prediction column is written to the workbook; the id column is kept
# in sub_df for the preview below.
prediction_only = sub_df[[TARGET_COL]]
prediction_only.to_excel(OUTPUT_PATH + 'preds_ensemble_6.xlsx', index=False)
sub_df.head(10)

Unnamed: 0,patient_id,outcome_flag
0,patient_2,0.065141
1,patient_3,0.148098
2,patient_5,0.517646
3,patient_8,0.005452
4,patient_14,0.389862
5,patient_15,0.073515
6,patient_16,0.008973
7,patient_33,0.138347
8,patient_38,0.016393
9,patient_41,0.125466
