In [3]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, RandomTreesEmbedding
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

import lightgbm as lgbm
import xgboost as xgb

from imblearn.over_sampling import SMOTE

from IPython.display import display

In [4]:
df = pd.read_csv('data/train_final.csv', index_col='Id')
df_ult = pd.read_csv('data/test_final.csv', index_col='Id')

X = df.drop('Y', axis='columns').values
y = df['Y'].values

X_ult = df_ult.values

scale_pos_weight = len(y[y == 0])/len(y[y == 1])

In [5]:
def transform(df, y=True):
    df_copy = df.copy()
    df_copy['f8-f19'] = df['f8'] - df['f19']
    df_copy['f8-f13'] = df['f8'] - df['f13']
    df_copy['f17-f4'] = df['f17'] - df['f4']
    df_copy['f4-f7'] = df['f4'] - df['f7']
    df_copy['f13-f19'] = df['f13'] - df['f19']
    df_copy['f8wf13wf19'] = PCA(n_components=1).fit_transform(df[['f8', 'f13', 'f19']])
    return df_copy.drop('Y', axis='columns') if y else df_copy

# selected_cols = ['f14', 'f13', 'f15', 'f4', 'f8wf13wf19', 'f4-f7', 'f16', 'f17', 'f19', 'f1', 'f8-f19']
selected_cols = ['f1', 'f4', 'f8', 'f13', 'f14', 'f15', 'f16', 'f17', 'f8-f19', 'f8-f13', 'f17-f4', 'f4-f7', 'f8wf13wf19']

In [6]:
train = transform(df)[selected_cols]
test = transform(df_ult, y=False)[selected_cols]

In [7]:
# uncomment to test
# train, test, y, y_test = train_test_split(train, y, random_state=1, test_size=0.2)

In [8]:
ntrain = train.shape[0]
ntest = test.shape[0]
SEED = 0
NFOLDS = 5
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

class SklearnHelper(object):
    def __init__(self, clf, seed=SEED, params=None):
        if seed is not None:
            params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict_proba(x)[:, 1]
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)

In [9]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf.split(x_train, y_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [7]:
rf_params = {
    'class_weight': 'balanced', 
    'criterion': 'gini', 
    'max_depth': None, 
    'n_estimators': 1000, 
    'n_jobs': -1
}

# Extra Trees Parameters
et_params = {
    'class_weight': 'balanced', 
    'criterion': 'entropy', 
    'max_depth': None, 
    'n_estimators': 1000, 
    'n_jobs': -1
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 1000,
    'learning_rate' : 1
}

# Gradient Boosting parameters
gb_params = {
    'loss': 'exponential', 
    'max_depth': 5, 
    'n_estimators': 500, 
}

lgb_params = {
    'boosting_type':                 'goss', 
    'metric':                        'auc', 
    'objective':                     'binary', 
    'scale_pos_weight':              scale_pos_weight, 
    'n_jobs':                        -1, 
    'n_estimators':                  1000, 
    'learning_rate':                 0.1, 
    'colsample_bytree':              0.01, 
    'reg_alpha':                     0, 
    'reg_lambda':                    0.89995, 
}

xgb_params = {
    'objective':                      'binary:logistic', 
    'booster':                        'gbtree',
    'tree_method':                    'exact', 
    'eval_metric':                    'auc',
    'scale_pos_weight':               scale_pos_weight,
    'n_jobs':                         -1,
    'max_depth':                      5, 
    'learning_rate':                  0.1, 
    'n_estimators':                   1000, 
    'colsample_bylevel':              0.03, 
    'colsample_bynode':               0.86
}

xgb2_params = {
    'objective':                      'binary:logistic', 
    'booster':                        'gbtree',
    'tree_method':                    'exact', 
    'eval_metric':                    'auc',
#     'scale_pos_weight':               scale_pos_weight,
    'n_jobs':                         -1,
    'n_estimators':                   260, 
    'learning_rate':                  0.0075, 
    'max_depth':                      4, 
    'reg_alpha':                      0, 
    'reg_lambda':                     0.9, 
    'random_state':                   0,
    'colsample_bylevel':              0.03, 
    'colsample_bynode':               0.8
}

In [11]:
rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)
# knc64 = SklearnHelper(clf=KNeighborsClassifier, seed=None, params={'n_neighbors': 64})
# knc128 = SklearnHelper(clf=KNeighborsClassifier, seed=None, params={'n_neighbors': 128})
# knc256 = SklearnHelper(clf=KNeighborsClassifier, seed=None, params={'n_neighbors': 256})
lgbc = SklearnHelper(clf=lgbm.LGBMClassifier, seed=SEED, params=lgb_params)
xgbc = SklearnHelper(clf=xgb.XGBClassifier, seed=SEED, params=xgb_params)

In [12]:
y_train = y
x_train = train.values
x_test = test.values

In [13]:
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test)
print('.')
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test)
print('.')
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test)
print('.')
gb_oof_train, gb_oof_test = get_oof(gb, x_train, y_train, x_test)
print('.')
# knc64_oof_train, knc64_oof_test = get_oof(knc64, x_train, y_train, x_test)
# print('.')
# knc128_oof_train, knc128_oof_test = get_oof(knc128, x_train, y_train, x_test)
# print('.')
# knc256_oof_train, knc256_oof_test = get_oof(knc256, x_train, y_train, x_test)
# print('.')
lgbc_oof_train, lgbc_oof_test = get_oof(lgbc, x_train, y_train, x_test)
print('.')
xgbc_oof_train, xgbc_oof_test = get_oof(xgbc, x_train, y_train, x_test)
print('done')

.
.
.
.
.
done


In [16]:
def get_ind(mask): return [index for index, mask_ele in enumerate(mask) if mask_ele==True]
def get_best_features(model, data, step=1):
    rfecv = RFECV(estimator=model, step=step, cv=StratifiedKFold(5), scoring='roc_auc', n_jobs=-1)
    rfecv.fit(data, df['Y'].values)

    return get_ind(rfecv.ranking_ == 1)

best = get_best_features(xgb.XGBClassifier(**xgb_params), transform(df).values)
best
# pd.Series(clf.feature_importances_, index=list(range(X.shape[1]))).plot.bar(color='steelblue', figsize=(16, 7));

[0, 3, 7, 12, 13, 14, 15, 16, 24, 25, 26, 27, 29]

In [19]:
alts = ['f1', 'f4', 'f8', 'f13', 'f14', 'f15', 'f16', 'f17', 'f8-f19', 'f8-f13', 'f17-f4', 'f4-f7', 'f8wf13wf19']

In [18]:
def ind_to_name(df, idx): return df.columns[idx].tolist()
ind_to_name(transform(df), best)

['f1',
 'f4',
 'f8',
 'f13',
 'f14',
 'f15',
 'f16',
 'f17',
 'f8-f19',
 'f8-f13',
 'f17-f4',
 'f4-f7',
 'f8wf13wf19']

In [15]:
base_predictions_train = pd.DataFrame( {
    'RandomForest': rf_oof_train.ravel(),
    'ExtraTrees': et_oof_train.ravel(),
    'AdaBoost': ada_oof_train.ravel(),
    'GradientBoost': gb_oof_train.ravel(),
#     '64NeighbourClassifier': knc64_oof_train.ravel(),
#     '128NeighbourClassifier': knc128_oof_train.ravel(),
#     '256NeighbourClassifier': knc256_oof_train.ravel(),
    'LGBMClassifier': lgbc_oof_train.ravel(),
    'XGBClassifier': xgbc_oof_train.ravel()
})
base_predictions_train.head(5)

Unnamed: 0,RandomForest,ExtraTrees,AdaBoost,GradientBoost,LGBMClassifier,XGBClassifier
0,0.937,0.982,0.501002,0.998659,0.990588,0.894865
1,0.968,0.972,0.500514,0.951903,0.699376,0.559421
2,0.939,0.936,0.500458,0.997719,0.993427,0.831691
3,0.747,0.743,0.500413,0.892027,0.776301,0.370943
4,0.928,0.986,0.501064,0.999917,0.999913,0.969721


In [14]:
x2_train = np.concatenate(( 
    et_oof_train, 
    rf_oof_train, 
    ada_oof_train, 
    gb_oof_train, 
#     knc64_oof_train, 
#     knc128_oof_train, 
#     knc256_oof_train, 
    lgbc_oof_train, 
    xgbc_oof_train
), axis=1)

x2_test = np.concatenate((
    et_oof_test, 
    rf_oof_test, 
    ada_oof_test, 
    gb_oof_test, 
#     knc64_oof_test, 
#     knc128_oof_test, 
#     knc256_oof_test, 
    lgbc_oof_test, 
    xgbc_oof_test
), axis=1)

x2_train = np.append(x2_train.T, x_train.T, axis=0).T
x2_test = np.append(x2_test.T, x_test.T, axis=0).T

lgbc2 = SklearnHelper(clf=lgbm.LGBMClassifier, seed=SEED, params=lgb_params)
xgbc2 = SklearnHelper(clf=xgb.XGBClassifier, seed=SEED, params=xgb_params)

lgbc2_oof_train, lgbc2_oof_test = get_oof(lgbc2, x2_train, y_train, x2_test)
xgbc2_oof_train, xgbc2_oof_test = get_oof(xgbc2, x2_train, y_train, x2_test)

roc_auc_score(y_test, xgbc2_oof_test)

0.8916954671230289

In [15]:
roc_auc_score(y_test, xgbc_oof_test)

0.8882746026277869

In [17]:
roc_auc_score(y_test, lgbc_oof_test)

0.8936050022040221

In [8]:
# train = train.values
# test = test.values
train, x_cal, y, y_cal = train_test_split(train, y, test_size=0.2)

first_layer_models = [
    RandomForestClassifier(**rf_params),
    ExtraTreesClassifier(**et_params),
    AdaBoostClassifier(**ada_params),
    GradientBoostingClassifier(**gb_params),
    lgbm.LGBMClassifier(**lgb_params),
    xgb.XGBClassifier(**xgb_params)
]

[model.fit(train, y) for model in first_layer_models]

first_layer_train_preds = [model.predict_proba(train)[:, 1] for model in first_layer_models]
first_layer_test_preds = [model.predict_proba(test)[:, 1] for model in first_layer_models]
first_layer_cal = [model.predict_proba(x_cal)[:, 1] for model in first_layer_models]

second_layer_train = np.append(np.array(first_layer_train_preds), train.T, axis=0).T
second_layer_test = np.append(np.array(first_layer_test_preds), test.T, axis=0).T
second_layer_cal = np.append(np.array(first_layer_cal), x_cal.T, axis=0).T

second_layer_model = xgb.XGBClassifier(**xgb2_params)
second_layer_model.fit(second_layer_train, y);

In [9]:
from sklearn.calibration import CalibratedClassifierCV

calibrator = CalibratedClassifierCV(second_layer_model, method='sigmoid', cv='prefit')
calibrator.fit(second_layer_cal, y_cal);

In [10]:
last_predictions = calibrator.predict_proba(second_layer_test)[:, 1]

In [11]:
req_rows = 16384

result = pd.DataFrame(last_predictions, 
                      index=list(range(req_rows, req_rows*2 + 1)), 
                      columns=['Y'])

result.index.name = 'Id'
result.to_csv(f'submission_29_3.csv', float_format='%.20f')

In [257]:
const_params = {
    'objective':                      'binary:logistic', 
    'booster':                        'gbtree',
    'tree_method':                    'exact', 
    'eval_metric':                    'auc',
    'scale_pos_weight':               scale_pos_weight,
    'n_jobs':                         -1
}

tuning_parameters = {    
    'n_estimators':                   [300], 
    'learning_rate':                  [0.02], 
    'max_depth':                      [2], 
    'random_state': [0]
}

grid = GridSearchCV(xgb.XGBClassifier(**const_params), 
                          tuning_parameters, 
                          cv=skfold, 
                          scoring='roc_auc', 
                          n_jobs=-1)

grid.fit(x2_train, y)

print(f'Best parameters {grid.best_params_}.')
print(f'Best auc score is {grid.best_score_}.')

Best parameters {'learning_rate': 0.02, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0}.
Best auc score is 0.890822749511769.


In [258]:
fix_param = {
    'boosting_type':                 'goss', 
    'metric':                        'auc', 
    'objective':                     'binary', 
    'scale_pos_weight':              scale_pos_weight, 
    'n_jobs':                        -1
}

now_param = {
    'n_estimators':                  [250], 
    'learning_rate':                 [0.02], 
    'max_depth':                     [2], 
#     'early_stopping_rounds'
#     'reg_alpha':                     [0], 
#     'reg_lambda':                    [0.89995, 1, 0], 
#     'seed': list(range(60, 62)),
}

grid = GridSearchCV(lgbm.LGBMClassifier(**fix_param), 
                          now_param, 
                          cv=StratifiedKFold(5), 
                          scoring='roc_auc', 
                          n_jobs=-1)

grid.fit(x2_train, y)

print(f'Best parameters {grid.best_params_}.')
print(f'Best auc score is {grid.best_score_}.')

Best parameters {'learning_rate': 0.02, 'max_depth': 2, 'n_estimators': 250}.
Best auc score is 0.8906617395164347.


In [306]:
const_params = {
    'objective':                      'binary:logistic', 
    'booster':                        'gbtree',
    'tree_method':                    'exact', 
    'eval_metric':                    'auc',
    'scale_pos_weight':               scale_pos_weight,
    'n_jobs':                         -1
}

tuning_parameters = {    
    'n_estimators':                   [255, 260], 
    'learning_rate':                  [0.0075], 
    'max_depth':                      [4], 
    'reg_alpha':                      [0], 
    'reg_lambda':                     [0.9], 
    'random_state':                   [0],
    'colsample_bylevel':              [0.03], 
    'colsample_bynode':               [0.8]
}

grid = GridSearchCV(xgb.XGBClassifier(**const_params), 
                          tuning_parameters, 
                          cv=kf, 
                          scoring='roc_auc', 
                          n_jobs=-1)

grid.fit(x2_train, y_train)

print(f'Best parameters {grid.best_params_}.')
print(f'Best auc score is {grid.best_score_}.')

Best parameters {'colsample_bylevel': 0.03, 'colsample_bynode': 0.8, 'learning_rate': 0.0075, 'max_depth': 4, 'n_estimators': 260, 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 0.9}.
Best auc score is 0.8895835881594708.
