In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import RFECV
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, train_test_split
from sklearn.decomposition import PCA

import lightgbm as lgbm
import xgboost as xgb

from imblearn.over_sampling import SMOTE

from IPython.display import display

In [3]:
# Labeled training rows (target column 'Y') and unlabeled test rows, both indexed by 'Id'.
df = pd.read_csv('data/train_final.csv', index_col='Id')
df_ult = pd.read_csv('data/test_final.csv', index_col='Id')

# Array forms: target vector, training feature matrix, and test feature matrix.
y = df['Y'].values
X = df.drop(columns='Y').values
X_ult = df_ult.values

# Ratio of negative (Y == 0) to positive (Y == 1) training rows.
n_negative = len(y[y == 0])
n_positive = len(y[y == 1])
scale_pos_weight = n_negative / n_positive

# XGBoost settings shared by the searches below.
# 'learning_rate' and 'scale_pos_weight' are not fixed here.
const_params = dict(
    max_depth=11,
    objective='binary:logistic',
    booster='gbtree',
    tree_method='exact',
    eval_metric='auc',
    n_jobs=-1,
)

In [4]:
def transform(df, y=True, pca=None):
    """Return a copy of ``df`` extended with the engineered feature columns.

    Added columns:
      * pairwise differences: 'f8-f19', 'f8-f13', 'f17-f4', 'f4-f7', 'f13-f19'
      * 'f8wf13wf19': a one-component PCA projection of ['f8', 'f13', 'f19']

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'f4', 'f7', 'f8', 'f13', 'f17' and 'f19'; also 'Y' when
        ``y`` is true. The input frame is not modified.
    y : bool, default True
        When true, the 'Y' column is dropped from the returned frame.
    pca : fitted projector or None, default None
        Object exposing ``transform`` (e.g. a PCA already fitted on the
        training frame). When given, it is applied as-is, so held-out rows
        land on the same basis as the training rows. When None, a fresh
        one-component PCA is fitted on ``df`` itself, which is only
        appropriate for the frame the model is trained on.

    Returns
    -------
    pandas.DataFrame
        The augmented copy, without 'Y' when ``y`` is true.
    """
    df_copy = df.copy()
    df_copy['f8-f19'] = df['f8'] - df['f19']
    df_copy['f8-f13'] = df['f8'] - df['f13']
    df_copy['f17-f4'] = df['f17'] - df['f4']
    df_copy['f4-f7'] = df['f4'] - df['f7']
    df_copy['f13-f19'] = df['f13'] - df['f19']

    pca_inputs = df[['f8', 'f13', 'f19']]
    if pca is None:
        projected = PCA(n_components=1).fit_transform(pca_inputs)
    else:
        # Reuse the caller's fitted projection instead of re-fitting on this frame.
        projected = pca.transform(pca_inputs)
    # The projection comes back with shape (n_rows, 1); flatten it to one column.
    df_copy['f8wf13wf19'] = np.ravel(projected)

    return df_copy.drop('Y', axis='columns') if y else df_copy

# Feature columns (raw and engineered) passed to the models, in this order.
selected_cols = [
    'f14', 'f13', 'f15', 'f4',
    'f8wf13wf19', 'f4-f7',
    'f16', 'f17', 'f19', 'f1',
    'f8-f19',
]

In [41]:
# Narrow XGBoost grid: only learning_rate (3 values) and n_estimators (2 values)
# vary; every other entry is a single pinned value. The grid's max_depth of 5 is
# the one evaluated, not the max_depth stored in const_params.
tuning_parameters = dict(
    max_depth=[5],
    learning_rate=[0.097474, 0.0974741, 0.0974742],
    n_estimators=[816, 817],
    reg_alpha=[0],
    reg_lambda=[1],
    colsample_bytree=[1],
    colsample_bylevel=[0.03],
    colsample_bynode=[0.86],
    random_state=[0],
)

# Exhaustive search scored by ROC AUC over 5 stratified folds.
grid = GridSearchCV(
    estimator=xgb.XGBClassifier(**const_params),
    param_grid=tuning_parameters,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)

train_matrix = transform(df)[selected_cols].values
grid.fit(train_matrix, y)

print(f'Best parameters {grid.best_params_}.')
print(f'Best auc score is {grid.best_score_}.')

Best parameters {'colsample_bylevel': 0.03, 'colsample_bynode': 0.86, 'colsample_bytree': 1, 'learning_rate': 0.097474, 'max_depth': 5, 'n_estimators': 816, 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1}.
Best auc score is 0.8977822444922717.


In [6]:
# Wider XGBoost grid over learning_rate and n_estimators. max_depth is not listed,
# so the value in const_params applies, and no column-subsampling parameters are
# set for this run.
tuning_parameters = dict(
    learning_rate=[0.01, 0.1, 0.05, 0.097474],
    n_estimators=[400, 500, 600, 700, 816, 817, 1000],
    reg_alpha=[0],
    reg_lambda=[1],
    random_state=[0],
)

# Exhaustive search scored by ROC AUC over 5 stratified folds.
grid = GridSearchCV(
    estimator=xgb.XGBClassifier(**const_params),
    param_grid=tuning_parameters,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)

train_matrix = transform(df)[selected_cols].values
grid.fit(train_matrix, y)

print(f'Best parameters {grid.best_params_}.')
print(f'Best auc score is {grid.best_score_}.')

Best parameters {'learning_rate': 0.097474, 'n_estimators': 600, 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1}.
Best auc score is 0.8784755948663868.


In [14]:
# LightGBM settings shared by every candidate in this search.
fix_param = dict(
    boosting_type='goss',
    metric='auc',
    objective='binary',
    scale_pos_weight=scale_pos_weight,
    n_jobs=-1,
)

# Candidate lists for parameters that are pinned in this run: only the first
# entry of each list is used, so 'seed' resolves to 0.
pinned_candidates = {
    'learning_rate': [0.1],
    'colsample_bytree': [0.01],
    'reg_alpha': [0],
    'reg_lambda': [0.89995],
    'seed': range(20),
}
more_fix_param = {name: options[0] for name, options in pinned_candidates.items()}

# The only parameter actually searched here.
now_param = {'n_estimators': [932, 933, 934]}

# Exhaustive search scored by ROC AUC over 5 stratified folds.
grid = GridSearchCV(
    estimator=lgbm.LGBMClassifier(**fix_param, **more_fix_param),
    param_grid=now_param,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
    n_jobs=-1,
)

train_matrix = transform(df)[selected_cols].values
grid.fit(train_matrix, y)

print(f'Best parameters {grid.best_params_}.')
print(f'Best auc score is {grid.best_score_}.')

Best parameters {'n_estimators': 934}.
Best auc score is 0.8978575448588804.


In [13]:
# Refit the best configuration from the most recent search on all training rows
# and write test-set probabilities to the submission file.
clf = grid.best_estimator_

train_features = transform(df)
test_features = transform(df_ult, y=False)

# transform() fits its PCA on whichever frame it is handed, so the test rows
# would otherwise be projected onto a different basis than the one the model was
# trained on. Fit one PCA on the training rows and use it for both frames.
pca_inputs = ['f8', 'f13', 'f19']
train_pca = PCA(n_components=1).fit(df[pca_inputs])
train_features['f8wf13wf19'] = train_pca.transform(df[pca_inputs]).ravel()
test_features['f8wf13wf19'] = train_pca.transform(df_ult[pca_inputs]).ravel()

clf.fit(train_features[selected_cols].values, y)

# Label each prediction with the Id of the test row it was computed from rather
# than a hard-coded range, so Ids and rows cannot drift apart.
# NOTE(review): assumes the Ids in test_final.csv are the ones the submission
# needs - confirm.
# rename() returns a new Index, leaving df_ult's own index object untouched.
result = pd.DataFrame(
    clf.predict_proba(test_features[selected_cols].values)[:, 1],
    index=df_ult.index.rename('Id'),
    columns=['Y'],
)

result.to_csv('submission_29_1.csv', float_format='%.20f')