## Imports

In [None]:
import datetime

In [None]:
import lightgbm as lgb

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

## Config

In [None]:
# Single seed reused for the cross-validation folds, the train/validation
# split and LightGBM's sampling seeds, so that reruns are repeatable.
RANDOM_SEED = 42

## Read Data

In [None]:
# Names of the feature lists that are loaded as model inputs.
# NOTE(review): the `oofp_` prefix presumably marks out-of-fold predictions of
# first-level models (i.e. this notebook is a stacking stage) -- confirm.
feature_lists = [
    'oofp_manual_lightgbm',
    'oofp_nn_concat_dense_1',
    'oofp_currie32_cnn',
    'oofp_lystdo_lstm',
]

In [None]:
# `load_feature_lists` is not defined in this notebook (presumably a project
# helper). It returns three values: the first two are used below as the train
# and test feature tables, the third is discarded.
X, X_test, _ = load_feature_lists(feature_lists)

In [None]:
# Training labels. `load` and `features_data_folder` come from outside this
# notebook. NOTE(review): presumably row-aligned with X -- confirm.
y = load(features_data_folder + 'y_train.pickle')

In [None]:
# Displayed as a quick check of the dtypes of the loaded feature columns.
X.dtypes

## Train Model

In [None]:
# LightGBM settings shared by the cross-validation run and the hold-out
# sanity-check model. `num_boost_round` and `early_stopping_rounds` are stored
# here and are also read back out of this dict to be passed as explicit
# keyword arguments to `lgb.cv` / `lgb.train`.
lgb_params = {
    # Binary classification, evaluated with log loss.
    'objective': 'binary',
    'metric': 'binary_logloss',
    'boosting': 'gbdt',
    'device': 'cpu',
    # The row/feature subsampling options below are commented out, i.e. disabled.
#     'bagging_fraction': 0.5,
#     'bagging_freq': 20,
#     'feature_fraction': 0.8,
    'num_leaves': 64,
    'lambda_l2': 3.0,
    'learning_rate': 0.1,
    # Upper bound on boosting rounds; early stopping normally ends sooner.
    'num_boost_round': 1000,
    # Stop once the validation metric has not improved for this many rounds.
    'early_stopping_rounds': 5,
    'verbose': 1,
    # Seeds for LightGBM's row/feature sampling.
    # NOTE(review): with the subsampling options above commented out these
    # presumably have no effect -- confirm against the LightGBM parameter docs.
    'bagging_fraction_seed': RANDOM_SEED,
    'feature_fraction_seed': RANDOM_SEED,
}

In [None]:
# Full training set (all rows of X with their labels) for cross-validation.
lgb_dataset = lgb.Dataset(X, label=y)

In [None]:
%%time

# 5-fold, stratified, shuffled cross-validation on the full training set,
# scored with binary log loss and cut short by early stopping.
cv_results = lgb.cv(
    params=lgb_params,
    train_set=lgb_dataset,
    # Fold layout.
    nfold=5,
    shuffle=True,
    stratified=True,
    seed=RANDOM_SEED,
    # Training length; both values are read back from the shared params dict.
    num_boost_round=lgb_params['num_boost_round'],
    early_stopping_rounds=lgb_params['early_stopping_rounds'],
    metrics=['binary_logloss'],
)

In [None]:
# Show the results returned by `lgb.cv` as a table.
pd.DataFrame(cv_results)

## Sanity Check (Train/Validation)

In [None]:
# Stratified 80/20 hold-out split, seeded so the same rows land in each part
# on every run.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=RANDOM_SEED,
)

In [None]:
# LightGBM datasets for the two parts of the hold-out split.
lgb_data_train = lgb.Dataset(X_train, label=y_train)
lgb_data_val = lgb.Dataset(X_val, label=y_val)

In [None]:
# Fit on the training part of the split; the validation part is the only
# evaluation set and therefore the one that drives early stopping.
model = lgb.train(
    params=lgb_params,
    train_set=lgb_data_train,
    num_boost_round=lgb_params['num_boost_round'],
    early_stopping_rounds=lgb_params['early_stopping_rounds'],
    valid_sets=[lgb_data_val],
)

In [None]:
# One row per input column with the booster's importance score, listed from
# least to most important.
pd.DataFrame(
    dict(
        column=list(X_train.columns),
        importance=model.feature_importance(),
    )
).sort_values('importance')

### Evaluate Model

In [None]:
def predict_classes(model, data, threshold=0.5):
    """Turn a model's predicted scores into hard 0/1 class labels.

    Args:
        model: Object exposing ``predict(data)`` whose result supports
            element-wise comparison and boolean-mask assignment (a NumPy
            array, as returned by a LightGBM booster).
        data: Passed to ``model.predict`` unchanged.
        threshold: Scores greater than or equal to this value become 1,
            scores below it become 0.

    Returns:
        The array returned by ``model.predict``, overwritten in place with
        0/1 values. NaN scores compare False both ways and are left as NaN.
    """
    y_pred = model.predict(data)
    # Derive both masks from the raw scores *before* writing anything.
    # Thresholding in place one step at a time would re-test the freshly
    # written zeros against the threshold and flip them to 1 whenever
    # threshold <= 0.
    is_negative = y_pred < threshold
    is_positive = y_pred >= threshold
    y_pred[is_negative] = 0
    y_pred[is_positive] = 1
    return y_pred

In [None]:
# Training-split predictions: raw probabilities, then hard labels at the
# default 0.5 threshold.
y_pred_proba_train = model.predict(X_train)
y_pred_train = predict_classes(model, X_train)

In [None]:
# Validation-split predictions: raw probabilities, then hard labels at the
# default 0.5 threshold.
y_pred_proba_val = model.predict(X_val)
y_pred_val = predict_classes(model, X_val)

In [None]:
# Metrics that are fed the predicted probabilities.
continuous_metrics = [log_loss, roc_auc_score]

In [None]:
# Metrics that are fed the hard 0/1 class predictions.
binary_metrics = [accuracy_score, precision_score, recall_score]

### Training

In [None]:
# Training-split scores: each metric group is paired with the kind of
# prediction it expects (probabilities first, then hard labels).
for metric_group, group_preds in (
    (continuous_metrics, y_pred_proba_train),
    (binary_metrics, y_pred_train),
):
    for metric in metric_group:
        print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_train, group_preds)))

### Validation

In [None]:
# Validation-split scores: each metric group is paired with the kind of
# prediction it expects (probabilities first, then hard labels).
for metric_group, group_preds in (
    (continuous_metrics, y_pred_proba_val),
    (binary_metrics, y_pred_val),
):
    for metric in metric_group:
        print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_val, group_preds)))

In [None]:
# Deliberate guard: raising here halts execution at this cell, so the
# test-set prediction and submission cells below presumably only run when
# executed by hand.
raise ValueError('Stopping before the test set')

## Test

In [None]:
# Score the test set with the number of trees that early stopping selected.
# LightGBM releases whose `predict` defaults to "all trees" would otherwise
# include the rounds trained after the validation loss stopped improving.
# A best_iteration <= 0 (no early stop happened) still means "use all trees".
# NOTE(review): `model` is the sanity-check booster fit on the 80% training
# split, not a model refit on all of X.
y_test = model.predict(X_test, num_iteration=model.best_iteration)

In [None]:
# Minute-resolution local timestamp, used as the submission file name prefix.
submission_id = datetime.datetime.now().strftime('%Y-%m-%d-%H%M')

In [None]:
# Submission table. `test_id` is simply the row position 0..len(y_test)-1,
# and `is_duplicate` carries the predicted values.
df_submission = pd.DataFrame(
    dict(
        test_id=range(len(y_test)),
        is_duplicate=y_test,
    )
)

In [None]:
# Pin the column order explicitly: id first, prediction second.
df_submission = df_submission.loc[:, ['test_id', 'is_duplicate']]

In [None]:
# Preview the first ten rows before the file is written.
df_submission.head(10)

In [None]:
# Write the draft submission: header row, no index column, values formatted
# with 8 decimal places. The file name is prefixed with the submission id.
df_submission.to_csv(
    submissions_data_folder + submission_id + '-submission-draft.csv',
    index=None,
    header=True,
    float_format='%.8f',
)