## Imports

In [None]:
import datetime

In [None]:
import lightgbm as lgb

In [None]:
from sklearn.model_selection import StratifiedKFold

## Config

In [None]:
# Number of cross-validation folds; also the number of per-fold prediction columns in y_test_pred.
NUM_FOLDS = 5

In [None]:
# Seed shared by NumPy, the fold splitter and the LightGBM seed parameters.
RANDOM_SEED = 42

In [None]:
# Seed NumPy's global random number generator.
np.random.seed(RANDOM_SEED)

## Read Data

In [None]:
# Names of the feature lists handed to load_feature_lists below.
# The three commented-out magic_* entries are currently excluded.
feature_lists = [
    'simple_summaries',
    'jaccard_ngrams',
    'fuzzy',
    'jellyfish',
    'tfidf_distances',
    'embedding_mean',
    'embedding_normalized_sum',
    'wmd',
    'wordnet_similarity',
    'dasolmar_whq',
    'magic_jturkewitz',
    'magic_stas_svd_150',
    'magic_stas_avito',
#     'magic_kcore',
#     'magic_tour1st',
#     'magic_qid',
    'oofp_manual_lightgbm',
    'oofp_bradleypallen_mlp',
    'oofp_currie32_cnn',
    'oofp_lystdo_bi_lstm',
]

In [None]:
# Load the train and test feature frames for the selected lists; the third returned value is ignored.
# NOTE(review): load_feature_lists is defined outside this view — confirm what the third value is.
df_train, df_test, _ = load_feature_lists(feature_lists)

In [None]:
# Array form of the feature frames, used for fold indexing and as model input.
X_train = df_train.values
X_test = df_test.values

In [None]:
# Training labels, read from 'y_train.pickle' under features_data_folder.
# NOTE(review): `load` is defined outside this view — presumably a pickle loader; confirm.
y_train = load(features_data_folder + 'y_train.pickle')

## Train models & compute test predictions from each fold

In [None]:
def mirror_dataset(X_orig):
    """Return a deep copy of ``X_orig`` with paired feature columns swapped.

    Every entry of ``swap_pairs`` names two columns (a ``q1``/``1`` variant
    and its ``q2``/``2`` counterpart). In the returned copy the contents of
    the two columns of each pair are exchanged; all other columns, and
    ``X_orig`` itself, are left untouched.
    """
    swap_pairs = (
        ('jaccard_ix_norm_q1_2gram', 'jaccard_ix_norm_q2_2gram'),
        ('jaccard_ix_norm_q1_3gram', 'jaccard_ix_norm_q2_3gram'),
        ('jaccard_ix_norm_q1_4gram', 'jaccard_ix_norm_q2_4gram'),
        ('jaccard_ix_norm_q1_5gram', 'jaccard_ix_norm_q2_5gram'),
        ('das_stops1_ratio', 'das_stops2_ratio'),
        ('das_len_q1', 'das_len_q2'),
        ('das_caps_count_q1', 'das_caps_count_q2'),
        ('das_len_char_q1', 'das_len_char_q2'),
        ('das_len_word_q1', 'das_len_word_q2'),
        ('das_avg_word_len1', 'das_avg_word_len2'),
        ('das_q1_how', 'das_q2_how'),
        ('das_q1_what', 'das_q2_what'),
        ('das_q1_which', 'das_q2_which'),
        ('das_q1_who', 'das_q2_who'),
        ('das_q1_where', 'das_q2_where'),
        ('das_q1_when', 'das_q2_when'),
        ('das_q1_why', 'das_q2_why'),
        ('whq_count_q1', 'whq_count_q2'),
        ('magic_jt_q1_freq', 'magic_jt_q2_freq'),
    )

    mirrored = X_orig.copy(deep=True)
    for first_column, second_column in swap_pairs:
        # The right-hand side is a freshly selected frame in reversed column
        # order; assigning it to the two target columns exchanges their values.
        mirrored[[first_column, second_column]] = mirrored[[second_column, first_column]]

    return mirrored

In [None]:
def predict(model, X_orig, X_mirror):
    """Average the model's predictions for the original and mirrored inputs.

    ``model.predict`` is called once on ``X_orig`` and once on ``X_mirror``;
    each result is flattened to one dimension and the element-wise mean of
    the two is returned.
    """
    flat_predictions = [
        model.predict(features).reshape(-1)
        for features in (X_orig, X_mirror)
    ]
    orig_scores, mirror_scores = flat_predictions
    return (orig_scores + mirror_scores) / 2

In [None]:
# Mirrored copies of the train/test features, converted to arrays like X_train/X_test.
X_train_mirror = mirror_dataset(df_train).values
X_test_mirror = mirror_dataset(df_test).values

In [None]:
# Stratified, shuffled K-fold splitter with a fixed seed; drives the training loop.
kfold = StratifiedKFold(
    n_splits=NUM_FOLDS,
    shuffle=True,
    random_state=RANDOM_SEED
)

In [None]:
# One column of test predictions per fold; filled in by the training loop and averaged afterwards.
y_test_pred = np.zeros((len(X_test), NUM_FOLDS))

In [None]:
# Last recorded validation loss of each fold, appended by the training loop.
cv_scores = []

In [None]:
%%time

# Train one LightGBM model per stratified fold. Every fold trains and validates
# on the original rows stacked with their mirrored counterparts, appends its
# last recorded validation loss to cv_scores, and writes its test predictions
# (averaged over the original and mirrored test features by predict()) into
# column fold_num of y_test_pred.
for fold_num, (ix_train, ix_val) in enumerate(kfold.split(X_train, y_train)):
    print(f'Fitting fold {fold_num + 1} of {kfold.n_splits}')
    
    # Original rows first, then the mirrored version of the same rows.
    X_fold_train = np.vstack([X_train[ix_train], X_train_mirror[ix_train]])
    X_fold_val = np.vstack([X_train[ix_val], X_train_mirror[ix_val]])

    # Labels are repeated so they line up with the stacked feature rows.
    y_fold_train = np.concatenate([y_train[ix_train], y_train[ix_train]])
    y_fold_val = np.concatenate([y_train[ix_val], y_train[ix_val]])
    
    # The parameter dict is rebuilt on every fold.
    # NOTE(review): some LightGBM versions may modify the dict passed to
    # lgb.train — confirm before hoisting this out of the loop.
    lgb_params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting': 'gbdt',
        'device': 'cpu',
        'num_leaves': 64,
        'feature_fraction': 1.0,
        'learning_rate': 0.03,
        # Alternative hyperparameter values, currently disabled:
#         'num_leaves': 279,
#         'feature_fraction': 0.614,
#         'lambda_l2': 9.45,
#         'learning_rate': 0.01,
        # The next two entries are also passed explicitly to lgb.train below.
        'num_boost_round': 1000,
        'early_stopping_rounds': 5,
        'verbose': 1,
        'bagging_fraction_seed': RANDOM_SEED,
        'feature_fraction_seed': RANDOM_SEED,
    }
    
    lgb_data_train = lgb.Dataset(X_fold_train, y_fold_train)
    lgb_data_val = lgb.Dataset(X_fold_val, y_fold_val)    
    # Passed to lgb.train and read back below for the per-round metric values.
    evals_result = {}
    
    model = lgb.train(
        lgb_params,
        lgb_data_train,
        valid_sets=[lgb_data_train, lgb_data_val],
        evals_result=evals_result,
        num_boost_round=lgb_params['num_boost_round'],
        early_stopping_rounds=lgb_params['early_stopping_rounds'],
        verbose_eval=False,
    )
    
    # 'training' and 'valid_1' are presumably the names LightGBM assigns to the
    # two valid_sets entries (train first, validation second) — confirm.
    fold_train_scores = evals_result['training'][lgb_params['metric']]
    fold_val_scores = evals_result['valid_1'][lgb_params['metric']]
    
    # The round count reported here is the number of recorded metric values.
    print('Fold {}: {} rounds, training loss {:.6f}, validation loss {:.6f}'.format(
        fold_num + 1,
        len(fold_train_scores),
        fold_train_scores[-1],
        fold_val_scores[-1],
    ))
    print()
    
    # NOTE(review): [-1] is the last recorded round, which with early stopping
    # may not be the best round — confirm this is the intended CV metric.
    cv_scores.append(fold_val_scores[-1])
    y_test_pred[:, fold_num] = predict(model, X_test, X_test_mirror)

In [None]:
# Feature importances paired with their column names, sorted by importance.
# `model` here is whatever the training loop assigned last, i.e. the final fold's model.
pd.DataFrame({
    'column': list(df_train.columns),
    'importance': model.feature_importance(),
}).sort_values(by='importance').values

In [None]:
# Overall CV score: mean of the per-fold validation losses collected in cv_scores.
final_cv_score = np.mean(cv_scores)

In [None]:
# Print the averaged cross-validation loss.
print('Final CV score:', final_cv_score)

## Generate submission

In [None]:
# Combine the folds: average the per-fold test predictions across columns, one value per test row.
y_test = np.mean(y_test_pred, axis=1)

In [None]:
# Current timestamp at minute resolution; used as the prefix of the submission file name.
submission_id = datetime.datetime.now().strftime('%Y-%m-%d-%H%M')

In [None]:
# Submission frame: test_id is the row position 0..len(y_test)-1.
# NOTE(review): assumes the expected test ids are consecutive from zero in row order — confirm.
df_submission = pd.DataFrame({
    'test_id': range(len(y_test)),
    'is_duplicate': y_test
})

### Recalibrate predictions for a different target balance on test

In [None]:
def recalibrate_prediction(pred, train_pos_ratio=0.3692, test_pos_ratio=0.1746):
    """Rescale a predicted probability from the train to the test positive rate.

    The positive side of ``pred`` is weighted by
    ``test_pos_ratio / train_pos_ratio`` and the negative side by
    ``(1 - test_pos_ratio) / (1 - train_pos_ratio)``. The return value is the
    weighted positive part divided by the sum of both weighted parts.
    """
    positive_weight = test_pos_ratio / train_pos_ratio
    negative_weight = (1 - test_pos_ratio) / (1 - train_pos_ratio)

    weighted_positive = positive_weight * pred
    weighted_negative = negative_weight * (1 - pred)
    return weighted_positive / (weighted_positive + weighted_negative)

In [None]:
# Recalibrate the whole column in one vectorized call. recalibrate_prediction
# uses only element-wise arithmetic, so passing the Series yields the same
# values as mapping it over each row, without one Python call per test row.
df_submission['is_duplicate'] = recalibrate_prediction(df_submission['is_duplicate'])

In [None]:
# Fix the column order of the submission frame: test_id first, then is_duplicate.
df_submission = df_submission[['test_id', 'is_duplicate']]

### Explore & Save

In [None]:
# Histogram of the averaged test predictions.
# NOTE(review): this plots y_test, i.e. the values before recalibration, not
# df_submission['is_duplicate'] — confirm that is intended.
pd.DataFrame(y_test).plot.hist()

In [None]:
# Per-column count of submission rows whose is_duplicate value exceeds 0.9.
df_submission[df_submission.is_duplicate > 0.9].count()

In [None]:
# Write the submission CSV under submissions_data_folder; the file name carries
# the timestamp and the CV score, and floats are written with 8 decimals.
df_submission.to_csv(
    submissions_data_folder + f'{submission_id}-submission-draft-cv-{final_cv_score:.6f}.csv',
    header=True,
    float_format='%.8f',
    # NOTE(review): index=None is falsy, presumably suppressing the index
    # column the same way index=False would — confirm.
    index=None,
)