## Imports

In [1]:
import datetime
import os

In [2]:
import lightgbm as lgb
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

## Config

In [4]:
# Single seed reused by the LightGBM seed parameters, the CV fold shuffling
# and the train/validation split further down.
RANDOM_SEED = 42

In [5]:
# Root data directory: the `data` folder located in the parent of the current
# working directory. Every path defined here ends with the OS path separator,
# so file names can be appended by plain string concatenation.
data_folder = os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')) + os.path.sep

# One sub-folder per pipeline stage, all living directly under the data root.
(
    aux_data_folder,
    preproc_data_folder,
    features_data_folder,
    submissions_data_folder,
) = (
    os.path.join(data_folder, subfolder_name) + os.path.sep
    for subfolder_name in ('aux', 'preproc', 'features', 'submissions')
)

## Read Data

In [6]:
def read_dataset(filename):
    """
    Read a feature CSV and return it without the non-feature columns.

    The `id` and `is_duplicate` columns are removed when they are present;
    a file that lacks either of them is accepted as-is.

    Args:
        filename: Path (or any other source accepted by `pd.read_csv`) of the CSV file.

    Returns:
        A DataFrame holding every remaining column of the file.
    """
    non_feature_columns = ['id', 'is_duplicate']
    # errors='ignore' skips the labels that are absent instead of raising.
    return pd.read_csv(filename).drop(non_feature_columns, axis=1, errors='ignore')

In [7]:
# Training feature matrix; read_dataset strips the `id` / `is_duplicate` columns if present.
X = read_dataset(features_data_folder + 'X_train_all_features.csv')

In [8]:
# Training labels, stored separately from the feature matrix.
# NOTE(review): `load` is not defined or imported in any visible cell — presumably a
# pickle-loading helper; confirm where it comes from so this cell survives Restart & Run All.
y = load(features_data_folder + 'y_train.pickle')

In [9]:
# Quick check of the loaded matrix: list every feature column with its dtype.
X.dtypes

shorter_char_len_log              float64
longer_char_len_log               float64
shorter_token_len_log             float64
longer_token_len_log              float64
char_len_diff_log                 float64
token_len_diff_log                float64
char_len_ratio                    float64
token_len_ratio                   float64
word_diff_ratio                   float64
fuzzy_ratio                       float64
fuzzy_partial_ratio               float64
fuzzy_token_sort_ratio            float64
fuzzy_token_set_ratio             float64
fuzzy_partial_token_sort_ratio    float64
tfidf_cosine                      float64
tfidf_euclidean                   float64
emb_mean_cosine                   float64
emb_mean_cityblock_log            float64
emb_mean_euclidean                float64
emb_norm_sum_cosine               float64
emb_norm_sum_cityblock_log        float64
emb_norm_sum_euclidean            float64
wmd                               float64
wordnet_similarity_raw            

## Train Model

In [10]:
# LightGBM configuration shared by the cross-validation run and the hold-out sanity check.
lgb_params = dict(
    # Task and execution.
    objective='binary',
    boosting='gbdt',
    device='cpu',
    # Row / column subsampling is currently switched off; earlier values kept for reference.
    # bagging_fraction=0.5,
    # bagging_freq=20,
    # feature_fraction=0.8,
    # Tree size and learning schedule.
    num_leaves=128,
    learning_rate=0.03,
    # The next two entries are also read back from this dict and passed explicitly
    # to lgb.cv / lgb.train.
    num_boost_round=1000,
    early_stopping_rounds=5,
    verbose=1,
    # Reproducibility.
    bagging_fraction_seed=RANDOM_SEED,
    feature_fraction_seed=RANDOM_SEED,
)

In [11]:
# Wrap the full training features and labels for the cross-validation run below.
lgb_dataset = lgb.Dataset(X, y)

In [12]:
%%time

# 5-fold stratified, shuffled cross-validation scored by binary log loss.
# The round limit and the early-stopping patience are taken from lgb_params.
cv_results = lgb.cv(
    lgb_params,
    lgb_dataset,
    num_boost_round=lgb_params['num_boost_round'],
    early_stopping_rounds=lgb_params['early_stopping_rounds'],
    nfold=5,
    stratified=True,
    shuffle=True,
    metrics=['binary_logloss'],
    seed=RANDOM_SEED,
)

CPU times: user 12min 38s, sys: 3.13 s, total: 12min 41s
Wall time: 1min 45s


In [13]:
# Tabulate the cross-validation results (one row per boosting round).
pd.DataFrame(cv_results)

Unnamed: 0,binary_logloss-mean,binary_logloss-stdv
0,0.676657,0.000616
1,0.661117,0.001177
2,0.646448,0.001702
3,0.632602,0.002188
4,0.619504,0.002634
5,0.607107,0.003035
6,0.595348,0.003421
7,0.584207,0.003785
8,0.573622,0.004108
9,0.563563,0.004413


## Sanity Check (Train/Validation)

In [14]:
# Hold out 20% of the rows for validation; stratify on y so both parts keep the
# same class balance, and seed the split so it is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=y,
)

In [15]:
# LightGBM datasets for the hold-out sanity check: one to fit on, one to monitor.
lgb_data_train = lgb.Dataset(data=X_train, label=y_train)
lgb_data_val = lgb.Dataset(data=X_val, label=y_val)

In [16]:
# Fit on the training split while monitoring the hold-out set; training stops once the
# validation score has not improved for lgb_params['early_stopping_rounds'] rounds.
model = lgb.train(
    lgb_params,
    lgb_data_train,
    valid_sets=[lgb_data_val],
    num_boost_round=lgb_params['num_boost_round'],
    early_stopping_rounds=lgb_params['early_stopping_rounds'],
)

[1]	valid_0's binary_logloss: 0.676442
Train until valid scores didn't improve in 5 rounds.
[2]	valid_0's binary_logloss: 0.660716
[3]	valid_0's binary_logloss: 0.645893
[4]	valid_0's binary_logloss: 0.631887
[5]	valid_0's binary_logloss: 0.618627
[6]	valid_0's binary_logloss: 0.606065
[7]	valid_0's binary_logloss: 0.594184
[8]	valid_0's binary_logloss: 0.5829
[9]	valid_0's binary_logloss: 0.5722
[10]	valid_0's binary_logloss: 0.562034
[11]	valid_0's binary_logloss: 0.552351
[12]	valid_0's binary_logloss: 0.543158
[13]	valid_0's binary_logloss: 0.534389
[14]	valid_0's binary_logloss: 0.52604
[15]	valid_0's binary_logloss: 0.51807
[16]	valid_0's binary_logloss: 0.510487
[17]	valid_0's binary_logloss: 0.503261
[18]	valid_0's binary_logloss: 0.496344
[19]	valid_0's binary_logloss: 0.489728
[20]	valid_0's binary_logloss: 0.483398
[21]	valid_0's binary_logloss: 0.477352
[22]	valid_0's binary_logloss: 0.471577
[23]	valid_0's binary_logloss: 0.466049
[24]	valid_0's binary_logloss: 0.460742
[2

[213]	valid_0's binary_logloss: 0.323852
[214]	valid_0's binary_logloss: 0.323794
[215]	valid_0's binary_logloss: 0.323767
[216]	valid_0's binary_logloss: 0.32374
[217]	valid_0's binary_logloss: 0.32371
[218]	valid_0's binary_logloss: 0.323683
[219]	valid_0's binary_logloss: 0.323666
[220]	valid_0's binary_logloss: 0.32365
[221]	valid_0's binary_logloss: 0.323627
[222]	valid_0's binary_logloss: 0.323615
[223]	valid_0's binary_logloss: 0.323606
[224]	valid_0's binary_logloss: 0.32358
[225]	valid_0's binary_logloss: 0.323555
[226]	valid_0's binary_logloss: 0.323525
[227]	valid_0's binary_logloss: 0.323512
[228]	valid_0's binary_logloss: 0.323506
[229]	valid_0's binary_logloss: 0.323495
[230]	valid_0's binary_logloss: 0.323454
[231]	valid_0's binary_logloss: 0.323451
[232]	valid_0's binary_logloss: 0.323416
[233]	valid_0's binary_logloss: 0.323402
[234]	valid_0's binary_logloss: 0.323351
[235]	valid_0's binary_logloss: 0.323349
[236]	valid_0's binary_logloss: 0.323341
[237]	valid_0's bina

[419]	valid_0's binary_logloss: 0.321556
[420]	valid_0's binary_logloss: 0.321542
[421]	valid_0's binary_logloss: 0.32152
[422]	valid_0's binary_logloss: 0.32152
[423]	valid_0's binary_logloss: 0.321506
[424]	valid_0's binary_logloss: 0.321497
[425]	valid_0's binary_logloss: 0.321484
[426]	valid_0's binary_logloss: 0.321488
[427]	valid_0's binary_logloss: 0.321488
[428]	valid_0's binary_logloss: 0.321476
[429]	valid_0's binary_logloss: 0.321477
[430]	valid_0's binary_logloss: 0.321477
[431]	valid_0's binary_logloss: 0.32146
[432]	valid_0's binary_logloss: 0.321452
[433]	valid_0's binary_logloss: 0.321443
[434]	valid_0's binary_logloss: 0.321443
[435]	valid_0's binary_logloss: 0.321441
[436]	valid_0's binary_logloss: 0.321435
[437]	valid_0's binary_logloss: 0.321431
[438]	valid_0's binary_logloss: 0.321427
[439]	valid_0's binary_logloss: 0.321415
[440]	valid_0's binary_logloss: 0.32141
[441]	valid_0's binary_logloss: 0.321403
[442]	valid_0's binary_logloss: 0.321379
[443]	valid_0's bina

### Evaluate Model

In [17]:
def predict_classes(model, data, threshold=0.5):
    """
    Convert the scores predicted by `model` into hard 0/1 class labels.

    Args:
        model: Any object exposing `predict(data)` that returns a NumPy array of scores.
        data: Input handed to `model.predict` unchanged.
        threshold: Scores greater than or equal to this value are labelled 1, the rest 0.

    Returns:
        A new array with the same shape and dtype as the predictions, containing
        only 0 and 1. NaN scores compare as "not >= threshold" and are labelled 0.
    """
    y_pred = model.predict(data)
    # Compare once against the untouched scores and build a fresh array.
    # Thresholding in two in-place passes would re-test the zeros it has just written
    # (turning them into ones whenever threshold <= 0) and would also overwrite
    # the array returned by the model.
    return (y_pred >= threshold).astype(y_pred.dtype)

In [18]:
# Training split: hard labels at the default 0.5 threshold, plus the raw predicted
# scores used by the score-based metrics.
y_pred_train = predict_classes(model, X_train)
y_pred_proba_train = model.predict(X_train)

In [19]:
# Validation split: hard labels at the default 0.5 threshold, plus the raw predicted
# scores used by the score-based metrics.
y_pred_val = predict_classes(model, X_val)
y_pred_proba_val = model.predict(X_val)

In [20]:
# Metrics evaluated on the raw predicted scores (names come from the sklearn.metrics star import).
continuous_metrics = [log_loss, roc_auc_score]

In [21]:
# Metrics evaluated on the thresholded 0/1 labels (names come from the sklearn.metrics star import).
binary_metrics = [accuracy_score, precision_score, recall_score]

### Training

In [22]:
# Training-split report: score-based metrics first, then the label-based ones.
for metric_group, train_predictions in (
    (continuous_metrics, y_pred_proba_train),
    (binary_metrics, y_pred_train),
):
    for metric in metric_group:
        score = metric(y_train, train_predictions)
        print('{:20s}: {:10.5f}'.format(metric.__name__, score))

log_loss            :    0.28266
roc_auc_score       :    0.94744
accuracy_score      :    0.86968
precision_score     :    0.81639
recall_score        :    0.83476


### Validation

In [23]:
# Validation-split report: score-based metrics first, then the label-based ones.
for metric_group, val_predictions in (
    (continuous_metrics, y_pred_proba_val),
    (binary_metrics, y_pred_val),
):
    for metric in metric_group:
        score = metric(y_val, val_predictions)
        print('{:20s}: {:10.5f}'.format(metric.__name__, score))

log_loss            :    0.32110
roc_auc_score       :    0.93017
accuracy_score      :    0.85090
precision_score     :    0.78773
recall_score        :    0.81607


In [24]:
# Intentional guard: raising here interrupts a "Run All", so the test-set cells
# below only execute when they are run on purpose.
raise ValueError('Stopping before the test set')

ValueError: Stopping before the test set

## Test

In [None]:
# Test feature matrix, loaded through the same reader as the training features.
X_test = read_dataset(features_data_folder + 'X_test_all_features.csv')

In [None]:
# Despite the name, y_test holds the model's predicted scores for the test rows, not labels.
# NOTE(review): `model` is the sanity-check model fitted on the 80% training split only —
# confirm that is intended rather than refitting on all of X before submitting.
y_test = model.predict(X_test)

In [None]:
# Minute-resolution timestamp used as the prefix of the submission file name.
submission_id = datetime.datetime.now().strftime('%Y-%m-%d-%H%M')

In [None]:
# Pair every predicted score with a generated row id.
# NOTE(review): test_id is produced as 0..n-1, which assumes the rows of the test
# feature file are in id order — confirm against the original test set.
df_submission = pd.DataFrame({
    'test_id': range(len(y_test)),
    'is_duplicate': y_test
})

In [None]:
# Pin the column order explicitly: test_id first, then is_duplicate.
df_submission = df_submission[['test_id', 'is_duplicate']]

In [None]:
# Preview the first rows before writing the file.
df_submission.head(10)

In [None]:
# Write the draft submission with a header row and scores formatted to 8 decimal places.
# NOTE(review): index=None appears to behave like index=False (no index column written),
# but the parameter is documented as a bool — consider passing index=False explicitly.
df_submission.to_csv(
    submissions_data_folder + submission_id + '-submission-draft.csv',
    header=True,
    float_format='%.8f',
    index=None,
)