## Imports

In [1]:
import datetime
import os

In [2]:
import lightgbm as lgb
import pandas as pd

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

## Config

In [4]:
# Single seed reused for the LightGBM seed parameters, the CV fold shuffling
# and the train/validation split below.
RANDOM_SEED = 42

In [5]:
# All data lives in ../data, resolved against the current working directory.
# Every folder path carries a trailing separator so that file names can be
# appended with plain string concatenation.
data_folder = os.path.join(
    os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')),
    '',
)
aux_data_folder, preproc_data_folder, features_data_folder, submissions_data_folder = (
    os.path.join(data_folder, subfolder, '')
    for subfolder in ('aux', 'preproc', 'features', 'submissions')
)

## Read Data

In [6]:
def read_dataset(filename):
    """Load a feature CSV and return it without the bookkeeping columns.

    Args:
        filename: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A DataFrame holding every CSV column except ``id`` and
        ``is_duplicate``; either column may be absent from the file.
    """
    frame = pd.read_csv(filename)
    # Only drop the columns that actually exist so both train and test
    # files can go through the same helper.
    unwanted = [name for name in ('id', 'is_duplicate') if name in frame.columns]
    return frame.drop(unwanted, axis=1)

In [7]:
# Training feature matrix; read_dataset drops the 'id' / 'is_duplicate' columns if present.
X = read_dataset(features_data_folder + 'X_train_all_features.csv')

In [8]:
# Training labels.
# NOTE(review): `load` is not defined or imported in any cell visible in this
# notebook — confirm which helper provides it (presumably a pickle loader),
# otherwise this cell raises NameError on a fresh kernel.
y = load(features_data_folder + 'y_train.pickle')

In [9]:
# Inspect the dtype of every feature column.
X.dtypes

shorter_char_len_log              float64
longer_char_len_log               float64
shorter_token_len_log             float64
longer_token_len_log              float64
char_len_diff_log                 float64
token_len_diff_log                float64
char_len_ratio                    float64
token_len_ratio                   float64
word_diff_ratio                   float64
jaccard_ix_2gram                  float64
jaccard_ix_norm_q1_2gram          float64
jaccard_ix_norm_q2_2gram          float64
jaccard_ix_3gram                  float64
jaccard_ix_norm_q1_3gram          float64
jaccard_ix_norm_q2_3gram          float64
jaccard_ix_4gram                  float64
jaccard_ix_norm_q1_4gram          float64
jaccard_ix_norm_q2_4gram          float64
jaccard_ix_5gram                  float64
jaccard_ix_norm_q1_5gram          float64
jaccard_ix_norm_q2_5gram          float64
fuzzy_ratio                       float64
fuzzy_partial_ratio               float64
fuzzy_token_sort_ratio            

## Train Model

In [10]:
# LightGBM settings shared by the cross-validation run and the
# train/validation sanity check.
lgb_params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting='gbdt',
    device='cpu',
    # Currently disabled:
    #     bagging_fraction=0.5,
    #     bagging_freq=20,
    #     feature_fraction=0.8,
    num_leaves=64,
    learning_rate=0.03,
    # The next two entries are also read back by key and passed as explicit
    # keyword arguments to lgb.cv / lgb.train.
    num_boost_round=1000,
    early_stopping_rounds=5,
    verbose=1,
    # Both seed entries reuse the notebook-wide seed.
    bagging_fraction_seed=RANDOM_SEED,
    feature_fraction_seed=RANDOM_SEED,
)

In [11]:
# Wrap the full training features and labels for LightGBM cross-validation.
lgb_dataset = lgb.Dataset(X, y)

In [12]:
%%time

# 5-fold stratified, shuffled cross-validation on the full training set,
# scored by binary log loss. The round budget and early-stopping patience are
# read back from lgb_params, the same entries the train/validation run uses.
cv_results = lgb.cv(
    lgb_params,
    lgb_dataset,
    num_boost_round=lgb_params['num_boost_round'],
    early_stopping_rounds=lgb_params['early_stopping_rounds'],
    nfold=5,
    stratified=True,
    shuffle=True,
    metrics=['binary_logloss'],
    seed=RANDOM_SEED,
)

CPU times: user 1h 10min 56s, sys: 30.8 s, total: 1h 11min 27s
Wall time: 12min 1s


In [13]:
# Tabulate the CV history: one row per boosting round with the mean and
# standard deviation of the log loss across folds.
pd.DataFrame(cv_results)

Unnamed: 0,binary_logloss-mean,binary_logloss-stdv
0,0.673801,0.000104
1,0.655603,0.000171
2,0.638409,0.000261
3,0.622174,0.000333
4,0.606794,0.000409
5,0.592253,0.000476
6,0.578436,0.000544
7,0.565316,0.000602
8,0.552829,0.000659
9,0.540950,0.000714


## Sanity Check (Train/Validation)

In [14]:
# Hold out 20% of the rows, stratified on the label, for a single
# train/validation sanity check with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=y,
)

In [15]:
# LightGBM datasets for the two splits; the validation one is only used for monitoring.
lgb_data_train = lgb.Dataset(X_train, y_train)
lgb_data_val = lgb.Dataset(X_val, y_val)

In [16]:
# Fit on the training split while logging log loss on the held-out split.
# Training halts once the validation score has not improved for
# lgb_params['early_stopping_rounds'] consecutive rounds.
model = lgb.train(
    lgb_params,
    lgb_data_train,
    valid_sets=[lgb_data_val],
    num_boost_round=lgb_params['num_boost_round'],
    early_stopping_rounds=lgb_params['early_stopping_rounds'],
)

[1]	valid_0's binary_logloss: 0.673733
Train until valid scores didn't improve in 5 rounds.
[2]	valid_0's binary_logloss: 0.655409
[3]	valid_0's binary_logloss: 0.638133
[4]	valid_0's binary_logloss: 0.621756
[5]	valid_0's binary_logloss: 0.606271
[6]	valid_0's binary_logloss: 0.59158
[7]	valid_0's binary_logloss: 0.57769
[8]	valid_0's binary_logloss: 0.564452
[9]	valid_0's binary_logloss: 0.551889
[10]	valid_0's binary_logloss: 0.539935
[11]	valid_0's binary_logloss: 0.528516
[12]	valid_0's binary_logloss: 0.517684
[13]	valid_0's binary_logloss: 0.507316
[14]	valid_0's binary_logloss: 0.497446
[15]	valid_0's binary_logloss: 0.488074
[16]	valid_0's binary_logloss: 0.479035
[17]	valid_0's binary_logloss: 0.470434
[18]	valid_0's binary_logloss: 0.462169
[19]	valid_0's binary_logloss: 0.454278
[20]	valid_0's binary_logloss: 0.446703
[21]	valid_0's binary_logloss: 0.439454
[22]	valid_0's binary_logloss: 0.432512
[23]	valid_0's binary_logloss: 0.425813
[24]	valid_0's binary_logloss: 0.41943

[204]	valid_0's binary_logloss: 0.235593
[205]	valid_0's binary_logloss: 0.235519
[206]	valid_0's binary_logloss: 0.235463
[207]	valid_0's binary_logloss: 0.235411
[208]	valid_0's binary_logloss: 0.235353
[209]	valid_0's binary_logloss: 0.23531
[210]	valid_0's binary_logloss: 0.235265
[211]	valid_0's binary_logloss: 0.235227
[212]	valid_0's binary_logloss: 0.23517
[213]	valid_0's binary_logloss: 0.235122
[214]	valid_0's binary_logloss: 0.235073
[215]	valid_0's binary_logloss: 0.23502
[216]	valid_0's binary_logloss: 0.234962
[217]	valid_0's binary_logloss: 0.23493
[218]	valid_0's binary_logloss: 0.23486
[219]	valid_0's binary_logloss: 0.234823
[220]	valid_0's binary_logloss: 0.234788
[221]	valid_0's binary_logloss: 0.234734
[222]	valid_0's binary_logloss: 0.234695
[223]	valid_0's binary_logloss: 0.234646
[224]	valid_0's binary_logloss: 0.234607
[225]	valid_0's binary_logloss: 0.234551
[226]	valid_0's binary_logloss: 0.234504
[227]	valid_0's binary_logloss: 0.234468
[228]	valid_0's binar

[407]	valid_0's binary_logloss: 0.23133
[408]	valid_0's binary_logloss: 0.231301
[409]	valid_0's binary_logloss: 0.231297
[410]	valid_0's binary_logloss: 0.231296
[411]	valid_0's binary_logloss: 0.231288
[412]	valid_0's binary_logloss: 0.231277
[413]	valid_0's binary_logloss: 0.231277
[414]	valid_0's binary_logloss: 0.23128
[415]	valid_0's binary_logloss: 0.231276
[416]	valid_0's binary_logloss: 0.231273
[417]	valid_0's binary_logloss: 0.231257
[418]	valid_0's binary_logloss: 0.231243
[419]	valid_0's binary_logloss: 0.231227
[420]	valid_0's binary_logloss: 0.231221
[421]	valid_0's binary_logloss: 0.231206
[422]	valid_0's binary_logloss: 0.231192
[423]	valid_0's binary_logloss: 0.231191
[424]	valid_0's binary_logloss: 0.231172
[425]	valid_0's binary_logloss: 0.231171
[426]	valid_0's binary_logloss: 0.231169
[427]	valid_0's binary_logloss: 0.231166
[428]	valid_0's binary_logloss: 0.231167
[429]	valid_0's binary_logloss: 0.231157
[430]	valid_0's binary_logloss: 0.231154
[431]	valid_0's bi

[611]	valid_0's binary_logloss: 0.230081
[612]	valid_0's binary_logloss: 0.230079
[613]	valid_0's binary_logloss: 0.230072
[614]	valid_0's binary_logloss: 0.230072
[615]	valid_0's binary_logloss: 0.230074
[616]	valid_0's binary_logloss: 0.230063
[617]	valid_0's binary_logloss: 0.230045
[618]	valid_0's binary_logloss: 0.230032
[619]	valid_0's binary_logloss: 0.230035
[620]	valid_0's binary_logloss: 0.230031
[621]	valid_0's binary_logloss: 0.23003
[622]	valid_0's binary_logloss: 0.230026
[623]	valid_0's binary_logloss: 0.23003
[624]	valid_0's binary_logloss: 0.230032
[625]	valid_0's binary_logloss: 0.230025
[626]	valid_0's binary_logloss: 0.230016
[627]	valid_0's binary_logloss: 0.230007
[628]	valid_0's binary_logloss: 0.230009
[629]	valid_0's binary_logloss: 0.230008
[630]	valid_0's binary_logloss: 0.230007
[631]	valid_0's binary_logloss: 0.230014
[632]	valid_0's binary_logloss: 0.230003
[633]	valid_0's binary_logloss: 0.230006
[634]	valid_0's binary_logloss: 0.230006
[635]	valid_0's bi

In [17]:
# Pair every training column with the importance reported by the model and
# list them in ascending order, so the least-used features come first.
pd.DataFrame({
    'column': list(X_train.columns),
    'importance': model.feature_importance(),
}).sort_values(by='importance')

Unnamed: 0,column,importance
49,das_diff_len,0
73,das_who_both,2
76,das_where_both,3
31,emb_norm_sum_cosine,12
64,das_how_both,16
77,das_q1_when,18
38,das_word_match_2root,18
75,das_q2_where,23
78,das_q2_when,28
80,das_q1_why,30


### Evaluate Model

In [18]:
def predict_classes(model, data, threshold=0.5):
    """Predict hard 0/1 class labels by thresholding the model's scores.

    Args:
        model: Object whose ``predict(data)`` returns an array of scores
            that supports boolean-mask assignment (e.g. a NumPy array).
        data: Feature matrix passed unchanged to ``model.predict``.
        threshold: Scores greater than or equal to this value become 1,
            scores below it become 0.

    Returns:
        The array returned by ``model.predict``, overwritten in place with
        0/1 labels (the dtype of the scores is kept).
    """
    y_pred = model.predict(data)
    # Build both masks from the raw scores *before* overwriting anything:
    # writing the zeros first and then re-testing `>= threshold` would flip
    # those freshly written zeros to 1 whenever threshold <= 0.
    is_negative = y_pred < threshold
    is_positive = y_pred >= threshold
    y_pred[is_negative] = 0
    y_pred[is_positive] = 1
    return y_pred

In [19]:
# Hard labels (default 0.5 threshold) and raw scores for the training split.
# NOTE(review): predict_classes calls model.predict itself, so the model is
# evaluated twice on X_train here.
y_pred_train = predict_classes(model, X_train)
y_pred_proba_train = model.predict(X_train)

In [20]:
# Hard labels (default 0.5 threshold) and raw scores for the validation split.
# NOTE(review): predict_classes calls model.predict itself, so the model is
# evaluated twice on X_val here.
y_pred_val = predict_classes(model, X_val)
y_pred_proba_val = model.predict(X_val)

In [21]:
# Metrics evaluated on the raw model scores (names come from the sklearn.metrics star import).
continuous_metrics = [log_loss, roc_auc_score]

In [22]:
# Metrics evaluated on the thresholded 0/1 labels (names come from the sklearn.metrics star import).
binary_metrics = [accuracy_score, precision_score, recall_score]

### Training

In [23]:
# Training-split report: score-based metrics first, then the metrics that
# need hard 0/1 labels.
report_line = '{:20s}: {:10.5f}'
for metric_group, predictions in (
    (continuous_metrics, y_pred_proba_train),
    (binary_metrics, y_pred_train),
):
    for metric in metric_group:
        print(report_line.format(metric.__name__, metric(y_train, predictions)))

log_loss            :    0.20147
roc_auc_score       :    0.97385
accuracy_score      :    0.90827
precision_score     :    0.87437
recall_score        :    0.87764


### Validation

In [24]:
# Validation-split report: score-based metrics first, then the metrics that
# need hard 0/1 labels.
report_line = '{:20s}: {:10.5f}'
for metric_group, predictions in (
    (continuous_metrics, y_pred_proba_val),
    (binary_metrics, y_pred_val),
):
    for metric in metric_group:
        print(report_line.format(metric.__name__, metric(y_val, predictions)))

log_loss            :    0.22993
roc_auc_score       :    0.96482
accuracy_score      :    0.89514
precision_score     :    0.86136
recall_score        :    0.85331


In [25]:
# Deliberate guard: raising here halts a "Run All" so the test-set prediction
# and submission cells below only run when executed explicitly.
raise ValueError('Stopping before the test set')

ValueError: Stopping before the test set

## Test

In [26]:
# Test-set feature matrix, read with the same helper as the training features.
X_test = read_dataset(features_data_folder + 'X_test_all_features.csv')

In [27]:
# Model scores for the test rows. Despite the name, y_test holds predictions
# from the model fitted on the training split above, not ground-truth labels.
y_test = model.predict(X_test)

In [28]:
# Minute-resolution timestamp (YYYY-MM-DD-HHMM) used to name the submission file.
submission_id = datetime.datetime.now().strftime('%Y-%m-%d-%H%M')

In [29]:
# Assemble the submission table from the predicted scores.
# NOTE(review): test_id is generated as 0..len(y_test)-1, i.e. it assumes the
# rows of X_test are in id order starting at 0 — confirm against the test file.
df_submission = pd.DataFrame({
    'test_id': range(len(y_test)),
    'is_duplicate': y_test
})

In [30]:
# Fix the column order explicitly: test_id first, then is_duplicate.
df_submission = df_submission[['test_id', 'is_duplicate']]

In [31]:
# Preview the first 10 rows of the submission.
df_submission.head(10)

Unnamed: 0,test_id,is_duplicate
0,0,0.004318
1,1,0.274911
2,2,0.325483
3,3,4.1e-05
4,4,0.047734
5,5,0.000254
6,6,0.991386
7,7,0.692499
8,8,0.215036
9,9,0.003128


In [32]:
# Create the submissions folder on first use; to_csv does not create missing
# parent directories and would otherwise fail on a fresh checkout.
if not os.path.isdir(submissions_data_folder):
    os.makedirs(submissions_data_folder)

# Write the draft submission, named after the timestamp in submission_id.
# index=False states the intent explicitly: the row index is not written
# (the previous index=None only worked because None is falsy).
df_submission.to_csv(
    submissions_data_folder + submission_id + '-submission-draft.csv',
    header=True,
    float_format='%.8f',
    index=False,
)