## Imports

In [1]:
import datetime
import os

In [2]:
import sklearn_evaluation as skeval
import xgboost as xgb

In [3]:
import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import *
from xgboost import XGBClassifier

# NOTE(review): `load` is called in later cells but is not imported in any
# visible cell; confirm where it comes from and import it here.

## Config

In [4]:
# Single seed shared by the train/validation split, the CV folds and the model.
RANDOM_SEED = 42

In [5]:
# Root data directory: ../data relative to the current working directory,
# stored as an absolute path with a trailing separator.
data_folder = os.path.abspath(os.path.join(os.curdir, os.pardir, 'data')) + os.path.sep

# Every subfolder path also ends with a separator so that file names can be
# appended with plain string concatenation.
(
    aux_data_folder,
    preproc_data_folder,
    features_data_folder,
    submissions_data_folder,
) = (
    os.path.join(data_folder, subfolder_name) + os.path.sep
    for subfolder_name in ('aux', 'preproc', 'features', 'submissions')
)

## Read Data

In [6]:
# Identifiers of the precomputed feature groups to use. Each id is substituted
# into the feature file names, and the groups are stacked column-wise in this
# order, so reordering the list changes the column layout of the model input.
feature_lists = [
    'simple_summaries',
    'fuzzy',
    'tfidf_distances',
    'embedding_mean',
    'wmd',
]

In [7]:
# One training feature file per feature group, in the order of `feature_lists`.
train_feature_paths = [
    features_data_folder + f'X_train_{feature_list_id}.pickle'
    for feature_list_id in feature_lists
]

# NOTE(review): `load` is not imported in the visible cells; presumably it
# deserializes a pickle file — confirm its origin, and only use it on trusted files.
X_train_separate = [load(feature_path) for feature_path in train_feature_paths]

In [8]:
# Report the inclusive column range that each feature group occupies once the
# groups are placed side by side.
running_feature_count = 0

for feature_list_id, features in zip(feature_lists, X_train_separate):
    group_width = features.shape[-1]
    start_index = running_feature_count
    running_feature_count = start_index + group_width
    end_index = running_feature_count - 1  # inclusive upper bound

    print(f'{feature_list_id:30s}: {start_index:3d} - {end_index:3d}')

simple_summaries              :   0 -   7
fuzzy                         :   8 -  13
tfidf_distances               :  14 -  15
embedding_mean                :  16 -  19
wmd                           :  20 -  20


In [9]:
# Stack the per-group feature arrays horizontally into a single feature matrix.
X = np.hstack(X_train_separate)

In [10]:
# Target values for the training rows, read from the features folder.
y = load(features_data_folder + 'y_train.pickle')

In [11]:
# Hold out 15% of the rows as a validation set. The split is seeded for
# reproducibility and stratified on y.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.15,
    random_state=RANDOM_SEED,
    stratify=y
)

In [12]:
# Sanity check: show the shapes of both parts of the split.
for shape_label, split_array in (
    ('X train:', X_train),
    ('y train:', y_train),
    ('X val:  ', X_val),
    ('y val:  ', y_val),
):
    print(shape_label, split_array.shape)

X train: (343646, 21)
y train: (343646,)
X val:   (60644, 21)
y val:   (60644,)


## Train Model

In [13]:
# Ratio of label-0 rows to label-1 rows in the training split; used further
# down as the XGBoost `scale_pos_weight`.
negative_count = np.count_nonzero(y_train == 0)
positive_count = np.count_nonzero(y_train == 1)
positive_imbalance_ratio = negative_count / positive_count

In [14]:
# Show the computed class ratio before it is used in the model parameters.
print('Positive imbalance ratio:', positive_imbalance_ratio)

Positive imbalance ratio: 1.7085825983463778


In [15]:
# Keyword arguments passed to XGBClassifier for both the cross-validated and
# the final model.
# NOTE(review): `silent` is the older verbosity switch; newer XGBoost releases
# use `verbosity` instead — confirm against the installed version.
xgb_params = dict(
    objective='binary:logistic',
    n_estimators=800,
    seed=RANDOM_SEED,
    learning_rate=0.03,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.5,
    scale_pos_weight=positive_imbalance_ratio,  # class ratio computed above
    silent=1,
)

In [16]:
# Estimator instance handed to cross-validation below.
model = XGBClassifier(**xgb_params)

In [17]:
# 5-fold stratified splitter; shuffling is seeded so the folds are reproducible.
kfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=RANDOM_SEED,
)

In [18]:
def log_loss_scorer(estimator, X, y):
    """Score a fitted estimator by negated log loss (higher is better).

    Parameters:
        estimator: fitted model exposing `predict_proba`.
        X: feature matrix to score on.
        y: true labels for the rows of `X`.

    Returns:
        `-log_loss(y, p)`, where `p` is the last column of
        `estimator.predict_proba(X)`.
    """
    last_class_proba = estimator.predict_proba(X)[:, -1]
    loss = log_loss(y, last_class_proba)
    # Negate so that a larger score means a better model.
    return -loss

In [19]:
# Cross-validate on the training split only (the validation split stays
# untouched), scoring each fold with the negated log loss defined above.
cv_score = cross_val_score(
    model,
    X_train,
    y_train,
    scoring=log_loss_scorer,
    cv=kfold,
)

In [20]:
# Per-fold scores: negated log loss, so values closer to 0 are better.
cv_score

array([-0.44535946, -0.44553536, -0.44521233, -0.44153757, -0.44508066])

In [21]:
# Re-create the estimator with the same parameters for the final fit.
model = XGBClassifier(**xgb_params)

In [22]:
# Fit on the whole training split; the validation split is kept for evaluation.
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.5,
       gamma=0, learning_rate=0.03, max_delta_step=0, max_depth=10,
       min_child_weight=1, missing=None, n_estimators=800, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1.7085825983463778, seed=42, silent=1,
       subsample=0.8)

## Evaluate Model

In [23]:
# Training split: hard class predictions, and the last predict_proba column
# as the probability score.
y_pred_train = model.predict(X_train)
y_pred_proba_train = model.predict_proba(X_train)[:, -1]

In [24]:
# Validation split: hard class predictions, and the last predict_proba column
# as the probability score.
y_pred_val = model.predict(X_val)
y_pred_proba_val = model.predict_proba(X_val)[:, -1]

In [25]:
# Metrics that are evaluated on the predicted probabilities.
continuous_metrics = [log_loss, roc_auc_score]

In [26]:
# Metrics that are evaluated on the hard class predictions.
binary_metrics = [accuracy_score, precision_score, recall_score]

### Train

In [27]:
# Training-split report: probability-based metrics first, then label-based ones.
for metric_group, train_predictions in (
    (continuous_metrics, y_pred_proba_train),
    (binary_metrics, y_pred_train),
):
    for metric in metric_group:
        print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_train, train_predictions)))

log_loss            :    0.36994
roc_auc_score       :    0.92444
accuracy_score      :    0.81541
precision_score     :    0.68278
recall_score        :    0.93393


### Validation

In [28]:
# Validation-split report: probability-based metrics first, then label-based ones.
for metric_group, val_predictions in (
    (continuous_metrics, y_pred_proba_val),
    (binary_metrics, y_pred_val),
):
    for metric in metric_group:
        print('{:20s}: {:10.5f}'.format(metric.__name__, metric(y_val, val_predictions)))

log_loss            :    0.44077
roc_auc_score       :    0.86580
accuracy_score      :    0.76623
precision_score     :    0.63441
recall_score        :    0.86565


In [29]:
# Deliberate stop: this cell always raises, which is intended to keep a full
# notebook run from continuing into the test-set cells below. Run those
# cells manually when a submission is actually wanted.
raise ValueError('Stopping before the test set')

ValueError: Stopping before the test set

### Test

In [30]:
# Load the test features for the same feature groups, in the same order as for
# training, and stack them horizontally so the columns line up with the
# training matrix.
X_test = np.hstack([
    load(features_data_folder + f'X_test_{feature_list_id}.pickle')
    for feature_list_id in feature_lists
])

In [31]:
# NOTE: despite the name, y_test holds the model's predicted probabilities
# (last predict_proba column) for the test rows, not ground-truth labels.
y_test = model.predict_proba(X_test)[:, -1]

In [32]:
# Timestamp with minute resolution, used as the prefix of the submission file name.
submission_id = datetime.datetime.now().strftime('%Y-%m-%d-%H%M')

In [33]:
# Submission table: the row position becomes test_id and the predicted
# probability becomes is_duplicate.
# assumes the test rows are already ordered by their test_id — TODO confirm
df_submission = pd.DataFrame({
    'test_id': range(len(y_test)),
    'is_duplicate': y_test
})

In [34]:
# Fix the column order explicitly: test_id first, then is_duplicate.
df_submission = df_submission[['test_id', 'is_duplicate']]

In [35]:
# Preview the first 10 rows before writing the file.
df_submission.head(10)

Unnamed: 0,test_id,is_duplicate
0,0,0.229789
1,1,0.798466
2,2,0.737739
3,3,6.5e-05
4,4,0.099807
5,5,0.001966
6,6,0.718302
7,7,0.871229
8,8,0.619264
9,9,0.216951


In [36]:
# Write the submission as CSV with a header row and without the DataFrame index.
# `index` is documented as a boolean, so False is passed explicitly instead of
# relying on None being treated as falsy.
submission_path = submissions_data_folder + submission_id + '-submission-draft.csv'
df_submission.to_csv(submission_path, header=True, index=False)