## Imports

In [1]:
import datetime

In [2]:
from sklearn.metrics import *

## Config

In [3]:
# Fixed seed value for reproducibility.
# NOTE(review): no cell visible in this notebook reads RANDOM_SEED — confirm whether it is still needed.
RANDOM_SEED = 42

## Read Data

In [4]:
# Names of the feature lists to load; the list is handed to load_feature_lists.
# NOTE(review): the "oofp_" prefix presumably means out-of-fold predictions of the
# individual base models — confirm against the notebooks that produce them.
feature_lists = [
    'oofp_manual_lightgbm',
    'oofp_nn_concat_dense_1',
    'oofp_currie32_cnn',
    'oofp_lystdo_lstm',
]

In [5]:
# Load the listed features into df_train / df_test; the third return value is discarded.
# NOTE(review): load_feature_lists is defined outside the visible cells — confirm what
# the discarded value contains.
df_train, df_test, _ = load_feature_lists(feature_lists)

In [6]:
# Target values used by the evaluation cells.
# NOTE(review): `load` and `features_data_folder` come from outside the visible cells; the
# file name suggests a pickle, so only read it from a trusted location.
y_train = load(features_data_folder + 'y_train.pickle')

In [7]:
# Sanity check: display the dtype of every loaded column.
df_train.dtypes

oofp_manual_lightgbm      float64
oofp_nn_concat_dense_1    float64
oofp_currie32_cnn         float64
oofp_lystdo_lstm          float64
dtype: object

## Train Model (Weighted Average Blend)

In [8]:
# Blend weight of each base-model column, keyed by column name so the weights
# cannot silently drift out of step with the columns if their order changes.
model_weights = {
    'oofp_manual_lightgbm': 1.0,
    'oofp_nn_concat_dense_1': 0.8,
    'oofp_currie32_cnn': 0.75,
    'oofp_lystdo_lstm': 1.0,
}

# Positional list in df_train's column order, which is what the weighted-sum
# cells expect. A column without an entry raises KeyError instead of being
# mis-weighted.
weights = [model_weights[column] for column in df_train.columns]

In [9]:
# Weighted average of the base-model columns for every training row:
# scale each column by its weight, add up across columns, normalise by the total weight.
weighted_sum_train = df_train.mul(weights, axis='columns').sum(axis=1)
y_pred_proba_train = weighted_sum_train.values / sum(weights)

In [10]:
# Weighted average of the base-model columns for every test row:
# scale each column by its weight, add up across columns, normalise by the total weight.
weighted_sum_test = df_test.mul(weights, axis='columns').sum(axis=1)
y_pred_proba_test = weighted_sum_test.values / sum(weights)

### Evaluate Model

In [11]:
def predict_classes(y_proba, threshold=0.5):
    """Convert probabilities into hard 0/1 class labels.

    Args:
        y_proba: Array-like of probabilities (NumPy array, pandas Series or a
            plain Python sequence such as a list).
        threshold: Values greater than or equal to this become 1; values
            below it become 0.

    Returns:
        A new NumPy array with the shape and dtype of ``np.array(y_proba)``.
        The input is never modified. Entries that compare neither below nor
        at/above the threshold (NaN) are left unchanged.
    """
    # Convert once up front so the comparisons also work for plain Python
    # sequences, which do not support element-wise `<` / `>=`.
    proba = np.asarray(y_proba)
    is_negative = proba < threshold
    is_positive = proba >= threshold

    classes = np.array(proba)  # independent copy; never alias the caller's data
    classes[is_negative] = 0
    classes[is_positive] = 1
    return classes

In [12]:
# Hard class labels for the training rows, with the default 0.5 cut-off spelled out.
y_pred_train = predict_classes(y_pred_proba_train, threshold=0.5)

In [13]:
# Metrics that are scored on the predicted probabilities.
continuous_metrics = [
    log_loss,
    roc_auc_score,
]

In [14]:
# Metrics that are scored on the thresholded 0/1 class labels.
binary_metrics = [
    accuracy_score,
    precision_score,
    recall_score,
]

### Training Set Metrics

In [15]:
# Report every metric against the kind of prediction it expects:
# probability-based metrics first, then the class-label metrics.
report_line = '{:20s}: {:10.5f}'
metric_groups = [
    (continuous_metrics, y_pred_proba_train),
    (binary_metrics, y_pred_train),
]
for metrics, predictions in metric_groups:
    for metric in metrics:
        score = metric(y_train, predictions)
        print(report_line.format(metric.__name__, score))

log_loss            :    0.29702
roc_auc_score       :    0.95270
accuracy_score      :    0.87161
precision_score     :    0.78793
recall_score        :    0.89245


## Test Set Predictions and Submission

In [16]:
# NOTE: despite its name, y_test holds the blended test-set *predictions*;
# no test labels are loaded anywhere in the visible cells.
y_test = y_pred_proba_test

In [17]:
# Minute-resolution timestamp that becomes the prefix of the submission file name.
created_at = datetime.datetime.now()
submission_id = created_at.strftime('%Y-%m-%d-%H%M')

In [18]:
# One row per test prediction: a sequential id plus the blended probability.
submission_columns = dict(
    test_id=range(len(y_test)),
    is_duplicate=y_test,
)
df_submission = pd.DataFrame(submission_columns)

In [19]:
# Pin the column order of the submission frame.
submission_column_order = ['test_id', 'is_duplicate']
df_submission = df_submission.loc[:, submission_column_order]

In [20]:
# Preview the first ten submission rows.
df_submission.head(n=10)

Unnamed: 0,test_id,is_duplicate
0,0,0.11014
1,1,0.440953
2,2,0.415311
3,3,0.034973
4,4,0.055243
5,5,0.001425
6,6,0.93769
7,7,0.467715
8,8,0.277317
9,9,0.010256


In [21]:
# Write the draft submission as CSV with a header row and 8-decimal floats.
# `index` is documented as a boolean, so pass an explicit False to omit the
# row index instead of relying on None being falsy.
submission_path = submissions_data_folder + submission_id + '-submission-draft.csv'
df_submission.to_csv(
    submission_path,
    header=True,
    float_format='%.8f',
    index=False,
)