# IMPORTS

In [35]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os
from typing import Optional, Any, Union, Callable

In [36]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from metrics import compute_single_col_score, get_tresholds, compute_weird_pred_proba_score, compute_weird_pred_score
from sklearn.metrics import recall_score
from helper import make_prediction, check_path, get_prediction, evaluate, seed_everything, save_submission, save_pred_proba_oof, save_pred_proba_test
from collections import defaultdict
from functools import partial

In [37]:
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [38]:
import tensorflow as tf
from tensorflow.keras import layers

In [39]:
# Single seed reused by seed_everything, the CV splitter and the get_prediction calls below.
RANDOM_STATE = 77
# seed_everything comes from the project's helper module; presumably it seeds the
# Python / NumPy / TensorFlow RNGs — TODO confirm against helper.py.
seed_everything(RANDOM_STATE)

# MODEL TRAINING

In [40]:
# Load the prepared train/test frames from the paths declared in the project config.
# NOTE(review): unpickling can execute arbitrary code — only point these paths at
# files produced by this project's own preparation step.
train = pd.read_pickle(cfg.PREPARED_TRAIN_DATA_PATH)
test = pd.read_pickle(cfg.PREPARED_TEST_DATA_PATH)

In [41]:
# Cast the unordered categorical columns to pandas' string dtype; get_keras_model
# declares these columns as dtype='string' inputs and encodes them with a StringLookup.
train[cfg.CAT_UNORDERED_COLS] = train[cfg.CAT_UNORDERED_COLS].astype('string')
test[cfg.CAT_UNORDERED_COLS] = test[cfg.CAT_UNORDERED_COLS].astype('string')

In [42]:
# Replace missing values in the real-valued columns with the sentinel -1
# (applied identically to train and test).
train[cfg.REAL_COLS] = train[cfg.REAL_COLS].fillna(-1)
test[cfg.REAL_COLS] = test[cfg.REAL_COLS].fillna(-1)

In [43]:
# Features are every column except the targets; labels are the target columns only.
X_train, Y_train = train.drop(cfg.TARGETS, axis=1), train[cfg.TARGETS]

In [44]:
# Experiment identifiers passed to get_prediction and the save_* helpers below.
EXPERIMENT_FAMILY_NAME = 'keras'
EXPERIMENT_NAME = 'baseline'
# Fold count for the outer cross-validation loop and for the final get_prediction call.
N_SPLITS = 7
# Batch size used by get_keras_input when building tf.data datasets.
BATCH_SIZE = 32

In [45]:
# Alias for the unordered categorical columns.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
CAT_COLS = cfg.CAT_UNORDERED_COLS

In [46]:
def df_to_dataset(data: pd.DataFrame, labels: Optional[pd.DataFrame]=None, shuffle: bool=True, batch_size: int=32):
  """Turn a feature frame (and optional labels) into a batched ``tf.data.Dataset``.

  Args:
    data: Feature frame; every column becomes one entry of the features dict.
      The frame itself is only read, never modified.
    labels: Optional labels sliced row-by-row alongside the features. When
      omitted, each dataset element is a 1-tuple holding only the features dict.
    shuffle: Whether to shuffle with a buffer covering the whole frame.
    batch_size: Number of rows per batch.

  Returns:
    The batched (and optionally shuffled) dataset with prefetching enabled.
  """
  # One column-vector of shape (n_rows, 1) per feature, keyed by column name.
  # Built directly from ``data``: no intermediate DataFrame copy is needed
  # because nothing here mutates the input.
  features = {name: column.values[:, tf.newaxis] for name, column in data.items()}

  if labels is None:
    # Keep a tuple structure even without labels.
    ds = tf.data.Dataset.from_tensor_slices((features,))
  else:
    ds = tf.data.Dataset.from_tensor_slices((features, labels))
  if shuffle:
    # Buffer as large as the data, i.e. every row can land anywhere.
    ds = ds.shuffle(buffer_size=len(data))
  ds = ds.batch(batch_size)
  # NOTE(review): prefetch is applied after batch(), so this buffers up to
  # `batch_size` *batches*; tf.data.AUTOTUNE may be the intended value.
  ds = ds.prefetch(batch_size)
  return ds

In [47]:
def get_normalization_layer(name, dataset):
  """Return a ``Normalization`` layer whose statistics come from one feature.

  Args:
    name: Key of the feature inside each element's features dict.
    dataset: Dataset yielding ``(features, labels)`` pairs.

  Returns:
    The adapted normalization layer.
  """
  def pick_feature(features, _labels):
    # Drop the labels and every other feature.
    return features[name]

  scaler = layers.Normalization(axis=None)
  # adapt() computes the layer's statistics from the selected feature only.
  scaler.adapt(dataset.map(pick_feature))
  return scaler

In [48]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  """Build a callable that encodes one categorical feature.

  Args:
    name: Key of the feature inside each element's features dict.
    dataset: Dataset yielding ``(features, labels)`` pairs, used to learn the
      vocabulary.
    dtype: ``'string'`` selects a ``StringLookup``; anything else selects an
      ``IntegerLookup``.
    max_tokens: Forwarded to the lookup layer as its vocabulary cap.

  Returns:
    A function ``feature -> encoded`` that chains the lookup layer and a
    ``CategoryEncoding`` layer sized to the learned vocabulary.
  """
  # Choose the lookup flavour from the declared dtype.
  lookup_cls = layers.StringLookup if dtype == 'string' else layers.IntegerLookup
  lookup = lookup_cls(max_tokens=max_tokens)

  # Learn the vocabulary from this feature's values only.
  lookup.adapt(dataset.map(lambda features, _labels: features[name]))

  # One output slot per vocabulary entry reported by the lookup layer.
  to_vector = layers.CategoryEncoding(num_tokens=lookup.vocabulary_size())

  def encode(feature):
    # The closure keeps both layers alive so they can join a functional model.
    return to_vector(lookup(feature))

  return encode


In [49]:
def get_keras_model(train):
    """Assemble and compile the dense multi-output classifier.

    Every feature gets its own single-column ``Input`` named after the column:
    real-valued columns go through an adapted normalization layer, the
    ordered-categorical, binary and unordered-categorical columns through an
    adapted lookup + category-encoding pair. The encoded columns are
    concatenated and fed to two hidden Dense blocks followed by a linear output
    with one unit per target.

    Args:
        train: Dataset yielding ``(features, labels)`` pairs; used only to adapt
            the preprocessing layers.

    Returns:
        The compiled model. Its outputs are logits (the loss is configured with
        ``from_logits=True``).
    """
    keras = tf.keras
    model_inputs = []
    encoded_columns = []

    # Real-valued columns: one float input each, standardised with statistics
    # learned from `train`.
    for column in cfg.REAL_COLS:
        real_input = keras.Input(shape=(1,), name=column)
        scaler = get_normalization_layer(column, train)
        model_inputs.append(real_input)
        encoded_columns.append(scaler(real_input))

    # Categorical columns, grouped by the dtype their Input / lookup layer uses.
    categorical_groups = (
        (cfg.CAT_ORDERED_COLS + cfg.BINARY_COLS, 'int32'),
        (cfg.CAT_UNORDERED_COLS, 'string'),
    )
    for columns, column_dtype in categorical_groups:
        for column in columns:
            cat_input = keras.Input(shape=(1,), name=column, dtype=column_dtype)
            encode = get_category_encoding_layer(
                name=column,
                dataset=train,
                dtype=column_dtype,
                max_tokens=None,
            )
            model_inputs.append(cat_input)
            encoded_columns.append(encode(cat_input))

    # Hidden stack; the layer order is kept exactly as designed.
    hidden = keras.layers.concatenate(encoded_columns)
    hidden = keras.layers.Dense(256)(hidden)
    hidden = keras.layers.LeakyReLU(alpha=0.05)(hidden)
    hidden = keras.layers.Dropout(0.4)(hidden)
    hidden = keras.layers.BatchNormalization()(hidden)
    hidden = keras.layers.Dense(128)(hidden)
    hidden = keras.layers.BatchNormalization()(hidden)
    hidden = keras.layers.LeakyReLU(alpha=0.05)(hidden)
    hidden = keras.layers.Dropout(0.4)(hidden)
    hidden = keras.layers.BatchNormalization()(hidden)
    # Linear head: one logit per target column.
    logits = keras.layers.Dense(len(cfg.TARGETS))(hidden)

    model = keras.Model(model_inputs, logits)

    optimizer = keras.optimizers.Adam(
        learning_rate=0.0005,
        beta_1=0.9,
        beta_2=0.999,
        epsilon=1e-07,
        amsgrad=False,
        name='Adam',
    )
    loss = keras.losses.BinaryCrossentropy(from_logits=True)
    model.compile(optimizer=optimizer, loss=loss)
    return model

In [50]:
# NOTE(review): this module-level callback is not referenced by any visible cell —
# fit_keras_model constructs its own EarlyStopping with the same settings.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)

In [51]:
def get_keras_input(
    data: pd.DataFrame,
    labels: Optional[pd.DataFrame] = None,
    shuffle=False
    ) -> tf.data.Dataset:
    """Adapt a feature frame (and optional labels) for the Keras model.

    Delegates to ``df_to_dataset`` with the notebook-wide ``BATCH_SIZE``;
    shuffling is off unless requested.
    """
    dataset_kwargs = dict(
        data=data,
        labels=labels,
        shuffle=shuffle,
        batch_size=BATCH_SIZE,
    )
    return df_to_dataset(**dataset_kwargs)
        


In [52]:
from typing import Tuple


def fit_keras_model(train, val, iterations: Optional[int]=None) -> Tuple[Any, int]:
    """Build a fresh model with ``get_keras_model`` and fit it on ``train``.

    Two modes:

    * ``iterations`` given — train for exactly that many epochs, with no
      callbacks. The returned count is the number of epochs actually run,
      since no weights are rolled back in this mode.
    * ``iterations`` omitted — train for up to 200 epochs with early stopping
      on ``val_loss`` (patience 10, best weights restored). The returned count
      is the 1-based epoch with the lowest validation loss, floored at 3.

    Args:
        train: Training dataset, also used to adapt the preprocessing layers.
        val: Validation dataset forwarded to ``model.fit``.
        iterations: Fixed epoch budget, or ``None`` to use early stopping.

    Returns:
        ``(model, best_iter)``.
    """
    model = get_keras_model(train)

    if iterations is not None:
        history = model.fit(
            train,
            validation_data=val,
            epochs=iterations,
            verbose=0)
        # Every requested epoch contributes to the returned weights, so report
        # the epochs run rather than subtracting a patience that never applied.
        return model, len(history.epoch)

    patience = 10
    min_iters = 3
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience, restore_best_weights=True)
    history = model.fit(
        train,
        validation_data=val,
        epochs=200,
        verbose=0,
        callbacks=[early_stopping])

    # `len(epochs) - patience` is only the best epoch when early stopping
    # actually fired; if training used the whole epoch budget it under-reports
    # by `patience`. The first minimum of val_loss is correct in both cases
    # (np.argmin returns the first occurrence, which is also the epoch
    # EarlyStopping keeps as best).
    val_losses = history.history.get('val_loss')
    if val_losses:
        best_iter = int(np.argmin(val_losses)) + 1
    else:
        # No validation loss was recorded (e.g. val is None): nothing to rank.
        best_iter = len(history.epoch)
    return model, max(min_iters, best_iter)

In [53]:
def save_keras_model(model: Any, experiment_name: str, experiment_family_name: str, fold: int, suffix='') -> None:
    """Save ``model`` under ``<MODELS_PATH>/<family>/<experiment>/<model name>``.

    The model name is ``{experiment_name}_fold_{fold}``, with ``_{suffix}``
    appended only when ``suffix`` is non-empty, so each fold gets its own
    target even without a suffix.

    Args:
        model: Object exposing ``save(path)``.
        experiment_name: Experiment identifier; used for the directory and the
            model name.
        experiment_family_name: Parent directory grouping related experiments.
        fold: Fold index embedded in the model name.
        suffix: Optional tag appended to the model name.
    """
    # Build the base name first: writing this as `a + b if suffix else ''`
    # parses as `(a + b) if suffix else ''` and yields an empty name.
    model_name = f'{experiment_name}_fold_{fold}'
    if suffix:
        model_name += f'_{suffix}'

    # Use the arguments rather than the notebook-level constants so the
    # function honours what the caller passes.
    model_path = os.path.join(cfg.MODELS_PATH, experiment_family_name, experiment_name)
    # check_path is a project helper; presumably it creates the directory if
    # missing — TODO confirm against helper.py.
    check_path(model_path)
    model.save(os.path.join(model_path, model_name))

In [54]:
def predict_with_keras_model(model, data) -> np.ndarray:
    """Return sigmoid probabilities for ``data`` as a NumPy array.

    The model's raw outputs are passed through a sigmoid, and size-1 axes are
    dropped from the result.
    """
    raw_outputs = model.predict(data)
    probabilities = tf.nn.sigmoid(raw_outputs)
    return probabilities.numpy().squeeze()


In [55]:
# NOTE(review): weird_scores is never appended to or read in the visible cells.
weird_scores = []

# Outer cross-validation: multilabel-stratified, shuffled folds over the training set.
cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

# Out-of-fold containers aligned with Y_train; filled one validation fold at a time.
pred_int = pd.DataFrame(index=Y_train.index, columns=Y_train.columns, dtype=np.int32)
pred_proba = pd.DataFrame(index=Y_train.index, columns=Y_train.columns, dtype=np.float32)
# Best-iteration counts returned by get_prediction, accumulated over all outer folds.
best_iters_global = []
for train_idx, val_idx in tqdm(cv.split(X_train, Y_train), total=N_SPLITS):

    # Zero-filled frame with the validation fold's index and columns, passed as the prediction template.
    pred_template = Y_train.iloc[val_idx].copy()
    pred_template.iloc[:, :] = 0
    
    # get_prediction is a project helper (helper.py); the held-out fold is passed as its `test_data`.
    # NOTE(review): n_splits=3 is hard-coded and differs from N_SPLITS — confirm this is intentional.
    # NOTE(review): the logged checkpoint paths repeat baseline_fold_{0,1,2}_eval for every outer
    # fold, so each outer fold overwrites the previous fold's saved models.
    prediction, pred_proba_oof, pred_proba_test, best_iters = get_prediction(
        train_data=X_train.iloc[train_idx],
        train_labels=Y_train.iloc[train_idx],
        test_data=X_train.iloc[val_idx],
        pred_template=pred_template,
        process_input=get_keras_input,
        save_model=save_keras_model,
        fit_model=fit_keras_model,
        predict=predict_with_keras_model,
        n_splits=3,
        random_state=RANDOM_STATE,
        experiment_name=EXPERIMENT_NAME,
        experiment_family_name=EXPERIMENT_FAMILY_NAME,
        suffix='eval',
        rename_cols=False,
        )
    best_iters_global.extend(best_iters)
    
    # Write this fold's predictions into the rows of the held-out samples.
    pred_int.iloc[val_idx] = prediction
    pred_proba.iloc[val_idx] = pred_proba_test

# Score the assembled out-of-fold predictions against the true labels.
metrics = evaluate(
    test_labels=Y_train,
    prediction=pred_int.astype(np.int32), 
    pred_proba_test=pred_proba
    )


  0%|          | 0/7 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6639879374337222, 0.6594428969359332, 0.5704555361813426, 0.6150519978106185, 0.7052692252269561]
0.6428415187177146 0.04611286862311418


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6203755238344684, 0.6640075567445325, 0.5955196807545802, 0.630484693877551, 0.7103654154789566]
0.6441505741380177 0.039738339330027676


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6645658721843897, 0.6532962447844228, 0.5735896970796299, 0.6003826530612245, 0.7106259132392747]
0.6404920760697883 0.04847049337603022


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6188403878611377, 0.6453477051460361, 0.5464302765433913, 0.579719387755102, 0.6978326763870751]
0.6176340867385485 0.05236185744329606


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6687765111346766, 0.65983286908078, 0.586917348533603, 0.640850209815727, 0.7166110283875307]
0.6545975933904634 0.04205644311903179


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6691266505609426, 0.6660778859527121, 0.5624523852711772, 0.6360969387755102, 0.7032240286580325]
0.6473955778436749 0.04749966165617686


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6194658006362672, 0.627083419639035, 0.5332388840453357, 0.6175049730036942, 0.7285172198472549]
0.6251620594343175 0.06201117531463859
[0.6247999126891735, 0.6458703057748404, 0.5414336178981455, 0.5824571703047446, 0.7182178191653378]
0.6225557651664484 0.05981393259847378
TEST METRICS
defaultdict(<class 'list'>, {'weird_score': 0.6225557651664484, 'oof_auc': 0.6730687152102194, 'oof_logloss': 1.0711944862654073})


In [56]:
# Report the out-of-fold metrics and the best-iteration counts collected during cross-validation.
print('OVERALL METRICS')
print(metrics)
print(best_iters_global)

OVERALL METRICS
defaultdict(<class 'list'>, {'weird_score': 0.6225557651664484, 'oof_auc': 0.6730687152102194, 'oof_logloss': 1.0711944862654073})
[26, 27, 25, 35, 29, 26, 34, 26, 31, 29, 23, 28, 28, 36, 24, 25, 34, 30, 25, 28, 32]


In [57]:
# Epoch budget for the final fit: mean best-iteration count over all CV models, truncated to an int.
N_ITERS = int(np.mean(best_iters_global))

In [58]:
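# Reference numbers pasted from an earlier run (which experiment is not recorded):
# a list of five scores, followed by their mean and standard deviation.
# [0.632011421731218, 0.6490860706862861, 0.661385903220677, 0.6271449441761077, 0.6818099910552275]
# 0.6502876661739032 0.01993929966464236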

In [59]:
# Submission template indexed by ID; passed to get_prediction as the prediction template.
sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH).set_index('ID')

# Final run: fit on the full training set and predict the real test set.
# fit_keras_model is bound to a fixed epoch budget (N_ITERS) via partial, which
# selects its fixed-iterations branch instead of early stopping.
# The returned best-iteration list is discarded.
prediction, pred_proba_oof, pred_proba_test, _ = get_prediction(
    train_data=X_train,
    train_labels=Y_train,
    test_data=test,
    pred_template=sample_submission,
    process_input=get_keras_input,
    save_model=save_keras_model,
    fit_model=partial(fit_keras_model, iterations=N_ITERS),
    predict=predict_with_keras_model,
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE,
    experiment_name=EXPERIMENT_NAME,
    experiment_family_name=EXPERIMENT_FAMILY_NAME,
    suffix=''
    )

  0%|          | 0/7 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
[0.6590815386107878, 0.6668502539624258, 0.5762571252709611, 0.6077547099322197, 0.704604121331724]
0.6429095498216236 0.04542976811

In [60]:
# Persist the submission and the out-of-fold / test probabilities through the
# project's helper functions, keyed by experiment family and name.
save_submission(prediction, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
save_pred_proba_oof(pred_proba_oof, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
save_pred_proba_test(pred_proba_test, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)