# IMPORTS

In [1]:
import sys
sys.path.insert(0, "../..")
import config as cfg
import gc
import os
from typing import Optional, Any, Union, Callable

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from metrics import compute_single_col_score, get_tresholds, compute_weird_pred_proba_score, compute_weird_pred_score
from sklearn.metrics import recall_score
from helper import make_prediction, check_path, get_prediction, evaluate, seed_everything, save_submission, save_pred_proba_oof, save_pred_proba_test
from collections import defaultdict

In [3]:
from sklearn.metrics import *
from sklearn.model_selection import StratifiedKFold, train_test_split
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

In [4]:
import tensorflow as tf
from tensorflow.keras import layers

In [5]:
RANDOM_STATE = 77
seed_everything(RANDOM_STATE)

# MODEL TRAINING

In [6]:
train = pd.read_pickle(cfg.PREPARED_TRAIN_DATA_PATH)
test = pd.read_pickle(cfg.PREPARED_TEST_DATA_PATH)

In [7]:
train[cfg.CAT_UNORDERED_COLS] = train[cfg.CAT_UNORDERED_COLS].astype('string')
test[cfg.CAT_UNORDERED_COLS] = test[cfg.CAT_UNORDERED_COLS].astype('string')

In [8]:
train[cfg.REAL_COLS] = train[cfg.REAL_COLS].fillna(-1)
test[cfg.REAL_COLS] = test[cfg.REAL_COLS].fillna(-1)

In [9]:
X_train, Y_train = train.drop(cfg.TARGETS, axis=1), train[cfg.TARGETS]

In [10]:
EXPERIMENT_FAMILY_NAME = 'keras'
EXPERIMENT_NAME = 'baseline'
N_SPLITS = 8
BATCH_SIZE = 16

In [11]:
CAT_COLS = cfg.CAT_UNORDERED_COLS

In [12]:
def df_to_dataset(data: pd.DataFrame, labels: Optional[pd.DataFrame]=None, shuffle: bool=True, batch_size: int=32):
  df = data.copy()
  df = {key: value.values[:, tf.newaxis] for key, value in data.items()}
  
  if labels is None:
    ds = tf.data.Dataset.from_tensor_slices((dict(df),))
  else:
    ds = tf.data.Dataset.from_tensor_slices((dict(df), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(data))
  ds = ds.batch(batch_size)
  ds = ds.prefetch(batch_size)
  return ds

In [13]:
def get_normalization_layer(name, dataset):
  # Create a Normalization layer for the feature.
  normalizer = layers.Normalization(axis=None)

  # Prepare a Dataset that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the statistics of the data.
  normalizer.adapt(feature_ds)

  return normalizer

In [14]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
  # Create a layer that turns strings into integer indices.
  if dtype == 'string':
    index = layers.StringLookup(max_tokens=max_tokens)
  # Otherwise, create a layer that turns integer values into integer indices.
  else:
    index = layers.IntegerLookup(max_tokens=max_tokens)

  # Prepare a `tf.data.Dataset` that only yields the feature.
  feature_ds = dataset.map(lambda x, y: x[name])

  # Learn the set of possible values and assign them a fixed integer index.
  index.adapt(feature_ds)

  # Encode the integer indices.
  encoder = layers.CategoryEncoding(num_tokens=index.vocabulary_size())

  # Apply multi-hot encoding to the indices. The lambda function captures the
  # layer, so you can use them, or include them in the Keras Functional model later.
  return lambda feature: encoder(index(feature))


In [15]:
def get_keras_model(train):
    all_inputs = []
    encoded_features = []

    # Numerical features.
    for header in cfg.REAL_COLS:
        numeric_col = tf.keras.Input(shape=(1,), name=header)
        normalization_layer = get_normalization_layer(header, train)
        encoded_numeric_col = normalization_layer(numeric_col)
        all_inputs.append(numeric_col)
        encoded_features.append(encoded_numeric_col)

    for header in cfg.CAT_ORDERED_COLS + cfg.BINARY_COLS:
        ordered_cat_col = tf.keras.Input(shape=(1,), name=header, dtype='int32')

        encoding_layer = get_category_encoding_layer(name=header,
                                                    dataset=train,
                                                    dtype='int32',
                                                    max_tokens=None)
        encoded_ordered_cat_col = encoding_layer(ordered_cat_col)
        all_inputs.append(ordered_cat_col)
        encoded_features.append(encoded_ordered_cat_col)

    for header in cfg.CAT_UNORDERED_COLS:
        categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
        encoding_layer = get_category_encoding_layer(name=header,
                                                    dataset=train,
                                                    dtype='string',
                                                    max_tokens=None)
        encoded_categorical_col = encoding_layer(categorical_col)
        all_inputs.append(categorical_col)
        encoded_features.append(encoded_categorical_col)


    all_features = tf.keras.layers.concatenate(encoded_features)
    x = tf.keras.layers.Dense(256)(all_features)
    x = tf.keras.layers.LeakyReLU(alpha=0.05)(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dense(128)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.LeakyReLU(alpha=0.05)(x)
    x = tf.keras.layers.Dropout(0.4)(x)
    x = tf.keras.layers.BatchNormalization()(x)
    output = tf.keras.layers.Dense(len(cfg.TARGETS))(x)

    model = tf.keras.Model(all_inputs, output)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(
            learning_rate=0.0005,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-07,
            amsgrad=False,
            name='Adam'
            )
    ,
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        # metrics=['AUC']
        )
    return model

In [16]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.0001)

In [17]:
def get_keras_input(
    data: pd.DataFrame,
    labels: Optional[pd.DataFrame] = None,
    shuffle=False
    ) -> tf.data.Dataset:
    return df_to_dataset(
        data=data,
        labels=labels,
        shuffle=shuffle, 
        batch_size=BATCH_SIZE)
        


In [18]:
def fit_keras_model(train, val):
    model = get_keras_model(train)

    early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    callbacks = [early_stopping]

    model.fit(
        train, 
        validation_data=val,
        epochs=200, 
        verbose=0, 
        callbacks=callbacks)
        
    return model

In [19]:
def save_keras_model(model: Any, experiment_name: str, experiment_family_name: str, fold: int, suffix='') -> None:
    model_name = f'{experiment_name}_fold_{fold}' + f'_{suffix}' if suffix else ''
    model_path = os.path.join(cfg.MODELS_PATH, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
    check_path(model_path)
    model.save(os.path.join(model_path, model_name))

In [20]:
def predict_with_keras_model(model, data) -> np.ndarray:
    return tf.nn.sigmoid(model.predict(data)).numpy().squeeze()


In [21]:
weird_scores = []

cv = MultilabelStratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_STATE, shuffle=True)

pred_int = pd.DataFrame(index=Y_train.index, columns=Y_train.columns, dtype=np.int32)
pred_proba = pd.DataFrame(index=Y_train.index, columns=Y_train.columns, dtype=np.float32)
for train_idx, val_idx in tqdm(cv.split(X_train, Y_train), total=N_SPLITS):

    pred_template = Y_train.iloc[val_idx].copy()
    pred_template.iloc[:, :] = 0
    
    prediction, pred_proba_oof, pred_proba_test = get_prediction(
        train_data=X_train.iloc[train_idx],
        train_labels=Y_train.iloc[train_idx],
        test_data=X_train.iloc[val_idx],
        pred_template=pred_template,
        process_input=get_keras_input,
        save_model=save_keras_model,
        fit_model=fit_keras_model,
        predict=predict_with_keras_model,
        n_splits=3,
        random_state=RANDOM_STATE,
        experiment_name=EXPERIMENT_NAME,
        experiment_family_name=EXPERIMENT_FAMILY_NAME,
        suffix='eval',
        rename_cols=False,
        )
    
    pred_int.iloc[val_idx] = prediction
    pred_proba.iloc[val_idx] = pred_proba_test

metrics = evaluate(
    test_labels=Y_train,
    prediction=pred_int.astype(np.int32), 
    pred_proba_test=pred_proba
    )


  0%|          | 0/8 [00:00<?, ?it/s]

2022-07-06 21:23:30.319282: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-06 21:23:31.162820: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5379 MB memory:  -> device: 0, name: NVIDIA GeForce GTX 1060 with Max-Q Design, pci bus id: 0000:01:00.0, compute capability: 6.1


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6295592705167173, 0.6349708820858043, 0.5681646955759965, 0.6285357588728375, 0.7079625158100494]
0.633838624572281 0.044396241878955654


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6643760700019021, 0.6676282561409093, 0.5489035087719298, 0.6236789041857878, 0.7144482857966004]
0.6438070049794259 0.055484309551896385


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6641858474415065, 0.6879463927453655, 0.6233790999237223, 0.6741239048811014, 0.7190002881014117]
0.6737271066186216 0.03123448066872082


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6234148119967028, 0.6568226199074445, 0.5596052631578947, 0.6770094562647754, 0.7077787381158167]
0.6449261778885268 0.05072219191676589


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6550151975683891, 0.6863012234866699, 0.556101620674551, 0.6322916666666667, 0.7147073191758384]
0.648883405514423 0.05413727917316716


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6736638804457953, 0.6691810487556127, 0.5960526315789474, 0.5988194444444445, 0.7118279260898301]
0.649908986262926 0.045343821459418246


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6450734549138804, 0.6231100069455575, 0.6005519053876478, 0.6461805555555555, 0.7267103598942164]
0.6483252565393716 0.042608685209509764


  0%|          | 0/3 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_0_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_1_eval/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/baseline_fold_2_eval/assets
[0.6659067882472138, 0.6601080809017338, 0.5944634253175646, 0.6443749999999999, 0.7170863516154996]
0.6563879292164023 0.039414975171944575
[0, 0, 0, 0, 0]
0.0 0.0
TEST METRICS
defaultdict(<class 'list'>, {'weird_score': 0.0, 'oof_auc': 0.6836071741031793, 'oof_logloss': 1.0570146348467737})


In [51]:
print('OVERALL METRICS')
print(metrics)

OVERALL METRICS
defaultdict(<class 'list'>, {'weird_score': 0.6367762920439424, 'oof_auc': 0.6836071741031793, 'oof_logloss': 1.0570146348467737})


In [23]:
# [0.632011421731218, 0.6490860706862861, 0.661385903220677, 0.6271449441761077, 0.6818099910552275]
# 0.6502876661739032 0.01993929966464236

In [24]:
sample_submission = pd.read_csv(cfg.SAMPLE_SUBMISSION_PATH).set_index('ID')

prediction, pred_proba_oof, pred_proba_test = get_prediction(
    train_data=X_train,
    train_labels=Y_train,
    test_data=test,
    pred_template=sample_submission,
    process_input=get_keras_input,
    save_model=save_keras_model,
    fit_model=fit_keras_model,
    predict=predict_with_keras_model,
    n_splits=N_SPLITS,
    random_state=RANDOM_STATE,
    experiment_name=EXPERIMENT_NAME,
    experiment_family_name=EXPERIMENT_FAMILY_NAME,
    suffix=''
    )

  0%|          | 0/8 [00:00<?, ?it/s]

INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
INFO:tensorflow:Assets written to: /home/as/my_repositories/yar_digital_breakthrough_2022/checkpoints/keras/baseline/assets
[0.68055

In [25]:
save_submission(prediction, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
save_pred_proba_oof(pred_proba_oof, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)
save_pred_proba_test(pred_proba_test, EXPERIMENT_FAMILY_NAME, EXPERIMENT_NAME)