In [1]:
import sys
sys.path.insert(0, '..')

In [19]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from typing import Dict
from config import TEXT_COLS, ORDERED_CATEGORIES, UNORDERED_CATEGORIES
from sklearn.model_selection import train_test_split

In [3]:
train = pd.read_pickle('../data/prepared/train.pkl')
target = pd.read_pickle('../data/prepared/target.pkl')
test = pd.read_pickle('../data/prepared/test.pkl')

In [4]:
BUFFER_SIZE = 10000
VOCAB_SIZE = 10000
BATCH_SIZE = 64
N_TARGETS = target.shape[1]

In [5]:
def get_encoders(data: pd.DataFrame, **keywords) -> Dict:
    encoders = {}
    for col in data.columns:
        encoder = layers.TextVectorization(max_tokens=VOCAB_SIZE, name=col, **keywords)
        encoder.adapt(train[col])
        encoders[col] = encoder
    return encoders

In [6]:
encoders = get_encoders(train[TEXT_COLS+UNORDERED_CATEGORIES])

2022-01-17 18:39:05.836905: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [23]:
def get_model(encoders):

    def _get_text_model(input, encoder):
        x = encoder(input)
        x = layers.Embedding(
                input_dim=len(encoder.get_vocabulary()),
                output_dim=64,
                mask_zero=True)(x)
        x = layers.Bidirectional(tf.keras.layers.LSTM(64))(x)
        x = layers.Dropout(0.5)(x)
        x = layers.Dense(64, activation='relu')(x)
        x = layers.Dropout(0.3)(x)
        x = layers.Dense(N_TARGETS)(x)
        return x

    def _get_ordered_category_model(input):
        x = layers.Dense(32, activation='relu')(input)
        x = layers.Dropout(0.2)(x)
        x = layers.Dense(N_TARGETS)(x)
        return x

    def _get_unordered_category_mode(input, encoder):
        x = encoder(input)
        x = layers.CategoryEncoding(num_tokens=len(encoder.get_vocabulary()))(x)
        x = layers.Dense(64, activation='relu')(x)
        x = layers.Dropout(0.2)(x)
        x = layers.Dense(32, activation='relu')(x)
        x = layers.Dropout(0.2)(x)
        x = layers.Dense(N_TARGETS)(x)
        return x

    def _get_final_classifier(features):
        x = layers.Dense(128, activation='relu')(features)
        x = layers.Dropout(0.3)(x)
        x = layers.Dense(64, activation='relu')(x)
        x = layers.Dropout(0.3)(x)
        x = layers.Dense(N_TARGETS, activation='sigmoid')(x)
        return x

    text_inputs = {col: keras.Input(shape=(None,), dtype='string', name=f'{col}_text') for col in TEXT_COLS + UNORDERED_CATEGORIES}
    unordered_cat_inputs = {col: keras.Input(shape=(None,), dtype='string', name=f'{col}_unordered_cat') for col in UNORDERED_CATEGORIES}
    ordered_cat_input = keras.Input(shape=(len(ORDERED_CATEGORIES)), name='ordered_cat_input')

    ordered_cat_features = [_get_ordered_category_model(ordered_cat_input)]
    text_features = [_get_text_model(text_inputs[col], encoders[col]) for col in TEXT_COLS]
    unordered_cat_features = [_get_unordered_category_mode(unordered_cat_inputs[col], encoders[col]) for col in UNORDERED_CATEGORIES]

    features = layers.Add()(ordered_cat_features + text_features + unordered_cat_features)
    out = _get_final_classifier(features)

    model = keras.Model(
        inputs=[ordered_cat_input] + list(text_inputs.values()) + list(unordered_cat_inputs.values()),
        outputs=out
    )

    model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=keras.losses.BinaryCrossentropy(from_logits=False)
)

    return model


In [24]:
model = get_model(encoders)

In [20]:
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.33, random_state=42)

In [25]:
x = {f'{col}_text': X_train[col] for col in TEXT_COLS + UNORDERED_CATEGORIES}
x.update({f'{col}_unordered_cat': X_train[col] for col in UNORDERED_CATEGORIES})
x.update({'ordered_cat_input': X_train[ORDERED_CATEGORIES]})

In [26]:
model.fit(x=x, y=y_train, epochs=3, batch_size=2048)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f8968ce3c40>

In [27]:
x = {f'{col}_text': X_val[col] for col in TEXT_COLS + UNORDERED_CATEGORIES}
x.update({f'{col}_unordered_cat': X_val[col] for col in UNORDERED_CATEGORIES})
x.update({'ordered_cat_input': X_val[ORDERED_CATEGORIES]})

In [29]:
pred = model.predict(x)

In [35]:
np.median(pred, axis=0)

array([0.35894182, 0.04393944, 0.00411439, 0.00540468, 0.00485599,
       0.00279172, 0.00828095, 0.00307931, 0.64143836], dtype=float32)

In [38]:
from sklearn.metrics import f1_score

In [42]:
pred_labels = np.where(pred > np.quantile(pred, axis=0), 1, 0)
f1_score(y_val, pred_labels, average='samples')

0.5947687115346436

In [43]:
def expect_f1(y_prob, thres):
    idxs = np.where(y_prob >= thres)[0]
    tp = y_prob[idxs].sum()
    fp = len(idxs) - tp
    idxs = np.where(y_prob < thres)[0]
    fn = y_prob[idxs].sum()
    return 2*tp / (2*tp + fp + fn)

def optimal_threshold(y_prob):
    y_prob = np.sort(y_prob)[::-1]
    f1s = [expect_f1(y_prob, p) for p in y_prob]
    thres = y_prob[np.argmax(f1s)]
    return thres, f1s

thres, f1s = optimal_threshold(pred[:, 0])
print(f"Predicted Optimal Threshold is {thres:.5f} with F1 score {expect_f1(pred[:, 0], thres):.5f}")

Predicted Optimal Threshold is 0.34998 with F1 score 0.70018


In [45]:
pred_labels = np.zeros_like(pred, dtype=np.int8)
for col_idx in range(y_val.shape[1]):
    thres, _ = optimal_threshold(pred[:, col_idx])
    pred_labels[:, col_idx] = np.where(pred[:, col_idx] > thres, 1, 0)
print('f1_scores_samples', f1_score(y_val, pred_labels, average='samples'))
    

f1_scores_samples 0.7493755473564884


In [49]:
def get_pred_labels(pred_proba: np.ndarray, y_true: np.ndarray = None):
    pred_labels = np.zeros_like(pred_proba, dtype=np.int8)
    for col_idx in range(9):
        thres, _ = optimal_threshold(pred_proba[:, col_idx])
        pred_labels[:, col_idx] = np.where(pred_proba[:, col_idx] > thres, 1, 0)
    if y_true:
        print('f1_scores_samples', f1_score(y_true, pred_labels, average='samples'))
    return pred_labels

In [50]:
pred_labels = get_pred_labels(pred)

In [53]:
a = pd.DataFrame(np.zeros_like(pred))

In [52]:
pred_labels.shape

(16790, 9)

In [54]:
a.iloc[list(range(100))] = pred_labels[list(range(100))]

In [55]:
a

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...
16785,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16786,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16787,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16788,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [56]:
target.shape

(50876, 9)