In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
import os
from tqdm import tqdm_notebook as tqdm
from datetime import datetime
import pickle
from helpers import gen_embeddings_index, gen_acc_mappings, gen_tokenizer, convert_text_to_seq, get_labels, gen_embedding_weights

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Limit DataFrame display to 6 rows so cell outputs stay short.
pd.options.display.max_rows = 6
# Apply seaborn's 'darkgrid' style to subsequent plots.
sns.set(style='darkgrid')

# Set Parameters

In [None]:
# Columns of the x_* frames that hold the processed text fed to the tokenizer.
DESCR_COL_NAME = 'voucher_descr_proc'
VENDOR_COL_NAME = 'vendor_name_proc'
# Sequence lengths passed to convert_text_to_seq; also the model's input shapes.
MAX_DESCR_LENGTH = 30
MAX_VENDOR_LENGTH = 8
# Vocabulary cap passed to gen_tokenizer; also bounds the embedding matrix rows.
MAX_NUM_WORDS = 30000
# Width of each word vector in the embedding layer.
EMBEDDING_DIM = 100
# Conv1D filter count for the description branch (the vendor branch uses half).
N_CONV_FILTERS = 512
# Mini-batch size and maximum number of epochs passed to model.fit
# (early stopping may end training sooner).
BATCH_SIZE = 512
EPOCHS = 100

# Load Data

In [None]:
# Read every pre-split feature/label frame plus the account mapping table in one pass;
# the unpacking order matches the order of the paths below.
x_train, y_train, x_val, y_val, x_test, y_test, acc_mapping_df = map(pd.read_feather, [
    'data/processed/train/x_train.feather',
    'data/processed/train/y_train.feather',
    'data/processed/val/x_val.feather',
    'data/processed/val/y_val.feather',
    'data/processed/test/x_test.feather',
    'data/processed/test/y_test.feather',
    'data/misc/acc_mapping.feather',
])
x_train

In [None]:
# Distribution of description lengths in whitespace-separated tokens (spaces + 1),
# a quick sanity check for the MAX_DESCR_LENGTH cut-off.
descr_token_counts = x_train[DESCR_COL_NAME].str.count(' ') + 1
ax = descr_token_counts.hist()
ax.set(
    title='Description length (train)',
    xlabel='Tokens per description',
    ylabel='Number of vouchers'
);

# Build Embeddings Dict

In [None]:
# Build the embeddings lookup that gen_embedding_weights consumes further down.
# NOTE(review): presumably loads pretrained word vectors keyed by token — confirm in helpers.gen_embeddings_index.
embeddings_index = gen_embeddings_index()

# Prepare Mappings for Interpreting Results

In [None]:
# Derive class-index lookups from acc_mapping_df: len(acc_indices) sizes the model's
# output layer, and acc_index_to_descr turns predicted class indices into readable labels.
acc_indices, acc_index_to_code, acc_index_to_descr = gen_acc_mappings(acc_mapping_df)
acc_index_to_descr

# Tokenize

In [None]:
print('<--------Fitting tokenizer on texts-------->')
# The vocabulary is fitted on training text only: descriptions first, then vendor names.
tokenizer = gen_tokenizer(
    [*x_train[DESCR_COL_NAME].values, *x_train[VENDOR_COL_NAME].values],
    MAX_NUM_WORDS
)

print('<-------Converting text to sequences------->')
# Each text column is converted per split, in train / val / test order.
descr_train, descr_val, descr_test = [
    convert_text_to_seq(tokenizer, frame[DESCR_COL_NAME].values, MAX_DESCR_LENGTH)
    for frame in (x_train, x_val, x_test)
]
vendor_name_train, vendor_name_val, vendor_name_test = [
    convert_text_to_seq(tokenizer, frame[VENDOR_COL_NAME].values, MAX_VENDOR_LENGTH)
    for frame in (x_train, x_val, x_test)
]

print('<-----Converting labels to categorical----->')
labels_train, labels_val, labels_test = [
    get_labels(targets, acc_mapping_df)
    for targets in (y_train, y_val, y_test)
]
print('Train labels shape: {}\nValidation labels shape: {}\nTest labels shape: {}'.format(labels_train.shape, labels_val.shape, labels_test.shape))

print('<-----------Getting word indices----------->')
word_index = tokenizer.word_index
print('Found {} unique tokens.'.format(len(word_index)))

# Prepare Embedding Matrix

In [None]:
# Number of rows in the embedding matrix, capped at MAX_NUM_WORDS; also used as the
# Embedding layer's input_dim in gen_model.
# NOTE(review): if word_index is 1-based (as with the Keras Tokenizer), the largest token id
# can equal len(word_index), which would need len(word_index) + 1 rows whenever the vocabulary
# is below the cap — confirm against helpers.gen_tokenizer / helpers.gen_embedding_weights.
num_words = min(MAX_NUM_WORDS, len(word_index))
embedding_weights = gen_embedding_weights(num_words, EMBEDDING_DIM, word_index, embeddings_index)
# Peek at the first two rows only.
embedding_weights[:2]

# Define Model

In [None]:
def gen_model(ngrams=[1,2,3,4], embedding_trainable=True, incl_voucher_amt=True, dropout_rate=0.5):
    """Build and compile the two-branch n-gram CNN classifier.

    The voucher description and the vendor name are embedded with one shared
    embedding layer, convolved with one Conv1D per n-gram size, max-pooled over
    the whole sequence, concatenated, and fed (after dropout) into a softmax
    layer with one unit per entry of ``acc_indices``.

    Reads the notebook globals ``num_words``, ``embedding_weights``,
    ``acc_indices`` and the constants from the parameters cell.

    Args:
        ngrams: Convolution kernel sizes; one conv/pool pair is created per size
            and per branch. The default list is never mutated.
        embedding_trainable: Whether the embedding weights are updated during
            training.
        incl_voucher_amt: If True, add a scalar ``voucher_amt_input`` that is
            batch-normalised and concatenated to the text features.
        dropout_rate: Dropout rate applied to the concatenated text features.

    Returns:
        Tuple ``(incl_voucher_amt, model)`` where ``model`` is the compiled
        ``tf.keras`` model.

    Raises:
        ValueError: If ``ngrams`` is empty or contains a size that does not fit
            inside the shorter of the two input sequences.
    """
    ngrams = list(ngrams)
    # A kernel wider than the sequence leaves nothing to pool over; fail early
    # with a readable message instead of a shape error from inside Keras.
    max_ngram = min(MAX_DESCR_LENGTH, MAX_VENDOR_LENGTH)
    if not ngrams or any(ngram < 1 or ngram > max_ngram for ngram in ngrams):
        raise ValueError(
            'ngrams must be a non-empty collection of sizes between 1 and {}, got {}'.format(max_ngram, ngrams)
        )

    # No input_length here: the layer is shared by two inputs of different lengths.
    embedding_layer = tf.keras.layers.Embedding(
        num_words,
        EMBEDDING_DIM,
        weights=[embedding_weights],
        trainable=embedding_trainable,
        name='embedding'
    )
    descr_input = tf.keras.layers.Input(shape=(MAX_DESCR_LENGTH,), dtype='int32', name='descr_input')
    embedded_descr = embedding_layer(descr_input)
    vendor_input = tf.keras.layers.Input(shape=(MAX_VENDOR_LENGTH,), dtype='int32', name='vendor_input')
    embedded_vendor = embedding_layer(vendor_input)

    # add convolutions for ngrams
    descr_pools, vendor_pools = [], []
    for ngram in ngrams:
        descr_conv = tf.keras.layers.Conv1D(N_CONV_FILTERS, ngram, activation='relu', name='{}gram_descr_conv'.format(ngram))(embedded_descr)
        # Pool size equals the conv output length, so this is a max over the whole sequence.
        descr_pool = tf.keras.layers.MaxPool1D(MAX_DESCR_LENGTH - ngram + 1, name='{}gram_descr_pool'.format(ngram))(descr_conv)
        # Integer division: Conv1D expects an integer filter count.
        vendor_conv = tf.keras.layers.Conv1D(N_CONV_FILTERS // 2, ngram, activation='relu', name='{}gram_vendor_conv'.format(ngram))(embedded_vendor)
        vendor_pool = tf.keras.layers.MaxPool1D(MAX_VENDOR_LENGTH - ngram + 1, name='{}gram_vendor_pool'.format(ngram))(vendor_conv)
        descr_pools.append(descr_pool)
        vendor_pools.append(vendor_pool)

    # concatenate all ngram features, flatten and add dropout
    descr_total_pool = tf.keras.layers.Concatenate(name='all_ngrams_descr_pool')(descr_pools)
    vendor_total_pool = tf.keras.layers.Concatenate(name='all_ngrams_vendor_pool')(vendor_pools)
    descr_total_pool_flattened = tf.keras.layers.Flatten(name='descr_flatten')(descr_total_pool)
    vendor_total_pool_flattened = tf.keras.layers.Flatten(name='vendor_flatten')(vendor_total_pool)
    total_pool_flattened = tf.keras.layers.Concatenate(name='total_pool_flattened')([descr_total_pool_flattened, vendor_total_pool_flattened])
    dropout = tf.keras.layers.Dropout(dropout_rate, name='dropout')(total_pool_flattened)

    model_inputs = [descr_input, vendor_input]
    features = dropout
    if incl_voucher_amt:
        # The amount bypasses dropout and is only batch-normalised before the output layer.
        voucher_amt_input = tf.keras.layers.Input(shape=(1,), dtype='float32', name='voucher_amt_input')
        voucher_amt_normalised = tf.keras.layers.BatchNormalization(name='voucher_amt_normalised')(voucher_amt_input)
        features = tf.keras.layers.Concatenate(name='pool_with_amt')([dropout, voucher_amt_normalised])
        model_inputs.append(voucher_amt_input)

    preds = tf.keras.layers.Dense(len(acc_indices), activation='softmax', name='output')(features)
    model = tf.keras.models.Model(model_inputs, preds)

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy']
    )

    return incl_voucher_amt, model

In [None]:
# Instantiate the classifier. gen_model also hands back the incl_voucher_amt flag, which
# train_model() and predict() read to decide whether to feed the voucher amount input.
incl_voucher_amt, model = gen_model(
    ngrams=[1,2,3,4], 
    embedding_trainable=True, 
    incl_voucher_amt=True, 
    dropout_rate=0.5
)
# Print the layer-by-layer summary of the compiled model.
model.summary()

# Fit Model

In [None]:
def train_model():
    """Fit the global ``model`` on the training split and save it to disk.

    Reads the notebook globals ``model``, ``incl_voucher_amt``, the tokenized
    train/validation arrays, ``x_train``/``x_val`` (for the voucher amount) and
    the one-hot label arrays. Training logs go to ``./logs/<start_time>`` for
    TensorBoard, and the fitted model is written to
    ``model/model <start_time>.h5``.

    Returns:
        Tuple ``(start_time, model)`` where ``start_time`` is the run tag
        formatted as ``YYYY-MM-DD HHMM`` and ``model`` is the fitted model.
    """
    # Minute-resolution tag without ':' so it is usable in file names.
    start_time = datetime.now().strftime('%Y-%m-%d %H%M')
    print('Started training at {}'.format(start_time))

    # write_grads is intentionally not passed: False was already its default,
    # TF 2.x ignores it and newer Keras releases reject it.
    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        histogram_freq=1,
        write_graph=True,
        log_dir='./logs/{}'.format(start_time)
    )
    # Stop once the monitored metric (Keras default) has not improved for 5 epochs.
    early_stop_callback = tf.keras.callbacks.EarlyStopping(patience=5)

    # The input lists must follow the order of the model's inputs:
    # description, vendor name, then (optionally) voucher amount.
    train_inputs = [descr_train, vendor_name_train]
    val_inputs = [descr_val, vendor_name_val]
    if incl_voucher_amt:
        train_inputs.append(x_train.payment_voucher_amt)
        val_inputs.append(x_val.payment_voucher_amt)

    model.fit(
        train_inputs, labels_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        verbose=2,
        validation_data=(val_inputs, labels_val),
        callbacks=[tensorboard_callback, early_stop_callback]
    )

    # Make sure the target directory exists so a long training run is not lost at save time.
    os.makedirs('model', exist_ok=True)
    model.save('model/model {}.h5'.format(start_time))

    return start_time, model

In [None]:
# Train the model; start_time tags the TensorBoard log dir and the saved model file,
# and predict() reuses it in the validation export file name.
start_time, model = train_model()

In [None]:
def predict():
    """Score the validation split and export a per-voucher comparison sheet.

    Reads the notebook globals ``model``, ``incl_voucher_amt``, ``start_time``,
    the tokenized validation arrays, ``x_val``, ``labels_val`` and
    ``acc_index_to_descr``. The result is written to
    ``data/output/validation-<start_time>.xlsx``.

    Returns:
        DataFrame with one row per validation voucher: the raw and processed
        text columns, ``payment_voucher_amt`` (only when the model uses the
        amount), the actual and predicted account descriptions, a ``wrong``
        flag and the ``confidence`` (highest predicted probability).
    """
    val_inputs = [descr_val, vendor_name_val]
    if incl_voucher_amt:
        val_inputs.append(x_val.payment_voucher_amt)
    preds = model.predict(val_inputs)

    preds_df = pd.DataFrame({
        'actual_cls': np.argmax(labels_val, axis=1),
        'pred_cls': np.argmax(preds, axis=1)
    })

    # Predictions are positional with respect to x_val's rows, so copy the x_val
    # columns by position (.values) rather than relying on index alignment.
    preds_df = preds_df.assign(
        voucher_full_descr=x_val.voucher_full_descr.values,
        voucher_descr_proc=x_val.voucher_descr_proc.values,
        vendor_name=x_val.vendor_name.values,
        vendor_name_proc=x_val.vendor_name_proc.values,
        confidence=np.max(preds, axis=1),
        actual=preds_df.actual_cls.map(lambda cls: acc_index_to_descr[cls]),
        pred=preds_df.pred_cls.map(lambda cls: acc_index_to_descr[cls])
    )
    preds_df = preds_df.assign(wrong=preds_df.actual != preds_df.pred)

    # Only select the amount column when it exists; selecting it unconditionally
    # raised a KeyError for models built without the voucher amount.
    columns = ['voucher_full_descr', 'voucher_descr_proc', 'vendor_name', 'vendor_name_proc']
    if incl_voucher_amt:
        preds_df = preds_df.assign(payment_voucher_amt=x_val.payment_voucher_amt.values)
        columns.append('payment_voucher_amt')
    columns += ['actual', 'pred', 'wrong', 'confidence']
    preds_df = preds_df[columns]

    # Create the export directory on first use.
    os.makedirs('data/output', exist_ok=True)
    preds_df.to_excel('data/output/validation-{}.xlsx'.format(start_time), index=False)

    return preds_df

In [None]:
# Score the validation split, write the spreadsheet, and display the per-voucher results.
preds_df = predict()
preds_df