In [1]:
!python --version

Python 3.9.10


In [None]:
# Environment setup (run these in a terminal before launching the notebook):
# python3 -m venv demo-env
# source demo-env/bin/activate
# pip install -r requirements.txt

Importing packages

In [2]:
import os

# TF_CPP_MIN_LOG_LEVEL must be set before TensorFlow is imported to reliably
# silence its native (C++) log output, so it is configured ahead of the imports.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import json
import pickle
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
# NOTE(review): tensorflow_hub / tensorflow_text are not referenced by name in
# this notebook; presumably they are needed so the reloaded BERT model can
# resolve its custom layers/ops -- confirm before removing.
import tensorflow_hub as hub
import tensorflow_text as text
from keras import regularizers
from official.nlp import optimization  # to create AdamW optimizer
from tensorflow.keras import layers

Reading the JSON file that maps the fake assignee names to their corresponding IDs

In [3]:
def read_assignee_dict(path):
    """Load the assignee mapping stored as JSON at ``path``.

    The file is decoded as UTF-8 explicitly so that non-ASCII names are read
    the same way on every platform instead of depending on the locale default.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON content (a dict for the assignee file used here).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)

In [4]:
# Name -> id mapping; used further down to turn predicted class ids back into names.
assignee_dict = read_assignee_dict('datasets/assignee_dict.json')

Paths of the models that were trained in the training notebook

In [5]:
# Locations of the three saved models, relative to the notebook's working directory.
bow_model_path = 'demo-models/bow_model_3'
rnn_model_path = 'demo-models/rnn_model_3'
bert_model_path = 'demo-models/bert_model_4'

Reloading trained models 

In [6]:
# Restore each saved model from its path with Keras' model loader.
bow_model_reloaded = tf.keras.models.load_model(bow_model_path)
rnn_model_reloaded = tf.keras.models.load_model(rnn_model_path)
bert_model_reloaded = tf.keras.models.load_model(bert_model_path)



Reading the saved input vectorizer for the bag-of-words model

In [7]:
# SECURITY NOTE: unpickling can execute arbitrary code -- only load a pickle
# file that you produced yourself (here: the one written by the training notebook).
# The `with` block guarantees the file handle is closed after loading.
with open("demo-models/input_vectorizer_1.pkl", "rb") as vectorizer_file:
    input_vectorizer_data = pickle.load(vectorizer_file)

# Rebuild the layer from its stored config, then restore its stored weights.
input_vectorizer_loaded = layers.TextVectorization.from_config(input_vectorizer_data['config'])
# You have to call `adapt` with some dummy data (BUG in Keras) https://stackoverflow.com/questions/65103526/how-to-save-textvectorization-to-disk-in-tensorflow
input_vectorizer_loaded.adapt(tf.data.Dataset.from_tensor_slices([""]))
input_vectorizer_loaded.set_weights(input_vectorizer_data['weights'])

We need to recompile the BERT model with its optimizer, loss, and metrics, because the AdamW optimizer could not be saved together with the model

In [8]:
# AdamW optimizer built with the model-garden helper; the warm-up length
# (207 steps) is 10% of the total number of training steps (2070).
init_lr = 1e-3
optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=2070,
    num_warmup_steps=207,
    optimizer_type='adamw',
)

# The loss is configured with from_logits=True, i.e. it expects raw logits.
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)
metrics = tf.keras.metrics.CategoricalAccuracy()

# Attach optimizer, loss and metric to the reloaded BERT model.
bert_model_reloaded.compile(optimizer=optimizer, loss=loss, metrics=metrics)

Creating a pipeline that takes the test data (one issue), a model, and the model type, and returns the model's prediction for that input

In [11]:
def prediction_pipeline(test_data, model, model_type, vectorizer=None):
    """Run a single issue text through ``model`` and return its flat output.

    Args:
        test_data: The issue text as one Python string.
        model: Object exposing ``predict`` (one of the reloaded models).
        model_type: ``'bow'``, ``'rnn'`` or ``'bert'``; selects how the text
            is turned into the model input.
        vectorizer: Callable that converts the text into bag-of-words
            features. Only used when ``model_type == 'bow'``; defaults to the
            notebook-level ``input_vectorizer_loaded``.

    Returns:
        The result of ``model.predict`` flattened to one dimension.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
            (Previously this surfaced as an UnboundLocalError.)
    """
    supported_types = ('bow', 'rnn', 'bert')
    if model_type not in supported_types:
        raise ValueError(
            'Unknown model_type {!r}; expected one of {}'.format(model_type, supported_types)
        )

    if model_type == 'bow':
        if vectorizer is None:
            vectorizer = input_vectorizer_loaded
        features = vectorizer(test_data)
        # Add the batch dimension. -1 lets TensorFlow infer the feature width
        # from the vectorizer output instead of hard-coding the vocabulary size.
        model_input = tf.reshape(features, [1, -1])
    else:
        # The 'rnn' and 'bert' models receive the raw text as a batch of one string.
        model_input = tf.constant([test_data])

    return model.predict(model_input).flatten()

def calculate_top_n_prediction(prediction, n, assignee_dict):
    """Print the names belonging to the ``n`` highest scores in ``prediction``.

    Args:
        prediction: 1-D NumPy array of scores; the position of a score is
            the class id it belongs to.
        n: Number of top entries to print.
        assignee_dict: Mapping from assignee name to class id.

    Raises:
        ValueError: If one of the top class ids has no name in ``assignee_dict``.
    """
    print('Top {} predictions:'.format(n))

    # Invert the mapping once instead of rebuilding key/value lists for every
    # prediction. setdefault keeps the first name seen for an id, which is
    # what the previous list.index() lookup returned as well.
    id_to_name = {}
    for name, class_id in assignee_dict.items():
        id_to_name.setdefault(class_id, name)

    # Negating the scores makes the ascending argsort yield the largest first.
    top_predictions = (-prediction).argsort()[:n]
    for pred in top_predictions:
        class_id = int(pred)
        if class_id not in id_to_name:
            raise ValueError('{} is not an id in assignee_dict'.format(class_id))
        print(id_to_name[class_id])

Demo data that was part of the test dataset during training; the ground truth is **Jennifer**

In [15]:
# Issue text used as the single input for the three model demos that follow.
test_data = 'We need to add the new requests response for the new consumer https cwiki apache org confluence display KAFKA Kafka Consumer Rewrite Design using the protocol definition'

Bag of Words Demo

In [16]:
# Bag-of-words model: score the issue text, then print its five best-ranked assignees.
bow_prediction = prediction_pipeline(
    test_data,
    model=bow_model_reloaded,
    model_type='bow',
)
calculate_top_n_prediction(bow_prediction, n=5, assignee_dict=assignee_dict)

Top 5 predictions:
Alice
Bob
Charlie
Jennifer
Patricia


BiLSTM Demo

In [17]:
# BiLSTM model: score the issue text, then print its five best-ranked assignees.
rnn_prediction = prediction_pipeline(
    test_data,
    model=rnn_model_reloaded,
    model_type='rnn',
)
calculate_top_n_prediction(rnn_prediction, n=5, assignee_dict=assignee_dict)

Top 5 predictions:
Jennifer
Thomas
Mary
Richard
Jan


BERT DEMO

In [18]:
# BERT model: score the issue text, then print its five best-ranked assignees.
bert_prediction = prediction_pipeline(
    test_data,
    model=bert_model_reloaded,
    model_type='bert',
)
calculate_top_n_prediction(bert_prediction, n=5, assignee_dict=assignee_dict)

Top 5 predictions:
Jennifer
Sophia
Charlie
Julia
Tom


Second Demo Example

Demo data that was part of the test dataset during training; the ground truth is **Steven**

In [None]:
# Second issue text; it replaces the first example as input for the demos that follow.
test_data = "In KIP we are reusing the request timeout ms to timeout the batches in the accumulator We were intended to avoid the case that the batches sitting in the accumulator forever when topic metadata is missing Currently we are not checking if metadata is available or not when we timeout the batches in the accumulator although the comments says we will check the metadata This causes problem that once the previous batch hit a request timeout and got retried all the subsequent batches will fail with timeout exception We should only timeout the batches in the accumulator when the metadata of the partition is missing"

Bag of Words Demo

In [12]:
# Bag-of-words model on the second example: score it and print the five best-ranked assignees.
bow_prediction = prediction_pipeline(
    test_data,
    model=bow_model_reloaded,
    model_type='bow',
)
calculate_top_n_prediction(bow_prediction, n=5, assignee_dict=assignee_dict)

Top 5 predictions:
Alice
Charlie
Natalie
Vanessa
Bob


BiLSTM Demo

In [13]:
# BiLSTM model on the second example: score it and print the five best-ranked assignees.
rnn_prediction = prediction_pipeline(
    test_data,
    model=rnn_model_reloaded,
    model_type='rnn',
)
calculate_top_n_prediction(rnn_prediction, n=5, assignee_dict=assignee_dict)

Top 5 predictions:
Steven
Vanessa
Mary
Julia
Alice


BERT Demo

In [14]:
# BERT model on the second example: score it and print the five best-ranked assignees.
bert_prediction = prediction_pipeline(
    test_data,
    model=bert_model_reloaded,
    model_type='bert',
)
calculate_top_n_prediction(bert_prediction, n=5, assignee_dict=assignee_dict)

Top 5 predictions:
Steven
Thomas
Sophia
Anna
Melissa
