In [1]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split

from src.models import GoldenRetriever
from src.encoders import USEEncoder, ALBERTEncoder, BERTEncoder
from src.data_handler.kb_handler import kb, kb_handler

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
# Per-kb train/test splits: {kb_name: array of df index labels}.
train_dict = dict()
test_dict = dict()

# Get df using kb_handler
kbh = kb_handler()
kbs = kbh.load_sql_kb(
    cnxn_path="./db_cnxn_str.txt",
    kb_names=["nrf_virement"],
)

# Stack every kb into one frame. `drop=True` must be the boolean: the previous
# string 'True' only worked because any non-empty string is truthy.
df = pd.concat([single_kb.create_df() for single_kb in kbs]).reset_index(drop=True)

# Keep only the first 10 rows (presumably to keep this smoke test fast).
df = df.iloc[:10]
kb_names = df['kb_name'].unique()

for kb_name in kb_names:
    # Filter and key by the loop variable; a hard-coded kb name here would make
    # every iteration overwrite the same entry once more than one kb is loaded.
    kb_id = df[df['kb_name'] == kb_name].index.values
    train_idx, test_idx = train_test_split(kb_id, test_size=0.4,
                                           random_state=100)

    train_dict[kb_name] = train_idx
    test_dict[kb_name] = test_idx

In [4]:
def _generate_hard_neg_ans(df, train_dict, model):
    """
    Generates negative answer from dataframe by randomization
    
    Sample output:
    --------------
    {'PDPA': [array([ 95,  84,  42, 185, 187, 172, 145,  71,   5,  36,  43, 153,  70,
                    140, 165,   0,  78, 162,  68, 184, 179,  30, 106,  13,  72,  17,
                    18,  38, 109,  47, 113,  56,  27,  63, 147, 105, 121,   2,  80,
                    182,  61,  49, 135, 193,  91,   4, 100, 141, 129, 159, 132, 108,
                    155, 130,  86,  93, 137, 144,  58,  60, 107, 143, 194,  34,  14,
                    66,  53,  98, 180,  94, 138, 176,  79,  87, 103,  67,  24,   8]),
              array([141, 129, 155,   5, 108, 180,  63,   0, 143, 130,  98, 132,  61,
                     103, 137,  13,  17,  71, 107, 144, 121,  68,  66, 184, 179, 135,
                     113, 194,  58,  53, 193,  34,  42,  78,  60, 106, 182,  72, 172,
                     145, 100, 176,  36, 159,  30,  14,  93,  43,  95,  79,   2,  87,
                       8,  18, 147,  91,  49,   4,  70,  67,  84,  80,  27,  47,  38,
                     138,  24, 187,  86, 153,  94, 140, 162, 109,  56, 105, 185, 165])],
     'nrf': [array([214, 240, 234, 235, 326, 244, 226, 252, 317, 331, 259, 215, 333,
                    318, 276, 267, 251, 329, 257, 261, 243, 245, 203, 337, 255, 287,
                    315, 296, 279, 209, 197, 227, 200, 304, 223, 198, 282, 289, 205,
                    319, 212, 254, 256, 303, 338, 230, 210, 262, 249, 294, 290, 275,
                    283, 299, 263, 220, 204]),
              array([249, 245, 331, 290, 254, 249, 249, 261, 296, 251, 214, 240, 275,
                     294, 319, 337, 215, 197, 200, 257, 289, 203, 282, 252, 315, 317,
                     230, 283, 304, 279, 333, 249, 299, 204, 318, 326, 262, 287, 256,
                     234, 303, 235, 243, 276, 198, 338, 220, 329, 255, 209, 263, 267,
                     210, 223, 259, 212, 205])]}
    """
    train_dict_with_neg = {}
    random.seed(42)

    for kb, ans_pos_idxs in train_dict.items():
        keys = []
        train_df = df.loc[ans_pos_idxs]

        # encodings of all possible answers
        all_possible_answers_in_kb = train_df.processed_string.unique().tolist()
        encoded_all_possible_answers_in_kb = model.predict(all_possible_answers_in_kb, string_type='response')

        # encodings of train questions
        train_questions = train_df.query_string
        encoded_train_questions = model.predict(train_questions, string_type='query')

        # get similarity matrix
        from sklearn.metrics.pairwise import cosine_similarity
        similarity_matrix = cosine_similarity(encoded_train_questions, encoded_all_possible_answers_in_kb)

        # get index of correct answers, indexed according to unique answers
        correct_answers = train_df.processed_string.tolist()
        idx_of_correct_answers = [all_possible_answers_in_kb.index(correct_answer) for correct_answer in correct_answers]

        # get second best answer index by kb_df
        ans_neg_idxs = []
        for idx_of_correct_answer, similarity_array in zip(idx_of_correct_answers, similarity_matrix):
            similarity_array[idx_of_correct_answer] = -1
            second_best_answer_idx_in_all_possible_answers = similarity_array.argmax()
            second_best_answer_string = all_possible_answers_in_kb[second_best_answer_idx_in_all_possible_answers]
            second_best_answer_idx_in_kb_df = train_df.loc[train_df.processed_string == second_best_answer_string].index[0]
            ans_neg_idxs.append(second_best_answer_idx_in_kb_df)

        # return a list of correct and close wrong answers
        keys.append(ans_pos_idxs)
        keys.append(np.array(ans_neg_idxs))
        train_dict_with_neg[kb] = keys 
    
    return train_dict_with_neg

In [5]:
def gen(batch_size, query, response, neg_response, shuffle_data=False):
    """
    Endless generator of (query, response, negative response) batches.

    The three inputs are zipped into triplets (truncated to the shortest
    input) and served `batch_size` at a time; the last batch of a pass may be
    smaller. After a full pass the generator starts over, reshuffling the
    triplets first when `shuffle_data` is true.

    Parameters
    ----------
    batch_size : int
        Number of triplets per batch; must be positive.
    query, response, neg_response : sequence
        Parallel sequences; element i of each forms one triplet.
    shuffle_data : bool
        Shuffle the triplets in place before every pass.

    Yields
    ------
    tuple of three lists
        (q_batch, r_batch, neg_r_batch)

    Raises
    ------
    ValueError
        If `batch_size` is not positive (raised on the first `next()`).

    Notes
    -----
    The global `random` module is reseeded with 42 when iteration starts.
    With no triplets the generator finishes immediately.
    """
    # A negative step used to produce an empty range and spin forever.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    random.seed(42)
    zip_list = list(zip(query, response, neg_response))

    # Count the zipped triplets, not `query`: zip truncates to the shortest
    # input, and counting `query` yielded empty trailing batches.
    num_samples = len(zip_list)
    if num_samples == 0:
        # Nothing to serve; without this the `while True` below never yields.
        return

    while True:
        if shuffle_data:
            random.shuffle(zip_list)

        for offset in range(0, num_samples, batch_size):
            batch = zip_list[offset:offset + batch_size]
            q_batch = [triplet[0] for triplet in batch]
            r_batch = [triplet[1] for triplet in batch]
            neg_r_batch = [triplet[2] for triplet in batch]

            yield (q_batch, r_batch, neg_r_batch)

In [6]:
def hard_triplet_generator(df, train_dict, model, batch_size=2):
    """
    Returns a generator that gives batches of training triplets

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the `query_string` and `processed_string` columns.
    train_dict : dict
        {kb_name: index labels of `df`} used as the training rows.
    model : object
        Passed through to `_generate_hard_neg_ans` to pick the negatives.
    batch_size : int
        Triplets per batch; defaults to 2, the value that used to be
        hard-coded.

    Returns
    -------
    generator
        The generator built by `gen(..., shuffle_data=True)`, yielding
        (query batch, response batch, negative response batch).
    """
    train_dict_with_neg = _generate_hard_neg_ans(df, train_dict, model)
    train_pos_idxs = np.concatenate([pair[0] for pair in train_dict_with_neg.values()], axis=0)
    train_neg_idxs = np.concatenate([pair[1] for pair in train_dict_with_neg.values()], axis=0)

    # The indices are index *labels* (`_generate_hard_neg_ans` reads them with
    # df.loc), so resolve them with .loc as well; .iloc only gave the same
    # rows while df carried a default 0..n-1 index.
    positive_rows = df.loc[train_pos_idxs]
    train_query = positive_rows.query_string.tolist()
    train_response = positive_rows.processed_string.tolist()
    train_neg_response = df.loc[train_neg_idxs].processed_string.tolist()

    train_dataset_loader = gen(batch_size, train_query, train_response,
                               train_neg_response, shuffle_data=True)

    return train_dataset_loader

In [7]:
def test_goldenretriever(encoder=None, save_dir=None, kbs=None,
                         query_string=None, kb_name=None):
    """
    Round-trip check: fine-tune, export, restore, and compare encodings.

    Steps, in order:
      1. Build a GoldenRetriever around `encoder()` and run one triplet-loss
         fine-tuning step on the first batch from `hard_triplet_generator`.
      2. Encode a fixed probe sentence and export the encoder to `save_dir`.
      3. Build a second GoldenRetriever, restore the encoder from
         `save_dir`, load `kbs`, and run `make_query(query_string, kb_name)`.
      4. Encode the probe sentence again and assert both encodings are equal.

    NOTE: reads the module-level `df` and `train_dict`; they are not
    parameters, so the cell that defines them must have run first.
    """
    probe_sentence = "Why is the sky blue?"

    # --- fine-tune a fresh retriever on a single batch ---
    trained = GoldenRetriever(encoder())
    triplet_batches = hard_triplet_generator(df, train_dict, trained)

    for questions, answers, hard_negatives in triplet_batches:
        cost_mean_batch = trained.finetune(
            question=questions,
            answer=answers,
            context=answers,
            neg_answer=hard_negatives,
            neg_answer_context=hard_negatives,
            margin=0.3,
            loss="triplet",
        )
        print("cost_mean_batch", cost_mean_batch)
        # one batch is enough for this check
        break

    encoded_text = trained.encoder.encode(probe_sentence, string_type="query")
    trained.export_encoder(save_dir=save_dir)

    # --- restore into a second, independent retriever ---
    restored = GoldenRetriever(encoder())
    restored.restore_encoder(save_dir=save_dir)
    restored.load_kb(kbs)
    restored.make_query(query_string, kb_name=kb_name)
    encoded_text_2 = restored.encoder.encode(probe_sentence, string_type="query")

    print("encoded_text", encoded_text)
    print("encoded_text_2", encoded_text_2)

    # the restored encoder must reproduce the fine-tuned encoding
    tf.debugging.assert_equal(encoded_text, encoded_text_2)

In [8]:
# Query that test_goldenretriever forwards to `make_query` on the restored retriever.
query_string = "What are the requirements of debarring investigator"

In [10]:
# Fine-tune / export / restore round trip with USEEncoder; the encoder is
# exported to and restored from ./finetune_use.
test_goldenretriever(encoder=USEEncoder, save_dir="./finetune_use", kbs=kbs,
                     query_string=query_string, kb_name="nrf_virement")

model initiated!
cost_mean_batch 0.3
Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: ./finetune_use\assets


INFO:tensorflow:Assets written to: ./finetune_use\assets


model initiated!
model initiated!
2020-05-14 20:06:23.759892 : kb loaded - nrf_virement 
encoded_text tf.Tensor(
[[ 0.07713778 -0.01766009 -0.04501201 -0.04626905 -0.05282243 -0.026104
   0.07056151  0.03558303 -0.05186373 -0.0064343   0.02678842 -0.08156374
  -0.02875711 -0.03512157 -0.05726431 -0.00918735 -0.03882345  0.0346642
  -0.02840316 -0.03403239 -0.01009876 -0.07093532 -0.00752224 -0.03060159
  -0.03592576  0.01303794  0.05862886  0.0063034  -0.02272478  0.06184517
   0.03744913 -0.00751562  0.00954572  0.11016147  0.02429915 -0.00077278
   0.01034335  0.01482726 -0.06822246 -0.05939544  0.02190411  0.03945052
  -0.04884628 -0.00552307  0.05113245  0.0342414   0.02333902 -0.01315226
   0.01515451 -0.03625035 -0.06819116 -0.05765338 -0.05160503  0.04113428
  -0.0154777   0.01416995 -0.07035542  0.06165114 -0.07420117  0.09438286
   0.00774255 -0.04791952  0.09487133  0.06020354 -0.02937498  0.05366033
  -0.05098031 -0.06611858 -0.05001807  0.00823597 -0.03199609 -0.01229879
  

In [11]:
# Fine-tune / export / restore round trip with ALBERTEncoder; the encoder is
# exported to and restored from ./finetune_ab.
test_goldenretriever(encoder=ALBERTEncoder, save_dir="./finetune_ab", kbs=kbs,
                     query_string=query_string, kb_name="nrf_virement")

Initializing tokenizer and optimizer
model initiated!
cost_mean_batch 0.009000003
Initializing tokenizer and optimizer
model initiated!
2020-05-14 20:42:28.124809 : kb loaded - nrf_virement 
encoded_text tf.Tensor(
[[-0.13788912  0.17447492 -0.893247   ...  0.65187836 -0.10483229
  -0.09538179]
 [-0.13961184  0.17658634 -0.89331156 ...  0.6524843  -0.10137887
  -0.09767119]
 [ 0.46738753 -0.5955453   0.626259   ... -0.1343245  -0.9994972
   0.6047143 ]
 ...
 [-0.12754634  0.17121354 -0.9021557  ...  0.6586172  -0.14886823
  -0.09163377]
 [-0.21566625  0.0611416   0.3675295  ... -0.18863773 -0.99876195
   0.04262691]
 [ 0.4591443  -0.4570454   0.67921865 ... -0.40107793 -0.99924856
   0.5520427 ]], shape=(20, 768), dtype=float32)
encoded_text_2 tf.Tensor(
[[-0.13788912  0.17447492 -0.893247   ...  0.65187836 -0.10483229
  -0.09538179]
 [-0.13961184  0.17658634 -0.89331156 ...  0.6524843  -0.10137887
  -0.09767119]
 [ 0.46738753 -0.5955453   0.626259   ... -0.1343245  -0.9994972
   0.604

In [9]:
# Fine-tune / export / restore round trip with BERTEncoder; the encoder is
# exported to and restored from ./finetune_b.
# NOTE(review): this cell's execution count (In [9]) is lower than the cells
# above it, so the notebook was run out of order -- re-run top to bottom.
test_goldenretriever(encoder=BERTEncoder, save_dir="./finetune_b", kbs=kbs,
                     query_string=query_string, kb_name="nrf_virement")

INFO:absl:Using C:\Users\Kenneth\AppData\Local\Temp\tfhub_modules to cache modules.


Downloaded model from Hub, initializing tokenizer and optimizer
model initiated!
















cost_mean_batch 0.009000003




Instructions for updating:
If using Keras pass *_constraint arguments to layers.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


INFO:tensorflow:Assets written to: ./finetune_b\assets


INFO:tensorflow:Assets written to: ./finetune_b\assets


Downloaded model from Hub, initializing tokenizer and optimizer
model initiated!
model initiated!
2020-05-14 20:12:58.293952 : kb loaded - nrf_virement 
encoded_text tf.Tensor(
[[-0.9291435  -0.27677286 -0.4453595   0.7965723   0.12668778 -0.14410233
   0.9025326   0.31802318 -0.34721842 -0.9999503  -0.01584115  0.8066995
   0.98737043  0.15799586  0.95072967 -0.6912116  -0.31104967 -0.65116733
   0.22260383 -0.7609941   0.63196707  0.99704844  0.33729225  0.23478359
   0.42206326  0.8977543  -0.7308328   0.950486    0.9629009   0.7482801
  -0.72387433  0.13606414 -0.9907128  -0.10589422 -0.48667514 -0.9899922
   0.26835898 -0.7442552   0.07682539  0.1357647  -0.8997477   0.1150761
   0.99973905 -0.4107975   0.12698889 -0.28491598 -0.9999993   0.26945782
  -0.9156161   0.56834775  0.44929972  0.18698466  0.10193743  0.41794595
   0.41930962  0.142543   -0.0940474   0.06745354 -0.18820778 -0.543353
  -0.59150755  0.4353783  -0.5030982  -0.89181125  0.53510845  0.2544465
   0.01428961 -0