### I will be using this jupyter notebook to run my local changes on the source code

In [50]:
# Load raw CSV -> Convert them to pickle form with {"question_to_label":[], "answer_to_label": []}
# Save pickle form in format of <domain>_orignal.pkl where an example of <domain> is comcare/babybonus

In [1]:
"""
Obtain the lists of questions and answers from any format, particularly from the pickle file
"""
import pickle

def save_dict(obj, path):
    # Save in .pickle format
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_dict(path):
    # Load .pickle format as dictionary
    with open(path, 'rb') as f:
        return pickle.load(f)

def extractQA(orignalDct):
    # Return list of questions and answers in list format
    # originalDct format: {"question_to_label": {<question 1>: <label that maps to answer>,...}, 
    #                      "answer_to_label": {<answer question 1>: <label for questions to be mapped to>}}
    # Handles 1-1 mapping of question-answer OR many-1 mapping of question-answer
    q2L = orignalDct["question_to_label"]
    a2L = orignalDct["answer_to_label"]
    

    l2A = dict()
    for a,l in a2L.items():
        l2A[l] = a


    questions = []
    answers = []

    for q,l in q2L.items():
        questions.append(q)
        answers.append(l2A[l])
    return questions , answers

def visualize_question_answer(questions, answers, show_pairs = 5):
    count = 1
    for question, answer in zip(questions, answers):
        print(f"Question {count}: {question}\n\n'{answer}''")
        print("="*117)
        count += 1
        if count > show_pairs:
            break

In [2]:
"""
Convert parallel lists of questions and answers to CSVs for dialogflow or Bani
"""
import csv

def convert_pickle_to_csv(questions, answers, file_name = "new.csv", is_bani = True):
    # Take in lists of question and answer where question-answer pair have the same index in both lists
    # Read each question-answer pair into csv
    with open(file_name, mode='w') as new_csv:
        csv_writer = csv.writer(new_csv, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        count = 0
        for question, answer in zip(questions, answers):
            if is_bani:
                csv.writer.writerow([count, question, answer])
                count += 1
            else:
                # Format for dialogflow
                csv_writer.writerow([question, answer])

In [1]:
from Bani.Bani import Bani
from Bani.core.FAQ import FAQ
from Bani.core.defaults import defaultGenerateManager

def get_faq(dict_path, name, generator):
    """
    @param dict_path: str -> relative path to .pkl file with keys "question_to_label" and "answer_to_label"
    @param name: str -> Name of FAQ
    @param generator: GenerateManager -> Generator pipeline for augmenting questions. Set to none if not augmenting
    @output FAQ object
    """
    original_dict = load_dict(dict_path)
    questions, answers = extractQA(original_dict)
    
    resultFAQ = FAQ(name = name, questions = questions, answers = answers)
    if generator:
        resultFAQ.buildFAQ(generator = generator)
    else:
        resultFAQ.buildFAQ(generator = None)
    return resultFAQ

babyBonusFAQ = get_faq(dict_path = "./tutorialFAQs/baby_bonus_orignal.pkl", name = "babyBonus", generator = None)
comcareFAQ = get_faq(dict_path = "./tutorialFAQs/comcare_orignal.pkl", name = "comcare", generator = None)

"""
@param modelPath = None -> Download a pretrained model of the SentenceTransformers
@param assignVectors = True -> Assign vectors accordingly. 
@param FAQs: List of FAQ object
"""
bot = Bani(modelPath = None , FAQs =[babyBonusFAQ, comcareFAQ], assignVectors = True)

# bot.train("./checkpoints/dummy", lossName = "softmaxLayerLos1s", batchSize = 8, warmst)

# TODO: Convert FAQ object back to CSV

# Optional: Save FAQ
# babyBonusFAQ.save("./jamesFAQs/no_generate")
# comcareFAQ.save("./jamesFAQs/no_generate")

# Optional: Load FAQ
# babyBonusFAQ = FAQ("babyBonus")
# babyBonusFAQ.load("./jamesFAQs/no_generate")

# comcareFAQ = FAQ("comcare")
# comcareFAQ.load("./faqStore/no_generate")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'load_dict' is not defined

In [4]:
# name must correspond to the .pkl file name. i.e. name = babyBonus, .pkl file = babyBonus.pkl
testFAQ = FAQ(name = "babyBonus")
testFAQ.load("./jamesFAQs/no_generate")

name = testFAQ.name
questions = testFAQ.questions
answers = testFAQ.answers
l2Q = testFAQ.l2Q
l2A = testFAQ.l2A
FAQ_units = testFAQ.FAQ

In [5]:
FAQ_units[0].orignal.__str__()

'how can my organisation apply to be a baby bonus approved institution? 0'

## Manual Inspection of Augmented Questions for their Semantic Integrity/Accuracy

In [99]:
"""
Inspecting defaultGenerateManager's generators and their corresponding producer

From the output, you will notice:
1. Different producers have their own properties and methods
2. nlpAug generator's producer have no "name" property
"""
from Bani.core.defaults import defaultGenerateManager

for i in range(len(defaultGenerateManager.generators)):
    generator = defaultGenerateManager.generators[i]
    producer = generator.producer
    print(f"Generator Name: {generator.name}")
    print(f"Picking {generator.num} questions")
    if "name" in dir(producer):
        print(f"Producer Name: {producer.name}")
    else:
        print("*No known producer name*")
    print(f"Producer's dir:\n {dir(producer)[27:]}")
    print("="*115 + "\n")
    



Generator Name: SymSub
Picking 3 questions
Producer Name: Sense-disambiguated Synonym Substitution
Producer's dir:
 ['_get_synonyms', 'batch_generate', 'discount_factor', 'encoder', 'generate', 'name', 'threshold']

Generator Name: FPM
Picking 3 questions
Producer Name: Fuzzy Question Pattern Matching
Producer's dir:
 ['_format_output', '_generate_patterns', '_has_multiple_question', '_preprocess', '_split_question', 'batch_generate', 'generate', 'matcher', 'name', 'patterns']

Generator Name: EDA
Picking 3 questions
Producer Name: Easy Data Augmentation Techniques
Producer's dir:
 ['_get_only_chars', '_get_synonyms', '_random_deletion', '_random_insertion', '_random_swap', '_swap_word', '_synonym_replacement', 'alpha_ri', 'alpha_rs', 'alpha_sr', 'batch_generate', 'generate', 'name', 'num_aug', 'p_rd']

Generator Name: nlpAug
Picking 2 questions
*No known producer name*
Producer's dir:
 ['augs', 'exact_batch_generate']



In [142]:
"""
Pass in original questions, and generate their corresponding augmented questions for inspection in subsequent cells
You may pass in only selected number of original questions e.g. questions[:3] to choose 3 questions
"""
from Bani.core.defaults import defaultGenerateManager
from collections import defaultdict

original_dict = load_dict("./tutorialFAQs/baby_bonus_orignal.pkl")
questions, answers = extractQA(original_dict)

symsub_producer = defaultGenerateManager.generators[0].producer
fpm_producer = defaultGenerateManager.generators[1].producer
eda_producer = defaultGenerateManager.generators[2].producer
nlpAug_producer = defaultGenerateManager.generators[3].producer

result_symsub = defaultdict(list)
result_fpm = defaultdict(list)
result_eda = defaultdict(list)
result_nlpAug = defaultdict(list)
for question in questions[:3]:
    result_symsub[question] = symsub_producer.generate(question)
    result_fpm[question] = fpm_producer.generate(question)
    result_eda[question] = eda_producer.generate(question)    
    result_nlpAug[question] = nlpAug_producer.exact_batch_generate(question,3)    

100%|██████████| 70/70 [00:06<00:00, 10.96it/s]
100%|██████████| 215/215 [00:19<00:00, 11.04it/s]
100%|██████████| 191/191 [00:16<00:00, 11.58it/s]


In [173]:
print(f"Original question: \n{questions[1]}")

Original question: 
I have entered the Unique Entity Number (UEN) using 'Join as an Approved Institution (AI)' service, but your system does not have matching records of my Unique Entity Number (UEN). Can I still submit my application?


In [174]:
len(result_symsub[questions[1]])

575

In [179]:
len(result_fpm[questions[1]])

0

In [180]:
result_eda[questions[1]]

['i have entered the unique entity number uen using join as an ok foundation ai service but your system does not have matching register of my unique entity number uen can i still submit my application',
 'entered the unique entity number uen join as an approved institution ai service but your system does not have matching records of my unique entity number uen can i still submit my application',
 'i three toed sloth have entered the unique entity number uen using join alone as an approved institution ai service but your associate in nursing system does not have matching records of my unique entity number uen can i still submit my application',
 'i have entered the okay unique okeh entity number uen using robert william service join as an approved institution ai service but your system does not have matching records of my unique entity number uen can i still submit my application',
 'not have entered the unique entity number uen using my as service approved institution ai an but your sy

In [154]:
result_nlpAug[questions[0]].keys()

dict_keys(['H', 'o', 'w', ' ', 'c', 'a', 'n', 'm', 'y', 'r', 'g', 'i', 's', 't', 'p', 'l', 'b', 'e', 'B', 'u', 'A', 'v', 'd', 'I', '?'])

In [187]:
QUESTION_NUM = 2
print(len(result_symsub[questions[QUESTION_NUM]]))
print(len(result_fpm[questions[QUESTION_NUM]]))
print(len(result_eda[questions[QUESTION_NUM]]))
print(len(result_nlpAug[questions[QUESTION_NUM]]))                     

863
12
10
33


In [None]:
# name must correspond to the .pkl file name. i.e. name = babyBonus, .pkl file = babyBonus.pkl
babyBonusBani = FAQ(name = "babyBonus")
babyBonusBani.load("./faqStore")

babyBonus_name = babyBonusBani.name
babyBonus_questions = babyBonusBani.questions
babyBonus_answers = babyBonusBani.answers
babyBonus_l2Q = babyBonusBani.l2Q
babyBonus_l2A = babyBonusBani.l2A
babyBonus_FAQ_units = babyBonusBani.FAQ


In [None]:
print(babyBonus_FAQ_units[27].orignal.__str__())
print(babyBonus_FAQ_units[27].label)
print(dir(babyBonus_FAQ_units[0]))

In [68]:
import pandas as pd
df = pd.read_csv("./tutorialFAQs/babybonusTest.csv")
testData = []

for i in range(len(df)):
    orignal = df.loc[i,"original"]
    reframed = df.loc[i,"reframed"]
    
    testData.append((orignal,reframed))

In [66]:
visualize_question_answer(df["original"].tolist(), df["reframed"].tolist())

Question 1: How can my organisation apply to be a Baby Bonus Approved Institution?


'Apply to be a Baby Bonus Approved Institution?''
Question 2: I have entered the Unique Entity Number (UEN) using 'Join as an Approved Institution (AI)' service, but your system does not have matching records of my Unique Entity Number (UEN). Can I still submit my application?


'I have entered a UEN using the 'Join an AI' service, but my system does not have a matching record of my UEN. Can I still submit my application?''
Question 3: I have entered the Unique Entity Number (UEN) using the 'Join as Approved Institution (AI)' service, but your system indicated that my Unique Entity Number (UEN) is invalid, what should I do?


'My UEN is showing as invalid. What shoud I do?''
Question 4: Is there a validity period to be a Baby Bonus Approved Institution?


'Validity period to be a Baby Bonus AI?
''
Question 5: How much does an organisation need to pay to register as an Approved Institution (AI) with Min