In [4]:
import pickle

def save_dict(obj, path):
    """Serialize `obj` to the file at `path` in pickle format.

    The file is opened in binary write mode (an existing file is overwritten)
    and the object is written with `pickle.HIGHEST_PROTOCOL`.
    """
    with open(path, mode='wb') as sink:
        # Pickler(file, protocol).dump(obj) is what pickle.dump() does internally.
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(obj)

def load_dict(path):
    """Read the pickle file at `path` and return the object stored in it.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(path, mode='rb') as source:
        # Unpickler(file).load() is what pickle.load() does internally.
        restored = pickle.Unpickler(source).load()
    return restored

def extractQA(orignalDct):
    """Return the questions and their answers as two index-aligned lists.

    Expected layout of `orignalDct`:
        {"question_to_label": {<question>: <label>, ...},
         "answer_to_label":   {<answer>: <label>, ...}}

    Every question is paired with the answer that carries the same label, so
    both 1-1 and many-1 question-to-answer mappings are supported.

    Returns:
        (questions, answers): two lists where answers[i] belongs to questions[i].

    Raises:
        KeyError: if either top-level key is missing, or a question's label
            has no answer registered under it.
    """
    question_labels = orignalDct["question_to_label"]
    answer_labels = orignalDct["answer_to_label"]

    # Invert answer -> label into label -> answer (the last answer wins on a clash).
    answer_for_label = {lbl: ans for ans, lbl in answer_labels.items()}

    questions = list(question_labels.keys())
    answers = [answer_for_label[lbl] for lbl in question_labels.values()]
    return questions, answers

def visualize_question_answer(questions, answers, show_pairs = 5):
    """Print the first `show_pairs` question-answer pairs for a quick visual check.

    Args:
        questions: sequence of questions.
        answers: sequence of answers, index-aligned with `questions`.
        show_pairs: maximum number of pairs to print; zero or a negative
            value prints nothing.
    """
    for count, (question, answer) in enumerate(zip(questions, answers), start=1):
        # Check the limit *before* printing so that show_pairs <= 0 shows no pair at all.
        if count > show_pairs:
            break
        print(f"Question {count}: {question}\n\n'{answer}'")
        print("="*117)

In [5]:
import csv

def convert_pickle_to_csv(questions, answers, file_name = "new.csv", is_bani = True):
    """Write index-aligned question/answer lists to a CSV file.

    Args:
        questions: list of questions.
        answers: list of answers; answers[i] is the answer to questions[i].
        file_name: path of the CSV file to create (overwritten if it exists).
        is_bani: when True each row is `index, question, answer` with a
            0-based running index; when False each row is `question, answer`
            (the format used for Dialogflow).
    """
    # newline='' is required by the csv module so the writer controls line
    # endings itself (otherwise Windows gets an extra '\r' on every row);
    # an explicit encoding keeps the output independent of the platform locale.
    with open(file_name, mode='w', newline='', encoding='utf-8') as new_csv:
        csv_writer = csv.writer(new_csv, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

        for count, (question, answer) in enumerate(zip(questions, answers)):
            if is_bani:
                csv_writer.writerow([count, question, answer])
            else:
                # Format for dialogflow
                csv_writer.writerow([question, answer])

In [None]:
# Read the pickled ComCare FAQ dictionary and unpack it into index-aligned question/answer lists
comcare_faq = load_dict(path="./tutorialFAQs/comcare_orignal.pkl")
comcare_questions, comcare_answers = extractQA(orignalDct=comcare_faq)

# Uncomment to preview the first few pairs:
# visualize_question_answer(comcare_questions, comcare_answers, 5)

# is_bani=False -> rows are written as (question, answer), the Dialogflow layout
convert_pickle_to_csv(
    comcare_questions,
    comcare_answers,
    file_name="comcare_original_dialogflow.csv",
    is_bani=False,
)

In [None]:
# Read the pickled Baby Bonus FAQ dictionary and unpack it into index-aligned question/answer lists
faq = load_dict(path="./tutorialFAQs/baby_bonus_orignal.pkl")
questions, answers = extractQA(orignalDct=faq)

# Uncomment to preview the first few pairs:
# visualize_question_answer(questions, answers, 5)

# is_bani=False -> rows are written as (question, answer), the Dialogflow layout
convert_pickle_to_csv(
    questions,
    answers,
    file_name="baby_bonus_original_dialogflow.csv",
    is_bani=False,
)

In [71]:
"""
Cell for converting babyBonus_pre_input_sbert.csv to babyBonus_input_<NUM_RETURN>_<MODEL_EVALUATOR>>=<LOWER_BOUND>.csv. 
Both csv files should be in the directory defined under `rootDir`.
Update NUM_RETURN and LOWER_BOUND accordingly. 

Input columns read: Label, Question, Answer, Score.
Output columns (no header row): Question, Answer, Original, Label, Score.
"""
import csv

# Update these parameters
NUM_RETURN = 11           # maximum number of paraphrases kept per label
LOWER_BOUND = 4.0         # minimum Score a paraphrase needs in order to be kept
MODEL_EVALUATOR = "sbert"

outputName = f"babyBonus_input_{NUM_RETURN}_{MODEL_EVALUATOR}>={LOWER_BOUND}"
inputName = "babyBonus_pre_input_sbert"
outputPath = os.path.join(rootDir, outputName + ".csv")
inputPath = os.path.join(rootDir, inputName + ".csv")

inputFile = pd.read_csv(inputPath)

# None marks "no label seen yet", so it can never collide with a real label value.
label = None
original_question = ""
original_answer = ""

# seen_questions -> Prevent duplicates within and cross-label (a set keeps lookups O(1))
seen_questions = set()
count = 0

# Write straight to the destination: this avoids the rename step, which fails on
# Windows when the target exists and when the two paths are on different filesystems.
# newline='' is required by the csv module; utf-8 matches pandas' default read encoding.
with open(outputPath, mode='w', newline='', encoding='utf-8') as new_csv:
    csv_writer = csv.writer(new_csv, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for row in inputFile.itertuples(index=False):
        if label is None or row.Label != label:
            # First row of a new label -> this is the original FAQ pair (q, a)
            label = row.Label
            original_question = row.Question
            original_answer = row.Answer
            seen_questions.add(row.Question)
            count = 0

            csv_writer.writerow([original_question, original_answer, original_question, label, row.Score])
        elif (
            count < NUM_RETURN  # strict '<' so that at most NUM_RETURN paraphrases are kept
            and row.Question not in seen_questions
            and float(row.Score) >= LOWER_BOUND
        ):
            # row.Question is a paraphrase of original_question
            seen_questions.add(row.Question)
            csv_writer.writerow([row.Question, original_answer, original_question, row.Label, row.Score])
            count += 1

In [88]:
"""
Cell for converting babyBonus_pre_input_sbert_t5.csv to babyBonus_input_<NUM_RETURN>_<MODEL_EVALUATOR>>=<LOWER_BOUND>.csv. 
Both csv files should be in the directory defined under `rootDir`.
Update NUM_RETURN and LOWER_BOUND accordingly. 

Input columns read: Label, Question, Answer, SBERT_Score, T5_Score.
Output columns (no header row): Question, Answer, Original, Label, SBERT_Score, T5_Score.
"""
import csv

# Update these parameters
NUM_RETURN = 11           # maximum number of paraphrases kept per label
LOWER_BOUND = 3.6         # minimum T5_Score a paraphrase needs in order to be kept
MODEL_EVALUATOR = "t5"

outputName = f"babyBonus_input_{NUM_RETURN}_{MODEL_EVALUATOR}>={LOWER_BOUND}"
inputName = "babyBonus_pre_input_sbert_t5"
outputPath = os.path.join(rootDir, outputName + ".csv")
inputPath = os.path.join(rootDir, inputName + ".csv")

inputFile = pd.read_csv(inputPath)

# None marks "no label seen yet", so it can never collide with a real label value.
label = None
original_question = ""
original_answer = ""

# seen_questions -> Prevent duplicates within and cross-label (a set keeps lookups O(1))
seen_questions = set()
count = 0

# Write straight to the destination: this avoids the rename step, which fails on
# Windows when the target exists and when the two paths are on different filesystems.
# newline='' is required by the csv module; utf-8 matches pandas' default read encoding.
with open(outputPath, mode='w', newline='', encoding='utf-8') as new_csv:
    csv_writer = csv.writer(new_csv, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for row in inputFile.itertuples(index=False):
        if label is None or row.Label != label:
            # First row of a new label -> this is the original FAQ pair (q, a)
            label = row.Label
            original_question = row.Question
            original_answer = row.Answer
            seen_questions.add(row.Question)
            count = 0

            csv_writer.writerow([original_question, original_answer, original_question, label, row.SBERT_Score, row.T5_Score])
        elif (
            count < NUM_RETURN  # strict '<' so that at most NUM_RETURN paraphrases are kept
            and row.Question not in seen_questions
            and float(row.T5_Score) >= LOWER_BOUND
        ):
            # row.Question is a paraphrase of original_question
            seen_questions.add(row.Question)
            csv_writer.writerow([row.Question, original_answer, original_question, row.Label, row.SBERT_Score, row.T5_Score])
            count += 1

In [87]:
""" w/ SORTING for FIRST <NUM_RETURN>
Cell for converting babyBonus_pre_input_sbert.csv to babyBonus_input_<NUM_RETURN>_<MODEL_EVALUATOR>>=<LOWER_BOUND>.csv. 
Both csv files should be in the directory defined under `rootDir`.
Update NUM_RETURN and LOWER_BOUND accordingly. 

For each label the first NUM_RETURN qualifying paraphrases (in file order) are
collected and then written in descending Score order.

Input columns read: Label, Question, Answer, Score.
Output columns (no header row): Question, Answer, Original, Label, Score.
"""
import csv


def _write_sorted_paraphrases(writer, rows):
    """Write `rows` through `writer`, highest score (column index 4) first."""
    # float() keeps the ordering numeric even if the scores were read as text;
    # sorted() is stable, so equal scores keep their original file order.
    for ranked_row in sorted(rows, key=lambda r: float(r[4]), reverse=True):
        writer.writerow(ranked_row)


# Update these parameters
NUM_RETURN = 11           # maximum number of paraphrases kept per label
LOWER_BOUND = 4.0         # minimum Score a paraphrase needs in order to be kept
MODEL_EVALUATOR = "sbert"

outputName = f"babyBonus_input_{NUM_RETURN}_{MODEL_EVALUATOR}>={LOWER_BOUND}_sorted2"
inputName = "babyBonus_pre_input_sbert"
outputPath = os.path.join(rootDir, outputName + ".csv")
inputPath = os.path.join(rootDir, inputName + ".csv")

inputFile = pd.read_csv(inputPath)

# None marks "no label seen yet", so it can never collide with a real label value.
label = None
original_question = ""
original_answer = ""
within_labels = []   # paraphrase rows collected for the label currently being read

# seen_questions -> Prevent duplicates within and cross-label (a set keeps lookups O(1))
seen_questions = set()
count = 0

# Write straight to the destination: this avoids the rename step, which fails on
# Windows when the target exists and when the two paths are on different filesystems.
# newline='' is required by the csv module; utf-8 matches pandas' default read encoding.
with open(outputPath, mode='w', newline='', encoding='utf-8') as new_csv:
    csv_writer = csv.writer(new_csv, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for row in inputFile.itertuples(index=False):
        if label is None or row.Label != label:
            # A new label starts: first emit the paraphrases gathered for the previous one
            _write_sorted_paraphrases(csv_writer, within_labels)
            within_labels = []

            # First row of a new label -> this is the original FAQ pair (q, a)
            label = row.Label
            original_question = row.Question
            original_answer = row.Answer
            seen_questions.add(row.Question)
            count = 0

            csv_writer.writerow([original_question, original_answer, original_question, label, row.Score])
        elif (
            count < NUM_RETURN  # strict '<' so that at most NUM_RETURN paraphrases are kept
            and row.Question not in seen_questions
            and float(row.Score) >= LOWER_BOUND
        ):
            # row.Question is a paraphrase of original_question
            seen_questions.add(row.Question)
            count += 1
            within_labels.append([row.Question, original_answer, original_question, row.Label, row.Score])

    # For the last label (nothing follows it to trigger the flush inside the loop)
    _write_sorted_paraphrases(csv_writer, within_labels)
    within_labels = []

In [89]:
""" w/ SORTING to get TOP <NUM_RETURN>
Cell for converting babyBonus_pre_input_sbert.csv to babyBonus_input_<NUM_RETURN>_<MODEL_EVALUATOR>>=<LOWER_BOUND>.csv. 
Both csv files should be in the directory defined under `rootDir`.
Update NUM_RETURN and LOWER_BOUND accordingly. 

For each label every qualifying paraphrase is collected, the collection is
sorted by Score in descending order and only the top NUM_RETURN are written.

Input columns read: Label, Question, Answer, Score.
Output columns (no header row): Question, Answer, Original, Label, Score.
"""
import csv


def _write_top_paraphrases(writer, rows, limit):
    """Write the `limit` highest-scoring `rows` (score = column index 4) through `writer`."""
    # float() keeps the ordering numeric even if the scores were read as text;
    # sorted() is stable, so equal scores keep their original file order.
    ranked = sorted(rows, key=lambda r: float(r[4]), reverse=True)
    # max(..., 0) so that a negative limit writes nothing instead of slicing from the end.
    for ranked_row in ranked[:max(limit, 0)]:
        writer.writerow(ranked_row)


# Update these parameters
NUM_RETURN = 11           # maximum number of paraphrases written per label
LOWER_BOUND = 4.0         # minimum Score a paraphrase needs in order to be kept
MODEL_EVALUATOR = "sbert"

outputName = f"babyBonus_input_{NUM_RETURN}_{MODEL_EVALUATOR}>={LOWER_BOUND}_sorted3"
inputName = "babyBonus_pre_input_sbert"
outputPath = os.path.join(rootDir, outputName + ".csv")
inputPath = os.path.join(rootDir, inputName + ".csv")

inputFile = pd.read_csv(inputPath)

# None marks "no label seen yet", so it can never collide with a real label value.
label = None
original_question = ""
original_answer = ""
within_labels = []   # paraphrase rows collected for the label currently being read

# seen_questions -> Prevent duplicates within and cross-label (a set keeps lookups O(1))
seen_questions = set()

# Write straight to the destination: this avoids the rename step, which fails on
# Windows when the target exists and when the two paths are on different filesystems.
# newline='' is required by the csv module; utf-8 matches pandas' default read encoding.
with open(outputPath, mode='w', newline='', encoding='utf-8') as new_csv:
    csv_writer = csv.writer(new_csv, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)

    for row in inputFile.itertuples(index=False):
        if label is None or row.Label != label:
            # A new label starts: first emit the best paraphrases of the previous one
            _write_top_paraphrases(csv_writer, within_labels, NUM_RETURN)
            within_labels = []

            # First row of a new label -> this is the original FAQ pair (q, a)
            label = row.Label
            original_question = row.Question
            original_answer = row.Answer
            seen_questions.add(row.Question)

            csv_writer.writerow([original_question, original_answer, original_question, label, row.Score])
        elif row.Question not in seen_questions and float(row.Score) >= LOWER_BOUND:
            # row.Question is a paraphrase of original_question
            seen_questions.add(row.Question)
            within_labels.append([row.Question, original_answer, original_question, row.Label, row.Score])

    # For the last label (nothing follows it to trigger the flush inside the loop)
    _write_top_paraphrases(csv_writer, within_labels, NUM_RETURN)
    within_labels = []