Requirements:
* Relational Database (MySQL)
* Wikipedia on NoSQL DB (MongoDB)

Outputs:
* Topics table
* Hypothetical terms table
* Valid terms table

In [1]:
# import external libraries
import os
import sys
import json
import pandas as pd
import re

In [None]:
# Resolve project directories and make the project root importable so the
# local modules imported right after this block can be found.
# NOTE: '__file__' is a quoted string literal here, not the module attribute,
# so abspath() resolves it against the current working directory and
# current_dir ends up being the directory the notebook is run from.
current_dir = os.path.dirname(os.path.abspath('__file__'))
# One level above the working directory.
parent_dir = os.path.dirname(current_dir)
# Data files read by the cells below live under <parent_dir>/data.
data_dir = os.path.join(parent_dir, 'data')

# Put the parent directory on the import path.
sys.path.append(os.path.join(parent_dir))

from src.sqldb import HallucinationDb
from wiki.search import WikiSearcher
import settings

In [4]:
# Database handle built from the settings module; the cells below use it for
# table definitions (db.GetTableDefinition) and to run statements (db.sql.execute).
db = HallucinationDb(settings)

#### Saving Topics

In [6]:
# Load the topic strings and store each one as a row of the topic table.
# Every entry is expected to have the form "<name>:<explanation>".
# The file is opened in a context manager so the handle is closed right after
# parsing, and read as UTF-8, the encoding JSON documents are specified to use.
with open(os.path.join(data_dir, 'intermediate', 'topics.json'), encoding="utf-8") as topics_file:
    topics = json.load(topics_file)
topic_table = db.GetTableDefinition(db.TOPIC_TABLE)

for topic in topics:
    # Split on the first colon only, so colons inside the explanation text
    # are kept instead of silently truncating it.
    string_list = topic.split(":", 1)
    name = string_list[0]
    # Raises IndexError when an entry has no colon at all (malformed input).
    explanation = string_list[1]
    insert_statement = topic_table.insert().values(name=name, explanation=explanation)
    db.sql.execute(insert_statement)

#### Saving Hypothetical Terms

In [7]:
# Load the hypothetical terms produced by an earlier step.
# Despite the .csv extension, the file is parsed as tab-separated.
hypothetical_terms_path = os.path.join(data_dir, "intermediate", "related_terms.csv")
hypothetical_terms_df = pd.read_csv(hypothetical_terms_path, delimiter="\t")

In [8]:
# Insert every hypothetical term, tagging it with a topic id.
# The id is derived positionally: it is bumped each time the "topic" column
# changes from one row to the next.
# NOTE(review): this presumably relies on the rows being grouped by topic in
# the same order the topics were inserted, with topic ids starting at 1 -
# confirm against the topic table.
nonexistent_table = db.GetTableDefinition(db.NONEXISTENT_TABLE)

topic_name, topic_id = "", 0

for _idx, row in hypothetical_terms_df.iterrows():
    row_topic = row["topic"]
    if row_topic != topic_name:
        # A new topic starts here: remember it and move on to the next id.
        topic_name, topic_id = row_topic, topic_id + 1

    insert_statement = nonexistent_table.insert().values(
        term=row["term"],
        explanation=row["explanation"],
        topic_id=topic_id,
    )
    db.sql.execute(insert_statement)

#### Saving LLM Suggested Valid Terms

In [10]:
# Searcher used below to fetch a definition for each suggested term.
wiki_searcher = WikiSearcher()

In [17]:
# Table of real (valid) terms; rows carry term, explanation and source_id.
real_terms_table = db.GetTableDefinition(db.REAL_TERMS_TABLE)
# Link table pairing a nonexistent term (nonexistent_id) with a real term (real_id).
nonexistent_real_table = db.GetTableDefinition(db.NONEXISTENT_REAL_TABLE)

In [19]:
def str2list(list_str):
    """Parse a textual list of terms into a list of term strings.

    Three input shapes are handled:

    * a bracketed, comma-separated list of quoted items,
      e.g. ``"['alpha', 'beta']"``;
    * a bracketed list holding a single quoted item, e.g. ``"['alpha']"``;
    * one term per line, optionally numbered, e.g. ``"1. alpha\\n2. beta"``.

    Args:
        list_str: The raw text to parse. Anything that is not a string
            (for example ``None`` or a NaN float) is treated as "no terms".

    Returns:
        The extracted terms, with empty entries removed.
    """
    # Non-string input has nothing to parse.
    if not isinstance(list_str, str):
        return []

    # Presumably repairs items that were cut off with an ellipsis instead of
    # a closing quote - TODO confirm against the generated data.
    list_str = list_str.replace("... ", "' ")

    stripped = list_str.strip()
    if "," in list_str:
        # Drop the surrounding brackets, split on commas, then drop each
        # item's surrounding quotes.
        terms_list = [term.strip()[1:-1] for term in list_str[1:-1].split(",")]
    elif stripped.startswith("[") and stripped.endswith("]"):
        # A bracketed list with a single item contains no comma; without this
        # branch it would be returned with brackets and quotes still attached.
        terms_list = [stripped[1:-1].strip()[1:-1]]
    else:
        # One term per line; digits and dots are removed to strip numbering.
        # NOTE(review): this also removes digits/dots inside a term itself.
        terms_list = [re.sub(r"[0-9.]", "", term).strip() for term in list_str.split("\n")]

    # Blank lines, "[]" and trailing commas produce empty entries; skip them.
    return [term for term in terms_list if term]

def check_term(term_name):
    """Look up a term in the real-terms table by its exact name.

    Args:
        term_name: Value compared against the table's ``term`` column.

    Returns:
        The ``id`` column of the first matching row, or 0 when no row matches.
    """
    lookup = real_terms_table.select().where(real_terms_table.c.term == term_name)
    match = db.sql.execute(lookup).fetchone()
    # 0 is the "not found" marker that callers test for.
    return match[real_terms_table.c.id] if match else 0

In [23]:
# Four independent accumulator lists; nothing in this cell appends to them.
term_list, definition_list, topic_list, parent_term_list = [], [], [], []

# For every hypothetical term, resolve its suggested related terms against the
# wiki searcher, store the ones that have a usable definition, and link each
# of them to the hypothetical term.
for index, row in hypothetical_terms_df.iterrows():
    related_terms = str2list(row["related_term"])

    for term in related_terms:
        definition = wiki_searcher.get_definition(term)
        # Skip terms for which the searcher reports no or an ambiguous definition.
        if definition in ("None", "ambiguous"):
            continue

        # Reuse the stored term when it already exists; otherwise insert it.
        term_id = check_term(term)
        if term_id == 0:
            new_term = real_terms_table.insert().values(
                term=term,
                explanation=definition,
                source_id=1,
            )
            term_insert_result = db.sql.execute(new_term)
            term_id = term_insert_result.inserted_primary_key[0]

        # NOTE(review): index + 1 presumably mirrors the id of the matching
        # row in the nonexistent-terms table (ids starting at 1, inserted in
        # DataFrame order) - confirm.
        link = nonexistent_real_table.insert().values(
            nonexistent_id=index + 1,
            real_id=term_id,
        )
        db.sql.execute(link)