Requirements:
* Wikipedia on NoSql DB
* SQL DB
* OpenAI API key

Outputs:
* related_terms.csv
* llm_suggestion_terms.csv

In [7]:
# import external libraries
import sys
import os
import json
import pandas as pd
import re
import time
import openai

In [3]:
# import local modules
current_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(current_dir)
data_dir = os.path.join(parent_dir, 'data')

sys.path.append(os.path.join(parent_dir))

from wiki.search import WikiSearcher
from src.searchUnexistent import Searcher
from src.utilities import get_strtime
from prompts import templates

#### Generate real terms

In [8]:
def try_gptapi_call(messages, temperature=0, model="gpt-3.5-turbo"):
    for i in range(3):
        try:
            return openai.ChatCompletion.create(
                model=model,
                temperature=temperature,
                messages=messages)
        except Exception as exc:
            print(f"Exception: {exc}")
            time.sleep(60)
            continue

topics = json.load(open(os.path.join(data_dir, 'intermediate', 'topics.json')))

def get_topic_with_explanation(topic_title):
    return[topic for topic in topics if topic_title in topic ][0]

In [4]:
term_candidates_path = os.path.join(data_dir, "intermediate", "hypothetical_term_candidates.csv")
term_candidates_df = pd.read_csv(term_candidates_path)
hypothetical_terms_df = term_candidates_df[term_candidates_df["exists"] == False]

In [5]:
related_terms_list = []

In [None]:
for index, row in hypothetical_terms_df.iterrows():
    system_prompt = templates.related_wiki_term_generator_system
    user_prompt = templates.related_term_generator_user.format(
        topic=get_topic_with_explanation(row["topic"]), 
        madeup_term=f"""{row["term"]}: {row["explanation"]}""")
    
    messages = [{"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}]
    
    response = try_gptapi_call(messages, temperature=0)
    related_terms = response['choices'][0]['message']['content']

    related_terms_list.append(related_terms)

In [None]:
hypothetical_terms_df["related_term"] = related_terms_list
hypothetical_terms_df.to_csv(f"related_terms_{get_strtime()}.csv", index=False, sep="\t")

#### Validate Generated Terms

In [13]:
hypothetical_terms_path = os.path.join(data_dir, "intermediate", "related_terms.csv")
hypothetical_terms_df = pd.read_csv(hypothetical_terms_path, delimiter="\t")

In [14]:
wiki_searcher = WikiSearcher()

In [15]:
def str2list(list_str):
    list_str = list_str.replace("... ", "' ")
    if  "," in list_str:
        terms_list=[term.strip()[1:-1] for term in list_str[1:-1].split(",")]
    else:
        terms_list = [re.sub(r'[0-9.]', "", term).strip() for term in list_str.split("\n")]
    return terms_list

In [16]:
related_terms_dict = {}
wiki_terms = []
nonwiki_terms = []
ambiguous_terms = []

In [18]:
for index, row in hypothetical_terms_df.iterrows():
    related_terms = str2list(row["related_term"])
    for term in related_terms:
        if term in related_terms_dict:
            continue
        definition = wiki_searcher.get_definition(term)
        if definition == "None":
            nonwiki_terms.append(term)
        elif definition == "ambiguous":
            ambiguous_terms.append(term)
        else:
            wiki_terms.append(term)
        related_terms_dict[term] = definition

In [19]:
print(f"related_terms_dict length {len(related_terms_dict)}")
print(f"wiki_terms length {len(wiki_terms)}")
print(f"nonwiki_terms length {len(nonwiki_terms)}")
print(f"ambiguous_terms length {len(ambiguous_terms)}")

related_terms_dict length 14271
wiki_terms length 5914
nonwiki_terms length 7750
ambiguous_terms length 607


#### Save Validated Terms to SQL

In [20]:
from src.sqldb import HallucinationDb
import settings

In [21]:
hallucinationDb = HallucinationDb(settings)

In [None]:
real_terms_table = hallucinationDb.GetTableDefinition(hallucinationDb.REAL_TERMS_TABLE)
nonexistent_real_table = hallucinationDb.GetTableDefinition(hallucinationDb.NONEXISTENT_REAL_TABLE)

In [None]:
def check_term(term_name):
    result_row = hallucinationDb.sql.execute(real_terms_table.select().where(real_terms_table.c.term == term_name)).fetchone()
    if result_row:
        return result_row[real_terms_table.c.id]
    else:
        return 0

In [None]:
for index, row in hypothetical_terms_df.iterrows():
    related_terms = str2list(row["related_term"])

    for term in related_terms:
        definition = related_terms_dict[term]
        if definition not in ["None", "ambiguous"]:
            term_id = check_term(term)
            if term_id == 0:
                term_insert_result = hallucinationDb.sql.execute(
                    real_terms_table.insert().values(
                        term=term,
                        explanation=definition,
                        source_id=1
                ))
                term_id = term_insert_result.inserted_primary_key[0]
            hallucinationDb.sql.execute(nonexistent_real_table.insert().values(
                nonexistent_id = index+1,
                real_id = term_id
            ))

#### Create dataframe for related terms

In [None]:
term_list = []
definition_list = []
topic_list = []
parent_term_list = []

In [None]:
for index, row in hypothetical_terms_df.iterrows():
    related_terms = str2list(row["related_term"])

    for term in related_terms:
        definition = related_terms_dict[term]
        if definition not in ["None", "ambiguous"]:
            term_list.append(term)
            definition_list.append(definition)
            topic_list.append(row["topic"])
            parent_term_list.append(row["term"])

In [None]:
llm_suggestion_df = pd.DataFrame(list(zip(term_list, definition_list, topic_list, parent_term_list)), 
             columns=['term', 'explanation', 'topic', 'parent_term'])

In [None]:
llm_suggestion_df.to_csv(f"llm_suggestion_terms_{get_strtime()}.csv", index=False, sep="\t")