Requirements:
* Vector Database (Milvus)
* Relational Database (MySQL)

Outputs:
* Valid terms from title similarity
* Valid terms from definition similarity

In [1]:
# import external libraries
import os
import sys
import pandas as pd
from tqdm import tqdm

In [None]:
# import local modules
# Resolve the project layout relative to the working directory.
# The literal string '__file__' (not the variable) is used as a placeholder
# file name: abspath() anchors it in the current working directory, so
# dirname() of the result is that working directory itself. Presumably this is
# because `__file__` is not defined when the code runs as a notebook.
current_dir = os.path.dirname(os.path.abspath('__file__'))
parent_dir = os.path.dirname(current_dir)

# Make packages that live in the parent directory importable.
sys.path.append(parent_dir)

data_dir = os.path.join(parent_dir, 'data')

import settings
from wiki.embeddings import Embeddings
from wiki.search import WikiSearcher
from src.sqldb import HallucinationDb

In [None]:
# Instantiate the project helpers shared by the cells below. What each
# constructor connects to is defined in the local modules, not here —
# presumably the vector database for Embeddings and the relational database
# for HallucinationDb; confirm against `settings`.
# wiki_embeddings: used for vector_search() and get_collection().
wiki_embeddings = Embeddings(settings)
# db: used for sql.execute() and GetTableDefinition().
db = HallucinationDb(settings)
# wiki_searcher: used for get_definition() and get_definition_by_id().
wiki_searcher = WikiSearcher()

In [4]:
# Load the hypothetical terms. Despite the .csv extension the file is read as
# tab-separated.
hypothetical_terms_path = os.path.join(data_dir, "intermediate", "related_terms.csv")
hypothetical_terms_df = pd.read_csv(hypothetical_terms_path, sep="\t")

In [5]:
# Table handles used to build the select/insert statements in the cells below.
# real_terms_table: the code here reads its `id` and `term` columns and writes
# `term`, `explanation` and `source_id`.
real_terms_table = db.GetTableDefinition(db.REAL_TERMS_TABLE)
# nonexistent_real_table: link rows written as (nonexistent_id, real_id).
nonexistent_real_table = db.GetTableDefinition(db.NONEXISTENT_REAL_TABLE)

In [6]:
def check_term(term_name):
    """Look up a term in the real-terms table by name.

    Args:
        term_name: Value compared for equality against the ``term`` column.

    Returns:
        The ``id`` column of the first row returned by the query, or ``0``
        when the query returns no row.
    """
    lookup = real_terms_table.select().where(real_terms_table.c.term == term_name)
    match = db.sql.execute(lookup).fetchone()
    if not match:
        return 0
    return match[real_terms_table.c.id]

#### Title similarity

In [15]:
# Vector collection queried by the title-similarity loop below.
collection_name: str = "wiki_title"
# Positional offset into hypothetical_terms_df; rows before it are skipped.
start_from: int = 0

In [None]:
# For every hypothetical term, search the title collection for similar wiki
# titles, make sure each usable title exists in the real-terms table, and link
# it to the hypothetical term.
remaining_terms = hypothetical_terms_df.iloc[start_from:]
for term_index, term_row in remaining_terms.iterrows():
    print("Index: ", term_index)
    fake_term = term_row["term"]
    search_results = wiki_embeddings.vector_search(collection_name, fake_term)

    # The search result is iterated as groups of hits.
    for hit_group in search_results:
        for hit in hit_group:
            real_term_title = hit.entity.get('title')
            real_term_definition = wiki_searcher.get_definition(real_term_title)

            # Skip titles whose definition lookup returned one of the sentinel
            # strings. NOTE(review): assumes get_definition signals "no usable
            # definition" with these exact strings — confirm.
            if real_term_definition in ("None", "ambiguous"):
                continue

            term_id = check_term(real_term_title)
            if term_id == 0:
                # Title not stored yet: insert it and use the generated key.
                # source_id=2 presumably tags title-similarity matches — confirm.
                new_term = real_terms_table.insert().values(
                    term=real_term_title,
                    explanation=real_term_definition,
                    source_id=2,
                )
                term_id = db.sql.execute(new_term).inserted_primary_key[0]

            # NOTE(review): index + 1 assumes the DataFrame index is 0-based and
            # lines up with 1-based ids of the hypothetical terms — confirm.
            link = nonexistent_real_table.insert().values(
                nonexistent_id=term_index + 1,
                real_id=term_id,
            )
            db.sql.execute(link)

#### Text similarity

In [13]:
# Vector collection queried by the text-similarity loop below.
collection_name: str = "wiki_text"
# Positional offset into hypothetical_terms_df; rows before it are skipped.
start_from: int = 0
# Number of partitions passed to each vector_search call.
partition_batch: int = 3

In [11]:
# Table handle for the similar-description rows; the loop below writes
# (wiki_id, title, text) into it.
similar_description_table = db.GetTableDefinition(db.SIMILAR_DESCRIPTION_TABLE)

In [16]:
# Names of every partition of the collection except the one named "_default".
partition_list = [
    part.name
    for part in wiki_embeddings.get_collection(collection_name).partitions
    if part.name != "_default"
]

In [None]:
# For every hypothetical term, search the text collection with the term's
# explanation, keep the 10 hits with the smallest distance, and store them.
for f_index, f_row in hypothetical_terms_df.iloc[start_from:].iterrows():
    fake_term = f_row["term"]
    fake_definition = f_row["explanation"]

    # Query the partitions `partition_batch` at a time and pool every hit.
    batch_starts = range(0, len(partition_list), partition_batch)
    similarity_data = []
    for batch_start in tqdm(batch_starts, desc=f"Index {f_index}:{fake_term}"):
        batch_results = wiki_embeddings.vector_search(
            collection_name,
            fake_definition,
            partition_names=partition_list[batch_start:batch_start + partition_batch],
        )
        # The stored text is cut to its first 9000 characters.
        similarity_data.extend(
            [hit.id, hit.distance, hit.entity.get('title'), hit.entity.get('text')[:9000]]
            for hit_group in batch_results
            for hit in hit_group
        )

    # Ascending sort keeps the smallest distances.
    # NOTE(review): that means "most similar" only for a distance-style metric
    # (smaller is closer) — confirm the collection's metric type.
    pooled_hits = pd.DataFrame(similarity_data, columns=["_id", "distance", "title", "text"])
    similarity_df = pooled_hits.sort_values(by="distance").head(10)

    for r_index, r_row in similarity_df.iterrows():
        wiki_id = r_row["_id"]
        real_term_title = r_row["title"]
        real_term_text = r_row["text"]

        # Fall back to the (truncated) hit text when the definition lookup
        # returns one of the sentinel strings.
        looked_up = wiki_searcher.get_definition_by_id(wiki_id)
        real_term_definition = (
            real_term_text if looked_up in ("None", "ambiguous") else looked_up
        )

        term_id = check_term(real_term_title)
        if term_id == 0:
            # Title not stored yet: insert it and use the generated key.
            # source_id=3 presumably tags text-similarity matches — confirm.
            new_term = real_terms_table.insert().values(
                term=real_term_title,
                explanation=real_term_definition,
                source_id=3,
            )
            term_id = db.sql.execute(new_term).inserted_primary_key[0]

        # NOTE(review): f_index + 1 assumes the DataFrame index is 0-based and
        # lines up with 1-based ids of the hypothetical terms — confirm.
        link = nonexistent_real_table.insert().values(
            nonexistent_id=f_index + 1,
            real_id=term_id,
        )
        db.sql.execute(link)

        description = similar_description_table.insert().values(
            wiki_id=wiki_id,
            title=real_term_title,
            text=real_term_text,
        )
        db.sql.execute(description)