This is a Python project that takes semi-structured text data, tokenizes it, vectorizes it, and stores the result in a database that can be queried via semantic search (queries do not have to match exact keywords). Currently, it works with the Grimm's Fairy Tales collection. The next steps are to add a GUI and integrate an LLM. Changing the data from Grimm's Fairy Tales to Stack Exchange information is another important step that remains.

In [47]:
# Pre-processing for Grimm's Fairy Tales, only needs to be run if stories are not yet split

# import os

# def split_stories(input_file, output_dir):
#     with open(input_file, 'r', encoding='utf-8') as f:
#         lines = f.readlines()

#     story = []
#     title = None
#     for line in lines:
#         if line.isupper():
#             if story and title:
#                 with open(os.path.join(output_dir, title + '.txt'), 'a', encoding='utf-8') as f:
#                     f.write(''.join(story))
#             title = line.strip()
#             story = []
#         else:
#             story.append(line)

#     # Save the last story
#     if story and title:
#         with open(os.path.join(output_dir, title + '.txt'), 'a', encoding='utf-8') as f:
#             f.write(''.join(story))

# # Usage
# filepath = 'datasets/grimms/raw_data/grimms.txt'
# output_directory = 'datasets/grimms/split'

# split_stories(filepath, output_directory)


In [50]:
import os

# Directory where preprocessed whole documents are stored
preprocessed_directory = 'datasets/grimms/preprocessed'

# One dictionary per chunk; 'vector' and 'tf_idf' start as None placeholders
vector_database = []
chunk_size = 64  # Number of whitespace-separated words per chunk

# Iterate through preprocessed files to create the vector database.
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# chunk order reproducible between runs.
for filename in sorted(os.listdir(preprocessed_directory)):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(preprocessed_directory, filename)

    # Read the preprocessed content of the file
    with open(file_path, 'r', encoding='utf-8') as file:
        preprocessed_text = file.read()

    # Tokenize line by line so every word remembers the 1-based line it came
    # from. Splitting the whole text at once and re-joining with spaces would
    # discard the newlines, making it impossible to recover line numbers.
    words = []
    word_line_numbers = []
    for line_number, line in enumerate(preprocessed_text.split('\n'), start=1):
        for word in line.split():
            words.append(word)
            word_line_numbers.append(line_number)

    for i in range(0, len(words), chunk_size):
        chunk = ' '.join(words[i:i + chunk_size])

        # Create and add the chunk data to the vector database
        chunk_data = {
            'vector': None,  # Placeholder for the vector
            'tf_idf': None,  # Placeholder for TF-IDF scores
            'preprocessed_text': chunk,
            'filename': filename,
            'original_text': chunk,  # Same text as 'preprocessed_text'
            # Line of the file on which the first word of this chunk appears
            'start_line_number': word_line_numbers[i],
        }
        vector_database.append(chunk_data)



Top 1024 tokens:
fox: 0.3333853027326776
hans: 0.3135247325024424
gretel: 0.2607338086214813
cat: 0.23107620850493185
king: 0.20381368062207383
wolf: 0.18973665916524982
chanticleer: 0.16844279797503525
she: 0.14885198941867608
mr: 0.1267191013151264
mrs: 0.11170988813809332
mother: 0.11015098866727362
partlet: 0.10738543739288041
bird: 0.1063402810357104
horse: 0.10536109787685465
mouse: 0.10377520959762923
princess: 0.10334432122466915
children: 0.10266691340995643
peasant: 0.10226007783342773
doctor: 0.10126171813446366
father: 0.10054626549904314
soldier: 0.0991401260432083
lion: 0.09378431061782946
son: 0.09291321038255362
sparrow: 0.0914783955034835
dwarf: 0.09027691756137583
elsie: 0.08923034419621452
master: 0.08853626872798942
tailor: 0.08793844347434875
her: 0.08670129685566542
wife: 0.08468570969454996
queen: 0.08455328790789401
grandmother: 0.08425319425220434
redcap: 0.08425319425220434
woman: 0.08414034932240788
sausage: 0.0778574558221281
bear: 0.0775693617724055
rapunze

In [54]:
# Attach a {token: TF-IDF} mapping to every chunk in the vector database.
# TF is the token's count divided by the number of tokens in the chunk; tokens
# that have no entry in the IDF dictionary are left out of the mapping.
for chunk in vector_database:
    chunk_tokens = chunk['preprocessed_text'].split()
    token_total = len(chunk_tokens)

    # Raw occurrence count of each token in this chunk
    occurrences = defaultdict(int)
    for token in chunk_tokens:
        occurrences[token] += 1

    chunk['tf_idf'] = {
        token: (count / token_total) * IDF[token]
        for token, count in occurrences.items()
        if token in IDF
    }


In [55]:
def create_chunk_vectors(vector_database, top_tokens):
    """Fill in the 'vector' entry of every chunk, in place.

    Args:
        vector_database: List of chunk dictionaries, each with a 'tf_idf'
            mapping of token to score.
        top_tokens: Sequence of (token, score) pairs; only the token is used,
            and its position determines the vector dimension.

    Each vector has one component per top token: the chunk's TF-IDF score for
    that token, or 0 when the chunk has no score for it.
    """
    vocabulary = [name for name, _score in top_tokens]

    for entry in vector_database:
        entry['vector'] = [
            entry['tf_idf'][name] if name in entry['tf_idf'] else 0
            for name in vocabulary
        ]

# Call the function to create vectors for each chunk. This mutates the chunk
# dictionaries in vector_database in place, reading each chunk's 'tf_idf'
# mapping and writing its 'vector' entry. top_tokens is expected to be a
# sequence of (token, score) pairs defined by an earlier cell.
create_chunk_vectors(vector_database, top_tokens)

# Now each chunk in vector_database has a corresponding vector based on the top tokens


In [56]:
def dot_product(vector_a, vector_b):
    """Return the sum of the element-wise products of two vectors.

    Elements are paired positionally with ``zip``, so trailing elements of
    the longer vector are ignored when the lengths differ.
    """
    paired = zip(vector_a, vector_b)
    return sum(left * right for left, right in paired)

def magnitude(vector):
    """Return the Euclidean (L2) norm of a vector."""
    squared_total = sum(component ** 2 for component in vector)
    return squared_total ** 0.5

def cosine_similarity(vector_a, vector_b):
    """Return the cosine of the angle between two vectors.

    The result is the dot product divided by the product of the two
    magnitudes. When either vector has zero magnitude the similarity is
    undefined, and 0 is returned instead of dividing by zero.
    """
    numerator = dot_product(vector_a, vector_b)
    norm_a = magnitude(vector_a)
    norm_b = magnitude(vector_b)
    if norm_a != 0 and norm_b != 0:
        return numerator / (norm_a * norm_b)
    # At least one vector is all zeros
    return 0


In [57]:
# Convert a query to a vector utilizing existing preprocess_text function

def query_to_vector(query, top_tokens):
    """Convert a free-text query into a vector over the top-token vocabulary.

    Args:
        query: Raw query string; it is passed through ``preprocess_text``
            and then split on whitespace.
        top_tokens: Sequence of (token, score) pairs; only the token is used,
            and its position determines the vector dimension.

    Returns:
        A list with one integer per top token: the number of times that token
        occurs in the preprocessed query (0 when absent). Raw counts are used
        rather than TF-IDF weights, for simplicity.
    """
    from collections import Counter

    preprocessed_query = preprocess_text(query)

    # Count every query token once up front, so each top token costs a single
    # dictionary lookup instead of two scans of the query token list.
    query_counts = Counter(preprocessed_query.split())

    # Counter returns 0 for a missing key without inserting it.
    return [query_counts[token] for token, _ in top_tokens]

def search_database(query, vector_database, top_tokens, results_to_return=5):
    """Return the chunks most similar to a query.

    Args:
        query: Raw query string.
        vector_database: List of chunk dictionaries with a 'vector' entry.
        top_tokens: Sequence of (token, score) pairs defining the vector
            dimensions.
        results_to_return: Maximum number of matches to return.

    Returns:
        A list of (chunk, similarity) tuples ordered from highest to lowest
        cosine similarity. Chunks with a similarity of 0 or less are excluded.
    """
    query_vector = query_to_vector(query, top_tokens)

    # Lazily score every chunk, keeping only those with a positive similarity
    scored = (
        (entry, cosine_similarity(query_vector, entry['vector']))
        for entry in vector_database
    )
    hits = [pair for pair in scored if pair[1] > 0]

    # Best matches first
    hits.sort(key=lambda pair: pair[1], reverse=True)

    return hits[:results_to_return]



In [58]:
# Loop for continuous searching if using console
# while True:
#     query = input("Enter your search query (or type 'exit' to stop): ").strip()
#     if query.lower() == 'exit':
#         break

#     top_matches = search_database(query, vector_database, top_tokens)

#     if not top_matches:
#         print(f"\nQuery: {query}"
#               f"\nNo matches found.")
#     else:
#         print(f"\nQuery: {query}")
#         for match in top_matches:
#             chunk_data, similarity = match
#             print(f"Filename: {chunk_data['filename']}")
#             print(f"Original Text: {chunk_data['original_text']}")
#             print(f"Start Line Number: {chunk_data['start_line_number']}")
#             print(f"Similarity: {similarity}")
#             print(f"-------------------------------------------------")

# print("Search ended.")

In [60]:
import dearpygui.dearpygui as dpg

# Context length shown for slider position 0; adjust_context_length multiplies
# it by 2 ** slider_value.
base_context_length = 1024
# Size passed to both the viewport and the main window.
window_width = 1400
window_height = 800

def query_callback(sender, app_data, user_data):
    """Run a search for the text in the query box and display the results.

    Used as the callback for the "Submit Query" button. The three parameters
    are the standard callback arguments and are not used here.
    """
    query_text = dpg.get_value("query_input").strip()

    if not query_text:
        # Update the system message if no query is entered
        dpg.set_value("system_message", "Please enter a query before submitting.")
        return

    # Perform the search
    top_matches = search_database(query_text, vector_database, top_tokens)

    if top_matches:
        # Build one formatted section per match, then join them in one pass
        sections = []
        for chunk_data, similarity in top_matches:
            sections.append(
                f"Filename: {chunk_data['filename']}\n"
                f"Start Line Number: {chunk_data['start_line_number']}\n"
                f"Original Text: {chunk_data['original_text']}\n"
                f"Similarity: {similarity:.2f}\n"
                + "-" * 40 + "\n"
            )
        output = "".join(sections)
    else:
        output = "No matches found."

    # Update the GUI with the search results
    dpg.set_value("query_output", output)
    # Update the "Submitted Query" label
    dpg.set_value("submitted_query_label", f"Submitted Query: {query_text}")
    # Clear the input field after submission
    dpg.set_value("query_input", "")

def adjust_slider_ranges(sender, app_data, user_data):
    """Set the temperature slider's upper bound from the "Advanced User" box.

    Used as the checkbox callback. Advanced users get a maximum of 2.0 and
    regular users a maximum of 1.5; the system message reports which mode is
    active. The three parameters are the standard callback arguments and are
    not used here.

    NOTE(review): the slider is created with max_value=1.0, yet unchecking the
    box sets 1.5 - confirm which limit regular users are meant to have.
    """
    is_advanced = dpg.get_value("advanced_user")
    max_temperature = 2.0 if is_advanced else 1.5
    dpg.configure_item("temp_slider", max_value=max_temperature)

    # Reconfiguring only changes the allowed range. Clamp explicitly so that a
    # value chosen under the wider range cannot stay above the new maximum.
    current_temperature = dpg.get_value("temp_slider")
    if current_temperature is not None and current_temperature > max_temperature:
        dpg.set_value("temp_slider", max_temperature)

    dpg.set_value(
        "system_message",
        "Advanced user!" if is_advanced else "Regular user!",
    )

def adjust_context_length(sender, app_data, user_data):
    """Refresh the "Context Length" text from the context-length slider.

    The displayed length is base_context_length doubled once per slider step.
    If the slider item does not exist, a slider value of 0 is assumed. The
    three parameters are the standard callback arguments and are not used.
    """
    if dpg.does_item_exist("context_length_slider"):
        doublings = dpg.get_value("context_length_slider")
    else:
        doublings = 0

    context_length = base_context_length * (2 ** doublings)
    dpg.set_value("context_length_display", f"Context Length: {context_length}")

# Build and run the GUI. Everything after create_context() sits inside
# try/finally so the context is destroyed even if setup raises; otherwise a
# failed run would leave a stale context behind when the cell is re-run.
dpg.create_context()
try:
    dpg.create_viewport(title='DU Bot', width=window_width, height=window_height)

    with dpg.window(label="DU Support Bot", width=window_width, height=window_height, no_collapse=True, no_move=True, no_close=True):
        dpg.add_text("Hello, I am your friendly DU Support Bot!")
        dpg.add_input_text(tag="query_input", label="Query")
        with dpg.group(horizontal=True):
            dpg.add_button(label="Submit Query", callback=query_callback)
            # Starts empty; query_callback fills it in via set_value
            dpg.add_text(tag="submitted_query_label", label="")
        # NOTE(review): the initial max_value is 1.0, but adjust_slider_ranges
        # switches between 2.0 and 1.5 - confirm the intended regular-user limit.
        dpg.add_slider_float(tag="temp_slider", label="Temperature", default_value=0.5, min_value=0.0, max_value=1.0)
        dpg.add_slider_int(tag="context_length_slider", label="Context Length", default_value=1, min_value=0, max_value=10, callback=adjust_context_length)
        dpg.add_text("Context Length: 2048", tag="context_length_display")
        with dpg.group(horizontal=True):
            dpg.add_checkbox(tag="advanced_user", label="Advanced User", callback=adjust_slider_ranges)
            # The visible text of a text item is its value, not its label, so
            # the welcome message is passed as the value. The callbacks later
            # overwrite it with set_value.
            dpg.add_text("System message: Welcome to the system!", tag="system_message")
        dpg.add_text(tag="query_output", label="", wrap=600)  # Displays search results

    dpg.setup_dearpygui()
    dpg.show_viewport()

    # Manually trigger the context length adjustment to update display at startup
    adjust_context_length("context_length_slider", None, None)

    dpg.start_dearpygui()
finally:
    dpg.destroy_context()
