In [1]:
import os
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors
import sentence_transformers
import numpy as np
import requests
import pickle
import pandas as pd
import zipfile
import fasttext
from langchain_ollama import ChatOllama
import pandas as pd
from nltk.corpus import stopwords
import string
from langchain_core.messages import AIMessage
from IPython.display import Markdown


  from tqdm.autonotebook import tqdm, trange


# RAG-Experiment

In this notebook, we'll use the vectorizations we created in the notebook `1_vectorisation` to provide relevant information from the software repository in response to a user query. 
To do so, we'll:

1. Load the models, the data and the vector representations of said data. 

2. Load and prepare the LLM we want to use for RAG. 



## 1. Load the data

First, we'll load the models, the chunked descriptions from our software repository and the vector representations we prepared in the notebook `1_vectorisation`.

If the data and vectorizations have not yet been created, please run the `1_vectorisation` notebook first. Note that this process may take some time.

### 1. Load Models

First we'll load the different models, as we need them to vectorize the queries.

Once the models are loaded, we'll create functions that use them to vectorize the queries.

#### 1. Load the TFIDF-Vectorizer + create a function to vectorize the query

In [2]:
# The pickled TF-IDF vectorizer is expected below the current working directory.
path = os.path.join(os.getcwd(), "models/tfidf_vectorizer.pickle")

# NOTE(review): unpickling can execute arbitrary code -- only load pickle files you trust.
with open(path, mode="rb") as file:
    tfidf_vectorizer = pickle.load(file)
    
def get_tfidf_vector(query):
    """Vectorize a single query with the loaded TF-IDF vectorizer.

    Parameters:
    query (str): The query text.

    Returns:
    The result of `tfidf_vectorizer.transform` for a one-element list
    that contains only the query.
    """
    documents = [query]
    return tfidf_vectorizer.transform(documents)


print("Model sucessfully loaded")

Model sucessfully loaded


#### 2. Load the Word2Vec model + create a function to vectorize the query

In [3]:
current_path = os.getcwd()
path = os.path.join(current_path, "models/word2vec-google-news-300.bin")

# Download the model only if no local copy exists yet; the downloaded model is
# saved to `path` so that later runs can load it from disk.
if not os.path.isfile(path):
    print("Model not found. Downloading...")
    word2vec_model = api.load("word2vec-google-news-300")
    word2vec_model.save(path)
else:
    print("Model found. Loading...")
    word2vec_model = KeyedVectors.load(path)

print("Model loaded sucessfully")

Model found. Loading...
Model loaded sucessfully


In [4]:
# create a function to vectorize the query    
def get_word2vec_vector(query, model):
    """Embed a query as the mean of the vectors of its known words.

    The query is split on whitespace. Words that are not contained in `model`
    are ignored.

    Parameters:
    query (str): The query text.
    model: Word-vector model supporting `word in model`, `model[word]` and
        `model.vector_size`.

    Returns:
    numpy.ndarray: The element-wise mean of the word vectors, or a zero vector
    of length `model.vector_size` if none of the words is known.
    """
    known_vectors = [model[token] for token in query.split() if token in model]

    if known_vectors:
        # Average over the word axis to get one vector for the whole query
        return np.mean(known_vectors, axis=0)

    # Nothing to average: fall back to a zero vector
    return np.zeros(model.vector_size)

#### 3. Load the Fast-Text models + create a function to vectorize the query

In [5]:
### CREATE HELPER FUNCTIONS

# define functions to load FastText models
def download_file(url: str, file_path: str) -> None:
    """Download a file from a URL and save it locally.

    The download is skipped if `file_path` already exists. Data is streamed
    into a temporary `<file_path>.part` file that is renamed to `file_path`
    only after the download completed, so an interrupted download never leaves
    a truncated file behind that a later run would mistake for a finished one.

    Parameters:
    url (str): The URL to download from.
    file_path (str): The path the downloaded file is stored at.

    Returns:
    None. Request errors are reported on stdout and not re-raised.
    """
    if os.path.isfile(file_path):
        print("File was already downloaded.")
        return None

    partial_path = f"{file_path}.part"
    try:
        # The timeout keeps the notebook from hanging forever on a stalled connection.
        response = requests.get(url, stream=True, timeout=60)
        response.raise_for_status()  # Check if the request was successful
        with open(partial_path, "wb") as file:
            # 1 MiB chunks: the model files are large, tiny chunks only add overhead
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    file.write(chunk)
        # Publish the file only once it is complete
        os.replace(partial_path, file_path)
        print(f"The file has been downloaded and saved as: {file_path}")
    except requests.RequestException as e:
        print(f"An error occurred while downloading the file: {e}")
    finally:
        # Remove leftovers of a failed download (no-op after a successful rename)
        if os.path.isfile(partial_path):
            os.remove(partial_path)

# define function to unzip models
def unzip_file(zip_file_path: str, extract_to: str) -> None:
    """Extract all members of a zip archive into a target directory.

    Parameters:
    zip_file_path (str): Path of the zip archive.
    extract_to (str): Directory the archive members are extracted to.

    Returns:
    None. A `zipfile.BadZipFile` error is reported on stdout and not re-raised.
    """
    try:
        with zipfile.ZipFile(zip_file_path, mode='r') as archive:
            archive.extractall(path=extract_to)
    except zipfile.BadZipFile as error:
        print(f"Error while unzipping the file: {error}")
    else:
        print(f"Unzipped {zip_file_path} to {extract_to}")

# define a function to load word vectors from a file
def load_word_vectors(file_path: str):
    """Load word vectors stored in word2vec format.

    Parameters:
    file_path (str): Path of the vector file.

    Returns:
    The vectors returned by `KeyedVectors.load_word2vec_format`, or None if
    loading failed for any reason (the error is reported on stdout).
    """
    try:
        vectors = gensim.models.KeyedVectors.load_word2vec_format(file_path)
    except Exception as error:
        # Deliberately broad: callers check the result for None
        print(f"An error occurred while loading the vectors: {error}")
        return None

    print("Vectors loaded successfully.")
    return vectors

### LOAD MODELS

# Download the fastText language identification model if it is missing, then load it.
langident_path = os.path.join(os.getcwd(), 'models/lid.176.bin')

if os.path.isfile(langident_path):
    print("Model found.")
else:
    print("Model not found. Downloading...")
    url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
    # download the model
    download_file(url, langident_path)
    print("Loading model from file...")

# Load the model exactly once (it was previously loaded twice from the same file).
language_identification_model = fasttext.load_model(langident_path)
# Second name for the same model object, kept for code that refers to the old variable.
language_detection_model = language_identification_model

print("Language identification model loaded sucessfully.")

# Check if the english model file (native fastText binary) exists. If not, download and
# unpack the archive first; the archive is expected to contain `wiki.en.bin`.
# This might take a while

current_path = os.getcwd()
models_dir = os.path.join(current_path, "models")
fasttext_eng_zip_path = os.path.join(models_dir, "wiki.en.zip")
fasttext_eng_path_vec = os.path.join(models_dir, "wiki.en.vec")
fasttext_eng_path_bin = os.path.join(models_dir, "wiki.en.bin")

if os.path.isfile(fasttext_eng_path_bin):
    print("Model found. Loading...")

else:
    print("Model not found. Downloading...")
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip"
    # download the models
    download_file(url, fasttext_eng_zip_path)

    print("Unzipping the file...")
    unzip_file(fasttext_eng_zip_path, models_dir)

    # download_file/unzip_file only print their errors, so verify the result explicitly
    if not os.path.isfile(fasttext_eng_path_bin):
        raise FileNotFoundError(f"Expected the FastText model at: {fasttext_eng_path_bin}")

    print("Loading model from file...")

# Load the full model, including subword information.
# `load_facebook_model` is a function (it has no `.load` attribute). The loaded model is
# intentionally NOT saved back to `fasttext_eng_path_bin`: that would replace the native
# binary with a gensim-format file that `load_facebook_model` cannot read on the next run.
aligned_vectors_eng = gensim.models.fasttext.load_facebook_model(fasttext_eng_path_bin)

if aligned_vectors_eng is None:
    raise ValueError("The FastText model was not loaded properly.")

print("English model loaded sucessfully.")

# Check if the german model file exists. If so, load it. If not, download the aligned
# vectors and store them in gensim's own format for faster loading in the future.
# This might take a while

fasttext_de_path_bin = os.path.join(current_path, "models/wiki_de_align.bin")
fasttext_de_path_vec = os.path.join(current_path, "models/wiki_de_align.vec")

if os.path.isfile(fasttext_de_path_bin):
    print("Model found. Loading...")
    aligned_vectors_de = KeyedVectors.load(fasttext_de_path_bin)

else:
    print("Model not found. Downloading...")
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.de.align.vec"
    # download the model
    download_file(url, fasttext_de_path_vec)
    # load the model
    print("Loading model from file...")
    aligned_vectors_de = load_word_vectors(fasttext_de_path_vec)
    # load_word_vectors returns None on failure; only save an actually loaded model so
    # that the check below can raise its ValueError instead of an AttributeError here
    if aligned_vectors_de is not None:
        # save the model to reduce loading time in the future
        aligned_vectors_de.save(fasttext_de_path_bin)

if aligned_vectors_de is None:
    raise ValueError("The FastText model or vectors were not loaded properly.")

print("German model loaded sucessfully.")

Model found.
Language identification model loaded sucessfully.
Model found. Loading...
English model loaded sucessfully.
Model found. Loading...
German model loaded sucessfully.


In [6]:
# define a function to identify a query's language
def identify_language(query):
    """
    Identifies the language of the given query.
    Parameters:
    query (str): The text to be analyzed for language identification.
    Returns:
    str: The top predicted label with the "__label__" prefix removed (e.g. "en", "de").
    """

    # fastText's predict() processes one line at a time and raises a ValueError for
    # text containing a newline, so collapse multi-line queries into a single line.
    single_line_query = " ".join(query.splitlines())

    lang_detected = language_identification_model.predict(single_line_query)

    # predict() returns (labels, probabilities); take the first label
    return lang_detected[0][0].replace("__label__", "")


# create function to embed query (based on language)
def get_fasttext_vector(query, aligned_vectors_de=None, aligned_vectors_eng=None):
    """
    Calculates the FastText vector representation for a given query.
    Parameters:
    - query: The query text. Its language is detected with `identify_language`.
    - aligned_vectors_de: Aligned FastText vectors for the German language (indexed as `vectors[word]`).
    - aligned_vectors_eng: FastText model for the English language (indexed as `model.wv[word]`).
    Returns:
    - numpy.ndarray: The mean of the word vectors of the lowercased, whitespace-separated words.
    Note:
    - If the language is not supported (only "en" and "de" are supported), it returns a zero vector.
    - If the vectors needed for the detected language were not passed, it returns a zero vector.
    - If a German word is not found in the aligned vectors, it tries to create a vector based on
      english subword information.
    - Words for which no vector can be obtained are skipped.
    - If no vectors are found, it returns a zero vector.
    """

    # default size to avoid errors if vectors are None
    vector_size = aligned_vectors_de.vector_size if aligned_vectors_de is not None else 300

    # check if language is valid
    lang = identify_language(query)
    if lang not in ("en", "de"):
        return np.zeros(vector_size)

    # the vectors for the detected language are required
    if lang == "de" and aligned_vectors_de is None:
        return np.zeros(vector_size)
    if lang == "en" and aligned_vectors_eng is None:
        return np.zeros(vector_size)

    vectors = []

    for word in map(str.lower, query.split()):
        if lang == "de":
            try:
                vectors.append(aligned_vectors_de[word])
                continue
            except KeyError:
                # not in the German vocabulary: fall through to the English model
                pass

        # English lookup (for German queries: fallback based on subword information)
        vector = None
        if aligned_vectors_eng is not None:
            try:
                vector = aligned_vectors_eng.wv[word]
            except KeyError:
                # Retrying the same lookup would only raise again, so skip the word.
                vector = None

        if vector is None:
            print(f"Missing Vector for: {word}")
            continue

        if lang == "de":
            print(f"Created Vector based on Subword Information for: {word}")
        vectors.append(vector)

    if vectors:
        return np.mean(vectors, axis=0)

    # return an actual zero vector (not the `np.zeros` function itself)
    return np.zeros(vector_size)

#### 4. Load the SBERT and Cross Encoder models + create a function to vectorize the query

In [7]:
# Load the SBERT model from drive, or download and store it if there is no local copy
sbert_path = os.path.join(os.getcwd(),"models/sbert")
downloaded = os.path.isdir(sbert_path)

if downloaded:
    print("Load Sentence Transformer from drive...")
    sbert_model = sentence_transformers.SentenceTransformer(sbert_path)
else:
    print("Downloading Sentence Transformer...")
    sbert_model = sentence_transformers.SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    sbert_model.save(sbert_path)

print("SBERT model loaded sucessfully")

# Same procedure for the Cross Encoder

cross_encoder_path = os.path.join(os.getcwd(), "models/cross")
downloaded = os.path.isdir(cross_encoder_path)

if downloaded:
    print("Load Cross Encoder from drive...")
    cross_encoder_model = sentence_transformers.CrossEncoder(cross_encoder_path)
else:
    print("Downloading Cross Encoder...")
    cross_encoder_model = sentence_transformers.CrossEncoder("corrius/cross-encoder-mmarco-mMiniLMv2-L12-H384-v1")
    cross_encoder_model.save(cross_encoder_path)

print("Cross encoder loaded sucessfully")

Load Sentence Transformer from drive...
SBERT model loaded sucessfully
Load Cross Encoder from drive...
Cross encoder loaded sucessfully


In [8]:
# function to embed the query
def get_sbert_vector(query, model):
    """
    Embed a query with a Sentence-BERT model.
    Parameters:
    query (str): The input text to encode.
    model: The Sentence-BERT model to use for encoding.
    Returns:
    The result of `model.encode(query, convert_to_tensor=False)`, or a zero
    vector with the model's embedding dimension if the query is missing or
    consists of whitespace only.
    """
    embedding_dimension = model.get_sentence_embedding_dimension()

    has_text = not (pd.isna(query) or query.strip() == '')
    if has_text:
        return model.encode(query, convert_to_tensor=False)

    # nothing to encode
    return np.zeros((embedding_dimension,))

### 2. Load chunked descriptions

Next, we load the chunked descriptions from the software repository we prepared in the notebook `1_vectorisation`. We'll need them to supply text to the user and the llm used for the RAG-process.

We'll only load four columns. 

1. **description_clean_chunks:** The chunk.
2. **description_preprocessed_chunks:** The preprocessed chunk (lowercase + removed punctuation and stopwords).
3. **description:** The complete description from which the chunk was extracted.
4. **brand_name:** The name of the software described by the chunk.

In [9]:
# read only the four columns we need and replace missing values with empty strings
dataset_path = os.path.join(os.getcwd(), "data/edition_software_info_chunked.csv")
columns = ["brand_name", "description", "description_clean_chunks", "description_preprocessed_chunks"]
df = pd.read_csv(dataset_path, skipinitialspace=True, usecols=columns).fillna("")

# show the first row without truncating the long text columns
with pd.option_context('display.max_colwidth', None):
    display(df.head(1))

Unnamed: 0,brand_name,description,description_clean_chunks,description_preprocessed_chunks
0,Transkribus,"# Erkennen, Transkribieren und Durchsuchen von historischen Dokumenten mitttels KI\n\n- Trainieren von spezifischen Texterkennungsmodellen, die in der Lage sind, handschriftliche, maschinengeschriebene oder gedruckte Dokumente zu erkennen.\n\n- KI-gestützte Erkennung von handgeschriebenem Text, Layout-Analyse und Strukturerkennung.\n\n- Manuelles Transkribieren im Transkriptionseditor\nKI-gestützten Erkennung mittels öffentlicher oder selbst trainierter KI-Modelle\n\n- Durchsuchen von Dokumenten mit erweiterten Suchoptionen, wie z. B. dem Tool zum Aufspüren von Schlüsselwörtern.\n\n\n- Gemeinsames Arbeiten an Dokumenten, Organisation in Sammlungen\n\n- Teilen von Dokumenten durch eine read&search Website oder Export als PDF oder ALTO (XML).\n\n- Alle Transkribus-Inhalte, d.h. hochgeladene Bilder, erkannte Texte, trainierte Erkennungsmodelle und eingegebene Metadaten, werden innerhalb der EU gehostet und sind GDPR konform.","Transkribus ist eine umfassende Plattform für die Digitalisierung, Texterkennung mithilfe Künstlicher Intelligenz, Transkription und das Durchsuchen von historischen Dokumenten. Erkennen, Transkribieren und Durchsuchen von historischen Dokumenten mitttels KI Trainieren von spezifischen Texterkennungsmodellen, die in der Lage sind, handschriftliche, maschinengeschriebene oder gedruckte Dokumente zu erkennen. KI-gestützte Erkennung von handgeschriebenem Text, Layout-Analyse und Strukturerkennung. Manuelles Transkribieren im Transkriptionseditor KI-gestützten Erkennung mittels öffentlicher oder selbst trainierter KI-Modelle Durchsuchen von Dokumenten mit erweiterten Suchoptionen, wie z. B. dem Tool zum Aufspüren von Schlüsselwörtern. Gemeinsames Arbeiten an Dokumenten, Organisation in Sammlungen Teilen von Dokumenten durch eine read&search Website oder Export als PDF oder ALTO (XML). Alle Transkribus-Inhalte, d.h. 
hochgeladene Bilder, erkannte Texte, trainierte Erkennungsmodelle und eingegebene Metadaten, werden innerhalb der EU gehostet und sind GDPR konform.",transkribus umfassende plattform digitalisierung texterkennung mithilfe künstlicher intelligenz transkription durchsuchen historischen dokumenten erkennen transkribieren durchsuchen historischen dokumenten mitttels ki trainieren spezifischen texterkennungsmodellen lage handschriftliche maschinengeschriebene gedruckte dokumente erkennen kigestützte erkennung handgeschriebenem text layoutanalyse strukturerkennung manuelles transkribieren transkriptionseditor kigestützten erkennung mittels öffentlicher trainierter kimodelle durchsuchen dokumenten erweiterten suchoptionen z b tool aufspüren schlüsselwörtern gemeinsames arbeiten dokumenten organisation sammlungen teilen dokumenten readsearch website export pdf alto xml transkribusinhalte dh hochgeladene bilder erkannte texte trainierte erkennungsmodelle eingegebene metadaten innerhalb eu gehostet gdpr konform


### 3. Load Vectors

Finally we can load the vector representations of the chunks we created in `1_vectorisation`.

In [10]:
vectors_path = os.path.join(os.getcwd(), "vectorisations")

# The TF-IDF file is loaded with allow_pickle=True and kept as a numpy array
chunks_tfidf = np.load(os.path.join(vectors_path, "tfidf.npy"), allow_pickle=True)

# The other vectorisations are loaded and converted to (nested) Python lists
chunks_word2vec, chunks_fasttext, chunks_sbert = (
    np.load(os.path.join(vectors_path, file_name)).tolist()
    for file_name in ("word2vec.npy", "fasttext.npy", "sbert.npy")
)

In [11]:
# Unwrap the single object stored in the loaded array (presumably a SciPy sparse
# matrix, given the `.toarray()` call below -- TODO confirm against `1_vectorisation`).
chunks_tfidf_sparse = chunks_tfidf.item()
# Dense copy of the matrix
chunks_tfidf_dense = chunks_tfidf_sparse.toarray()
# Last expression of the cell: display the shape of the dense matrix
chunks_tfidf_dense.shape

(70, 19962)

## 2. Prepare the LLM

Next, we'll load the LLM we want to use.

We'll start off using llama3. This can be changed in the future.

In [12]:
# Create the chat model used for answer generation.
# NOTE(review): later cells pass `temperature=` again when calling `llm.invoke`;
# whether that overrides the value set here depends on the installed
# langchain_ollama version -- TODO confirm.
llm = ChatOllama(
    model="llama3",
    temperature=1,
)

## 3. Define Functions for Retrieval

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarities_sbert(query: str) -> np.ndarray:
    """
    Calculates similarity scores between the query and SBERT-vector-representations of the chunks.
    Args:
        query (str): The query.
    Returns:
        numpy.ndarray: The result of `cosine_similarity` for the query (one row)
        and the chunk vectors in `chunks_sbert`.
    """
    # NOTE: the return annotation used to be `np.array(float)`, which is a function
    # call evaluated at definition time and not a type; `np.ndarray` is the type.

    query_sbert = get_sbert_vector(query, sbert_model)

    # Reshape the query vector to be a 2D array with one row
    query_sbert = query_sbert.reshape(1, -1)

    # Compute cosine similarity between the query and the documents
    similarities = cosine_similarity(query_sbert, chunks_sbert)

    return similarities


def get_similarities_word2vec(query: str) -> np.ndarray:
    """
    Calculates similarity scores between the query and word2vec-vector-representations of our documents.
    Args:
        query (str): The query.
    Returns:
        numpy.ndarray: The result of `cosine_similarity` for the query (one row)
        and the chunk vectors in `chunks_word2vec`.
    """
    # NOTE: the return annotation used to be `np.array(float)`, which is a function
    # call evaluated at definition time and not a type; `np.ndarray` is the type.

    query_word2vec = get_word2vec_vector(query, word2vec_model)

    # Reshape the query vector to be a 2D array with one row
    query_word2vec = query_word2vec.reshape(1, -1)

    # Compute cosine similarity between the query and the documents
    similarities = cosine_similarity(query_word2vec, chunks_word2vec)

    return similarities


def get_similarities_fasttext(query: str) -> np.ndarray:
    """
    Calculates similarity scores between the query and fasttext-vector-representations of our documents.
    Args:
        query (str): The query.
    Returns:
        numpy.ndarray: The result of `cosine_similarity` for the query (one row)
        and the chunk vectors in `chunks_fasttext`.
    """
    # NOTE: the return annotation used to be `np.array(float)`, which is a function
    # call evaluated at definition time and not a type; `np.ndarray` is the type.

    query_fasttext = get_fasttext_vector(query, aligned_vectors_de, aligned_vectors_eng)

    # Reshape the query vector to be a 2D array with one row
    query_fasttext = query_fasttext.reshape(1, -1)

    # Compute cosine similarity between the query and the documents
    similarities = cosine_similarity(query_fasttext, chunks_fasttext)

    return similarities

# English and German stopword sets, plus their union for mixed-language text
stopwords_english = set(stopwords.words('english'))
stopwords_german = set(stopwords.words('german'))
stopwords_combined = stopwords_german | stopwords_english

def preprocess(stopwords: list[str], text: str) -> str:
    """
    Normalizes a text: lowercases it, strips the characters in `string.punctuation`
    and drops all stopwords.
    Args:
        stopwords (List[str]): The words to be removed from the text.
        text (str): The text to normalize.
    Returns:
        str: The remaining words, joined by single spaces.
    """

    punctuation_remover = str.maketrans('', '', string.punctuation)
    normalized = text.lower().translate(punctuation_remover)

    # stopwords are matched after lowercasing and punctuation removal
    kept_words = (token for token in normalized.split() if token not in stopwords)
    return ' '.join(kept_words)


def get_similarities_tfidf(query: str) -> np.ndarray:
    """
    Calculates similarity scores between the query and tfidf-representations of our documents.
    The query is preprocessed with `preprocess` (using `stopwords_combined`) before it is vectorized.
    Args:
        query (str): The query.
    Returns:
        numpy.ndarray: The result of `cosine_similarity` for the query (one row)
        and the TF-IDF matrix of the chunks.
    """

    query_tfidf = get_tfidf_vector(preprocess(stopwords_combined, query))

    # `chunks_tfidf` wraps the stored matrix in an array; `.item()` unwraps it.
    # The matrix is passed on as-is instead of being densified on every call:
    # cosine_similarity accepts sparse input and returns a dense array by default.
    chunks_tfidf_matrix = chunks_tfidf.item()

    # Compute cosine similarity between the query and the documents
    similarities = cosine_similarity(query_tfidf, chunks_tfidf_matrix)

    return similarities


def get_similarity(query, vectorisation):
    """
    Dispatch a query to the similarity function of the chosen vectorisation.
    Parameters:
    query (str): The query string.
    vectorisation (str): The vectorisation method. Choose from: tfidf, fasttext, word2vec, sbert.
    Returns:
    The return value of the selected similarity function (the similarity scores
    between the query and the chunks).
    Raises:
    KeyError: If the given vectorisation is not valid.
    """

    vectorisations = {
        "tfidf": get_similarities_tfidf,
        "fasttext": get_similarities_fasttext,
        "word2vec": get_similarities_word2vec,
        "sbert": get_similarities_sbert,
    }

    similarity_function = vectorisations.get(vectorisation)
    if similarity_function is None:
        raise KeyError(f"'{vectorisation}' is not a valid approach. Choose from: {', '.join(vectorisations.keys())}")

    return similarity_function(query)


def get_similar_chunks(query, vectorisation, n):
    """
    Return the n chunks that are most similar to the query.
    Parameters:
    query (str): The query string.
    vectorisation (str): The vectorisation method passed on to `get_similarity`.
    n (int): The number of chunks to return.
    Returns:
    pandas.DataFrame: The n rows of `df` with the highest similarity, sorted in
    descending order, with an additional "similarity" column. The original index
    labels are kept.
    """
    similarity = get_similarity(query, vectorisation)

    # Work on a new frame instead of writing the scores into the global `df`:
    # repeated calls no longer leave state behind, and callers can add columns
    # to the returned frame without touching (or getting warnings about) `df`.
    ranked = df.assign(similarity=similarity[0]).sort_values("similarity", ascending=False)
    return ranked.head(n).copy()

In [19]:
# Quick check of the retrieval: the five chunks most similar to a German query,
# compared via the SBERT vectors.
test = get_similar_chunks("Ich will ein Manuskript transkribieren", vectorisation="sbert", n=5)
# Last expression of the cell: display the result
test

Unnamed: 0,brand_name,description,description_clean_chunks,description_preprocessed_chunks,similarity
29,Transcribo,[Transcribo](https://tcdh.uni-trier.de/en/proj...,Transcribo is an editing tool developed by the...,transcribo editing tool developed trier center...,0.579517
0,Transkribus,"# Erkennen, Transkribieren und Durchsuchen von...",Transkribus ist eine umfassende Plattform für ...,transkribus umfassende plattform digitalisieru...,0.562283
14,correspSearch,With correspSearch you can search through inde...,With correspSearch you can search within the m...,correspsearch search within metadata diverse s...,0.559531
67,VMR CRE,The Virtual Manuscript Room Collaborative Rese...,menus and dialogs to assist the researcher wit...,menus dialogs assist researcher composing tran...,0.538878
53,ChrysoCollate,The program offers:\n\n- two modes: collation ...,ChrysoCollate is a freeware for collating and ...,chrysocollate freeware collating editing texts...,0.518004


# 4. Question Bot

This bot answers a single question based on data from the Text+ repository.

In [125]:
### YOUR INPUT: Please enter your question and specifications ###

your_query = "I want to start digitizing a collection of manuscripts" # your question 
text_vectorisation_method = "sbert" # the text vectorisation method used for comparison
n = 10 # number of (potentially) relevant text chunks from the repository provided to the llm
temperature = 0.25 # Lower temperatures (0.2 - 0.5) make the model's responses more focused. 
                # Higher Temperatures (e.g., 0.7 - 1.0) introduce more randomness.


### This code runs the request. You don't need to change anything below this comment. ###

# find relevant chunks (copy, so that adding a column never affects the source DataFrame)
context = get_similar_chunks(query=your_query, vectorisation=text_vectorisation_method, n=n).copy()
context["combined"] = "This text is about " + context["brand_name"] + ". " + context["description_clean_chunks"] # add the software name to each chunk

# Join the chunk texts explicitly. Formatting the Series itself into the prompt would
# insert its printed representation: long texts are cut off by pandas' display settings,
# and index labels plus a "Name: ..., dtype: ..." footer are added.
context_text = "\n\n".join(context["combined"])

# load system prompt
sys_path = os.path.join(os.getcwd(), "prompts/sys.txt")
with open(sys_path, 'r') as file:
    sys_prompt = file.read()
sys_prompt = sys_prompt.replace("\n", " ")

messages = [
    (
        "system",
        f"{sys_prompt}"
    ),
    ("human", f"Answer this question: {your_query}. Answer in the language used in the question. Here is some software from the Text+ Repository, that might be relevant.: {context_text}. You can ignore software that is not relevant. "),
]
ai_msg = llm.invoke(messages, temperature=temperature)


# `Markdown` is already imported in the first cell of the notebook
formatted_answer = f"""
### Answer:

{ai_msg.content}
"""
display(Markdown(formatted_answer))


### Answer:

Based on your question, I would recommend using Transkribus for digitizing a collection of manuscripts. It's described as a comprehensive platform for handwritten text recognition and is well-suited for processing large collections of manuscripts. Additionally, it offers features such as OCR, data validation, and linking to other archives or libraries, making it an efficient tool for your task.


# Chat-Bot

This chat bot answers questions based on the Text+ repository.

The visualisation is still a work in progress.

In [137]:
import ipywidgets as widgets
from IPython.display import display, Markdown

# initialize conversation history
# (a list of strings; the click handler below appends the user messages and the AI responses)
conversation_history = []

# define widgets for user input, sending button, output, and loading indicator
text_input = widgets.Text(
    description='Your Message:',
    placeholder='Type your message here'
)
send_button = widgets.Button(description="Send")
output = widgets.Output()
# The spinner starts hidden (display='none'); the click handler switches it to 'block'
# while a request is running and back to 'none' afterwards.
loading_indicator = widgets.HTML(
    value='<div class="spinner" style="width: 30px; height: 30px; border: 4px solid rgba(0, 0, 0, 0.1); border-radius: 50%; border-top: 4px solid #007bff; animation: spin 1s linear infinite;"></div>',
    layout=widgets.Layout(display='none')
)

# CSS for spinner
# (defines the `spin` keyframes referenced by the spinner's inline `animation` style)
spinner_css = """
<style>
@keyframes spin {
  0% { transform: rotate(0deg); }
  100% { transform: rotate(360deg); }
}
</style>
"""
display(widgets.HTML(spinner_css))

# function to handle user input and AI response
# Display prefixes used for the transcript entries in `conversation_history`.
_USER_PREFIX = "**User:** "
_AI_PREFIX = "**T+2000:** "


def _history_to_messages(history):
    """Turn stored transcript entries into (role, text) tuples for the LLM.

    Args:
        history: Iterable of transcript strings as stored in
            ``conversation_history``.

    Returns:
        A list of ``(role, text)`` tuples. Entries starting with the model
        prefix get the role ``"ai"``; everything else is sent as ``"human"``.
        The display prefix is removed from the text where present.
    """
    messages = []
    for entry in history:
        if entry.startswith(_AI_PREFIX):
            messages.append(("ai", entry[len(_AI_PREFIX):]))
        elif entry.startswith(_USER_PREFIX):
            messages.append(("human", entry[len(_USER_PREFIX):]))
        else:
            messages.append(("human", entry))
    return messages


def on_send_button_click(b):
    """Handle a click on the send button.

    Reads the message from ``text_input``, retrieves similar chunks for it,
    sends the system prompt, the earlier turns and the new question (with the
    retrieved context) to ``llm``, then re-renders the whole transcript inside
    ``output`` and clears the input box.

    Args:
        b: The object passed by the button's click callback; unused.

    Raises:
        Whatever retrieval, reading ``prompts/sys.txt`` or the LLM call raise.
        The spinner is hidden and the transcript left untouched in that case.
    """
    # get the user message
    user_message = text_input.value
    if not user_message.strip():
        # Nothing was typed: skip retrieval and the LLM call entirely.
        return

    with output:
        # clear previous output
        output.clear_output()

        # show the loading indicator
        loading_indicator.layout.display = 'block'
        display(loading_indicator)

        try:
            # retrieve the chunks most similar to the question
            user_query = user_message
            context = get_similar_chunks(query=user_query, vectorisation=text_vectorisation_method, n=n)

            # add the software name to each chunk
            combined = "This text is about " + context["brand_name"] + ". " + context["description_clean_chunks"]

            # Join the full chunk texts. Formatting the Series directly into
            # the prompt would insert its repr, in which pandas shortens every
            # value to `display.max_colwidth` (50 characters by default) and
            # adds index labels and a dtype footer.
            context_text = "\n\n".join(combined.astype(str))

            # load system prompt
            sys_path = os.path.join(os.getcwd(), "prompts/sys.txt")
            with open(sys_path, 'r') as file:
                sys_prompt = file.read()
            sys_prompt = sys_prompt.replace("\n", " ")

            messages = [("system", sys_prompt)]

            # Replay earlier turns with their proper roles. The current
            # question is not in the history yet, so it is sent only once.
            messages.extend(_history_to_messages(conversation_history))

            # append the current user message together with the retrieved context
            messages.append(("human", f"Answer this question: {user_query}. Answer in the language used in the question. Here is some software from the Text+ Repository, that might be relevant.: {context_text}. You can ignore software that is not relevant."))

            # generate response
            ai_msg = llm.invoke(messages, temperature=temperature)
        finally:
            # hide the loading indicator even when retrieval or the LLM call fails
            loading_indicator.layout.display = 'none'

        # Record both turns only after a successful reply, so a failed request
        # does not leave an unanswered question in the transcript.
        conversation_history.append(f"{_USER_PREFIX}{user_message}")
        conversation_history.append(f"{_AI_PREFIX}{ai_msg.content}")

        # format and display the conversation history
        formatted_history = "\n\n".join(conversation_history)  # Line breaks between messages
        display(Markdown(f"### Conversation History:\n{formatted_history}"))

        # clear the input box
        text_input.value = ''

# Hook the click handler up to the send button.
send_button.on_click(on_send_button_click)

# Stack the widgets vertically: transcript output on top, then the spinner,
# the input field and the send button; then render the container.
chat_box = widgets.VBox(children=[output, loading_indicator, text_input, send_button])
display(chat_box)

HTML(value='\n<style>\n@keyframes spin {\n  0% { transform: rotate(0deg); }\n  100% { transform: rotate(360deg…

VBox(children=(Output(), HTML(value='<div class="spinner" style="width: 30px; height: 30px; border: 4px solid …

In [140]:
# Optional inspection step: show the `combined` column of `context` with the
# column-width limit lifted, so the full text of every chunk is visible.
# NOTE(review): the chat handler assigns `context` as a local variable, so this
# cell presumably shows the notebook-level `context` from the single-query cell
# rather than the chunks of the latest chat message - confirm this is intended.
with pd.option_context("display.max_colwidth", None):
    display(context.loc[:, ["combined"]])

Unnamed: 0,combined
14,"This text is about correspSearch. With correspSearch you can search within the metadata of diverse scholarly editions of letters. One can search according to the letter's sender, adressee, as well as place and date of the letter's creation. With correspSearch you can search through indexes of different letter collections (digital or print) by sender, addressee, location written, location sent, and date. To this purpose a website and a technical interface are provided. The web service collects and evaluates TEI-XML data in the ‘Correspondence Metadata Interchange’ format. The web service correspSearch is operated and developed according to the following principles: Reference System : The web service aims to help users with their research by offering a central location to search for letters, and by guiding them to the original publication. Academic Data : The web service is based on the data from letter-indexes of editions or repositories that are edited according to academic criteria. Conceptionaly Open : There is no focus on a particular time period or place. This allows for new kinds of research questions to be explored. Open Access : Data is only collected that is under a free license, and the data from the web service continues to be under a free license and is thus available for further use. Open Interfaces : correspSearch offers technical interfaces that are open and well documented. Other projects can easily query and"
53,"This text is about ChrysoCollate. ChrysoCollate is a freeware for collating and editing texts in any language (Unicode). It allows you to easily collate your manuscripts, order them and compare them, and provide you with tools for easy editing, as the making of automatic apparatus.The program offers: two modes: collation mode and edition mode; a collation table with automatic distinctive colours and previsional semi-automatic completion of readings; annotation tools for the collation table, including a system of references to the images of the witnesses that allows you to navigate easily in your textual tradition; a viewer that displays witness pictures while one collates or edits (various formats of images, pdf, or websites); semi-automatic apparatus, according to the readings that are chosen by the editor; a stemma codicum checker; a translation box to manage and synchronise your translation; exportation in various formats (odt, cte, etc.)."
0,"This text is about Transkribus. Transkribus ist eine umfassende Plattform für die Digitalisierung, Texterkennung mithilfe Künstlicher Intelligenz, Transkription und das Durchsuchen von historischen Dokumenten. Erkennen, Transkribieren und Durchsuchen von historischen Dokumenten mitttels KI Trainieren von spezifischen Texterkennungsmodellen, die in der Lage sind, handschriftliche, maschinengeschriebene oder gedruckte Dokumente zu erkennen. KI-gestützte Erkennung von handgeschriebenem Text, Layout-Analyse und Strukturerkennung. Manuelles Transkribieren im Transkriptionseditor KI-gestützten Erkennung mittels öffentlicher oder selbst trainierter KI-Modelle Durchsuchen von Dokumenten mit erweiterten Suchoptionen, wie z. B. dem Tool zum Aufspüren von Schlüsselwörtern. Gemeinsames Arbeiten an Dokumenten, Organisation in Sammlungen Teilen von Dokumenten durch eine read&search Website oder Export als PDF oder ALTO (XML). Alle Transkribus-Inhalte, d.h. hochgeladene Bilder, erkannte Texte, trainierte Erkennungsmodelle und eingegebene Metadaten, werden innerhalb der EU gehostet und sind GDPR konform."
65,"This text is about VMR CRE. management; 4) indexing of folio content; 5) transcribing; 6) collating; 7) regularizing; 8) editing an apparatus; 9) genealogical analysis of the witness corpus. Metadata and Feature Tagging The VMR CRE stores with each manuscript a very limited set of descriptive data, reserving the primary metadata capture for a dynamic tagging facility called Feature Tagging. A Feature is any defined metadata information which might be captured for a manuscript or manuscript page. For example, an alternative catalog identifier, an external image repository, the canvas material type, the ink type, the script type; these are all Features which might be tagged on a manuscript; For individual pages: an illumination, a canon table, or even individual sample script characters might be tagged as Features. These Features must first be defined in the system, and the VMR CRE comes by default with a predefined set of Feature Definitions used at the INTF. A Feature Definition can specify that zero or more values should be captured with the Feature tag and what those value types and value domains should be. Once a Feature is defined, it can be used to tag manuscripts or manuscript pages, capturing individual Feature values for each tag, if necessary. Every Feature Definition adds to the number of facets available in the catalog search facility. For example, one might search for all manuscript folio sides from Egypt"
49,"This text is about TEITOK. therefore indexed using the Corpus WorkBench (CWB), allowing texts to be search efficiently, and with the rich query language that CWB provides. Words are indexed in the CWB with various orthographic forms, providing many ways to search through the data. The type of corpora that TEITOK is meant for are very labour-intensive: for ancient texts, hardly any of the data will be available in digital format, and have to be scanned. In many cases, OCR will not work and even for human readers the texts are often very hard to read. And the data will display a lot of orthographic variation in which a lot of the linguistic annotation, including normalization, will have to be done by hand. As a result, most corpora created with TEITOK will have a limited size, and searching for linguistic properties in them will not yield a lot of results. Therefore, TEITOK offers the option to index the corpus in a central database, which can be searched via this site. Each search result will only display the direct context of the word, and will link directly to the word in the original text on the site of the project it originated from. This way, it is possible to search through multiple corpora at the same time, and get access to the full original data in a way that prominently features the"
11,"This text is about ediarum. ediarum is a digital working environment consisting of several software components that allows scholars to edit transcriptions of manuscripts and prints in TEI-compliant XML, to provide them with a text and subject apparatus as well as indexes, and to publish them on the web and in print. Benutzerfreundliches Arbeiten Als zentrale Softwarekomponente der Arbeitsumgebung wird Oxygen XML Author eingesetzt. Die Bearbeiter arbeiten im Oxygen XML Author nicht in einer Codeansicht, sondern in einer benutzerfreundlichen, Word-artigen »Autorenansicht«, die über Cascading Stylesheets (CSS) gestaltet wird. Dem Bearbeiter stehen dabei mehrere Ansichten zur Auswahl, so dass per Mausklick die für den Arbeitsschritt geeigneteste ausgewählt werden kann. Außerdem kann der Endanwender über eine eigene Werkzeugleiste per Knopfdruck Auszeichnungen vornehmen. So können z.B. in Manuskripten Streichungen markiert oder Sachanmerkungen eingegeben werden. Auch Personen- oder Ortsnamen können mit der entsprechenden TEI-Auszeichnung versehen und gleichzeitig über eine komfortable Auswahlliste mit dem jeweiligen Eintrag im zentralen Personen- bzw. Ortsregister verknüpft werden. Der gesamte Text kann dadurch einfach und schnell mit TEI-konformen XML ausgezeichnet werden. Kollaboratives Arbeiten Die digitale Arbeitsumgebung nutzt die freie XML-Datenbank existdb als zentrales Repositorium für die XML-Dokumente. Die Datenbank ist auf einem Server installiert und online zugänglich. Dadurch können alle Projektmitarbeiter auf ein und denselben Datenbestand zugreifen und zusammenarbeiten. Um die Einrichtung und Konfiguration zu vereinfachen, wurde das Modul ediarum.DB entwickelt. Website Neben dem eigentlichen Eingabewerkzeug in Oxygen XML Author, wird"
26,"This text is about Publex. A browser-based publishing software for XML-annotated dictionaries. As part of the ELEXIS “European Lexicographic Infrastructure” initiative, TCDH developed the [Publex tool](), a browser-based publishing software for XML-annotated dictionaries. With the help of the software and an intuitive user interface, users can upload their dictionary data marked in XML and define the desired formatting for the dictionary by configuring the individual components. With these settings and the attached metadata, the dictionary can finally be published online on a platform provided by ELEXIS. Publex, thus, also enables users who do not have the appropriate infrastructure or technical knowledge to make their dictionaries accessible on the Internet. Step by step dictionary publishing with Publex A user manual guides you in detail and step by step through the application process ([Publication on DARIAH Campus]()). This can be roughly divided into three steps: 1. uploading the data 2. defining the representation 3. publishing the dictionary Users can import their XML dictionary data from a Git repository into Publex and submit metadata about the dictionary. The core of the tool is the definition of styling rules, which define how the individual elements of the dictionary articles should be displayed in the published online version. Upon import, Publex parses the data and captures all tags, attributes, and associated attribute values with which the resource has been tagged. For each of these elements and"
48,"This text is about TEITOK. based web server. Features Manuscript-based corpora Align your manuscript with your transcript Display each manuscript line with its transcription Transcribe directly from the manuscript Search directly for manuscript fragments Keep multiple editions within the same environment Stand-off Annotations Adds stand-off annotations to any corpus file Edit using an efficient interface Annotate over discontinuous regions Incorporate annotations into the CQP corpus Audio-based corpora Align your audio with your transcription Transcribe directly from the audio file Scroll transcription vertical with wave function horizontal Search directly for audio segments Dependency Grammar Keep dependency relations inside any corpus type Visualize dependency trees for any sentence Edit trees easily Search using dependency relations Geolocation Coordinates Map documents onto the world map Document are clustered into counted groups Access the documents from the map Compare corpus queries on the world map Edit from CQP Query Search for words often incorrectly annotated Click on any token in a KWIC list to edit it Edit all results in a systematic way Edit each results individually in a list Pre-modify each result by a regular expression Search The rich XML format used in TEITOK is hard to search through. For easier access, all corpora are therefore indexed using the Corpus WorkBench (CWB), allowing texts to be search efficiently, and with the rich query language that CWB provides. Words are indexed in the CWB with various"
37,"This text is about FuD - The Virtual Research Environment for the Humanities and Social Sciences . publication. FuD can also be used to create various digital, historical-critical editions of works from scratch. Collection management With FuD, primary research data can be recorded using a jointly developed metadata schema and shared within a research group. Analysis FuD supports the analysis of text and image data with its annotation tools. The tools can be adapted to the project-specific research method. Network analysis On the basis of annotated texts, a visualization tool connected to FuD is used to analyse and display networks. Metadata FuD supports the structured recording of document metadata via individual input masks. System architecture FuD is a modular software system whose subcomponents are connected to each other via well-defined interfaces. The FuD working environment is used for data collection, analysis and editorial processing. This is a client-server architecture (tcl/Tk, MySQL database), whereby the FuD client is installed on the workstation computer, with which the data is processed on the FuD server. The FuD client is available for current Windows operating systems and, in a slightly reduced form, also for Macintosh operating systems. This architecture requires a permanent Internet connection to the server while working with the system. Other systems such as Zotero for managing secondary literature or the transcription tool ""Transcribo"" for transcribing full texts are connected to the FuD working environment. This is configured according to project-specific requirements. Existing structured data"
40,"This text is about TextGridLab. A virtual research environment for the humanities that is optimized for working with TEI-coded resources and covers the entire research process up to publication. The TextGrid Laboratory, together with the TextGrid Repository, is a diverse research environment. The TextGridLab includes various tools and services that you can access to create, manage, and edit research data. The TextGridLab offers the possibility to collaboratively edit and generate research data in a protected environment. The Open-Source-Software is available for Windows, Mac OS X, and Linux. Other open source tools and services optimized for use with TextGrid can be integrated via the MarketPlace integrated into the TextGridLab (access via the start window of the TextGridLab). Thanks to the modular open-source architecture of the software, the tools and services provided by TextGrid can also be adapted to specific project-specific requirements. At the heart of the third funding phase (2012-2015) was the transfer of the virtual research environment into the long-term sustainable operation. TextGrid is freely available as part of the DARIAH infrastructure. The TextGridLab is optimised for XML/TEI development, e.g. in the context of digital editions. Advantages Decentralized work area : Users can access the TextGridLab and the TextgridRep independently of the location and work together in complex research projects. Standardization : The controlled metadata vocabulary and open standards facilitate the exchange of data, text search, and digital archiving. Extensibility :"
