# Import dependencies and create frequently used functions

In [None]:
import pandas as pd
import requests
import os
import json
from typing import List
from util.webscraper import WebScraper
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import string
import numpy as np
import gensim.downloader as api
import gensim
from gensim.models import KeyedVectors
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import fasttext
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
import sentence_transformers


In [None]:
# define functions to load the FastText models
def download_file(url: str, file_path: str) -> None:
    """Download a file from a URL and save it locally.

    The download is skipped when ``file_path`` already exists. The data is
    streamed into a temporary ``<file_path>.part`` file first and only renamed
    to ``file_path`` once the transfer finished, so an interrupted download is
    never mistaken for a complete file on the next run.

    Args:
        url: Address of the file to download.
        file_path: Local path the file is stored at.

    Returns:
        None. Request errors are reported on stdout instead of being raised.
    """
    if os.path.isfile(file_path):
        print("File was already downloaded.")
        return None

    partial_path = f"{file_path}.part"
    try:
        # Create the target directory up front; open() would fail otherwise
        target_dir = os.path.dirname(file_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        # The timeout keeps a stalled connection from blocking the notebook forever
        with requests.get(url, stream=True, timeout=60) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        file.write(chunk)

        # Publish the file only after it was written completely
        os.replace(partial_path, file_path)
        print(f"The file has been downloaded and saved as: {file_path}")
    except requests.RequestException as e:
        print(f"An error occurred while downloading the file: {e}")
        # Do not leave a truncated file behind
        if os.path.exists(partial_path):
            os.remove(partial_path)
        
def load_word_vectors(file_path: str):
    """Read word vectors stored in the word2vec format.

    Args:
        file_path: Path of the vector file.

    Returns:
        The loaded vectors, or None if loading failed (the error is printed).
    """
    try:
        vectors = gensim.models.KeyedVectors.load_word2vec_format(file_path)
    except Exception as e:
        print(f"An error occurred while loading the vectors: {e}")
        return None
    else:
        print("Vectors loaded successfully.")
        return vectors


# Data Collection

First, we get the data from the API. As the API is not yet published, both the API-Url and the query to get information on edition-software need to be specified in your .env file. (consult the README for more information)

In [None]:
%load_ext dotenv
%dotenv

In [None]:
# Read the API location and the query from the environment (populated from the .env file)
api_url = os.environ['API_URL']
query = os.environ['QUERY']

# Request the data. The timeout prevents the cell from hanging on an unresponsive API,
# and raise_for_status() fails right here on HTTP errors instead of letting an error
# page reach the JSON parsing step.
api_response = requests.get(api_url + query, timeout=30)
api_response.raise_for_status()

Now that we got the data from the API, we can load it into a dataframe to prepare it to be used as a knowledge base for rag. 

In [None]:
# Parse the JSON body of the response and turn the parsed records into a dataframe
edition_software_info = pd.DataFrame(json.loads(api_response.text))
edition_software_info.info()

A brief inspection allows us to formulate some initial tasks and questions for this experiment.

- **Preprocessing:** As we can see, not a single entry contains an associated concept_doi. We might consider dropping the column.
- **Impact of using short descriptions only:** Three entries are missing the in depth description. We can assume that rag won't be too useful for these entries. 
- **Impact of additional information:** Only three have a description-url. Down the road, we need to evaluate, if adding info from this source improves the performance of the rag-system.

# Data Cleaning

### 1. Remove Artefacts

Both the `description` and `short_statement` columns seem to be of particular interest for the task at hand. To assess the necessary preprocessing steps, we'll need to take a closer look at them.

In [None]:
# Show the two text columns without truncating their content
text_columns = ["description", "short_statement"]
descriptions = edition_software_info[text_columns]
with pd.option_context('display.max_colwidth', None):
    display(descriptions.head())

As we can see, the `description` column contains some formatting artefacts like `\n` and markdown syntax like `**` and `#`. Let's clean them up.
While we're at it, we can also remove double whitespaces etc.

In [None]:
# Patterns are applied in order: runs of line breaks first, then markdown
# markers (* and #), dash separators surrounded by whitespace and "]]".
cleaning_patterns = ['\\n+', r'[*#]+|\s-+\s|]]']

description_clean = edition_software_info["description"]
for pattern in cleaning_patterns:
    # every match is replaced by a single blank
    description_clean = description_clean.str.replace(pattern, ' ', regex=True)
edition_software_info["description_clean"] = description_clean

# Compare the raw and the cleaned descriptions side by side
comparison_columns = ["brand_name", "description", "description_clean"]
with pd.option_context('display.max_colwidth', None):
    display(edition_software_info[comparison_columns].head())

### 2. Fill nan values

Before we continue preprocessing the data for later vectorization, we need to check for missing values and replace them with empty strings.

In [None]:
# Assign the filled column back instead of calling fillna(..., inplace=True) on the
# selected column: the in-place call works on an intermediate object (chained
# assignment), which pandas deprecates and which does not update the dataframe
# when copy-on-write is enabled.
edition_software_info["description_clean"] = edition_software_info["description_clean"].fillna("")

# Scrape Webpages

To provide additional context-information for the retrieval process, we'll scrape all webpages referenced in the software-description.

### 1. Get urls

First, we isolate the urls from our description.

In [None]:
# A url starts with "http://", "https://" or "www." (the dot is escaped so that it
# only matches a literal dot and not an arbitrary character after "www").
pattern = r"((?:https?:\/\/|w{3}\.)[\w\d%/.-]+)"

# extractall returns one row per match, indexed by (original row, match number)
urls = edition_software_info["description"].str.extractall(pattern)
# drop the match number, so that all urls of an entry share the entry's index
urls = urls.droplevel(1)
# join the unique urls of each entry; sorting keeps the order stable between runs
urls_grouped = urls.groupby(urls.index).agg(lambda x: ','.join(sorted(set(x))))
edition_software_info["urls"] = urls_grouped

with pd.option_context('display.max_colwidth', None):
    display(edition_software_info[["description_clean", "urls"]].head())

### 2. Scrape Webpages

Now we scrape the paragraphs from the webpages we found. 
The webscraper will take the list of urls associated with an entry and will save paragraphs from all webpages as a string in a column of our dataframe. 

**This might take some time**

In [None]:
# Scrape the <p> tags of every referenced page; "wikipedia" is passed as exclusion
webscraper = WebScraper(tags=["p"], exclude=["wikipedia"])
url_column = edition_software_info["urls"]
edition_software_info["webpages_text"] = url_column.apply(webscraper.scrape)

### 3. Inspect data

In [None]:
# Preview the extracted urls next to the text scraped from them
scraped_columns = ["urls", "webpages_text"]
edition_software_info[scraped_columns].head()

Now that the data is collected from the webpages, we can take a look at the average length of the texts received for each entry.

In [None]:
def _text_length(text):
    """Return the number of characters of ``text``; missing values count as 0."""
    return 0 if pd.isna(text) else len(text)


length = edition_software_info["webpages_text"].apply(_text_length)
# Summarise only the entries we actually collected text for
length[length > 0].describe()

Looking only at the entries that we were able to collect webpage text for, we have an average character count of about 15,000 per entry. 
The standard deviation is quite large compared to the mean, indicating that there is a high degree of variability in character counts.

The distribution is skewed towards entries with lower character counts, while some outliers with a high character counts pull the mean upwards.



# Data Augmentation

We can combine some information to increase the information density in our description. We'll start off by simply appending the `short statement` to the front of the description. 
In later steps we might add other information. 

In [None]:
# Prepend the short statement to the description.
# Missing values are replaced first: adding NaN to a string yields NaN, which would
# discard an existing description whenever the short statement is missing.
short_statements = edition_software_info["short_statement"].fillna("")
descriptions_clean = edition_software_info["description_clean"].fillna("")
# The blank keeps the last word of the statement and the first word of the
# description apart; strip() removes it again if one of the two parts is empty.
edition_software_info["description_clean"] = (short_statements + " " + descriptions_clean).str.strip()

# Preprocessing

Next, we can prepare our data for the information retrieval process. We'll focus on 'descriptions' for now. However, this process could be easily expanded to the webpage text we collected earlier.

### 1. Remove links

First, we'll remove all links from the descriptions.

In [None]:
# A link starts with "http://", "https://" or "www." (the dot is escaped so that it
# only matches a literal dot and not an arbitrary character after "www").
pattern = r"((?:https?:\/\/|w{3}\.)[\w\d%/.-]+)"
edition_software_info["description_clean"] = edition_software_info["description_clean"].str.replace(pattern, '', regex=True)
edition_software_info["description_clean"].head(3)

### 2. Replace missing descriptions

To allow for smoother text processing, we'll replace all NaN values in the relevant text columns with an empty string.

In [None]:
# Replace missing descriptions with empty strings and verify that none are left
description_clean = edition_software_info["description_clean"].fillna('')
edition_software_info["description_clean"] = description_clean
null = description_clean.isna().sum()
print(f"NaN remaining: {null}")

#### 4. Add Language Information

As the entries in our repository are in both english and german, we add information on the texts language to the dataset.

To do so, we'll use a fasttext-model for language identification, which can be found [here](https://fasttext.cc/docs/en/language-identification.html).

As the model was trained on UTF-8 data, it expects UTF-8 as input. This should be the case, as the API response is decoded into regular (Unicode) Python strings when it is loaded into the dataframe.



In [None]:
# Make sure the fastText language identification model is available, then load it once.

fasttext_path = os.path.join(os.getcwd(), 'models/lid.176.bin')

if os.path.isfile(fasttext_path):
    print("Model found.")
else:
    print("Model not found. Downloading...")
    url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
    # the target directory has to exist before the file can be written
    os.makedirs(os.path.dirname(fasttext_path), exist_ok=True)
    # download the model
    download_file(url, fasttext_path)
    # download_file only prints request errors, so check the result explicitly
    if not os.path.isfile(fasttext_path):
        raise FileNotFoundError(f"The language identification model could not be downloaded to: {fasttext_path}")
    print("Loading model from file...")

# load the model
language_detection_model = fasttext.load_model(fasttext_path)


Now we can use the model to identify the languages of our entries. This might take some time. 

In [None]:

fasttext_model = fasttext.load_model(fasttext_path)


def identify_language(text):
    """Return the first label the language identification model predicts for ``text``."""
    predicted_labels = fasttext_model.predict(text)[0]
    return predicted_labels[0]


# the model expects text without newlines, so replace them with blanks
edition_software_info.loc[:, "description_clean"] = edition_software_info["description_clean"].str.replace("\n", " ")

# detect the language of every description; empty texts get nan instead of a label
edition_software_info.loc[:, "description_lang"] = edition_software_info["description_clean"].apply(
    lambda text: np.nan if text == '' else identify_language(text)
)

# strip the label prefix from the predictions
edition_software_info.loc[:, "description_lang"] = edition_software_info["description_lang"].str.replace("__label__", "")

# show the new column next to the texts
language_columns = ["description", "description_lang", "short_statement"]
edition_software_info[language_columns].head(10)

#### 5. Chunking

Finally, we chunk longer entries into smaller paragraphs. We'll leave some overlap between chunks, so as to retain some context between chunks.

We start off with chunk sizes of 108 tokens and an overlap of 20, as 128 was identified to be an effective chunk size for RAG in this [blogpost](https://www.mattambrogi.com/posts/chunk-size-matters/) by Matt Ambrogi. Note that the `chunk` function below currently defaults to a chunk size of 226 and an overlap of 30.

**Both the chunk and overlap size are hyperparameters and should be fine-tuned for the task at hand.**


In [None]:
def chunk(text, chunk_size=226, overlap_size=30):
    """
    Splits the given text into chunks of whitespace-separated words with overlap.

    Consecutive chunks share ``overlap_size`` words, i.e. every chunk starts
    ``chunk_size - overlap_size`` words after the previous one.

    Args:
        text (str): The input text to be chunked.
        chunk_size (int, optional): The number of words per chunk. Defaults to 226.
        overlap_size (int, optional): The number of words shared by consecutive chunks. Defaults to 30.
    Returns:
        list: A list of chunks with overlap. Empty if the text contains no words.
    Raises:
        ValueError: If chunk_size is not positive or overlap_size is negative or
            not smaller than chunk_size.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer.")
    # An overlap >= chunk_size would give a step <= 0 and either crash range()
    # or silently produce no chunks at all.
    if not 0 <= overlap_size < chunk_size:
        raise ValueError("overlap_size must be non-negative and smaller than chunk_size.")

    words = text.split()
    step = chunk_size - overlap_size
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Stop once a chunk reached the end of the text; a further chunk would
        # only repeat the overlap.
        if start + chunk_size >= len(words):
            break
    return chunks

# Work on a copy, so the unchunked dataframe stays available
edition_software_info_chunked = edition_software_info.copy()
edition_software_info_chunked['description_clean_chunks'] = edition_software_info_chunked['description_clean'].apply(chunk)

# Give every chunk its own row (all other columns are repeated) and renumber the rows
edition_software_info_chunked = (
    edition_software_info_chunked
    .explode('description_clean_chunks')
    .reset_index(drop=True)
)

chunk_columns = ["brand_name", "description_clean", "description_clean_chunks"]
with pd.option_context('display.max_colwidth', None):
    display(edition_software_info_chunked[chunk_columns].head(10))


#### 5. Removing punctuation, stopwords and upper case letters.

As some vectorization methods need additional preprocessing steps, we'll create a new column for the descriptions with their punctuation, stopwords and upper case letters removed. As we have both English and German texts, we'll need to account for stopwords in both languages. 

In [None]:
def preprocess(stopwords: List[str], text: str) -> str:
    """
    Preprocesses the given text by converting it to lowercase, removing punctuation, and filtering out stopwords.
    Args:
        stopwords (List[str]): A collection of stopwords to be filtered out from the text.
        text (str): The input text to be preprocessed.
    Returns:
        str: The remaining words joined by single blanks. An empty string is
        returned for non-string input such as NaN or None.
    """
    # Exploding an entry without chunks yields NaN instead of a string;
    # treat such values as empty text rather than failing on .lower().
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    # Drop every ASCII punctuation character
    without_punctuation = lowered.translate(str.maketrans('', '', string.punctuation))
    return ' '.join(word for word in without_punctuation.split() if word not in stopwords)

# Collect the stopword lists of both languages and merge them into one set
stopwords_english, stopwords_german = (
    set(stopwords.words(language)) for language in ('english', 'german')
)
stopwords_combined = stopwords_german | stopwords_english

In [None]:
# Preprocess every chunk with the combined English/German stopword set
chunk_texts = edition_software_info_chunked["description_clean_chunks"]
edition_software_info_chunked["description_preprocessed_chunks"] = chunk_texts.apply(
    lambda chunk_text: preprocess(stopwords_combined, chunk_text)
)
edition_software_info_chunked["description_preprocessed_chunks"].head()

# Export dataset

Before we move on to the vectorisations, we'll save the dataset to our drive.

In [None]:
current_dir = os.getcwd()
path = os.path.join(current_dir, 'data/edition_software_info.csv')
# to_csv does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(path), exist_ok=True)
# NOTE(review): this exports the unchunked dataframe; confirm that the chunked
# dataframe is not the one that should be saved here.
edition_software_info.to_csv(path)

# Vectorization 1: TFIDF

We'll start off with a simple TF-IDF vectorization.

**Term Frequency-Inverse Document Frequency (TF-IDF)** is a weighting scheme that weights the cells of a term-document matrix by their potential to be discriminatory.

To do so, we first calculate the **term frequency (TF)**. The term frequency represents the number of instances of a given word $t$ in a document $d$.

$$
\text{TF}(t, d) = \frac{\text{Count of } t \text{ in } d}{\text{Total number of words in } d}
$$

This term frequency is then multiplied by the **inverse document frequency (IDF)**. The IDF is calculated by counting all documents that contain a term $t$ (the document frequency $\text{df}(t)$). Then, we divide the total number of documents $N$ in the corpus by $\text{df}(t)$.

This inverse frequency is chosen over the regular frequency to **downweight** terms that appear in many documents, since these terms are less likely to be useful for distinguishing between documents.

Usually, we also take the logarithm of the IDF to smooth out the very large values that can occur when a term appears in only a few documents. This ensures that rare terms are not excessively weighted.

$$
\text{df}(t) = \text{Document frequency of a term } t
$$
$$
N = \text{Number of documents}
$$
$$
\text{IDF}(t) = \log\left(\frac{N}{\text{df}(t)}\right)
$$

Finally, we calculate the **TF-IDF** by multiplying the term frequency $\text{TF}(t, d)$ with the inverse document frequency $\text{IDF}(t)$.

$$
\text{TF-IDF}(t, d) = \text{TF}(t, d) \times \text{IDF}(t)
$$

The resulting value can be interpreted as a measure of the importance of the term in a document relative to the entire corpus. Terms that are frequent in a document but rare across the corpus will have higher TF-IDF scores, indicating their importance.


**N-grams:**

To capture not just the importance of single words but also some of the **context** in which they are used, we can apply TF-IDF to **n-grams**. N-grams are contiguous sequences of $n$ words that appear together in a text. The size of the sequence, $n$, is a hyperparameter that can be adjusted depending on the specific task. 


### 1: Fit TF-IDF Vectorizer
First, we fit the vectorizer on the preprocessed descriptions. 
This way, the vectorizer can transform text into numerical feature vectors based on the learned vocabulary and its distribution over documents.

In [None]:
# Fit on the preprocessed chunks, using n-grams of one to four words as terms
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 4))
preprocessed_chunks = edition_software_info_chunked['description_preprocessed_chunks']
tfidf_matrix = tfidf_vectorizer.fit_transform(preprocessed_chunks)

# dense, labelled copy of the matrix for display purposes
tfidf_matrix_beautify = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
tfidf_matrix_beautify

### 2: Inspect the tf-idf representations

Each column in this dataframe is a unique term (a single word or an n-gram of up to four words), while each row is a document chunk. The cells denote the number of occurrences of a term in a document, weighted by the term's potential to be distinctive.

Let's take a look at the tf-idf filtered words for each description (You can find them in the column "tf_idf_filtered_words")

In [None]:
# Minimum TF-IDF score a term needs in order to be listed for a document
threshold = 0.11


def filter_words_by_threshold(row, threshold):
    """Return the (term, score) pairs of one TF-IDF row that score above ``threshold``, highest score first."""
    scored_terms = []
    for term, score in zip(tfidf_matrix_beautify.columns, row):
        if score > threshold:
            scored_terms.append((term, score))
    scored_terms.sort(key=lambda pair: pair[1], reverse=True)
    return scored_terms


# Evaluate every document (row) of the TF-IDF dataframe
filtered_words = tfidf_matrix_beautify.apply(lambda tfidf_row: filter_words_by_threshold(tfidf_row, threshold), axis=1)

# Put the filtered terms next to the chunk they belong to
filtered_words_df = pd.DataFrame(filtered_words, columns=["tf_idf_filtered_words"])
document_columns = edition_software_info_chunked[["brand_name", "description_clean_chunks"]]
tfidf_display = pd.concat([document_columns, filtered_words_df], axis=1)

with pd.option_context('display.max_colwidth', None):
    display(tfidf_display[["brand_name", "description_clean_chunks", "tf_idf_filtered_words"]].head(5))

### 3. Test tfidf-representation

This cell will return the most relevant documents from our dataset based on a comparison of their tf-idf representations and a query. The query can be changed.

In [None]:
query = 'I want to transcribe and annotate a manuscript'
# The query has to pass through the same preprocessing as the documents
query = preprocess(stopwords_combined, query)
# Project the query into the TF-IDF space learned from the documents
query_tfidf = tfidf_vectorizer.transform([query])

# One row of similarities: the query compared with every document chunk
similarities = cosine_similarity(query_tfidf, tfidf_matrix)
similarity_df = pd.DataFrame({'similarity_score': similarities[0]})

# Attach the scores to the chunks and rank the chunks by similarity
chunk_columns = edition_software_info_chunked[["brand_name", "description_clean_chunks"]]
result_df = pd.concat([chunk_columns, similarity_df], axis=1)
result_df_sorted = result_df.sort_values(by='similarity_score', ascending=False)

# show the three chunks that are most similar to the query
top_columns = ["brand_name", 'description_clean_chunks', 'similarity_score']
with pd.option_context('display.max_colwidth', None):
    display(result_df_sorted[top_columns].head(3))

#### 4. Save vectorisations and the TFIDF-vectorizer

Now we can save the vectorisations and the vectorizer to be later used in our RAG-Pipeline.

In [None]:
# NOTE(review): tfidf_matrix is the object returned by fit_transform; if that is a
# SciPy sparse matrix, np.save presumably stores it as a pickled object array -
# confirm how the file is loaded in the RAG pipeline.
path = "vectorisations/tfidf.npy"
np.save(file=path, arr=tfidf_matrix)

In [None]:
import pickle

path = os.path.join(os.getcwd(), "models/tfidf_vectorizer.pickle")
# The context manager closes (and thereby flushes) the file even if pickling fails;
# the previous open(...) call left the handle open.
with open(path, "wb") as vectorizer_file:
    pickle.dump(tfidf_vectorizer, vectorizer_file)

# Vectorization 2: Aggregated Word2Vec


Next, we'll create document representations by aggregating the word2vec embeddings of each word in a description. 

Word2vec encodes the meaning of the words by capturing their semantic relationships based on the context in which they appear. By aggregating the word2vec embeddings of each word in a description, we can create a document representation that retains the semantic information and provides a more nuanced understanding of the content.

From a computational perspective, these representations are shorter and denser than tf-idf representations, making them more suitable for computations such as similarity measures, clustering, or classification tasks. The dense nature of word2vec embeddings allows for efficient storage and faster processing compared to sparse representations like tf-idf. Additionally, because word2vec captures the meaning and context of words, it can provide more meaningful insights into the relationships between different documents or terms.


#### 1. Load the pretrained model

In [None]:
current_path = os.getcwd()
path = os.path.join(current_path, "models/word2vec-google-news-300.bin")

# Download the model on first use and keep a local copy; afterwards load the copy.
if not os.path.isfile(path):
    print("Model not found. Downloading...")
    word2vec_model = api.load("word2vec-google-news-300")
    word2vec_model.save(path)
else:
    print("Model found. Loading...")
    word2vec_model = KeyedVectors.load(path)
    


In [None]:
""" #TODO: Add some preprocessing
def preprocess_word2vec(text: str) -> str:
    pass
"""

#### 2. Create the document representations

In [None]:
def get_word2vec_vector(words, model):
    """Average the embeddings of all tokens of a text that the model knows.

    Args:
        words (str): Text that is split into tokens on whitespace.
        model: Embedding lookup supporting ``in``, indexing by token and ``vector_size``.
    Returns:
        numpy.ndarray: The mean of the token vectors, or a zero vector of
        length ``model.vector_size`` if no token is in the vocabulary.
    """
    known_vectors = []
    for token in words.split():
        # tokens outside the model's vocabulary are skipped
        if token in model:
            known_vectors.append(model[token])

    if len(known_vectors) == 0:
        return np.zeros(model.vector_size)

    return np.mean(known_vectors, axis=0)

# Build one aggregated vector per preprocessed chunk
preprocessed_chunks = edition_software_info_chunked['description_preprocessed_chunks']
word2vec = preprocessed_chunks.apply(lambda chunk_text: get_word2vec_vector(chunk_text, word2vec_model))

# Stack the per-chunk vectors into a 2D array (needed for the cosine similarity later on)
word2vec_array = np.array(list(word2vec))
len(word2vec_array)

#### 3. Test Word2Vec Representation

In [None]:
query = 'I need a search capability for my edition of letters'
# The query has to pass through the same preprocessing as the documents
query = preprocess(stopwords_combined, query)
# Embed the query and shape it as a 2D array with a single row
query_word2vec = get_word2vec_vector(query, word2vec_model).reshape(1, -1)

# Compare the query with every document chunk
similarities = cosine_similarity(query_word2vec, word2vec_array)
similarity_df = pd.DataFrame({'similarity_score': similarities.flatten()})

# Attach the scores to the chunks and rank the chunks by similarity
chunk_columns = edition_software_info_chunked[["brand_name", "description_clean_chunks"]]
result_df = pd.concat([chunk_columns, similarity_df], axis=1)
result_df_sorted = result_df.sort_values(by='similarity_score', ascending=False)

# show the five chunks that are most similar to the query
top_columns = ['brand_name', 'description_clean_chunks', 'similarity_score']
with pd.option_context('display.max_colwidth', None):
    display(result_df_sorted[top_columns].head(5))

#### 4. Save the vectorisations

Now we can save the vectorisations for later use in RAG.

In [None]:
# Store the document vectors as a .npy file
path = "vectorisations/word2vec.npy"
word2vec_vectors_to_store = np.array(word2vec_array)
np.save(path, word2vec_vectors_to_store)

# Vectorization 3: Aggregated FastText


One downside of pretrained Word2Vec representations is their inability to handle words not contained in their vocabulary. 

FastText overcomes this issue by representing words not only as embeddings but also as collections of embedded character n-grams. This approach allows FastText to generate meaningful word vectors for previously unseen words, which is particularly useful when dealing with highly specialized terminologies. In our dataset, which contains such specialized language, FastText may therefore offer more reliable performance compared to traditional Word2Vec models.


### 1. Load the models

FastText provides pre-aligned word vectors, meaning that word vectors for different languages (like German and English) have already been mapped into a common vector space. This allows words with similar meanings across different languages to have similar vector representations, which is crucial when working with multilingual datasets.

Since our dataset contains both German and English texts, we need to download the pre-aligned FastText models for these two languages.


In [None]:
import zipfile

def unzip_file(zip_file_path: str, extract_to: str) -> None:
    """Extract all members of a zip archive into a directory.

    Args:
        zip_file_path: Path of the archive.
        extract_to: Directory the members are extracted to.

    A file that is not a valid zip archive is reported on stdout instead of
    raising.
    """
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as archive:
            archive.extractall(extract_to)
    except zipfile.BadZipFile as e:
        print(f"Error while unzipping the file: {e}")
    else:
        print(f"Unzipped {zip_file_path} to {extract_to}")

In [None]:
# Check if the english model file exists. If not, download and unzip it first.
# The model is then loaded from the Facebook .bin file in both cases.
# This might take a while

current_path = os.getcwd()
models_dir = os.path.join(current_path, "models")
fasttext_eng_zip_path = os.path.join(models_dir, "wiki.en.zip")
fasttext_eng_path_vec = os.path.join(models_dir, "wiki.en.vec")
fasttext_eng_path_bin = os.path.join(models_dir, "wiki.en.bin")

if os.path.isfile(fasttext_eng_path_bin):
    print("Model found. Loading...")

else:
    print("Model not found. Downloading...")
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip"
    # the target directory has to exist before the archive can be written
    os.makedirs(models_dir, exist_ok=True)
    # download the models
    download_file(url, fasttext_eng_zip_path)

    print("Unzipping the file...")
    unzip_file(fasttext_eng_zip_path, models_dir)

    # download_file and unzip_file only print their errors, so check the result explicitly
    if not os.path.isfile(fasttext_eng_path_bin):
        raise FileNotFoundError(f"The FastText model was not found after download and extraction: {fasttext_eng_path_bin}")

    print("Loading model from file...")

# Load the full model, including subword information.
# The model is deliberately not saved back to fasttext_eng_path_bin: gensim's save()
# writes its own format, which load_facebook_model could not read on the next run.
aligned_vectors_eng = gensim.models.fasttext.load_facebook_model(fasttext_eng_path_bin)

if aligned_vectors_eng is None:
    raise ValueError("The FastText model was not loaded properly.")

In [None]:
# Check if the german model file exists. If so, load it. If not, download it and convert it to .bin for faster loading in the future.
# This might take a while

fasttext_de_path_bin = os.path.join(current_path, "models/wiki_de_align.bin")
fasttext_de_path_vec = os.path.join(current_path, "models/wiki_de_align.vec")

if os.path.isfile(fasttext_de_path_bin):
    print("Model found. Loading...")
    aligned_vectors_de = KeyedVectors.load(fasttext_de_path_bin)

else:
    print("Model not found. Downloading...")
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.de.align.vec"
    # download the model
    download_file(url, fasttext_de_path_vec)
    # load the model
    print("Loading model from file...")
    aligned_vectors_de = load_word_vectors(fasttext_de_path_vec)
    # load_word_vectors returns None on failure; only save vectors that were
    # actually loaded, so the check below reports the problem instead of an
    # AttributeError raised by None.save(...)
    if aligned_vectors_de is not None:
        # save the model as binary to reduce loading time in the future
        aligned_vectors_de.save(fasttext_de_path_bin)

if aligned_vectors_de is None:
    raise ValueError("The FastText model or vectors were not loaded properly.")


Now that we loaded both models, let's check if they are properly aligned. 

To do so, we'll pick an english word, get its english vector representation and return the most similar word vector in the german vector space.

In [None]:
# Look up an english word in the english model ...
word_english = 'skyscraper'
word_vector_in_english = aligned_vectors_eng.wv[word_english]

# ... and search the german vectors for the entries closest to it
closest_german_words = aligned_vectors_de.most_similar(positive=[word_vector_in_english])
print(f"German word vector closest to {word_english}:", closest_german_words)

#### 2. Create Document Representations

Now that we confirmed, that the vector spaces are properly aligned, we can create the document representations. 
As the pre-aligned German model does not contain subword information, we'll use the subword information contained in the English model to embed unknown words in both languages.

We'll use the preprocessed descriptions we created earlier.

Additionally, we'll print the words for which we created new word vectors using the English subword information. 

In [None]:
def _lookup_vector(vectors, word):
    """Return the vector stored for ``word``, or None if the lookup raises a KeyError."""
    try:
        return vectors[word]
    except KeyError:
        return None


def get_fasttext_vector(row, aligned_vectors_de=None, aligned_vectors_eng=None):
    """
    Calculates the FastText vector representation for a given row.

    The words of the row's "description_preprocessed_chunks" entry are looked up
    in the model matching the row's "description_lang" entry and averaged.

    Parameters:
    - row: A row of data supporting ``.get`` (e.g. a pandas Series or a dict).
    - aligned_vectors_de: Aligned FastText vectors for the German language. Default is None.
    - aligned_vectors_eng: FastText model for the English language; its ``.wv`` attribute is used for lookups. Default is None.
    Returns:
    - numpy.ndarray: The mean of the word vectors, or a zero vector (see notes).
    Note:
    - If the language is not specified or not supported (only "en" and "de" are supported), it returns a zero vector.
    - If the model for the row's language was not passed, it returns a zero vector.
    - If a German word is not found in the German vectors, it tries to create a vector based on the English model.
    - Words for which no vector can be obtained are reported and skipped.
    - If no vectors are found, it returns a zero vector.
    """
    # dimensionality of the zero vectors; 300 is the fallback if no model is passed
    if aligned_vectors_de is not None:
        vector_size = aligned_vectors_de.vector_size
    elif aligned_vectors_eng is not None:
        vector_size = aligned_vectors_eng.wv.vector_size
    else:
        vector_size = 300

    # check if language is valid
    lang = row.get("description_lang")
    if pd.isna(lang) or lang not in ("en", "de"):
        return np.zeros(vector_size)

    # missing texts (e.g. NaN) are treated as empty
    text = row.get("description_preprocessed_chunks", "")
    words = text.lower().split() if isinstance(text, str) else []

    # pick the lookup tables based on the language
    subword_vectors = aligned_vectors_eng.wv if aligned_vectors_eng is not None else None
    if lang == "de":
        primary_vectors, fallback_vectors = aligned_vectors_de, subword_vectors
    else:
        primary_vectors, fallback_vectors = subword_vectors, None

    if primary_vectors is None:
        return np.zeros(vector_size)

    vectors = []
    for word in words:
        vector = _lookup_vector(primary_vectors, word)
        if vector is None and fallback_vectors is not None:
            vector = _lookup_vector(fallback_vectors, word)
            if vector is not None:
                print(f"Created Vector based on Subword Information for: {word}")
        if vector is None:
            # Repeating the failed lookup would only raise the same KeyError
            # again, so the word is skipped instead.
            print(f"Missing Vector for: {word}")
            continue
        vectors.append(vector)

    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(vector_size)


# Apply the function to create aggregated vectors
# NOTE(review): this variable shadows the imported `fasttext` module, so cells that call
# fasttext.load_model(...) fail when they are re-run after this cell. The name is kept
# because other cells may rely on it - consider renaming it.
fasttext = edition_software_info_chunked.apply(lambda x: get_fasttext_vector(x, aligned_vectors_de, aligned_vectors_eng), axis=1)

# Convert the Series of 1D arrays to a 2D numpy array (to calculate the cosine similarity later on).
# The array has to be built from the FastText vectors created above; it was built
# from the word2vec vectors before.
fasttext_array = np.array(fasttext.tolist())
len(fasttext_array)

#### 3. Test the FastText Representations

In [None]:
query = 'I want to transcribe a manuscript'

# Detect the language on the raw query, to decide which model to use
if query == '':
    query_lang = np.nan
else:
    query_lang = identify_language(query).replace("__label__", "")

# preprocess the query like the documents
query = preprocess(stopwords_combined, query)

# get_fasttext_vector reads its input via the dataframe's column names,
# so the query is wrapped in a Series using the same keys
query_information = pd.Series({
    "description_lang": query_lang,
    "description_preprocessed_chunks": query
})

# Embed the query and shape it as a 2D array with a single row
query_fasttext = get_fasttext_vector(query_information, aligned_vectors_de, aligned_vectors_eng).reshape(1, -1)

# Compare the query with every document chunk
similarities = cosine_similarity(query_fasttext, fasttext_array)
similarity_df = pd.DataFrame({'similarity_score': similarities.flatten()})

# Attach the scores to the chunks and rank the chunks by similarity
chunk_columns = edition_software_info_chunked[["brand_name", "description_clean_chunks"]]
result_df = pd.concat([chunk_columns, similarity_df], axis=1)
result_df_sorted = result_df.sort_values(by='similarity_score', ascending=False)

# show the five chunks that are most similar to the query
top_columns = ['brand_name', 'description_clean_chunks', 'similarity_score']
with pd.option_context('display.max_colwidth', None):
    display(result_df_sorted[top_columns].head(5))

#### 5. Save the vectorisations

Now we save the vectors we created for later use in RAG.

In [None]:
# Store the document vectors as a .npy file
path = "vectorisations/fasttext.npy"
fasttext_vectors_to_store = np.array(fasttext_array)
np.save(path, fasttext_vectors_to_store)

# Vectorization 4: SBERT

**SBERT**: 

In contrast to the output of regular embedding models, word vectors created using BERT models are contextualized. This means that they can generate multiple word embeddings for the same word, depending on the meaning it takes on in a certain context. "Bank," for example, will have two different representations—one as a river's shore and another as a place to store money.

Since BERT models by themselves are notoriously bad for semantic similarity tasks applied to sentence- or paragraph-level vectors, we will use  SBERT to create document representations instead. These models also create contextualized embeddings but are specifically trained with semantic similarity in mind: A triplet loss function is used to minimize the distance between an "anchor point" and a positive sample while maximizing the distance to a negative sample. This forces sentence transformers to produce a vector space where semantically similar sentences are close together, while sentence embeddings of semantically dissimilar sentences are far apart.

Specifically, we will use the sentence-transformers library, which builds on the original [SBERT paper](https://arxiv.org/abs/1908.10084).

**Reranking:**

As it is suggested in the documentation of the sentence transformer module, the similarities calculated using the SBERT-representations will be reranked using a Cross-Encoder. 
Cross-Encoders tackle the task of calculating similarity as a classification task, classifying two sentences as either "relevant" or "not relevant" in relation to one another. We will not rerank all similarity scores, but only the entries most similar to our query.

Detailed information on this retrieve & re-rank process and its benefits can be found [here](https://www.sbert.net/examples/applications/retrieve_rerank/README.html).


#### 1. Load the models

In [None]:
# Download SBERT model or load them from drive
sbert_path = os.path.join(os.getcwd(), "models/sbert")
downloaded = os.path.isdir(sbert_path)

if not downloaded:
    print("Downloading Sentence Transformer...")
    sbert_model = sentence_transformers.SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    sbert_model.save(sbert_path)
else:
    print("Load Sentence Transformer from drive...")
    sbert_model = sentence_transformers.SentenceTransformer(sbert_path)

# Download Cross Encoder or load them from drive
# NOTE: earlier versions of this cell stored a copy of the SBERT model under
# "models/cross". If that directory was created by such a version, delete it so
# that the actual Cross Encoder is downloaded again.

cross_encoder_path = os.path.join(os.getcwd(), "models/cross")
downloaded = os.path.isdir(cross_encoder_path)

if not downloaded:
    print("Downloading Cross Encoder...")
    cross_encoder_model = sentence_transformers.CrossEncoder("corrius/cross-encoder-mmarco-mMiniLMv2-L12-H384-v1")
    # save the Cross Encoder itself (not the SBERT model) to the Cross Encoder path
    cross_encoder_model.save(cross_encoder_path)
else:
    print("Load Cross Encoder from drive...")
    # load a CrossEncoder from its own directory (not a SentenceTransformer from sbert_path)
    cross_encoder_model = sentence_transformers.CrossEncoder(cross_encoder_path)

#### 2. Create Document Representations

As creating SBERT embeddings may take some time, we'll first create a function to parallelize the process. The parameter num_chunks can be increased to control the number of concurrent embedding processes.

In [None]:
# Configure the parallelism of the tokenizers via their environment variable
os.environ.update({"TOKENIZERS_PARALLELISM": "true"})

def get_sbert_embeddings(text, model):
    """
    Encode a single text with a Sentence-BERT model.

    Missing values (anything ``pd.isna`` flags) and empty or whitespace-only
    strings are never passed to the model; they are mapped to an all-zero
    vector whose length is the model's sentence embedding dimension.

    Parameters:
    text (str): The input text to encode.
    model: The Sentence-BERT model to use for encoding.
    Returns:
    numpy.ndarray: The embedding of the text, or the zero vector for
    missing / blank input.
    """
    embedding_dim = model.get_sentence_embedding_dimension()

    # Only real, non-blank text is worth sending through the model
    has_content = not pd.isna(text) and text.strip() != ''
    if has_content:
        return model.encode(text, convert_to_tensor=False)

    return np.zeros((embedding_dim,))


def compute_sbert_embeddings_in_parallel(df, text_column, model, num_chunks=3):
    """
    Compute SBERT embeddings in parallel for the given text column in the DataFrame.

    The texts are split into contiguous chunks, each chunk is embedded by its
    own joblib worker via ``get_sbert_embeddings``, and the per-chunk results
    are flattened back into a single list in the original row order.

    Args:
        df (pd.DataFrame): The DataFrame containing the text data.
        text_column (str): The column name of the text data.
        model (Any): SBERT model for generating embeddings.
        num_chunks (int): The number of chunks to split the data for parallel processing.
            If there are fewer texts than chunks, one chunk per text is used.

    Returns:
        list: One embedding per row of ``df`` (each a list of floats), in row
        order. An empty list if ``df`` has no rows.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        raise ValueError("num_chunks must be at least 1")

    texts = df[text_column].tolist()
    if not texts:
        return []

    # Never create more chunks than texts: an empty chunk yields an empty
    # result list, which could not be combined with the 2D results of the
    # non-empty chunks and would only spawn idle workers.
    n_chunks = min(num_chunks, len(texts))

    # Function to process each chunk of data
    def process_chunk(chunk):
        return [get_sbert_embeddings(text, model) for text in chunk]

    # Split the row positions (not the pandas object itself) into contiguous
    # chunks and hand plain lists of texts to the workers
    index_chunks = np.array_split(np.arange(len(texts)), n_chunks)
    text_chunks = [[texts[i] for i in positions] for positions in index_chunks]

    # Process each chunk in parallel; joblib returns results in input order
    results = Parallel(n_jobs=n_chunks)(
        delayed(process_chunk)(chunk) for chunk in text_chunks
    )

    # Flatten the per-chunk results into one list of plain float lists
    return [
        np.asarray(embedding).tolist()
        for chunk_result in results
        for embedding in chunk_result
    ]

In [None]:
# Embed every cleaned description chunk (the document side of the search)
# with the SBERT model; the query itself is embedded later.
sbert_array = compute_sbert_embeddings_in_parallel(
    edition_software_info_chunked,
    "description_clean_chunks",
    sbert_model,
    num_chunks=3,
)

#### 3. Test SBERT-Embeddings

In this cell, we can test similarities between a query and the SBERT-Embeddings, without reranking of the results.

In [None]:
# Alternative German query (the SBERT model loaded above is a multilingual one):
#query = 'Ich muss verschiedene Stadien eines Manuskripts vergleichen und verschiedene Versionen desselben Schriftstückes nebeneinander darstellen'
query = 'I want to compare different versions of a text to see how it developed over time'

query_sbert = get_sbert_embeddings(query, sbert_model)

# Reshape the query vector to be a 2D array with one row
query_sbert = query_sbert.reshape(1, -1)

# Compute cosine similarity between the query and the documents
similarities = cosine_similarity(query_sbert, sbert_array)

similarity_df = pd.DataFrame({
    'similarity_score': similarities.flatten()
})

# Attach the scores by position. sbert_array was computed row by row from
# edition_software_info_chunked, so score i belongs to row i. A column-wise
# pd.concat would instead align on index labels and mis-assign scores (or add
# NaN rows) whenever that DataFrame does not carry a default 0..n-1 index.
result_df = edition_software_info_chunked[["brand_name","description_clean_chunks"]].assign(
    similarity_score=similarities.flatten()
)
result_df_sorted = result_df.sort_values(by='similarity_score', ascending=False)

# print the top 5 descriptions that might be relevant to our query
with pd.option_context('display.max_colwidth', None):
    display(result_df_sorted[['brand_name','description_clean_chunks', 'similarity_score']].head(5))

#### 4. Test reranked SBERT-Representations

Now, we can apply reranking to the similarities calculated above and compare the results.

In [None]:
# Rank every chunk by its SBERT cosine similarity, best match first
result_df_sorted = result_df.sort_values(by='similarity_score', ascending=False)

# Only the n best candidates (e.g. the top 20) are handed to the Cross-Encoder
n = 20
top_n_results = result_df_sorted.head(n)

# The Cross-Encoder scores each (query, document description) pair jointly
model_inputs = [
    [query, candidate_text]
    for candidate_text in top_n_results["description_clean_chunks"]
]

# One relevance score per candidate pair
scores = cross_encoder_model.predict(model_inputs)

# Candidate table with a fresh 0..n-1 index, the SBERT score kept under a
# new name and the Cross-Encoder score added next to it
reranked_results = (
    top_n_results[["brand_name", "description_clean_chunks", "similarity_score"]]
    .rename(columns={"similarity_score": "initial_similarity_score"})
    .reset_index(drop=True)
    .assign(rerank_score=scores)
)

# Best reranked candidates first
reranked_results_sorted = reranked_results.sort_values(by='rerank_score', ascending=False)

# Show the five best reranked candidates with untruncated descriptions
with pd.option_context('display.max_colwidth', None):
    display(
        reranked_results_sorted[
            ['brand_name', 'description_clean_chunks', 'initial_similarity_score', 'rerank_score']
        ].head(5)
    )

Now, we can manually inspect the results. 

After testing with different queries, we can conclude that reranking provides more relevant results, but takes considerably longer than just calculating the similarities across vectors.

Further testing should be conducted by domain experts to evaluate whether the trade-off between performance and quality is justifiable.

#### 5. Save the vectorisations

Now we save the vectors we created for later use in RAG.

In [None]:
# Store the SBERT document embeddings as a NumPy array file
path = "vectorisations/sbert.npy"

# np.save does not create missing directories, so make sure the target folder exists
os.makedirs(os.path.dirname(path), exist_ok=True)
np.save(path, np.array(sbert_array))