In [None]:
import os
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors
import sentence_transformers
import numpy as np
import requests
import pickle
import pandas as pd
import zipfile
import fasttext
from langchain_ollama import ChatOllama
import pandas as pd
from nltk.corpus import stopwords
import string


# RAG-Experiment

In this notebook, we'll use the vectorizations we created in the notebook `1_vectorisation` to provide relevant information from the software repository in response to a user query. 
To do so, we'll:

1. Load the models, the data and the vector representations of said data. 

2. Load and prepare the LLM we want to use for RAG. 



## 1. Load the data

First, we'll load the models, the chunked descriptions from our software repository and the vector representations we prepared in the notebook `1_vectorisation`.

If the data and vectorizations have not yet been created, please run the `1_vectorisation` notebook first. Note that this process may take some time.

### 1. Load Models

First we'll load the different models, as we need them to vectorize the queries.

Once the models are loaded, we'll create functions that use them to vectorize the queries.

#### 1. Load the TFIDF-Vectorizer + create a function to vectorize the query

In [None]:
# Location of the pickled TF-IDF vectorizer (presumably written by the
# `1_vectorisation` notebook - confirm).
path = os.path.join(os.getcwd(), "models/tfidf_vectorizer.pickle")

# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only load pickles that were produced by this project.
with open(path, "rb") as file:
    tfidf_vectorizer = pickle.load(file)


def get_tfidf_vector(query):
    """
    Vectorize a single query with the loaded TF-IDF vectorizer.
    Parameters:
    query (str): The query text.
    Returns:
    The result of `tfidf_vectorizer.transform` for a one-element batch
    containing the query.
    """
    single_query_batch = [query]
    return tfidf_vectorizer.transform(single_query_batch)


print("Model sucessfully loaded")

#### 2. Load the Word2Vec model + create a function to vectorize the query

In [None]:
current_path = os.getcwd()
path = os.path.join(current_path, "models/word2vec-google-news-300.bin")

# Fetch the vectors through the gensim downloader only when no local copy
# exists yet, and store them locally so later runs can load them from disk.
if not os.path.isfile(path):
    print("Model not found. Downloading...")
    word2vec_model = api.load("word2vec-google-news-300")
    word2vec_model.save(path)
else:
    print("Model found. Loading...")
    word2vec_model = KeyedVectors.load(path)

print("Model loaded sucessfully")

In [None]:
# create a function to vectorize the query    
def get_word2vec_vector(query, model):
    """
    Build a query embedding by averaging the vectors of its known words.
    Parameters:
    query (str): The query text; it is split on whitespace.
    model: Object supporting `word in model`, `model[word]` and `model.vector_size`.
    Returns:
    numpy.ndarray: The element-wise mean of the vectors of all words found in
    the model, or a zero vector of length `model.vector_size` if none is found.
    """
    # Look up only the tokens the model knows; unknown tokens are ignored.
    known_vectors = [model[token] for token in query.split() if token in model]

    if known_vectors:
        return np.mean(known_vectors, axis=0)

    # No token of the query is part of the vocabulary.
    return np.zeros(model.vector_size)

#### 3. Load the Fast-Text models + create a function to vectorize the query

In [None]:
### CREATE HELPER FUNCTIONS

# define functions to load FastText models
def download_file(url: str, file_path: str, timeout: float = 30.0, chunk_size: int = 1024 * 1024) -> None:
    """
    Download a file from a URL and save it locally.

    The payload is streamed into `<file_path>.part` and only renamed to
    `file_path` once the download has completed. An interrupted download
    therefore never leaves a truncated file behind that a later call would
    mistake for a finished one.

    Parameters:
    url (str): The URL to download from.
    file_path (str): Destination path. If a file already exists there, nothing is downloaded.
    timeout (float): Timeout in seconds passed to `requests.get`.
    chunk_size (int): Number of bytes requested per streamed chunk.
    Returns:
    None. Request errors are reported via `print` and not re-raised.
    """
    if os.path.isfile(file_path):
        print("File was already downloaded.")
        return None

    partial_path = f"{file_path}.part"
    try:
        # The context manager releases the connection even if streaming fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Check if the request was successful
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:
                        file.write(chunk)
        # Publish the file under its final name only after it is complete.
        os.replace(partial_path, file_path)
        print(f"The file has been downloaded and saved as: {file_path}")
    except requests.RequestException as e:
        print(f"An error occurred while downloading the file: {e}")
    finally:
        # Whatever went wrong, do not leave a partial download on disk.
        if os.path.exists(partial_path):
            os.remove(partial_path)

# define function to unzip models
def unzip_file(zip_file_path: str, extract_to: str) -> None:
    """
    Extract every member of a zip archive into a target directory.

    Parameters:
    zip_file_path (str): Path of the archive to extract.
    extract_to (str): Directory that receives the extracted files.
    Returns:
    None. A `zipfile.BadZipFile` error is reported via `print` and not re-raised.
    """
    try:
        with zipfile.ZipFile(zip_file_path, mode='r') as archive:
            archive.extractall(path=extract_to)
    except zipfile.BadZipFile as error:
        print(f"Error while unzipping the file: {error}")
    else:
        print(f"Unzipped {zip_file_path} to {extract_to}")

# define a function to load word vectors from a file
def load_word_vectors(file_path: str):
    """
    Load word vectors stored in word2vec format.

    Parameters:
    file_path (str): Path of the vector file.
    Returns:
    The object returned by `KeyedVectors.load_word2vec_format`, or None if
    loading raised any exception (the error is reported via `print`).
    """
    try:
        keyed_vectors = gensim.models.KeyedVectors.load_word2vec_format(file_path)
        print("Vectors loaded successfully.")
    except Exception as error:
        print(f"An error occurred while loading the vectors: {error}")
        return None
    return keyed_vectors

### LOAD MODELS

# download or load the fasttext model for language detection
langident_path = os.path.join(os.getcwd(), 'models/lid.176.bin')

if os.path.isfile(langident_path):
    print("Model found.")
else:
    print("Model not found. Downloading...")
    url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
    # download the model
    download_file(url, langident_path)
    # download_file only prints on failure, so verify the result explicitly
    if not os.path.isfile(langident_path):
        raise FileNotFoundError(f"The language identification model could not be downloaded to: {langident_path}")
    print("Loading model from file...")

# Load the model exactly once; previously the same file was read from disk twice.
language_identification_model = fasttext.load_model(langident_path)
# Second name kept so that code referring to `language_detection_model` still works.
language_detection_model = language_identification_model

print("Language identification model loaded sucessfully.")

# Check if the English FastText model (Facebook binary format) exists. If so, load it.
# If not, download the archive and extract it first. This might take a while.

current_path = os.getcwd()
models_dir = os.path.join(current_path, "models")
fasttext_eng_zip_path = os.path.join(models_dir, "wiki.en.zip")
fasttext_eng_path_vec = os.path.join(models_dir, "wiki.en.vec")
fasttext_eng_path_bin = os.path.join(models_dir, "wiki.en.bin")

if os.path.isfile(fasttext_eng_path_bin):
    print("Model found. Loading...")
else:
    print("Model not found. Downloading...")
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.en.zip"
    # download the models
    download_file(url, fasttext_eng_zip_path)

    # the archive is expected to contain wiki.en.bin (and wiki.en.vec)
    print("Unzipping the file...")
    unzip_file(fasttext_eng_zip_path, models_dir)

    print("Loading model from file...")

# Load the full model, including subword information.
# `load_facebook_model` is a plain function that takes the path directly.
# The model is deliberately NOT re-saved under `fasttext_eng_path_bin`: a gensim
# `.save()` would replace the Facebook binary with a different file format that
# `load_facebook_model` cannot read on the next run.
aligned_vectors_eng = gensim.models.fasttext.load_facebook_model(fasttext_eng_path_bin)

if aligned_vectors_eng is None:
    raise ValueError("The FastText model was not loaded properly.")

print("English model loaded sucessfully.")

# Check if the German model file exists. If so, load it. If not, download the
# aligned vectors and store them in gensim's own format for faster loading in the future.
# This might take a while

fasttext_de_path_bin = os.path.join(current_path, "models/wiki_de_align.bin")
fasttext_de_path_vec = os.path.join(current_path, "models/wiki_de_align.vec")

if os.path.isfile(fasttext_de_path_bin):
    print("Model found. Loading...")
    aligned_vectors_de = KeyedVectors.load(fasttext_de_path_bin)

else:
    print("Model not found. Downloading...")
    url = "https://dl.fbaipublicfiles.com/fasttext/vectors-aligned/wiki.de.align.vec"
    # download the model
    download_file(url, fasttext_de_path_vec)
    # load the model
    print("Loading model from file...")
    aligned_vectors_de = load_word_vectors(fasttext_de_path_vec)
    # load_word_vectors returns None on failure, so only save an actual model;
    # otherwise `.save` would raise AttributeError before the check below runs
    if aligned_vectors_de is not None:
        # save the model as binary to reduce loading time in the future
        aligned_vectors_de.save(fasttext_de_path_bin)

if aligned_vectors_de is None:
    raise ValueError("The FastText model or vectors were not loaded properly.")

print("German model loaded sucessfully.")

In [None]:
# define a function to identify a query's language
def identify_language(query):
    """
    Identifies the language of the given query.
    Parameters:
    query (str): The text to be analyzed for language identification.
    Returns:
    str | None: The predicted label with the "__label__" prefix removed
    (e.g. "en" or "de"), or None if the model returned no label.
    """

    # fastText's `predict` handles one line at a time and rejects text that
    # contains a newline, so multi-line queries are flattened first.
    single_line_query = query.replace("\n", " ")

    lang_detected = language_identification_model.predict(single_line_query)
    labels = lang_detected[0]

    # Guard against an empty prediction instead of failing with IndexError.
    if len(labels) == 0:
        return None

    return labels[0].replace("__label__", "")


# create function to embed query (based on language)
def get_fasttext_vector(query, aligned_vectors_de=None, aligned_vectors_eng=None):
    """
    Calculates the FastText vector representation for a given query.
    Parameters:
    - query: The query text. Its language is detected with `identify_language`.
    - aligned_vectors_de: Word vectors for German, indexed directly by word.
    - aligned_vectors_eng: FastText model for English, accessed through its `.wv` attribute.
    Returns:
    - numpy.ndarray: The mean of the word vectors found for the lowercased
      words of the query, or a zero vector if none was found.
    Note:
    - If the language is not supported (only "en" and "de" are supported), it returns a zero vector.
    - If a German word is not found in the German vectors, the English model is
      tried as a fallback (subword information).
    - Words for which no vector can be obtained are skipped.
    - The zero vector has the size of the German vectors if they are given,
      otherwise the size of the English vectors, otherwise 300.
    """

    # Determine the embedding size for the zero-vector fallbacks.
    if aligned_vectors_de is not None:
        vector_size = aligned_vectors_de.vector_size
    elif aligned_vectors_eng is not None:
        vector_size = aligned_vectors_eng.wv.vector_size
    else:
        vector_size = 300

    # check if language is valid
    lang = identify_language(query)
    if pd.isna(lang) or lang not in ("en", "de"):
        return np.zeros(vector_size)

    def english_vector(word):
        """Return the English-model vector for `word`, or None if unavailable."""
        if aligned_vectors_eng is None:
            return None
        try:
            return aligned_vectors_eng.wv[word]
        except KeyError:
            return None

    vectors = []

    # process based on language
    if lang == "de" and aligned_vectors_de is not None:
        for word in map(str.lower, query.split()):
            try:
                vectors.append(aligned_vectors_de[word])
            except KeyError:
                # Fall back to the English model; skip the word if that fails too.
                fallback = english_vector(word)
                if fallback is not None:
                    print(f"Created Vector based on Subword Information for: {word}")
                    vectors.append(fallback)

    elif lang == "en" and aligned_vectors_eng is not None:
        for word in map(str.lower, query.split()):
            vector = english_vector(word)
            if vector is None:
                # Repeating the failed lookup would only raise again, so skip.
                print(f"Missing Vector for: {word}")
            else:
                vectors.append(vector)

    if vectors:
        return np.mean(vectors, axis=0)

    # Return an actual array; callers reshape the result.
    return np.zeros(vector_size)

#### 4. Load the SBERT and Cross Encoder models + create a function to vectorize the query

In [None]:
# Download SBERT model or load them from drive
sbert_path = os.path.join(os.getcwd(), "models/sbert")
downloaded = os.path.isdir(sbert_path)

# Reuse the local copy when the model directory exists; otherwise fetch the
# model by name and store it under `sbert_path` for later runs.
if downloaded:
    print("Load Sentence Transformer from drive...")
    sbert_model = sentence_transformers.SentenceTransformer(sbert_path)
else:
    print("Downloading Sentence Transformer...")
    sbert_model = sentence_transformers.SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    sbert_model.save(sbert_path)

print("SBERT model loaded sucessfully")
    
# Download Cross Encoder or load them from drive

cross_encoder_path = os.path.join(os.getcwd(), "models/cross")
downloaded = os.path.isdir(cross_encoder_path)

# NOTE: a `models/cross` directory written by an earlier version of this cell
# holds a SentenceTransformer, not the cross encoder. Delete it once so that
# the correct model is downloaded and cached.
if not downloaded:
    print("Downloading Cross Encoder...")
    cross_encoder_model = sentence_transformers.CrossEncoder("corrius/cross-encoder-mmarco-mMiniLMv2-L12-H384-v1")
    # Persist the cross encoder itself (previously the SBERT model was saved here).
    cross_encoder_model.save(cross_encoder_path)
else:
    print("Load Cross Encoder from drive...")
    # Load a CrossEncoder from its own directory (previously this loaded a
    # SentenceTransformer from the SBERT path).
    cross_encoder_model = sentence_transformers.CrossEncoder(cross_encoder_path)

print("Cross encoder loaded sucessfully")

In [None]:
# function to embed the query
def get_sbert_vector(query, model):
    """
    Embed a query with a Sentence-BERT model.
    Parameters:
    query (str): The input text to encode.
    model: Model exposing `get_sentence_embedding_dimension()` and `encode()`.
    Returns:
    The result of `model.encode(query, convert_to_tensor=False)`, or a zero
    vector of the model's embedding dimension if the query is missing or
    consists only of whitespace.
    """
    embedding_dimension = model.get_sentence_embedding_dimension()
    zero_embedding = np.zeros((embedding_dimension,))

    # `pd.isna` is checked first so that `.strip()` is never called on a missing value.
    is_blank = pd.isna(query) or query.strip() == ''
    if not is_blank:
        return model.encode(query, convert_to_tensor=False)

    return zero_embedding

### 2. Load chunked descriptions

Next, we load the chunked descriptions from the software repository we prepared in the notebook `1_vectorisation`. We'll need them to supply text to the user and to the LLM used in the RAG process.

We'll only load four columns. 

1. **description_clean_chunks:** The chunk.
2. **description_preprocessed_chunks:** The preprocessed chunk (lowercase + removed punctuation and stopwords).
3. **description:** The complete description from which the chunk was extracted.
4. **brand_name:** The name of the software described by the chunk.

In [None]:
# load the data
dataset_path = os.path.join(os.getcwd(), "data/edition_software_info_chunked.csv")
columns = ["brand_name", "description", "description_clean_chunks", "description_preprocessed_chunks"]

# Read only the required columns and replace missing values with empty
# strings in one step.
df = pd.read_csv(dataset_path, skipinitialspace=True, usecols=columns).fillna("")

# Show the first row without truncating the long text columns.
with pd.option_context('display.max_colwidth', None):
    display(df.head(1))

### 3. Load Vectors

Finally, we can load the vector representations of the chunks we created in `1_vectorisation`.

In [None]:
vectors_path = os.path.join(os.getcwd(), "vectorisations")

# The TF-IDF file is loaded with allow_pickle=True.
# NOTE(review): this unpickles the stored object, which can execute arbitrary
# code - only load files that were produced by this project.
chunks_tfidf = np.load(os.path.join(vectors_path, "tfidf.npy"), allow_pickle=True)

# The remaining files are loaded as plain arrays and converted to nested lists.
chunks_word2vec, chunks_fasttext, chunks_sbert = (
    np.load(os.path.join(vectors_path, filename)).tolist()
    for filename in ("word2vec.npy", "fasttext.npy", "sbert.npy")
)

In [None]:
# Unwrap the single object stored in the loaded TF-IDF array and convert it to a dense array.
# NOTE(review): presumably a SciPy sparse matrix (it exposes `.toarray()`) - confirm.
chunks_tfidf_sparse = chunks_tfidf.item() 
chunks_tfidf_dense = chunks_tfidf_sparse.toarray()  
# Bare last expression: the shape of the dense matrix is displayed as the cell output.
chunks_tfidf_dense.shape

## 2. Prepare the LLM

Next, we'll load the LLM we want to use.

We'll start off using llama3. This can be changed in the future.

In [None]:
# Chat model accessed through the `langchain_ollama` integration.
# NOTE(review): this presumably requires a running Ollama instance with the
# "llama3" model available - confirm for the target environment.
# NOTE(review): temperature=1 presumably makes answers vary between runs;
# consider a lower value if reproducible answers are wanted - confirm.
llm = ChatOllama(
    model="llama3",
    temperature=1,
)

## 3. Define Functions for Retrieval

In [306]:
from sklearn.metrics.pairwise import cosine_similarity

def get_similarities_sbert(query: str) -> np.ndarray:
    """
    Calculates similarity scores between the query and SBERT-vector-representations of the chunks.
    Args:
        query (str): The query.
    Returns:
        numpy.ndarray: The output of `cosine_similarity` for the query against
        `chunks_sbert`: one row (the query) with one score per chunk.
    """
    # The return annotation is now the type `np.ndarray`; the former
    # `np.array(float)` was a call evaluated at definition time, not a type.

    query_sbert = get_sbert_vector(query, sbert_model)

    # Reshape the query vector to be a 2D array with one row
    query_sbert = query_sbert.reshape(1, -1)

    # Compute cosine similarity between the query and the chunks
    return cosine_similarity(query_sbert, chunks_sbert)


def get_similarities_word2vec(query: str) -> np.ndarray:
    """
    Calculates similarity scores between the query and word2vec-vector-representations of our documents.
    Args:
        query (str): The query.
    Returns:
        numpy.ndarray: The output of `cosine_similarity` for the query against
        `chunks_word2vec`: one row (the query) with one score per document.
    """
    # The return annotation is now the type `np.ndarray`; the former
    # `np.array(float)` was a call evaluated at definition time, not a type.

    query_word2vec = get_word2vec_vector(query, word2vec_model)

    # Reshape the query vector to be a 2D array with one row
    query_word2vec = query_word2vec.reshape(1, -1)

    # Compute cosine similarity between the query and the documents
    return cosine_similarity(query_word2vec, chunks_word2vec)


def get_similarities_fasttext(query: str) -> np.ndarray:
    """
    Calculates similarity scores between the query and fasttext-vector-representations of our documents.
    Args:
        query (str): The query.
    Returns:
        numpy.ndarray: The output of `cosine_similarity` for the query against
        `chunks_fasttext`: one row (the query) with one score per document.
    """
    # The return annotation is now the type `np.ndarray`; the former
    # `np.array(float)` was a call evaluated at definition time, not a type.

    query_fasttext = get_fasttext_vector(query, aligned_vectors_de, aligned_vectors_eng)

    # Reshape the query vector to be a 2D array with one row
    query_fasttext = query_fasttext.reshape(1, -1)

    # Compute cosine similarity between the query and the documents
    return cosine_similarity(query_fasttext, chunks_fasttext)

# get stopwords
# Stopword sets per language, plus their union for queries of either language.
stopwords_english = set(stopwords.words('english'))
stopwords_german = set(stopwords.words('german'))
stopwords_combined = stopwords_english | stopwords_german

def preprocess(stopwords: list[str], text: str) -> str:
    """
    Normalise a text: lowercase it, strip ASCII punctuation and drop stopwords.
    Args:
        stopwords (List[str]): Words to remove; any container supporting `in` works.
            Note that this parameter shadows the imported `stopwords` corpus
            inside the function.
        text (str): The input text to be preprocessed.
    Returns:
        str: The remaining words joined by single spaces.
    """
    # Translation table that deletes every character in `string.punctuation`.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    normalised = text.lower().translate(strip_punctuation)

    kept_words = (token for token in normalised.split() if token not in stopwords)
    return ' '.join(kept_words)


def get_similarities_tfidf(query: str) -> np.ndarray:
    """
    Calculates similarity scores between the query and tfidf-representations of our documents.
    Args:
        query (str): The query. It is passed through `preprocess` with the combined
            English and German stopwords before being vectorized.
    Returns:
        numpy.ndarray: The output of `cosine_similarity` for the query against the
        TF-IDF matrix: one row (the query) with one score per document.
    """
    # The return annotation is now the type `np.ndarray`; the former
    # `np.array(float)` was a call evaluated at definition time, not a type.

    query_tfidf = get_tfidf_vector(preprocess(stopwords_combined, query))

    # `.item()` unwraps the object stored in the loaded array. It is passed on
    # without `.toarray()`: `cosine_similarity` accepts sparse input and returns
    # a dense array by default, so the result is unchanged while the full
    # matrix no longer has to be densified on every call.
    chunks_tfidf_matrix = chunks_tfidf.item()

    # Compute cosine similarity between the query and the documents
    return cosine_similarity(query_tfidf, chunks_tfidf_matrix)


def get_similarity(query, vectorisation):
    """
    Dispatch a query to the similarity function of the chosen vectorisation.
    Parameters:
    query (str): The query string.
    vectorisation (str): The vectorisation method. Choose from: tfidf, fasttext, word2vec, sbert.
    Returns:
    Whatever the selected `get_similarities_*` function returns for the query
    (the similarity scores between the query and the chunks).
    Raises:
    KeyError: If the given vectorisation is not valid.
    """
    # Dispatch table: method name -> similarity function.
    vectorisations = {
        "tfidf": get_similarities_tfidf,
        "fasttext": get_similarities_fasttext,
        "word2vec": get_similarities_word2vec,
        "sbert": get_similarities_sbert,
    }

    methode = vectorisations.get(vectorisation)
    if methode is None:
        raise KeyError(f"'{vectorisation}' is not a valid approach. Choose from: {', '.join(vectorisations.keys())}")

    return methode(query)


def get_similar_chunks(query, vectorisation, n):
    """
    Return the `n` chunks that are most similar to the query.
    Parameters:
    query (str): The query string.
    vectorisation (str): The vectorisation method. Choose from: tfidf, fasttext, word2vec, sbert.
    n (int): The number of chunks to return.
    Returns:
    pandas.DataFrame: The `n` rows of `df` with the highest similarity, sorted in
    descending order, with an additional "similarity" column.
    Raises:
    KeyError: If the given vectorisation is not valid.
    """
    similarity = get_similarity(query, vectorisation)

    # `assign` returns a new frame, so the shared `df` is not modified. Writing the
    # scores into `df` itself left a stale "similarity" column behind after every call.
    scored_chunks = df.assign(similarity=similarity[0])

    return scored_chunks.sort_values("similarity", ascending=False).head(n)

In [309]:
# Example: retrieve the five chunks most similar to a German query using the SBERT vectors.
test = get_similar_chunks("Ich will ein Manuskript transkribieren", vectorisation="sbert", n=5)
# Bare last expression: the resulting frame is displayed as the cell output.
test

Unnamed: 0,brand_name,description,description_clean_chunks,description_preprocessed_chunks,similarity
29,Transcribo,[Transcribo](https://tcdh.uni-trier.de/en/proj...,Transcribo is an editing tool developed by the...,transcribo editing tool developed trier center...,0.579517
0,Transkribus,"# Erkennen, Transkribieren und Durchsuchen von...",Transkribus ist eine umfassende Plattform für ...,transkribus umfassende plattform digitalisieru...,0.562283
14,correspSearch,With correspSearch you can search through inde...,With correspSearch you can search within the m...,correspsearch search within metadata diverse s...,0.559531
67,VMR CRE,The Virtual Manuscript Room Collaborative Rese...,menus and dialogs to assist the researcher wit...,menus dialogs assist researcher composing tran...,0.538878
53,ChrysoCollate,The program offers:\n\n- two modes: collation ...,ChrysoCollate is a freeware for collating and ...,chrysocollate freeware collating editing texts...,0.518004


In [None]:
from langchain_core.messages import AIMessage

# The user question and the retrieved context have to be inserted into the prompt
# explicitly: plain (role, text) tuples are sent to the model as they are, so the
# literal placeholders "{query}" and "{context}" were previously never filled in.
query = "Ich will ein Manuskript transkribieren"

# Retrieve the most similar chunks and join their text into one context block.
retrieved_chunks = get_similar_chunks(query, vectorisation="sbert", n=5)
context = "\n\n".join(retrieved_chunks["description_clean_chunks"])

messages = [
    (
        "system",
        "You are a helpfull assistant and keep answers brief",
    ),
    ("human", f"Answer this question {query} using this context {context}"),
]
ai_msg = llm.invoke(messages)
ai_msg.content