# RAG Pipeline

Imports

In [23]:
from openai import OpenAI
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import tiktoken
import os
import pandas as pd
import re
from pydantic import BaseModel, Field
from typing import List
import openpyxl
import numpy as np
import openai

In [5]:
def merge_small_chunks(documents):
    """
    Merge very small chunks into the preceding chunk of the same page.

    PyPDFLoader in combination with the recursive text splitter can lead to very small
    chunks at the end of a page which only contain a few sentences. Every chunk with
    fewer than 50 tokens whose predecessor lies on the same page is folded into that
    predecessor (via ``combine_strings``) and dropped from the list. Note that here
    documents and chunks are the same thing.

    The first chunk is never merged because it has no predecessor. Several consecutive
    small chunks on one page are all folded into the same surviving chunk, so no text
    is lost when a small chunk follows another small chunk.

    Relies on the module-level ``encoding`` object (its ``encode`` method is used to count
    tokens) and on ``combine_strings``.

    :param documents: all documents/chunks that were returned by the recursive text splitter.
                      The list is updated in place.
    :return: the same list object, without the chunks that were merged away.
    """
    if not documents:
        return documents

    # Build the surviving list in one pass instead of deleting afterwards. This keeps the
    # merge target well-defined (always the last surviving chunk) and avoids removing
    # elements by equality, which could hit an unrelated but identical chunk.
    kept = [documents[0]]
    for chunk in documents[1:]:
        target = kept[-1]
        is_small = len(encoding.encode(chunk.page_content)) < 50
        # Only merge within a page; a small chunk that starts a new page stays on its own.
        if is_small and target.metadata["page"] == chunk.metadata["page"]:
            target.page_content = combine_strings(target.page_content, chunk.page_content)
        else:
            kept.append(chunk)

    # Callers may hold a reference to the original list, so update it in place.
    documents[:] = kept
    return documents

def combine_strings(s1, s2):
    """
    Combine two strings that may overlap.

    Finds the longest suffix of ``s1`` that is also a prefix of ``s2`` and joins the two
    strings so that the shared text appears only once. This is used to merge two chunks
    that overlap.

    :param s1: the leading string.
    :param s2: the trailing string.
    :return: ``s1`` followed by the non-overlapping rest of ``s2``. If the strings do not
             overlap at all they are joined with a single space.
    """
    # An overlap can never be longer than the shorter string. Trying the longest candidate
    # first lets us stop at the first hit, which is the biggest overlap.
    for size in range(min(len(s1), len(s2)), 0, -1):
        if s1.endswith(s2[:size]):
            # Drop the shared suffix from s1 and append s2 unchanged. No separator is
            # inserted here: the overlap already carries the original spacing, and an
            # extra space would split the text at the overlap boundary.
            return s1[:len(s1) - size] + s2
    return s1 + " " + s2

def create_df_docs(relevant_docs, threshold=0.55):
    """
    Create a dataframe for the chunks/documents that were used to answer a question.

    The resulting dataframe can be displayed in the Chat-interface or added to the Excel
    sheet for the EGA/upload mode.

    The score of each tuple is treated as a distance: iteration stops at the first chunk
    whose score reaches ``threshold``, and the reported relevance is ``1 - score``.
    NOTE(review): stopping at the first hit assumes ``relevant_docs`` is ordered from best
    to worst match -- confirm against the caller.

    :param relevant_docs: chunks that were used as context to answer the question. A list
                          of tuples with the format: (Document, score).
    :param threshold: score at or above which a chunk (and everything after it) is left out.
    :return: dataframe with columns: Number, Content, Section, Page Number, Relevance, Source.
             "Section" holds the list of header texts found between "##Header:" and "##Part"
             in the chunk (an empty list when there is none).
    """
    # Compile once; these patterns are applied to every chunk.
    part_number_re = re.compile(r'##Part\s+(\d+)')
    part_line_re = re.compile(r'##Part[^\n]*\n?', re.DOTALL)
    header_block_re = re.compile(r'##Header.*?##Part[^\n]*\n?', re.DOTALL)
    section_re = re.compile(r'##Header:(.*?)##Part', re.DOTALL)

    content = []                # content of the chunk
    section = []                # section number or chapter of the chunk
    pages = []                  # page number of the chunk
    similarities = []           # 1 - score, rounded
    files = []                  # file name of the chunk
    numbers = []                # 1-based index of the chunk

    for i, (doc, sim) in enumerate(relevant_docs):
        # Score threshold: stop as soon as a chunk reaches it.
        if sim >= threshold:
            break
        similarities.append(round(1 - sim, 2))

        # To give GPT-4 more guidance and context the chunks contain headers like
        # "##Part 1.2" or "##Header: ... ##Part ...". They are removed here to make the
        # content more readable for the user.
        text = doc.page_content
        match = part_number_re.search(text)
        part_number = int(match.group(1)) if match else None

        # For part 1 only the "##Part" line is dropped and the header text is kept (minus
        # its "##Header: " marker); otherwise the whole header-to-part span is dropped.
        strip_re = part_line_re if part_number == 1 else header_block_re
        content.append(strip_re.sub('\n', text).replace("##Header: ", ""))
        section.append(section_re.findall(text))

        files.append(doc.metadata["source"].split("/")[-1])  # file name without directories
        pages.append(doc.metadata["page"] + 1)               # stored page index + 1
        numbers.append(i + 1)

    relevant_docs_df = pd.DataFrame({
        "Number": numbers,
        "Content": content,
        "Section": section,
        "Page Number": pages,
        "Relevance": similarities,
        "Source": files,
    })
    return relevant_docs_df

Create Vector Database

In [6]:
# Build one FAISS store per PDF plus one store over all PDFs, then retrieve the chunks
# closest to the query. `model_embedding`, `model`, `files` and `query` are expected to be
# defined elsewhere in the notebook.
embeddings = OpenAIEmbeddings(model = model_embedding)

# The splitter configuration is the same for every file, so it is created once.
# NOTE(review): `model.split(".")[1]` assumes the model name contains a dot and that the
# part after the first dot is a name tiktoken recognises -- confirm against the config.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(           # Split the Text
    model_name=model.split(".")[1],
    chunk_size=512,
    chunk_overlap=128,
    separators=["\n\n", ". "],
)

all_dbs = {}
all_docs = []       # chunks of every file, used for the combined store
for file in files:
    loader = PyPDFLoader(file)
    documents = loader.load()
    docs = text_splitter.split_documents(documents)
    docs = merge_small_chunks(docs)                                             # Merge the Chunks
    all_dbs[file] = FAISS.from_documents(docs, embeddings)                      # Vector Store
    all_docs.extend(docs)

if not all_docs:
    raise ValueError("No chunks were produced from the given files; cannot build the vector store.")

# The combined store is built after the loop from the chunks of ALL files. Building it
# inside the loop from the current file only would leave just the last PDF in it.
all_dbs["All PDFs combined"] = db_combined = FAISS.from_documents(all_docs, embeddings)

# The downstream code treats the score as a distance (it stops at `>= 0.55` and reports
# `1 - score` as relevance), so the distance-returning search is used here rather than the
# normalised higher-is-better relevance scores.
relevant_docs = all_dbs["All PDFs combined"].similarity_search_with_score(query, k=50)

In [7]:
# Pull the score (second tuple element) out of every retrieved (document, score) pair.
# Evaluate `sim_values` in a notebook cell to inspect the result.
sim_values = [pair[1] for pair in relevant_docs]

In [9]:
# Assemble the prompt context from the retrieved chunks and record which pages and files
# contributed to it. `relevant_docs`, `encoding`, `prompt` and `query` come from earlier cells.
extract_blocks = []     # formatted text of every chunk that made it into the context
used_pages = set()
used_file = set()
s = 0                   # running total of context tokens

for idx, (doc, sim) in enumerate(relevant_docs):
    # Stop at the first chunk whose score reaches the 0.55 cut-off.
    if sim >= 0.55:
        break

    chunk_text = doc.page_content
    file = doc.metadata["source"].split("/")[-1]
    page = doc.metadata["page"] + 1

    # Stop as soon as the next chunk would push the context past 5500 tokens.
    chunk_tokens = len(encoding.encode(chunk_text))
    if s + chunk_tokens > 5500:
        break
    s += chunk_tokens

    label = idx + 1
    extract_blocks.append(
        f"Extract {label}, page: {page} in {file}:\n[Start of Extract {label}]\n {chunk_text}\n[End of Extract {label}]\n\n"
    )
    used_pages.add(page)
    used_file.add(file)

context = "".join(extract_blocks)
number_of_chunks = len(extract_blocks)

# Turn the collected page numbers and file names into sorted, comma-separated strings.
used_pages = ", ".join(map(str, sorted(used_pages)))

used_file = sorted(used_file)
number_of_used_files = len(used_file)
used_file = ", ".join(map(str, used_file))

# Fill the placeholders of the prompt template (context first, then the question).
prompt = prompt.replace('{##context##}', context).replace('{##question##}', query)

Sample GPT Request

In [None]:
# Create Request
# Structured-output schema for one answer. It is passed as `response_format` to
# `client.beta.chat.completions.parse` in the answer loop below, and the three fields of
# the parsed result are collected into `answers`, `explanations` and `references`.
# NOTE(review): documented with comments rather than a docstring or Field descriptions on
# purpose -- those would presumably become part of the schema sent with the request.
class Answer_Request(BaseModel):
    Answer: str | bool | int    # the answer itself; may be text, a boolean or an integer
    Explanation: str            # free-text explanation returned alongside the answer
    Reference: List[str]        # list of reference strings returned alongside the answer

# Structured-output schema for the scoring request. It is passed as `response_format` to
# the final `client.beta.chat.completions.parse` call; the parsed fields are read into
# `score` and `score_explanation`.
class Score_Request(BaseModel):
    # integer score returned by the scoring request
    Score: int
    # free-text explanation returned together with the score
    Score_Explanation: str 

# Ask the model the same question several times, then let a second request score the
# collected answers against the context. `client`, `model`, `system_prompt`, `prompt`,
# `metric`, `temperature` and `context` are expected to come from earlier cells.
NUM_ANSWER_SAMPLES = 5      # number of independent answer requests

answers = []
references = []
explanations = []
# Create Answers
for _ in range(NUM_ANSWER_SAMPLES):
    completion = client.beta.chat.completions.parse(
        model=model,
        n=1,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt}
        ],
        response_format=Answer_Request,
        temperature=temperature,
    )
    message = completion.choices[0].message
    result = message.parsed
    if result is None:
        # No parsed object is available (e.g. the model refused); skip this sample
        # instead of failing with an AttributeError on `result.Answer`.
        print(f"Skipping a sample without a parsed answer (refusal: {message.refusal})")
        continue
    answers.append(result.Answer)
    references.append(result.Reference)
    explanations.append(result.Explanation)

if not answers:
    raise RuntimeError("None of the answer requests returned a parsed answer.")

# Score the answers. The user message keeps its original layout, including indentation.
completion = client.beta.chat.completions.parse(
    model=model,
    n=1,
    messages=[
        {"role": "system", "content": metric},
        {"role": "user", "content": f"""
            These are the Answers: {str(answers)}
            This is the Context: {str(context)}
            And these are the References {str(references)}
            """}
    ],
    response_format=Score_Request,
    temperature=temperature,
)
score_message = completion.choices[0].message
score_result = score_message.parsed
if score_result is None:
    raise RuntimeError(f"The scoring request returned no parsed result (refusal: {score_message.refusal})")
score_explanation = score_result.Score_Explanation
score = score_result.Score
print(score_result)