In [1]:
from openai import OpenAI

In [2]:
# Connection parameters (only has to be executed once during a session)

# Note: `localhost` is only valid as long as you are logged in on madagaskarweihe.
# Otherwise, you will need to perform port forwarding on your machine using `ssh -fNL 8006:127.0.0.1:8006 ashousaa@madagaskarweihe`.
# This might require an ssh key to be uploaded to madagaskarweihe first.
import os

# Both values can be overridden through an environment variable of the same name,
# so the endpoint and the credential no longer have to be edited in the notebook.
VLLM_API_ENDPOINT = os.environ.get('VLLM_API_ENDPOINT', 'http://localhost:8006/v1')
# FIXME(security): the literal below is a credential committed to the notebook.
# It is kept only as a fallback so existing sessions keep working; export VLLM_KEY
# instead, rotate this key and delete the literal.
VLLM_KEY = os.environ.get('VLLM_KEY', 's7Vtzeyq3kfhVkPhlWdL95pPRBq36KDP1d5bBj54BqQ')
# Model name sent with every chat completion request.
MODEL = "default"

In [3]:
# Build the client used for all LLM requests (needs to run only once per session).
# The endpoint and key come from the connection-parameter constants.
client = OpenAI(
    base_url=VLLM_API_ENDPOINT,
    api_key=VLLM_KEY,
)

In [4]:

# Chunk documents, save results to a dictionary where each chunk has an id and save the chunks to a CSV file
import fitz  # PyMuPDF 
import csv
def chunk_pdf_by_marker(pdf_path, marker="#"):
    """Split the text of a PDF into chunks at every occurrence of ``marker``.

    The text of all pages is concatenated in page order and split on
    ``marker``. Each piece is stripped of surrounding whitespace, and pieces
    that are empty after stripping are dropped.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        marker: Separator string between chunks. Defaults to "#".

    Returns:
        dict mapping "chunk_1", "chunk_2", ... (numbered in document order)
        to the corresponding chunk text.

    Raises:
        ValueError: If ``marker`` is an empty string (raised by ``str.split``).
    """
    # The context manager closes the document even if text extraction fails;
    # previously the document was opened and never closed.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        full_text = "".join(page.get_text() for page in doc)

    # Strip every piece and keep only the ones that still contain text.
    stripped_pieces = (piece.strip() for piece in full_text.split(marker))
    non_empty_pieces = (piece for piece in stripped_pieces if piece)

    # Keys are 1-based so they read naturally as "chunk_1", "chunk_2", ...
    return {
        f"chunk_{number}": piece
        for number, piece in enumerate(non_empty_pieces, start=1)
    }


# Example usage
pdf_path = "Doc 4 flat.pdf"
chunk_dict = chunk_pdf_by_marker(pdf_path)

# Print the first 3 chunks entirely
print("--- First 3 Chunks ---")
# The slice is 3 so the preview matches its heading (the previous `[:4]` printed four chunks).
for key, value in list(chunk_dict.items())[:3]:
    print(f"{key}:\n{value}\n{'='*30}\n")

# Also save chunks with ids as a csv file
def save_chunks_to_csv(chunks, output_file="chunks.csv"):
    """Write chunk texts to a two-column CSV file (``chunk_id``, ``chunk_text``).

    Ids are generated here as "chunk_1", "chunk_2", ... in iteration order.
    Every data row is followed by an empty row, which appears as a blank
    line in the file.

    Args:
        chunks: Iterable of chunk texts.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, mode="w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(["chunk_id", "chunk_text"])  # header

        for number, text in enumerate(chunks, start=1):
            # Data row first, then the empty separator row.
            out.writerows([[f"chunk_{number}", text], []])

# Persist only the chunk texts to the default "chunks.csv";
# save_chunks_to_csv generates the "chunk_N" ids itself, in the same order.
save_chunks_to_csv([*chunk_dict.values()])

--- First 3 Chunks ---
chunk_1:
English Travel Reimbursement Law Revised Version of the State Travel Expense Act 
Preliminary Page A. Objective The previous travel expense regulations are outdated and require 
updating and legal simplification to facilitate the conduct and administrative processing of official 
travel. In addition, with regard to mobility behavior, the requirements of climate protection shall 
be taken into account (the exemplary function of the state administration pursuant to §  of the 
Baden-Württemberg Climate Protection Act).

chunk_2:
Revised Version of the State Travel Expense Act 
Preliminary Page B. Essential Content A revision of the State Travel Expense Act resulting in a 
modern regulatory framework. The focal points are: . A new regulation for travel costs and 
mileage allowance. . . . . Adjustment of the reduction of the per diem allowance in the case of 
complimentary meals in line with tax law provisions, thereby eliminating the need to tax parts of 
th

In [5]:

# Generate questions using the LLM
# For every chunk the model is asked for questions answerable from that chunk alone;
# each question is written to response.txt as "<question> || <chunk id>".
# utf-8 is set explicitly so non-ASCII characters in the questions (e.g. "§", "ü")
# are written the same way regardless of the platform's default encoding.
with open("response.txt", "w", encoding="utf-8") as f:
    # `chunk_id` rather than `id`, so the builtin id() is not shadowed for the rest of the session.
    for chunk_id, doc in chunk_dict.items():
        # define prompts
        messages = [
            {"role": "system", "content": "You are about to go on a bussiness trip and want to ask very precise questions. Only output the question, no additional information."},
            # {"role": "user", "content": f"Imagine you are planning a business trip. What five questions would you ask after reading this document? {doc} Do not include the question if it is not stated in the chunk" },
            # {"role": "user", "content": f"Generate a list of natural-sounding questions that a traveling employee would have. The questions should reflect a genuine need to understand this information for their reimbursement claim.Only generate questions for which the corresponding answer is explicitly present here.{doc}"}
            {"role": "user", "content": f"What specific questions would I, as a traveling employee, ask that could be answered *solely by the information provided in this chunk*? Only generate questions for which the corresponding answer is explicitly present here.{doc}"}
            # {"role": "user", "content": f"As a traveling employee, what are some key, natural questions I would have to understand my rights, responsibilities, and potential reimbursements related to this specific information? Please generate questions that reflect a genuine need for clarity on how this impacts my expense report and reimbursement. Ensure the answer to each question is explicitly stated within this chunk, and format each question on a new line, numbered as in the examples you provided {doc}"}
        ]

        # send prompts and wait for answer
        response = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            seed=42,
            temperature=0.7,
        )

        # Get and clean the response. The content field is typed as optional in the
        # OpenAI SDK, so a missing answer is treated as empty text instead of crashing.
        raw_response = (response.choices[0].message.content or "").strip()

        # Split response into individual questions: strip list numbering / bullets from
        # both ends of every line and drop lines that are empty afterwards (e.g. a bare "1.").
        # NOTE(review): strip() also removes trailing digits and punctuation from a
        # question that ends with them; kept as-is to leave the generated file unchanged.
        stripped_lines = (line.strip("0123456789).:- ") for line in raw_response.split("\n"))
        questions = [question for question in stripped_lines if question]

        # Write each question on a new line with the chunk ID
        for question in questions:
            f.write(f"{question} || {chunk_id}\n")

        # Add a blank line to separate questions from different chunks
        f.write("\n")

In [6]:
# Get the top-k matching chunks for a given query

from sentence_transformers import SentenceTransformer
import torch
import pandas as pd

import os
from functools import lru_cache


@lru_cache(maxsize=None)
def _load_embedding_model(model_name):
    """Load a SentenceTransformer once per kernel session and reuse it afterwards."""
    return SentenceTransformer(model_name)


@lru_cache(maxsize=4)
def _load_chunk_index(csv_file, modified_ns, model_name):
    """Read the chunk CSV and embed every chunk text.

    ``modified_ns`` is not used in the body; it is part of the cache key so
    that a rewritten CSV (new modification time) is read and embedded again.

    Returns:
        (chunk_ids, chunk_embeddings); the embeddings are None when the CSV
        has no data rows.
    """
    df = pd.read_csv(csv_file)
    chunk_ids = df["chunk_id"].tolist()
    chunk_texts = df["chunk_text"].tolist()
    if not chunk_texts:
        return chunk_ids, None
    model = _load_embedding_model(model_name)
    return chunk_ids, model.encode(chunk_texts, convert_to_tensor=True)


def get_top_matching_chunks(query, k, csv_file="chunks.csv", model_name="all-MiniLM-L6-v2"):
    """Return the ids of the ``k`` chunks most similar to ``query``.

    Similarity is the cosine similarity between the sentence embedding of the
    query and the embedding of each ``chunk_text`` row of ``csv_file``.

    Args:
        query: Question text to match against the chunks.
        k: Number of chunk ids to return. Values larger than the number of
            chunks are clamped, because torch.topk raises for such a k.
        csv_file: CSV with ``chunk_id`` and ``chunk_text`` columns.
        model_name: SentenceTransformer model used for both chunks and query.

    Returns:
        List of chunk ids ordered from most to least similar; empty when
        ``k`` is not positive or the CSV contains no chunks.
    """
    # The model and the chunk embeddings do not depend on the query, so they are
    # cached instead of being rebuilt on every call.
    modified_ns = os.stat(csv_file).st_mtime_ns
    chunk_ids, chunk_embeddings = _load_chunk_index(csv_file, modified_ns, model_name)

    k = min(k, len(chunk_ids))
    if k <= 0:
        return []

    # Encode the query with the same model as the chunks
    model = _load_embedding_model(model_name)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # Compute cosine similarity of the query against every chunk embedding
    cos = torch.nn.CosineSimilarity(dim=1)
    similarities = cos(chunk_embeddings, query_embedding)

    # Get top-k indices and map them back to chunk ids
    top_indices = torch.topk(similarities, k=k).indices.tolist()
    return [chunk_ids[i] for i in top_indices]


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# This method reads a CSV file containing questions and their corresponding chunk IDs.
# It returns a list of dictionaries where each dictionary contains the chunk ID as the key and the question as the value.

import csv

def get_queries_from_csv(csv_file):
    """Read question / chunk-id pairs from a CSV file.

    The file must have a header row containing the columns ``Chunk`` and
    ``Question``.

    Args:
        csv_file: Path of the CSV file to read (decoded as UTF-8).

    Returns:
        A list with one single-entry dict per data row, in file order, each
        of the form ``{chunk_id: question}``.
    """
    with open(csv_file, mode='r', newline='', encoding='utf-8') as handle:
        # DictReader yields one dict per data row, keyed by the header names.
        return [
            {record['Chunk']: record['Question']}
            for record in csv.DictReader(handle)
        ]

In [8]:
import pandas as pd
def get_CSVcolumn(csv_file, column_name):
    """Return all values of one column of a CSV file as a list.

    Args:
        csv_file: Path of the CSV file, read with ``pandas.read_csv``.
        column_name: Header name of the column to extract.

    Returns:
        The column's values in row order.
    """
    return pd.read_csv(csv_file)[column_name].tolist()

In [9]:
import unicodedata
import re

def normalize_string(s):
    """Return a lowercase, printable-ASCII, whitespace-trimmed form of ``s``.

    Intended for comparing identifiers that may carry invisible characters
    (byte-order marks, zero-width spaces, control characters) or accents.

    Args:
        s: String to normalize.

    Returns:
        The normalized string.
    """
    # Step 1: Decompose Unicode characters (e.g. "é" becomes "e" + combining accent)
    s = unicodedata.normalize('NFKD', s)

    # Step 2: Keep only printable ASCII; this drops combining marks, BOMs,
    # zero-width characters and control characters such as tabs and newlines
    s = re.sub(r'[^\x20-\x7E]', '', s)

    # Step 3: Lowercase and trim last. Trimming before the removal above left
    # whitespace behind whenever an invisible character sat at either end.
    return s.lower().strip()

In [10]:
# Evaluation run: for each test question, retrieve the top-k chunks, ask the LLM to
# answer from them, and record whether the question's source chunk was retrieved.
k = 10
MAX_QUERIES = 20  # Limit to 20 queries
correct_answers_count = 0  # number of queries whose original chunk is among the top k
queries_chunk_map = get_queries_from_csv("final_test_set.csv")[:MAX_QUERIES]

# Load the chunk texts once, before the loop (they do not depend on the query).
# "chunks.csv" is the file get_top_matching_chunks ranks by default; reading the texts
# from that same file keeps retrieved ids and texts in sync. Previously the texts came
# from a hard-coded absolute path and were re-read for every query.
csv_file = "chunks.csv"
chunk_text_by_id = dict(
    zip(get_CSVcolumn(csv_file, "chunk_id"), get_CSVcolumn(csv_file, "chunk_text"))
)

# Prompt LLM
with open("RAG_Answers_To.txt", "w", encoding="utf-8") as f:

    for i, query_map in enumerate(queries_chunk_map, start=1):
        chunk_id, query = list(query_map.items())[0]  # Extract the chunk ID and question

        # Get top k chunks
        top_k_chunk_ids = get_top_matching_chunks(query, k)

        # Look the texts up by id instead of parsing the number out of "chunk_N"
        top_k_documents = [chunk_text_by_id[retrieved_id] for retrieved_id in top_k_chunk_ids]

        # Join the top k documents into a single string
        top_k_string = "\n".join(top_k_documents)

        #  PROMPT SELECTION:  Modify this section to try different prompts

        # #  Prompt 1: Basic Template with Explicit Instructions  [cite: 19, 20, 21]
        # messages = [
        #     {"role": "system", "content": "Use the following pieces of retrieved context (replace with documents) to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise."},
        #     {"role": "user", "content": f"Question: {query} \n Context: {top_k_string} \n Answer:"}
        # ]

        # #  Prompt 2: Template with Context Delimiters  [cite: 22, 23]
        # messages = [
        #     {"role": "user", "content": f"--------------------- {top_k_string} --------------------- \n Given the context information and not prior knowledge, answer the query: {query} \n Answer:"}
        # ]

        # #  Prompt 3: Template with Emphasis on Factual Grounding  [cite: 23, 24, 25]
        # messages = [
        #     {"role": "user", "content": f"DOCUMENT: {top_k_string} \n QUESTION: {query} \n INSTRUCTIONS: Answer the users QUESTION using the DOCUMENT text above. Keep your answer grounded in the facts of the DOCUMENT. If the DOCUMENT doesn't contain the facts to answer the QUESTION return {{NONE}}"}
        # ]

        # #  Prompt 4: Template for Extractive Answering  [cite: 25, 26, 27]
        # messages = [
        #     {"role": "user", "content": f"Extract the most relevant passage from the retrieved documents {top_k_string} that answers the query {query}. Return only the exact text from {top_k_string} without modification."}
        # ]

        # #  Prompt 5: Template Incorporating Chain-of-Thought (CoT)  [cite: 27, 28, 29]
        # messages = [
        #     {"role": "user", "content": f"Based on the retrieved context: {top_k_string}, answer the question {query} step by step, first identifying key facts, then reasoning through the answer."}
        # ]

        # #  Prompt 6: More Concise Template  [cite: 29, 30]
        # messages = [
        #     {"role": "user", "content": f"Context: {top_k_string} \n Question: {query} \n Answer the question using only the context provided."}
        # ]

        # # Prompt 7: Template with Specific Formatting Instructions  [cite: 30, 31, 32]
        # messages = [
        #     {"role": "user", "content": f"Context information is below. \n --------------------- \n {top_k_string} \n --------------------- \n Given the context information and not prior knowledge, answer the query: {query} \n Answer:"}
        # ]

        # # Prompt 8: Query-Focused Template [cite: 2, 3]
        # messages = [
        #     {"role": "system", "content": "I want you to answer the question based on the retrieved context below."},
        #     {"role": "user", "content": f"Context: {top_k_string} \n Question: {query} \n Answer:"}
        # ]

        # # Prompt 9: Multi-Vector Retrieval Template [cite: 7, 8, 9]
        # messages = [
        #     {"role": "system", "content": "I need you to answer a question based on the following context information. If the information needed is not available in the context, please state that."},
        #     {"role": "user", "content": f"Context: {top_k_string} \n Question: {query} \n Answer (be concise and extract relevant information from the context):"}
        # ]

        # Prompt 10: FLARE Template [cite: 9, 10, 11, 12]
        messages = [
            {"role": "system", "content": "Answer the question based on the context below. If you need more information that's not in the context, indicate this in your response."},
            {"role": "user", "content": f"Context: {top_k_string} \n Question: {query} \n If you need additional information, please specify what you need to know. \n Answer:"}
        ]

        # Get answer from LLM
        response = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            seed=42,
            temperature=0.7,
        )

        # The content field is typed as optional in the OpenAI SDK; treat a missing
        # answer as empty text instead of failing on None.strip().
        raw_response = (response.choices[0].message.content or "").strip()

        # Check if correct chunk is among top-k (ids are normalized on both sides so
        # case, accents and invisible characters do not cause a false mismatch)
        normalized_chunk_id = normalize_string(chunk_id)
        normalized_top_k_chunk_ids = [normalize_string(a) for a in top_k_chunk_ids]
        correct_chunk_found = normalized_chunk_id in normalized_top_k_chunk_ids

        if correct_chunk_found:
            correct_answers_count += 1

        # Write to file
        f.write(f"Question {i} (Original Chunk ID: {chunk_id})\n")
        f.write(f"Top-k Chunks Used: {top_k_chunk_ids}\n")
        f.write(f"Question: {query}\n")
        f.write(f"Answer: {raw_response}\n\n")
    # Written once after all questions: number of retrieval hits, not graded answers
    f.write(f"Total Correct Answers: {correct_answers_count}\n\n")
