In [None]:
from openai import OpenAI

In [None]:
# Connection parameters (only has to be executed once during a session)
import os

# Note: `localhost` is only valid as long as you are logged in on madagaskarweihe.
# Otherwise, you will need to perform port forwarding on your machine using `ssh -fNL 8006:127.0.0.1:8006 ashousaa@madagaskarweihe`.
# This might require an ssh key to be uploaded to madagaskarweihe first.
#
# The endpoint and the key can be overridden with the environment variables
# VLLM_API_ENDPOINT / VLLM_KEY; the literals below are only the fallbacks used
# when the corresponding variable is not set.
VLLM_API_ENDPOINT = os.environ.get("VLLM_API_ENDPOINT", 'http://localhost:8006/v1')
# SECURITY: this key is stored in the notebook in plain text. Prefer exporting VLLM_KEY
# in the shell (and rotating this value) so the secret does not have to live in the file.
VLLM_KEY = os.environ.get("VLLM_KEY", 's7Vtzeyq3kfhVkPhlWdL95pPRBq36KDP1d5bBj54BqQ')
# Model identifier sent with every chat-completion request.
MODEL = "default"

In [None]:
# Open the connection to the LLM server (only has to be executed once during a session).
# The endpoint and key come from the connection-parameter cell.
client = OpenAI(base_url=VLLM_API_ENDPOINT, api_key=VLLM_KEY)

In [None]:

# Chunk documents, save results to a dictionary where each chunk has an id and save the chunks to a CSV file
import fitz  # PyMuPDF 
import csv
def chunk_pdf_by_marker(pdf_path, marker="#"):
    """Split the text of a PDF into chunks at every occurrence of ``marker``.

    The text of all pages is concatenated in page order, split on ``marker``,
    and each piece is stripped of surrounding whitespace. Pieces that are empty
    after stripping are dropped before the chunks are numbered.

    Args:
        pdf_path: Path of the PDF file; passed to ``fitz.open``.
        marker: Separator string the concatenated text is split on.

    Returns:
        dict mapping ``"chunk_1"``, ``"chunk_2"``, ... (in document order) to
        the corresponding chunk text.
    """
    # The context manager closes the document even if text extraction raises;
    # previously the document was opened and never closed.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        full_text = "".join(page.get_text() for page in doc)

    stripped_pieces = (piece.strip() for piece in full_text.split(marker))
    chunks = [piece for piece in stripped_pieces if piece]

    # Keys are 1-based so that "chunk_1" is the first chunk of the document.
    return {f"chunk_{number}": chunk for number, chunk in enumerate(chunks, start=1)}


# Example usage
pdf_path = "Doc 4 flat.pdf"
chunk_dict = chunk_pdf_by_marker(pdf_path)

# Print the first 3 chunks entirely.
# The slice used to be [:4], which printed four chunks although the banner announces three.
print("--- First 3 Chunks ---")
for key, value in list(chunk_dict.items())[:3]:  # preview first 3 chunks
    print(f"{key}:\n{value}\n{'='*30}\n")

# Also save chunks with ids as a csv file
def save_chunks_to_csv(chunks, output_file="chunks.csv"):
    """Write ``chunks`` to ``output_file`` as a two-column CSV.

    The file starts with the header ``chunk_id,chunk_text``. Every chunk becomes
    one row whose id is ``chunk_<n>`` (``n`` counts from 1 in iteration order),
    and each chunk row is followed by an empty row that acts as a separator.

    Args:
        chunks: Iterable of chunk texts.
        output_file: Destination path; the file is opened in write mode, so an
            existing file is overwritten.
    """
    with open(output_file, mode="w", encoding="utf-8", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(["chunk_id", "chunk_text"])

        for number, text in enumerate(chunks, start=1):
            # The data row and its trailing blank separator row are written together.
            out.writerows([[f"chunk_{number}", text], []])

# Persist the chunk texts from chunk_dict; save_chunks_to_csv numbers the rows by position.
save_chunks_to_csv([*chunk_dict.values()])

In [None]:

# Generate questions using the LLM and save to a text file named "response.txt"
# Output format: one "<question> || <chunk id>" line per question, with a blank line
# between the questions of different chunks.
# The explicit UTF-8 encoding keeps non-ASCII model output from failing (or being
# written in a platform-dependent encoding) when the default encoding is not UTF-8.
with open("response.txt", "w", encoding="utf-8") as f:
    # The loop variable is `chunk_id` rather than `id` so the builtin id() is not shadowed.
    for chunk_id, doc in chunk_dict.items():
        # define prompts
        messages = [
            {"role": "system", "content": "You are about to go on a bussiness trip and want to ask very precise questions. Only output the question, no additional information."},
            # {"role": "user", "content": f"Imagine you are planning a business trip. What five questions would you ask after reading this document? {doc} Do not include the question if it is not stated in the chunk" },
            # {"role": "user", "content": f"Generate a list of natural-sounding questions that a traveling employee would have. The questions should reflect a genuine need to understand this information for their reimbursement claim.Only generate questions for which the corresponding answer is explicitly present here.{doc}"}
            {"role": "user", "content": f"What specific questions would I, as a traveling employee, ask that could be answered *solely by the information provided in this chunk*? Only generate questions for which the corresponding answer is explicitly present here.{doc}"}
            # {"role": "user", "content": f"As a traveling employee, what are some key, natural questions I would have to understand my rights, responsibilities, and potential reimbursements related to this specific information? Please generate questions that reflect a genuine need for clarity on how this impacts my expense report and reimbursement. Ensure the answer to each question is explicitly stated within this chunk, and format each question on a new line, numbered as in the examples you provided {doc}"}
        ]

        # send prompts and wait for answer
        response = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            seed=42,
            temperature=0.7,
        )

        # Get and clean the response; `content` may be None, which is treated as an empty answer.
        raw_response = (response.choices[0].message.content or "").strip()

        # Split the response into individual questions, removing list numbering / bullets
        # around each line. The emptiness check happens AFTER that removal: a line that
        # consists only of numbering or punctuation (e.g. "1." or "---") used to be
        # written to the file as an empty question.
        questions = []
        for line in raw_response.split("\n"):
            question = line.strip().strip("0123456789).:- ").strip()
            if question:
                questions.append(question)

        # Write each question on a new line with the chunk ID
        for question in questions:
            f.write(f"{question} || {chunk_id}\n")

        # Add a blank line to separate questions from different chunks
        f.write("\n")

Code to iterate over models:

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed ,pipeline

# def load_model(model_name):
#     tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=MODEL_CACHE_DIR)
#     model = AutoModelForCausalLM.from_pretrained(
#         model_name,
#         torch_dtype="auto",
#         cache_dir=MODEL_CACHE_DIR
#     ).to("cuda:0")
#     return model, tokenizer

In [None]:
# def generate(system_prompt: str, user_prompt: str, model, tokenizer, temperature: float = 0.7, seed: int = 42, enable_thinking=False) -> str:
#     set_seed(seed=seed)
#     messages = [
#         {"role": "system", "content": system_prompt},
#         {"role": "user", "content": user_prompt}
#     ]
#     text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=enable_thinking)
#     model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
#     generated_ids = model.generate(**model_inputs, max_new_tokens=32768)
#     output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
#     content = tokenizer.decode(output_ids, skip_special_tokens=True)
#     return content


In [None]:
# import csv
# import gc
# import torch


# k = 10
# prompt_id = "v1"

# # Models to iterate over
# model_names = [
#     "Qwen/Qwen3-0.6B",
#     "Qwen/Qwen3-1.7B",
#     "Qwen/Qwen3-4B",
#     "Qwen/Qwen3-8B",
# ]

# # Load test set
# with open("modified_final_test_set.csv", "r", encoding="utf-8") as f:
#     reader = csv.DictReader(f)
#     full_data = list(reader)

# queries_chunk_map = [
#     {
#         "chunk_id": row["Chunk"],
#         "question": row["Question"],
#         "reference": row["Answer"]
#     }
#     for row in full_data
# ]

# # Output files
# output_txt_path = "RAG_Output_Answers_ALL_MODELS.txt"
# output_csv_path = "RAG_Output_Answers_ALL_MODELS.csv"

# with open(output_txt_path, "w", encoding="utf-8") as txt_file, \
#      open(output_csv_path, mode='w', newline='', encoding='utf-8') as csv_file:

#     fieldnames = [
#         "Model Used", "Question Index", "Question", "Original Chunk",
#         "Chunks Retrieved", "Answer", "Reference Answer", "Prompt Used"
#     ]
#     writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
#     writer.writeheader()

#     for model_name in model_names:
#         print(f"\n🔄 Loading model: {model_name}")
#         model, tokenizer = load_model(model_name)

#         correct_answers_count = 0

#         for i, item in enumerate(queries_chunk_map, start=1):
#             chunk_id = item["chunk_id"]
#             query = item["question"]
#             reference_answer = item["reference"]

#             top_k_chunk_ids = get_top_matching_chunks(query, k)
#             result = get_CSVcolumn('/mount/arbeitsdaten/studenten4/ashousaa/chunks.csv', 'chunk_text')
#             top_k_documents = [result[int(c_id.split("_")[1]) - 1] for c_id in top_k_chunk_ids]
#             top_k_string = "\n".join(top_k_documents)

#             prompt_text = f"Context: {top_k_string} \n Question: {query} \n Answer the question using only the context provided."

#             raw_response = generate(
#                 system_prompt="Please answer the user question in a faithful way.",
#                 user_prompt=prompt_text,
#                 model=model,
#                 tokenizer=tokenizer,
#                 temperature=0.7,
#                 seed=42
#             ).strip()

#             # Evaluate if correct chunk was found
#             normalized_chunk_id = normalize_string(chunk_id)
#             normalized_top_k_chunk_ids = [normalize_string(a) for a in top_k_chunk_ids]
#             if normalized_chunk_id in normalized_top_k_chunk_ids:
#                 correct_answers_count += 1

#             # Write to text file
#             txt_file.write(f"[{model_name}] Question {i} (Original Chunk ID: {chunk_id})\n")
#             txt_file.write(f"Question: {query}\n")
#             txt_file.write(f"Top-k Chunks Used: {top_k_chunk_ids}\n")
#             txt_file.write(f"Answer: {raw_response}\n\n")

#             # Write to CSV
#             writer.writerow({
#                 "Model Used": model_name,
#                 "Question Index": i,
#                 "Question": query,
#                 "Original Chunk": chunk_id,
#                 "Chunks Retrieved": "; ".join(top_k_chunk_ids),
#                 "Answer": raw_response,
#                 "Reference Answer": reference_answer,
#                 "Prompt Used": prompt_id
#             })

#         print(f"✅ Finished {model_name}: {correct_answers_count}/{len(queries_chunk_map)} correct chunks")

#         # 🧹 Free memory
#         del model
#         del tokenizer
#         gc.collect()
#         torch.cuda.empty_cache()


Code to Generate Model By Model

In [None]:
from common_methods import get_CSVcolumn, normalize_string, get_top_matching_chunks


In [None]:
import os
# Directory handed to the tokenizer/model loaders as `cache_dir`.
MODEL_CACHE_DIR = '/mount/arbeitsdaten/asr-2/vaethdk/resources/weights/llm/students'
# Expose the same location through the TRANSFORMERS_CACHE environment variable.
os.environ.update(TRANSFORMERS_CACHE=MODEL_CACHE_DIR)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed ,pipeline

# `model_name` is a repository id on huggingface; the tokenizer, the weights and the
# configuration are loaded from there, e.g.: https://huggingface.co/Qwen/Qwen3-0.6B
# To switch models, move the comment marker to a different line.
model_name = "Qwen/Qwen3-0.6B"
# model_name = "Qwen/Qwen3-1.7B"
# model_name = "Qwen/Qwen3-4B"
# model_name = "Qwen/Qwen3-8B"


# load the tokenizer and the model (you can leave this block unchanged)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=MODEL_CACHE_DIR)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", cache_dir=MODEL_CACHE_DIR)
# Move the loaded model to the GPU device "cuda:0".
model = model.to("cuda:0")

In [None]:
def generate(system_prompt: str, user_prompt: str, temperature: float = 0.7, seed: int = 42, enable_thinking=False) -> str:
    """Run one system + user prompt pair through the notebook-level ``model``.

    Steps:
      1. convert the prompts into the chat template format
      2. tokenize the prompt text and convert the tokens to token ids
      3. run the token ids through the model, generating output token ids
      4. convert the output token ids back to text

    Args:
        system_prompt: Content of the ``system`` message.
        user_prompt: Content of the ``user`` message.
        temperature: Sampling temperature. A value > 0 enables sampling at that
            temperature; 0 (or a negative value) selects greedy decoding.
        seed: Random seed set before generation for reproducibility (otherwise,
            calling generation twice can result in different texts).
        enable_thinking: Forwarded unchanged to ``tokenizer.apply_chat_template``.

    Returns:
        The decoded text of the newly generated tokens only (the prompt tokens
        are cut off; special tokens are skipped).
    """
    set_seed(seed=seed)

    # convert the prompts into the correct template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=enable_thinking,
    )
    # convert prompt texts to tokens, move the token ids to the model's device
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # `temperature` used to be accepted but never passed on, so the value given by
    # callers had no effect on generation. It is now forwarded explicitly. Sampling
    # with a temperature of 0 is not meaningful, so that case uses greedy decoding.
    if temperature > 0:
        sampling_kwargs = {"do_sample": True, "temperature": temperature}
    else:
        sampling_kwargs = {"do_sample": False}

    # generate the output token ids
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=32768,
        **sampling_kwargs,
    )
    # The returned sequence starts with the prompt ids; keep only what was generated.
    prompt_length = len(model_inputs.input_ids[0])
    output_ids = generated_ids[0][prompt_length:].tolist()
    # convert the output token ids to text
    return tokenizer.decode(output_ids, skip_special_tokens=True)

In [None]:
import csv

# Number of chunks retrieved per question
k = 10
# Counts the questions whose original chunk id appears among the retrieved top-k ids
correct_answers_count = 0

prompt_id = "v1_Basic_RAG_Prompt"
encoding_used="mpnet"
model_shortname = model_name.split("/")[-1]  # Drop the "<org>/" prefix: a slash inside the file name gives an error


# === Load test set ===
with open("modified_final_test_set.csv", "r", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    full_data = list(reader)


# One record per test question, with the CSV columns renamed to the keys used below
queries_chunk_map = [
    {
        "chunk_id": row["Chunk"],
        "question": row["Question"],
        "reference": row["Answer"],
        "type": row["Type"],
        "source_QID": row["Source_QID"]
    }

    for row in full_data
]

# === Output files ===
txt_path = f"RAG_Output_Answers_{model_shortname}.txt"
csv_path = f"RAG_Output_Answers_{model_shortname}.csv"

# Load chunk texts once. This call used to sit inside the question loop although
# neither argument changes between iterations.
# NOTE(review): assumes get_CSVcolumn only reads the file and has no per-call state — confirm.
csv_file_path = '/mount/arbeitsdaten/studenten4/ashousaa/chunks.csv'
result = get_CSVcolumn(csv_file_path, 'chunk_text')

with open(txt_path, "w", encoding="utf-8") as txt_file, open(csv_path, mode='w', newline='', encoding='utf-8') as csv_file:
    # "Original Chunk" must be listed here: every row written below contains that key,
    # and csv.DictWriter raises ValueError for keys missing from `fieldnames`
    # (its default is extrasaction='raise'), which aborted the run on the first row.
    fieldnames = [
        "Generation Model", "Question Index", "Question", "Type", "Source_QID",
        "Original Chunk", "Chunks Retrieved", "Generated Answer", "Reference Answer",
        "Generation Prompt Used", "Encoding Used"
    ]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    for i, item in enumerate(queries_chunk_map, start=1):
        chunk_id = item["chunk_id"]
        query = item["question"]
        reference_answer = item["reference"]

        # Get top-k chunks
        top_k_chunk_ids = get_top_matching_chunks(query, k)

        # Map ids of the form "<prefix>_<n>" to the (n-1)-th entry of the chunk_text column
        top_k_documents = [result[int(c_id.split("_")[1]) - 1] for c_id in top_k_chunk_ids]

        # Build prompt
        top_k_string = "\n".join(top_k_documents)
        prompt_text = f"Context: {top_k_string} \n Question: {query} \n Answer the question using only the context provided."

        # Generate response
        raw_response = generate(
            system_prompt="Please answer the user question in a faithful way.",
            user_prompt=prompt_text,
            temperature=0.7,
            seed=42
        ).strip()

        # Correct chunk detection: compare normalized ids so formatting differences do not matter
        normalized_chunk_id = normalize_string(chunk_id)
        normalized_top_k_chunk_ids = [normalize_string(a) for a in top_k_chunk_ids]
        correct_chunk_found = normalized_chunk_id in normalized_top_k_chunk_ids
        if correct_chunk_found:
            correct_answers_count += 1

        # Write to TXT
        txt_file.write(f"Question {i} (Original Chunk ID: {chunk_id})\n")
        txt_file.write(f"Question: {query}\n")
        txt_file.write(f"Top-k Chunks Used: {top_k_chunk_ids}\n")
        txt_file.write(f"Answer: {raw_response}\n\n")

        # Write to CSV
        writer.writerow({
            "Generation Model": model_name,
            "Question Index": i,
            "Question": query,
            "Type": item["type"],
            "Source_QID": item["source_QID"],
            "Original Chunk": chunk_id,
            "Chunks Retrieved": "; ".join(top_k_chunk_ids),
            "Generated Answer": raw_response,
            "Reference Answer": reference_answer,
            "Generation Prompt Used": prompt_id,
            "Encoding Used": encoding_used
        })

    # The value written here is the retrieval hit count (original chunk found in the top-k),
    # not a judgement of the generated answers.
    txt_file.write(f"Total Correct Answers: {correct_answers_count}\n")
