In [1]:
import os
import re
import warnings

from openai import OpenAI

In [2]:
# Connection parameters (only has to be executed once during a session)

# Note: `localhost` is only valid as long as you are logged in on madagaskarweihe.
# Otherwise, you will need to perform port forwarding on your machine using `ssh -fNL 8006:127.0.0.1:8006 ashousaa@madagaskarweihe`.
# This might require an ssh key to be uploaded to madagaskarweihe first.
# The endpoint can be overridden with the VLLM_API_ENDPOINT environment variable.
VLLM_API_ENDPOINT = os.environ.get("VLLM_API_ENDPOINT", "http://localhost:8006/v1")

# SECURITY: the API key is a secret and must not be stored in the notebook.
# Export it before starting Jupyter (`export VLLM_KEY=...`). A key that was
# ever saved in plain text in a shared file should be treated as leaked and rotated.
VLLM_KEY = os.environ.get("VLLM_KEY", "")
if not VLLM_KEY:
    # Warn instead of raising so the cell still runs; the failure would
    # otherwise only surface later as an authentication error from the server.
    warnings.warn(
        "VLLM_KEY is not set; requests to the LLM server will be rejected "
        "if it requires an API key."
    )

MODEL = "default"

In [3]:
# Create the client for the LLM server from the connection parameters above
# (only has to be executed once during a session)
client = OpenAI(
    base_url=VLLM_API_ENDPOINT,
    api_key=VLLM_KEY,
)

In [4]:

# Chunk documents and save results to a dictionary where each chunk has an id
import fitz  # PyMuPDF
import csv
def chunk_pdf_by_marker(pdf_path, marker="#"):
    """Split the text of a PDF into chunks at every occurrence of ``marker``.

    The text of all pages is concatenated in page order and split on
    ``marker``. Each piece is stripped of surrounding whitespace, and pieces
    that are empty after stripping are dropped.

    Args:
        pdf_path: Path of the PDF file; passed to ``fitz.open``.
        marker: Separator string that delimits the chunks.

    Returns:
        dict mapping ``"chunk_1"``, ``"chunk_2"``, ... (1-based, in document
        order) to the corresponding chunk text.

    Raises:
        ValueError: If ``marker`` is an empty string.
    """
    # Reject an empty separator before touching the file; str.split would
    # raise the same exception type later with a less helpful message.
    if marker == "":
        raise ValueError("marker must be a non-empty string")

    # The context manager closes the document even if text extraction fails.
    with fitz.open(pdf_path) as doc:
        # Join once instead of growing a string page by page.
        full_text = "".join(page.get_text() for page in doc)

    stripped_pieces = (piece.strip() for piece in full_text.split(marker))
    non_empty_pieces = (piece for piece in stripped_pieces if piece)

    return {
        f"chunk_{number}": piece
        for number, piece in enumerate(non_empty_pieces, start=1)
    }


# Example usage
pdf_path = "Doc 4 flat.pdf"
chunk_dict = chunk_pdf_by_marker(pdf_path)

# Print the first PREVIEW_COUNT chunks entirely. The slice length and the
# header are derived from the same constant so they cannot disagree.
PREVIEW_COUNT = 3
print(f"--- First {PREVIEW_COUNT} Chunks ---")
for key, value in list(chunk_dict.items())[:PREVIEW_COUNT]:
    print(f"{key}:\n{value}\n{'='*30}\n")

# Also save chunks with ids as a csv file
def save_chunks_to_csv(chunks, output_file="chunks.csv"):
    """Write chunks to a CSV file with a ``chunk_id`` and a ``chunk_text`` column.

    Args:
        chunks: Either a dict mapping chunk ids to chunk texts (the ids are
            written as given), or any other iterable of chunk texts (ids are
            generated as ``chunk_1``, ``chunk_2``, ... by position).
        output_file: Path of the CSV file; an existing file is overwritten.

    The file is written UTF-8 encoded. After every chunk row an empty row is
    written so that chunks are visually separated in the file.
    """
    if isinstance(chunks, dict):
        # Keep the ids of the dict so the CSV cannot drift from the dictionary.
        rows = chunks.items()
    else:
        rows = ((f"chunk_{number}", text) for number, text in enumerate(chunks, start=1))

    # newline="" lets the csv module control line endings itself.
    with open(output_file, mode="w", encoding="utf-8", newline="") as fh:
        writer = csv.writer(fh)
        writer.writerow(["chunk_id", "chunk_text"])  # Header
        for chunk_id, chunk_text in rows:
            writer.writerow([chunk_id, chunk_text])
            writer.writerow([])  # Empty row to create a line break

# Write every chunk text of chunk_dict (in dictionary order) to the default CSV file
save_chunks_to_csv([*chunk_dict.values()])

--- First 3 Chunks ---
chunk_1:
English Travel Reimbursement Law Revised Version of the State Travel Expense Act 
Preliminary Page A. Objective The previous travel expense regulations are outdated and require 
updating and legal simplification to facilitate the conduct and administrative processing of official 
travel. In addition, with regard to mobility behavior, the requirements of climate protection shall 
be taken into account (the exemplary function of the state administration pursuant to §  of the 
Baden-Württemberg Climate Protection Act).

chunk_2:
Revised Version of the State Travel Expense Act 
Preliminary Page B. Essential Content A revision of the State Travel Expense Act resulting in a 
modern regulatory framework. The focal points are: . A new regulation for travel costs and 
mileage allowance. . . . . Adjustment of the reduction of the per diem allowance in the case of 
complimentary meals in line with tax law provisions, thereby eliminating the need to tax parts of 
th

In [18]:
# # Prompt LLM
# prompt_templates = [
#     "Imagine you are planning a business trip. What five questions would you ask after reading this policy?\n\n{doc}",
#     # "Generate five clarifying questions someone might have about this policy:\n\n{doc}"
# ]

# with open("response.txt", "w", encoding="utf-8") as f:
#     for chunk_id, doc in chunk_dict.items():
#         for i, template in enumerate(prompt_templates):
#             user_prompt = template.format(doc=doc)

#             messages = [
#                 {"role": "system", "content": "You are a helpful assistant that generates useful and realistic questions about travel reimbursement policies."},
#                 {"role": "user", "content": user_prompt}
#             ]

#             response = client.chat.completions.create(
#                 model=MODEL,
#                 messages=messages,
#                 temperature=0.7,
#                 seed=42
#             )

#             output = response.choices[0].message.content.strip()
#             f.write(f"Chunk: {chunk_id} | Template {i+1}\n{output}\n\n")

# Prompt LLM
# Matches list decoration at the start of a line: numbering such as "1.", "2)"
# or "3:" (not followed by another digit, so "1.5 km" stays intact) and
# bullets "-", "*", "•" that are followed by whitespace.
_LIST_PREFIX = re.compile(r"^\s*(?:(?:\d+[.):](?!\d)|[-*•](?=\s))\s*)+")


def _split_questions(raw_response):
    """Return the lines of ``raw_response`` as questions without list prefixes.

    Only leading numbering/bullets are removed; the end of each line is left
    untouched apart from whitespace. Lines without any letter or digit
    (blank lines, separators such as "---") are skipped.
    """
    questions = []
    for line in raw_response.splitlines():
        question = _LIST_PREFIX.sub("", line).strip()
        if any(char.isalnum() for char in question):
            questions.append(question)
    return questions


# The encoding is set explicitly so that non-ASCII characters in the model
# output are written the same way on every platform.
with open("response.txt", "w", encoding="utf-8") as f:
    for chunk_id, doc in chunk_dict.items():
        # define prompts
        messages = [
            {"role": "system", "content": "You are about to go on a bussiness trip and want to ask very precise questions. Only output the question, no additional information."},
            # {"role": "user", "content": f"Imagine you are planning a business trip. What five questions would you ask after reading this document? {doc} Do not include the question if it is not stated in the chunk" },  
            # {"role": "user", "content": f"Generate a list of natural-sounding questions that a traveling employee would have. The questions should reflect a genuine need to understand this information for their reimbursement claim.Only generate questions for which the corresponding answer is explicitly present here.{doc}"} 
            {"role": "user", "content": f"What specific questions would I, as a traveling employee, ask that could be answered *solely by the information provided in this chunk*? Only generate questions for which the corresponding answer is explicitly present here.{doc}"}
            # {"role": "user", "content": f"As a traveling employee, what are some key, natural questions I would have to understand my rights, responsibilities, and potential reimbursements related to this specific information? Please generate questions that reflect a genuine need for clarity on how this impacts my expense report and reimbursement. Ensure the answer to each question is explicitly stated within this chunk, and format each question on a new line, numbered as in the examples you provided {doc}"}  
        ]

        # send prompts and wait for answer
        response = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            seed=42,
            temperature=0.7,
        )

        # The message content can be None; treat that as an empty answer.
        raw_response = (response.choices[0].message.content or "").strip()

        # Split response into individual questions
        questions = _split_questions(raw_response)

        # Write each question on a new line with the chunk ID
        for question in questions:
            f.write(f"{question} || {chunk_id}\n")

        # Add a blank line to separate questions from different chunks
        f.write("\n")