In [9]:
import os
import warnings

from openai import OpenAI

In [None]:
# Connection parameters (only has to be executed once during a session)

# Note: `localhost` is only valid as long as you are logged in on madagaskarweihe. 
# Otherwise, you will need to perform port forwarding on your machine using `ssh -fNL 8006:127.0.0.1:8006 ashousaa@madagaskarweihe`.
# This might require an ssh key to be uploaded to madagaskarweihe first.

# vLLM is a software framework designed to run Large Language Models (LLMs) efficiently.
# The vLLM endpoint is the network address (like a website URL) of the server where vLLM is installed; your program uses it to communicate with the vLLM server.
# The vLLM key is a secret used to authenticate your program and grant it permission to access the LLMs managed by vLLM.
# The chosen model is the default one.

# Endpoint of the vLLM server. It can be overridden with the VLLM_API_ENDPOINT
# environment variable; the default targets the local / port-forwarded server.
VLLM_API_ENDPOINT = os.environ.get("VLLM_API_ENDPOINT", 'http://localhost:8006/v1')

# The API key is a credential and must not be stored in the notebook.
# Export it before starting Jupyter (e.g. `export VLLM_KEY=...`) or set it with `%env`.
# NOTE(review): the key that used to be hardcoded here has been exposed in the
# notebook history and should be rotated on the server.
VLLM_KEY = os.environ.get("VLLM_KEY", "")
if not VLLM_KEY:
    warnings.warn(
        "VLLM_KEY environment variable is not set; requests to the vLLM server "
        "will be sent without a valid API key.",
        stacklevel=2,
    )

# Name of the model to request from the server.
MODEL = "default"

In [None]:
# Connection to the LLM server (only has to be executed once during a session).
# Builds the client object that talks to the server at VLLM_API_ENDPOINT and
# authenticates with VLLM_KEY. Its methods are used afterwards to send prompts
# to the language model and to receive the generated responses.
client = OpenAI(
    base_url=VLLM_API_ENDPOINT,
    api_key=VLLM_KEY,
)

In [None]:

# Split questions by ?
import docx # Imports the python-docx library, which allows Python to interact with Microsoft Word .docx files.
import csv

def split_questions_by_marker(doc_path, marker="?"):
    """Extract the questions contained in a .docx file.

    The text of all paragraphs in ``doc.paragraphs`` is joined (one newline
    after each paragraph) and cut at every occurrence of ``marker``. Each
    non-empty segment that was terminated by ``marker`` is returned with the
    marker re-attached. A question may therefore span several paragraphs.

    Text that follows the last marker (e.g. a closing remark) is not
    terminated by the marker and is therefore not returned as a question.

    NOTE(review): only ``doc.paragraphs`` is read; text located elsewhere in
    the document (e.g. inside tables) is presumably not included - confirm
    against the python-docx documentation.

    Args:
        doc_path: Path of the .docx file, passed to ``docx.Document``.
        marker: String that ends a question. Must be non-empty, otherwise
            ``str.split`` raises ``ValueError``.

    Returns:
        list[str]: The questions in document order, each ending with ``marker``.
    """
    doc = docx.Document(doc_path)

    # Join once instead of growing a string paragraph by paragraph.
    full_text = "".join(paragraph.text + "\n" for paragraph in doc.paragraphs)

    # str.split removes the marker itself. Every part except the last one was
    # followed by a marker in the text; the last part is whatever comes after
    # the final marker and must not be turned into a question.
    *terminated_parts, _unterminated_tail = full_text.split(marker)

    questions = []
    for part in terminated_parts:
        cleaned = part.strip()  # Remove leading/trailing whitespace
        if cleaned:  # Skips empty segments, e.g. between "??"
            questions.append(cleaned + marker)  # Put the marker back at the end

    return questions

# Load the questions from the Word document.
# Note: the file name really contains a space before the ".docx" extension.
doc_path = "Manual Ques (after modification) .docx"
questions = split_questions_by_marker(doc_path=doc_path)

# # Save questions with ids as a csv file
# def save_chunks_to_csv(questions, output_file="original_questions.csv"):
#     with open(output_file, mode="w", encoding="utf-8", newline="") as file:
#         writer = csv.writer(file)
#         writer.writerow(["question_id", "question_text"])  # Write header

#         for i, question in enumerate(questions):
#             question_id = f"question_{i + 1}"
#             writer.writerow([question_id, question])
#             writer.writerow([]) # Empty row to create a line break

# # Call the save_chunks_to_csv function with the extracted questions
# save_chunks_to_csv(questions=questions, output_file="original_questions.csv")

In [None]:
# # Prompt LLM To Paraphrase Questions
# with open("paraphrased_questions.txt", "w", encoding="utf-8") as f:
#     for i, question in enumerate(questions):

#         messages = [
#             {"role": "system", "content": "You are a helpful assistant that paraphrases questions. Do NOT repeat the question exactly. Do NOT add new information."},
#             {"role": "user", "content": f"Paraphrase the following question: {question}"}
#         ]
        
#         # Send prompts and wait for answer
#         response = client.chat.completions.create(
#             model=MODEL,
#             messages=messages,
#             seed=42,#  An initial value that initializes the random number generator in a generative model. For reproducibility you can set it to a fixed value.
#             temperature=0.7, # A parameter that controls the randomness and creativity of a generative model's output.
#         )

#         # Extract and clean the LLM's response
#         paraphrased = response.choices[0].message.content.strip() 

#         # Save paraphrased question
#         f.write(f"Q{i+1} Paraphrased: {paraphrased}\n\n")


# Prompt LLM To Paraphrase Questions

# The system message is identical for every question, so it is built only once.
PARAPHRASE_SYSTEM_MESSAGE = {
    "role": "system",
    "content": "You are a helpful assistant that paraphrases questions. Do NOT repeat the question exactly. Do NOT add new information.",
}


def _extract_paraphrases(content):
    """Return the paraphrase lines contained in one LLM reply.

    Args:
        content: Text of the reply, or None when the server returned a
            message without text content.

    Returns:
        list[str]: One entry per kept line, in order. Blank lines are
        skipped, a leading bullet character ("*", "-" or "•") is removed,
        lines that consist of a bullet only are skipped, and non-bullet
        lines starting with "here are" (case insensitive) are dropped
        because they are introductions, not paraphrases.
    """
    paraphrases = []
    # `content` may be None; treat that like an empty reply instead of crashing.
    for raw_line in (content or "").splitlines():
        line = raw_line.strip()  # Remove leading/trailing whitespace
        if not line:
            continue

        if line.startswith(("*", "-", "•")):
            # Bullet point: drop the bullet character and keep the text.
            line = line[1:].strip()
            if line:  # A bare bullet would otherwise produce an empty entry
                paraphrases.append(line)
        elif not line.lower().startswith("here are"):
            # Keep plain lines unless they are an intro like "Here are a few ways..."
            paraphrases.append(line)
    return paraphrases


with open("modified_paraphrased_questions.txt", "w", encoding="utf-8") as f:
    for question_number, question in enumerate(questions, start=1):
        messages = [
            PARAPHRASE_SYSTEM_MESSAGE,
            {"role": "user", "content": f"Paraphrase the following question: {question}"},
        ]

        # Send the prompt and wait for the answer
        response = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            seed=42,  # Fixed seed for the model's random number generator, for reproducibility.
            temperature=0.7,  # Controls the randomness and creativity of the model's output.
        )

        # LLM response object -> first choice -> message content (can be None)
        content = response.choices[0].message.content

        for paraphrase in _extract_paraphrases(content):
            f.write(f"Q{question_number} Paraphrased: {paraphrase}\n")
        f.write("\n")  # Blank line between the paraphrases of different questions
