In [None]:
import re
import csv
from docx import Document
from collections import defaultdict

# === Step 1: Load and parse the DOCX file ===
def parse_docx(docx_path):
    """Extract question/answer/chunk records from a DOCX file.

    Each non-empty paragraph is classified as one of:

    * a question  - it matches the question pattern or ends with "?";
    * a chunk tag - it starts with "chunk_" (case-insensitive);
    * answer text - anything else; it is attached to the most recent question.

    Args:
        docx_path: Path of the DOCX file to read.

    Returns:
        A list of dicts with the keys "question", "answer" and "chunk",
        in document order. "answer" and "chunk" are empty strings when the
        document provides none for that question.
    """
    question_re = re.compile(r"^\d{0,2}\)?\s?[\w\s,?'\"]+:?$")
    embedded_chunk_re = re.compile(r"chunk_\d+", re.IGNORECASE)
    chunk_removal_re = re.compile(r"\s*chunk_\d+", re.IGNORECASE)

    doc = Document(docx_path)
    qa_data = []
    pending = []  # Answer paragraphs collected for the most recent question.

    for para in doc.paragraphs:
        text = para.text.strip()
        if not text:
            continue
        if question_re.match(text) or text.endswith("?"):
            # A new question closes the previous one. Text seen before the
            # first question has no owner, so it is dropped instead of
            # indexing into an empty list.
            if pending and qa_data:
                qa_data[-1]["answer"] = " ".join(pending)
            pending = []
            qa_data.append({"question": text, "answer": "", "chunk": ""})
        elif text.lower().startswith("chunk_"):
            # A chunk tag on its own line belongs to the current question.
            if qa_data:
                qa_data[-1]["chunk"] = text
        else:
            pending.append(text)

    if pending and qa_data:
        qa_data[-1]["answer"] = " ".join(pending)

    # A chunk reference embedded in the answer text takes precedence over a
    # standalone chunk line and is removed from the answer.
    for entry in qa_data:
        match = embedded_chunk_re.search(entry["answer"])
        if match:
            entry["chunk"] = match.group(0)
            entry["answer"] = chunk_removal_re.sub("", entry["answer"]).strip()
    return qa_data

# === Step 2: Parse the paraphrased questions ===
def parse_paraphrases(txt_path):
    """Read paraphrased questions from a text file, grouped by question ID.

    A line of the form "Q<number> Paraphrased: <text>" starts a new group;
    any text on that line is stored as the group's first paraphrase. Every
    following non-empty line is stored under the same ID, with leading and
    trailing asterisks and spaces removed when the line starts with "*".
    Lines that appear before the first header are ignored.

    Args:
        txt_path: Path of the UTF-8 text file to read.

    Returns:
        A defaultdict(list) mapping the integer question ID to its
        paraphrases in file order.
    """
    header_re = re.compile(r"Q(\d+)\s+Paraphrased: *(.*)")
    by_question = defaultdict(list)
    active_id = None

    with open(txt_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            entry = raw.strip()
            if not entry:
                continue

            header = header_re.match(entry)
            if header is not None:
                active_id = int(header.group(1))
                inline = header.group(2).strip(" :")
                if inline:
                    by_question[active_id].append(inline)
                continue

            # Nothing to attach to until a header has been seen.
            if active_id is None:
                continue

            if entry.startswith("*"):
                entry = entry.strip("* ").strip()
            by_question[active_id].append(entry)

    return by_question

# === Step 3: Create the test set and write to CSV ===
def create_test_set_csv(qa_data, paraphrased_map, csv_path, max_questions=75):
    """Write original and paraphrased questions to a CSV test set.

    For the question at zero-based position i, the original is written as
    row number 2*i+1. If the map holds at least one paraphrase under the
    one-based ID i+1, the first paraphrase is written as row number 2*i+2
    with the same answer and chunk. When no paraphrase exists, the even
    number is simply skipped.

    Args:
        qa_data: Sequence of dicts with "question", "answer" and "chunk".
        paraphrased_map: Mapping from one-based question ID to a list of
            paraphrased questions.
        csv_path: Destination path of the CSV file (overwritten).
        max_questions: Maximum number of entries taken from the start of
            qa_data. Defaults to 75; pass None to use every entry.
    """
    with open(csv_path, mode="w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Number", "Question", "Answer", "Chunk"])

        # Slicing with None keeps the whole sequence.
        for i, qa in enumerate(qa_data[:max_questions]):
            writer.writerow([2 * i + 1, qa["question"], qa["answer"], qa["chunk"]])
            para_qs = paraphrased_map.get(i + 1)
            if para_qs:
                writer.writerow([2 * i + 2, para_qs[0], qa["answer"], qa["chunk"]])
    print(f"Test set saved to: {csv_path}")

# === Main function ===
def main():
    """Build the CSV test set from the manual QA document and paraphrase file.

    Parses the DOCX questions and answers, parses the paraphrased
    questions, and writes the combined rows to the output CSV. All three
    paths are fixed and relative to the current working directory.
    """
    source_doc = "Manual QA.docx"
    paraphrase_file = "paraphrased_questions.txt"
    destination = "final_test_set.csv"

    # Arguments are evaluated left to right, so the DOCX is parsed first.
    create_test_set_csv(
        parse_docx(source_doc),
        parse_paraphrases(paraphrase_file),
        destination,
    )

# Run the pipeline only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


Test set saved to: final_test_set.csv
