In [None]:
import os
import pandas as pd
import re

def extract_problems_and_solutions_final_fix_v5(file_path, chapter_number):
    """Parse one chapter text file into parallel lists of questions and answers.

    A question starts on a line beginning with ``<digits>.<1-2 digits>`` whose
    part after the dot is between 1 and 40; it is renumbered as
    ``<chapter_number>.<NN>``. An answer starts on a line beginning with
    ``Solution:``. Every other line is appended to whichever of the two is
    currently being collected. Only questions that received an answer are
    returned.

    Args:
        file_path: Path of the text file to parse.
        chapter_number: Chapter identifier used as the prefix of every
            question number (the chapter printed in the text is ignored).

    Returns:
        A ``(questions, answers)`` tuple of two equal-length lists of strings.
    """
    # Decode explicitly so the result does not depend on the platform's default
    # encoding; undecodable bytes are replaced instead of aborting the parse.
    with open(file_path, "r", encoding="utf-8", errors="replace") as file:
        content = file.read()

    # Remove noise data like "Solutions for Fundamentals of Modern Manufacturing..."
    content = re.sub(r'Solutions for Fundamentals of Modern Manufacturing.*?\d{2}-\d{2}-\d{2}', '', content, flags=re.DOTALL)

    # Initialize variables to store questions and answers
    parsed_questions = []
    parsed_answers = []
    question = ''
    answer = ''
    inside_question = False

    # Question numbers look like 3.1 or 23.14. The lookahead rejects longer
    # decimals such as 12.345, which would otherwise be read as question 34.
    question_number_pattern = re.compile(r'^\d+\.(\d{1,2})(?!\d)')

    # Loop through each line to extract questions and answers
    for line in content.splitlines():
        stripped_line = line.strip()

        match = question_number_pattern.match(stripped_line)
        # Values such as 12.0, 12.00 or 3.75 are measurements, not question
        # numbers; they fall through and are kept as ordinary text below.
        if match and 1 <= int(match.group(1)) <= 40:
            if question and answer:  # Store the previous complete pair
                parsed_questions.append(question.strip())
                parsed_answers.append(answer.strip())

            # Start the new question with the chapter number from the caller,
            # zero-padding the question index to two digits.
            question_text = stripped_line[match.end():].strip()
            question = f"{chapter_number}.{int(match.group(1)):02d}" + ' ' + question_text
            # Drop any stray text collected before this question so it cannot
            # be mistaken for this question's answer.
            answer = ''
            inside_question = True
        elif stripped_line.startswith('Solution:'):
            # Start capturing the answer
            inside_question = False
            answer = stripped_line
        elif inside_question:
            question += ' ' + stripped_line
        else:
            answer += ' ' + stripped_line

    # Capture the last question-answer pair if any
    if question and answer:
        parsed_questions.append(question.strip())
        parsed_answers.append(answer.strip())

    return parsed_questions, parsed_answers


# Reprocess the files, associating question numbers with their respective chapter numbers.
# These four lists are parallel columns: index i of each describes the same extracted question.
all_chapter_numbers_final_fix_v5 = []
all_question_numbers_final_fix_v5 = []
all_questions_final_fix_v5 = []
all_answers_final_fix_v5 = []

# Automatically fetch all text files in the folder.
# os.listdir returns entries in arbitrary order, so sort the names to keep the
# row order of the generated output reproducible between runs and machines.
directory_path_fix = './Math'
txt_files_final_fix = sorted(file for file in os.listdir(directory_path_fix) if file.endswith('.txt'))

# Helper function to get the chapter number from the filename
def get_chapter_number_from_filename_v3(filename):
    """Return the first run of digits found in *filename*, as a string.

    Raises IndexError when the name contains no digits.
    """
    digit_runs = re.findall(r'\d+', filename)
    return digit_runs[0]


# Walk every discovered text file, parse it, and append its rows to the column lists
for txt_file in txt_files_final_fix:
    file_path = os.path.join(directory_path_fix, txt_file)
    # The chapter number is the first run of digits in the file name
    chapter_number = get_chapter_number_from_filename_v3(txt_file)

    # Parse this file's questions and answers, numbered under its chapter
    (
        parsed_questions_final_fix_v5,
        parsed_answers_final_fix_v5,
    ) = extract_problems_and_solutions_final_fix_v5(file_path, chapter_number)

    # One chapter entry per parsed question keeps all four columns the same length
    all_chapter_numbers_final_fix_v5 += [chapter_number for _ in parsed_questions_final_fix_v5]
    # The question number is the first whitespace-separated token of each question
    all_question_numbers_final_fix_v5 += [
        question_text.split(maxsplit=1)[0] for question_text in parsed_questions_final_fix_v5
    ]
    all_questions_final_fix_v5 += parsed_questions_final_fix_v5
    all_answers_final_fix_v5 += parsed_answers_final_fix_v5

# Assemble the collected columns into one dataframe, chapter column first
df_combined_final_fix_v5 = pd.DataFrame(
    {
        "chapter": all_chapter_numbers_final_fix_v5,
        "question_number": all_question_numbers_final_fix_v5,
        "question": all_questions_final_fix_v5,
        "answer": all_answers_final_fix_v5,
    }
)

# Write everything to a single CSV file, without the dataframe index column
final_csv_path_final_fix_v5 = "Final_Processed_MathQA_Fixed_v5.csv"
df_combined_final_fix_v5.to_csv(final_csv_path_final_fix_v5, index=False)

final_csv_path_final_fix_v5
