## Add questions from files in new_files/ to bank.xml. 
### It makes rudimentary checks for whether a question is already there, and assigns each new question a unique question_number.

In [None]:
import xml.etree.ElementTree as ET
import os

def add_questions_from_directory(bank_file, directory='new_files'):
    """Merge new questions from the XML files in *directory* into *bank_file*.

    Every ``*.xml`` file in *directory* is parsed, in sorted filename order so
    that numbering is reproducible. Top-level ``<question>`` elements are
    appended to the bank root. ``<question>`` elements inside a top-level
    ``<question_group>`` are appended to a new group in the bank, which also
    receives the source group's attributes and its non-question children
    (``<text>`` and anything else present, e.g. a group-level ``<topic>``).

    A question counts as a duplicate when its stripped ``<text>`` equals that
    of a question already in the bank, including questions added earlier in
    the same run. Questions whose ``<text>`` is missing or empty cannot be
    compared, so they are skipped with a printed notice. A group whose
    questions are all skipped is not added at all.

    Each added question gets a ``<question_number>`` one higher than the
    largest numeric ``<question_number>`` seen so far. If the incoming
    question already has a ``<question_number>`` child, it is overwritten
    rather than duplicated.

    Args:
        bank_file: Path of the bank XML file. It is rewritten in place as
            UTF-8 with an XML declaration.
        directory: Directory that is scanned for ``.xml`` files.
    """
    bank_tree = ET.parse(bank_file)
    bank_root = bank_tree.getroot()

    def text_key(question_elem):
        """Return the stripped question text, or None if missing or empty."""
        text_elem = question_elem.find('text')
        if text_elem is None or text_elem.text is None:
            return None
        return text_elem.text.strip() or None

    # One pass over the bank collects both the known question texts (for O(1)
    # duplicate checks) and the highest question number in use.
    known_texts = set()
    highest_number = 0
    for question_elem in bank_root.findall('.//question'):
        key = text_key(question_elem)
        if key is not None:
            known_texts.add(key)
        # findtext() yields None for a missing element and '' for an empty one.
        number_text = (question_elem.findtext('question_number') or '').strip()
        if number_text.isdecimal():
            highest_number = max(highest_number, int(number_text))
    next_question_number = highest_number + 1

    def claim(question_elem, source):
        """Number *question_elem* if it is new; return whether it was accepted."""
        nonlocal next_question_number
        key = text_key(question_elem)
        if key is None:
            print(f"Skipping a question without text in {source}")
            return False
        if key in known_texts:
            return False
        number_elem = question_elem.find('question_number')
        if number_elem is None:
            number_elem = ET.SubElement(question_elem, 'question_number')
        number_elem.text = str(next_question_number)
        next_question_number += 1
        known_texts.add(key)
        return True

    xml_names = sorted(
        name for name in os.listdir(directory) if name.endswith('.xml')
    )

    for name in xml_names:
        file_path = os.path.join(directory, name)
        new_root = ET.parse(file_path).getroot()

        # Standalone questions go straight under the bank root.
        for new_question in new_root.findall('question'):
            if claim(new_question, file_path):
                bank_root.append(new_question)

        # Grouped questions: decide which questions are new first, so that a
        # group contributing nothing does not leave an empty shell in the bank.
        for new_group in new_root.findall('question_group'):
            fresh_questions = [
                question for question in new_group.findall('question')
                if claim(question, file_path)
            ]
            if not fresh_questions:
                continue

            new_bank_group = ET.SubElement(
                bank_root, 'question_group', dict(new_group.attrib)
            )
            # Carry over the group's own data (its text, and any other
            # non-question children) ahead of the accepted questions.
            for child in new_group:
                if child.tag != 'question':
                    new_bank_group.append(child)
            new_bank_group.extend(fresh_questions)

    # Same on-disk format as remove_question_number_attribute().
    bank_tree.write(bank_file, xml_declaration=True, encoding='utf-8')

# Example usage: merge the .xml files found in the default 'new_files'
# directory into bank.xml; bank.xml is rewritten in place.
bank_file_path = 'bank.xml'
add_questions_from_directory(bank_file_path)


## Stats: Parse new files and print frequency stats.

**Go through new_files and print simple stats, as needed, to spot anything that looks off and needs to be fixed.**

In [None]:
import os
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict

def parse_mcqs(directory):
    """Print per-file and overall frequency stats for the MCQs in *directory*.

    Every file in *directory* whose name ends in ``.xml`` is parsed, in sorted
    filename order. For each ``<question>`` element at any depth, the function
    records the stripped text of its ``<topic>``, ``<answer>``,
    ``<difficulty>``, ``<author>`` and ``<year>`` children. A child that is
    absent is ignored. A child that is present but empty is recorded as the
    empty string ``''`` so that it shows up in the report instead of raising.

    Files that contain no ``<question>`` element do not appear in the
    per-file report.

    Args:
        directory: Directory containing the XML files to inspect.

    Returns:
        None. The report is written to standard output.
    """
    # Tags that are summarised as value -> frequency distributions.
    counted_tags = ('answer', 'difficulty', 'author', 'year')

    def empty_bucket():
        """Create a fresh accumulator for one file (or for the global totals)."""
        bucket = {'questions': 0, 'topics': set()}
        for tag in counted_tags:
            bucket[tag] = Counter()
        return bucket

    def child_text(question, tag):
        """Return the stripped text of child *tag*, or None if it is absent."""
        child = question.find(tag)
        if child is None:
            return None
        # An empty element has text None; report it as '' rather than crash.
        return (child.text or '').strip()

    overall = empty_bucket()
    per_file = defaultdict(empty_bucket)

    for filename in sorted(os.listdir(directory)):
        if not filename.endswith(".xml"):  # only XML files are inspected
            continue
        root = ET.parse(os.path.join(directory, filename)).getroot()

        # './/question' finds questions regardless of their parent element.
        for question in root.findall('.//question'):
            topic = child_text(question, 'topic')
            values = {tag: child_text(question, tag) for tag in counted_tags}

            # Update the file's bucket and the global bucket identically.
            for bucket in (per_file[filename], overall):
                bucket['questions'] += 1
                if topic is not None:
                    bucket['topics'].add(topic)
                for tag, value in values.items():
                    if value is not None:
                        bucket[tag][value] += 1

    # Per-file report; topics are sorted so the output is reproducible.
    for file, data in per_file.items():
        print(f"Stats for {file}:")
        print(f"  Number of Questions: {data['questions']}")
        print(f"  Topics: {', '.join(sorted(data['topics']))}")
        print(f"  Answer Distribution: {dict(data['answer'])}")
        print(f"  Difficulty Distribution: {dict(data['difficulty'])}")
        print(f"  Author Distribution: {dict(data['author'])}")
        print(f"  Year Distribution: {dict(data['year'])}")
        print("")

    # Aggregated report across all files.
    print("Global Stats:")
    print(f"  Total Number of Questions: {overall['questions']}")
    print(f"  Global Topics: {', '.join(sorted(overall['topics']))}")
    print(f"  Global Answer Distribution: {dict(overall['answer'])}")
    print(f"  Global Difficulty Distribution: {dict(overall['difficulty'])}")
    print(f"  Global Author Distribution: {dict(overall['author'])}")
    print(f"  Global Year Distribution: {dict(overall['year'])}")

# Example usage: print the stats for every .xml file in new_files/.
parse_mcqs('new_files')


## Remove Question Number attribute from question Element in bank.xml
**We use question_number tag. If there is an attribute then let us remove it.**

In [None]:
import xml.etree.ElementTree as ET

def remove_question_number_attribute(bank_file):
    """Strip the ``question_number`` attribute from every ``<question>``.

    Only the XML attribute is removed; child elements (including a
    ``<question_number>`` child) are left alone. The file at *bank_file* is
    rewritten in place as UTF-8 with an XML declaration.
    """
    tree = ET.parse(bank_file)

    for question in tree.getroot().iter('question'):
        # pop() with a default is a no-op when the attribute is absent.
        question.attrib.pop('question_number', None)

    tree.write(bank_file, encoding='utf-8', xml_declaration=True)

# Example usage: drop the question_number attribute from every question in
# bank.xml; the file is rewritten in place.
bank_file_path = 'bank.xml'
remove_question_number_attribute(bank_file_path)


## Stats: Print questions of each topic
**Frequency of questions of each topic in bank.xml**

*When asking people to create new questions, send them this list of topics.*

In [83]:
import xml.etree.ElementTree as ET

def count_questions_by_topic(bank_file):
    """Count the questions in *bank_file* per topic.

    A question's topic is the stripped text of its own ``<topic>`` child. If
    that child is missing or empty, the ``<topic>`` of the question's parent
    element (e.g. an enclosing group) is used instead. Questions for which
    neither yields a topic are not counted.

    Args:
        bank_file: Path of the bank XML file to read.

    Returns:
        A dict mapping topic text to the number of questions, in order of
        first appearance in the document.
    """
    bank_root = ET.parse(bank_file).getroot()

    # ElementTree elements hold no reference to their parent, and a '..' path
    # cannot climb above the element find() is called on, so build the
    # child -> parent lookup explicitly.
    parent_of = {
        child: parent for parent in bank_root.iter() for child in parent
    }

    def topic_of(elem):
        """Return the stripped ``<topic>`` text of *elem*, or None."""
        if elem is None:
            return None
        topic_elem = elem.find('topic')
        if topic_elem is None:
            return None
        return (topic_elem.text or '').strip() or None

    topic_counts = {}
    for question_elem in bank_root.findall('.//question'):
        topic = topic_of(question_elem) or topic_of(parent_of.get(question_elem))
        if topic is not None:
            topic_counts[topic] = topic_counts.get(topic, 0) + 1

    return topic_counts

# Example usage: print the topic -> question-count dict for bank.xml.
bank_file_path = 'bank.xml'
topic_counts = count_questions_by_topic(bank_file_path)
print(topic_counts)


{'Analytical': 101, 'Series': 13, 'Geometry': 17, 'Numbers': 13, 'Statistics': 4, 'Set Theory': 11, 'LA': 17, 'Counting': 8, 'Probability': 5, 'Maths': 31, 'Algebra': 17, 'Functions': 3, 'Calculus': 3, 'Trigonometry': 2}


## ADD TYPES BASED ON TOPIC
**We will use types to create the actual test. For the BS test, type 1 represents analytical reasoning questions and type 2 represents mathematical questions, although there is quite a bit of overlap.**

In [None]:
import xml.etree.ElementTree as ET

def normalize_topic(topic):
    """Return *topic* lowercased with every run of whitespace removed."""
    lowered = topic.lower()
    # split() with no argument splits on any whitespace and drops empty
    # pieces, so joining the pieces removes all whitespace.
    pieces = lowered.split()
    return ''.join(pieces)

def assign_question_types(bank_file):
    """Interactively add a ``<type>`` child to every untyped question.

    Questions that already have a ``<type>`` child, and questions whose
    ``<topic>`` is missing or empty, are left untouched. For the remaining
    questions the topic is normalised with ``normalize_topic``; the first time
    a normalised topic is seen, the user is asked (via ``input()``) whether it
    is type 1 or type 2, and that answer is reused for all later questions
    with the same topic.

    The prompt repeats until the answer, ignoring surrounding whitespace, is
    exactly ``1`` or ``2``, so a typo cannot silently file a topic under the
    wrong type.

    Args:
        bank_file: Path of the bank XML file. It is rewritten in place as
            UTF-8 with an XML declaration.
    """
    bank_tree = ET.parse(bank_file)
    bank_root = bank_tree.getroot()

    # Normalised topic -> type chosen by the user during this run.
    topic_types = {}

    for question_elem in bank_root.findall('.//question'):
        # Never overwrite an existing type.
        if question_elem.find('type') is not None:
            continue

        topic_elem = question_elem.find('topic')
        # An empty <topic/> has text None; there is nothing to classify.
        if topic_elem is None or not (topic_elem.text or '').strip():
            continue

        topic = normalize_topic(topic_elem.text)

        # Ask once per topic, and keep asking until the answer is valid.
        while topic not in topic_types:
            print(f"Enter the type for topic '{topic}' (1 for Type 1, 2 for Type 2):")
            type_input = input().strip()
            if type_input in ('1', '2'):
                topic_types[topic] = type_input
            else:
                print(f"'{type_input}' is not a valid type; please enter 1 or 2.")

        type_elem = ET.SubElement(question_elem, 'type')
        type_elem.text = topic_types[topic]

    # Same on-disk format as remove_question_number_attribute().
    bank_tree.write(bank_file, xml_declaration=True, encoding='utf-8')

# Example usage, when you're ready to test. This is interactive: it asks for a
# type via input() for each topic not seen yet, then rewrites bank.xml in place.
bank_file_path = 'bank.xml'
assign_question_types(bank_file_path)


## Stats: Print number of questions on each type in the bank.
**This should help guide the proportion of new questions of each type needed to keep bank.xml balanced as per test needs.**

In [None]:
import xml.etree.ElementTree as ET

def count_questions_by_type(bank_file):
    """Print how many questions in *bank_file* carry each ``<type>`` value.

    The tally starts with the keys '1', '2' and '3'. A question without a
    ``<type>`` child is counted under '3'; any other type text found is added
    as a new key the first time it is seen. One line per key is printed, in
    that insertion order.
    """
    root = ET.parse(bank_file).getroot()

    tally = {'1': 0, '2': 0, '3': 0}

    for question in root.findall('.//question'):
        type_node = question.find('type')
        # Untyped questions fall into the '3' bucket.
        key = '3' if type_node is None else type_node.text
        tally[key] = tally.get(key, 0) + 1

    for type_id in tally:
        print(f"Type {type_id}: {tally[type_id]} questions")

# Example usage: print the number of questions per type in bank.xml
# (questions without a <type> child are reported under type 3).
bank_file_path = 'bank.xml'
count_questions_by_type(bank_file_path)


## Replace topic_name with topic
**In some older questions, we had topic_name instead of topic - fixing that here.**

In [None]:
import xml.etree.ElementTree as ET

def update_topic_tags(bank_file):
    """Rename every ``<topic_name>`` element in *bank_file* to ``<topic>``.

    The elements are renamed in place, so each one keeps its position among
    its siblings as well as its text, attributes, child elements and trailing
    whitespace. Only descendants of the root are considered; the root element
    itself is never renamed.

    Args:
        bank_file: Path of the bank XML file. It is rewritten in place as
            UTF-8 with an XML declaration.
    """
    bank_tree = ET.parse(bank_file)
    bank_root = bank_tree.getroot()

    # findall() returns a list, so changing tags while looping is safe.
    for legacy_elem in bank_root.findall('.//topic_name'):
        legacy_elem.tag = 'topic'

    # Same on-disk format as remove_question_number_attribute().
    bank_tree.write(bank_file, xml_declaration=True, encoding='utf-8')

# Example usage: replace topic_name elements with topic elements in bank.xml;
# the file is rewritten in place.
bank_file_path = 'bank.xml'
update_topic_tags(bank_file_path)


## Remove types: Dangerous and irreversible - USE WITH CARE
**This will remove the type tag of each question in bank.xml. Do not use it unless you know what you are doing. Essentially, if you feel that the original assignment is wrong, you will have to reassign a type to each topic using the script above.**

In [84]:
import xml.etree.ElementTree as ET

# Function to prompt the user for confirmation
def ask_for_confirmation():
    """Ask the user to confirm; return True only for a case-insensitive 'yes'."""
    reply = input(
        "This code will irreversibly change the bank. Do you really want to run it? Type 'yes' to confirm: "
    )
    confirmed = reply.lower() == 'yes'
    return confirmed
    
def remove_type_tags(bank_file):
    """Remove the ``<type>`` child from every question, after confirmation.

    The user is asked to confirm through ``ask_for_confirmation``. Without
    confirmation a cancellation message is printed and the file is not read or
    written. Otherwise the first ``<type>`` child of each ``<question>`` (at
    any depth) is removed and *bank_file* is rewritten in place.
    """
    # Guard clause: bail out before touching the file.
    if not ask_for_confirmation():
        print("Operation canceled by the user.")
        return

    tree = ET.parse(bank_file)

    for question in tree.getroot().findall('.//question'):
        type_node = question.find('type')
        if type_node is not None:
            question.remove(type_node)

    tree.write(bank_file)

# Example usage: asks for confirmation first; only an answer of 'yes' lets
# the type tags be removed from bank.xml.
bank_file_path = 'bank.xml'
remove_type_tags(bank_file_path)


This code will irreversibly change the bank. Do you really want to run it? Type 'yes' to confirm:  no


Operation canceled by the user.
