# Extract questions for inter-annotator agreement

---
* Load the data set: contains token_count, Conceptual Question Type, Functional Question Type, and question_text

* Group by token count: Divide token_count into 3 groups: 5-10, 11-15, and 16-20.

* Extract questions for Conceptual Question Type:
  * For each available question type in Conceptual Question Type: Extract 3 random questions from each token group (if available).

* Extract Questions for Functional Question Type:
  * For each available question type in Functional Question Type: Extract 3 random questions from each token group (if available).

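* Ensure no overlap with the previously extracted questions: questions already selected for Conceptual Question Type (matched on question_text) are excluded before sampling for Functional Question Type.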

* Save the extracted questions to a new CSV.

---

In [None]:
import pandas as pd
import random

# Load the dataset
file_path = "updated_representative_sample.csv"
# Read the CSV into the DataFrame that every later step operates on;
# without this, `data` is undefined when the token_group column is added.
data = pd.read_csv(file_path)

# Define token count groups
def assign_token_group(token_count):
    """Return the label of the token-count bucket containing ``token_count``.

    Buckets are inclusive on both ends: "5-10", "11-15" and "16-20".
    A value that falls in none of them yields ``None``.
    """
    buckets = (
        (5, 10, "5-10"),
        (11, 15, "11-15"),
        (16, 20, "16-20"),
    )
    for lower, upper, label in buckets:
        if lower <= token_count <= upper:
            return label
    return None

# Label every row with the token-count bucket it falls into
token_counts = data['token_count']
data['token_group'] = token_counts.apply(assign_token_group)

# Function to sample questions
def sample_questions(data, group_column, question_type_column, sample_size=3):
    """Sample up to ``sample_size`` rows per (question type, group) pair.

    Args:
        data: DataFrame holding the candidate questions.
        group_column: Name of the column holding the bucket label
            (e.g. ``'token_group'``).
        question_type_column: Name of the column holding the question type.
        sample_size: Maximum number of rows drawn from each combination;
            smaller groups contribute all of their rows.

    Returns:
        A DataFrame with the sampled rows and a fresh 0..n-1 index. When no
        combination exists (empty input, or every row has a missing key)
        an empty DataFrame with the same columns as ``data`` is returned.
    """
    # groupby drops rows whose key is missing (None/NaN) by default, so rows
    # without a valid group or question type never form a group.
    grouped_data = data.groupby([question_type_column, group_column])

    # A fixed seed keeps the extraction reproducible between runs.
    sampled_questions = [
        group.sample(n=min(sample_size, len(group)), random_state=42)
        for _, group in grouped_data
    ]

    # pd.concat raises ValueError on an empty list, so handle that explicitly.
    if not sampled_questions:
        return data.iloc[0:0].reset_index(drop=True)

    return pd.concat(sampled_questions).reset_index(drop=True)

# Draw the sample for "Conceptual Question Type" first
conceptual_questions = sample_questions(
    data, 'token_group', 'Conceptual Question Type'
)

# Exclude every question already picked so the two samples stay disjoint
already_selected = data['question_text'].isin(
    conceptual_questions['question_text']
)
remaining_data = data[~already_selected]

# Draw the sample for "Functional Question Type" from what is left
functional_questions = sample_questions(
    remaining_data, 'token_group', 'Functional Question Type'
)

# Stack both samples into a single table
extracted_questions = pd.concat([conceptual_questions, functional_questions])

# Write the result to a new CSV (row index omitted)
output_file = "extracted_questions_for_IAA.csv"
extracted_questions.to_csv(output_file, index=False)

print(f"Extracted questions saved to '{output_file}'.")
