In [None]:
%pip install vllm
%pip install accelerate

In [None]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List
import torch
import re
from tqdm import tqdm
import os
import vllm
import re
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
torch.cuda.empty_cache()

## Model Loading 

## Prepare Dataframe

## Prompts Templates

#

In [None]:
PROMPT_subject= """"Question: {Question}.
Problem type: {Problem_type}.
Subject List：
{Subject_types}

Your task: You are a genius mathematician. Identify the subject from the subject list. Give concisely your final subjectID and subjectName for the question from the list inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag.
"""

PROMPT_misconceptions= """Question: {Question}.
Problem type: {Problem_type}.
Subject: {Subject}.
Correct answer: {Answer}.
Correct answer reasoning: {Correct_reasoning}.
Misconceptions list: format<MisconceptionId, MisconceptioName>
{Misconception_list}

Your task: You are a genius mathematician. Identify a possible misconception from the misconceptions list appliable to the reasoning of of the correct answer. Answer concisely the misconceptionId and MisconceptionName from the list inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step concisely in 1-3 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag.
If there is nothing related, answer <response>no misconception</response>.
"""


PROMPT_misconception_reasoning= """Question: {Question}.
Subject: {Subject}.
Correct answer reasoning: {Correct_reasoning}.
Correct answer: {Answer}.
Misconception: {Misconception}.

Your task: You are a student who applied the misconception. Start from the correct answer reasoning, apply the misconception the answer in the correct position. Answer concisel the wrong numerical solution inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step by concisely and numerically in 1-5 steps where to apply the misconception and apply it inside <thinking>$$INSERT TEXT HERE$$</thinking> tag.

Example of your task. Derive Wrong answer reasoning and wrong answer from correct answer.
Question: \[3 \times 2+4-5\] Where do the brackets need to go to make the answer equal \( 13 \) ?
Subject: 33 BIDMAS
Correct answer reasoning: Put brackets around 2+4, and give precedence to this operation. Sum 2+4 =6. \[3 \times 6-5\]. Multiply 3 and 6.\[18-5\]=13.
Correct answer:\( 3 \times(2+4)-5 \)
Misconception:1672 Confuses the order of operations, believes addition comes before multiplication 
Wrong answer reasoning: Sum directly 2+4. \[3 \times 6-5\]. Multiply 3 and 6.\[18-5\]=13.
Wrong answer: Does not need brackets.
"""
PROMPT_correct_answer_concise = """Question: {Question}.
Correct answer: {Correct_reasoning}.

Your task: You are a genius mathematician. Answer concisely the solution taken from the correct answer inside <response>$$INSERT TEXT HERE$$</response>.
"""


In [2]:
# Read Math dataset
math_dataset = pd.read_parquet("/flash2/aml/chenjiah24_wangwd24_lad24/dataset/math.parquet",engine="pyarrow")

# Extract all unique types
unique_types = math_dataset['train'].apply(lambda x: x['type']).unique()

# Filter dataset with 200 samples per type
random_sampled_data = pd.concat([
    math_dataset[math_dataset['train'].apply(lambda x: x['type']) == t].sample(
        n=min(200, len(math_dataset[math_dataset['train'].apply(lambda x: x['type']) == t])),
        random_state=42
    )
    for t in unique_types
])
math_dataset=random_sampled_data

# Model 
MODEL_AWQ_PATH = "/flash2/aml/chenjiah24_wangwd24_lad24/Qwen2.5-32B-Instruct-AWQ"
try:
    # Load the tokenizer
    if 'tokenizer' not in globals():
        tokenizer = AutoTokenizer.from_pretrained(MODEL_AWQ_PATH)
        print("Tokenizer loaded.")
    else:
        print("Tokenizer already loaded.")
    # Load the model
    if 'model' not in globals():
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_AWQ_PATH,
            trust_remote_code=True,
            torch_dtype="auto" 
        )
        model.to(torch.device("cuda:0"))  # Move the model to GPU
        print("Model loaded and moved to GPU.")
    else:
        print("Model already loaded.")
    # Initialize vLLM
    if 'llm' not in globals():
        llm = vllm.LLM(
            model=MODEL_AWQ_PATH,
            quantization="awq",  # Use the AWQ quantization
            device=torch.device("cuda:0"),  # Explicitly specify GPU
            gpu_memory_utilization=0.95,
            trust_remote_code=True,
            dtype="half",  # Use FP16 for efficiency
            enforce_eager=True,
            max_model_len=5120,
            disable_log_stats=True,
        )
        print("vLLM loaded and using GPU.")
    else:
        print("vLLM already loaded.")
except Exception as e:
    print(f"Error during model initialization: {e}")

#Loading the train and
train_df = pd.read_csv("data/train.csv")
misconceptions_df = pd.read_csv("data/misconception_mapping.csv")

# List of misconception columns
misconception_columns = [
    "MisconceptionAId", "MisconceptionBId", "MisconceptionCId", "MisconceptionDId"
]

# Function to gather non-zero misconceptions for each row
def get_non_zero_misconceptions(row):
    misconceptions = []
    for col in misconception_columns:
        # Check if the value is not NaN
        if pd.notna(row[col]):
            misconceptions.append(row[col])
    return misconceptions

# Apply the function to each row to create a new column with the list of misconceptions
train_df["Misconceptions"] = train_df.apply(get_non_zero_misconceptions, axis=1)

# Group by SubjectId and aggregate the misconceptions into a unique list
subject_misconceptions = (
    train_df.groupby("SubjectId", group_keys=False)
    .apply(lambda x: {
        "SubjectName": x["SubjectName"].iloc[0],
        "Misconceptions": np.unique(np.concatenate(x["Misconceptions"].to_numpy()))
    })
    .to_dict()
)
subject_misconceptions_df = pd.DataFrame.from_dict(subject_misconceptions, orient="index")


expanded_misconceptions = (
    subject_misconceptions_df.explode("Misconceptions")
    .reset_index()
    .rename(columns={"index": "SubjectId", "Misconceptions": "MisconceptionId"})
)

# Ensure MisconceptionId is of the same type as in misconceptions_df
expanded_misconceptions["MisconceptionId"] = expanded_misconceptions["MisconceptionId"].astype(int).astype(str)
misconceptions_df["MisconceptionId"] = misconceptions_df["MisconceptionId"].astype(str)
# Merge with misconceptions_df to get Misconception Names
merged_misconceptions = pd.merge(
    expanded_misconceptions,
    misconceptions_df,
    on="MisconceptionId",
    how="inner"  # Use inner join to exclude unmapped MisconceptionIds
)

# Filter out rows where the misconception name is NaN (if any remain after the merge)
merged_misconceptions = merged_misconceptions[merged_misconceptions["MisconceptionName"].notna()]

# Group by SubjectId to create final lists of Misconception IDs and Names
possible_misconception_df = merged_misconceptions.groupby("SubjectId").apply(
    lambda group: {
        "SubjectName": group["SubjectName"].iloc[0],
        "Misconceptions": list(zip(group["MisconceptionId"], group["MisconceptionName"]))
    }
).to_dict()
possible_misconception_df = {str(key): value for key, value in possible_misconception_df.items()}
# Example usage

subject_mapping = pd.read_csv("data/mapping_output.csv", usecols=['Type', 'SubjectId', 'SubjectName'])

# Group by 'Type' and aggregate into lists of [SubjectId, SubjectName] pairs
grouped = (
    subject_mapping.groupby('Type')
    .apply(lambda x: [[row.SubjectId, row.SubjectName] for row in x.itertuples()])
    .reset_index(name='Subjects')
)

pd.set_option('display.max_colwidth', None)



# Phase 1: Problem Reconstruction
# Phase 2: Systematic Analysis
# Phase 3: Feedback Integration
#
PROMPT_RCOT1 = """Give the concrete prompt (problem) that can generate this answer.
The problem should contain all basic and necessary information and correspond to the answer.
    The problem can only ask for one result.
"""

def verification():
    pass

import re
pd.set_option('display.max_colwidth', None)

PROMPT_subject= """"Question: {Question}.
Problem type: {Problem_type}.
Subject List：
{Subject_types}

Your task: You are a genius mathematician. Identify the subject from the subject list. Give concisely your final subjectID and subjectName for the question from the list inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag.
"""

PROMPT_misconceptions= """Question: {Question}.
Problem type: {Problem_type}.
Subject: {Subject}.
Correct answer: {Answer}.
Correct answer reasoning: {Correct_reasoning}.
Misconceptions list: format<MisconceptionId, MisconceptioName>
{Misconception_list}

Your task: You are a genius mathematician. Identify a possible misconception from the misconceptions list appliable to the reasoning of of the correct answer. Answer concisely the misconceptionId and MisconceptionName from the list inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step concisely in 1-3 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag.
If there is nothing related, answer <response>no misconception</response>.
"""


PROMPT_misconception_reasoning= """Question: {Question}.
Subject: {Subject}.
Correct answer reasoning: {Correct_reasoning}.
Correct answer: {Answer}.
Misconception: {Misconception}.

Your task: You are a student who applied the misconception. Start from the correct answer reasoning, apply the misconception the answer in the correct position. Answer concisel the wrong numerical solution inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step by concisely and numerically in 1-5 steps where to apply the misconception and apply it inside <thinking>$$INSERT TEXT HERE$$</thinking> tag.

Example of your task. Derive Wrong answer reasoning and wrong answer from correct answer.
Question: \[3 \times 2+4-5\] Where do the brackets need to go to make the answer equal \( 13 \) ?
Subject: 33 BIDMAS
Correct answer reasoning: Put brackets around 2+4, and give precedence to this operation. Sum 2+4 =6. \[3 \times 6-5\]. Multiply 3 and 6.\[18-5\]=13.
Correct answer:\( 3 \times(2+4)-5 \)
Misconception:1672 Confuses the order of operations, believes addition comes before multiplication 
Wrong answer reasoning: Sum directly 2+4. \[3 \times 6-5\]. Multiply 3 and 6.\[18-5\]=13.
Wrong answer: Does not need brackets.
"""
PROMPT_correct_answer_concise = """Question: {Question}.
Correct answer: {Correct_reasoning}.

Your task: You are a genius mathematician. Answer concisely the solution taken from the correct answer inside <response>$$INSERT TEXT HERE$$</response>.
"""


def apply_template_subject(question,type, subjects, tokenizer):
    messages = [
        {
            "role": "user", 
            "content": 
                PROMPT_subject.format(
                    Question=question,
                    Problem_type=type,
                    Subject_types="\n".join(f"{item[0]}, {item[1]}" for item in subjects)
                )
        }
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return text


def apply_template_misconception(question, type, subject, answer, reasoning, misconceptions, forbidden, tokenizer):
    # Extract MisconceptionIds from the forbidden list
    forbidden_ids = {item[0] for item in forbidden}  # Set of forbidden MisconceptionIds
    
    # Filter misconceptions to exclude forbidden ones by their IDs
    filtered_misconceptions = [
        item for item in misconceptions if item[0] not in forbidden_ids
    ]

    # Format the message
    messages = [
        {
            "role": "user",
            "content": PROMPT_misconceptions.format(
                Question=question,
                Problem_type=type,
                Answer=answer,
                Subject=subject,
                Correct_reasoning=reasoning,
                Misconception_list="\n".join(f"{item[0]}, {item[1]}" for item in filtered_misconceptions)
            )
        }
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return text




def apply_template_misconception_reasoning(question,type,subject,answer,reasoning,misconception,tokenizer):
    messages = [
        {
            "role": "user", 
            "content": 
                PROMPT_misconception_reasoning.format(
                    Question=question,
                    Problem_type=type,
                    Answer = answer,
                    Subject=subject,
                    Correct_reasoning=reasoning,
                    Misconception = misconception
                )
        }
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return text

def apply_template_correct_answer_concise(question,reasoning,tokenizer):
    messages = [
        {
            "role": "user", 
            "content": 
                PROMPT_correct_answer_concise.format(
                    Question=question,
                    Correct_reasoning=reasoning,
                )
        }
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return text


# Phase 1: Problem Reconstruction
# Phase 2: Systematic Analysis
# Phase 3: Feedback Integration
#
PROMPT_RCOT1 = """Give the concrete prompt (problem) that can generate this answer.
The problem should contain all basic and necessary information and correspond to the answer.
    The problem can only ask for one result.
"""

def verification():
    pass

columns = ['Question', 'SubjectId', 'SubjectName','CorrectAnswer', 'Solution', 'Misconception']
new_dataset_df = pd.DataFrame(columns=columns)
checkpoint=0
for index, row in tqdm(math_dataset.iterrows(), total=len(math_dataset), desc="Processing questions", unit="question"):
    """"From their subject-> subject of EEDI -> misconceptions then take 3 misconception that can be added 
    to the correct answer, create the reasoning behind the misconception, then use RCOT to validate 
    the reasoning behind the misconception, if validate then okay add it to the dataset
    otherwise try again for another misconception
    """
    try:
        checkpoint+=1
        if checkpoint %100==0:
            print("ADDING")
            new_dataset_df.to_csv(f'output2/new_dataset_{index}.csv', index=False)

        question = row['train']['problem'] 
        problem_type = row['train']['type']
        possible_subjects = grouped[grouped['Type'] == problem_type]['Subjects'].values[0]
        subject_message = apply_template_subject(question,problem_type,possible_subjects,tokenizer)

        responses = llm.generate(
            [subject_message],
            vllm.SamplingParams(
                n=1,                     # Generate one output sequence per prompt
                top_k=1,                 # Deterministic output
                temperature=0,           # No randomness
                seed=777,                # Reproducibility
                skip_special_tokens=False,
                max_tokens=512,           # Adjust max tokens as needed],
            ),
        )
        # Extract the subject
        response_text = responses[0].outputs[0].text
        #print(response_text)
        #Extract the response content
        match = re.search(r"<response>(.*?)</response>", response_text)
        if match:
            subject_id = match.group(1).strip('$').split(",", 2)
            if "no subject" in subject_id[0]:
                continue
        else:
            continue 
        #check subject exists
        try:
        #print(subject_id)
            possible_misconceptions=possible_misconception_df[subject_id[0]]['Misconceptions']
        except:
            print("problem")
            continue
        #create with llm a concise answer for the correct answer
        correct_answer=llm.generate(
            [apply_template_correct_answer_concise(question,row['train']['solution'],tokenizer)],
            vllm.SamplingParams(
                n=1,                     # Generate one output sequence per prompt
                top_k=1,                 # Deterministic output
                temperature=0,           # No randomness
                seed=777,                # Reproducibility
                skip_special_tokens=False,
                max_tokens=64,           # Adjust max tokens as needed],
            ),
        )
        correct_answer = correct_answer[0].outputs[0].text
        match = re.search(r"<response>(.*?)</response>", correct_answer)
        if match:
            correct_answer = match.group(1).strip('$')
        else:
            continue
        generated_misconceptions=3
        misconceptions_set=[]
        used_misconception_list=[]
        while generated_misconceptions>0:
            correct_reasoning = row['train']['solution'] 
            misconception_message = apply_template_misconception(question,problem_type, subject_id[0], correct_answer, correct_reasoning, possible_misconceptions,used_misconception_list,tokenizer )
            responses = llm.generate(
                [misconception_message],
                vllm.SamplingParams(
                    n=1,                     # Generate one output sequence per prompt
                    top_k=1,                 # Deterministic output
                    temperature=0,           # No randomness
                    seed=777,                # Reproducibility
                    skip_special_tokens=False,
                    max_tokens=2056,           # Adjust max tokens as needed
                ),
            )
            # Extract the misconceptions
            response_text = responses[0].outputs[0].text
            #print(response_text)
            try:
                misconception = re.search(r"<response>(.*?)</response>", response_text).group(1).strip('$').split(",",1)
            except:
                break
            if 'no misconception' in misconception[0]:
                break
            #print(misconception_id)
            used_misconception_list.append(misconception)
            wrong_reasoning_message = apply_template_misconception_reasoning(question,problem_type, subject_id[0], correct_answer, correct_reasoning, misconception,tokenizer)
            #print(wrong_reasoning_message)

            # Extract the misconceptions reasoning and answer
            response_text = responses[0].outputs[0].text
            response_text = re.sub(r"\s+", " ", response_text) 
            #print(response_text)
            try:
                misconception_reasoning =  re.search(r"<thinking>(.*?)</thinking>", response_text).group(1).strip('$')
                misconception_answer = re.search(r"<response>(.*?)</response>", response_text).group(1).strip('$')
            except:
                break
            #print(misconception_answer)
            used_misconception_list.append(misconception)
            misconceptions_set.append([misconception[0],misconception[1],misconception_answer,misconception_reasoning])
            generated_misconceptions-=1
            print("generated")
        if len(misconceptions_set)>0:
            new_row = [question, subject_id[0],subject_id[1],correct_answer,row['train']['solution'],misconceptions_set]
            new_dataset_df.loc[len(new_dataset_df)] = new_row
    except Exception as e:
        print(f"Error {e}")
        continue

    print(f"Done {checkpoint}")

# Save to CSV
new_dataset_df.to_csv('output2/new_dataset.csv', index=False)


Tokenizer loaded.


  .apply(lambda x: {
  possible_misconception_df = merged_misconceptions.groupby("SubjectId").apply(
  .apply(lambda x: [[row.SubjectId, row.SubjectName] for row in x.itertuples()])


Error during model initialization: Loading an AWQ quantized model requires accelerate (`pip install accelerate`)


Processing questions:  22%|██▏       | 308/1400 [00:00<00:00, 3072.34question/s]

Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error na

Processing questions:  48%|████▊     | 677/1400 [00:00<00:00, 3434.61question/s]

Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error na

Processing questions:  75%|███████▌  | 1050/1400 [00:00<00:00, 3566.64question/s]

Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error na

Processing questions: 100%|██████████| 1400/1400 [00:00<00:00, 3551.29question/s]

Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error name 'llm' is not defined
Error na




In [5]:
import pandas as pd
pd.set_option('display.max_rows', None)  # Display all rows
pd.set_option('display.max_columns', None)  # Display all columns
pd.set_option('display.expand_frame_repr', False)  # Avoid line wrapping

# Load the data
constructed_data = pd.read_csv("output/processed_dataset_with_construct.csv")

# Add a base QuestionId column
constructed_data['QuestionId'] = range(1, len(constructed_data) + 1)

# Melt the Misconception columns into rows
melted_data = constructed_data.melt(
    id_vars=['QuestionId', 'Question', 'SubjectId', 'CorrectAnswer', 'Solution', 'construct'],
    value_vars=['Misconception_part_1', 'Misconception_part_2', 'Misconception_part_3'],
    var_name='MisconceptionPart',
    value_name='Misconception'
)

# Drop rows where 'Misconception' is NaN
melted_data = melted_data.dropna(subset=['Misconception'])

# Map numeric parts to letters
suffix_mapping = {'1': 'A', '2': 'B', '3': 'C'}
melted_data['QuestionId'] = (
    melted_data['QuestionId'].astype(str) +
    "_" +
    melted_data['MisconceptionPart'].str.extract(r'(\d+)')[0].map(suffix_mapping)
)

# Drop the MisconceptionPart column (optional)
melted_data = melted_data.drop(columns=['MisconceptionPart'])

# Optionally reset the index
melted_data.reset_index(drop=True, inplace=True)

# Save the result to a new CSV or review the result
melted_data.to_csv("output/separated_misconceptions_with_letter_suffix.csv", index=False)




In [6]:
import pandas as pd
import ast

# Load the data
constructed_data = pd.read_csv("output/processed_dataset_with_construct.csv")

# Add a base QuestionId column
constructed_data['QuestionId'] = range(4000, 4000+len(constructed_data) )

# Drop rows where 'construct' contains "INSERT TEXT HERE"
constructed_data = constructed_data[constructed_data['construct'] != "INSERT TEXT HERE"]

# Melt the Misconception columns into rows
melted_data = constructed_data.melt(
    id_vars=['QuestionId', 'Question', 'SubjectId', 'CorrectAnswer', 'Solution', 'construct'],
    value_vars=['Misconception_part_1', 'Misconception_part_2', 'Misconception_part_3'],
    var_name='MisconceptionPart',
    value_name='Misconception'
)

# Drop rows where 'Misconception' is NaN
melted_data = melted_data.dropna(subset=['Misconception'])

# Check data types in Misconception
print("Data types in 'Misconception':")
print(melted_data["Misconception"].apply(type).value_counts())

# Convert strings to lists if necessary
melted_data["Misconception"] = melted_data["Misconception"].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Check lengths of lists in Misconception
melted_data["MisconceptionLength"] = melted_data["Misconception"].apply(
    lambda x: len(x) if isinstance(x, list) else None
)
print("Distribution of list lengths in 'Misconception':")
print(melted_data["MisconceptionLength"].value_counts())

# Filter rows where 'Misconception' is a list with exactly 4 elements
melted_data = melted_data[
    melted_data["Misconception"].apply(lambda x: isinstance(x, list) and len(x) == 4)
]

# Continue processing as before
suffix_mapping = {'1': 'A', '2': 'B', '3': 'C'}
melted_data['QuestionId'] = (
    melted_data['QuestionId'].astype(str) +
    "_" +
    melted_data['MisconceptionPart'].str.extract(r'(\d+)')[0].map(suffix_mapping)
)

# Drop the MisconceptionPart column
melted_data = melted_data.drop(columns=['MisconceptionPart', 'MisconceptionLength'])

# Expand the Misconception column
expanded_misconceptions = pd.DataFrame(
    melted_data["Misconception"].tolist(),
    columns=['MisconceptionId', 'MisconceptionName', 'MisconceptionAnswer', 'LLMreasoning']
)

# Combine the expanded columns with the rest of the data
melted_data = pd.concat([melted_data.drop(columns=['Misconception']), expanded_misconceptions], axis=1)

# Reset the index
melted_data.reset_index(drop=True, inplace=True)

# Save the result
melted_data.to_csv("output/expanded_misconceptions_filtered.csv", index=False)
print(melted_data.head())


Data types in 'Misconception':
Misconception
<class 'str'>    963
Name: count, dtype: int64
Distribution of list lengths in 'Misconception':
MisconceptionLength
4    963
Name: count, dtype: int64
  QuestionId                                                                                             Question  SubjectId                                                        CorrectAnswer                                                                                                                                                                                                                                             Solution                                                                                                   construct MisconceptionId                                                                          MisconceptionName                                                                    MisconceptionAnswer                                                                

In [7]:
import pandas as pd


subject = pd.read_csv("data/train.csv")
subject_dict = subject.set_index(subject['SubjectId'].astype(str))['SubjectName'].astype(str).to_dict()

print(subject_dict)

synthetics_dataset = pd.read_csv("/home/chenjiah24/output/math_dataset_newnumber.csv")
# Drop rows with NaN in the SubjectId and MisconceptionId columns
synthetics_dataset = synthetics_dataset[
    synthetics_dataset["SubjectId"].notnull() & synthetics_dataset["MisconceptionId"].notnull()
]

# Convert SubjectId to integer, then string
synthetics_dataset["SubjectId"] = synthetics_dataset["SubjectId"].astype(int).astype(str)

# Convert MisconceptionId to integer, then string
synthetics_dataset["MisconceptionId"] = synthetics_dataset["MisconceptionId"].astype(int).astype(str)

# Map SubjectName column based on the dictionary
synthetics_dataset.insert(
    synthetics_dataset.columns.get_loc("SubjectId") + 1,  # Position after SubjectId
    "SubjectName",
    synthetics_dataset["SubjectId"].map(subject_dict)
)

print(synthetics_dataset.head())
synthetics_dataset.to_csv("output/dataset_with_subjectname_number.csv",index=False)

{'33': 'BIDMAS', '1077': 'Simplifying Algebraic Fractions', '339': 'Range and Interquartile Range from a List of Data', '88': 'Properties of Quadrilaterals', '67': 'Substitution into Formula', '75': 'Area of Simple Shapes', '238': 'Converting between Fractions and Percentages', '224': 'Multiplying and Dividing with Decimals', '230': 'Adding and Subtracting Fractions', '164': 'Transformations of functions in the form f(x)', '335': 'Expanding Triple Brackets and more', '85': 'Nets', '182': 'Angle Facts with Parallel Lines', '209': 'Time', '69': 'Trial and Improvement and Iterative Methods', '241': 'Sharing in a Ratio', '210': 'Ordering Negative Numbers', '211': 'Adding and Subtracting Negative Numbers', '1078': 'Adding and Subtracting Algebraic Fractions', '189': 'Volume of Prisms', '53': 'Factorising into a Double Bracket', '203': 'Mental Addition and Subtraction', '58': 'Real Life Graphs', '204': 'Mental Multiplication and Division', '159': 'Equation of a Circle', '200': 'Counting', '1