# LTL Tutor Response Pre-processing

This notebook pre-processes `ltl-tutor-responses.csv` to:
1. Parse `question_options` JSON and expand each option into its own row
2. Extract a **seed formula** per question (the ground-truth LTL formula, regardless of question type)
3. Normalize seed formulas using `parse_ltl_string` from `ltlnode`
4. Add a **chosen option** column (correct answer or the misconception-matching distractor)

In [6]:
import sys, os
import json
import ast
import re
import pandas as pd

# Make src/ importable so we can use parse_ltl_string.
# The '..' is resolved by os.path.abspath against the kernel's current working
# directory, so this cell assumes the kernel was started from the notebook's folder.
SRC_DIR = os.path.abspath(os.path.join('..', 'src'))
# Guard keeps sys.path free of duplicates when the cell is re-run.
if SRC_DIR not in sys.path:
    sys.path.insert(0, SRC_DIR)

# Must come after the sys.path tweak above; ltlnode is found via SRC_DIR.
from ltlnode import parse_ltl_string

print('SRC_DIR:', SRC_DIR)
print('parse_ltl_string imported OK')


SRC_DIR: /Users/siddharthaprasad/Desktop/LTLTutor/src
parse_ltl_string imported OK


## 1. Load raw data

In [7]:
# Raw response export; the relative path is resolved against the working directory.
RAW_CSV = os.path.join('ltl-tutor-responses.csv')
df_raw = pd.read_csv(RAW_CSV)

# Quick shape/schema summary, followed by a rich preview of the first rows.
print(f"Loaded {df_raw.shape[0]:,} rows")
print("Columns:", df_raw.columns.tolist())
df_raw.head(3)

Loaded 5,786 rows
Columns: ['id', 'user_id', 'timestamp', 'misconception', 'question_text', 'question_options', 'correct_answer', 'question_type', 'mp_class', 'exercise', 'course']


Unnamed: 0,id,user_id,timestamp,misconception,question_text,question_options,correct_answer,question_type,mp_class,exercise,course
0,1,anon-user-FIE5Vl,2024-06-27 18:46:16.631438,MisconceptionCode.OtherImplicit,(q <-> t) \n q & ! t;cycle{t & !q;t & !q},"[{""value"": ""Yes"", ""misconceptions"": ""['Misconc...",False,trace_satisfaction_yn,guarantee safety,Exercise,
1,2,anon-user-FIE5Vl,2024-06-27 18:46:19.712717,MisconceptionCode.OtherImplicit,(G (q <-> (X t))),"[{""value"": ""! q & t;! q & ! t;t & !q;cycle{!t ...",False,trace_satisfaction_mc,safety,Exercise,
2,3,anon-user-xYuKsl,2024-06-27 23:39:53.86804,,"eventually, if 'b' holds, then globally, 'c' h...","[{""value"": ""(G (b -> (G c)))"", ""misconceptions...",True,english_to_ltl,persistence,Exercise contemplation-hawthorns,


In [3]:
# Responses per question type, a blank separator line, then right/wrong totals.
print(df_raw['question_type'].value_counts(), end='\n\n')
print("correct_answer distribution:", df_raw['correct_answer'].value_counts(), sep='\n')

question_type
english_to_ltl           2270
trace_satisfaction_mc    1807
trace_satisfaction_yn    1709
Name: count, dtype: int64

correct_answer distribution:
correct_answer
True     4467
False    1319
Name: count, dtype: int64


## 1.5 Remove questions from the robotrain exercises

Filter out rows where the exercise is `robotrain-entry` or `robotrain-exit`.

In [None]:
# Drop responses logged against either robotrain exercise, then renumber the index.
df_raw = (
    df_raw
    .loc[lambda frame: ~frame['exercise'].isin(['robotrain-entry', 'robotrain-exit'])]
    .reset_index(drop=True)
)
print(f"After filtering robotrain exercises: {df_raw.shape[0]:,} rows")

## 2. Parse `question_options` and expand into one row per option

In [8]:
def parse_options(raw_options):
    """Parse the question_options JSON string into a list of option dicts.

    Parameters
    ----------
    raw_options : str | bytes | Any
        The raw cell value. Anything that is not JSON text (NaN, None, ...)
        is treated as "no options".

    Returns
    -------
    list[dict]
        One dict per well-formed option, with keys
        'option_value' (always a str) and
        'option_misconceptions' (always a list of str).
        Returns [] when the input is missing, is not valid JSON, or does not
        decode to a JSON array.
    """
    # json.loads only accepts text; missing cells (NaN/None) carry no options.
    if not isinstance(raw_options, (str, bytes, bytearray)):
        return []
    try:
        options = json.loads(raw_options)
    except (ValueError, TypeError):
        # ValueError covers json.JSONDecodeError and undecodable bytes.
        return []

    # Valid JSON that is not an array (null, object, number, ...) has no
    # per-option entries to iterate over.
    if not isinstance(options, list):
        return []

    result = []
    for opt in options:
        if not isinstance(opt, dict):
            # Skip malformed entries instead of failing on .get().
            continue

        value = opt.get('value', '')
        # Callers call .strip() on the value, so always hand back a string.
        if value is None:
            value = ''
        elif not isinstance(value, str):
            value = str(value)

        misc_raw = opt.get('misconceptions', '[]')
        # misconceptions is stored as a Python-list string: "['MisconceptionCode.Foo']"
        if isinstance(misc_raw, str):
            try:
                misc_raw = ast.literal_eval(misc_raw)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                misc_raw = []

        if isinstance(misc_raw, str):
            # A single quoted code rather than a list: wrap it so it is not
            # later iterated character by character.
            misc_list = [misc_raw] if misc_raw.strip() else []
        elif isinstance(misc_raw, (list, tuple)):
            misc_list = [str(m) for m in misc_raw]
        else:
            misc_list = []

        result.append({'option_value': value, 'option_misconceptions': misc_list})
    return result


# Build one output record per (question, option) pair.
records = []
for _, question in df_raw.iterrows():
    base_fields = question.to_dict()
    parsed = parse_options(question['question_options'])
    if not parsed:
        # Options missing or unparseable: keep the question as a single placeholder row.
        parsed = [{'option_value': None, 'option_misconceptions': []}]
    records.extend(
        {
            **base_fields,
            'option_value': choice['option_value'],
            'option_misconceptions': choice['option_misconceptions'],
        }
        for choice in parsed
    )

df = pd.DataFrame(records).reset_index(drop=True)
print(f"Expanded to {len(df.index):,} rows (one per option)")
df.head(6)

Expanded to 19,894 rows (one per option)


Unnamed: 0,id,user_id,timestamp,misconception,question_text,question_options,correct_answer,question_type,mp_class,exercise,course,option_value,option_misconceptions
0,1,anon-user-FIE5Vl,2024-06-27 18:46:16.631438,MisconceptionCode.OtherImplicit,(q <-> t) \n q & ! t;cycle{t & !q;t & !q},"[{""value"": ""Yes"", ""misconceptions"": ""['Misconc...",False,trace_satisfaction_yn,guarantee safety,Exercise,,Yes,[MisconceptionCode.OtherImplicit]
1,1,anon-user-FIE5Vl,2024-06-27 18:46:16.631438,MisconceptionCode.OtherImplicit,(q <-> t) \n q & ! t;cycle{t & !q;t & !q},"[{""value"": ""Yes"", ""misconceptions"": ""['Misconc...",False,trace_satisfaction_yn,guarantee safety,Exercise,,No,[]
2,2,anon-user-FIE5Vl,2024-06-27 18:46:19.712717,MisconceptionCode.OtherImplicit,(G (q <-> (X t))),"[{""value"": ""! q & t;! q & ! t;t & !q;cycle{!t ...",False,trace_satisfaction_mc,safety,Exercise,,! q & t;! q & ! t;t & !q;cycle{!t & q;!t & q},[MisconceptionCode.BadStateQuantification]
3,2,anon-user-FIE5Vl,2024-06-27 18:46:19.712717,MisconceptionCode.OtherImplicit,(G (q <-> (X t))),"[{""value"": ""! q & t;! q & ! t;t & !q;cycle{!t ...",False,trace_satisfaction_mc,safety,Exercise,,q & ! t;q & ! t;cycle{q & t;q & ! t;q & t},[MisconceptionCode.OtherImplicit]
4,2,anon-user-FIE5Vl,2024-06-27 18:46:19.712717,MisconceptionCode.OtherImplicit,(G (q <-> (X t))),"[{""value"": ""! q & t;! q & ! t;t & !q;cycle{!t ...",False,trace_satisfaction_mc,safety,Exercise,,q & !t;! q & t;t & !q;cycle{t & !q;t & !q},[MisconceptionCode.ImplicitG]
5,2,anon-user-FIE5Vl,2024-06-27 18:46:19.712717,MisconceptionCode.OtherImplicit,(G (q <-> (X t))),"[{""value"": ""! q & t;! q & ! t;t & !q;cycle{!t ...",False,trace_satisfaction_mc,safety,Exercise,,q & t;cycle{q & t;q & t},[]


## 3. Extract seed formula per question

- **`english_to_ltl`**: seed = the option whose `misconceptions` list is empty (the correct LTL answer).
- **`trace_satisfaction_yn`**: `question_text` = `"<formula>\n<trace>"` — take the first line.
- **`trace_satisfaction_mc`**: `question_text` is the bare LTL formula.  
  As a fallback we also check whether any option value looks like an LTL formula (not a trace) — a value is treated as a trace if it contains `;` or `cycle{`.

In [9]:
def looks_like_trace(value: str) -> bool:
    """Tell whether `value` is a trace rather than a formula.

    A string counts as a trace when it contains a state separator (';')
    or a cycle marker ('cycle{').
    """
    trace_markers = (';', 'cycle{')
    return any(marker in value for marker in trace_markers)


def extract_seed_formula(row) -> str | None:
    """Extract the raw (un-normalised) seed formula for a question row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series)
        Must provide 'question_type', 'question_text' and 'question_options'.

    Returns
    -------
    str | None
        - english_to_ltl: the first option with no misconceptions attached.
        - trace_satisfaction_yn: the first line of question_text.
        - trace_satisfaction_mc: question_text itself, falling back to the
          first option value that does not look like a trace.
        None when no formula can be found or the question type is unknown.
    """
    qtype = row['question_type']

    # A missing question_text must not be stringified: str(nan) == 'nan' would
    # pass the trace check below and be returned as if it were a formula.
    raw_text = row['question_text']
    text = '' if pd.isna(raw_text) else str(raw_text).strip()

    if qtype == 'english_to_ltl':
        # Seed = the option with no misconceptions attached
        for opt in parse_options(row['question_options']):
            value = opt['option_value']
            # Only string values can be formulas; skip anything else.
            if not opt['option_misconceptions'] and isinstance(value, str):
                return value.strip()
        return None

    if qtype == 'trace_satisfaction_yn':
        # question_text holds the formula, a newline, then the trace;
        # after the strip above, the first line is the formula.
        first_line = text.split('\n')[0].strip()
        if first_line and not looks_like_trace(first_line):
            return first_line
        return None

    if qtype == 'trace_satisfaction_mc':
        # question_text is the LTL formula
        if text and not looks_like_trace(text):
            return text
        # Fallback: look for an option value that is an LTL formula (not a trace)
        for opt in parse_options(row['question_options']):
            value = opt['option_value']
            if not isinstance(value, str):
                continue
            candidate = value.strip()
            if candidate and not looks_like_trace(candidate):
                return candidate
        return None

    return None


# Seeds are a per-question property, so derive them on the un-expanded frame;
# they are joined onto the per-option rows later.
df_raw['seed_formula_raw'] = df_raw.apply(extract_seed_formula, axis=1)

print("Seed formula extraction results (null count per question type):")
print(
    df_raw['seed_formula_raw']
    .isna()
    .groupby(df_raw['question_type'])
    .sum()
)

Seed formula extraction results (null count per question type):
question_type
english_to_ltl           0
trace_satisfaction_mc    0
trace_satisfaction_yn    0
Name: seed_formula_raw, dtype: int64


## 4. Normalize seed formulas via `parse_ltl_string`

In [10]:
def normalize_ltl(formula_str: str) -> str | None:
    """Parse an LTL formula string through parse_ltl_string and return the
    canonical string representation. Returns None if parsing fails.
    """
    if not formula_str or pd.isna(formula_str):
        return None
    try:
        node = parse_ltl_string(formula_str.strip())
        return str(node)
    except Exception:
        return None


df_raw['seed_formula'] = df_raw['seed_formula_raw'].map(normalize_ltl)

# A failure is a row that had a raw seed but came back empty after normalisation.
fail_mask = df_raw['seed_formula'].isna() & df_raw['seed_formula_raw'].notna()
print(f"Formulas that failed normalization: {fail_mask.sum()}")
if fail_mask.sum() > 0:
    failed_preview = ['id', 'question_type', 'seed_formula_raw']
    print(df_raw.loc[fail_mask, failed_preview].head(10))


Formulas that failed normalization: 0


## 5. Compute `chosen_option`

For **`english_to_ltl`** and **`trace_satisfaction_mc`**:
- `correct_answer == True` → student chose the option with **no** misconceptions.
- `correct_answer == False` → student chose the distractor whose `option_misconceptions` list contains the recorded `misconception`; if no option matches, the first option tagged with any misconception is used as a fallback.

For **`trace_satisfaction_yn`** the options are just "Yes"/"No", and misconception tagging is asymmetric:
- The **correct** choice = the option with no misconceptions (fallback: "Yes" if both/neither are tagged).
- `correct_answer == True` → student chose the correct option.
- `correct_answer == False` → student chose the *other* option.


In [None]:
def extract_chosen_option(row) -> str | None:
    """Infer which option value the student picked for this response.

    Reads 'question_options', 'correct_answer', 'question_type' and
    'misconception' from `row`. Returns the stripped option value, or None
    when the options cannot be parsed or no option fits.
    """
    choices = parse_options(row['question_options'])
    if not choices:
        return None

    answered_correctly = row['correct_answer']

    if row['question_type'] == 'trace_satisfaction_yn':
        # Yes/No questions: misconception matching is unreliable because
        # neither option may be tagged. The right answer is the single
        # untagged option when there is exactly one.
        untagged = [c for c in choices if not c['option_misconceptions']]
        if len(untagged) == 1:
            right_value = untagged[0]['option_value'].strip()
        else:
            # Ambiguous tagging: assume "Yes" is right, else the first option.
            right_value = choices[0]['option_value'].strip()
            for c in choices:
                stripped = c['option_value'].strip()
                if stripped.lower() == 'yes':
                    right_value = stripped
                    break

        other_values = [
            c['option_value'].strip()
            for c in choices
            if c['option_value'].strip() != right_value
        ]
        if answered_correctly:
            return right_value
        return other_values[0] if other_values else None

    # english_to_ltl and trace_satisfaction_mc
    if answered_correctly:
        # The correct option is the first one carrying no misconception tags.
        return next(
            (c['option_value'].strip() for c in choices if not c['option_misconceptions']),
            None,
        )

    # Wrong answer: find the distractor tagged with the recorded misconception.
    recorded = row['misconception']
    target = '' if not pd.notna(recorded) else str(recorded).strip()
    for c in choices:
        if any(target == tag.strip() for tag in c['option_misconceptions']):
            return c['option_value'].strip()

    # No exact match: fall back to the first option that has any tag at all.
    return next(
        (c['option_value'].strip() for c in choices if c['option_misconceptions']),
        None,
    )


# One inferred choice per response, computed on the un-expanded frame.
df_raw['chosen_option'] = df_raw.apply(extract_chosen_option, axis=1)

print("Null chosen_option:", df_raw['chosen_option'].isnull().sum())
df_raw.loc[:, ['id', 'question_type', 'correct_answer', 'misconception', 'chosen_option']].head(10)


## 6. Merge derived columns back into the expanded (per-option) DataFrame

In [None]:
# The expanded df still has the original column values per row;
# merge seed_formula and chosen_option from df_raw by question id.
DERIVED_COLS = ['seed_formula_raw', 'seed_formula', 'chosen_option']
derived = df_raw[['id'] + DERIVED_COLS]

# Make the cell safe to re-run: if a previous run already attached the derived
# columns, drop them first. Otherwise the merge would emit *_x / *_y suffixed
# duplicates and the column selection below would raise a KeyError.
df = df.drop(columns=DERIVED_COLS, errors='ignore')

# validate='many_to_one' makes pandas raise if an id occurs more than once in
# df_raw, instead of silently multiplying the per-option rows.
df = df.merge(derived, on='id', how='left', validate='many_to_one')

# Clean up: drop the raw question_options column (already expanded)
# and reorder for clarity
COLS_ORDER = [
    'id', 'user_id', 'timestamp', 'question_type', 'mp_class', 'exercise', 'course',
    'question_text',
    'option_value', 'option_misconceptions',
    'seed_formula_raw', 'seed_formula',
    'chosen_option',
    'correct_answer', 'misconception',
]
df = df[COLS_ORDER]

print(f"Final DataFrame: {len(df):,} rows × {len(df.columns)} columns")
df.head(8)

## 7. Sanity checks

In [None]:
# Seed formulas are per-question, so collapse the per-option rows to one row per id.
questions = df.drop_duplicates('id')

# Share of questions, by type, that ended up with a normalised seed formula.
seed_coverage = (
    questions['seed_formula']
    .notna()
    .groupby(questions['question_type'])
    .mean()
    .rename('seed_formula_coverage')
)
print(seed_coverage)

print()

# Show a few raw/normalised seed pairs for each question type.
for qtype, grp in questions.groupby('question_type'):
    print(f"--- {qtype} ---")
    with_seed = grp.dropna(subset=['seed_formula'])
    sample = with_seed[['id', 'seed_formula_raw', 'seed_formula']].head(3)
    print(sample.to_string(index=False))
    print()

In [None]:
# Verify chosen_option makes sense.
# chosen_option is a per-question value repeated on every option row, so the
# null rate is measured on one row per id; measuring it on the expanded frame
# would weight each question by how many options it has.
chosen_per_question = df.drop_duplicates('id')['chosen_option']
print("chosen_option null rate:", round(float(chosen_per_question.isna().mean()), 3))

# For correct answers: chosen_option should not be a misconception option
correct_rows = df[df['correct_answer'] == True].drop_duplicates('id')
# option_misconceptions for chosen_option should be empty on the correct rows
# (check a few manually)
print("\nSample correct answers with chosen_option:")
print(correct_rows[['id', 'question_type', 'chosen_option']].head(5).to_string(index=False))

print("\nSample incorrect answers with chosen_option:")
wrong_rows = df[df['correct_answer'] == False].drop_duplicates('id')
print(wrong_rows[['id', 'question_type', 'misconception', 'chosen_option']].head(5).to_string(index=False))

## 8. Save processed data

In [None]:
# Write the per-option table next to the raw export, without the positional index column.
OUT_CSV = 'ltl-tutor-responses-processed.csv'
df.to_csv(path_or_buf=OUT_CSV, index=False)
print(f"Saved → {OUT_CSV}  ({df.shape[0]:,} rows)")