In [1]:
local = True
log = True
log_detail = False

# Imports

## Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import os
import sys
from tqdm import tqdm
sys.path.append(os.path.abspath('..'))
import importlib
import pickle

In [3]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

## Data

In [5]:
if local:
    misconceptions = pd.read_csv('../kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv', index_col='MisconceptionId')
    train = pd.read_csv('../kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv')
    test = pd.read_csv('../kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv')
else:
    misconceptions = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv', index_col='MisconceptionId')
    train = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv')
    test = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv')
if log: print("(1) Imported data")

(1) Imported data


In [6]:
#########################
# Create train_melted 
#########################

# Define the identifier columns
id_cols = [
    'QuestionId', 'ConstructId', 'ConstructName', 
    'SubjectId', 'SubjectName', 'CorrectAnswer', 'QuestionText'
]

# Define the corresponding Answer options
answer_cols = ['AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']
misconception_cols = ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']

# Melt Answer Text
text_melted = train.melt(
    id_vars=id_cols,
    value_vars=answer_cols,
    var_name='Attribute',
    value_name='AnswerText'
)

# Melt Misconception IDs
misconception_melted = train.melt(
    id_vars=id_cols,
    value_vars=misconception_cols,
    var_name='Attribute',
    value_name='MisconceptionId'
)

# Extract the option letter (A, B, C, D) and the attribute type
text_melted['AnswerOption'] = text_melted['Attribute'].str.extract(r'Answer([ABCD])Text')[0]
misconception_melted['AnswerOption'] = misconception_melted['Attribute'].str.extract(r'Misconception([ABCD])Id')[0]

# Drop the original 'Attribute' columns as they are no longer needed
text_melted.drop('Attribute', axis=1, inplace=True)
misconception_melted.drop('Attribute', axis=1, inplace=True)

# Merge the two melted DataFrames on id_vars and AnswerOption
train_melted = pd.merge(
    text_melted,
    misconception_melted,
    on=id_cols + ['AnswerOption'],
    how='left'
)

train_melted = train_melted.merge(misconceptions, left_on='MisconceptionId', right_index=True, how='left')
train_melted = train_melted[train_melted['CorrectAnswer'] != train_melted['AnswerOption']]

In [7]:
print(f"SubjectName nunique: {train_melted['SubjectName'].nunique()}")
print(f"ConstructName nunique: {train_melted['ConstructName'].nunique()}")
print(f"MisconceptionName nunique: {train_melted['MisconceptionName'].nunique()}")


SubjectName nunique: 163
ConstructName nunique: 757
MisconceptionName nunique: 1604


In [8]:
hierarchy_scm = {}
for subject in train_melted['SubjectName'].unique():
    subject_data = train_melted[train_melted['SubjectName'] == subject]
    constructs = subject_data['ConstructName'].dropna().unique()
    constructs_list = []
    for construct in constructs:
        misconceptions = subject_data[subject_data['ConstructName'] == construct]['MisconceptionName'].dropna().unique().tolist()
        constructs_list.extend(misconceptions)
    hierarchy_scm[subject] = constructs_list

# Classes

In [9]:
class Hierarchy:
    def __init__(self):
        self.parent_to_children = {}
        for subject, misconceptions in hierarchy_scm.items():
            self.parent_to_children[subject] = misconceptions
        self.child_to_parent = {child: parent for parent, children in self.parent_to_children.items() for child in children}
    
    def get_parent(self, child):
        return self.child_to_parent.get(child, None)
    
    def get_children(self, parent):
        return self.parent_to_children.get(parent, [])

In [10]:
class DataPreprocessor:
    def __init__(self, train_df, test_df, hierarchy):
        self.train_df = train_df
        self.test_df = test_df
        self.hierarchy = hierarchy
        self.label_encoders = {}
        self.child_label_encoders = {}
    
    def map_hierarchy(self):
        self.train_df['ParentCategory'] = self.train_df['MisconceptionName'].apply(self.hierarchy.get_parent)
        self.test_df['ParentCategory'] = None  # To be predicted later
    
    def encode_labels(self):
        # Encode parent categories
        le_parent = LabelEncoder()
        self.train_df['ParentCategoryEncoded'] = le_parent.fit_transform(self.train_df['ParentCategory'])
        self.label_encoders['parent'] = le_parent
        
        # Encode specific misconceptions per parent
        self.child_label_encoders = {}
        self.train_df['MisconceptionEncoded'] = -1  # Initialize with a placeholder
        
        for parent in self.hierarchy.parent_to_children.keys():
            misconception_subset = self.train_df[self.train_df['ParentCategory'] == parent]['MisconceptionName']
            le_child = LabelEncoder()
            encoded = le_child.fit_transform(misconception_subset)
            self.train_df.loc[self.train_df['ParentCategory'] == parent, 'MisconceptionEncoded'] = encoded
            self.child_label_encoders[parent] = le_child
        
    def preprocess(self):
        self.map_hierarchy()
        self.encode_labels()
        # Additional preprocessing steps like text cleaning can be added here
        return self.train_df, self.test_df

In [11]:
class HierarchicalClassifier:
    def __init__(self, parent_num_labels, child_num_labels, pretrained_model='bert-base-uncased'):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
        self.parent_model = BertForSequenceClassification.from_pretrained(pretrained_model, num_labels=parent_num_labels)
        self.child_models = nn.ModuleDict()
        self.pretrained_model = pretrained_model
    
    def add_child_model(self, parent_category, num_labels):
        # Convert parent category to valid dictionary key
        parent_key = str(parent_category).replace(" ", "_").replace("/", "_")
        if parent_key not in self.child_models:
            try:
                self.child_models[parent_key] = BertForSequenceClassification.from_pretrained(
                    self.pretrained_model, 
                    num_labels=num_labels
                )
                if log:
                    print(f"Created child model for {parent_category} with {num_labels} labels")
            except Exception as e:
                print(f"Error creating child model for {parent_category}: {str(e)}")
                raise
                
    def forward_parent(self, input_ids, attention_mask):
        return self.parent_model(input_ids=input_ids, attention_mask=attention_mask)
    
    def forward_child(self, parent_category, input_ids, attention_mask):
        child_model = self.child_models.get(parent_category, None)
        if child_model:
            return child_model(input_ids=input_ids, attention_mask=attention_mask)
        else:
            return None

In [12]:
class DistractorDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

def train_stage1(model, train_df, tokenizer, device, epochs=3):
    dataset = DistractorDataset(
        texts=train_df['AnswerText'].tolist(),
        labels=train_df['ParentCategoryEncoded'].tolist(),
        tokenizer=tokenizer
    )
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)
    optimizer = torch.optim.AdamW(model.parent_model.parameters(), lr=2e-5)
    model.parent_model.to(device)
    model.parent_model.train()
    
    for epoch in range(epochs):
        total_loss = 0.0
        total_correct = 0
        total_examples = 0
        
        for batch in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model.parent_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            total_correct += (preds == labels).sum().item()
            total_examples += labels.size(0)
            
            loss.backward()
            optimizer.step()
        
        avg_loss = total_loss / len(dataloader)
        accuracy = total_correct / total_examples
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

def train_stage2(model, train_df, tokenizer, hierarchy, device, epochs=3):
    for parent in tqdm(hierarchy.parent_to_children.keys()):
        # Convert parent name to valid dictionary key right away
        parent_key = str(parent).replace(" ", "_").replace("/", "_")
        
        child_subset = train_df[train_df['ParentCategory'] == parent]
        if child_subset.empty:
            if log: print(f"Skipping {parent} - no data")
            continue
            
        # Get number of unique misconceptions for this parent
        num_labels = len(child_subset['MisconceptionEncoded'].unique())
        
        # Initialize the child model if it doesn't exist
        try:
            model.add_child_model(parent, num_labels)
            if log: print(f"Successfully added child model for {parent}")
        except Exception as e:
            print(f"Failed to add child model for {parent}: {e}")
            continue
        
        try:
            dataset = DistractorDataset(
                texts=child_subset['AnswerText'].tolist(),
                labels=child_subset['MisconceptionEncoded'].tolist(),
                tokenizer=tokenizer
            )
            dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
            
            # Now we can safely access the child model
            child_model = model.child_models[parent_key].to(device)
            optimizer = torch.optim.AdamW(child_model.parameters(), lr=2e-5)
            child_model.train()
            
            for epoch in range(epochs):
                total_loss = 0
                for batch in dataloader:
                    optimizer.zero_grad()
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    
                    # Ensure labels are Long type
                    labels = labels.long()
                    
                    outputs = child_model(
                        input_ids=input_ids, 
                        attention_mask=attention_mask, 
                        labels=labels
                    )
                    
                    loss = outputs.loss
                    loss.backward()
                    optimizer.step()
                    total_loss += loss.item()
                    
                avg_loss = total_loss / len(dataloader)
                if log:
                    print(f"Parent: {parent}, Epoch: {epoch+1}/{epochs}, Average Loss: {avg_loss:.4f}")
        except Exception as e:
            print(f"Error training model for {parent}: {e}")
            continue

In [13]:
def predict(model, texts, hierarchy, tokenizer, label_encoders, device):
    dataset = DistractorDataset(
        texts=texts,
        labels=[0]*len(texts),  # Dummy labels
        tokenizer=tokenizer
    )
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)
    model.parent_model.to(device)
    model.parent_model.eval()
    
    parent_preds = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model.parent_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            parent_preds.extend(preds)
    
    parent_labels = label_encoders['parent'].inverse_transform(parent_preds)
    specific_preds = []
    
    for i, parent in enumerate(parent_labels):
        child_model = model.child_models.get(parent, None)
        if child_model is None:
            specific_preds.append(None)
            continue
        child_model.to(device)
        child_model.eval()
        with torch.no_grad():
            encoding = tokenizer.encode_plus(
                texts[i],
                add_special_tokens=True,
                max_length=128,
                padding='max_length',
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt'
            )
            input_ids = encoding['input_ids'].to(device)
            attention_mask = encoding['attention_mask'].to(device)
            outputs = child_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            pred = torch.argmax(logits, dim=1).cpu().numpy()[0]
            # Use the specific encoder for the parent
            specific_misconception = label_encoders['child_label_encoders'][parent].inverse_transform([pred])[0]
            specific_preds.append(specific_misconception)
    
    return specific_preds

# Training

In [14]:
hierarchy = Hierarchy()

In [15]:
# Define paths to your data files
train_path = '../kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv'
test_path = '../kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv'

# Initialize DataPreprocessor
preprocessor = DataPreprocessor(train_df=train_melted, test_df=test, hierarchy=hierarchy)
train_df, test_df = preprocessor.preprocess()

In [16]:
# Determine the number of parent labels
parent_num_labels = train_df['ParentCategoryEncoded'].nunique()

# Initialize HierarchicalClassifier
model = HierarchicalClassifier(parent_num_labels=parent_num_labels, child_num_labels=0)  # child_num_labels handled per parent



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

Using device: cpu


In [18]:
# Train Stage 1: Parent Category Classification
train_stage1(model, train_df, model.tokenizer, device)

Epoch 1/3: 100%|██████████| 176/176 [11:57<00:00,  4.08s/it]


Epoch 1/3 - Loss: 4.4837, Accuracy: 0.2056


Epoch 2/3: 100%|██████████| 176/176 [11:21<00:00,  3.87s/it]


Epoch 2/3 - Loss: 4.1692, Accuracy: 0.2204


Epoch 3/3: 100%|██████████| 176/176 [12:10<00:00,  4.15s/it]

Epoch 3/3 - Loss: 3.9742, Accuracy: 0.2256





: 

In [19]:
# Train Stage 2: Specific Misconception Classification
train_stage2(model, train_df, model.tokenizer, hierarchy, device)

  0%|          | 0/163 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Simplifying Algebraic Fractions with 8 labels
Successfully added child model for Simplifying Algebraic Fractions
Parent: Simplifying Algebraic Fractions, Epoch: 1/3, Average Loss: 2.1719
Parent: Simplifying Algebraic Fractions, Epoch: 2/3, Average Loss: 2.1498


  1%|          | 1/163 [00:07<19:43,  7.31s/it]

Parent: Simplifying Algebraic Fractions, Epoch: 3/3, Average Loss: 2.0282


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Range and Interquartile Range from a List of Data with 9 labels
Successfully added child model for Range and Interquartile Range from a List of Data
Parent: Range and Interquartile Range from a List of Data, Epoch: 1/3, Average Loss: 2.1838
Parent: Range and Interquartile Range from a List of Data, Epoch: 2/3, Average Loss: 2.0885


  1%|          | 2/163 [00:17<23:27,  8.74s/it]

Parent: Range and Interquartile Range from a List of Data, Epoch: 3/3, Average Loss: 1.9776


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Properties of Quadrilaterals with 12 labels
Successfully added child model for Properties of Quadrilaterals
Parent: Properties of Quadrilaterals, Epoch: 1/3, Average Loss: 2.5574
Parent: Properties of Quadrilaterals, Epoch: 2/3, Average Loss: 2.4912


  2%|▏         | 3/163 [00:27<25:35,  9.60s/it]

Parent: Properties of Quadrilaterals, Epoch: 3/3, Average Loss: 2.4535


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Area of Simple Shapes with 30 labels
Successfully added child model for Area of Simple Shapes
Parent: Area of Simple Shapes, Epoch: 1/3, Average Loss: 3.4360
Parent: Area of Simple Shapes, Epoch: 2/3, Average Loss: 3.3548


  2%|▏         | 4/163 [00:58<47:53, 18.07s/it]

Parent: Area of Simple Shapes, Epoch: 3/3, Average Loss: 3.3694


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Converting between Fractions and Percentages with 9 labels
Successfully added child model for Converting between Fractions and Percentages
Parent: Converting between Fractions and Percentages, Epoch: 1/3, Average Loss: 2.2229
Parent: Converting between Fractions and Percentages, Epoch: 2/3, Average Loss: 2.2380


  3%|▎         | 5/163 [01:06<37:28, 14.23s/it]

Parent: Converting between Fractions and Percentages, Epoch: 3/3, Average Loss: 2.1076


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Transformations of functions in the form f(x) with 8 labels
Successfully added child model for Transformations of functions in the form f(x)
Parent: Transformations of functions in the form f(x), Epoch: 1/3, Average Loss: 2.1281
Parent: Transformations of functions in the form f(x), Epoch: 2/3, Average Loss: 2.1011


  4%|▎         | 6/163 [01:14<32:12, 12.31s/it]

Parent: Transformations of functions in the form f(x), Epoch: 3/3, Average Loss: 2.0064


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Expanding Triple Brackets and more with 1 labels
Successfully added child model for Expanding Triple Brackets and more


  4%|▍         | 7/163 [01:15<22:22,  8.61s/it]

Error training model for Expanding Triple Brackets and more: Found dtype Long but expected Float


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Nets with 3 labels
Successfully added child model for Nets
Parent: Nets, Epoch: 1/3, Average Loss: 0.9563
Parent: Nets, Epoch: 2/3, Average Loss: 1.0263


  5%|▍         | 8/163 [01:26<23:58,  9.28s/it]

Parent: Nets, Epoch: 3/3, Average Loss: 0.8525


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Time with 22 labels
Successfully added child model for Time
Parent: Time, Epoch: 1/3, Average Loss: 3.1132
Parent: Time, Epoch: 2/3, Average Loss: 2.9969


  6%|▌         | 9/163 [01:54<38:42, 15.08s/it]

Parent: Time, Epoch: 3/3, Average Loss: 2.8255


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Trial and Improvement and Iterative Methods with 4 labels
Successfully added child model for Trial and Improvement and Iterative Methods
Parent: Trial and Improvement and Iterative Methods, Epoch: 1/3, Average Loss: 1.3917
Parent: Trial and Improvement and Iterative Methods, Epoch: 2/3, Average Loss: 1.3095


  6%|▌         | 10/163 [02:01<32:15, 12.65s/it]

Parent: Trial and Improvement and Iterative Methods, Epoch: 3/3, Average Loss: 1.2855


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Sharing in a Ratio with 14 labels
Successfully added child model for Sharing in a Ratio
Parent: Sharing in a Ratio, Epoch: 1/3, Average Loss: 2.6100
Parent: Sharing in a Ratio, Epoch: 2/3, Average Loss: 2.6116


  7%|▋         | 11/163 [02:18<35:05, 13.85s/it]

Parent: Sharing in a Ratio, Epoch: 3/3, Average Loss: 2.5673


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Ordering Negative Numbers with 6 labels
Successfully added child model for Ordering Negative Numbers
Parent: Ordering Negative Numbers, Epoch: 1/3, Average Loss: 1.8938
Parent: Ordering Negative Numbers, Epoch: 2/3, Average Loss: 1.8721


  7%|▋         | 12/163 [02:22<27:25, 10.90s/it]

Parent: Ordering Negative Numbers, Epoch: 3/3, Average Loss: 1.7641


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Adding and Subtracting Negative Numbers with 14 labels
Successfully added child model for Adding and Subtracting Negative Numbers
Parent: Adding and Subtracting Negative Numbers, Epoch: 1/3, Average Loss: 2.7335
Parent: Adding and Subtracting Negative Numbers, Epoch: 2/3, Average Loss: 2.6800


  8%|▊         | 13/163 [02:38<31:22, 12.55s/it]

Parent: Adding and Subtracting Negative Numbers, Epoch: 3/3, Average Loss: 2.6122


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Adding and Subtracting Algebraic Fractions with 6 labels
Successfully added child model for Adding and Subtracting Algebraic Fractions
Parent: Adding and Subtracting Algebraic Fractions, Epoch: 1/3, Average Loss: 1.8197
Parent: Adding and Subtracting Algebraic Fractions, Epoch: 2/3, Average Loss: 1.8453


  9%|▊         | 14/163 [02:43<25:39, 10.33s/it]

Parent: Adding and Subtracting Algebraic Fractions, Epoch: 3/3, Average Loss: 1.7952


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Volume of Prisms with 15 labels
Successfully added child model for Volume of Prisms
Parent: Volume of Prisms, Epoch: 1/3, Average Loss: 2.7741
Parent: Volume of Prisms, Epoch: 2/3, Average Loss: 2.6150


  9%|▉         | 15/163 [03:04<33:13, 13.47s/it]

Parent: Volume of Prisms, Epoch: 3/3, Average Loss: 2.3449


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Factorising into a Double Bracket with 11 labels
Successfully added child model for Factorising into a Double Bracket
Parent: Factorising into a Double Bracket, Epoch: 1/3, Average Loss: 2.4166
Parent: Factorising into a Double Bracket, Epoch: 2/3, Average Loss: 2.2167


 10%|▉         | 16/163 [03:24<37:35, 15.34s/it]

Parent: Factorising into a Double Bracket, Epoch: 3/3, Average Loss: 2.1690


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Mental Multiplication and Division with 15 labels
Successfully added child model for Mental Multiplication and Division
Parent: Mental Multiplication and Division, Epoch: 1/3, Average Loss: 2.7263
Parent: Mental Multiplication and Division, Epoch: 2/3, Average Loss: 2.7472


 10%|█         | 17/163 [03:38<36:16, 14.91s/it]

Parent: Mental Multiplication and Division, Epoch: 3/3, Average Loss: 2.6816


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Substitution into Formula with 11 labels
Successfully added child model for Substitution into Formula
Parent: Substitution into Formula, Epoch: 1/3, Average Loss: 2.4476
Parent: Substitution into Formula, Epoch: 2/3, Average Loss: 2.4342


 11%|█         | 18/163 [03:49<33:48, 13.99s/it]

Parent: Substitution into Formula, Epoch: 3/3, Average Loss: 2.3224


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Counting with 2 labels
Successfully added child model for Counting
Parent: Counting, Epoch: 1/3, Average Loss: 0.7792
Parent: Counting, Epoch: 2/3, Average Loss: 0.6412


 12%|█▏        | 19/163 [03:56<28:01, 11.68s/it]

Parent: Counting, Epoch: 3/3, Average Loss: 0.5694


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Solving Linear Inequalities with 7 labels
Successfully added child model for Solving Linear Inequalities
Parent: Solving Linear Inequalities, Epoch: 1/3, Average Loss: 1.8493
Parent: Solving Linear Inequalities, Epoch: 2/3, Average Loss: 1.6458


 12%|█▏        | 20/163 [04:04<25:41, 10.78s/it]

Parent: Solving Linear Inequalities, Epoch: 3/3, Average Loss: 1.8350


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Linear Equations with 24 labels
Successfully added child model for Linear Equations
Parent: Linear Equations, Epoch: 1/3, Average Loss: 3.2502
Parent: Linear Equations, Epoch: 2/3, Average Loss: 3.1858


 13%|█▎        | 21/163 [04:24<31:43, 13.40s/it]

Parent: Linear Equations, Epoch: 3/3, Average Loss: 3.1399


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Real Life Graphs with 21 labels
Successfully added child model for Real Life Graphs
Parent: Real Life Graphs, Epoch: 1/3, Average Loss: 3.0997
Parent: Real Life Graphs, Epoch: 2/3, Average Loss: 2.8934


 13%|█▎        | 22/163 [04:41<33:48, 14.39s/it]

Parent: Real Life Graphs, Epoch: 3/3, Average Loss: 2.8252


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Place Value with 34 labels
Successfully added child model for Place Value
Parent: Place Value, Epoch: 1/3, Average Loss: 3.6548
Parent: Place Value, Epoch: 2/3, Average Loss: 3.4810


 14%|█▍        | 23/163 [05:05<40:28, 17.34s/it]

Parent: Place Value, Epoch: 3/3, Average Loss: 3.4017


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Simplifying Expressions by Collecting Like Terms with 9 labels
Successfully added child model for Simplifying Expressions by Collecting Like Terms
Parent: Simplifying Expressions by Collecting Like Terms, Epoch: 1/3, Average Loss: 2.1564
Parent: Simplifying Expressions by Collecting Like Terms, Epoch: 2/3, Average Loss: 2.1104


 15%|█▍        | 24/163 [05:16<36:01, 15.55s/it]

Parent: Simplifying Expressions by Collecting Like Terms, Epoch: 3/3, Average Loss: 2.0168


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Simplifying Fractions with 5 labels
Successfully added child model for Simplifying Fractions
Parent: Simplifying Fractions, Epoch: 1/3, Average Loss: 1.7915
Parent: Simplifying Fractions, Epoch: 2/3, Average Loss: 1.6366


 15%|█▌        | 25/163 [05:22<28:58, 12.60s/it]

Parent: Simplifying Fractions, Epoch: 3/3, Average Loss: 1.5208


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Parallel Lines with 10 labels
Successfully added child model for Parallel Lines
Parent: Parallel Lines, Epoch: 1/3, Average Loss: 2.4631
Parent: Parallel Lines, Epoch: 2/3, Average Loss: 2.3015


 16%|█▌        | 26/163 [05:32<27:21, 11.98s/it]

Parent: Parallel Lines, Epoch: 3/3, Average Loss: 2.2118


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Dividing Fractions with 13 labels
Successfully added child model for Dividing Fractions
Parent: Dividing Fractions, Epoch: 1/3, Average Loss: 2.6681
Parent: Dividing Fractions, Epoch: 2/3, Average Loss: 2.6102


 17%|█▋        | 27/163 [05:47<28:45, 12.68s/it]

Parent: Dividing Fractions, Epoch: 3/3, Average Loss: 2.5682


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Properties of Triangles with 6 labels
Successfully added child model for Properties of Triangles
Parent: Properties of Triangles, Epoch: 1/3, Average Loss: 1.7846
Parent: Properties of Triangles, Epoch: 2/3, Average Loss: 1.6344


 17%|█▋        | 28/163 [05:53<24:21, 10.82s/it]

Parent: Properties of Triangles, Epoch: 3/3, Average Loss: 1.6033


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Function Machines with 14 labels
Successfully added child model for Function Machines
Parent: Function Machines, Epoch: 1/3, Average Loss: 2.6767
Parent: Function Machines, Epoch: 2/3, Average Loss: 2.6060


 18%|█▊        | 29/163 [06:11<28:56, 12.96s/it]

Parent: Function Machines, Epoch: 3/3, Average Loss: 2.5689


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Writing Expressions with 29 labels
Successfully added child model for Writing Expressions
Parent: Writing Expressions, Epoch: 1/3, Average Loss: 3.4039
Parent: Writing Expressions, Epoch: 2/3, Average Loss: 3.1983


 18%|█▊        | 30/163 [06:51<46:26, 20.95s/it]

Parent: Writing Expressions, Epoch: 3/3, Average Loss: 3.0541


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Fractions of an Amount with 10 labels
Successfully added child model for Fractions of an Amount
Parent: Fractions of an Amount, Epoch: 1/3, Average Loss: 2.3144
Parent: Fractions of an Amount, Epoch: 2/3, Average Loss: 2.2478


 19%|█▉        | 31/163 [07:00<38:34, 17.53s/it]

Parent: Fractions of an Amount, Epoch: 3/3, Average Loss: 2.1895


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Right-angled Triangles (SOHCAHTOA) with 8 labels
Successfully added child model for Right-angled Triangles (SOHCAHTOA)
Parent: Right-angled Triangles (SOHCAHTOA), Epoch: 1/3, Average Loss: 2.0320
Parent: Right-angled Triangles (SOHCAHTOA), Epoch: 2/3, Average Loss: 1.9780


 20%|█▉        | 32/163 [07:10<33:06, 15.16s/it]

Parent: Right-angled Triangles (SOHCAHTOA), Epoch: 3/3, Average Loss: 1.7732


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Inequalities on Number Lines with 2 labels
Successfully added child model for Inequalities on Number Lines
Parent: Inequalities on Number Lines, Epoch: 1/3, Average Loss: 0.7722
Parent: Inequalities on Number Lines, Epoch: 2/3, Average Loss: 0.6770


 20%|██        | 33/163 [07:16<27:07, 12.52s/it]

Parent: Inequalities on Number Lines, Epoch: 3/3, Average Loss: 0.6574


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Percentages of an Amount with 22 labels
Successfully added child model for Percentages of an Amount
Parent: Percentages of an Amount, Epoch: 1/3, Average Loss: 3.1214
Parent: Percentages of an Amount, Epoch: 2/3, Average Loss: 3.0224


 21%|██        | 34/163 [07:36<31:42, 14.75s/it]

Parent: Percentages of an Amount, Epoch: 3/3, Average Loss: 2.9237


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Converting Mixed Number and Improper Fractions with 18 labels
Successfully added child model for Converting Mixed Number and Improper Fractions
Parent: Converting Mixed Number and Improper Fractions, Epoch: 1/3, Average Loss: 2.9157
Parent: Converting Mixed Number and Improper Fractions, Epoch: 2/3, Average Loss: 2.7771


 21%|██▏       | 35/163 [07:49<30:05, 14.11s/it]

Parent: Converting Mixed Number and Improper Fractions, Epoch: 3/3, Average Loss: 2.8432


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Ordering Fractions with 10 labels
Successfully added child model for Ordering Fractions
Parent: Ordering Fractions, Epoch: 1/3, Average Loss: 2.2195
Parent: Ordering Fractions, Epoch: 2/3, Average Loss: 2.1442


 22%|██▏       | 36/163 [08:00<28:10, 13.31s/it]

Parent: Ordering Fractions, Epoch: 3/3, Average Loss: 2.0376


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Averages and Range from Frequency Table with 16 labels
Successfully added child model for Averages and Range from Frequency Table
Parent: Averages and Range from Frequency Table, Epoch: 1/3, Average Loss: 2.9153
Parent: Averages and Range from Frequency Table, Epoch: 2/3, Average Loss: 2.7518


 23%|██▎       | 37/163 [08:14<27:59, 13.33s/it]

Parent: Averages and Range from Frequency Table, Epoch: 3/3, Average Loss: 2.6982


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Quadratic Equations with 21 labels
Successfully added child model for Quadratic Equations
Parent: Quadratic Equations, Epoch: 1/3, Average Loss: 3.1028
Parent: Quadratic Equations, Epoch: 2/3, Average Loss: 2.9156


 23%|██▎       | 38/163 [08:43<37:40, 18.09s/it]

Parent: Quadratic Equations, Epoch: 3/3, Average Loss: 2.8919


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Adding and Subtracting with Decimals with 10 labels
Successfully added child model for Adding and Subtracting with Decimals
Parent: Adding and Subtracting with Decimals, Epoch: 1/3, Average Loss: 2.3375
Parent: Adding and Subtracting with Decimals, Epoch: 2/3, Average Loss: 2.2289


 24%|██▍       | 39/163 [08:57<34:51, 16.87s/it]

Parent: Adding and Subtracting with Decimals, Epoch: 3/3, Average Loss: 2.2230


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Experimental Probability and Relative Frequency with 1 labels
Successfully added child model for Experimental Probability and Relative Frequency


 25%|██▍       | 40/163 [08:58<24:48, 12.10s/it]

Error training model for Experimental Probability and Relative Frequency: Found dtype Long but expected Float


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Created child model for Horizontal and Vertical Lines with 8 labels
Successfully added child model for Horizontal and Vertical Lines


In [None]:
import os

# Create directory to save models
os.makedirs('models', exist_ok=True)

# Save parent model
model.parent_model.save_pretrained('models/parent_model')

# Save child models
for parent, child_model in model.child_models.items():
    model.child_models[parent].save_pretrained(f'models/child_model_{parent}')

In [None]:
from sklearn.metrics import classification_report

def evaluate_stage1(model, train_df, tokenizer, device):
    dataset = DistractorDataset(
        texts=train_df['DistractorText'].tolist(),
        labels=train_df['ParentCategoryEncoded'].tolist(),
        tokenizer=tokenizer
    )
    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)
    model.parent_model.to(device)
    model.parent_model.eval()
    
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model.parent_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    print(classification_report(all_labels, all_preds, target_names=preprocessor.label_encoders['parent'].classes_))

# Evaluate Stage 1
evaluate_stage1(model, train_df, model.tokenizer, device)

In [None]:
def evaluate_stage2(model, train_df, tokenizer, hierarchy, device):
    for parent in hierarchy.parent_to_children.keys():
        child_subset = train_df[train_df['ParentCategory'] == parent]
        if child_subset.empty:
            continue
        
        child_model = model.child_models.get(parent, None)
        if child_model is None:
            continue
        
        child_model.to(device)
        child_model.eval()
        
        dataset = DistractorDataset(
            texts=child_subset['DistractorText'].tolist(),
            labels=child_subset['MisconceptionEncoded'].tolist(),
            tokenizer=tokenizer
        )
        dataloader = DataLoader(dataset, batch_size=16, shuffle=False)
        
        all_preds = []
        all_labels = []
        
        with torch.no_grad():
            for batch in dataloader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = child_model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits
                preds = torch.argmax(logits, dim=1).cpu().numpy()
                all_preds.extend(preds)
                all_labels.extend(labels.cpu().numpy())
        
        print(f"Classification Report for Parent Category: {parent}")
        print(classification_report(all_labels, all_preds, target_names=preprocessor.label_encoders['child'].classes_))
        print("\n")

# Evaluate Stage 2
evaluate_stage2(model, train_df, model.tokenizer, hierarchy, device)