In [7]:
local = True

log = True

log_detail = False

# Imports

## Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import gc
import os
import sys
from tqdm import tqdm
sys.path.append(os.path.abspath('..'))
import importlib
import pickle

In [3]:
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import average_precision_score
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

## Data

In [8]:
if local:

    misconceptions = pd.read_csv('../kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv', index_col='MisconceptionId')
    train = pd.read_csv('../kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv')
    test = pd.read_csv('../kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv')

else:

    misconceptions = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv', index_col='MisconceptionId')

    train = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv')

    test = pd.read_csv('/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv')

if log: print("(1) Imported data")

(1) Imported data


In [9]:
#########################

# Create train_melted 

#########################



# Define the identifier columns

id_cols = [

    'QuestionId', 'ConstructId', 'ConstructName', 

    'SubjectId', 'SubjectName', 'CorrectAnswer', 'QuestionText'

]



# Define the corresponding Answer options

answer_cols = ['AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText']

misconception_cols = ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']



# Melt Answer Text

text_melted = train.melt(

    id_vars=id_cols,

    value_vars=answer_cols,

    var_name='Attribute',

    value_name='AnswerText'

)



# Melt Misconception IDs

misconception_melted = train.melt(

    id_vars=id_cols,

    value_vars=misconception_cols,

    var_name='Attribute',

    value_name='MisconceptionId'

)



# Extract the option letter (A, B, C, D) and the attribute type

text_melted['AnswerOption'] = text_melted['Attribute'].str.extract(r'Answer([ABCD])Text')[0]

misconception_melted['AnswerOption'] = misconception_melted['Attribute'].str.extract(r'Misconception([ABCD])Id')[0]



# Drop the original 'Attribute' columns as they are no longer needed

text_melted.drop('Attribute', axis=1, inplace=True)

misconception_melted.drop('Attribute', axis=1, inplace=True)



# Merge the two melted DataFrames on id_vars and AnswerOption

train_melted = pd.merge(

    text_melted,

    misconception_melted,

    on=id_cols + ['AnswerOption'],

    how='left'

)



train_melted = train_melted.merge(misconceptions, left_on='MisconceptionId', right_index=True, how='left')

train_melted = train_melted[train_melted['CorrectAnswer'] != train_melted['AnswerOption']]

In [10]:
print(f"SubjectName nunique: {train_melted['SubjectName'].nunique()}")

print(f"ConstructName nunique: {train_melted['ConstructName'].nunique()}")

print(f"MisconceptionName nunique: {train_melted['MisconceptionName'].nunique()}")


SubjectName nunique: 163
ConstructName nunique: 757
MisconceptionName nunique: 1604


In [11]:
hierarchy_scm = {}

for subject in train_melted['SubjectName'].unique():

    subject_data = train_melted[train_melted['SubjectName'] == subject]

    constructs = subject_data['ConstructName'].dropna().unique()

    constructs_list = []

    for construct in constructs:

        misconceptions_list = subject_data[subject_data['ConstructName'] == construct]['MisconceptionName'].dropna().unique().tolist()

        constructs_list.extend(misconceptions_list)

    hierarchy_scm[subject] = constructs_list

In [14]:
print(len(hierarchy_scm))
avg_len = sum(len(misconceptions) for misconceptions in hierarchy_scm.values()) / len(hierarchy_scm)
print(f"Average number of misconceptions per subject: {avg_len:.2f}")


163
Average number of misconceptions per subject: 16.81


# Classes

In [10]:
class Hierarchy:

    def __init__(self):

        self.parent_to_children = {}

        for subject, misconceptions in hierarchy_scm.items():

            self.parent_to_children[subject] = misconceptions

        self.child_to_parent = {child: parent for parent, children in self.parent_to_children.items() for child in children}

    

    def get_parent(self, child):

        return self.child_to_parent.get(child, None)

    

    def get_children(self, parent):

        return self.parent_to_children.get(parent, [])

In [11]:
class DataPreprocessor:

    def __init__(self, train_df, test_df, hierarchy):

        self.train_df = train_df

        self.test_df = test_df

        self.hierarchy = hierarchy

        self.label_encoders = {}

        self.child_label_encoders = {}

    

    def map_hierarchy(self):

        self.train_df['ParentCategory'] = self.train_df['MisconceptionName'].apply(self.hierarchy.get_parent)

        self.test_df['ParentCategory'] = None  # To be predicted later

    

    def encode_labels(self):

        # Encode parent categories

        le_parent = LabelEncoder()

        self.train_df['ParentCategoryEncoded'] = le_parent.fit_transform(self.train_df['ParentCategory'])

        self.label_encoders['parent'] = le_parent

        

        # Encode specific misconceptions per parent

        self.child_label_encoders = {}

        self.train_df['MisconceptionEncoded'] = -1  # Initialize with a placeholder

        

        for parent in self.hierarchy.parent_to_children.keys():

            misconception_subset = self.train_df[self.train_df['ParentCategory'] == parent]['MisconceptionName']

            le_child = LabelEncoder()

            encoded = le_child.fit_transform(misconception_subset)

            self.train_df.loc[self.train_df['ParentCategory'] == parent, 'MisconceptionEncoded'] = encoded

            self.child_label_encoders[parent] = le_child

        

    def preprocess(self):

        self.map_hierarchy()

        self.encode_labels()

        # Additional preprocessing steps like text cleaning can be added here

        return self.train_df, self.test_df

In [12]:
class HierarchicalClassifier:
    def __init__(self, parent_num_labels, pretrained_model='/kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594'):
        self.tokenizer = BertTokenizer.from_pretrained(pretrained_model)
        self.parent_model = None
        self.child_models = {}  # Dictionary to store child models
        self.pretrained_model = pretrained_model
        self.parent_num_labels = parent_num_labels
    
    def load_parent_model(self, device):
        if self.parent_model is not None:
            self.parent_model.cpu()
            del self.parent_model
            torch.cuda.empty_cache()
        
        self.parent_model = BertForSequenceClassification.from_pretrained(
            self.pretrained_model, 
            num_labels=self.parent_num_labels
        ).to(device)
    
    def load_child_model(self, parent_category, num_labels, device):
        # Clear only this specific child model if it exists
        if parent_category in self.child_models:
            self.child_models[parent_category].cpu()
            del self.child_models[parent_category]
            torch.cuda.empty_cache()
        
        child_model = BertForSequenceClassification.from_pretrained(
            self.pretrained_model, 
            num_labels=num_labels
        ).to(device)
        
        self.child_models[parent_category] = child_model
        return child_model
    
    def save_model(self, path):
        os.makedirs(path, exist_ok=True)
        
        if self.parent_model is not None:
            self.parent_model.save_pretrained(os.path.join(path, 'parent_model'))
        
        for parent_key, child_model in self.child_models.items():
            safe_key = str(parent_key).replace(" ", "_").replace("/", "_")
            child_model.save_pretrained(os.path.join(path, f'child_model_{safe_key}'))
    
    def load_model(self, path):
        parent_path = os.path.join(path, 'parent_model')
        if os.path.exists(parent_path):
            self.parent_model = BertForSequenceClassification.from_pretrained(parent_path)
        
        for model_dir in os.listdir(path):
            if model_dir.startswith('child_model_'):
                parent_key = model_dir.replace('child_model_', '').replace("_", " ")
                model_path = os.path.join(path, model_dir)
                self.child_models[parent_key] = BertForSequenceClassification.from_pretrained(model_path)

    def clear_models(self):
        """Clear all models from memory"""
        if self.parent_model is not None:
            self.parent_model.cpu()
            del self.parent_model
            self.parent_model = None
        
        for key in list(self.child_models.keys()):
            self.child_models[key].cpu()
            del self.child_models[key]
        self.child_models.clear()
        
        torch.cuda.empty_cache()
        gc.collect()

In [13]:
class DistractorDataset(Dataset):

    def __init__(self, texts, labels, tokenizer, max_length=64):

        self.texts = texts

        self.labels = labels

        self.tokenizer = tokenizer

        self.max_length = max_length

    

    def __len__(self):

        return len(self.texts)

    

    def __getitem__(self, idx):

        encoding = self.tokenizer.encode_plus(

            self.texts[idx],

            add_special_tokens=True,

            max_length=self.max_length,

            padding='max_length',

            truncation=True,

            return_attention_mask=True,

            return_tensors='pt'

        )

        return {

            'input_ids': encoding['input_ids'].flatten(),

            'attention_mask': encoding['attention_mask'].flatten(),

            'labels': torch.tensor(self.labels[idx], dtype=torch.long)

        }


def train_stage1(model, train_df, tokenizer, device, epochs=3):
    # Load parent model
    model.load_parent_model(device)
    
    dataset = DistractorDataset(
        texts=train_df['AnswerText'].tolist(),
        labels=train_df['ParentCategoryEncoded'].tolist(),
        tokenizer=tokenizer,
        max_length=64  # Reduced max length
    )
    
    # Smaller batch size with gradient accumulation
    batch_size = 4
    gradient_accumulation_steps = 8
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    optimizer = torch.optim.AdamW(model.parent_model.parameters(), lr=2e-5)
    
    for epoch in range(epochs):
        total_loss = 0.0
        optimizer.zero_grad()
        
        for i, batch in enumerate(tqdm(dataloader)):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model.parent_model(
                input_ids=input_ids, 
                attention_mask=attention_mask, 
                labels=labels
            )
            
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()
            
            if (i + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                torch.cuda.empty_cache()
            
            # Free memory
            del outputs, input_ids, attention_mask, labels
            
            total_loss += loss.item() * gradient_accumulation_steps
        
        avg_loss = total_loss / len(dataloader)
        print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_loss:.4f}")

def train_stage2(model, train_df, tokenizer, hierarchy, device, epochs=3):
    # Explicitly clear parent model and GPU cache
    model.clear_models()
    torch.cuda.empty_cache()
    gc.collect()
    
    for parent in tqdm(hierarchy.parent_to_children.keys()):
        parent_key = str(parent).replace(" ", "_").replace("/", "_")
        child_subset = train_df[train_df['ParentCategory'] == parent]
        
        if child_subset.empty:
            continue
            
        num_labels = len(child_subset['MisconceptionEncoded'].unique())
        
        try:
            # Load child model for this parent
            child_model = model.load_child_model(parent, num_labels, device)
            
            dataset = DistractorDataset(
                texts=child_subset['AnswerText'].tolist(),
                labels=child_subset['MisconceptionEncoded'].tolist(),
                tokenizer=tokenizer,
                max_length=64
            )
            
            # Further reduced batch size and increased gradient accumulation
            batch_size = 1
            gradient_accumulation_steps = 32
            dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
            
            optimizer = torch.optim.AdamW(child_model.parameters(), lr=2e-5)
            
            for epoch in range(epochs):
                total_loss = 0
                optimizer.zero_grad()
                
                for i, batch in enumerate(dataloader):
                    # Move batch to CPU first to clear GPU memory
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].long().to(device)
                    
                    # Clear cache before forward pass
                    torch.cuda.empty_cache()
                    
                    outputs = child_model(
                        input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels
                    )
                    
                    loss = outputs.loss / gradient_accumulation_steps
                    loss.backward()
                    
                    if (i + 1) % gradient_accumulation_steps == 0:
                        optimizer.step()
                        optimizer.zero_grad()
                    
                    # Explicitly clear memory after each batch
                    del outputs, input_ids, attention_mask, labels
                    torch.cuda.empty_cache()
                    
                    total_loss += loss.item() * gradient_accumulation_steps
                
                avg_loss = total_loss / len(dataloader)
                if log:
                    print(f"Parent: {parent}, Epoch: {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
                
                # Clear cache after each epoch
                torch.cuda.empty_cache()
                gc.collect()
            
            # Clear model from GPU after training for this parent
            child_model.cpu()
            torch.cuda.empty_cache()
            gc.collect()
                
        except Exception as e:
            print(f"Error training model for {parent}: {e}")
            # Ensure cleanup even if error occurs
            torch.cuda.empty_cache()
            gc.collect()
            continue
        
        # Additional cleanup between parents
        torch.cuda.empty_cache()
        gc.collect()

In [14]:
def predict(model, texts, hierarchy, tokenizer, label_encoders, device):

    dataset = DistractorDataset(

        texts=texts,

        labels=[0]*len(texts),  # Dummy labels

        tokenizer=tokenizer

    )

    dataloader = DataLoader(dataset, batch_size=16, shuffle=False)

    model.parent_model.to(device)

    model.parent_model.eval()

    

    parent_preds = []

    with torch.no_grad():

        for batch in dataloader:

            input_ids = batch['input_ids'].to(device)

            attention_mask = batch['attention_mask'].to(device)

            outputs = model.parent_model(input_ids=input_ids, attention_mask=attention_mask)

            logits = outputs.logits

            preds = torch.argmax(logits, dim=1).cpu().numpy()

            parent_preds.extend(preds)

    

    parent_labels = label_encoders['parent'].inverse_transform(parent_preds)

    specific_preds = []

    

    for i, parent in enumerate(parent_labels):

        child_model = model.child_models.get(parent, None)

        if child_model is None:

            specific_preds.append(None)

            continue

        child_model.to(device)

        child_model.eval()

        with torch.no_grad():

            encoding = tokenizer.encode_plus(

                texts[i],

                add_special_tokens=True,

                max_length=128,

                padding='max_length',

                truncation=True,

                return_attention_mask=True,

                return_tensors='pt'

            )

            input_ids = encoding['input_ids'].to(device)

            attention_mask = encoding['attention_mask'].to(device)

            outputs = child_model(input_ids=input_ids, attention_mask=attention_mask)

            logits = outputs.logits

            pred = torch.argmax(logits, dim=1).cpu().numpy()[0]

            # Use the specific encoder for the parent

            specific_misconception = label_encoders['child_label_encoders'][parent].inverse_transform([pred])[0]

            specific_preds.append(specific_misconception)

    

    return specific_preds

# Training

In [15]:
hierarchy = Hierarchy()

In [16]:
def calculate_map25(true_labels, pred_probs, k=25):
    """Calculate MAP@25 for predictions"""
    # Convert predictions to probabilities if they aren't already
    if isinstance(pred_probs, torch.Tensor):
        pred_probs = pred_probs.cpu().numpy()
    
    # Handle case where we have fewer than k classes
    k = min(k, pred_probs.shape[1])
    
    # Calculate AP for each sample and average
    aps = []
    for i in range(len(true_labels)):
        y_true = np.zeros(pred_probs.shape[1])
        y_true[true_labels[i]] = 1
        ap = average_precision_score(y_true, pred_probs[i])
        aps.append(ap)
    
    return np.mean(aps)

In [17]:
# Define paths to your data files

train_path = '../kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv'

test_path = '../kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv'



# Initialize DataPreprocessor

preprocessor = DataPreprocessor(train_df=train_melted, test_df=test, hierarchy=hierarchy)

train_df, test_df = preprocessor.preprocess()

In [18]:
# Determine the number of parent labels

parent_num_labels = train_df['ParentCategoryEncoded'].nunique()



# Initialize HierarchicalClassifier

model = HierarchicalClassifier(parent_num_labels=parent_num_labels) 

In [19]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

print(f'Using device: {device}')

Using device: cuda


In [21]:
# Train Stage 1: Parent Category Classification

train_stage1(model, train_df, model.tokenizer, device, epochs=1)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1402/1402 [01:00<00:00, 23.27it/s]

Epoch 1/1 - Loss: 4.4212





In [None]:
# Train Stage 2: Specific Misconception Classification

train_stage2(model, train_df, model.tokenizer, hierarchy, device)

  0%|          | 0/163 [00:00<?, ?it/s]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Simplifying Algebraic Fractions, Epoch: 1/3, Loss: 2.2835
Parent: Simplifying Algebraic Fractions, Epoch: 2/3, Loss: 2.2835
Parent: Simplifying Algebraic Fractions, Epoch: 3/3, Loss: 2.2835


  1%|          | 1/163 [00:03<10:01,  3.71s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Range and Interquartile Range from a List of Data, Epoch: 1/3, Loss: 2.3678
Parent: Range and Interquartile Range from a List of Data, Epoch: 2/3, Loss: 2.3678
Parent: Range and Interquartile Range from a List of Data, Epoch: 3/3, Loss: 2.3678


  1%|          | 2/163 [00:08<11:45,  4.38s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Properties of Quadrilaterals, Epoch: 1/3, Loss: 2.5455
Parent: Properties of Quadrilaterals, Epoch: 2/3, Loss: 2.5455
Parent: Properties of Quadrilaterals, Epoch: 3/3, Loss: 2.5455


  2%|▏         | 3/163 [00:13<12:19,  4.62s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Area of Simple Shapes, Epoch: 1/3, Loss: 3.4294
Parent: Area of Simple Shapes, Epoch: 2/3, Loss: 3.3754
Parent: Area of Simple Shapes, Epoch: 3/3, Loss: 3.2906


  2%|▏         | 4/163 [00:21<16:10,  6.11s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Converting between Fractions and Percentages, Epoch: 1/3, Loss: 2.1624
Parent: Converting between Fractions and Percentages, Epoch: 2/3, Loss: 2.1624
Parent: Converting between Fractions and Percentages, Epoch: 3/3, Loss: 2.1624


  3%|▎         | 5/163 [00:25<14:04,  5.35s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Transformations of functions in the form f(x), Epoch: 1/3, Loss: 2.1626
Parent: Transformations of functions in the form f(x), Epoch: 2/3, Loss: 2.1626
Parent: Transformations of functions in the form f(x), Epoch: 3/3, Loss: 2.1626


  4%|▎         | 6/163 [00:29<12:40,  4.85s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  4%|▍         | 7/163 [00:30<09:02,  3.48s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Error training model for Expanding Triple Brackets and more: Found dtype Long but expected Float
Parent: Nets, Epoch: 1/3, Loss: 0.9994
Parent: Nets, Epoch: 2/3, Loss: 0.9994
Parent: Nets, Epoch: 3/3, Loss: 0.9994


  5%|▍         | 8/163 [00:34<09:36,  3.72s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Time, Epoch: 1/3, Loss: 3.1830
Parent: Time, Epoch: 2/3, Loss: 3.1252
Parent: Time, Epoch: 3/3, Loss: 3.0649


  6%|▌         | 9/163 [00:42<13:11,  5.14s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Trial and Improvement and Iterative Methods, Epoch: 1/3, Loss: 1.3655
Parent: Trial and Improvement and Iterative Methods, Epoch: 2/3, Loss: 1.3655
Parent: Trial and Improvement and Iterative Methods, Epoch: 3/3, Loss: 1.3655


  6%|▌         | 10/163 [00:46<11:43,  4.60s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Sharing in a Ratio, Epoch: 1/3, Loss: 2.6757
Parent: Sharing in a Ratio, Epoch: 2/3, Loss: 2.6757
Parent: Sharing in a Ratio, Epoch: 3/3, Loss: 2.6757


  7%|▋         | 11/163 [00:51<12:09,  4.80s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Ordering Negative Numbers, Epoch: 1/3, Loss: 1.8653
Parent: Ordering Negative Numbers, Epoch: 2/3, Loss: 1.8653
Parent: Ordering Negative Numbers, Epoch: 3/3, Loss: 1.8653


  7%|▋         | 12/163 [00:55<11:04,  4.40s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Adding and Subtracting Negative Numbers, Epoch: 1/3, Loss: 2.8260
Parent: Adding and Subtracting Negative Numbers, Epoch: 2/3, Loss: 2.8260
Parent: Adding and Subtracting Negative Numbers, Epoch: 3/3, Loss: 2.8260


  8%|▊         | 13/163 [01:00<11:39,  4.66s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Adding and Subtracting Algebraic Fractions, Epoch: 1/3, Loss: 1.8472
Parent: Adding and Subtracting Algebraic Fractions, Epoch: 2/3, Loss: 1.8472
Parent: Adding and Subtracting Algebraic Fractions, Epoch: 3/3, Loss: 1.8472


  9%|▊         | 14/163 [01:03<10:32,  4.24s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Volume of Prisms, Epoch: 1/3, Loss: 2.7163
Parent: Volume of Prisms, Epoch: 2/3, Loss: 2.6434
Parent: Volume of Prisms, Epoch: 3/3, Loss: 2.5522


  9%|▉         | 15/163 [01:09<11:43,  4.75s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Factorising into a Double Bracket, Epoch: 1/3, Loss: 2.5363
Parent: Factorising into a Double Bracket, Epoch: 2/3, Loss: 2.3599
Parent: Factorising into a Double Bracket, Epoch: 3/3, Loss: 2.2272


 10%|▉         | 16/163 [01:16<13:06,  5.35s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Mental Multiplication and Division, Epoch: 1/3, Loss: 2.7974
Parent: Mental Multiplication and Division, Epoch: 2/3, Loss: 2.7974
Parent: Mental Multiplication and Division, Epoch: 3/3, Loss: 2.7974


 10%|█         | 17/163 [01:21<12:49,  5.27s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Substitution into Formula, Epoch: 1/3, Loss: 2.3666
Parent: Substitution into Formula, Epoch: 2/3, Loss: 2.3666
Parent: Substitution into Formula, Epoch: 3/3, Loss: 2.3666


 11%|█         | 18/163 [01:26<12:38,  5.23s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Counting, Epoch: 1/3, Loss: 0.7669
Parent: Counting, Epoch: 2/3, Loss: 0.7669
Parent: Counting, Epoch: 3/3, Loss: 0.7669


 12%|█▏        | 19/163 [01:29<11:19,  4.72s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Solving Linear Inequalities, Epoch: 1/3, Loss: 2.1333
Parent: Solving Linear Inequalities, Epoch: 2/3, Loss: 2.1333
Parent: Solving Linear Inequalities, Epoch: 3/3, Loss: 2.1333


 12%|█▏        | 20/163 [01:34<10:57,  4.60s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Linear Equations, Epoch: 1/3, Loss: 3.0851
Parent: Linear Equations, Epoch: 2/3, Loss: 3.0137
Parent: Linear Equations, Epoch: 3/3, Loss: 2.9321


 13%|█▎        | 21/163 [01:41<12:32,  5.30s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Real Life Graphs, Epoch: 1/3, Loss: 3.1564
Parent: Real Life Graphs, Epoch: 2/3, Loss: 3.0086
Parent: Real Life Graphs, Epoch: 3/3, Loss: 2.9394


 13%|█▎        | 22/163 [01:47<13:07,  5.58s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Place Value, Epoch: 1/3, Loss: 3.5776
Parent: Place Value, Epoch: 2/3, Loss: 3.5339
Parent: Place Value, Epoch: 3/3, Loss: 3.4946


 14%|█▍        | 23/163 [01:55<14:49,  6.35s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Simplifying Expressions by Collecting Like Terms, Epoch: 1/3, Loss: 2.3425
Parent: Simplifying Expressions by Collecting Like Terms, Epoch: 2/3, Loss: 2.3425
Parent: Simplifying Expressions by Collecting Like Terms, Epoch: 3/3, Loss: 2.3425


 15%|█▍        | 24/163 [02:00<13:37,  5.88s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Simplifying Fractions, Epoch: 1/3, Loss: 1.7385
Parent: Simplifying Fractions, Epoch: 2/3, Loss: 1.7385
Parent: Simplifying Fractions, Epoch: 3/3, Loss: 1.7385


 15%|█▌        | 25/163 [02:04<12:10,  5.29s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Parallel Lines, Epoch: 1/3, Loss: 2.4184
Parent: Parallel Lines, Epoch: 2/3, Loss: 2.4184
Parent: Parallel Lines, Epoch: 3/3, Loss: 2.4184


 16%|█▌        | 26/163 [02:08<11:37,  5.09s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Dividing Fractions, Epoch: 1/3, Loss: 2.5895
Parent: Dividing Fractions, Epoch: 2/3, Loss: 2.5895
Parent: Dividing Fractions, Epoch: 3/3, Loss: 2.5895


 17%|█▋        | 27/163 [02:14<11:36,  5.12s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Properties of Triangles, Epoch: 1/3, Loss: 1.8091
Parent: Properties of Triangles, Epoch: 2/3, Loss: 1.8091
Parent: Properties of Triangles, Epoch: 3/3, Loss: 1.8091


 17%|█▋        | 28/163 [02:18<10:46,  4.79s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Function Machines, Epoch: 1/3, Loss: 2.6816
Parent: Function Machines, Epoch: 2/3, Loss: 2.5753
Parent: Function Machines, Epoch: 3/3, Loss: 2.5338


 18%|█▊        | 29/163 [02:24<11:46,  5.27s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Writing Expressions, Epoch: 1/3, Loss: 3.4146
Parent: Writing Expressions, Epoch: 2/3, Loss: 3.2857
Parent: Writing Expressions, Epoch: 3/3, Loss: 3.1784


 18%|█▊        | 30/163 [02:35<15:43,  7.09s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Fractions of an Amount, Epoch: 1/3, Loss: 2.2710
Parent: Fractions of an Amount, Epoch: 2/3, Loss: 2.2710
Parent: Fractions of an Amount, Epoch: 3/3, Loss: 2.2710


 19%|█▉        | 31/163 [02:40<14:08,  6.43s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Right-angled Triangles (SOHCAHTOA), Epoch: 1/3, Loss: 2.2937
Parent: Right-angled Triangles (SOHCAHTOA), Epoch: 2/3, Loss: 2.2937
Parent: Right-angled Triangles (SOHCAHTOA), Epoch: 3/3, Loss: 2.2937


 20%|█▉        | 32/163 [02:45<12:47,  5.86s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Inequalities on Number Lines, Epoch: 1/3, Loss: 0.6793
Parent: Inequalities on Number Lines, Epoch: 2/3, Loss: 0.6793
Parent: Inequalities on Number Lines, Epoch: 3/3, Loss: 0.6793


 20%|██        | 33/163 [02:49<11:25,  5.27s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Percentages of an Amount, Epoch: 1/3, Loss: 3.0869
Parent: Percentages of an Amount, Epoch: 2/3, Loss: 3.0634
Parent: Percentages of an Amount, Epoch: 3/3, Loss: 2.9930


 21%|██        | 34/163 [02:57<13:00,  6.05s/it]Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Parent: Converting Mixed Number and Improper Fractions, Epoch: 1/3, Loss: 3.0766
Parent: Converting Mixed Number and Improper Fractions, Epoch: 2/3, Loss: 3.0766
Parent: Converting Mixed Number and Improper Fractions, Epoch: 3/3, Loss: 3.0766


In [None]:
# After training, save 

model.save_model('model_checkpoint')

In [None]:
def evaluate_stage1(model, train_df, tokenizer, device):
    # Load parent model first
    model.load_parent_model(device)
    
    dataset = DistractorDataset(
        texts=train_df['AnswerText'].tolist(),
        labels=train_df['ParentCategoryEncoded'].tolist(),
        tokenizer=tokenizer,
        max_length=64  # Match training sequence length
    )
    dataloader = DataLoader(dataset, batch_size=8, shuffle=False)  # Smaller batch size
    
    model.parent_model.eval()
    
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model.parent_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
            
            # Free memory
            del outputs, logits, input_ids, attention_mask, labels
            torch.cuda.empty_cache()
    
    # Get unique parent categories and filter out None values
    target_names = [str(x) for x in train_df['ParentCategory'].unique() if pd.notna(x)]
    
    # Get unique label indices
    unique_labels = sorted(set(all_labels))
    
    print(classification_report(
        all_labels, 
        all_preds, 
        labels=unique_labels,
        target_names=target_names,
        zero_division=0
    ))

def evaluate_stage2(model, train_df, tokenizer, hierarchy, device):
    for parent in hierarchy.parent_to_children.keys():
        parent_key = str(parent).replace(" ", "_").replace("/", "_")
        child_subset = train_df[train_df['ParentCategory'] == parent]
        
        if child_subset.empty:
            continue
        
        num_labels = len(child_subset['MisconceptionEncoded'].unique())
        
        try:
            # Load child model for this parent
            child_model = model.load_child_model(parent, num_labels, device)
            
            dataset = DistractorDataset(
                texts=child_subset['AnswerText'].tolist(),
                labels=child_subset['MisconceptionEncoded'].tolist(),
                tokenizer=tokenizer,
                max_length=64
            )
            dataloader = DataLoader(dataset, batch_size=8, shuffle=False)
            
            child_model.eval()
            
            all_preds = []
            all_labels = []
            all_probs = []
            
            with torch.no_grad():
                for batch in dataloader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    
                    outputs = child_model(input_ids=input_ids, attention_mask=attention_mask)
                    logits = outputs.logits
                    probs = torch.softmax(logits, dim=1)
                    preds = torch.argmax(logits, dim=1).cpu().numpy()
                    
                    all_preds.extend(preds)
                    all_labels.extend(labels.cpu().numpy())
                    all_probs.extend(probs.cpu().numpy())
                    
                    # Free memory
                    del outputs, logits, probs, input_ids, attention_mask, labels
                    torch.cuda.empty_cache()

            # Calculate MAP@25
            map25 = calculate_map25(all_labels, np.array(all_probs))
            print(f"\nMAP@25 Score for {parent}: {map25:.4f}")
            
            # Get unique misconception names and filter out None values
            target_names = [str(x) for x in child_subset['MisconceptionName'].unique() if pd.notna(x)]
            
            # Get unique label indices
            unique_labels = sorted(set(all_labels))
            
            print(f"\nClassification Report for Parent Category: {parent}")
            print(classification_report(
                all_labels, 
                all_preds, 
                labels=unique_labels,
                target_names=target_names,
                zero_division=0
            ))
            
        except Exception as e:
            print(f"Error evaluating model for {parent}: {e}")
            continue
        
        finally:
            # Clean up memory
            torch.cuda.empty_cache()
            gc.collect()

In [None]:
# Before prediction, load the model
model.load_model('model_checkpoint')

In [None]:
evaluate_stage1(model, train_df, model.tokenizer, device)

In [None]:
evaluate_stage2(model, train_df, model.tokenizer, hierarchy, device)

In [None]:
if log: print("(7) Melt test set")
    
# Create a dictionary to map MisconceptionName to MisconceptionId
name_to_id = misconceptions.reset_index().set_index('MisconceptionName')['MisconceptionId'].to_dict()

# Reshape the test DataFrame to have one row per QuestionId_Answer (A, B, C, D)
test_melted = test.melt(
    id_vars=['QuestionId', 'QuestionText', 'CorrectAnswer'],
    value_vars=['AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText'],
    var_name='AnswerOption',
    value_name='AnswerText'
)

# Clean the 'AnswerOption' column to obtain A, B, C, D
test_melted['AnswerOption'] = test_melted['AnswerOption'].str.replace('Answer', '').str.replace('Text', '')
test_melted['QA_Id'] = test_melted['QuestionId'].astype(str) + '_' + test_melted['AnswerOption']

# Drop correct answers
test_melted = test_melted[test_melted['CorrectAnswer'] != test_melted['AnswerOption']]

In [None]:
# Create submission predictions
def create_submission(model, test_df, tokenizer, hierarchy, label_encoders, device):
    # Ensure model is loaded
    if not hasattr(model, 'parent_model') or model.parent_model is None:
        model.load_model('model_checkpoint')
    
    # Get predictions for test data
    predictions = predict(model, test_melted['AnswerText'].tolist(), hierarchy, tokenizer, label_encoders, device)
    
    # Create submission DataFrame
    submission = pd.DataFrame({
        'Id': test_df.index,
        'MisconceptionName': predictions
    })
    
    # Save submission file
    submission.to_csv('submission.csv', index=False)
    if log:
        print("Submission file created successfully")
    return submission

In [None]:
create_submission(model, test_melted, model.tokenizer, hierarchy, preprocessor.label_encoders, device)