# BERT as a baseline



Uses BERT to classify misconceptions.



Key limitations:

- The task is framed as classification, but the dataset has more than 2,500 classes. Several classes have only one example, and some data points have no target class. 

In [85]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import os
import sys
sys.path.append(os.path.abspath('..'))
import importlib
import pickle

In [86]:
# Remove every pandas display limit so wide frames and long text cells render in full.
for display_option in ('max_columns', 'max_rows', 'width', 'max_colwidth'):
    pd.set_option(f'display.{display_option}', None)

In [87]:
# Run-mode switches read by the cells below.
# local: True reads data from the relative '../kaggle/input' tree and loads a pickled model;
#        False reads from '/kaggle/input' and trains the model in this notebook.
local = True
# log: gates the numbered progress messages such as "(1) Imported data".
log = True
# log_detail: gates the extra before/after shape prints around the filtering step.
log_detail = False

In [107]:
# Choose the dataset root: a relative path for local runs, the mounted input directory otherwise.
data_root = '../kaggle/input' if local else '/kaggle/input'
competition_dir = f'{data_root}/eedi-mining-misconceptions-in-mathematics'

# The mapping file is indexed by MisconceptionId so it can be joined on that id later.
misconceptions = pd.read_csv(f'{competition_dir}/misconception_mapping.csv', index_col='MisconceptionId')
train = pd.read_csv(f'{competition_dir}/train.csv')
test = pd.read_csv(f'{competition_dir}/test.csv')

if log:
    print("(1) Imported data")

(1) Imported data


In [89]:
# Columns that identify a question; they are repeated on every melted row
id_cols = [
    'QuestionId', 'ConstructId', 'ConstructName',
    'SubjectId', 'SubjectName', 'CorrectAnswer', 'QuestionText'
]

# Per-option columns: the answer text and the misconception id for options A-D
answer_cols = [f'Answer{option}Text' for option in 'ABCD']
misconception_cols = [f'Misconception{option}Id' for option in 'ABCD']

# Long format of the answer texts: one row per (question, option).
# The option letter is pulled out of the original column name, which is then dropped.
text_melted = (
    train
    .melt(id_vars=id_cols, value_vars=answer_cols, var_name='Attribute', value_name='AnswerText')
    .assign(AnswerOption=lambda frame: frame['Attribute'].str.extract(r'Answer([ABCD])Text')[0])
    .drop(columns='Attribute')
)

# Same reshaping for the misconception ids
misconception_melted = (
    train
    .melt(id_vars=id_cols, value_vars=misconception_cols, var_name='Attribute', value_name='MisconceptionId')
    .assign(AnswerOption=lambda frame: frame['Attribute'].str.extract(r'Misconception([ABCD])Id')[0])
    .drop(columns='Attribute')
)

# Line up each answer text with its misconception id via the question columns plus the option letter
train_melted = text_melted.merge(
    misconception_melted,
    how='left',
    on=[*id_cols, 'AnswerOption'],
)

# Attach the misconception name by looking the id up in the mapping table's index
train_melted = train_melted.merge(
    misconceptions,
    how='left',
    left_on='MisconceptionId',
    right_index=True,
)

if log:
    print("(2) Created train_melted")

(2) Created train_melted


In [90]:
# All misconception names, in the row order of the mapping table
misconception_list = misconceptions['MisconceptionName'].tolist()

In [91]:
if log_detail:
    print("Shape before:", train_melted.shape)

# Keep only rows that have a misconception name, then discard misconceptions
# that appear on a single row.
train_melted = (
    train_melted
    .dropna(subset=['MisconceptionName'])
    .groupby('MisconceptionId')
    .filter(lambda group: len(group) > 1)
)

if log_detail:
    print("Shape after:", train_melted.shape)
if log:
    print("(3) Shaped train_melted to", train_melted.shape)

(3) Shaped train_melted to (3623, 11)


In [92]:
# Hold out 25% of the melted rows for validation, stratified on the misconception name.
# Note: this rebinds `train`, which until this point held the raw train.csv frame.
train, valid = train_test_split(
    train_melted,
    test_size=0.25,
    random_state=123,
    stratify=train_melted['MisconceptionName'],
)

In [93]:
from transformers import BertTokenizer

# Model identifier for local runs; otherwise the snapshot directory under /kaggle/input.
bert_path = (
    'bert-base-uncased'
    if local
    else '/kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594'
)

tokenizer = BertTokenizer.from_pretrained(bert_path)

def tokenize_data(text, max_length=128):
    """Tokenize one text with the notebook-level ``tokenizer``.

    The text is padded or truncated to ``max_length`` tokens, special tokens
    are added, and the result (including the attention mask) is requested as
    PyTorch tensors.

    Args:
        text: The string to encode.
        max_length: Target sequence length after padding/truncation.

    Returns:
        Whatever ``tokenizer.encode_plus`` returns for these options.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer.encode_plus(text, **encode_options)

In [94]:
# Example tokenization (local runs only): encode the first training question and show the result.
if local:
    sample_text = train['QuestionText'].iloc[0]
    tokenized_sample = tokenize_data(sample_text)
    # A bare expression inside an `if` body is not echoed by the notebook
    # (only a top-level last expression is), so print the encoding explicitly.
    print(tokenized_sample)

In [95]:
import torch
from torch import nn
from transformers import BertModel

class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout and a single linear classification layer.

    Args:
        n_classes: Size of the output layer, i.e. the number of target classes.
        pretrained_path: Optional model identifier or directory handed to
            ``BertModel.from_pretrained``. When omitted, the checkpoint is
            selected from the notebook-level ``local`` flag, exactly as before.
    """

    # Default checkpoints used when no explicit ``pretrained_path`` is given.
    LOCAL_CHECKPOINT = 'bert-base-uncased'
    KAGGLE_CHECKPOINT = '/kaggle/input/google-bertbert-base-uncased/transformers/default/1/cache/models--google-bert--bert-base-uncased/snapshots/86b5e0934494bd15c9632b12f734a8a67f723594'

    def __init__(self, n_classes, pretrained_path=None):
        super().__init__()
        if pretrained_path is None:
            pretrained_path = self.LOCAL_CHECKPOINT if local else self.KAGGLE_CHECKPOINT
        self.bert = BertModel.from_pretrained(pretrained_path)
        self.dropout = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)

    def forward(self, input_ids, attention_mask):
        """Return the classification logits for a batch.

        Args:
            input_ids: Token ids forwarded to the BERT encoder.
            attention_mask: Attention mask forwarded to the BERT encoder.

        Returns:
            The output of the final linear layer (one score per class).
        """
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Element 1 of the encoder output is BERT's pooled output. Per the
        # Transformers docs this is the first-token ([CLS]) hidden state after
        # the pooler layer, not the raw [CLS] hidden state.
        pooled_output = outputs[1]
        return self.out(self.dropout(pooled_output))

In [96]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from sklearn.preprocessing import LabelEncoder

# Learn the label vocabulary from the training split only, then apply the
# same mapping to both splits.
le = LabelEncoder().fit(train['MisconceptionName'])
train_labels = le.transform(train['MisconceptionName'])
val_labels = le.transform(valid['MisconceptionName'])
n_classes = len(le.classes_)

# Create a custom dataset
class MisconceptionDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of integer labels, aligned with ``texts``.
        tokenizer: Object exposing ``encode_plus`` with the keyword arguments used below.
        max_len: Sequence length each text is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with flattened ``input_ids``, ``attention_mask`` and a long ``labels`` tensor."""
        encoded = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        # Flatten each returned tensor to 1-D before handing it to the DataLoader.
        item = {field: encoded[field].flatten() for field in ('input_ids', 'attention_mask')}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# Wrap the question texts and encoded labels of each split in a dataset
train_dataset = MisconceptionDataset(
    texts=train['QuestionText'].to_numpy(), labels=train_labels, tokenizer=tokenizer
)
val_dataset = MisconceptionDataset(
    texts=valid['QuestionText'].to_numpy(), labels=val_labels, tokenizer=tokenizer
)

# Worker processes, pinned memory and prefetching are only requested for non-local runs.
loader_options = {} if local else dict(num_workers=4, pin_memory=True, prefetch_factor=2)

# Only the training loader shuffles; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False, **loader_options)

if log:
    print("(4) Created DataLoaders")

(4) Created DataLoaders


In [97]:
# Preview the first rows of the training split (the melted frame produced by train_test_split).
train.head()

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerText,AnswerOption,MisconceptionId,MisconceptionName
2948,1079,479,Calculate the square root of a number,246,"Square Roots, Cube Roots, etc",D,What is the square root of sixteen?,\( 32 \),B,2017.0,Mixes up square rooting and multiplying by 2 or doubling
3900,162,2760,Calculate the median as an average from an even numbered list of data,102,"Averages (mean, median, mode) from a List of Data",A,"What is the median of the following numbers?\n\[\n3,5,6,18,18,-4\n\]",\( 6 \),C,2426.0,When finding the median from a even dataset does not understand we must find the midpoint of the two values in the middle
2379,510,1961,Identify questions involving a 2D right-angled triangle that require the use of the Tangent (tan) ratio,279,Right-angled Triangles (SOHCAHTOA),C,"Which ratio would you use to find the value of \( p \) ? ![A right-angled triangle with the angle labelled 32 degrees, the side adjacent to this is 6cm and the side opposite is p.]()",Cos,B,809.0,Uses cos when tan is required
6256,649,311,Multiply a decimal by an integer,224,Multiplying and Dividing with Decimals,B,Tom and Katie are discussing multiplying decimals.\n\nTom says \( 5 \times 3.9=3 \times 5.9 \)\n\nKatie says \( 5 \times 3.9=3.9 \times 5 \)\n\nWho is correct?,Neither is correct,D,638.0,Believes multiplication is not commutative
912,912,100,Recognise and use efficient strategies for mental subtraction,203,Mental Addition and Subtraction,B,Nick wants to subtract \( 199 \) from a number.\nWhich one of the following methods would give him the correct answer?,\( +200 \) then \( +1 \),A,1338.0,Does not understand the effect of consecutive operations


In [98]:
# tqdm and `device` are bound before the branch because the prediction cell
# uses both; previously they existed only after the training branch had run,
# so a fresh-kernel run with local=True failed with NameError.
from tqdm import tqdm

device = 'cuda' if torch.cuda.is_available() else 'cpu'


def train_epoch(model, data_loader, optimizer, criterion, device):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Yields dict batches with 'input_ids', 'attention_mask' and 'labels'.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss callable applied to ``(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, mean_loss)``: correct predictions divided by
        ``len(data_loader.dataset)`` (a float64 tensor) and the mean of the
        per-batch losses.
    """
    model = model.train()
    losses = []
    correct_predictions = 0

    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = criterion(outputs, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
        # Clear gradients after the step so the next batch starts from zero.
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


def eval_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Takes the same batch format as ``train_epoch`` and returns the same
    ``(accuracy, mean_loss)`` tuple.
    """
    model = model.eval()
    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = criterion(outputs, labels)

            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


if local:
    # Local runs reuse a previously pickled model instead of training.
    model_path = '../models/bert-20241120.pkl'
    if log: print("(5) Loading model")
    # NOTE(review): pickle.load can execute arbitrary code from the file; only
    # load model files from a trusted source.
    with open(model_path, 'rb') as f:
        model = pickle.load(f)
    if log: print("(6) Finished loading model")
    model = model.to(device)

else:
    # Initialize the model
    model = BERTClassifier(n_classes=n_classes)
    model = model.to(device)

    # Define optimizer and loss function
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    criterion = nn.CrossEntropyLoss()

    if log: print("(5) Begin training")

    # Training the model
    EPOCHS = 3

    for epoch in range(EPOCHS):
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        train_acc, train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
        print(f'Train loss {train_loss} accuracy {train_acc}')

        val_acc, val_loss = eval_model(model, val_loader, criterion, device)
        print(f'Val   loss {val_loss} accuracy {val_acc}')
        print('-' * 10)

    if log: print("(6) Finish training")

(5) Loading model
(6) Finished loading model


In [99]:
# Inspect which columns the test set provides.
test.columns

Index(['QuestionId', 'ConstructId', 'ConstructName', 'SubjectId',
       'SubjectName', 'CorrectAnswer', 'QuestionText', 'AnswerAText',
       'AnswerBText', 'AnswerCText', 'AnswerDText'],
      dtype='object')

In [108]:
# Preview the first rows of the raw (wide-format) test set.
test.head()

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText
0,1869,856,Use the order of operations to carry out calculations involving powers,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets need to go to make the answer equal \( 13 \) ?,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets
1,1870,1612,Simplify an algebraic fraction by factorising the numerator,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \)",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify
2,1871,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,"Tom and Katie are discussing the \( 5 \) plants with these heights:\n\( 24 \mathrm{~cm}, 17 \mathrm{~cm}, 42 \mathrm{~cm}, 26 \mathrm{~cm}, 13 \mathrm{~cm} \)\nTom says if all the plants were cut in half, the range wouldn't change.\nKatie says if all the plants grew by \( 3 \mathrm{~cm} \) each, the range wouldn't change.\nWho do you agree with?",Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct


In [109]:
if log:
    print("(7) Melt test set")

# Reverse lookup: misconception name -> misconception id
name_to_id = (
    misconceptions
    .reset_index()
    .set_index('MisconceptionName')['MisconceptionId']
    .to_dict()
)

# One row per (question, answer option). QuestionText is kept as an id column so
# it survives the melt. The option letter is recovered from the original column
# name, QA_Id is "<QuestionId>_<option>", and correct answers are dropped.
test_melted = (
    test
    .melt(
        id_vars=['QuestionId', 'QuestionText', 'CorrectAnswer'],
        value_vars=['AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText'],
        var_name='AnswerOption',
        value_name='AnswerText',
    )
    .assign(AnswerOption=lambda frame: frame['AnswerOption'].str.replace('Answer', '').str.replace('Text', ''))
    .assign(QA_Id=lambda frame: frame['QuestionId'].astype(str) + '_' + frame['AnswerOption'])
    .loc[lambda frame: frame['CorrectAnswer'] != frame['AnswerOption']]
)

test_melted.head()


(7) Melt test set


Unnamed: 0,QuestionId,QuestionText,CorrectAnswer,AnswerOption,AnswerText,QA_Id
1,1870,"Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \)",D,A,\( m+1 \),1870_A
2,1871,"Tom and Katie are discussing the \( 5 \) plants with these heights:\n\( 24 \mathrm{~cm}, 17 \mathrm{~cm}, 42 \mathrm{~cm}, 26 \mathrm{~cm}, 13 \mathrm{~cm} \)\nTom says if all the plants were cut in half, the range wouldn't change.\nKatie says if all the plants grew by \( 3 \mathrm{~cm} \) each, the range wouldn't change.\nWho do you agree with?",B,A,Only\nTom,1871_A
3,1869,\[\n3 \times 2+4-5\n\]\nWhere do the brackets need to go to make the answer equal \( 13 \) ?,A,B,\( 3 \times 2+(4-5) \),1869_B
4,1870,"Simplify the following, if possible: \( \frac{m^{2}+2 m-3}{m-3} \)",D,B,\( m+2 \),1870_B
6,1869,\[\n3 \times 2+4-5\n\]\nWhere do the brackets need to go to make the answer equal \( 13 \) ?,A,C,\( 3 \times(2+4-5) \),1869_C


In [110]:
# Imported here so this cell does not depend on the training branch having run.
from tqdm import tqdm

# Upper bound on the number of misconception ids written per submission row.
MAX_PREDICTIONS = 25

test_dataset = MisconceptionDataset(
    texts=test_melted['QuestionText'].to_numpy(),
    # Placeholder labels: the dataset class requires labels, but they are never read here.
    labels=[0]*len(test_melted),
    tokenizer=tokenizer
)
# Default shuffle=False keeps predictions in the same order as test_melted's rows.
test_loader = DataLoader(test_dataset, batch_size=16)

if log: print("(8) Making predictions")

# Make predictions
model = model.eval()
# Take the device from the model itself rather than from a `device` variable
# that previously only existed after the training branch had run.
device = next(model.parameters()).device
predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs, dim=1)
        # topk raises if k exceeds the number of classes, so clamp it to the output width.
        k = min(MAX_PREDICTIONS, probs.shape[1])
        topk = probs.topk(k, dim=1)
        preds = topk.indices.cpu().numpy()
        predictions.extend(preds)

# Decode predictions

# Inverse transform to get MisconceptionName from class indices
predicted_misconceptions = [le.inverse_transform(pred) for pred in predictions]

# Map MisconceptionName to MisconceptionId using the name_to_id dictionary.
# Names missing from the dictionary fall back to id 0.
predicted_misconception_ids = [
    ' '.join(str(name_to_id.get(name, 0)) for name in preds) for preds in predicted_misconceptions
]


if log: print("(9) Creating submission file")

# Create submission DataFrame
submission = pd.DataFrame({
    'QuestionId_Answer': test_melted['QA_Id'],
    'MisconceptionId': predicted_misconception_ids
})

# Ensure that each 'MisconceptionId' has up to 25 MisconceptionIds
submission['MisconceptionId'] = submission['MisconceptionId'].apply(lambda x: ' '.join(x.split()[:MAX_PREDICTIONS]))

# Save to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")

(8) Making predictions


Testing: 100%|██████████| 1/1 [00:00<00:00,  1.58it/s]

(9) Creating submission file
Submission file created successfully.



