# BERT as a baseline

Uses BERT to classify misconceptions.

Key limitations:
- Doing classification, but the dataset has over 2500+ classes. Several classes have only one example, and some data points have no target class. 

In [17]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

import os
import sys
sys.path.append(os.path.abspath('..'))
import importlib
import pickle

In [2]:
import utils.text
import utils.api

importlib.reload(utils.text)
importlib.reload(utils.api)

from utils.text import match_misconception, clean_string
from utils.api import get_completion

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [4]:
misconceptions = pd.read_csv('../eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv', index_col='MisconceptionId')
train = pd.read_csv('../eedi-mining-misconceptions-in-mathematics/train.csv')
train_melted = pd.read_csv('../train_melted.csv')
test = pd.read_csv('../eedi-mining-misconceptions-in-mathematics/test.csv')

In [5]:
misconception_list = list(misconceptions['MisconceptionName'])
misconception_list

['Does not know that angles in a triangle sum to 180 degrees',
 'Uses dividing fractions method for multiplying fractions',
 'Believes there are 100 degrees in a full turn',
 'Thinks a quadratic without a non variable term, can not be factorised',
 'Believes addition of terms and powers of terms are equivalent e.g. a + c = a^c',
 'When measuring a reflex angle, gives the acute or obtuse angle that sums to 360 instead',
 'Can identify the multiplier used to form an equivalent fraction but does not apply to the numerator',
 'Believes gradient = change in y',
 'Student thinks that any two angles along a straight line are equal',
 'Thinks there are 180 degrees in a full turn',
 'Believes duration can be read from a timetable, rather than calculating from start and end times',
 'When reading value from graph, reads from the wrong axes.',
 'Thinks that the side view does not include the furthest rows back',
 'Does not subtract from the hours, when having to borrow for a time calculation',
 '

In [6]:
print("Shape before:", train_melted.shape)

# Remove rows with missing MisconceptionName, remove misconceptinos that only appear once
train_melted = train_melted.dropna(subset=['MisconceptionName'])
train_melted = train_melted.groupby('MisconceptionId').filter(lambda x: len(x) > 1)

print("Shape after:", train_melted.shape)

Shape before: (7476, 11)
Shape after: (3623, 11)


In [7]:
# Split into training and validation sets
train, valid = train_test_split(train_melted, test_size=0.25, random_state=123, stratify=train_melted['MisconceptionName'])

In [8]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_data(text, max_length=128):
    return tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# Example tokenization
sample_text = train['QuestionText'].iloc[0]
tokenized_sample = tokenize_data(sample_text)
tokenized_sample

{'input_ids': tensor([[ 101, 2054, 2003, 1996, 2675, 7117, 1997, 7032, 1029,  102,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [10]:
import torch
from torch import nn
from transformers import BertModel

class BERTClassifier(nn.Module):
    def __init__(self, n_classes):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(p=0.3)
        self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        pooled_output = outputs[1]  # [CLS] token
        output = self.dropout(pooled_output)
        return self.out(output)

In [11]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW
from sklearn.preprocessing import LabelEncoder

# Encode labels
le = LabelEncoder()
train_labels = le.fit_transform(train['MisconceptionName'])
val_labels = le.transform(valid['MisconceptionName'])
n_classes = len(le.classes_)

# Create a custom dataset
class MisconceptionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        encoding = self.tokenizer.encode_plus(
            self.texts[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Create datasets and dataloaders
train_dataset = MisconceptionDataset(
    texts=train['QuestionText'].to_numpy(),
    labels=train_labels,
    tokenizer=tokenizer
)

val_dataset = MisconceptionDataset(
    texts=valid['QuestionText'].to_numpy(),
    labels=val_labels,
    tokenizer=tokenizer
)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16)

In [13]:
# Initialize the model
model = BERTClassifier(n_classes=n_classes)
model = model.to('cuda' if torch.cuda.is_available() else 'cpu')

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()

# Training loop
from tqdm import tqdm

def train_epoch(model, data_loader, optimizer, criterion, device):
    model = model.train()
    losses = []
    correct_predictions = 0
    
    for batch in tqdm(data_loader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs, dim=1)
        loss = criterion(outputs, labels)
        
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
    
    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Validation loop
def eval_model(model, data_loader, criterion, device):
    model = model.eval()
    losses = []
    correct_predictions = 0
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs, dim=1)
            loss = criterion(outputs, labels)
            
            correct_predictions += torch.sum(preds == labels)
            losses.append(loss.item())
    
    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Training the model
EPOCHS = 3
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    train_acc, train_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    print(f'Train loss {train_loss} accuracy {train_acc}')
    
    val_acc, val_loss = eval_model(model, val_loader, criterion, device)
    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print('-' * 10)

Epoch 1/3


Training: 100%|██████████| 170/170 [07:07<00:00,  2.52s/it]


Train loss 6.762886454077328 accuracy 0.003312476996687523


Validation: 100%|██████████| 57/57 [00:26<00:00,  2.18it/s]


Val   loss 6.625169034589801 accuracy 0.026490066225165563
----------
Epoch 2/3


Training: 100%|██████████| 170/170 [06:30<00:00,  2.30s/it]


Train loss 6.556825099271887 accuracy 0.022451232977548766


Validation: 100%|██████████| 57/57 [00:25<00:00,  2.24it/s]


Val   loss 6.401228804337351 accuracy 0.04856512141280353
----------
Epoch 3/3


Training: 100%|██████████| 170/170 [05:20<00:00,  1.89s/it]


Train loss 6.312380092284259 accuracy 0.04637467795362532


Validation: 100%|██████████| 57/57 [00:27<00:00,  2.11it/s]

Val   loss 6.180494425589578 accuracy 0.06181015452538632
----------





In [19]:
save_path = '../models/bert-2024_11_20.pkl'
os.makedirs('models', exist_ok=True)  
with open(save_path, 'wb') as f:
    pickle.dump(model, f)

print(f"Model saved to {save_path}")

Model saved to ../models/bert-2024_11_20.pkl


In [23]:
test.columns

Index(['QuestionId', 'ConstructId', 'ConstructName', 'SubjectId',
       'SubjectName', 'CorrectAnswer', 'QuestionText', 'AnswerAText',
       'AnswerBText', 'AnswerCText', 'AnswerDText'],
      dtype='object')

In [26]:
# Update the submission creation process to handle multiple answer choices (A, B, C, D)

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Load the misconception mapping to create a name-to-ID mapping
misconception_mapping_path = '../eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv'
misconceptions = pd.read_csv(misconception_mapping_path, index_col='MisconceptionId')

# Create a dictionary to map MisconceptionName to MisconceptionId
name_to_id = misconceptions.reset_index().set_index('MisconceptionName')['MisconceptionId'].to_dict()

# Verify the columns in the test DataFrame
print("Test DataFrame Columns:", test.columns.tolist())

# Ensure that your test DataFrame has a unique identifier for each question
# For this example, we'll assume there's a 'QuestionId_Answer' column in the test DataFrame

# If 'QuestionId_Answer' does not exist, replace it with the appropriate identifier from your test set
if 'QuestionId_Answer' not in test.columns:
    # Example: Create a unique identifier by combining 'QuestionId' and 'AnswerOption' columns
    # Replace 'Answer' with the actual column names in your test DataFrame
    test['QuestionId_Answer'] = test.apply(
        lambda row: f"{row['QuestionId']}_{row['CorrectAnswer']}", axis=1
    )

# Reshape the test DataFrame to have one row per QuestionId_Answer (A, B, C, D)
# Include 'QuestionText' in id_vars to preserve it in the melted DataFrame
test_long = test.melt(
    id_vars=['QuestionId', 'QuestionText'],
    value_vars=['AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText'],
    var_name='AnswerOption',
    value_name='AnswerText'
)

# Clean the 'AnswerOption' column to obtain A, B, C, D
test_long['AnswerOption'] = test_long['AnswerOption'].str.replace('Answer', '').str.replace('Text', '')

# Create 'QuestionId_Answer' column by combining 'QuestionId' and 'AnswerOption'
test_long['QuestionId_Answer'] = test_long['QuestionId'].astype(str) + '_' + test_long['AnswerOption']

# Verify the reshaped DataFrame
print("Reshaped Test DataFrame Head:")
print(test_long.head())

# Tokenize the combined 'QuestionText' and 'AnswerText'
def tokenize_question_answer(question, answer, tokenizer, max_length=128):
    combined_text = question + " " + answer
    return tokenizer.encode_plus(
        combined_text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

# Update the MisconceptionDataset to handle question and answer
class MisconceptionDataset(Dataset):
    def __init__(self, questions, answers, labels, tokenizer, max_len=128):
        self.questions = questions
        self.answers = answers
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.questions)
    
    def __getitem__(self, idx):
        encoding = tokenize_question_answer(
            self.questions[idx],
            self.answers[idx],
            self.tokenizer,
            self.max_len
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # For multi-label classification, labels should be multi-hot vectors
            'labels': self.labels[idx]
        }

# Since the submission requires up to 25 MisconceptionIds, consider modifying the model for multi-label
# Here, we'll proceed with single-label for simplicity and adjust the submission format

# Create a custom dataset for test
test_dataset = MisconceptionDataset(
    questions=test_long['QuestionText'].to_numpy(),
    answers=test_long['AnswerText'].to_numpy(),
    labels=[0]*len(test_long),  # Placeholder labels
    tokenizer=tokenizer
)

test_loader = DataLoader(test_dataset, batch_size=16)

# Make predictions
model = model.eval()
predictions = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        probs = torch.softmax(outputs, dim=1)
        # Get top 25 predictions
        topk = probs.topk(25, dim=1)
        preds = topk.indices.cpu().numpy()
        predictions.extend(preds)

# Decode predictions
# Inverse transform to get MisconceptionName from class indices
predicted_misconceptions = [le.inverse_transform(pred) for pred in predictions]

# Map MisconceptionName to MisconceptionId using the name_to_id dictionary
predicted_misconception_ids = [
    ' '.join(str(name_to_id.get(name, 0)) for name in preds) for preds in predicted_misconceptions
]

# Create submission DataFrame
submission = pd.DataFrame({
    'QuestionId_Answer': test_long['QuestionId_Answer'],
    'MisconceptionId': predicted_misconception_ids
})

# Ensure that each 'MisconceptionId' has up to 25 MisconceptionIds
submission['MisconceptionId'] = submission['MisconceptionId'].apply(lambda x: ' '.join(x.split()[:25]))

# Save to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully.")

Test DataFrame Columns: ['QuestionId', 'ConstructId', 'ConstructName', 'SubjectId', 'SubjectName', 'CorrectAnswer', 'QuestionText', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText', 'QuestionId_Answer']
Reshaped Test DataFrame Head:
   QuestionId  \
0        1869   
1        1870   
2        1871   
3        1869   
4        1870   

                                                                                                                                                                                                                                                                                                                                                  QuestionText  \
0                                                                                                                                                                                                                                                                 \[\n3 \times 2+4-5\n\]\nWhere do the brack

Testing: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]

Submission file created successfully.



