In [1]:
import torch

# Check if GPU is available
if torch.cuda.is_available():
    print(f"GPU is available. Device name: {torch.cuda.get_device_name(0)}")
else:
    print("GPU is not available, using CPU.")


GPU is available. Device name: Tesla T4


In [2]:
!pip install kaggle



In [3]:
# from google.colab import drive
# drive.mount('/content/drive')


In [8]:
!mkdir -p ~/.kaggle
!mv /content/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [9]:
!kaggle competitions download -c eedi-mining-misconceptions-in-mathematics

Downloading eedi-mining-misconceptions-in-mathematics.zip to /content
100% 260k/260k [00:00<00:00, 459kB/s]
100% 260k/260k [00:00<00:00, 458kB/s]


In [10]:
!unzip /content/eedi-mining-misconceptions-in-mathematics.zip -d /content/data

Archive:  /content/eedi-mining-misconceptions-in-mathematics.zip
  inflating: /content/data/misconception_mapping.csv  
  inflating: /content/data/sample_submission.csv  
  inflating: /content/data/test.csv  
  inflating: /content/data/train.csv  


In [11]:
# Step 1: Checking for missing values in the dataset
import pandas as pd
train_df = pd.read_csv('/content/data/train.csv')
print(len(train_df))
print(train_df.head())
missing_values = train_df.isnull().sum()

# Step 2: Distribution of correct answers
correct_answer_distribution = train_df['CorrectAnswer'].value_counts()

# Step 3: Distribution of misconceptions
misconception_columns = ['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']
misconception_distribution = train_df[misconception_columns].melt(value_name='MisconceptionId').dropna()['MisconceptionId'].value_counts()

# Step 4: Distribution of constructs and subjects
construct_distribution = train_df['ConstructName'].value_counts().head(10)  # Top 10 most frequent constructs
subject_distribution = train_df['SubjectName'].value_counts().head(10)  # Top 10 most frequent subjects

# Step 5: Analyzing the question text length
train_df['QuestionLength'] = train_df['QuestionText'].apply(lambda x: len(x.split()))
question_length_distribution = train_df['QuestionLength'].describe()


# Display the insights
{
    "Missing Values": missing_values,
    "Correct Answer Distribution": correct_answer_distribution,
    "Misconception Distribution": misconception_distribution.head(10),  # Display top 10
    "Top 10 Constructs": construct_distribution,
    "Top 10 Subjects": subject_distribution,
    "Question Text Length Distribution": question_length_distribution
}


1869
   QuestionId  ConstructId                                      ConstructName  \
0           0          856  Use the order of operations to carry out calcu...   
1           1         1612  Simplify an algebraic fraction by factorising ...   
2           2         2774            Calculate the range from a list of data   
3           3         2377  Recall and use the intersecting diagonals prop...   
4           4         3387  Substitute positive integer values into formul...   

   SubjectId                                        SubjectName CorrectAnswer  \
0         33                                             BIDMAS             A   
1       1077                    Simplifying Algebraic Fractions             D   
2        339  Range and Interquartile Range from a List of Data             B   
3         88                       Properties of Quadrilaterals             C   
4         67                          Substitution into Formula             A   

                     

{'Missing Values': QuestionId            0
 ConstructId           0
 ConstructName         0
 SubjectId             0
 SubjectName           0
 CorrectAnswer         0
 QuestionText          0
 AnswerAText           0
 AnswerBText           0
 AnswerCText           0
 AnswerDText           0
 MisconceptionAId    734
 MisconceptionBId    751
 MisconceptionCId    789
 MisconceptionDId    832
 dtype: int64,
 'Correct Answer Distribution': CorrectAnswer
 C    488
 A    482
 B    461
 D    438
 Name: count, dtype: int64,
 'Misconception Distribution': MisconceptionId
 1214.0    54
 1379.0    43
 2316.0    38
 1507.0    36
 1990.0    33
 1880.0    32
 1597.0    27
 2392.0    27
 1248.0    22
 77.0      22
 Name: count, dtype: int64,
 'Top 10 Constructs': ConstructName
 Calculate the square of a number                                                                                               14
 Solve two-step linear equations, with the variable on one side, with all positive integers     

<h3> EDA </h3>

In [12]:
# import pandas as pd

# questions_df = pd.read_csv('/content/data/train.csv')

# misconceptions_df = pd.read_csv('/content/data/misconception_mapping.csv')

import pandas as pd
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import torch

questions_df = pd.read_csv('/content/data/train.csv')  # First dataset: Misconception IDs and Names
misconceptions_df = pd.read_csv('/content/data/misconception_mapping.csv')  # Second dataset: Question details with misconception IDs



In [13]:
answer_to_misconception_map = {
    'AnswerAText': 'MisconceptionAId',
    'AnswerBText': 'MisconceptionBId',
    'AnswerCText': 'MisconceptionCId',
    'AnswerDText': 'MisconceptionDId'
}

# Step 2: Melt the answer text columns into a long format
answer_melted_df = pd.melt(
    questions_df,
    id_vars=['QuestionId', 'ConstructId', 'ConstructName', 'SubjectId', 'SubjectName', 'CorrectAnswer', 'QuestionText'],  # Columns to keep as-is
    value_vars=list(answer_to_misconception_map.keys()),  # Answer text columns to melt
    var_name='Option',  # This new column will indicate AnswerAText, AnswerBText, etc.
    value_name='OptionText'  # This new column will contain the answer text
)

# Step 3: Map each Answer column to its corresponding Misconception column
answer_melted_df['MisconceptionColumn'] = answer_melted_df['Option'].map(answer_to_misconception_map)

# Step 4: Fetch the corresponding Misconception IDs using the mapped column names
for idx, row in answer_melted_df.iterrows():
    misconception_col = row['MisconceptionColumn']
    answer_melted_df.at[idx, 'MisconceptionId'] = questions_df.loc[row['QuestionId'], misconception_col]

answer_melted_df

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,Option,OptionText,MisconceptionColumn,MisconceptionId
0,0,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,AnswerAText,\( 3 \times(2+4)-5 \),MisconceptionAId,
1,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",AnswerAText,\( m+1 \),MisconceptionAId,2142.0
2,2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,AnswerAText,Only\nTom,MisconceptionAId,1287.0
3,3,2377,Recall and use the intersecting diagonals prop...,88,Properties of Quadrilaterals,C,The angles highlighted on this rectangle with ...,AnswerAText,acute,MisconceptionAId,1180.0
4,4,3387,Substitute positive integer values into formul...,67,Substitution into Formula,A,The equation \( f=3 r^{2}+3 \) is used to find...,AnswerAText,\( 30 \),MisconceptionAId,
...,...,...,...,...,...,...,...,...,...,...,...
7471,1864,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,C,What is the range of the following numbers?\n\...,AnswerDText,\( 16 \),MisconceptionDId,1349.0
7472,1865,2695,"Describe an enlargement, with no centre of enl...",90,Length Scale Factors in Similar Shapes,B,Shape \( Q \) is an enlargement of shape \( P ...,AnswerDText,\( 11-3 \),MisconceptionDId,1258.0
7473,1866,854,Use the order of operations to carry out calcu...,33,BIDMAS,B,What does the following equal?\n\[\n8-7+10 \ti...,AnswerDText,\( 33 \),MisconceptionDId,1507.0
7474,1867,2634,Distinguish between congruency and similarity,274,Congruency in Other Shapes,B,Tom and Katie are discussing congruence and si...,AnswerDText,Neither is correct,MisconceptionDId,2312.0


In [14]:
final_merged_df = pd.merge(
    answer_melted_df,  # The formatted data containing questions and answers
    misconceptions_df,  # The misconception descriptions
    how='left',        # Use a left join to keep all question/answer rows
    on='MisconceptionId'  # Merge based on the 'MisconceptionId' column
)

# Treat -1 as a valid label, assign it to a new class (e.g., num_labels)
max_label = final_merged_df['MisconceptionId'].max()  # Get the current max label
final_merged_df['MisconceptionId'] = final_merged_df['MisconceptionId'].fillna(max_label + 1).astype(int)
final_merged_df['MisconceptionName'] = final_merged_df['MisconceptionName'].fillna("No Misconception Available")
final_merged_df['MisconceptionId'].nunique()

1605

In [15]:
train_df, val_df = train_test_split(final_merged_df, test_size=0.2, random_state=42)

# Step 4: Initialize the tokenizer for BERT
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')  # Use 'math-bertha' if desired


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [16]:
def tokenize_data(data):
    return tokenizer(data['OptionText'].tolist(), padding=True, truncation=True, max_length=128, return_tensors='pt')

train_encodings = tokenize_data(train_df)
val_encodings = tokenize_data(val_df)

# Convert labels to tensors
train_labels = torch.tensor(train_df['MisconceptionId'].values)
val_labels = torch.tensor(val_df['MisconceptionId'].values)

In [17]:
# Create a Dataset class
class MisconceptionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

# Create Dataset objects for training and validation sets
train_dataset = MisconceptionDataset(train_encodings, train_labels)
val_dataset = MisconceptionDataset(val_encodings, val_labels)

In [21]:
final_merged_df['MisconceptionId'].nunique()

1605

In [22]:
# Load a pretrained BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1605)

# Define training arguments, disabling any external logging (like wandb)
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=8,   # Batch size for training
    per_device_eval_batch_size=16,   # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    logging_strategy='steps',        # Log every few steps
    logging_steps=100,               # Log progress every 100 steps
    report_to=None,                  # Disable logging to external platforms like wandb
    evaluation_strategy='epoch',     # Evaluate every epoch
    save_strategy='epoch',           # Save the model every epoch
    load_best_model_at_end=True      # Load the best model when finished
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

for batch in train_dataset:
    print(batch)
    break


# Train the model (this will display a progress bar using tqdm)
trainer.train()

# Evaluate the model
trainer.evaluate()

# Save the trained model and tokenizer
model.save_pretrained('./misconception_model')
tokenizer.save_pretrained('./misconception_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [19]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'