In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk /kaggle/input and print the full path of every file found.
# The second item yielded by os.walk (the sub-directory names) is not needed,
# hence the throw-away name `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import torch
import numpy as np
import json
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix

# Locations of the train and test splits (JSON files)
train_path = "/kaggle/input/nlp-math-question-classification-dataset/train.json"
test_path = "/kaggle/input/nlp-math-question-classification-dataset/test.json"

# Chapter names listed in class-id order: the position of a name in this
# tuple is the integer label assigned to it.
chapter_names = (
    "Computations with Matrices",
    "Determinants",
    "Eigenvalues and Eigenvectors",
    "Linear Programming and Game Theory",
    "Matrices and Gaussian Elimination",
    "Orthogonality",
    "Positive Definite Matrices",
    "Vector Spaces",
)

# Chapter name -> integer class id
label_mapping = {chapter: class_id for class_id, chapter in enumerate(chapter_names)}

# Load train and test JSON
def load_data(json_path, label_map=None):
    """Load question texts and integer class labels from a JSON file.

    The file must hold either a list of records, or a dictionary that wraps
    such a list under one of the keys "data", "questions" or "examples" (the
    first of these keys whose value is a list is used). Every record must be
    a dictionary with a "question_latex" entry (the text) and a "chapter"
    entry (the class name).

    Args:
        json_path: Path of the UTF-8 encoded JSON file to read.
        label_map: Optional mapping from chapter name to integer label. When
            omitted, the module-level ``label_mapping`` is used; it is looked
            up at call time.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists: the
        "question_latex" value of every record and the label its "chapter"
        maps to, in file order.

    Raises:
        ValueError: If the top-level JSON value is neither a list nor a
            dictionary wrapping one, if a record is not a dictionary, or if a
            record's chapter is missing from the label map.
        KeyError: If a record lacks "question_latex" or "chapter".
    """
    # Resolve the default lazily so existing one-argument calls keep using
    # whatever module-level mapping is defined when the function runs.
    if label_map is None:
        label_map = label_mapping

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if isinstance(data, dict):
        # Unwrap {"data": [...]}-style files: take the first known key that
        # actually holds a list.
        for key in ("data", "questions", "examples"):
            if isinstance(data.get(key), list):
                data = data[key]
                break
        else:
            raise ValueError(f"Expected list in {json_path}, but got dictionary without a valid key.")

    if not isinstance(data, list):
        raise ValueError(f"Expected a list of dictionaries in {json_path}, but got {type(data)}.")

    texts, labels = [], []

    for item in data:
        if not isinstance(item, dict):
            raise ValueError(f"Invalid item in JSON: {item}. Expected a dictionary.")
        if "question_latex" not in item or "chapter" not in item:
            raise KeyError(f"Missing required keys in JSON item: {item}")

        chapter = item["chapter"]
        if chapter not in label_map:
            raise ValueError(f"Unknown chapter '{chapter}' in JSON file.")

        texts.append(item["question_latex"])
        labels.append(label_map[chapter])  # Map chapter to label

    return texts, labels

# Seed the random number generators before the model is created. The
# classification head is not part of the SciBERT checkpoint and is randomly
# initialised, so without a fixed seed every run starts from different weights.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Read both splits as parallel lists of question texts and integer labels.
train_texts, train_labels = load_data(train_path)
test_texts, test_labels = load_data(test_path)

# Load SciBERT tokenizer and model
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store the chapter names in the model config so the saved model reports
# readable class names instead of generic LABEL_<n> placeholders.
id2label = {class_id: chapter for chapter, class_id in label_mapping.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_mapping),
    id2label=id2label,
    label2id=dict(label_mapping),
    ignore_mismatched_sizes=True,
)

# Tokenize datasets
def tokenize_function(texts):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Truncation and padding are both switched on and ``max_length`` is set to
    512. Whatever the tokenizer returns is handed back unchanged.
    """
    encode_options = {"truncation": True, "padding": True, "max_length": 512}
    return tokenizer(texts, **encode_options)

def _as_dataset(encodings, labels):
    """Bundle token ids, attention masks and labels into one ``Dataset``."""
    columns = {
        "input_ids": encodings["input_ids"],
        "attention_mask": encodings["attention_mask"],
        "label": labels,
    }
    return Dataset.from_dict(columns)


# Tokenize each split, then wrap it together with its labels.
train_encodings = tokenize_function(train_texts)
test_encodings = tokenize_function(test_texts)

train_dataset = _as_dataset(train_encodings, train_labels)
test_dataset = _as_dataset(test_encodings, test_labels)

# Define compute_metrics function
def compute_metrics(eval_pred):
    """Compute classification metrics from logits and reference labels.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``. ``logits``
            is reduced with ``argmax`` along axis 1 to obtain the predicted
            class id of every example.

    Returns:
        A dictionary with the keys "accuracy", "precision", "recall" and
        "f1". Precision, recall and F1 are weighted averages over the classes.

    Side effects:
        Prints the confusion matrix. It always has one row and one column per
        entry of the module-level ``label_mapping``, ordered by class id.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    acc = accuracy_score(labels, predictions)
    # zero_division=0 yields the same values as the default, but without an
    # UndefinedMetricWarning when some class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )

    # Pin the label set so the matrix keeps the same shape and row order even
    # if a class is absent from both the references and the predictions.
    class_ids = sorted(label_mapping.values())
    conf_matrix = confusion_matrix(labels, predictions, labels=class_ids)

    print("\nConfusion Matrix:")
    print(conf_matrix)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

# Training configuration, collected in one place before it is handed to
# TrainingArguments.
training_options = {
    "output_dir": "./scibert_math_classification",
    "evaluation_strategy": "epoch",
    "save_strategy": "epoch",
    "num_train_epochs": 5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "warmup_steps": 500,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "logging_steps": 10,
    "report_to": "none",  # Disables WandB logging if not used
}
training_args = TrainingArguments(**training_options)

# Trainer wiring: the model is fitted on the train split and scored on the
# test split with compute_metrics.
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Fine-tune the model
trainer.train()

# Write the fine-tuned weights and the tokenizer files to the same folder
export_dir = "./scibert_math_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

# Run a final evaluation pass and show the resulting metrics
results = trainer.evaluate()
print("\nEvaluation Metrics:", results, sep="\n")


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.4297,1.316437,0.577882,0.586615,0.577882,0.572466
2,0.5952,0.623128,0.799065,0.80366,0.799065,0.799363
3,0.4412,0.52829,0.839564,0.857257,0.839564,0.841057
4,0.2409,0.466437,0.870717,0.876453,0.870717,0.871273
5,0.1275,0.482701,0.88162,0.883025,0.88162,0.881477



Confusion Matrix:
[[45  2  5  1  1  2  9  0]
 [ 2 32 11  0 12  3  0 14]
 [ 2  4 51  1  8 10  3 12]
 [ 0  0  2 59  0  1  1  2]
 [ 0  4 18  1 31  9  0 33]
 [ 5  0  4  1  3 39  6 13]
 [21  1  2  3  0  7 37  0]
 [ 0  0 10  0 17  2  3 77]]





Confusion Matrix:
[[45  0  7  1  1  5  6  0]
 [ 0 69  0  0  5  0  0  0]
 [ 1  5 65  0 13  2  1  4]
 [ 0  0  1 62  1  0  0  1]
 [ 0  1 12  0 66  4  1 12]
 [ 2  1  2  1  4 53  2  6]
 [ 1  0  5  0  0  2 63  0]
 [ 0  1  6  0  9  1  2 90]]





Confusion Matrix:
[[52  1  4  0  1  4  2  1]
 [ 0 69  0  0  5  0  0  0]
 [ 1  5 60  0 18  3  1  3]
 [ 0  0  0 64  1  0  0  0]
 [ 0  1  3  0 88  2  0  2]
 [ 1  1  1  3  5 56  0  4]
 [ 1  0  1  2  2  1 64  0]
 [ 0  1  1  0 20  1  0 86]]





Confusion Matrix:
[[59  0  3  0  0  1  2  0]
 [ 0 69  0  0  5  0  0  0]
 [ 3  0 83  0  3  2  0  0]
 [ 0  0  0 64  1  0  0  0]
 [ 1  0 12  0 76  3  0  4]
 [ 6  1  2  1  5 52  1  3]
 [ 2  0  2  0  1  0 66  0]
 [ 0  1  6  0 10  2  0 90]]





Confusion Matrix:
[[64  0  0  0  0  1  0  0]
 [ 0 69  0  0  5  0  0  0]
 [ 4  0 79  0  3  4  1  0]
 [ 0  0  1 64  0  0  0  0]
 [ 1  2  9  0 74  4  1  5]
 [ 2  1  1  1  4 58  2  2]
 [ 1  0  3  0  0  0 67  0]
 [ 0  1  5  0  9  3  0 91]]





Confusion Matrix:
[[64  0  0  0  0  1  0  0]
 [ 0 69  0  0  5  0  0  0]
 [ 4  0 79  0  3  4  1  0]
 [ 0  0  1 64  0  0  0  0]
 [ 1  2  9  0 74  4  1  5]
 [ 2  1  1  1  4 58  2  2]
 [ 1  0  3  0  0  0 67  0]
 [ 0  1  5  0  9  3  0 91]]

Evaluation Metrics:
{'eval_loss': 0.4827011823654175, 'eval_accuracy': 0.881619937694704, 'eval_precision': 0.8830248140586462, 'eval_recall': 0.881619937694704, 'eval_f1': 0.881476510438396, 'eval_runtime': 7.6514, 'eval_samples_per_second': 83.906, 'eval_steps_per_second': 5.359, 'epoch': 5.0}
