# Overview

- [Infer Notebook](https://www.kaggle.com/code/sinchir0/fine-tuning-bge-infer/notebook)

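- Retrieve the top 25 candidate misconceptions for each question–answer pair with the pretrained embedding model (`bge-large-en-v1.5` in the original notebook; `MODEL_NAME` below is currently set to `BAAI/bge-small-en-v1.5`)
- Fine-tune that same embedding model on the retrieved candidates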
  - `anchor`: `ConstructName` + `SubjectName` + `QuestionText` + `Answer[A-D]Text`
  - `positive`: Correct MisconceptionName
  - `negative`: Wrong MisconceptionName (a retrieved candidate that differs from the correct misconception)

ref: https://sbert.net/docs/sentence_transformer/training_overview.html#trainer

In [None]:
%pip install -qq datasets==3.0.0
%pip install -qq sentence_transformers==3.1.0

In [None]:
import os
import numpy as np

from datasets import load_dataset, Dataset

import wandb
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers import (
    SentenceTransformer,
    SentenceTransformerTrainer,
    SentenceTransformerTrainingArguments,
)
from sentence_transformers.training_args import BatchSamplers
from sentence_transformers.evaluation import TripletEvaluator

import datasets
import sentence_transformers

# Setting

In [None]:
# Experiment identity and filesystem locations.
EXP_NAME = "fine-tuning-bge"
DATA_PATH = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
MODEL_NAME = "BAAI/bge-small-en-v1.5"
COMPETITION_NAME = "eedi-mining-misconceptions-in-mathematics"
OUTPUT_PATH = "."
MODEL_OUTPUT_PATH = f"{OUTPUT_PATH}/trained_model"
# Worker count handed to `datasets` multiprocessing calls.
NUM_PROC = os.cpu_count()

# Per-device batch size and the effective batch size reached through gradient
# accumulation. The accumulation step count is derived from these two values
# so that changing the batch size can no longer leave it stale.
BATCH_SIZE = 16
EFFECTIVE_BATCH_SIZE = 128

config = {
    'retrieve_num': 25,  # number of candidate misconceptions kept per row
    'epochs': 10,
    'lr': 2e-5,
    'bs': BATCH_SIZE,
    # Floor at 1 so a batch size above the effective size never yields 0 steps.
    'grad_acc_step': max(1, EFFECTIVE_BATCH_SIZE // BATCH_SIZE),
    'train': True,
    'debug': False,
    'wandb': False
}

# Earlier settings, kept for reference:
#   retrieve_num=25, epochs=2, lr=2e-05, bs=8, grad_acc_step=128 // bs,
#   train=True, debug=False, wandb=True

# WANDB

In [None]:
# Log in to Weights & Biases and start a run only when tracking is enabled.
# The API key is read from the Kaggle secret named "wandb_api"
# (add it under Settings -> add wandb api).
if config['wandb']:
    from kaggle_secrets import UserSecretsClient

    user_secrets = UserSecretsClient()
    wandb_api_key = user_secrets.get_secret("wandb_api")
    wandb.login(key=wandb_api_key)
    wandb.init(project=COMPETITION_NAME, name=EXP_NAME)

# Reporting target later passed to the training arguments.
REPORT_TO = "wandb" if config['wandb'] else "none"

REPORT_TO

# Data Load

In [None]:
# Load the two competition tables from DATA_PATH: the misconception lookup
# table and the training questions.
misconception_mapping = pd.read_csv(f"{DATA_PATH}/misconception_mapping.csv")
train = pd.read_csv(f"{DATA_PATH}/train.csv")

In [None]:
import pandas as pd

# Question-level columns that are repeated on every answer row after the unpivot.
common_col = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "QuestionText",
    "CorrectAnswer",
]
answer_text_cols = [f"Answer{alpha}Text" for alpha in "ABCD"]

# Keep the question metadata plus the four answer-text columns.
train_selected = train[common_col + answer_text_cols]

# Wide -> long (one row per question/answer option), then derive:
#   AllText           - construct, subject, question and answer text joined by spaces
#   AnswerAlphabet    - the option letter parsed from the original column name
#   QuestionId_Answer - "<QuestionId>_<letter>" key for the row
train_melted = train_selected.melt(
    id_vars=common_col,
    var_name="AnswerType",
    value_name="AnswerText",
).assign(
    AllText=lambda df: (
        df["ConstructName"] + " " +
        df["SubjectName"] + " " +
        df["QuestionText"] + " " +
        df["AnswerText"]
    ),
    AnswerAlphabet=lambda df: df["AnswerType"].str.extract(r"Answer([A-D])Text$")[0],
    QuestionId_Answer=lambda df: df["QuestionId"].astype(str) + "_" + df["AnswerAlphabet"],
)

# Order rows by the (string) question/answer key.
train_long = train_melted.sort_values(by="QuestionId_Answer")

# Preview the long-format table.
train_long.head()

In [None]:
# Names of the four per-answer misconception id columns.
misconception_cols = [f"Misconception{alpha}Id" for alpha in "ABCD"]

# Wide -> long: one row per (question, answer option) holding that option's misconception id.
misconception_melted = train[common_col + misconception_cols].melt(
    id_vars=common_col,
    var_name="MisconceptionType",
    value_name="MisconceptionId",
)

# Parse the option letter from the column name and build the same
# "<QuestionId>_<letter>" key used for the answer-text table.
misconception_melted["AnswerAlphabet"] = (
    misconception_melted["MisconceptionType"].str.extract(r"Misconception([A-D])Id$")[0]
)
misconception_melted["QuestionId_Answer"] = (
    misconception_melted["QuestionId"].astype(str) + "_" + misconception_melted["AnswerAlphabet"]
)

# Keep only the key and the id; Int64 is pandas' nullable integer dtype, so
# missing ids stay missing instead of forcing a float column.
train_misconception_long = (
    misconception_melted
    .sort_values("QuestionId_Answer")[["QuestionId_Answer", "MisconceptionId"]]
    .astype({"MisconceptionId": "Int64"})
)

train_misconception_long.head()

In [None]:
# Attach each answer option's misconception id to its text row (left join on
# the "<QuestionId>_<letter>" key).
train_long = pd.merge(
    train_long,
    train_misconception_long,
    how="left",
    on="QuestionId_Answer",
)
train_long.head()

# Make retrieval data

In [None]:
# Embed the question/answer texts and the misconception names with the
# pretrained model; both sets of embeddings are L2-normalized.
model = SentenceTransformer(MODEL_NAME)

question_texts = train_long["AllText"].to_list()
misconception_names = misconception_mapping["MisconceptionName"].to_list()

train_long_vec = model.encode(question_texts, normalize_embeddings=True)
misconception_mapping_vec = model.encode(misconception_names, normalize_embeddings=True)

print(train_long_vec.shape)
print(misconception_mapping_vec.shape)

In [None]:
# Similarity of every question/answer text (rows) to every misconception
# (columns, in the row order of `misconception_mapping`).
train_cos_sim_arr = cosine_similarity(train_long_vec, misconception_mapping_vec)
# argsort only sorts ascending, so sort the negated scores to list the most
# similar misconception first in each row.
train_sorted_indices = (-train_cos_sim_arr).argsort(axis=1)

In [None]:
# Keep the `config['retrieve_num']` most similar misconceptions for each row.
#
# `train_sorted_indices` holds *row positions* of `misconception_mapping`
# (the column order of the similarity matrix), not MisconceptionIds. Translate
# positions to ids explicitly so the later joins on `PredictMisconceptionId`
# stay correct even if the mapping's ids are not 0..N-1 in row order.
top_k_positions = train_sorted_indices[:, :config['retrieve_num']]
misconception_ids = misconception_mapping["MisconceptionId"].to_numpy()

train_long = train_long.assign(
    PredictMisconceptionId=pd.Series(
        misconception_ids[top_k_positions].tolist(),
        # Align by position with train_long's rows rather than relying on
        # train_long having a default 0..N-1 index.
        index=train_long.index,
    )
)

train_long.head()

In [None]:
# Misconception lookup with every column prefixed by "Predict", so it can be
# joined on `PredictMisconceptionId` without clashing with the positive's columns.
predict_mapping = misconception_mapping.rename(columns=lambda x: "Predict" + x)

# Pipeline:
#   1. keep only rows that have a labelled misconception,
#   2. explode the retrieved-id lists into one row per candidate,
#   3. join the labelled misconception's columns,
#   4. join the retrieved candidate's columns (with the "Predict" prefix).
train_retrieved = (
    train_long.loc[train_long["MisconceptionId"].notna()]
    .copy()
    .explode("PredictMisconceptionId")
    .merge(misconception_mapping, on="MisconceptionId", how="left")
    .merge(predict_mapping, on="PredictMisconceptionId", how="left")
)

# Check the shape
train_retrieved.shape

In [None]:
# Preview the exploded retrieval table with both misconception joins applied.
train_retrieved.head()

# Fine-Tune bge

In [None]:
def _candidate_differs_from_positive(example):
    """Return True when the retrieved candidate is not the labelled misconception."""
    return example["MisconceptionId"] != example["PredictMisconceptionId"]


# To create an anchor, positive, and negative structure, delete rows where the
# positive and negative are identical.
retrieval_dataset = Dataset.from_pandas(train_retrieved)
train = retrieval_dataset.filter(
    _candidate_differs_from_positive,
    num_proc=NUM_PROC,
)

In [None]:
# Display the filtered training dataset (notebook cell output).
train

In [None]:
# Debug mode: train on a small subset for a single epoch as a quick smoke test.
if config['debug']:
    # Cap the subset at the dataset size so a dataset with fewer than 1000
    # rows does not raise an out-of-range error.
    train = train.select(range(min(1000, len(train))))
    config['epochs'] = 1

In [None]:
# Reload the pretrained checkpoint for fine-tuning and bind the ranking loss to it.
model = SentenceTransformer(MODEL_NAME)
loss = MultipleNegativesRankingLoss(model=model)

In [None]:
# Display the model; check the word-embedding dimension to confirm which
# checkpoint was loaded.
model

In [None]:
# Display the configured loss object (notebook cell output).
loss

In [None]:
# Learning-rate schedule name, later passed to the training arguments.
config.update(scheduler='cosine_with_restarts')

In [None]:
# Display the final run configuration (notebook cell output).
config

In [None]:
# Collect the trainer settings in one mapping, grouped by purpose, then build
# the arguments object from it.
training_kwargs = {
    # Required parameter:
    "output_dir": OUTPUT_PATH,
    # Optional training parameters:
    "num_train_epochs": config['epochs'],
    "per_device_train_batch_size": config['bs'],
    "gradient_accumulation_steps": config['grad_acc_step'],
    "per_device_eval_batch_size": config['bs'],
    "eval_accumulation_steps": config['grad_acc_step'],
    "learning_rate": config['lr'],
    "weight_decay": 0.01,
    "warmup_ratio": 0.1,
    "fp16": True,  # Set to False if you get an error that your GPU can't run on FP16
    "bf16": False,  # Set to True if you have a GPU that supports BF16
    # MultipleNegativesRankingLoss benefits from no duplicate samples in a batch
    "batch_sampler": BatchSamplers.NO_DUPLICATES,
    # Optional tracking/debugging parameters:
    "lr_scheduler_type": config['scheduler'],
    "save_strategy": "steps",
    "save_steps": 0.1,
    "save_total_limit": 2,
    "logging_steps": 100,
    "report_to": REPORT_TO,  # Will be used in W&B if `wandb` is installed
    "run_name": EXP_NAME,
    "do_eval": False,
}

args = SentenceTransformerTrainingArguments(**training_kwargs)

In [None]:
# Total number of model parameters, shown with thousands separators.
total_params = sum(p.numel() for p in model.parameters())
f"{total_params:,}"

In [None]:
# Columns fed to the loss, in this order: anchor text, correct misconception
# name, retrieved (wrong) misconception name.
triplet_columns = ["AllText", "MisconceptionName", "PredictMisconceptionName"]
triplet_dataset = train.select_columns(triplet_columns)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=triplet_dataset,
    loss=loss,
)

# NOTE(review): config['train'] is defined in the settings but is not consulted
# here; training always runs. Confirm whether it should gate this step.
trainer.train()

# Persist the fine-tuned model to MODEL_OUTPUT_PATH.
model.save_pretrained(MODEL_OUTPUT_PATH)