In [1]:
%%time
!pip uninstall -y torch
!pip install -q --no-index --find-links=/kaggle/input/making-wheels-of-necessary-packages-for-vllm vllm
!pip install -q -U /kaggle/input/vllm-t4-fix/grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -q -U /kaggle/input/vllm-t4-fix/ray-2.11.0-cp310-cp310-manylinux2014_x86_64.whl
!pip install -q --no-deps --no-index /kaggle/input/hf-libraries/sentence-transformers/sentence_transformers-3.1.0-py3-none-any.whl
!pip install --no-deps --no-index /kaggle/input/logits-processor-zoo/logits_processor_zoo-0.1.0-py3-none-any.whl

Found existing installation: torch 2.4.0
Uninstalling torch-2.4.0:
  Successfully uninstalled torch-2.4.0
Processing /kaggle/input/logits-processor-zoo/logits_processor_zoo-0.1.0-py3-none-any.whl
Installing collected packages: logits-processor-zoo
Successfully installed logits-processor-zoo-0.1.0
CPU times: user 1.77 s, sys: 407 ms, total: 2.18 s
Wall time: 2min 24s


In [2]:
!pip install transformers peft accelerate \
    -q -U --no-index --find-links /kaggle/input/lmsys-wheel-files

In [3]:
%%capture
!pip install --no-index /kaggle/input/bitsandbytes0-42-0/bitsandbytes-0.42.0-py3-none-any.whl --find-links=/kaggle/input/bitsandbytes0-42-0
!pip install --no-index  /kaggle/input/bitsandbytes0-42-0/optimum-1.21.2-py3-none-any.whl --find-links=/kaggle/input/bitsandbytes0-42-0
!pip install --no-index  /kaggle/input/bitsandbytes0-42-0/auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --find-links=/kaggle/input/bitsandbytes0-42-0

### Embedding using 7B LLM

In [4]:
import pandas as pd
import numpy as np
full_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv")


rows = []
for idx, row in full_df.iterrows():
    for option in ["A", "B", "C", "D"]:
        if option == row.CorrectAnswer:
            continue
        # simply filter out all data samples without MisconceptionId
        if np.isnan(row[f"Misconception{option}Id"]):
            continue
        correct_answer = row[f"Answer{row.CorrectAnswer}Text"]
        
        query_text =f"###question###:{row['SubjectName']}-{row['ConstructName']}-{row['QuestionText']}\n###Correct Answer###:{correct_answer}\n###Misconcepte Incorrect answer###:{option}.{row[f'Answer{option}Text']}"
        rows.append({"query_text": query_text, 
                     "QuestionId_Answer": f"{row.QuestionId}_{option}",
                     "ConstructName": row.ConstructName,
                     "SubjectName": row.SubjectName,
                     "QuestionText": row.QuestionText,
                     "correct_answer": correct_answer,
                     "incorrect_answer": row[f"Answer{option}Text"],
                     "MisconceptionID":row[f"Misconception{option}Id"]
                     })

df = pd.DataFrame(rows)
df.head()
df.to_csv("/kaggle/working/train_query.csv")

In [5]:
# Code for Model Loading
import torch
from numpy.linalg import norm
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel,BitsAndBytesConfig
from peft import (
    LoraConfig,
    get_peft_model,
)

def batch_to_device(batch, target_device):
    """
    send a pytorch batch to a device (CPU/GPU)
    """
    for key in batch:
        if isinstance(batch[key], Tensor):
            batch[key] = batch[key].to(target_device)
    return batch

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

def inference(df, model, tokenizer, device):
    batch_size = 16
    max_length = 512
    sentences = list(df['query_text'].values)

    all_embeddings = []
    length_sorted_idx = np.argsort([-len(sen) for sen in sentences])
    sentences_sorted = [sentences[idx] for idx in length_sorted_idx]
    for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=False):
        print(f"Processing batch {start_index}/{len(sentences)}")
        sentences_batch = sentences_sorted[start_index: start_index + batch_size]
        features = tokenizer(sentences_batch, max_length=max_length, padding=True, truncation=True,
                             return_tensors="pt")
        features = batch_to_device(features, device)
        with torch.no_grad():
            outputs = model.model(**features)
            embeddings = last_token_pool(outputs.last_hidden_state, features['attention_mask'])
            embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
            embeddings = embeddings.detach().cpu().numpy().tolist()
        all_embeddings.extend(embeddings)

    all_embeddings = [np.array(all_embeddings[idx]).reshape(1, -1) for idx in np.argsort(length_sorted_idx)]

    return np.concatenate(all_embeddings, axis=0)

In [6]:
path_prefix = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
model_path = "/kaggle/input/sfr-embedding-mistral/SFR-Embedding-2_R"
lora_path="/kaggle/input/v7-recall/epoch_19_model/adapter.bin"
device='cuda:0'

In [7]:
tokenizer = AutoTokenizer.from_pretrained(lora_path.replace("/adapter.bin",""))
bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )
backbone = AutoModel.from_pretrained(model_path, quantization_config=bnb_config,device_map=device)
config = LoraConfig(
        r=64,
        lora_alpha=128,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        bias="none",
        lora_dropout=0.05,  # Conventional
        task_type="CAUSAL_LM",
    )
model = get_peft_model(backbone, config)
d = torch.load(lora_path, map_location=model.device)
model.load_state_dict(d, strict=False)
model = model.eval()
model = model.to(device)
print("Finish Loading Model")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Finish Loading Model


In [8]:
import numpy as np
from tqdm.autonotebook import trange


task_description = 'Given a math question and a misconcepte incorrect answer, please retrieve the most accurate reason for the misconception.'

print("Start calculate Query embedding")
V_answer = inference(df, model, tokenizer, device)
print(f"Query embedding {V_answer.shape}")
np.save("/kaggle/working/train_query_embedding.npy", V_answer)

misconception_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
misconception_df["query_text"] = misconception_df["MisconceptionName"]

print("Start calculate Misconceptions embedding")
V_misconception = inference(misconception_df, model, tokenizer, device)
print(f"Misconception embedding {V_misconception.shape}")
np.save("/kaggle/working/misconception_embedding.npy", V_misconception)


Start calculate Query embedding


Batches:   0%|          | 0/274 [00:00<?, ?it/s]

Processing batch 0/4370
Processing batch 16/4370
Processing batch 32/4370
Processing batch 48/4370
Processing batch 64/4370
Processing batch 80/4370
Processing batch 96/4370
Processing batch 112/4370
Processing batch 128/4370
Processing batch 144/4370
Processing batch 160/4370
Processing batch 176/4370
Processing batch 192/4370
Processing batch 208/4370
Processing batch 224/4370
Processing batch 240/4370
Processing batch 256/4370
Processing batch 272/4370
Processing batch 288/4370
Processing batch 304/4370
Processing batch 320/4370
Processing batch 336/4370
Processing batch 352/4370
Processing batch 368/4370
Processing batch 384/4370
Processing batch 400/4370
Processing batch 416/4370
Processing batch 432/4370
Processing batch 448/4370
Processing batch 464/4370
Processing batch 480/4370
Processing batch 496/4370
Processing batch 512/4370
Processing batch 528/4370
Processing batch 544/4370
Processing batch 560/4370
Processing batch 576/4370
Processing batch 592/4370
Processing batch 608

Batches:   0%|          | 0/162 [00:00<?, ?it/s]

Processing batch 0/2587
Processing batch 16/2587
Processing batch 32/2587
Processing batch 48/2587
Processing batch 64/2587
Processing batch 80/2587
Processing batch 96/2587
Processing batch 112/2587
Processing batch 128/2587
Processing batch 144/2587
Processing batch 160/2587
Processing batch 176/2587
Processing batch 192/2587
Processing batch 208/2587
Processing batch 224/2587
Processing batch 240/2587
Processing batch 256/2587
Processing batch 272/2587
Processing batch 288/2587
Processing batch 304/2587
Processing batch 320/2587
Processing batch 336/2587
Processing batch 352/2587
Processing batch 368/2587
Processing batch 384/2587
Processing batch 400/2587
Processing batch 416/2587
Processing batch 432/2587
Processing batch 448/2587
Processing batch 464/2587
Processing batch 480/2587
Processing batch 496/2587
Processing batch 512/2587
Processing batch 528/2587
Processing batch 544/2587
Processing batch 560/2587
Processing batch 576/2587
Processing batch 592/2587
Processing batch 608