In [1]:
!pip install transformers peft accelerate \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

!pip install --no-index /kaggle/input/bitsandbytes0-42-0/bitsandbytes-0.42.0-py3-none-any.whl --find-links=/kaggle/input/bitsandbytes0-42-0

!pip install --no-index --find-links=/kaggle/input/vllm-0-6-3-post1-wheels torchvision==0.19.1
!pip install --no-index --find-links=/kaggle/input/vllm-0-6-3-post1-wheels vllm

!pip install -q --no-deps --no-index /kaggle/input/hf-libraries/sentence-transformers/sentence_transformers-3.1.0-py3-none-any.whl

Looking in links: /kaggle/input/lmsys-wheel-files
Processing /kaggle/input/lmsys-wheel-files/peft-0.11.1-py3-none-any.whl
Installing collected packages: peft
Successfully installed peft-0.11.1
Looking in links: /kaggle/input/bitsandbytes0-42-0
Processing /kaggle/input/bitsandbytes0-42-0/bitsandbytes-0.42.0-py3-none-any.whl
Installing collected packages: bitsandbytes
Successfully installed bitsandbytes-0.42.0
Looking in links: /kaggle/input/vllm-0-6-3-post1-wheels
Processing /kaggle/input/vllm-0-6-3-post1-wheels/torchvision-0.19.1-cp310-cp310-manylinux1_x86_64.whl
Processing /kaggle/input/vllm-0-6-3-post1-wheels/torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl (from torchvision==0.19.1)
Processing /kaggle/input/vllm-0-6-3-post1-wheels/nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (from torch==2.4.1->torchvision==0.19.1)
Processing /kaggle/input/vllm-0-6-3-post1-wheels/nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (from torch==2.4.1->torchvisi

In [2]:
import math, numpy as np
import os, sys
import pandas as pd
from tqdm.auto import tqdm
from tqdm.autonotebook import trange
import re, gc
import pickle
import torch
from torch import Tensor
import torch.distributed as dist
from torch.nn import DataParallel
from torch.nn.parallel import DistributedDataParallel as DDP
import torch.nn.functional as F
from bs4 import BeautifulSoup
from sklearn.model_selection import GroupKFold
import json
from numpy.linalg import norm

import vllm
import argparse

from transformers import AutoTokenizer, AutoModel
from transformers import BitsAndBytesConfig
from sentence_transformers import SentenceTransformer, util
from peft import (
    LoraConfig,
    get_peft_model,
)
import json
import copy
import warnings
warnings.filterwarnings('ignore')

# Expose GPUs 0 and 1 to this process; twin_recall later places one model on each visible device.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"
pd.set_option('display.max_rows', 300)

device='cuda:0'

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# True when the KAGGLE_IS_COMPETITION_RERUN environment variable is set to a non-empty value.
# Later cells use it to switch between test.csv (submission) and a train.csv sample (local validation).
IS_SUBMISSION = bool(os.getenv("KAGGLE_IS_COMPETITION_RERUN"))

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# RECALL

# Number of candidate misconceptions kept per query after recall. It is also the chunk size
# the rerank cell uses to split the flat LLM score list back into per-query groups.
RECALL_SIZE = 40

recall_model_path = "/kaggle/input/qwen2-5-14b-instruct/Qwen2.5-14B-Instruct"
# One LoRA adapter per recaller; the recall cell runs them two at a time through twin_recall.
recall_lora_paths = [
    "/kaggle/input/eedi-recall-qwen-14b/fold_0/CV-563-929/adapter.bin", #LB: 481
    "/kaggle/input/eedi-recall-qwen-14b/fold_1/CV-557-922_LB-494/adapter.bin", #LB: 494
    "/kaggle/input/eedi-recall-qwen-14b/fold_3_CV-578-926-657-960/CV-578-926-657-960/adapter.bin", #LB: 458
    "/kaggle/input/eedi-recall-qwen-14b/fold_4/CV-572-934/adapter.bin", #LB: 457
]

# Ensemble weight of each recaller, in the same order as recall_lora_paths.
recaller_weights = [0.6,1,0.2,0.2]

# misc_embeddings pre-computed
# One .npy file per adapter, in the same order as recall_lora_paths. twin_recall recomputes the
# misconception embeddings with the loaded models instead when the list it receives contains "none".
misc_embeddings = [
    "/kaggle/input/eedi-misc-embeddings-qwen-14b/misc_embeddings_fold0_LB481.npy",
    "/kaggle/input/eedi-misc-embeddings-qwen-14b/misc_embeddings_fold1_LB494.npy",
    "/kaggle/input/eedi-misc-embeddings-qwen-14b/misc_embeddings_fold3_LB458.npy",
    "/kaggle/input/eedi-misc-embeddings-qwen-14b/misc_embeddings_fold4_LB457.npy"
]

# # @@@@@@@@@@@@@@@@@@@@
# # recall params
# # @@@@@@@@@@@@@@@@@@@@
# Default batch size and tokenizer max_length used by inference().
infer_batch=16
infer_max_len=768
# When True, twin_recall merges the LoRA weights into the base model and inference() calls the
# model directly; when False, inference() calls `model.model` instead.
merge_unload=False
bnb_config = BitsAndBytesConfig( 
            load_in_4bit=True, # Load the model weights in 4-bit format to reduce memory usage.
            bnb_4bit_use_double_quant=True, # Enable double quantization.
            bnb_4bit_quant_type="nf4", # Use the nf4 quantization format.
            bnb_4bit_compute_dtype=torch.bfloat16 # Run computations in bfloat16.
        )
# NOTE(review): presumably this has to match the configuration the adapters were trained with,
# because the adapter state dicts are loaded into the modules it creates -- confirm.
loraConfig = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=[
            "q_proj", # query projection
            "k_proj", # key projection
            "v_proj", # value projection
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        bias="none",
        lora_dropout=0.05,  # Conventional value; dropout probability of the LoRA layers (guards against overfitting).
        task_type="FEATURE_EXTRACTION",
    )

# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# LLM

# Reranker settings. They are written to llm_params.json below because the reranker runs in a
# separate `python llm_rerank.py` process, which reads its configuration from that file.
LLM_PATH = "/kaggle/input/eedi-rerankers" #LB: 601, 584

LLM_PARAMS = {
    # Keyword arguments passed to vllm.LLM(...)
    "vllm_params":{
        "model": LLM_PATH,
        "quantization":  "awq",
        "tensor_parallel_size":  2,
        "gpu_memory_utilization":  0.95,
        "trust_remote_code":  True,
        "dtype":  "half",
        "enforce_eager":  True,
        "max_model_len":  1024,
        "disable_log_stats":  True,
        "enable_prefix_caching": True,
    },
    # Keyword arguments passed to vllm.SamplingParams(...)
    "vllm_samplingParams":{
        "n": 1,  # Number of output sequences to return for each prompt.
        "top_p": 0.8,  # Float that controls the cumulative probability of the top tokens to consider. # *** Improvement idea: set top_p=0.95 to prevent particularly strange tokens.
        "temperature": 0,  # randomness of the sampling  # *** Improvement idea: set temperature values between 0.9 and 1.05, for the sake of diversity of answers
        "seed": 777, # Seed for reproducibility.
        "skip_special_tokens": False, # Whether to skip special tokens in the output.
        "max_tokens": 1,  # Maximum number of tokens to generate per output sequence.
        "logprobs": 5, # top K most probable tokens.
    }
}

# Save as a JSON file
with open("llm_params.json", "w") as f:
    json.dump(LLM_PARAMS, f, indent=4)



2024-12-22 00:31:16,986	INFO util.py:124 -- Outdated packages:
  ipywidgets==7.7.1 found, needs ipywidgets>=8
Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


## Metric

In [3]:
%%writefile eedi_metrics.py

# Credit: https://www.kaggle.com/code/abdullahmeda/eedi-map-k-metric

import numpy as np
def apk(actual, predicted, k=25):
    """
    Computes the average precision at k.
    
    This function computes the average precision at k between two lists of
    items.
    
    Parameters
    ----------
    actual : list or array-like
             A collection of elements that are to be predicted (order doesn't
             matter). Array-likes such as numpy arrays are accepted.
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
        
    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            `actual` is None or empty.
    """
    
    # Test emptiness by length rather than truthiness: `not actual` is True for a
    # one-element array holding 0 (a valid id) and raises for longer arrays.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=25):
    """
    Mean average precision at k over a collection of queries.

    Parameters
    ----------
    actual : list
             One collection of relevant elements per query
             (order inside each collection is irrelevant).
    predicted : list
                One ranked list of predicted elements per query
                (order inside each list matters).
    k : int, optional
        Cut-off applied to every ranked list.

    Returns
    -------
    score : double
            Mean of the per-query `apk` scores; the two inputs are paired
            positionally.
    """
    per_query_scores = []
    for relevant, ranked in zip(actual, predicted):
        per_query_scores.append(apk(relevant, ranked, k))
    return np.mean(per_query_scores)

Writing eedi_metrics.py


## Prepare dataframe

In [4]:
# Missing values in train.csv (e.g. answers without a labelled misconception) become -1.
df_train = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/train.csv").fillna(-1)
# Local validation only uses a fixed sample of 5 training questions.
df_train = df_train.sample(5, random_state=42).reset_index(drop=True)
df_test = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")

# df_ret is the working frame: the train sample locally, the test set on submission reruns.
if not IS_SUBMISSION:
    df_ret = df_train.copy()
else:
    df_ret = df_test.copy()

# Look up the text of the correct answer from the Answer{A..D}Text column named by CorrectAnswer.
df_ret['CorrectAnswerText'] = df_ret.apply(lambda row: row[f"Answer{row.CorrectAnswer}Text"], axis=1)
df_ret.head()


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Reshape the frame into long format: one row per (question, answer option), keeping only the
# incorrect options, and give each row a unique key `QuestionId_Answer` ("<QuestionId>_<letter>").


common_col = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "QuestionText",
    "CorrectAnswer",
    "CorrectAnswerText"
]

# Select the required columns
df_ret_long = df_ret[common_col + [f"Answer{alpha}Text" for alpha in ["A", "B", "C", "D"]]]

# Unpivot with melt: the four answer-text columns become rows
df_ret_long = df_ret_long.melt(
    id_vars=common_col,
    var_name="AnswerType",
    value_name="AnswerText"
)

# Extract the answer letter into the AnswerAlphabet column
df_ret_long["AnswerAlphabet"] = df_ret_long["AnswerType"].str.extract(r"Answer([A-D])Text$")

# Drop the rows that correspond to the correct answer
df_ret_long = df_ret_long[df_ret_long["CorrectAnswer"] != df_ret_long["AnswerAlphabet"]]

# Add the QuestionId_Answer key column
df_ret_long["QuestionId_Answer"] = (df_ret_long["QuestionId"].astype(str) + "_" + df_ret_long["AnswerAlphabet"].astype(str)).astype(str)



# Local validation only: attach the ground-truth misconception id to every row.
if not IS_SUBMISSION:

    
    # Select the required columns
    df_ret_misconception_long = df_ret[common_col + [f"Misconception{alpha}Id" for alpha in ["A", "B", "C", "D"]]]

    # Unpivot with melt: the four misconception-id columns become rows
    df_ret_misconception_long = df_ret_misconception_long.melt(
        id_vars=common_col,
        var_name="MisconceptionType",
        value_name="MisconceptionId"
    )

    # Extract the answer letter into the AnswerAlphabet column
    df_ret_misconception_long["AnswerAlphabet"] = df_ret_misconception_long["MisconceptionType"].str.extract(r"Misconception([A-D])Id$")

    # Add the QuestionId_Answer key column
    df_ret_misconception_long["QuestionId_Answer"] = (df_ret_misconception_long["QuestionId"].astype(str) + "_" + df_ret_misconception_long["AnswerAlphabet"].astype(str)).astype(str)
    
    df_ret_misconception_long["MisconceptionId"] = df_ret_misconception_long["MisconceptionId"].astype('int64')
    
    # Keep only the join key and the label
    df_ret_misconception_long = df_ret_misconception_long[["QuestionId_Answer", "MisconceptionId"]]
    
    # Combine with merge (rather than join), then drop rows whose label is the -1 placeholder
    df_ret_long = df_ret_long.merge(df_ret_misconception_long, on="QuestionId_Answer", how="left")
    df_ret_long = df_ret_long[df_ret_long["MisconceptionId"] != -1]

    
df_ret_long = df_ret_long.sort_values(by="QuestionId").reset_index(drop=True)


# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
### Store the labels for validation later on.
# label.parquet holds one row per QuestionId_Answer, in df_ret_long row order, with the label
# wrapped in a single-element list; the validation cell pairs it positionally with submission.csv.
if not IS_SUBMISSION:    
    df_label = {}
    for idx, row in tqdm(df_ret_long.iterrows(), total=len(df_ret_long)):
        df_label[f"{row.QuestionId_Answer}"] = [row["MisconceptionId"]]
                
    df_label = pd.DataFrame([df_label]).T.reset_index()
    df_label.columns = ["QuestionId_Answer", "MisconceptionId"]
    df_label.to_parquet("label.parquet", index=False)
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# LoRA
def create_recall_query(row):
    """Build the instruction-style retrieval query for one (question, wrong answer) row.

    `row` must support key access for SubjectName, ConstructName, QuestionText,
    CorrectAnswerText and AnswerText. Returns a single string consisting of an
    "Instruct:" line followed by a "Query:" section of three lines.
    """
    instruction = 'Given a math question and an incorrect answer, please identify the most accurate misconception that led to this incorrect answer.'
    question_line = f"###question###:{row['SubjectName']}-{row['ConstructName']}-{row['QuestionText']}"
    correct_line = f"###Correct Answer###:{row['CorrectAnswerText']}"
    incorrect_line = f"###Incorrect answer###:{row['AnswerText']}"
    query_body = "\n".join((question_line, correct_line, incorrect_line))
    return f'Instruct: {instruction}\nQuery: {query_body}'

# One retrieval query string per (question, incorrect answer) row; twin_recall embeds this column.
df_ret_long['recall_query'] = df_ret_long.apply(create_recall_query, axis=1)
# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@

# df_ret_long.head()

  0%|          | 0/11 [00:00<?, ?it/s]

In [5]:
# Misconception texts as an array. Later cells index it with the retrieved corpus ids and emit
# those same ids as MisconceptionId, so this assumes row position == MisconceptionId -- TODO confirm.
df_misconception_mapping = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
misconceptions = df_misconception_mapping.MisconceptionName.values

# Inference helpers

In [6]:
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# recall helpers

def batch_to_device(batch, target_device):
    """
    Move every tensor value of a mapping-style batch to `target_device` (CPU/GPU).

    Non-tensor values are left alone. The batch is updated in place and also returned.
    """
    for name, value in batch.items():
        if not isinstance(value, Tensor):
            continue
        batch[name] = value.to(target_device)
    return batch

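# Purpose: extract the representation of the last real (non-padding) token from the final hidden states.
# If the last position of every sequence is attended (i.e. the batch is left-padded), the last position is returned directly.
# Otherwise the attention mask gives each sequence's length, and the hidden state at that last real token is selected.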

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """Pool each sequence to the hidden state of its last attended token.

    `last_hidden_states` is indexed as (batch, seq, dim) and `attention_mask` as (batch, seq).
    Returns a (batch, dim) tensor.
    """
    n_sequences = attention_mask.shape[0]
    # If the final position is attended in every row, the batch is left-padded and the
    # last position already is the last real token.
    if attention_mask[:, -1].sum() == n_sequences:
        return last_hidden_states[:, -1]

    # Otherwise the last real token of each row sits at (number of attended tokens - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_ids = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_ids, last_positions]

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
from torch.utils.data import DataLoader
from datasets import Dataset

def inference(sentences, model, tokenizer, batch_size = infer_batch, max_length = infer_max_len):
    """Embed `sentences` with `model` and return one L2-normalised vector per sentence.

    Sentences are processed longest-first in batches of `batch_size`, truncated to
    `max_length` tokens, pooled with `last_token_pool`, and finally put back into the
    caller's original order. Returns a 2-D numpy array with one row per sentence.
    """
    # Longest-first ordering groups sentences of similar length into the same batch.
    order = np.argsort([-len(sentence) for sentence in sentences])
    ordered_sentences = [sentences[position] for position in order]

    collected = []
    for offset in trange(0, len(sentences), batch_size, desc="Batches", disable=False):
        chunk = ordered_sentences[offset: offset + batch_size]
        features = tokenizer(chunk, max_length=max_length, padding=True, truncation=True,
                             return_tensors="pt")
        # The tokenized inputs are not moved to a device here; per the original author's
        # note, the HF framework transfers them to the appropriate device automatically.
        with torch.no_grad():
            # Without merge_unload the wrapped `.model` attribute is called instead of the model itself.
            encoder = model if merge_unload else model.model
            outputs = encoder(**features)
            pooled = last_token_pool(outputs.last_hidden_state, features['attention_mask'])
            pooled = torch.nn.functional.normalize(pooled, dim=-1)
            batch_rows = pooled.detach().cpu().numpy().tolist()
        collected.extend(batch_rows)

    # argsort of the permutation is its inverse: it maps original positions to sorted rows.
    restore = np.argsort(order)
    rows = [np.array(collected[position]).reshape(1, -1) for position in restore]
    return np.concatenate(rows, axis=0)


# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# Helper functions for parallel inference
from threading import Thread
from queue import Queue


def run_inference(sentences, model, result_queue, index):
    """Thread worker: embed `sentences` with one (model, tokenizer) pair.

    `model` is indexed as model[0] (the model) and model[1] (the tokenizer). The result is
    pushed to `result_queue` as an (index, embeddings) tuple so the caller can restore order.
    """
    recall_model, recall_tokenizer = model[0], model[1]
    embeddings = inference(sentences, recall_model, recall_tokenizer)
    result_queue.put((index, embeddings))

def embed_parallel(sentences, models):
    """Embed the same `sentences` with every entry of `models`, one thread per entry.

    Parameters
    ----------
    sentences : list
        Texts to embed; every model receives the full list.
    models : list
        One entry per worker, passed unchanged to `run_inference`.

    Returns
    -------
    list
        One result per model, in the same order as `models`.

    Raises
    ------
    RuntimeError
        If any worker finished without delivering a result (for example because it
        raised). The previous implementation blocked forever on the queue in that case.
    """
    result_queue = Queue()  # single queue; each item carries its model index so order can be restored
    threads = [
        Thread(target=run_inference, args=(sentences, model, result_queue, model_index))
        for model_index, model in enumerate(models)
    ]

    # Start all workers, then wait for all of them to finish.
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

    # Every producer has exited, so draining without blocking is safe and cannot hang.
    finished = []
    while not result_queue.empty():
        finished.append(result_queue.get_nowait())

    if len(finished) != len(threads):
        raise RuntimeError(
            f"embed_parallel: only {len(finished)} of {len(threads)} workers returned a result; "
            "see the worker traceback above"
        )

    finished.sort(key=lambda item: item[0])
    return [embeddings for _, embeddings in finished]


def _load_recaller(lora_path, device_name):
    """Load the quantized base recall model on `device_name` and optionally apply one LoRA adapter.

    Returns a (model, tokenizer) tuple with the model in eval mode.
    """
    recall_model = AutoModel.from_pretrained(recall_model_path, quantization_config=bnb_config, trust_remote_code=True, device_map=device_name)
    recall_tokenizer = AutoTokenizer.from_pretrained(recall_model_path)
    # Load the LoRA weights
    if lora_path != "none":
        # Wrap the base model with LoRA modules so the adapter weights have somewhere to load into.
        recall_model = get_peft_model(recall_model, loraConfig)
        # NOTE(review): torch.load unpickles the file -- only use adapter files from a trusted source.
        d = torch.load(lora_path, map_location="cpu")
        # strict=False: keys that do not match are ignored rather than raising.
        recall_model.load_state_dict(d, strict=False)
    if merge_unload:
        recall_model = recall_model.merge_and_unload()
    return recall_model.eval(), recall_tokenizer


def twin_recall(lora_paths, misc_embeddings, top_k=RECALL_SIZE):
    """Run one recall pass with one model per visible GPU and return the top-k hits per recaller.

    Parameters
    ----------
    lora_paths : list of str
        One adapter path per recaller ("none" keeps the plain base model). At most one
        adapter per visible CUDA device.
    misc_embeddings : list of str
        Paths to pre-computed misconception embeddings (.npy), one per recaller. If the
        list contains "none", the embeddings are computed with the loaded models instead.
    top_k : int
        Number of candidates to retrieve per query.

    Returns
    -------
    list
        topRecalls[recaller][query] is a list of (corpus_id, score) tuples.

    Raises
    ------
    ValueError
        If more adapters are given than CUDA devices are visible.
    Exception
        Any failure during loading, embedding or search is printed and re-raised (after
        the models are released). It used to be swallowed, which made the function return
        None and the caller fail later with an unrelated error.
    """
    use_device = [f'cuda:{i}' for i in range(torch.cuda.device_count())]
    print("线程数: ", len(use_device))

    if len(lora_paths) > len(use_device):
        raise ValueError(
            f"twin_recall received {len(lora_paths)} adapters but only {len(use_device)} CUDA "
            "devices are visible; pass at most one adapter per device"
        )

    models = []
    try:
        # Put one model on each device, pairing adapters with devices positionally.
        for lora_path, device_name in zip(lora_paths, use_device):
            models.append(_load_recaller(lora_path, device_name))

        # Compute the embeddings
        query_embeddings = embed_parallel(list(df_ret_long.recall_query.values), models)
        if "none" in misc_embeddings:
            misc_embeddings = embed_parallel(list(misconceptions), models)
        else:
            misc_embeddings = [np.load(x) for x in misc_embeddings]

        # Run the retrieval, once per loaded recaller (previously hard-coded to two).
        topRecalls = []
        for i in range(len(models)):
            Ret_topNids = util.semantic_search(query_embeddings[i], misc_embeddings[i], top_k=top_k)
            topRecalls.append([[(idx['corpus_id'], idx['score']) for idx in ret] for ret in Ret_topNids])
        return topRecalls

    except Exception as e:
        print(f"An error occurred: {e}")
        raise

    finally:
        # Drop the model references, then collect garbage before releasing cached GPU memory
        # so the next recall pass can load its own models.
        models.clear()
        gc.collect()
        torch.cuda.empty_cache()


# @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
# ensemble results

def ensemble(all_topRecalls, recaller_weights, top_k=RECALL_SIZE):
    """
    Fuse the candidate lists of several recallers with a weighted score sum.

    :param all_topRecalls:
        all_topRecalls[recaller_idx][query_idx] = [(cid, score), (cid, score), ...]
        where cid is a candidate id and score is that recaller's score for it.
        The number of queries is taken from the first recaller.

    :param recaller_weights:
        One numeric weight per recaller, indexed by recaller position, e.g. [1.0, 0.8, 1.2].

    :param top_k:
        Number of candidates to keep per query.

    :return:
        A list with one entry per query: the candidate ids ordered by descending fused
        score, cut to top_k. A candidate's fused score is the SUM of weight * score over
        every recaller that returned it; recallers that did not return it contribute nothing.
    """
    query_count = len(all_topRecalls[0])
    fused = []

    for q in range(query_count):
        # candidate id -> accumulated weighted score for this query
        totals = {}
        for r, per_recaller in enumerate(all_topRecalls):
            weight = recaller_weights[r]
            for cid, score in per_recaller[q]:
                contribution = weight * score
                if cid in totals:
                    totals[cid] += contribution
                else:
                    totals[cid] = contribution

        # Stable descending sort: ties keep the order in which candidates were first seen.
        ranked_ids = sorted(totals, key=totals.__getitem__, reverse=True)
        fused.append(ranked_ids[:top_k])

    return fused

# RECALL

In [7]:
# recall (several recallers stacked)
# The adapters are processed two at a time: each twin_recall call loads one model per GPU
# and releases them before the next pair is loaded.

all_topRecalls = []
all_topRecalls.extend(twin_recall(recall_lora_paths[:2], misc_embeddings[:2]))
if len(recall_lora_paths)>2:
    all_topRecalls.extend(twin_recall(recall_lora_paths[2:], misc_embeddings[2:]))
# Fuse the per-recaller candidate lists into one ranked id list per query.
topids = ensemble(all_topRecalls, recaller_weights, top_k=RECALL_SIZE)

import gc
gc.collect()
torch.cuda.empty_cache()

df_ret_long['topids'] = topids
print(topids[0])

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# Write a recall-only submission (top 25 ids per row). The rerank cells later overwrite submission.csv.
# NOTE: this replaces the MisconceptionId column of df_ret_long (the ground-truth label in local
# validation) with the space-separated prediction string; the labels were saved to label.parquet earlier.
df_ret_long['top25ids'] = df_ret_long['topids'].apply(lambda x: x[:25])
df_ret_long["MisconceptionId"] = df_ret_long['top25ids'].apply(lambda lst: ' '.join(map(str, lst)))
df_ret_long.to_csv("submission.csv", columns=["QuestionId_Answer", "MisconceptionId"], index=False)

线程数:  2


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

线程数:  2


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[699, 496, 2264, 1795, 2222, 2346, 1383, 980, 2203, 1138, 467, 1514, 539, 1858, 1751, 2119, 567, 1387, 285, 1730, 587, 1773, 234, 2126, 1605, 1675, 611, 2431, 13, 711, 1342, 1359, 1441, 1533, 1510, 1054, 2305, 1742, 2135, 1453]


# RERANK

In [8]:
# User-turn template for the Yes/No reranker; apply_template fills the placeholders.
# The placeholder name "Retrival" (sic) is part of the template and is referenced by apply_template.
PROMPT  = """Given a math question about {ConstructName}({SubjectName}), determine whether the misconception causes the incorrect answer:
Question: {Question}
Correct Answer: {CorrectAnswer}
Incorrect Answer: {IncorrectAnswer}
Misconception: {Retrival}

Please respond with only 'Yes' or 'No'.
"""

def preprocess_text(x):
    """Lightly normalise a prompt string before it is sent to the reranker.

    Removes "@word" tokens and "http"+word-character runs, collapses runs of periods
    and runs of commas to a single character, and strips surrounding whitespace.

    The patterns are now raw strings: the previous plain literals contained invalid
    escape sequences (e.g. "\\w"), which newer Python versions warn about. The matching
    behaviour is unchanged.
    """
    x = re.sub(r"@\w+", '', x)      # Delete strings starting with @
    # NOTE(review): this only removes "http" plus the word characters that follow, so
    # "https://host" becomes "://host". Kept as-is so prompts stay identical -- confirm
    # before widening the pattern.
    x = re.sub(r"http\w+", '', x)
    x = re.sub(r"\.+", ".", x)      # Collapse consecutive periods into one
    x = re.sub(r",+", ",", x)       # Collapse consecutive commas into one
    return x.strip()                # Remove whitespace at the beginning and end

tokenizer = AutoTokenizer.from_pretrained(LLM_PATH)

def apply_template(row, retrival, tokenizer = tokenizer):
    """Render the chat-formatted reranker prompt for one row and one candidate misconception.

    `row` supplies ConstructName, SubjectName, QuestionText, AnswerText and
    CorrectAnswerText; `retrival` is the candidate misconception text. The filled
    template is cleaned with `preprocess_text`, wrapped in a system + user conversation
    and returned as text (not token ids) with the generation prompt appended.
    """
    user_prompt = PROMPT.format(
        ConstructName=row["ConstructName"],
        SubjectName=row["SubjectName"],
        Question=row["QuestionText"],
        IncorrectAnswer=row["AnswerText"],
        CorrectAnswer=row["CorrectAnswerText"],
        Retrival=retrival,
    )
    system_turn = {"role": "system", "content": "You are a Mathematics teacher. "}
    user_turn = {"role": "user", "content": preprocess_text(user_prompt)}
    conversation = [system_turn, user_turn]
    return tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)


# For every (question, wrong answer) row, build one reranker prompt per recalled misconception,
# keyed by the misconception id as a string.
df = {
    f"{row.QuestionId_Answer}": {
        str(misc_id): apply_template(row, misconceptions[misc_id]) for misc_id in row['topids']
    }
    for _, row in tqdm(df_ret_long.iterrows(), total=len(df_ret_long))
}

# Turn the mapping into a two-column frame: the row key and its {misconception id: prompt} dict.
df = pd.DataFrame([df]).T.reset_index()
df.columns = ["QuestionId_Answer", "llm_queries"]

# Flatten row by row so the reranker script can score all prompts in a single pass.
flattened_queries = [
    prompt
    for per_row_prompts in df["llm_queries"]
    for prompt in per_row_prompts.values()
]
with open("flattened_queries.json", "w", encoding="utf-8") as f:
    json.dump(flattened_queries, f, ensure_ascii=False)
    
print(len(flattened_queries))

  0%|          | 0/11 [00:00<?, ?it/s]

440


In [9]:
%%writefile llm_rerank.py

import re
import vllm
import pandas as pd
import math, numpy as np
import json
from tqdm.auto import tqdm
from tqdm.autonotebook import trange
from typing import Any, Dict, List
from transformers import LogitsProcessor
import torch
from time import time


# Prompts written by the notebook, one string per (row, candidate misconception) pair.
with open("flattened_queries.json", "r", encoding="utf-8") as f:
    flattened_queries = json.load(f)

# Load sampling parameters from JSON file
with open("llm_params.json", "r") as f:
    llm_params = json.load(f)
    vllm_params = llm_params["vllm_params"]
    vllm_samplingParams = llm_params["vllm_samplingParams"]
    
llm = vllm.LLM(**vllm_params)
tokenizer = llm.get_tokenizer()

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# force output
choices = ["Yes","No"]
# First token id of each choice; the order matches `choices`, so KEEP[0] is the "Yes" token.
KEEP = [tokenizer.encode(x,add_special_tokens=False)[0] for x in choices]
print(f"Force predictions to be tokens {KEEP} which are {choices}.")

class DigitLogitsProcessor(LogitsProcessor):
    """Logits processor that strongly favours the token ids listed in the module-level KEEP."""

    def __init__(self, tokenizer):
        # `tokenizer` is accepted but not used; the allowed ids come from KEEP.
        self.allowed_ids = KEEP

    def __call__(self, input_ids: List[int], scores: torch.Tensor) -> torch.Tensor:
        # Add a large constant (100) to the allowed tokens' scores so that they dominate
        # the distribution; all other scores are left untouched.
        boosted = scores[self.allowed_ids] + 100
        scores[self.allowed_ids] = boosted
        return scores

logits_processors = [DigitLogitsProcessor(tokenizer)] # A list, because several processors can be applied in sequence.

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# inference
# Score all prompts in one generate call and time it.
start = time()
responses = llm.generate(
    flattened_queries,
    vllm.SamplingParams(**vllm_samplingParams,logits_processors=logits_processors,),
    use_tqdm=True
)

end = time()
elapsed = (end-start)/60. #minutes
print(f"Inference of {len(flattened_queries)} samples took {elapsed} minutes!")

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# post-processing
# Convert each response into P("Yes") renormalised over the two allowed tokens.
# A response that cannot be scored gets the neutral value 0.5 and is counted as an error.
results = []
errors = 0
for i, response in enumerate(responses):
    try:
        x = response.outputs[0].logprobs[0]
        probs = []
        for k in KEEP:
            if k in x:
                probs.append(math.exp(x[k].logprob))
            else:
                probs.append(0.0)
                print(f"bad logits {i}")
        total = sum(probs)
        if total > 0:
            # Normalise so the two probabilities sum to 1 and keep the "Yes" share (KEEP[0]).
            results.append(probs[0] / total)
        else:
            # Neither allowed token was among the returned logprobs: nothing to normalise.
            # (This used to reach the fallback only through an incidental exception.)
            results.append(1/2.)
            errors += 1
    except Exception as exc:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are not swallowed.
        results.append(1/2.)
        errors += 1
        print(f"inference error {i}: {exc!r}")
# len(responses) instead of i+1: `i` is undefined when there are no responses.
print(f"There were {errors} inference errors out of {len(responses)} inferences")

with open("results.json", "w", encoding="utf-8") as f:
    json.dump(results, f)

Writing llm_rerank.py


In [10]:
!python llm_rerank.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO 12-22 00:40:15 config.py:905] Defaulting to use mp for distributed inference
INFO 12-22 00:40:15 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='/kaggle/input/eedi-rerankers', speculative_config=None, tokenizer='/kaggle/input/eedi-rerankers', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=True, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=/kaggle/input/eedi-rerankers, n

In [11]:
# Scores written by llm_rerank.py: one "Yes" probability per prompt, in flattened_queries order.
with open("results.json", "r", encoding="utf-8") as f:
    results = json.load(f)

# Split the flat score list back into one chunk per row. The chunk sizes come from each row's
# actual number of prompts rather than assuming exactly RECALL_SIZE everywhere, and the total is
# checked, so a length mismatch fails loudly instead of silently misaligning scores and ids.
query_counts = [len(queries) for queries in df["llm_queries"]]
if sum(query_counts) != len(results):
    raise ValueError(
        f"results.json holds {len(results)} scores but {sum(query_counts)} prompts were generated"
    )
chunked_results = []
chunk_start = 0
for count in query_counts:
    chunked_results.append(results[chunk_start:chunk_start + count])
    chunk_start += count
results = chunked_results

df["yes_logits"] = results

# For every row keep the 25 candidate ids with the highest "Yes" probability, best first.
top25ids=[]
for _, row in df.iterrows():
    pids = list(row["llm_queries"].keys())
    yes_logits = row["yes_logits"]
    top_25_indices = sorted(range(len(yes_logits)), key=lambda i: yes_logits[i], reverse=True)[:25]
    top25ids.append([pids[i] for i in top_25_indices])

df["MisconceptionId"] = [" ".join(top25id) for top25id in top25ids]
df[["QuestionId_Answer", "MisconceptionId"]].to_csv("submission.csv", index=False)

# Submission

In [12]:
# On a submission rerun just show the file; locally, score it against the saved labels.
if IS_SUBMISSION:
    print(pd.read_csv("submission.csv"))
else:
    from eedi_metrics import mapk
    # Each prediction is a space-separated id string; parse it back into a list of ints.
    predicted = pd.read_csv("submission.csv")["MisconceptionId"].apply(lambda ids: [int(token) for token in ids.split()])
    label = pd.read_parquet("label.parquet")["MisconceptionId"]
    # Labels and predictions are paired by row position.
    print("Validation: ", mapk(label, predicted))

Validation:  0.9090909090909091
