In [1]:
# Step 1: Find the wheel inside your dataset folder
!ls /kaggle/input/bitsandbytes-wheel


bnb_wheel


In [2]:
!ls /kaggle/input/bitsandbytes-wheel/bnb_wheel/bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl


/kaggle/input/bitsandbytes-wheel/bnb_wheel/bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl


In [3]:
!pip install  /kaggle/input/bitsandbytes-wheel/bnb_wheel/bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl


Processing /kaggle/input/bitsandbytes-wheel/bnb_wheel/bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes==0.47.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes==0.47.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch<3,>=2.2->bitsandbytes==0.47.0)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch<3,>=2.2->bitsandbytes==0.47.0)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch<3,>=2.2->bitsandbytes==0.47.0)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014

In [4]:
import bitsandbytes as bnb
print("BitsAndBytes version:", bnb.__version__)


BitsAndBytes version: 0.47.0


In [5]:
# Optimized inference notebook (inference-only) for MAP competition
# - Runs models sequentially (not parallel) to avoid multi-GPU OOM
# - Uses bitsandbytes 8-bit when available, otherwise device_map+offload fallback
# - Small MAX_LEN, small batch size, streaming tokenization to reduce VRAM and speed
# - Loads saved LabelEncoder if present, otherwise reconstructs from train
# - Produces per-model probability CSVs and ensembled submission.csv

# Usage:
# 1) Attach Kaggle dataset that contains the trained model folders (each model_dir should contain HF model & tokenizer files and label_encoder.joblib if saved)
# 2) Run this notebook. It will try 8-bit load first, then device_map auto + offload.
# 3) If bitsandbytes is not installed and you want to install it, uncomment the pip install cell (may require internet which is disallowed on some competitions).

# NOTE: This is inference-only. Do NOT train here.

import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"
# disable torch dynamo compile if present
os.environ["TORCH_COMPILE_DISABLE"] = "1"

import shutil
import time
import math
from pathlib import Path
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
import joblib
from tqdm import tqdm
from scipy.special import softmax

# ----------------- CONFIG -----------------
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", DEVICE)

# Paths for model folders (these should be attached as Kaggle dataset(s))
# Update these strings to the paths in your Kaggle notebook under /kaggle/input
MODEL_PATHS = {
    'deepseek7b': '/kaggle/input/deekseepmath-7b-map-competition/MAP_EXP_09_FULL',
    'qwen3_8b':   '/kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL',
    'gemma2_lora':'/kaggle/input/gemma2-9b-it-cv945'  # LoRA dir or combined model dir
}

# If you have a separate base Gemma2 checkpoint (for PEFT), point here (optional)
GEMMA2_BASE = '/kaggle/input/gemma2-9b-it-bf16'  # or None

OUT_DIR = Path('out_inference')
OUT_DIR.mkdir(exist_ok=True)

MAX_LEN = 128          # smaller length to reduce compute and memory
FORWARD_BS = 1         # per-forward batch size (GPU memory saver)
TOKENIZE_CHUNK = 8     # how many texts to tokenize at once

# Set these lower if you still OOM
INFER_BATCH = 4        # tokenization batch (not forward) when building encodings

# ----------------- Utility helpers -----------------

def build_prompt(row):
    # format must match training prompt
    correctness = "This answer is correct." if int(row.get('is_correct', 0)) else "This answer is incorrect."
    return (
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"{correctness}\n"
        f"Student Explanation: {row.get('StudentExplanation', '')}"
    )


def safe_tokenizer_add_pad(tokenizer, model=None):
    if tokenizer.pad_token is None:
        pad_tok = tokenizer.eos_token or tokenizer.unk_token or '[PAD]'
        tokenizer.add_special_tokens({'pad_token': pad_tok})
        if model is not None:
            model.resize_token_embeddings(len(tokenizer))


def try_load_model(model_dir, num_labels, prefer_8bit=True, offload_folder='./offload'):
    """
    Try load_in_8bit -> device_map auto with offload -> cpu fallback
    Returns (model, tokenizer, device_of_model)
    """
    tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
    # try bitsandbytes 8-bit
    model = None
    device_of_model = 'cpu'
    try:
        if prefer_8bit:
            import bitsandbytes as bnb  # noqa: F401
            print(f"Attempting 8-bit load for {model_dir}...")
            model = AutoModelForSequenceClassification.from_pretrained(
                model_dir,
                local_files_only=True,
                load_in_8bit=True,
                device_map='auto'
            )
            device_of_model = 'cuda'
            print("Loaded in 8-bit mode.")
    except Exception as e:
        print("8-bit load failed or bitsandbytes not present:", e)

    if model is None:
        # try device_map auto with offload
        try:
            print("Trying device_map='auto' with offload_folder...")
            if os.path.exists(offload_folder):
                shutil.rmtree(offload_folder)
            os.makedirs(offload_folder, exist_ok=True)
            model = AutoModelForSequenceClassification.from_pretrained(
                model_dir,
                local_files_only=True,
                device_map='auto',
                offload_folder=offload_folder,
                low_cpu_mem_usage=True
            )
            device_of_model = 'cuda'
            print("Loaded with device_map='auto' + offload.")
        except Exception as e:
            print("device_map auto + offload failed:", e)

    if model is None:
        # final fallback to CPU
        print("Falling back to CPU model load (very slow but safe).")
        model = AutoModelForSequenceClassification.from_pretrained(model_dir, local_files_only=True)
        device_of_model = 'cpu'

    safe_tokenizer_add_pad(tokenizer, model)
    model.eval()
    return model, tokenizer, device_of_model


# ----------------- Prepare data & label encoder -----------------
print('Loading datasets...')
train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
test  = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')

train['Misconception'] = train['Misconception'].fillna('NA')
train['target'] = train['Category'].astype(str) + ':' + train['Misconception'].astype(str)

# Build correctness map (same as training notebooks)
idx_true = train['Category'].astype(str).str.startswith('True')
correct_counts = (
    train.loc[idx_true]
         .groupby(['QuestionId','MC_Answer'])['MC_Answer']
         .agg('count')
         .reset_index(name='c')
         .sort_values('c', ascending=False)
         .drop_duplicates(['QuestionId'])
)
correct_counts['is_correct'] = 1
# merge into test
test = test.merge(correct_counts[['QuestionId','MC_Answer','is_correct']], on=['QuestionId','MC_Answer'], how='left')
test['is_correct'] = test['is_correct'].fillna(0).astype(int)

# Build prompt text
print('Building prompts...')
test['text'] = test.apply(build_prompt, axis=1)

# Label encoder: try load saved encoder from first model dir, else build from train
le = None
saved_le_path = None
for mp in MODEL_PATHS.values():
    p = Path(mp) / 'label_encoder.joblib'
    if p.exists():
        saved_le_path = str(p)
        break

if saved_le_path:
    print('Loading saved LabelEncoder from', saved_le_path)
    le = joblib.load(saved_le_path)
else:
    print('No saved LabelEncoder found. Building from training data (must match training).')
    le = LabelEncoder()
    le.fit(train['target'].values)

NUM_CLASSES = len(le.classes_)
print('NUM_CLASSES =', NUM_CLASSES)

# ----------------- Inference per model (sequential) -----------------
# Produces per-model probs CSV with top-25 classes and per-row top-classes string

PROB_FILES = []
SUB_FILES = []

for model_name, model_dir in MODEL_PATHS.items():
    print('\n' + '='*60)
    print('Running inference for', model_name, 'from', model_dir)
    try:
        model, tokenizer, device_of_model = try_load_model(model_dir, NUM_CLASSES, prefer_8bit=True, offload_folder=f'./offload_{model_name}')
    except Exception as e:
        print('Failed to load model', model_name, 'Error:', e)
        continue

    device = next(model.parameters()).device
    print('Model device:', device)

    # Tokenize in streaming/chunks to avoid building huge tensors
    texts = test['text'].tolist()
    all_probs = []

    # Data collator for padding
    collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors='pt')

    # process small groups of texts to avoid OOM
    for i in tqdm(range(0, len(texts), TOKENIZE_CHUNK), desc=f'tokenizing_{model_name}'):
        batch_texts = texts[i:i+TOKENIZE_CHUNK]
        enc = tokenizer(batch_texts, truncation=True, padding=True, max_length=MAX_LEN, return_tensors='pt')

        # run forward in even smaller steps
        for j in range(0, enc['input_ids'].size(0), FORWARD_BS):
            b_input_ids = enc['input_ids'][j:j+FORWARD_BS].to(device)
            b_attn = enc['attention_mask'][j:j+FORWARD_BS].to(device)
            with torch.no_grad():
                logits = model(input_ids=b_input_ids, attention_mask=b_attn).logits
                probs = torch.softmax(logits, dim=-1).cpu().numpy()
            all_probs.append(probs)
        # free enc tensors
        del enc
        torch.cuda.empty_cache()

    probs_arr = np.vstack(all_probs)
    print(model_name, 'probs shape:', probs_arr.shape)

    # compute top indices and decode
    top_indices = np.argsort(-probs_arr, axis=1)
    flat = top_indices.flatten()
    decoded = le.inverse_transform(flat).reshape(top_indices.shape)

    # per-model submission (top3)
    joined_top3 = [' '.join(r[:3]) for r in decoded]
    sub_df = pd.DataFrame({'row_id': test.row_id.values, 'Category:Misconception': joined_top3})
    sub_file = OUT_DIR / f'submission_{model_name}.csv'
    sub_df.to_csv(sub_file, index=False)
    SUB_FILES.append(str(sub_file))

    # save probability CSV (top-25)
    prob_list = []
    TOPK = min(25, probs_arr.shape[1])
    for i in range(probs_arr.shape[0]):
        row = {}
        for k in range(TOPK):
            row[f'prob_{k}'] = float(probs_arr[i, top_indices[i, k]])
        row['row_id'] = int(test.row_id.values[i])
        row['top_classes'] = ' '.join(decoded[i, :TOPK])
        prob_list.append(row)
    prob_df = pd.DataFrame(prob_list)
    prob_file = OUT_DIR / f'probabilities_{model_name}.csv'
    prob_df.to_csv(prob_file, index=False)
    PROB_FILES.append(str(prob_file))

    print('Saved', sub_file, 'and', prob_file)

    # cleanup
    del model
    torch.cuda.empty_cache()
    time.sleep(2)

# ----------------- Ensemble -----------------
print('\nEnsembling model probability files...')

# simple weighted ensemble + agreement bonus (fast and effective)
weights = {
    'deepseek7b': 1.2,
    'qwen3_8b': 1.0,
    'gemma2_lora': 0.9
}

# load all prob files into a dict by model name
prob_dfs = {Path(p).stem.split('_',1)[1]: pd.read_csv(p) for p in PROB_FILES}

# merge them on row_id
merged = None
for name, df in prob_dfs.items():
    df = df.rename(columns={c: f"{c}_{name}" for c in df.columns if c!='row_id'})
    if merged is None:
        merged = df
    else:
        merged = merged.merge(df, on='row_id')

# build combined scores per row
final_preds = []
for idx, row in merged.iterrows():
    class_scores = {}
    # extract model-specific top classes and probs
    for name in prob_dfs.keys():
        top_classes = str(row[f'top_classes_{name}']).split(' ')
        for k, cls in enumerate(top_classes):
            prob_col = f'prob_{k}_{name}'
            if prob_col not in row:
                continue
            p = float(row[prob_col])
            w = weights.get(name, 1.0)
            class_scores[cls] = class_scores.get(cls, 0.0) + p * w
    # agreement bonus: count how many models predicted the class among their top K
    # we'll use top-3 agreement bonus
    for cls in list(class_scores.keys()):
        votes = 0
        for name in prob_dfs.keys():
            top_classes = str(row[f'top_classes_{name}']).split(' ')
            if cls in top_classes[:3]:
                votes += 1
        # add small bonus for agreement
        class_scores[cls] += 0.05 * votes
    # sort and take top-3
    sorted_cls = sorted(class_scores.items(), key=lambda x: -x[1])
    top3 = [c for c, s in sorted_cls[:3]]
    final_preds.append(' '.join(top3))

submission = pd.DataFrame({'row_id': merged.row_id.values, 'Category:Misconception': final_preds})
submission.to_csv('submission_ensemble.csv', index=False)
print('Saved submission_ensemble.csv')

print('\nDone — files in', OUT_DIR)
print(list(OUT_DIR.iterdir()))


2025-09-23 18:09:36.096704: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758650976.281881      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758650976.349401      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Device: cuda
Loading datasets...
Building prompts...
No saved LabelEncoder found. Building from training data (must match training).
NUM_CLASSES = 65

Running inference for deepseek7b from /kaggle/input/deekseepmath-7b-map-competition/MAP_EXP_09_FULL
Attempting 8-bit load for /kaggle/input/deekseepmath-7b-map-competition/MAP_EXP_09_FULL...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loaded in 8-bit mode.
Model device: cuda:0


tokenizing_deepseek7b: 100%|██████████| 1/1 [00:01<00:00,  1.45s/it]


deepseek7b probs shape: (3, 65)
Saved out_inference/submission_deepseek7b.csv and out_inference/probabilities_deepseek7b.csv

Running inference for qwen3_8b from /kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Attempting 8-bit load for /kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loaded in 8-bit mode.
Model device: cuda:0


tokenizing_qwen3_8b: 100%|██████████| 1/1 [00:00<00:00,  1.05it/s]


qwen3_8b probs shape: (3, 65)
Saved out_inference/submission_qwen3_8b.csv and out_inference/probabilities_qwen3_8b.csv

Running inference for gemma2_lora from /kaggle/input/gemma2-9b-it-cv945
Attempting 8-bit load for /kaggle/input/gemma2-9b-it-cv945...
8-bit load failed or bitsandbytes not present: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
Trying device_map='auto' with offload_folder...
device_map auto + offload failed: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.
Falling back to CPU model load (very slow but safe).
Failed to load model gemma2_lora Err