In [1]:
# Full inference + ensemble pipeline (memory-safe, no bitsandbytes requirement)
# Paste into a Kaggle notebook cell and run.
import os, time, math, gc
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import (
    AutoTokenizer, AutoConfig, AutoModelForSequenceClassification,
    DataCollatorWithPadding
)
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from scipy.special import softmax

# --------------------- CONFIG ---------------------
# Saved Hugging Face model directories (under /kaggle/input/...), processed in this order.
MODEL_PATHS = [
    "/kaggle/input/deekseepmath-7b-map-competition/MAP_EXP_09_FULL",  # deepseek (example)
    "/kaggle/input/gemma2-9b-it-cv945",                              # gemma2 lora path (example)
    "/kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL"        # qwen3 (example)
]

# Every CSV produced by this notebook is written below OUT_DIR.
OUT_DIR = "/kaggle/working"
os.makedirs(OUT_DIR, exist_ok=True)

MAX_LEN = 256            # tokens per prompt (reduce to 128 if OOM)
BATCH_SIZE = 1           # safe default; increase if you have memory
TOP_PRED_PER_MODEL = 25  # number of top classes + probabilities saved per model
ENSEMBLE_TOPK = 3        # number of predictions per row in the final submission

# Half precision on GPU, full precision on CPU.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.float16
else:
    DEVICE, DTYPE = "cpu", torch.float32
PRINT_EVERY = 1          # not referenced in the visible part of this notebook
# ---------------------------------------------------

print("DEVICE:", DEVICE, "DTYPE:", DTYPE)
print("MAX_LEN:", MAX_LEN, "BATCH_SIZE:", BATCH_SIZE)

# --------------------- DATA & LABEL ENCODER ---------------------
train = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/train.csv')
test  = pd.read_csv('/kaggle/input/map-charting-student-math-misunderstandings/test.csv')

# The classification target is "<Category>:<Misconception>"; a missing misconception is
# written as the literal string "NA".
train['Misconception'] = train['Misconception'].fillna("NA")
train['target'] = train['Category'].astype(str).str.cat(train['Misconception'].astype(str), sep=":")

# Integer labels; `le` is reused later to turn predicted indices back into target strings.
le = LabelEncoder()
le.fit(train['target'])
train['label'] = le.transform(train['target'])
NUM_CLASSES = len(le.classes_)
print("Num classes from train:", NUM_CLASSES)

# Correctness feature: for each question, the answer that occurs most often among the
# rows whose Category starts with "True" is treated as the correct answer.
idx_true = train['Category'].str.startswith("True")
correct_counts = (
    train.loc[idx_true]
    .groupby(['QuestionId', 'MC_Answer'])['MC_Answer']
    .count()
    .reset_index(name='c')
)
correct_counts = correct_counts.sort_values('c', ascending=False).drop_duplicates(['QuestionId'])
correct_counts['is_correct'] = 1
correct_map = {
    (int(rec.QuestionId), str(rec.MC_Answer)): 1
    for rec in correct_counts.itertuples(index=False)
}


def _lookup_is_correct(r):
    # 1 when this (question, answer) pair is the one recorded in correct_map, else 0.
    return int(correct_map.get((int(r.QuestionId), str(r.MC_Answer)), 0))


# Attach the feature to a private copy of the test frame.
test = test.copy()
test['is_correct'] = test.apply(_lookup_is_correct, axis=1)

# prompt builder
def build_prompt(row):
    """Render one row as the four-line text prompt fed to the classifiers.

    ``row`` must support item access for 'is_correct', 'QuestionText',
    'MC_Answer' and 'StudentExplanation'.  'is_correct' is converted with
    ``int()`` and shown as "Yes" when non-zero, "No" otherwise.
    """
    verdict = "No"
    if int(row['is_correct']):
        verdict = "Yes"
    lines = [
        f"Question: {row['QuestionText']}",
        f"Answer: {row['MC_Answer']}",
        f"Correct? {verdict}",
        f"Student Explanation: {row['StudentExplanation']}",
    ]
    return "\n".join(lines)

# One prompt string per test row (the function is applied row-wise).
test['text'] = test.apply(build_prompt, axis="columns")

# --------------------- HELPERS ---------------------
def safe_tokenizer_from(model_dir):
    """Load the tokenizer stored in ``model_dir`` and guarantee it has a pad token.

    Only local files are used.  A load failure is printed and then re-raised
    unchanged so the caller decides how to react.  When the tokenizer has no
    pad token, its EOS token is registered as the pad token; if there is no
    EOS token either, the literal '<pad>' is registered instead.
    """
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
    except Exception as err:
        print("Tokenizer load failed for", model_dir, ":", err)
        raise

    # Nothing to patch when a pad token is already configured.
    if tokenizer.pad_token is not None:
        return tokenizer

    pad_choice = tokenizer.eos_token if tokenizer.eos_token is not None else '<pad>'
    tokenizer.add_special_tokens({'pad_token': pad_choice})
    return tokenizer

def stream_tokenize_texts(tokenizer, texts, max_len=MAX_LEN, chunk=128):
    """
    Tokenize ``texts`` in slices of ``chunk`` items to avoid one large tokenizer call.

    Each slice is truncated and padded to exactly ``max_len`` tokens, so the
    per-slice tensors can be concatenated along the first dimension.
    Returns a tuple ``(input_ids, attention_mask)`` of torch tensors.
    """
    ids_parts = []
    mask_parts = []
    for start in range(0, len(texts), chunk):
        encoded = tokenizer(
            texts[start:start + chunk],
            truncation=True,
            padding='max_length',
            max_length=max_len,
            return_tensors='pt',
        )
        # Keep only the two tensors that are needed downstream.
        ids_parts.append(encoded['input_ids'])
        mask_parts.append(encoded['attention_mask'])
    return torch.cat(ids_parts, dim=0), torch.cat(mask_parts, dim=0)

def model_name_for_path(p):
    """Return a short name for path ``p``, used to label output files.

    The name is the last path component after trailing slashes are removed.
    When that component is empty (e.g. ``p`` consists only of slashes), the
    whole path with every '/' replaced by '_' is returned instead.
    """
    last_component = os.path.basename(p.rstrip('/'))
    return last_component if last_component != "" else p.replace('/', '_')

# --------------------- PER-MODEL INFERENCE ---------------------
# For every entry of MODEL_PATHS: load tokenizer and classifier, score all test prompts, and
# write two CSVs into OUT_DIR (a per-model top-3 submission, and the top TOP_PRED_PER_MODEL
# class names with their probabilities).  A model that fails to load or run is reported and
# skipped so the remaining models can still feed the ensemble.
saved_prob_files = []  # probability CSVs that were produced (or already existed)

for model_path in MODEL_PATHS:
    model_short = model_name_for_path(model_path)
    print(f"\n=== Processing model: {model_path} (name {model_short}) ===")
    prob_csv = os.path.join(OUT_DIR, f"submission_{model_short}_probabilities.csv")
    sub_csv  = os.path.join(OUT_DIR, f"submission_{model_short}.csv")

    # Reuse outputs of an earlier run instead of repeating inference.
    if os.path.exists(prob_csv) and os.path.exists(sub_csv):
        print("Found existing outputs for", model_short, "-> skipping inference.")
        saved_prob_files.append(prob_csv)
        continue

    # Load tokenizer
    try:
        tokenizer = safe_tokenizer_from(model_path)
    except Exception as e:
        print("Skipping model due to tokenizer error:", e)
        continue

    # Load model.  `model` is bound up front so the error path never touches an
    # undefined name (it is deleted at the end of every successful iteration).
    model = None
    try:
        print("Loading model (may be memory heavy)...")
        cfg = AutoConfig.from_pretrained(model_path, local_files_only=True, num_labels=NUM_CLASSES)
        model = AutoModelForSequenceClassification.from_pretrained(
            model_path,
            config=cfg,
            local_files_only=True,
            torch_dtype=DTYPE,
            low_cpu_mem_usage=True,
            device_map="auto" if torch.cuda.is_available() else None
        )
        # Keep the embedding table in sync with the tokenizer (a pad token may have been added).
        model.resize_token_embeddings(len(tokenizer))
        # Every prompt is padded to MAX_LEN.  Per the transformers documentation,
        # sequence-classification heads of decoder-only models locate the last real token
        # through config.pad_token_id and fall back to the final position when it is unset,
        # which would be a padding position here.  Align it with the id used for padding.
        model.config.pad_token_id = tokenizer.pad_token_id
        model.eval()
    except Exception as e:
        print(f"[ERROR] Failed to load model {model_path}: {e}")
        print("Skipping this model to avoid crashing the notebook.")
        # Drop any partially loaded model before moving on.
        model = None
        torch.cuda.empty_cache()
        continue

    # Tokenize (streamed)
    texts = test['text'].astype(str).tolist()
    try:
        input_ids, attention_mask = stream_tokenize_texts(tokenizer, texts, max_len=MAX_LEN, chunk=128)
    except Exception as e:
        print("Tokenization failed:", e)
        del model
        torch.cuda.empty_cache()
        continue

    # Create DataLoader (TensorDataset); shuffle=False keeps outputs aligned with test rows.
    ds = TensorDataset(input_ids, attention_mask)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False)

    # Inputs are moved to the device that holds the model's first parameter.
    device_for_model = next(model.parameters()).device
    print("Running inference on device", device_for_model, "| batches:", len(loader))

    all_logits = []
    try:
        with torch.no_grad():
            for batch in tqdm(loader, desc=f"Infer {model_short}", disable=False):
                ids, att = (t.to(device_for_model) for t in batch)
                outputs = model(input_ids=ids, attention_mask=att)
                logits = outputs.logits
                # Store as float32 numpy on the CPU so nothing accumulates on the GPU.
                all_logits.append(logits.detach().float().cpu().numpy())
    except RuntimeError as e:
        print(f"[ERROR] Runtime error during inference for {model_short}: {e}")
        print("Cleaning GPU and skipping this model.")
        del model
        torch.cuda.empty_cache()
        continue

    # Stack predictions and post-process
    predictions = np.concatenate(all_logits, axis=0)  # shape (n_samples, num_classes)
    probs = softmax(predictions, axis=1)

    # Class indices of every row, most probable first.
    top_indices = np.argsort(-probs, axis=1)
    # Decode all indices in one call, then restore the (n_samples, num_classes) layout.
    flat_idx = top_indices.flatten()
    try:
        decoded_flat = le.inverse_transform(flat_idx)
    except Exception as e:
        print("Label inverse transform failed:", e)
        # fallback: produce dummy class names (shouldn't happen)
        decoded_flat = np.array([f"UNK_{i}" for i in flat_idx])

    decoded_labels = decoded_flat.reshape(top_indices.shape)

    # Save submission file (top-3 class strings)
    top3 = [" ".join(row[:3]) for row in decoded_labels]
    sub_df = pd.DataFrame({
        "row_id": test.row_id.values,
        "Category:Misconception": top3
    })
    sub_df.to_csv(sub_csv, index=False)
    print("Wrote submission csv:", sub_csv)

    # Save probabilities + top classes (top TOP_PRED_PER_MODEL).  prob_j is the probability
    # of the j-th name in top_classes.  The cast to float64 keeps the CSV text identical to
    # converting each value with float().
    n_saved = min(TOP_PRED_PER_MODEL, top_indices.shape[1])
    top_probs = np.take_along_axis(probs, top_indices[:, :n_saved], axis=1).astype(np.float64)
    prob_df = pd.DataFrame({
        "row_id": [int(r) for r in test.row_id.values],
        "top_classes": [" ".join(names[:TOP_PRED_PER_MODEL]) for names in decoded_labels],
    })
    for j in range(n_saved):
        prob_df[f"prob_{j}"] = top_probs[:, j]
    prob_df.to_csv(prob_csv, index=False)
    print("Wrote probabilities csv:", prob_csv)

    saved_prob_files.append(prob_csv)

    # cleanup model memory before the next model is loaded
    del model
    torch.cuda.empty_cache()
    gc.collect()
    print(f"Finished model: {model_short} (saved files).")

# --------------------- ENSEMBLE ---------------------
print("\n=== ENSEMBLING available probability files ===")
print("Found probability files:", saved_prob_files)
if len(saved_prob_files) == 0:
    raise FileNotFoundError("No model probability files found. Run inference on at least one model.")

# Per-model weights keyed by the short model name (see model_name_for_path); models that
# have no entry get weight 1.0.
default_weights = {model_name_for_path(p): 1.0 for p in MODEL_PATHS}
# Example custom weights (change if desired)
# default_weights.update({
#     model_name_for_path(MODEL_PATHS[0]): 1.2,
#     model_name_for_path(MODEL_PATHS[1]): 1.0,
#     model_name_for_path(MODEL_PATHS[2]): 0.8,
# })

# Probability files are named "submission_<model>_probabilities.csv".  Recover <model>
# before the lookup: the weights are keyed by model name, so looking up the file name
# itself would never match and every custom weight would silently fall back to 1.0.
_prob_prefix = "submission_"
_prob_suffix = "_probabilities.csv"
weights_list = []  # one weight per entry of saved_prob_files, in the same order
for p in saved_prob_files:
    key = os.path.basename(p)
    if key.startswith(_prob_prefix) and key.endswith(_prob_suffix):
        key = key[len(_prob_prefix):-len(_prob_suffix)]
    weights_list.append(default_weights.get(key, 1.0))

# helper to load model_map row_id -> {class:prob}
def load_prob_file(path):
    """Load one per-model probability CSV into ``{row_id: {class_name: probability}}``.

    Expected columns:
      * ``row_id``;
      * a column whose name starts with ``top_classes`` holding whitespace-separated
        class names, most probable first (optional);
      * ``prob_<j>`` columns, where ``prob_<j>`` belongs to the j-th class name.

    Without a ``top_classes*`` column the placeholder names ``cls_0``, ``cls_1``, ...
    are used.  A missing or unparsable probability is stored as 0.0 so that no NaN
    reaches the ensemble scores, and a row with an empty class list yields an empty
    mapping.  Probabilities beyond the number of class names are ignored.
    """
    df = pd.read_csv(path)

    # First column that carries the class names, if any.
    top_col = next((c for c in df.columns if c.startswith("top_classes")), None)

    # Probability columns ordered by the number embedded in their name (prob_0, prob_1, ...).
    prob_cols = sorted(
        (c for c in df.columns if c.startswith("prob_")),
        key=lambda c: int(''.join(filter(str.isdigit, c)) or 0),
    )

    model_map = {}
    for _, row in df.iterrows():
        if top_col is None:
            # No names available: fall back to positional pseudo class names.
            names = [f"cls_{i}" for i in range(len(prob_cols))]
        elif pd.isna(row[top_col]):
            # An empty cell is read back as NaN; do not turn it into a class called "nan".
            names = []
        else:
            names = str(row[top_col]).split()

        per = {}
        # zip stops at the shorter sequence, so surplus names or columns are dropped.
        for name, pc in zip(names, prob_cols):
            try:
                value = float(row[pc])
            except (TypeError, ValueError):
                value = 0.0
            # float() accepts NaN without raising, so the fallback is applied explicitly.
            per[name] = 0.0 if math.isnan(value) else value
        model_map[int(row['row_id'])] = per
    return model_map

# Load every probability file.  The weight is recorded together with the map so that a
# file which fails to load cannot shift the weights of the files after it.
model_maps = []
model_weights = []
for m_idx, path in enumerate(saved_prob_files):
    try:
        mm = load_prob_file(path)
    except Exception as e:
        print(f"Error loading prob file {path}: {e}")
        continue
    model_maps.append(mm)
    model_weights.append(weights_list[m_idx] if m_idx < len(weights_list) else 1.0)

if not model_maps:
    raise RuntimeError("None of the probability files could be loaded; cannot build the ensemble.")

# ensemble per-row
n_models = len(model_maps)
all_row_ids = sorted(set().union(*(mm.keys() for mm in model_maps)))
final_preds = {}
for rid in all_row_ids:
    votes = defaultdict(int)         # number of models that listed the class
    total_prob = defaultdict(float)  # sum of weighted probabilities
    max_prob = defaultdict(float)    # largest single weighted probability
    for mm, w in zip(model_maps, model_weights):
        for cls, p in mm.get(rid, {}).items():
            weighted = p * w
            votes[cls] += 1
            total_prob[cls] += weighted
            max_prob[cls] = max(max_prob[cls], weighted)

    # score combination: 60% summed probability, 30% share of models voting, 10% best single model
    scores = {
        cls: total_prob[cls] * 0.6 + (votes[cls] / max(1, n_models)) * 0.3 + max_prob[cls] * 0.1
        for cls in total_prob
    }

    # pick top ENSEMBLE_TOPK (ties keep first-seen order because the sort is stable)
    ranked = sorted(scores.items(), key=lambda x: -x[1])
    topk = [c for c, _ in ranked[:ENSEMBLE_TOPK]]

    # pad if fewer: fill with the first unused classes in label-encoder order
    if len(topk) < ENSEMBLE_TOPK:
        more = [c for c in le.classes_ if c not in topk]
        topk += more[:ENSEMBLE_TOPK - len(topk)]
    final_preds[rid] = " ".join(topk)

# build final submission aligned to test order; a row without any prediction falls back
# to the first ENSEMBLE_TOPK classes in label-encoder order
fallback_pred = " ".join(le.classes_[:ENSEMBLE_TOPK])
out_rows = [
    (int(rid), final_preds.get(int(rid), fallback_pred))
    for rid in test.row_id.values
]

final_sub = pd.DataFrame(out_rows, columns=["row_id", "Category:Misconception"])
final_path = os.path.join(OUT_DIR, "submission.csv")
final_sub.to_csv(final_path, index=False)
print("\nSaved ensemble submission:", final_path)
print(final_sub.head())


2025-09-24 10:08:42.154760: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1758708522.343688      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1758708522.402224      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


DEVICE: cuda DTYPE: torch.float16
MAX_LEN: 256 BATCH_SIZE: 1
Num classes from train: 65

=== Processing model: /kaggle/input/deekseepmath-7b-map-competition/MAP_EXP_09_FULL (name MAP_EXP_09_FULL) ===
Loading model (may be memory heavy)...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Running inference on device cuda:0 | batches: 3


Infer MAP_EXP_09_FULL: 100%|██████████| 3/3 [00:01<00:00,  2.13it/s]


Wrote submission csv: /kaggle/working/submission_MAP_EXP_09_FULL.csv
Wrote probabilities csv: /kaggle/working/submission_MAP_EXP_09_FULL_probabilities.csv
Finished model: MAP_EXP_09_FULL (saved files).

=== Processing model: /kaggle/input/gemma2-9b-it-cv945 (name gemma2-9b-it-cv945) ===
Loading model (may be memory heavy)...
[ERROR] Failed to load model /kaggle/input/gemma2-9b-it-cv945: 'NoneType' object has no attribute 'endswith'
Skipping this model to avoid crashing the notebook.

=== Processing model: /kaggle/input/qwen3-8b-map-competition/MAP_EXP_16_FULL (name MAP_EXP_16_FULL) ===
Loading model (may be memory heavy)...


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Running inference on device cuda:0 | batches: 3


Infer MAP_EXP_16_FULL: 100%|██████████| 3/3 [00:00<00:00,  4.13it/s]


Wrote submission csv: /kaggle/working/submission_MAP_EXP_16_FULL.csv
Wrote probabilities csv: /kaggle/working/submission_MAP_EXP_16_FULL_probabilities.csv
Finished model: MAP_EXP_16_FULL (saved files).

=== ENSEMBLING available probability files ===
Found probability files: ['/kaggle/working/submission_MAP_EXP_09_FULL_probabilities.csv', '/kaggle/working/submission_MAP_EXP_16_FULL_probabilities.csv']

Saved ensemble submission: /kaggle/working/submission.csv
   row_id                             Category:Misconception
0   36696  True_Correct:NA False_Neither:NA False_Misconc...
1   36697  False_Misconception:WNB False_Neither:NA False...
2   36698   True_Neither:NA False_Neither:NA True_Correct:NA
