In [None]:
# !nvcc --version && echo $CUDA_HOME
# !nvidia-smi  # Should show a T4 or V100 GPU

In [12]:
# %%bash
# export FORCE_CMAKE=1
# export CUDACXX=/usr/local/cuda/bin/nvcc
# export CMAKE_ARGS="-DGGML_CUDA=ON -DCMAKE_CUDA_COMPILER=$CUDACXX"
!pip --quiet install --upgrade llama-cpp-python 

In [2]:
# 
import os
import json
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor, as_completed
from llama_cpp import Llama


In [3]:
def load_model():
    # """Load model with Kaggle CPU constraints"""
    # os.environ["GGML_CUDA_FORCE_MMQ"] = "1"
    # os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # Ensure Kaggle's GPU is visible
    
    llm = Llama.from_pretrained(
        repo_id="mradermacher/TxAgent-T1-Llama-3.1-8B-GGUF",
        filename="TxAgent-T1-Llama-3.1-8B.Q8_0.gguf",
        # n_ctx=2048, # default 512
        # n_gpu_layers=35,  # Kaggle GPUs (T4/V100) work best with ~35 layers (avoids OOM)
        n_threads=2,  # Kaggle has 4 CPU cores - limit per-worker threads
        # n_batch=128,  # Smaller batch to fit Kaggle GPU memory
        verbose=False,  # Reduce log spam in Kaggle console
        flash_attn=True,
        # device='cuda'
    )
    # Verify GPU is detected
    # if not llm.ctx.is_using_gpu:
    #     print("GPU not detected! Check CUDA installation.")
    # print("Successfully using GPU acceleration!")

    
    return llm

In [4]:
def process_chunk(chunk, llm):
    """Process a single chunk of samples on Kaggle GPU"""
    chunk_results = []
    for sample in chunk:
        try:
            prompt = f"""Please answer the question: {sample['question']}. These are the options {sample['options']}."""
            
            messages = [
                {"role": "system", "content": "You are a reasoning clinical assistant."},
                {"role": "user", "content": prompt}
            ]
            
            # Small delay to prevent GPU overload on Kaggle
            time.sleep(0.2)
            
            output = llm.create_chat_completion(
                messages=messages,
                # max_tokens=64,
                # temperature=0.0,1
                stream=False
            )
            
            chunk_results.append({
                'id': sample['id'],
                'llm_answer': output['choices'][0]['message']['content']
            })
            print(f"Processed sample {sample['id']}")
            
        except Exception as e:
            print(f"Error in sample {sample['id']}: {str(e)}")
            chunk_results.append({'id': sample['id'], 'error': str(e)})
    
    return chunk_results

def split_into_chunks(data, num_chunks):
    """Split data into equal chunks for parallel processing"""
    chunk_size = len(data) // num_chunks
    chunks = [data[i:i+chunk_size] for i in range(0, len(data), chunk_size)]
    # Handle remaining samples
    if len(chunks) > num_chunks:
        chunks[num_chunks-1].extend(chunks[num_chunks:])
        chunks = chunks[:num_chunks]
    return chunks

In [5]:
def process_sample(sample):
    """Process single sample - model loaded per worker (Kaggle-safe)"""
    llm = load_model()  # Load model once per worker (Kaggle memory-friendly)
    
    prompt = f"""Please answer the question: {sample['question']}. These are the options {sample['options']}."""
    
    messages = [
        {"role": "system", "content": "You are a reasoning clinical assistant."},
        {"role": "user", "content": prompt}
    ]
    
    output = llm.create_chat_completion(
        messages=messages,
        # max_tokens=128,  # Further reduced for Kaggle speed (clinical Q&A is concise)
        # temperature=0.0,
        stream=False,  # Disable streaming (faster in Kaggle)
    )
    
    return {
        'id': sample['id'],
        'question_type': sample['question_type'],
        'question': sample['question'],
        'options': sample['options'],
        'llm_answer': output['choices'][0]['message']['content'],
        'output': str(output)
    }

In [7]:
def main(data, output_json_path, num_chunks=2):  # Kaggle-safe parallelism
    # Load existing results (Kaggle working directory)
    if os.path.exists(output_json_path):
        with open(output_json_path, 'r', encoding='utf-8') as f:
            results = json.load(f)
    else:
        results = []
    
    # Filter already processed samples
    processed_ids = {r['id'] for r in results}
    to_process = [s for s in data if s['id'] not in processed_ids]
    print(f"Need to process {len(to_process)} samples split into {num_chunks} chunks")

    # Split data into chunks
    chunks = split_into_chunks(to_process, num_chunks)

    # Load model once (shared across threads)
    print("Loading model on Kaggle CPU...")
    llm = load_model()
    # time.sleep(3)  # Let CUDA initialize fully on Kaggle

    # Process chunks in parallel with Kaggle GPU limits
    with ThreadPoolExecutor(max_workers=num_chunks) as executor:
        futures = [executor.submit(process_chunk, chunk, llm) for chunk in chunks]
        
        # Collect results
        for future in futures:
            chunk_results = future.result()
            results.extend(chunk_results)
            
            # Save progress (Kaggle persists /kaggle/working/)
            with open(output_json_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=4, ensure_ascii=False)
            print(f"Completed chunk. Total processed: {len(results)}")

    # Final save
    with open(output_json_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4, ensure_ascii=False)
    print(f"All done! Total results: {len(results)}")

In [11]:
def main(data, output_json_path, max_workers=1):  # 1 worker for Kaggle GPU safety
    # Load existing results (Kaggle saves to /kaggle/working/)
    if os.path.exists(output_json_path):
        with open(output_json_path, 'r', encoding='utf-8') as f:
            results = json.load(f)
    else:
        results = []
    
    processed_ids = {r['id'] for r in results}
    to_process = [s for s in data if s['id'] not in processed_ids]
    print(f"Need to process {len(to_process)} samples")

    # Process in parallel with Kaggle constraints
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(process_sample, sample): sample for sample in to_process}

        for future in as_completed(futures):
            sample = futures[future]
            try:
                result = future.result()
                results.append(result)
                print(f"Processed sample {result['id']}")

                # Save to Kaggle working directory (persists between sessions)
                with open(output_json_path, 'w', encoding='utf-8') as f:
                    json.dump(results, f, indent=4, ensure_ascii=False)
            except Exception as e:
                print(f"Error processing sample {sample['id']}: {e}")
                # Save error to results for debugging
                results.append({
                    'id': sample['id'],
                    'error': str(e)
                })
                with open(output_json_path, 'w', encoding='utf-8') as f:
                    json.dump(results, f, indent=4, ensure_ascii=False)

In [15]:
data_path = "/kaggle/input/curebench-data/curebench_testset_phase1.jsonl"
data = []
with open(data_path, 'r') as f:
    for line in f:
        obj = json.loads(line)  # parse each JSON object from the line
        data.append(obj)

# Kaggle working directory (where outputs are saved)
output_path = "/kaggle/working/curebench_results.json"


In [17]:
# temp = process_sample(data[42])
# data

In [None]:
main(data, output_path, max_workers=2)

Need to process 230 samples


llama_context: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_context: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Processed sample QBzcFcVzpisx


llama_context: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Processed sample lMCGiMsPyJct


llama_context: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Processed sample OUTPs0m2REUz


llama_context: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


Processed sample OhzVvNLVpXly


llama_context: n_ctx_per_seq (512) < n_ctx_train (131072) -- the full capacity of the model will not be utilized
llama_kv_cache_unified: LLAMA_SET_ROWS=0, using old ggml_cpy() method for backwards compatibility


In [None]:

# llm = Llama.from_pretrained(
# 	repo_id="mradermacher/TxAgent-T1-Llama-3.1-8B-GGUF",
# 	filename="TxAgent-T1-Llama-3.1-8B.Q8_0.gguf",
#     # n_ctx=512
# )


In [None]:
# import json

# file_path = '/kaggle/input/curebench-data/curebench_testset_phase1.jsonl'
# output_json_path = '/kaggle/working/llm_answers_test.json'

# data = []
# with open(file_path, 'r') as f:
#     for line in f:
#         obj = json.loads(line)  # parse each JSON object from the line
#         data.append(obj)

In [None]:
# data[0].keys()

In [None]:
# data[0]['question']
# data[0]['options']

In [None]:
# IDX = 0

# prompt = f"""Please answer the question: {data[IDX]['question']}. These are the options {data[IDX]['options']}"""

# messages = [
#             {"role": "system", "content": "You are a reasoning clinical assistant."},
#             {"role": "user", "content": prompt}
#         ]

In [None]:
# # %%timeit
# output = llm.create_chat_completion(
#     messages = messages
# )

In [None]:
# %%timeit
# output

In [None]:
# output['choices'][0]['message']['content'].split('[FinalAnswer]')

In [None]:
# # Load existing results if file exists
# if os.path.exists(output_json_path):
#     with open(output_json_path, 'r', encoding='utf-8') as f:
#         results = json.load(f)
# else:
#     results = []

# processed_ids = {r['id'] for r in results}  # track already done samples

# for idx, sample in enumerate(data):
#     if sample['id'] in processed_ids:
#         continue  # skip already processed samples
    
#     prompt = f"""Please answer the question: {sample['question']}. These are the options {sample['options']}."""

#     messages = [
#         {"role": "system", "content": "You are a reasoning clinical assistant."},
#         {"role": "user", "content": prompt}
#     ]

#     output = llm.create_chat_completion(messages=messages, max_tokens=512)
#     llm_answer = output['choices'][0]['message']['content']

#     result = {
#         'id': sample['id'],
#         'question_type': sample['question_type'],
#         'question': sample['question'],
#         'options': sample['options'],
#         'llm_answer': llm_answer
#     }
#     results.append(result)

#     # Write after each processed sample (or every N samples for speed)
#     with open(output_json_path, 'w', encoding='utf-8') as f:
#         json.dump(results, f, indent=4, ensure_ascii=False)

In [None]:
results[0]

In [None]:
# with open(output_json_path, 'w', encoding='utf-8') as f_json:
#     json.dump(results, f_json, indent=4, ensure_ascii=False)

# print(f"Results written to JSON file: {output_json_path}")

In [None]:
# !python -m pip install --quiet --no-index -v --find-links=/kaggle/input/aimo-packages/offline_packages trl --pre

In [None]:
# !python -m pip install --quiet --no-index -v --find-links=/kaggle/input/aimo-packages/offline_packages -U bitsandbytes
# !python -m pip install --no-index -v --find-links=/kaggle/input/aimo-packages/offline_packages levenshtein

In [None]:
# !python -m pip install --quiet --no-index -v --find-links=/kaggle/input/aimo-packages/offline_packages transformers

In [None]:
# !python /kaggle/input/curebench-code/run.py --config /kaggle/input/curebench-code/metadata_config_test.json

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1,2,3"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from datasets import load_dataset,Dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel
from trl import GRPOConfig, GRPOTrainer

import datetime

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig,
    PrinterCallback,
)
from tqdm import tqdm
import torch
import time
import transformers
import pandas as pd
import numpy as np

transformers.set_seed(42)

In [None]:
class CFG:
    MAX_TRAIN = 100
    MAX_TOKENS = 2048
    NUM_GENERATIONS = 4
    USE_PEFT = True
    BATCH_SIZE=1
    MAX_STEPS = 80
    
    BETA = 0.04
    LR = 1.e-5
    
    MODEL_NAME = '/kaggle/input/deepseek-r1/transformers/deepseek-r1-distill-qwen-1.5b/2'
    splitter = '<｜Assistant｜>'
    
    step_count=10
    DEBUG = False

In [None]:
import re

def extract_boxed_text(text):
    pattern = r'oxed{(.*?)}'
    matches = re.findall(pattern, text)
    if not matches:
        return ""
    for match in matches[::-1]:
        if match != "":
            return match
    return ""

In [None]:
def create_prompt(sample):
    question = sample['problem']
    chat = [{"role": "system", "content": "A conversation between User and Assistant. The user asks a question, and the Assistant solves it.  The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>"},
            {"role": "user", "content": question + ' Return final answer within \\boxed{}, after taking modulo 1000.'},]
    sample['prompt'] = tokenizer.apply_chat_template(
            conversation=chat,
            tokenize=False,
            add_generation_prompt=True
        )
    return sample

In [None]:
## We would also want a reward function based on accuracy
# split after </think>, then get the answer within bbox

## We can also do a reward based on Similarity of 

import re

def format_reward_func(completions, **kwargs):
    """Reward function that checks if the completion has a specific format."""
    pattern = r"^<think>.*?</think>.*?oxed{(.*?)}.*?$"
    matches = [re.match(pattern, content, re.DOTALL) for content in completions]
    return [1.0 if match else 0.0 for match in matches]


def extract_boxed_text(text):
    pattern = r'oxed{(.*?)}'
    matches = re.findall(pattern, text)
    if not matches:
        return ""
    for match in matches[::-1]:
        if match != "":
            return match
    return ""

def accuracy_reward_func(completions, answer, **kwargs):
    # Regular expression to capture content inside \boxed{}
    contents = [extract_boxed_text(completion) for completion in completions]
    # Reward 1 if the content is the same as the ground truth, 0 otherwise
    return [1.0 if c == str(gt) else 0.0 for c, gt in zip(contents, answer)]

In [None]:

device_map = 'auto'
if CFG.USE_PEFT:
    compute_dtype = getattr(torch, "float16")
    bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type='nf4',
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=False,
        )
    original_model = AutoModelForCausalLM.from_pretrained(CFG.MODEL_NAME, 
                                                          device_map=device_map,
                                                          quantization_config=bnb_config,
                                                          trust_remote_code=True
                                                         )
    tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME,
                                              trust_remote_code=True,
                                              # padding_side="left"
                                             )
else:
    original_model = AutoModelForCausalLM.from_pretrained(CFG.MODEL_NAME, 
                                                          device_map=device_map,
                                                          trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME,
                                              trust_remote_code=True,
                                              # padding_side="left"
                                             )

In [None]:
pip install -U bitsandbytes

In [None]:
def gen(model, text, max_tokens):
    model_input = tokenizer(text, return_tensors='pt').to(model.device)
    model.eval()
    with torch.no_grad():
        tok = model.generate(**model_input, max_new_tokens=max_tokens, pad_token_id=tokenizer.pad_token_type_id)
        outputs = []
        for i in range(len(tok)):
            res = tokenizer.decode(tok[i], skip_special_tokens=True)
            output = res.split(CFG.splitter)[-1]
            outputs.append(output)
        return outputs[0] if len(outputs) == 1 else outputs

In [None]:
def evaluate_rewards(model, dataset, reward_functions: dict[str, callable], max_tokens: int, num_generations: int):
    completions = []
    other_info = []
    for example in tqdm(dataset):
        txt = example['prompt']
        kw = {k: v for k, v in example.items() if k not in {'prompt', 'completion'}}
        for _ in range(num_generations):
            other_info.append(kw)
            
        completion = gen(model, [txt]*num_generations, max_tokens)
        if isinstance(completion, str):
            completions.append(completion)
        else:
            completions += completion
        
    kwargs = {k: [d[k] for d in other_info] for k in other_info[0].keys()}
    res = {}
    for nm, reward_func in reward_functions.items():
        v = reward_func(completions=completions, **kwargs)
        print(nm, np.mean(v))
        res[nm] = np.mean(v)
    return res

In [None]:
reward_functions = {'formatting': format_reward_func, 'accuracy': accuracy_reward_func, 'solution_quality': levenshtein_reward_func}

In [None]:
# if not CFG.DEBUG:
#     original_rewards = evaluate_rewards(model=original_model, dataset=dataset['test'], reward_functions=reward_functions, max_tokens=CFG.MAX_TOKENS, num_generations=CFG.NUM_GENERATIONS)


In [None]:

# dtstr = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
# output_directory=f"./DEEPSEEK-GRPO-{dtstr}"


# training_args = GRPOConfig(
#     output_dir=output_directory,
    
#     learning_rate=CFG.LR,
    
#     per_device_train_batch_size=CFG.BATCH_SIZE,
    
#     gradient_accumulation_steps=1,
#     max_steps=CFG.MAX_STEPS,
    
#     max_completion_length=CFG.MAX_TOKENS,  #8192
#     num_generations=CFG.NUM_GENERATIONS,
#     beta=CFG.BETA,
    
#     logging_steps=CFG.step_count,
#     logging_dir="./logs",
#     save_strategy="steps",
#     save_steps=CFG.step_count,
# #     eval_strategy="steps",
# #     eval_steps=CFG.step_count,
# #     do_eval=True,
#     # gradient_checkpointing=True,  # Will crash the whole thing
#     report_to="none",
#     overwrite_output_dir = 'True',    
# )

# # Will typically use the AdamW optimizer


In [None]:
# if CFG.USE_PEFT:
#     peft_config = LoraConfig(
#         r=32, #Rank
#         lora_alpha=32,
#         target_modules=[
#             'q_proj',
#             'k_proj',
#             'v_proj',
#             'dense'
#         ],
#         bias="none",
#         lora_dropout=0.05,  # Conventional
#         task_type="CAUSAL_LM",
#     )
#     trainer = GRPOTrainer(
#         model=original_model,
#         reward_funcs=list(reward_functions.values()),
#         args=training_args,
#         train_dataset=dataset['train'],
#         peft_config=peft_config,
#         callbacks=[PrinterCallback()]
#     )
# else:
#     trainer = GRPOTrainer(
#         model=original_model,
#         reward_funcs=list(reward_functions.values()),
#         args=training_args,
#         train_dataset=dataset['train'],
#         callbacks=[PrinterCallback()]
#     )


In [None]:
# trainer.train()

In [None]:
# if CFG.USE_PEFT:
#     print('Loading trained model')
#     CHKPT = CFG.MAX_STEPS
#     adapter_model_name = f'{output_directory}/checkpoint-{CHKPT}/'
#     new_model = PeftModel.from_pretrained(original_model, adapter_model_name)
# else:
#     new_model = original_model

In [None]:
# rewards = evaluate_rewards(model=new_model, dataset=dataset['test'], reward_functions=reward_functions, max_tokens=CFG.MAX_TOKENS, num_generations=CFG.NUM_GENERATIONS)
# rewards