In [1]:
# Offline dependency setup: each package is upgraded (-U) quietly (-q) from the local
# wheel directory ../input/common-pip/ only; --no-index prevents pip from contacting PyPI.
!pip install -q -U peft --no-index --find-links ../input/common-pip/
!pip install -q -U accelerate --no-index --find-links ../input/common-pip/
!pip install -q -U bitsandbytes --no-index --find-links ../input/common-pip/
!pip install -q -U transformers --no-index --find-links ../input/common-pip/
!pip install -q -U sentencepiece --no-index --find-links ../input/common-pip/

In [2]:
%%writefile run_LLM_inference.py
import pandas as pd
import numpy as np
import torch
import argparse
import os
import yaml
from ast import literal_eval
from dataclasses import dataclass
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorWithPadding,
    EvalPrediction,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    LlamaForSequenceClassification,
    Trainer, 
    TrainingArguments
)
from scipy.special import expit, softmax
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, PeftModelForSequenceClassification
from datasets import load_dataset, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score

import gc

from accelerate import Accelerator

# Module-level Accelerator instance, created at import time and used by main().
accelerator = Accelerator()
# Sets the WANDB_DISABLED environment variable for this process.
# NOTE(review): presumably meant to stop the Trainer from starting a Weights & Biases run — confirm.
os.environ["WANDB_DISABLED"] = "true"

def process_prompt(input_str):
    """Flatten a stringified list of quoted segments into one space-joined string.

    The input is handled purely as text: leading/trailing '[' and ']' characters
    are stripped, the remainder is split on the literal separator '","', and
    leading/trailing double quotes are removed from every piece. No escape
    sequences are decoded.

    Args:
        input_str: Raw string, e.g. '["first","second"]'.

    Returns:
        The segments joined by a single space.
    """
    inner = input_str.strip('[]')
    segments = []
    for chunk in inner.split('","'):
        segments.append(chunk.strip('"'))
    return " ".join(segments)

def prepare_tokenized(data, tokenizer, max_length=2048, spread_max_length=False):
    """Build ``input_ids`` and ``attention_mask`` for a batch of prompt/response pairs.

    Each row is rendered as ``prompt + response_a + response_b`` using one of two
    text templates, chosen by whether ``tokenizer`` is a ``GemmaTokenizerFast``.

    Args:
        data: Batch mapping with 'prompt', 'response_a' and 'response_b' entries,
            each an iterable of strings. It is updated in place.
        tokenizer: Callable tokenizer; called with a list of strings plus
            ``max_length``, ``truncation`` and ``padding`` keyword arguments, and
            expected to return an object exposing ``input_ids`` (and, when
            ``spread_max_length`` is False, ``attention_mask``).
        max_length: Overall length budget for one row.
        spread_max_length: If True, the three segments are tokenized separately
            with per-segment token budgets and their ids concatenated. If False,
            the segments are first cut by character count, joined, and tokenized
            once with ``max_length``.

    Returns:
        ``data`` itself, with 'input_ids' and 'attention_mask' added.
    """
    if isinstance(tokenizer, GemmaTokenizerFast):
        prompt = ["<prompt>: " + p for p in data['prompt']]
        response_a = ["\n\n<response_a>: " + r_a for r_a in data['response_a']]
        response_b = ["\n\n<response_b>: " + r_b for r_b in data['response_b']]
    else:
        prompt = ["User prompt: " + p for p in data['prompt']]
        response_a = ["\n\nModel A: \n" + r_a for r_a in data['response_a']]
        response_b = ["\n\nModel B: \n" + r_b for r_b in data['response_b']]

    if spread_max_length:
        # Token budgets: a quarter of max_length for the prompt, the remainder split
        # evenly between the two responses. These names used to be assigned only in
        # the other branch, so this path raised UnboundLocalError.
        prompt_len = max_length // 4
        response_len = (max_length - prompt_len) // 2

        # NOTE(review): each segment is tokenized on its own, so any special tokens
        # the tokenizer adds will appear once per segment — confirm this is intended.
        prompt = tokenizer(prompt, max_length=prompt_len, truncation=True, padding=False).input_ids
        response_a = tokenizer(response_a, max_length=response_len, truncation=True, padding=False).input_ids
        response_b = tokenizer(response_b, max_length=response_len, truncation=True, padding=False).input_ids
        input_ids = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        attention_mask = [[1] * len(ids) for ids in input_ids]
    else:
        # Character-level pre-truncation; "<cont>" marks text that was cut.
        prompt_len = max_length // 4
        prompt = [p[:prompt_len] + "<cont>" if len(p) > prompt_len else p for p in prompt]

        # Per-row character budget for each response, based on the (cut) prompt length.
        response_len = [(max_length - len(p)) // 2 for p in prompt]
        response_a = [r[:rl] + "<cont>" if len(r) > rl else r for r, rl in zip(response_a, response_len)]
        response_b = [r[:rl] + "<cont>" if len(r) > rl else r for r, rl in zip(response_b, response_len)]

        text = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = tokenizer(text, max_length=max_length, truncation=True, padding=False)
        input_ids = tokenized.input_ids
        attention_mask = tokenized.attention_mask
    data.update({'input_ids': input_ids, 'attention_mask': attention_mask})
    return data

def main(args):
    """Run LoRA-adapted sequence-classification inference on the test set and save raw logits.

    Args:
        args: Parsed command-line namespace providing ``base_model_path``,
            ``lora_path``, ``max_length``, ``batch_size`` and ``save_dir``
            (despite the name, ``save_dir`` is the path of the output CSV file).

    Side effects:
        Reads the competition test CSV and, on the main process only, writes a
        CSV with columns ``id``, ``logits_0``, ``logits_1`` and ``logits_2``.
    """
    test_df = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/test.csv")

    accelerator.print(f'Test csv shape: {test_df.shape}')

    ## Clean text columns
    for col in ('prompt', 'response_a', 'response_b'):
        test_df[col] = test_df[col].fillna("").apply(process_prompt)

    # Order rows longest-first by combined character length.
    # NOTE(review): presumably so that batches padded to "longest" group rows of
    # similar size — confirm.
    test_df['text'] = test_df['prompt'] + test_df['response_a'] + test_df['response_b']
    test_df['prompt_len'] = test_df['text'].apply(len)
    test_df = test_df.sort_values(by='prompt_len', ascending=False).reset_index(drop=True)

    ## Load tokenizer
    is_gemma = "gemma" in args.base_model_path
    if is_gemma:
        tokenizer = GemmaTokenizerFast.from_pretrained(args.base_model_path)
    else:
        tokenizer = AutoTokenizer.from_pretrained(args.base_model_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    tokenizer.truncation_side = "left"

    test_ds = Dataset.from_list(test_df[['prompt', 'response_a', 'response_b']].to_dict('records'), split="test")

    test_tokenized_ds = test_ds.map(
        prepare_tokenized,
        batched=True,
        fn_kwargs={"max_length": args.max_length, "tokenizer": tokenizer, "spread_max_length": False},
        remove_columns=test_ds.column_names,
    )

    ## Load Model
    # Only options that BitsAndBytesConfig defines are passed. The former
    # ``bnb_8bit_use_double_quant`` / ``bnb_8bit_compute_dtype`` arguments are not
    # parameters of the config and were reported as unused kwargs.
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_skip_modules=["score", "correctness", "classifier", "model_pred1", "model_pred2"]
    )

    # Both architectures are loaded with identical arguments; only the class differs.
    model_cls = Gemma2ForSequenceClassification if is_gemma else LlamaForSequenceClassification
    base_model = model_cls.from_pretrained(
        args.base_model_path,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,
        trust_remote_code=True,
        problem_type="single_label_classification",
        num_labels=3
    )
    base_model.config.pad_token_id = tokenizer.pad_token_id
    base_model.config.use_cache = False

    # NOTE(review): this is a training-preparation helper; it is kept so the base
    # model is set up exactly as before — confirm whether it is needed for inference.
    base_model = prepare_model_for_kbit_training(base_model)

    model = PeftModelForSequenceClassification.from_pretrained(base_model, args.lora_path, is_trainable=False)

    ## Trainer Setup
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding="longest", return_tensors="pt")

    training_args = TrainingArguments(
        output_dir="tmp",
        per_device_eval_batch_size=args.batch_size,
        remove_unused_columns=False,
        batch_eval_metrics=True,
        # The string "none" disables all reporting integrations; passing None
        # falls back to the library default instead of turning reporting off.
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    ## predictor
    pred_output = trainer.predict(test_tokenized_ds, ignore_keys=["past_key_values"])
    logits = pred_output.predictions.astype(float).reshape(-1, 3)

    results = pd.DataFrame({
        "id": test_df['id'].values,
        "logits_0": logits[:, 0],
        "logits_1": logits[:, 1],
        "logits_2": logits[:, 2],
    })

    # Every launched process reaches this point; let only the main one write so
    # that several processes do not write the same file at once.
    if accelerator.is_main_process:
        results.to_csv(args.save_dir, index=False)
    
    
if __name__ == "__main__":
    # Command-line entry point: four required options plus an optional output
    # path, handed to main() as a parsed namespace.
    parser = argparse.ArgumentParser()
    required_options = (
        ('--base_model_path', str),
        ('--lora_path', str),
        ('--max_length', int),
        ('--batch_size', int),
    )
    for flag, value_type in required_options:
        parser.add_argument(flag, type=value_type, required=True)
    parser.add_argument('--save_dir', type=str, default="submission.csv")

    main(parser.parse_args())

Writing run_LLM_inference.py


In [3]:
# First model: run run_LLM_inference.py on 2 processes (multi-GPU, fp16 mixed precision)
# with the base model and LoRA adapter paths below, max_length 1536 and batch size 2.
# The script writes its logits to out1.csv.
!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 run_LLM_inference.py \
--lora_path "/kaggle/input/gemma2-8b-it-lora-2k-fold2-ft/" \
--base_model_path "/kaggle/input/gemma2-9b-it-base/" \
--max_length 1536 \
--batch_size 2 \
--save_dir "out1.csv"

2024-08-04 18:56:10.749459: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 18:56:10.749459: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 18:56:10.749530: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 18:56:10.749575: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 18:56:10.860303: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register fac

In [4]:
# Second model: same script and launch settings, with a different base model and
# LoRA adapter, max_length 2048 and batch size 8. The script writes its logits to out2.csv.
!accelerate launch --multi_gpu --mixed_precision=fp16 --num_processes=2 run_LLM_inference.py \
--lora_path "/kaggle/input/llama3-8b-it-lora-2k-fold2-ft/" \
--base_model_path "/kaggle/input/llama3-8b-it-base/" \
--max_length 2048 \
--batch_size 8 \
--save_dir "out2.csv"

2024-08-04 19:00:03.989110: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 19:00:03.989172: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-04 19:00:03.990619: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-04 19:00:04.005657: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-04 19:00:04.005705: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register fa

In [5]:
import pandas as pd
import numpy as np
from scipy.special import expit, softmax

# Blend the raw logits of the two runs and convert them to class probabilities.
df1 = pd.read_csv("out1.csv")
df2 = pd.read_csv("out2.csv")

logit_cols = ['logits_0', 'logits_1', 'logits_2']

# Pair rows by id instead of trusting that both files share the same row order.
missing_ids = ~df1['id'].isin(df2['id'])
if missing_ids.any():
    raise ValueError(f"out2.csv has no predictions for {int(missing_ids.sum())} id(s) present in out1.csv")
# reindex() reorders df2 to df1's id order (and raises if df2 contains duplicate ids).
df2 = df2.set_index('id').reindex(df1['id']).reset_index()

best_w = 0.55     # weight on the first run's logits; the second run gets 1 - best_w
best_temp = 0.9   # temperature the blended logits are divided by before softmax
logits1 = df1[logit_cols].values
logits2 = df2[logit_cols].values
logits = best_w*logits1 + (1-best_w)*logits2
probs = softmax(logits/best_temp, 1).astype(np.float64)

results_df = pd.DataFrame({
    "id": df1['id'].values,
    "winner_model_a": probs[:, 0],
    "winner_model_b": probs[:, 1],
    "winner_tie": probs[:, 2],
})

results_df.to_csv("submission.csv", index=False)

In [6]:
# results_df = pd.DataFrame({
#     "id": df1['id'].values,
#     "winner_model_a": probs[:, 0].astype(np.float64),
#     "winner_model_b": probs[:, 1].astype(np.float64),
#     "winner_tie": probs[:, 2].astype(np.float64),
# })

In [7]:
# print("\nData type of column 'B':", results_df['winner_tie'].dtype)

In [8]:
# import pandas as pd
# pd.read_csv("submission.csv").head()

In [9]:
# import pandas as pd
# sample = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/sample_submission.csv')

In [10]:
# type(results_df['winner_model_a'][0])