In [1]:
import gc
import time
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field

import numpy as np
import pandas as pd
import sklearn
import torch
from datasets import load_from_disk
from peft import PeftModel
from sklearn.metrics import accuracy_score
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning

# Quantization settings handed to `from_pretrained` when the evaluation models
# are loaded: 4-bit loading with the "nf4" quant type, double quantization
# switched on, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@torch.no_grad()
def inference(df, model, tokenizer, device, batch_size):
    """Score every row of ``df`` with ``model`` and store the class-1 probability.

    Rows are consumed in consecutive slices of ``batch_size``. For each slice
    the ``input_ids`` / ``attention_mask`` lists are padded to the longest
    sequence in the slice, moved to ``device`` and passed to ``model`` with
    gradients disabled and CUDA autocast enabled.

    Args:
        df: DataFrame with ``input_ids`` and ``attention_mask`` columns, each
            cell holding one already-tokenized sequence.
        model: Callable accepting the padded batch as keyword arguments and
            returning an object with a ``logits`` attribute whose last
            dimension has at least two entries.
        tokenizer: Tokenizer forwarded to ``pad_without_fast_tokenizer_warning``
            to pad each batch.
        device: Device the padded batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass; must be positive.

    Returns:
        The same ``df`` object that was passed in. It is modified in place: a
        ``winner`` column holding the softmax probability of class index 1 is
        added (or overwritten). Pass a copy if the original must stay intact.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    winners = []

    # `torch.autocast(device_type="cuda")` is the supported spelling of the
    # deprecated `torch.cuda.amp.autocast()`. The context is created inside the
    # function so every call gets its own context-manager object instead of
    # sharing one decorator instance between concurrent callers.
    # NOTE(review): no dtype is passed, so autocast uses its default for CUDA,
    # while the notebook's quantization config computes in bfloat16 — confirm
    # that this combination is intended.
    with torch.autocast(device_type="cuda"):
        for start_idx in range(0, len(df), batch_size):
            # `iloc` clamps the slice end at len(df), so the final batch may
            # simply be shorter than `batch_size`.
            batch = df.iloc[start_idx:start_idx + batch_size]
            inputs = pad_without_fast_tokenizer_warning(
                tokenizer,
                {
                    "input_ids": batch["input_ids"].to_list(),
                    "attention_mask": batch["attention_mask"].to_list(),
                },
                padding="longest",
                pad_to_multiple_of=None,
                return_tensors="pt",
            )
            outputs = model(**inputs.to(device))
            proba = outputs.logits.softmax(-1).cpu()

            # Keep only the probability assigned to class index 1.
            winners.extend(proba[:, 1].tolist())

    df["winner"] = winners
    return df

def format_label(winner, reverse=False, bidirect=False):
    """Turn a winner name into an integer label.

    ``"model_a"`` maps to 0 and every other value maps to 1.

    Args:
        winner: Winner name; only equality with ``"model_a"`` is checked.
        reverse: When truthy (and ``bidirect`` is falsy), return the flipped
            label instead (1 for ``"model_a"``, 0 otherwise).
        bidirect: When truthy, return ``[label, flipped_label]`` and ignore
            ``reverse``.

    Returns:
        An ``int``, or a two-element list of ``int`` when ``bidirect`` is set.
    """
    label = 0 if winner == "model_a" else 1
    flipped = 1 - label

    if bidirect:
        return [label, flipped]
    return flipped if reverse else label

  @torch.cuda.amp.autocast()


In [3]:
def _load_quantized_classifier(model_path, device):
    """Load the sequence-classification checkpoint at ``model_path`` onto ``device`` using ``bnb_config``."""
    return AutoModelForSequenceClassification.from_pretrained(
        model_path,
        device_map=device,
        use_cache=False,
        # torch_dtype=torch.bfloat16
        quantization_config=bnb_config,
    )


for FOLD in tqdm(range(1)):
    print("START ", FOLD)

    # Release models left over from a previous fold (or a previous run of this
    # cell) *before* loading new ones. Python evaluates the right-hand side of
    # `model_0 = ...` before rebinding the name, so otherwise the old and the
    # new weights would both be resident on the GPU while loading.
    model_0 = model_1 = None
    gc.collect()              # break reference cycles that may still hold CUDA tensors
    torch.cuda.empty_cache()  # hand cached, now-unused blocks back to the driver

    @dataclass
    class Config:
        # Annotated so that @dataclass actually registers these as fields.
        fold: int = FOLD
        # model_path = f"/group-volume/binfeng/wsdm/ckpt/qwen14b_soft_ft/fold{fold}/checkpoint-2422"
        model_path: str = "/group-volume/binfeng/wsdm/ckpt/qwen14b_tie"
        eval_data_path: str = f"/group-volume/binfeng/wsdm/stage_qft/dataset/tokenized_qwen14b/ft_val_fold{fold}"
        tokenizer_path: str = model_path
        max_length: int = 3000
        max_prompt_length: int = 400
        # One batch size per length bucket: [length > 1024, length <= 1024].
        batch_size_list: list = field(default_factory=lambda: [32, 32])

    cfg = Config()

    ## Prepare model and Tokenizer
    tokenizer = AutoTokenizer.from_pretrained(cfg.tokenizer_path)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = 'right'

    # One full copy of the model per GPU so two halves of the data can be
    # scored concurrently.
    device_0 = torch.device('cuda:0')
    model_0 = _load_quantized_classifier(cfg.model_path, device_0)
    # model_0 = PeftModel.from_pretrained(model_0, cfg.phi_lora_dir)
    # model_0.score = torch.nn.Linear(in_features=3584, out_features=2, bias=False).to(device_0)

    device_1 = torch.device('cuda:1')
    model_1 = _load_quantized_classifier(cfg.model_path, device_1)
    # model_1 = PeftModel.from_pretrained(model_1, cfg.phi_lora_dir)

    # Explicit copy: new columns are added below, which should not happen on a
    # column-subset view of the loaded frame.
    data = load_from_disk(cfg.eval_data_path).to_pandas()
    data = data[["input_ids", "attention_mask", "labels"]].copy()
    data["length"] = data["input_ids"].apply(len)
    # Remember the original row order so predictions can be re-aligned later.
    data['index'] = np.arange(len(data), dtype=np.int32)
    data = data.sort_values("length", ascending=False)

    # Bucket 0: long sequences, bucket 1: short ones (matches batch_size_list).
    data_dict = {}
    data_dict[0] = data[data["length"] > 1024].reset_index(drop=True)
    data_dict[1] = data[data["length"] <= 1024].reset_index(drop=True)

    bucket_results = []
    for i, batch_size in enumerate(cfg.batch_size_list):
        if len(data_dict[i]) == 0:
            continue
        # Rows are sorted by length, so taking alternating rows gives each GPU
        # a similar length distribution. Copies are passed because `inference`
        # writes the `winner` column into the frame it receives.
        sub_1 = data_dict[i].iloc[0::2].copy()
        sub_2 = data_dict[i].iloc[1::2].copy()

        with ThreadPoolExecutor(max_workers=2) as executor:
            # Materialize inside the `with` so worker exceptions surface here.
            results = list(
                executor.map(
                    inference,
                    (sub_1, sub_2),
                    (model_0, model_1),
                    (tokenizer, tokenizer),
                    (device_0, device_1),
                    (batch_size, batch_size),
                )
            )

        bucket_results.append(pd.concat(results, axis=0))

    if not bucket_results:
        raise ValueError(f"No evaluation rows found in {cfg.eval_data_path}")

    # Restore the original dataset order.
    result_df = pd.concat(bucket_results).sort_values('index').reset_index(drop=True)

    submission_df = result_df[['labels', 'winner']].copy()
    submission_df['winner'] = np.where(submission_df['winner'] > 0.5, 'model_b', 'model_a')
    # NOTE(review): the accuracy below assumes `labels` uses 0 for model_a and
    # 1 for model_b, matching `format_label` — confirm against the tokenization step.
    submission_df["pred"] = submission_df['winner'].apply(format_label)

    print(f"========Eval FOLD {cfg.fold}=========")
    print(f"Eval result for fold {cfg.fold}: ", accuracy_score(submission_df["labels"], submission_df["pred"]))

  0%|          | 0/1 [00:00<?, ?it/s]

START  0


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]
  0%|          | 0/1 [00:01<?, ?it/s]


RuntimeError: CUDA error: out of memory
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
