In [None]:
# import os

# # Restrict this process to CUDA device 0 only
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [1]:
from dataclasses import dataclass

import torch
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    PreTrainedTokenizerBase
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the tokenizer published with the stage-1 checkpoint.
tokenizer = AutoTokenizer.from_pretrained("RowKick/kanana-1.5-2.1b-instruct-stage1")
# Pad and truncate on the right-hand side of each sequence.
tokenizer.padding_side = 'right'
tokenizer.truncation_side='right'
# NOTE(review): presumably meant to append EOS to every encoded sequence; whether
# setting this attribute after construction has any effect depends on the
# concrete tokenizer class - confirm.
tokenizer.add_eos_token = True

In [4]:

# Load the stage-1 checkpoint as a 2-label sequence classifier.
model = AutoModelForSequenceClassification.from_pretrained(
    "RowKick/kanana-1.5-2.1b-instruct-stage1",
    num_labels=2,
    # Quantization is disabled; no `bnb_config` is defined in the visible cells.
    #quantization_config=bnb_config,
    device_map="auto",
    # The checkpoint stores a 1-output score head while 2 labels are requested
    # here, so the head is re-initialised instead of raising (see the load
    # warning in this cell's output).
    # NOTE(review): presumably the adapter loaded afterwards supplies the trained
    # 2-label head - confirm, otherwise predictions come from a random head.
    ignore_mismatched_sizes=True,
)

# Turn off the generation KV cache on the model config.
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at RowKick/kanana-1.5-2.1b-instruct-stage1 and are newly initialized because the shapes did not match:
- score.weight: found shape torch.Size([1, 1792]) in the checkpoint and torch.Size([2, 1792]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from peft import PeftModel, PeftConfig


# 2. Load the PeftConfig from the repository that contains only the adapter weights.
# NOTE(review): `peft_config` is not used by any other visible cell.
peft_model_id = "RowKick/kanana-1.5-2.1b-instruct-stage1-stage2"
peft_config = PeftConfig.from_pretrained(peft_model_id)

# 3. Load the adapter weights on top of the base model. No explicit merge step
#    (e.g. merge_and_unload()) is performed here.
model = PeftModel.from_pretrained(model, peft_model_id)


adapter_config.json:   0%|          | 0.00/830 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/126M [00:00<?, ?B/s]

In [6]:
# Reuse the EOS token as the padding token on the tokenizer side.
tokenizer.pad_token = tokenizer.eos_token
# The model config takes its pad id from the config's own EOS id, not from the tokenizer.
# NOTE(review): assumes model.config.eos_token_id is a single int equal to
# tokenizer.eos_token_id - confirm, since the classification head presumably
# uses pad_token_id to locate the last real token of each sequence.
model.config.pad_token_id = model.config.eos_token_id


In [7]:
# Load the test split from CSV; the encoding step below reads its "title" and
# "paragraph_text" columns.
ds = Dataset.from_csv("test.csv")

In [8]:
class CustomTokenizer:
    """Callable batch encoder that joins title and body text, then tokenizes.

    An instance takes a dict of columns (``"title"`` and ``"paragraph_text"``,
    each a sequence of strings) and returns the wrapped tokenizer's output as a
    plain ``dict``.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int
    ) -> None:
        """Remember the tokenizer and the truncation length used on every call."""
        self.max_length = max_length
        self.tokenizer = tokenizer

    def __call__(self, batch: dict) -> dict:
        """Build one prompt per row and tokenize the whole batch with truncation.

        Args:
            batch: Mapping with parallel ``"title"`` and ``"paragraph_text"``
                sequences.

        Returns:
            The tokenizer output copied into a regular dict.
        """
        prompts = []
        for heading, body in zip(batch["title"], batch["paragraph_text"]):
            # Same layout for every row: tagged title, blank line, tagged body.
            prompts.append(("<Title>: " + heading) + ("\n\n<Full text>: " + body))

        encoded = self.tokenizer(
            prompts,
            truncation=True,
            max_length=self.max_length,
        )
        return dict(encoded)

In [9]:
# Build the batch encoder with a 1024-token truncation limit.
encode = CustomTokenizer(tokenizer, max_length=1024)
# Tokenize the dataset in batches; `ds` is rebound to the mapped dataset, whose
# "input_ids" and "attention_mask" columns are read by the inference step.
ds = ds.map(encode, batched=True)

Map:   0%|          | 0/1962 [00:00<?, ? examples/s]

In [10]:
from tqdm.notebook import tqdm
from transformers.data.data_collator import pad_without_fast_tokenizer_warning


@torch.no_grad()
def inference(ds, model, batch_size=1):
    """Run the classifier over a tokenized dataset and collect per-class logits.

    Each batch is padded with the module-level ``tokenizer``.

    Args:
        ds: Dataset whose slices (``ds[i:j]``) yield a mapping containing
            ``"input_ids"`` and ``"attention_mask"`` lists.
        model: Sequence-classification model whose output exposes ``.logits``
            with at least two columns.
        batch_size: Number of rows per forward pass.

    Returns:
        ``(preds, pseudo)``: two lists of Python floats holding column 0 and
        column 1 of the logits, in dataset order. These are raw logits; no
        softmax is applied.
    """
    preds = []
    pseudo = []
    model.eval()

    # Send inputs to the device that holds the model's first parameter instead
    # of hard-coding "cuda:0", so this follows wherever the model was placed.
    device = next(model.parameters()).device

    # torch.autocast replaces the deprecated torch.cuda.amp.autocast(); mixed
    # precision is only enabled when the model actually lives on a CUDA device.
    with torch.autocast(device_type="cuda", enabled=device.type == "cuda"):
        for start_idx in tqdm(range(0, len(ds), batch_size)):
            end_idx = min(start_idx + batch_size, len(ds))
            batch = ds[start_idx:end_idx]
            inputs = pad_without_fast_tokenizer_warning(
                tokenizer,
                {
                    "input_ids": batch["input_ids"],
                    "attention_mask": batch["attention_mask"],
                },
                padding="longest",
                pad_to_multiple_of=None,
                return_tensors="pt",
            )
            outputs = model(**inputs.to(device))
            logits = outputs.logits.cpu()

            preds.extend(logits[:, 0].tolist())
            pseudo.extend(logits[:, 1].tolist())

    return preds, pseudo

In [11]:
# Score the whole test set with the default batch_size of 1.
# `a` receives the column-0 logits and `b` the column-1 logits returned by inference().
a, b = inference(ds, model)

  0%|          | 0/1962 [00:00<?, ?it/s]

In [12]:
import pandas as pd


# Start from the sample submission so the id column and row order are kept.
sub = pd.read_csv('sample_submission.csv')

# Fail early with a readable message if predictions and rows do not line up.
assert len(a) == len(sub), (
    f"got {len(a)} predictions for {len(sub)} submission rows"
)

# NOTE(review): `a` holds the raw column-0 logits from inference(), not
# probabilities - confirm that label index 0 is the "generated" class and that
# the evaluation metric accepts unnormalised scores.
sub['generated'] = a

sub.to_csv('stage2_a_kanana21b.csv', index=False)

# Preview as the cell's last expression so the notebook actually renders it
# (a bare `sub.head()` in the middle of a cell is silently discarded).
sub.head()