In [1]:
import os
# Process-wide GPU/NCCL settings, applied before torch is imported further down.
# NCCL peer-to-peer and InfiniBand are switched off, and only CUDA device 0 is
# made visible to this process.
os.environ.update(
    {
        "NCCL_P2P_DISABLE": "1",
        "NCCL_IB_DISABLE": "1",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

from dataclasses import dataclass

import torch
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    PreTrainedTokenizerBase
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')



from transformers import AutoTokenizer, AutoConfig

# Hub repository id, in the form "your-username/your-model-name".
model_name = "RowKick/deberta-v3-base-korean-attention-pooling-stage2"

# Load the model config and the tokenizer from the same checkpoint.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the EOS token for padding; pad and truncate on the right-hand side.
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = tokenizer.truncation_side = 'right'
tokenizer.add_eos_token = True


###
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoConfig, AutoModel, PreTrainedModel


class AttentionPooling(nn.Module):
    """Pool a sequence of hidden states into one vector with learned attention.

    A single linear layer scores every position. The scores are normalised
    with a softmax over the sequence axis (masked positions excluded) and the
    hidden states are summed using those weights.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.Linear(hidden_size, 1)

    def forward(self, last_hidden_state, attention_mask):
        """Return the attention-weighted sum of ``last_hidden_state``.

        Args:
            last_hidden_state: Tensor of shape (B, L, H).
            attention_mask: Tensor of shape (B, L); positions equal to 0 are
                excluded from the pooling.

        Returns:
            Tensor of shape (B, H). A row whose mask is entirely 0 pools to a
            zero vector rather than NaN.
        """
        masked_out = attention_mask == 0  # B x L, True where a position is ignored

        scores = self.attention(last_hidden_state).squeeze(-1)  # B x L
        scores = scores.masked_fill(masked_out, float('-inf'))
        attn_weights = F.softmax(scores, dim=1)  # B x L

        # A fully masked row is a softmax over only -inf, which yields NaN for
        # every position. Masked positions carry zero weight by definition, so
        # zeroing them explicitly leaves normal rows unchanged and turns the
        # degenerate row into an all-zero weight vector.
        attn_weights = attn_weights.masked_fill(masked_out, 0.0)

        return torch.sum(last_hidden_state * attn_weights.unsqueeze(-1), dim=1)


class AiModel(PreTrainedModel):
    """
    Hugging Face Trainer-compatible AI Model with Attention Pooling

    The first token's hidden state is concatenated with an attention-pooled
    summary of the remaining tokens, and the result is passed through a small
    projection head that emits two logits per example.
    """
    def __init__(self, config):
        super().__init__(config)
        print("Initializing the AI Model...")

        # NOTE(review): the backbone is loaded from `config._name_or_path` on
        # every construction, including when this class is itself created via
        # `from_pretrained` - confirm that this double load is intended.
        self.backbone = AutoModel.from_pretrained(config._name_or_path, config=config)
        hidden_size = self.backbone.config.hidden_size

        self.pool = AttentionPooling(hidden_size)

        # Head input is [first-token state ; pooled state], hence hidden_size * 2.
        # Dropout is the last layer, so it acts on the logits in training mode.
        self.projection_head = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size, bias=True),
            nn.Linear(hidden_size, 2, bias=True),
            nn.Dropout(p=0.1, inplace=False),
        )
        self.loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Run the backbone and the pooling head.

        Args:
            input_ids: Token ids passed to the backbone.
            attention_mask: Optional mask passed to the backbone and used for
                pooling. When omitted, every position is treated as valid.
            labels: Optional targets for `BCEWithLogitsLoss`; presumably of
                shape (B, 2) to match the logits - verify against the trainer.
            **kwargs: Accepted for Trainer compatibility and ignored.

        Returns:
            dict with "loss" (None when `labels` is None) and "logits" (B x 2).
        """
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        )
        last_hidden_state = outputs.last_hidden_state  # B x L x H

        # The signature allows attention_mask=None, but the pooling step slices
        # the mask; fall back to an all-ones mask so that call still works.
        if attention_mask is None:
            attention_mask = torch.ones(
                last_hidden_state.shape[:2],
                dtype=torch.long,
                device=last_hidden_state.device,
            )

        # First-token state concatenated with the pooled rest of the sequence.
        cls_hidden = last_hidden_state[:, 0, :]  # B x H
        rest_hidden_state = last_hidden_state[:, 1:, :]  # B x (L-1) x H
        pooled_output = self.pool(rest_hidden_state, attention_mask[:, 1:])  # B x H

        my_hidden = torch.concat([cls_hidden, pooled_output], dim=1)  # B x H*2
        logits = self.projection_head(my_hidden)  # B x 2

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels.float())

        return {
            "loss": loss,
            "logits": logits,
        }

###-------------------------------------
from transformers import AutoConfig, AutoModel, PreTrainedModel

# Build the custom model from the checkpoint and move it to the first GPU.
model = AiModel.from_pretrained(model_name, config=config).to('cuda:0')
model.config.use_cache = False

# Rows to score, read from a local CSV file.
ds = Dataset.from_csv("./test.csv")

class CustomTokenizer:
    """Batch encoder that joins each row's title and body into one prompt.

    Intended as the callable for a batched dataset ``map``: it reads the
    "title" and "paragraph_text" columns and returns the tokenizer's output
    as a plain dict.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch: dict) -> dict:
        """Tokenize one batch of rows, truncating each prompt to ``max_length``."""
        prompts = [
            "<Title>: " + heading + "\n\n<Full text>: " + body
            for heading, body in zip(batch["title"], batch["paragraph_text"])
        ]
        encoded = self.tokenizer(
            prompts,
            max_length=self.max_length,
            truncation=True,
        )
        # Return a fresh plain dict rather than the tokenizer's own object.
        return dict(encoded)
    
# Tokenize every row in batches, truncating each prompt to 512 tokens.
encode = CustomTokenizer(tokenizer, max_length=512)
ds = ds.map(encode, batched=True)

from tqdm import tqdm
from transformers.data.data_collator import pad_without_fast_tokenizer_warning


@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(ds, model, batch_size=1):
    """Score a tokenized dataset and return the two logit columns as lists.

    Batches are padded to their longest sequence with the module-level
    ``tokenizer`` and moved to "cuda:0" before the forward pass. The model's
    "logits" output is collected as-is: the returned values are raw logits,
    no sigmoid or softmax is applied here.

    Args:
        ds: Dataset exposing "input_ids" and "attention_mask" columns and
            supporting slice indexing.
        model: Model whose output offers ``.get("logits")`` with at least
            two columns.
        batch_size: Number of rows per forward pass.

    Returns:
        Tuple ``(column_0, column_1)`` of Python float lists, one entry per row.
    """
    model.eval()

    first_column, second_column = [], []
    total_rows = len(ds)

    for lo in tqdm(range(0, total_rows, batch_size)):
        hi = min(lo + batch_size, total_rows)
        rows = ds[lo:hi]
        features = {
            "input_ids": rows["input_ids"],
            "attention_mask": rows["attention_mask"],
        }
        batch = pad_without_fast_tokenizer_warning(
            tokenizer,
            features,
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        logits = model(**batch.to("cuda:0")).get("logits").cpu()

        first_column.extend(logits[:, 0].tolist())
        second_column.extend(logits[:, 1].tolist())

    return first_column, second_column

# Score the test set: `a` is logit column 0, `b` is logit column 1.
a, b = inference(ds, model, batch_size=1)

import pandas as pd

# Load the submission template, store the column-1 scores under 'generated',
# and write the result to disk.
sub = pd.read_csv('./sample_submission.csv')
sub.head()
sub = sub.assign(generated=b)
sub.to_csv('stage2_attpool_b.csv', index=False)

Initializing the AI Model...


Some weights of DebertaV2Model were not initialized from the model checkpoint at RowKick/deberta-v3-base-korean-attention-pooling-stage2 and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.word_embeddings.weight', 'encoder.LayerNorm.bias', 'encoder.LayerNorm.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key_proj.bias', 'encoder.layer.0.attention.self.key_proj.weight', 'encoder.layer.0.attention.self.query_proj.bias', 'encoder.layer.0.attention.self.query_proj.weight', 'encoder.layer.0.attention.self.value_proj.bias', 'encoder.layer.0.attention.self.value_proj.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermediate.dense.weight', 'encoder.layer.0.output.LayerNorm.bias', 'encoder.layer.0.output.LayerNorm.weight', 'encoder.l