In [None]:
import os

# Expose only CUDA device 0 to this process.
# NOTE(review): this comment previously said devices 0 and 2; the value set below selects device 0 only.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [None]:
import os
# Switch off the tokenizers library's internal parallelism.
# NOTE(review): presumably to avoid fork-related warnings with the DataLoader workers configured later — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from dataclasses import dataclass

import torch
from datasets import Dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    PreTrainedTokenizerBase
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score, roc_auc_score

import warnings
# Silence every Python warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings('ignore')

In [None]:
@dataclass
class Config:
    """Paths and hyper-parameters for the stage-2 DeBERTa fine-tuning run.

    Several fields (``max_length``, ``fold_idx``, ``freeze_layers`` and the
    ``lora_*`` group) are not referenced by the code visible in this notebook.
    """

    output_dir: str = 'stage2_deberta_1:2'  # forwarded to TrainingArguments(output_dir=...)
    checkpoint: str = "team-lucid/deberta-v3-base-korean"  # model id used to load both the tokenizer and the classifier
    max_length: int = 512  # token limit; not referenced by the visible code (the tokenization cell passes a literal 512)
    n_splits: int = 20  # number of chunks used by get_train_val_split_indices; the last chunk is the validation set
    fold_idx: int = 0  # not referenced by the visible code
    optim_type: str = "adamw_torch"  # forwarded to TrainingArguments(optim=...)
    per_device_train_batch_size: int = 16
    gradient_accumulation_steps: int = 1  # with one device the effective batch size is 16 * 1 = 16
    per_device_eval_batch_size: int = 8
    n_epochs: int = 5
    freeze_layers: int = 0  # not referenced by the visible code (earlier comment described a 42-layer adapter setup)
    lr: float = 1e-5
    warmup_steps: int = 20
    lora_r: int = 64  # lora_* fields: no LoRA/PEFT wrapping is applied in the visible code
    lora_alpha: float = 16
    lora_dropout: float = 0.1
    lora_bias: str = "none"
    real_label_weight: float = 0.33  # loss weight of the real label; the pseudo label gets 1 - real_label_weight

config = Config()

In [None]:
import os
# Disable NCCL's peer-to-peer and InfiniBand transports.
# NOTE(review): these settings only concern multi-GPU communication; with a single visible device they are presumably inert — confirm.
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

In [None]:
# Trainer settings; every tunable value is read from `config`.
training_args = TrainingArguments(
    # --- output and logging ---
    output_dir=config.output_dir,
    overwrite_output_dir=True,
    report_to="none",
    logging_steps=100,
    # --- optimisation schedule ---
    num_train_epochs=config.n_epochs,
    learning_rate=config.lr,
    warmup_steps=config.warmup_steps,
    optim=config.optim_type,
    weight_decay=1e-2,
    fp16=True,
    # --- batching and data loading ---
    per_device_train_batch_size=config.per_device_train_batch_size,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    dataloader_num_workers=4,
    # --- evaluation once per epoch, checkpoint every 200 steps (keep the last 3) ---
    do_eval=True,
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=200,
    save_total_limit=3,
)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(config.checkpoint)

# Keep the checkpoint's own padding token when it defines one and only fall
# back to EOS for tokenizers that ship without a pad token. Overwriting it
# unconditionally made padding indistinguishable from the EOS/separator token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pad and truncate at the end of the sequence.
tokenizer.padding_side = 'right'
tokenizer.truncation_side = 'right'

# NOTE(review): `add_eos_token` looks like a carry-over from a decoder-only
# tokenizer setup; whether this tokenizer honours it is not visible here — confirm.
tokenizer.add_eos_token = True

In [None]:
# model = AutoModelForSequenceClassification.from_pretrained(
#     config.checkpoint ,
#     num_labels=2,
#     device_map="cuda:0",
# )
# Sequence classifier with a single output logit, loaded straight onto cuda:0.
model = AutoModelForSequenceClassification.from_pretrained(
    config.checkpoint, num_labels=1, device_map="cuda:0"
)
model.config.use_cache = False

model  # last expression: render the module tree as the cell output

In [None]:
from datasets import load_from_disk

# Load the dataset saved under ./stage1_llm. Later cells read its `labels`, `generated`
# and `original_input_ids` / `original_attention_mask` / `original_token_type_ids` columns.
ds = load_from_disk("./stage1_llm")

In [None]:
# Pack both targets into one `labels` column as [labels, generated]; the loss code
# reads index 0 as the real label and index 1 as the pseudo label.
# NOTE: not idempotent — run exactly once per freshly loaded `ds` (a re-run would nest the
# list again, or fail once the `generated` column has been removed).
ds = ds.map(lambda x: {'labels': [x['labels'], x['generated']]})

In [None]:
# `generated` now lives inside `labels`, so drop it; then strip the
# "original_" prefix from the three tokenized-input columns.
ds = (
    ds.remove_columns(['generated'])
      .rename_columns({
          "original_input_ids": "input_ids",
          "original_attention_mask": "attention_mask",
          "original_token_type_ids": "token_type_ids",
      })
)

In [None]:
# Display the dataset summary after the column changes above.
ds

In [None]:
from sklearn.metrics import log_loss, roc_auc_score, accuracy_score
import torch
from transformers import EvalPrediction

# def compute_metrics(eval_preds: EvalPrediction) -> dict:
#     preds = eval_preds.predictions  # shape: (batch_size, 1)
#     labels = eval_preds.label_ids   # shape: (batch_size,)

#     # Apply sigmoid to get probabilities
#     probs = torch.from_numpy(preds).float().sigmoid().numpy().squeeze()

#     # Convert probabilities to binary predictions (threshold = 0.5)
#     binary_preds = (probs >= 0.5).astype(int)
#     binary_labels = (labels >= 0.5).astype(int)
#     # Compute metrics
#     loss = log_loss(y_true=binary_labels, y_pred=probs)
#     auc = roc_auc_score(y_true=binary_labels, y_score=probs)
#     acc = accuracy_score(y_true=binary_labels, y_pred=binary_preds)

#     return {"auc": auc, "log_loss": loss, "accuracy": acc}


def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Score the single-logit head against the real label only.

    Args:
        eval_preds: Evaluation output from the Trainer. ``predictions`` holds
            one raw logit per example (optionally with a trailing singleton
            axis); ``label_ids`` holds two targets per example, of which only
            column 0 (the real label) is used here.

    Returns:
        dict with keys ``"auc"``, ``"log_loss"`` and ``"accuracy"``. ``"auc"``
        is NaN when the evaluation labels contain a single class, because
        ROC-AUC is undefined in that case.
    """
    logits = torch.from_numpy(eval_preds.predictions).float()
    # reshape(-1) instead of squeeze(): squeeze() would collapse a one-example
    # evaluation set into a 0-d array, which the sklearn metrics reject.
    probs = logits.sigmoid().numpy().reshape(-1)

    # Column 0 is the real label. Threshold it so float-encoded targets
    # (0.0 / 1.0) become proper integer classes for sklearn.
    real_labels = (eval_preds.label_ids[:, 0] >= 0.5).astype(int)
    binary_preds = (probs >= 0.5).astype(int)

    # Passing labels=[0, 1] keeps log_loss defined even when the evaluation
    # slice happens to contain only one class.
    loss = log_loss(y_true=real_labels, y_pred=probs, labels=[0, 1])

    if len(set(real_labels.tolist())) > 1:
        auc = roc_auc_score(y_true=real_labels, y_score=probs)
    else:
        # ROC-AUC needs both classes; report NaN rather than abort training.
        auc = float("nan")

    acc = accuracy_score(y_true=real_labels, y_pred=binary_preds)

    return {"auc": auc, "log_loss": loss, "accuracy": acc}



In [None]:
from math import ceil

def get_train_val_split_indices(dataset, n_splits):
    """Split row positions into a leading train range and a trailing validation chunk.

    The rows are cut, in order and without shuffling, into chunks of
    ``ceil(len(dataset) / n_splits)`` rows. The last chunk that actually
    contains rows becomes the validation set; everything before it is the
    training set.

    Args:
        dataset: Any sized object; only ``len(dataset)`` is used.
        n_splits: Number of chunks to aim for; must be a positive integer.

    Returns:
        ``(train_indices, val_indices)`` as two lists of consecutive ints.

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be >= 1, got {n_splits}")

    total = len(dataset)
    if total == 0:
        return [], []

    # Integer ceil-division (no float rounding on very large datasets).
    split_size = (total + n_splits - 1) // n_splits

    # With ceil-sized chunks the rows can run out before the n_splits-th chunk
    # (e.g. 101 rows / 20 splits -> only 17 non-empty chunks). Anchoring on the
    # last *non-empty* chunk keeps the validation set from silently being empty
    # while all rows end up in the training set.
    n_chunks = (total + split_size - 1) // split_size
    val_start = (n_chunks - 1) * split_size

    return list(range(val_start)), list(range(val_start, total))

# Example usage: hold out the last of `config.n_splits` chunks as the validation set.
train_idx, eval_idx = get_train_val_split_indices(ds, config.n_splits)

In [None]:
from dataclasses import dataclass
from typing import List, Dict, Any
import torch
from transformers import PreTrainedTokenizerBase

@dataclass
class CustomDataCollator:
    """Pad a list of tokenized examples and attach their labels as a float tensor.

    Attributes:
        tokenizer: Tokenizer whose ``pad`` method performs the padding.
        padding: Forwarded unchanged as the ``padding`` argument of ``tokenizer.pad``.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: bool = True

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
        """Collate ``features`` into one padded batch.

        Args:
            features: One dict per example; each must contain a ``"labels"``
                entry plus the fields ``tokenizer.pad`` knows how to pad.

        Returns:
            The padded batch with an extra ``"labels"`` float tensor
            (expected shape ``(batch_size, 2)`` given how the labels column
            is built in this notebook).
        """
        labels = torch.tensor([f["labels"] for f in features], dtype=torch.float)

        # Build label-free copies instead of popping in place: the caller's
        # dicts stay intact, so the same examples can be collated again.
        unlabeled = [
            {key: value for key, value in f.items() if key != "labels"}
            for f in features
        ]

        batch = self.tokenizer.pad(
            unlabeled,
            padding=self.padding,
            return_tensors="pt"
        )
        batch["labels"] = labels

        return batch


In [None]:
from transformers import Trainer
import torch.nn as nn

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
#         labels = inputs.get("labels")  # shape: (batch_size, 2)
#         outputs = model(**inputs)
#         logits = outputs.get("logits")  # shape: (batch_size, 2)

#         real_logit = logits[:, 0]
#         pseudo_logit = logits[:, 1]
#         real_label = labels[:, 0]
#         pseudo_label = labels[:, 1]

#         loss_fct = nn.BCEWithLogitsLoss()
#         loss_real = loss_fct(real_logit, real_label.float())
#         loss_pseudo = loss_fct(pseudo_logit, pseudo_label.float())

#         REAL_WEIGHT = config.real_label_weight
#         PSEUDO_WEIGHT = 1 - REAL_WEIGHT
#         loss = REAL_WEIGHT * loss_real + PSEUDO_WEIGHT * loss_pseudo

#         return (loss, outputs) if return_outputs else loss

import torch.nn as nn
from transformers import Trainer

class CustomTrainer(Trainer):
    """Trainer whose loss is a weighted sum of BCE against the real and the pseudo label."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Weighted binary cross-entropy of one logit against two targets.

        Args:
            model: Model to run; may be a wrapper around the original model.
            inputs: Batch mapping. ``inputs["labels"]`` carries two targets per
                example: column 0 is the real label, column 1 the pseudo label.
            return_outputs: When true, also return the model outputs.
            **kwargs: Extra arguments passed by the Trainer; ignored here.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs["labels"]
        # Withhold the labels from the forward pass (the model's built-in loss
        # does not fit two-column targets) without mutating the caller's batch.
        model_inputs = {key: value for key, value in inputs.items() if key != "labels"}

        outputs = model(**model_inputs)
        logits = outputs["logits"].squeeze(-1)  # one logit per example

        # Follow the logits' device instead of `model.device`: a model wrapped
        # in nn.DataParallel does not expose a `.device` attribute.
        labels = labels.to(logits.device).float()
        real_label = labels[:, 0]
        pseudo_label = labels[:, 1]

        loss_fct = nn.BCEWithLogitsLoss()
        loss_real = loss_fct(logits, real_label)
        loss_pseudo = loss_fct(logits, pseudo_label)

        # The two weights sum to one; the real-label share comes from the config.
        real_weight = config.real_label_weight
        pseudo_weight = 1 - real_weight
        loss = real_weight * loss_real + pseudo_weight * loss_pseudo

        return (loss, outputs) if return_outputs else loss




In [None]:
# Report the split sizes.
print(f"총 train samples: {len(train_idx)}")
print(f"총 val samples: {len(eval_idx)}")

# Rough step count: ceil(train rows / per-device batch) per epoch, times epochs.
# Gradient accumulation and the number of devices are not factored in.
steps_per_epoch = -(-len(train_idx) // config.per_device_train_batch_size)
total_steps = config.n_epochs * steps_per_epoch
print(f"total steps: {total_steps}")


In [None]:

# Fine-tune on the train rows and evaluate on the held-out rows of `ds`,
# using the weighted two-label loss defined in CustomTrainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=ds.select(train_idx),
    eval_dataset=ds.select(eval_idx),
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Load the test set from CSV.
# NOTE: this rebinds `ds`, which held the training dataset in the cells above.
ds = Dataset.from_csv("test.csv")

In [None]:
class CustomTokenizer:
    """Callable that joins title and paragraph into one prompt and tokenizes it."""

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int
    ) -> None:
        """Remember the tokenizer and the truncation length to apply."""
        self.tokenizer, self.max_length = tokenizer, max_length

    def __call__(self, batch: dict) -> dict:
        """Tokenize a batch with ``"title"`` and ``"paragraph_text"`` columns.

        Each row becomes ``"<Title>: " + title + "\\n\\n<Full text>: " + paragraph``
        and is truncated to ``max_length`` tokens.

        Returns:
            A plain dict holding the tokenizer's output fields.
        """
        prompts = [
            "<Title>: " + heading + "\n\n<Full text>: " + body
            for heading, body in zip(batch["title"], batch["paragraph_text"])
        ]
        encoded = self.tokenizer(
            prompts,
            max_length=self.max_length,
            truncation=True,
        )
        return dict(encoded.items())

In [None]:
# Take the length limit from the shared Config instead of a second hard-coded
# 512, so the tokenization limit cannot drift away from the configured value.
encode = CustomTokenizer(tokenizer, max_length=config.max_length)
ds = ds.map(encode, batched=True)

In [None]:
from tqdm.notebook import tqdm
from transformers.data.data_collator import pad_without_fast_tokenizer_warning


@torch.no_grad()
@torch.autocast(device_type="cuda")
def inference(ds, model, batch_size=1):
    """Run the classifier over ``ds`` and return one sigmoid probability per row.

    Args:
        ds: Tokenized dataset; slicing it must yield ``"input_ids"`` and
            ``"attention_mask"`` lists.
        model: Sequence classifier producing a single logit per example.
        batch_size: Number of rows scored per forward pass.

    Returns:
        ``(preds, pseudo)``. ``preds`` is a list of probabilities in row
        order; ``pseudo`` is always an empty list and is kept only so existing
        callers that unpack two values keep working.

    Note:
        Padding uses the notebook-level ``tokenizer``.
    """
    preds = []
    pseudo = []
    model.eval()

    # Send batches to wherever the model actually lives instead of assuming "cuda:0".
    device = next(model.parameters()).device

    for start_idx in tqdm(range(0, len(ds), batch_size)):
        end_idx = min(start_idx + batch_size, len(ds))
        rows = ds[start_idx:end_idx]
        inputs = pad_without_fast_tokenizer_warning(
            tokenizer,
            {"input_ids": rows["input_ids"], "attention_mask": rows["attention_mask"]},
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        outputs = model(**inputs.to(device))

        # (batch, 1) logits -> (batch,) probabilities on the CPU.
        logits = outputs.logits.squeeze(-1).cpu()
        preds.extend(torch.sigmoid(logits).tolist())

    return preds, pseudo

In [None]:
import pandas as pd

# Score the tokenized test set; the second return value is not used.
probs, _ = inference(ds, model)

# Fill the `generated` column of the sample submission and write it out.
sub = pd.read_csv('sample_submission.csv').assign(generated=probs)
sub.to_csv('stage2_0.33:0.67_total_loss.csv', index=False)