In [1]:
!unzip 3595-4FOLD.zip

unzip:  cannot find or open 3595-4FOLD.zip, 3595-4FOLD.zip.zip or 3595-4FOLD.zip.ZIP.


In [2]:
!pip install -r requirements.txt \
  --extra-index-url https://download.pytorch.org/whl/cu124

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: 'requirements.txt'[0m[31m
[0m

KeyboardInterrupt: 

In [None]:
import qwen3_seqcls_infonce

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, TrainerCallback
from transformers import pipeline
import torch
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import roc_auc_score
import datetime as dt
import random
import re
import os

In [None]:
import os
from getpass import getpass

from huggingface_hub import login

# Credentials must never be stored in the notebook. Read the access token from
# the HF_TOKEN environment variable, or prompt for it without echoing.
# NOTE(review): the token that used to be hard-coded in this cell has been
# exposed and should be revoked in the Hugging Face account settings.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face token: ")
login(token=hf_token)

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and every CUDA
    device), exports ``PYTHONHASHSEED`` and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Changing PYTHONHASHSEED at runtime does not re-seed hashing in the current
    # interpreter; it is exported so that child processes inherit it.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers all GPUs, not only the current one; silently ignored without CUDA.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

# Global seed; also reused further down for shuffling the training frame and
# as the Trainer seed.
SEED = 42
seed_everything(SEED)

In [None]:
# Fold layout: four fold CSVs are expected next to the notebook. The fold at
# position `val_fold_idx` is held out for validation; all others are training.
val_fold_idx = 3
fold_paths = [f"./fold{i}.csv" for i in range(4)]

FOLD_TRAIN = fold_paths[:val_fold_idx] + fold_paths[val_fold_idx + 1:]
FOLD_VAL   = fold_paths[val_fold_idx]

print("▶ Train folds:", FOLD_TRAIN)
print("▶ Validation fold:", FOLD_VAL)

# Inference input and submission template.
SUBMISSION_CSV  = "./sample_submission.csv"
TEST_CSV        = "./test_preprocessed.csv"

In [None]:
def _to_text_label(frame):
    """Keep only the text and target columns, renamed to `text` / `label`."""
    column_map = {'full_text': 'text', 'generated': 'label'}
    return frame[list(column_map)].rename(columns=column_map)


# Training data: every non-validation fold, stacked into one frame.
train_parts = [pd.read_csv(fold_csv, encoding="utf-8-sig") for fold_csv in FOLD_TRAIN]
train_df = _to_text_label(pd.concat(train_parts, ignore_index=True))

# Validation data: the single held-out fold.
val_df = _to_text_label(pd.read_csv(FOLD_VAL, encoding="utf-8-sig"))

# Shuffle the training rows reproducibly and renumber the index from zero.
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

for caption, value in (
    ("최종 학습 샘플 수:", len(train_df)),
    ("최종 학습 클래스 분포:", train_df['label'].value_counts().to_dict()),
    ("검증 샘플 수:", len(val_df)),
    ("검증 클래스 분포:", val_df['label'].value_counts().to_dict()),
):
    print(caption, value)

In [None]:
# Wrap both pandas frames as `datasets.Dataset` objects.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

In [None]:
MODEL_NAME = "Qwen/Qwen3-14B"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(example):
    """Tokenize the `text` field of a batch of examples with truncation enabled.

    No `max_length` is passed, so the truncation limit is whatever the
    tokenizer itself is configured with.
    """
    texts = example["text"]
    return tokenizer(texts, truncation=True)


def _prepare_split(split):
    """Tokenize a split, drop the raw `text` column and rename `label` to `labels`."""
    tokenized = split.map(tokenize_function, batched=True)
    return tokenized.remove_columns(["text"]).rename_column("label", "labels")


train_dataset = _prepare_split(train_dataset)
val_dataset = _prepare_split(val_dataset)

In [None]:
# Collator that pads the examples of each batch with the tokenizer (padding=True).
data_collator = DataCollatorWithPadding(tokenizer, padding=True)

In [None]:
# Device used later for moving inference batches.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)

# The sequence-classification head uses config.pad_token_id to find the last
# real token of every row and rejects batches larger than one when it is
# unset, so mirror the tokenizer's pad token if the checkpoint has none.
if model.config.pad_token_id is None:
    model.config.pad_token_id = tokenizer.pad_token_id

# No explicit model.to(device): a bitsandbytes-quantized model is placed on the
# accelerator while loading, and calling .to() on it raises with older
# bitsandbytes releases. Dropping the call also avoids printing the whole
# module tree as the cell output.

In [None]:
# LoRA hyper-parameters.
R = 32
LORA_ALPHA = 16
LORA_DROPOUT = 0.1

# Linear layers that receive an adapter.
# NOTE(review): presumably the attention and MLP projections of each
# transformer block - confirm against the model's module names.
_attention_proj = ["q_proj", "k_proj", "v_proj", "o_proj"]
_mlp_proj = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=_attention_proj + _mlp_proj,
)

# Replace `model` with its PEFT-wrapped version.
model = get_peft_model(model, lora_config)


In [None]:
# Report how many parameters are trainable after wrapping the model with LoRA.
model.print_trainable_parameters()

In [None]:
def compute_metrics(eval_pred):
    """Compute the ROC AUC of the positive class (label 1).

    Args:
        eval_pred: Pair ``(logits, labels)``. ``logits`` is an array of shape
            ``(n_samples, 2)``, or a tuple whose first element is that array
            (the case when the model returns extra outputs besides the logits).

    Returns:
        ``{"roc_auc": <float>}``.
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)
    # The class-1 logit on its own is not a valid score: the class-1 probability
    # depends on logit_1 - logit_0. That difference is a monotonic function of
    # softmax(logits)[:, 1], so it yields the same AUC as the probability
    # while avoiding overflow in exp().
    scores = logits[:, 1] - logits[:, 0]
    roc_auc = roc_auc_score(labels, scores)
    return {"roc_auc": roc_auc}

In [None]:
class ScheduledCLTrainer(Trainer):
    """Trainer that schedules the weight of a contrastive loss term.

    The weight ``lambda_cl`` is 0 for the first ``delay_ratio`` fraction of
    ``max_steps`` and then grows linearly with the global step, reaching
    ``max_lambda`` at the last step. On every loss computation the batch labels
    and the current weight are forwarded to the model as ``contrastive_labels``
    and ``lambda_cl``; the model must accept those keyword arguments
    (NOTE(review): presumably provided by the ``qwen3_seqcls_infonce`` import -
    confirm).

    Args:
        delay_ratio: Fraction of the total steps during which the weight is 0.
        max_lambda: Weight reached at the end of training.
        *args, **kwargs: Forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, delay_ratio: float = 0.3, max_lambda: float = 0.05, **kwargs):
        super().__init__(*args, **kwargs)
        self.delay_ratio = delay_ratio
        self.max_lambda  = max_lambda

    def _current_lambda(self) -> float:
        """Return the contrastive weight for the current global step."""
        step  = self.state.global_step
        total = self.state.max_steps

        delay_steps = int(total * self.delay_ratio)
        ramp_steps  = total - delay_steps

        # ramp_steps is 0 when max_steps is still 0 (e.g. evaluate() called
        # before train()) or when delay_ratio >= 1. There is nothing to ramp
        # over then, so keep the term switched off instead of dividing by zero.
        if step < delay_steps or ramp_steps <= 0:
            return 0.0

        progress = min((step - delay_steps) / ramp_steps, 1.0)
        return progress * self.max_lambda

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model with the scheduled contrastive weight and return its loss.

        Args:
            model: Model to call.
            inputs: Batch dict; must contain ``"labels"``.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Accepted for compatibility with the base signature; unused.

        Returns:
            ``outputs.loss``, or ``(outputs.loss, outputs)``.
        """
        outputs = model(
            **inputs,
            contrastive_labels=inputs["labels"],
            lambda_cl=self._current_lambda(),
        )
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

In [None]:
# Training hyper-parameters
training_args = TrainingArguments(
    # --- output / checkpoints ---
    output_dir="./output_qwen3_checkpoint",
    overwrite_output_dir=True,
    save_total_limit=2,
    # --- optimisation ---
    learning_rate=2e-5,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    dataloader_drop_last=False,
    seed=SEED,
    # --- model-selection metric (higher is better) ---
    # NOTE(review): no evaluation or save strategy is set here, so whether
    # roc_auc is ever computed during training depends on library defaults - confirm.
    metric_for_best_model="roc_auc",
    greater_is_better=True,
    # --- logging ---
    logging_strategy="steps",
    logging_first_step=True,
    logging_steps=500,
    report_to="none",
    # --- inputs ---
    label_names=["labels"],
)

In [None]:
# Trainer with the scheduled contrastive-loss weight: lambda_cl stays at 0 for
# the first 30% of the steps, then ramps linearly up to 0.05.
trainer = ScheduledCLTrainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    args=training_args,
    max_lambda=0.05,
    delay_ratio=0.3,
)

In [None]:
# Run the training loop configured above.
trainer.train()

In [None]:
# Persist the trained model and the tokenizer side by side in one directory.
output_dir = "output_fold3"
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(output_dir)
print("모델이 저장되었습니다:", output_dir)

In [None]:
# Load the inference input and the submission template.
test_df, submission_df = (
    pd.read_csv(csv_path, encoding='utf-8-sig')
    for csv_path in (TEST_CSV, SUBMISSION_CSV)
)

# Collects one predicted probability per test row in a later cell.
pred_probs = []

print("테스트 샘플 수:", len(test_df))

In [None]:
# Put the model into evaluation mode before running inference.
trainer.model.eval()

In [None]:
# Text-classification pipeline around the fine-tuned model.
# return_all_scores=True asks for a score for every label instead of only the
# top one; the prediction loop below relies on that result layout.
clf = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

In [None]:
# Sanity check: show the raw pipeline output for the first test paragraph.
print("샘플 결과 예시:", clf(test_df['paragraph_text'][0]))

In [None]:
# Label names under which the "AI-generated" class may be reported.
ai_labels = ('LABEL_1', '1', 'generated')

# Start from an empty list every time this cell runs. Appending to the list
# created in an earlier cell would duplicate the predictions on a re-run and
# make the later assignment to the submission frame fail on a length mismatch.
pred_probs = []
for text in test_df['paragraph_text']:
    # For a single string the pipeline returns a one-element list holding the
    # per-label score dicts.
    scores = clf(text)[0]
    # Prefer an explicit match on the label name...
    prob_ai = next(
        (s['score'] for s in scores if s['label'] in ai_labels),
        None,
    )
    # ...and fall back to the second entry when no known label name is present.
    if prob_ai is None:
        prob_ai = scores[1]['score']
    pred_probs.append(prob_ai)

In [None]:
# Write the predicted probabilities into the submission frame and display it.
submission_df['generated'] = pred_probs
submission_df

In [None]:
# Save the test predictions for this fold.
submission_df.to_csv("test_qwen_fold3.csv", index=False, encoding="utf-8-sig")

In [None]:
# Build an un-tokenized dataset from val_df so the raw `text` column is
# available to the inference collate function.
val_ds = Dataset.from_pandas(val_df)

In [None]:
def collate_infer(features: list[dict], max_length: int = 512):
    """Tokenize a batch of raw-text examples and collate it for inference.

    Args:
        features: Examples of one batch; each is a dict with a ``"text"`` entry.
        max_length: Truncation limit passed to the tokenizer. Defaults to 512,
            the value that used to be hard-coded; bind another value with
            ``functools.partial`` when passing this as a ``collate_fn``.

    Returns:
        Whatever ``data_collator`` returns for the tokenized batch.
    """
    texts = [f["text"] for f in features]
    # Tokenize without padding or tensor conversion; both are left to data_collator.
    encodings = tokenizer(
        texts,
        padding=False,
        truncation=True,
        max_length=max_length,
        return_tensors=None,
    )
    return data_collator(encodings)

In [None]:
from torch.utils.data import DataLoader
from tqdm import tqdm

# Batches of raw validation rows, tokenized on the fly by collate_infer.
loader = DataLoader(
    val_ds,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_infer,
    pin_memory=True,
)

# Make sure the model is in evaluation mode even if the earlier eval() cell
# was skipped or training was re-run in between.
trainer.model.eval()

probs_list = []

with torch.no_grad():
    for batch in tqdm(loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = trainer.model(**batch).logits
        # Upcast before the softmax: the logits may be bfloat16, where
        # probabilities close to 0 or 1 collapse onto a handful of values and
        # create ties in the ranking used for ROC AUC.
        probs  = torch.softmax(logits.float(), dim=-1)[:, 1]
        probs_list.append(probs.cpu())

# Class-1 probability for every validation row, in dataset order.
probs = torch.cat(probs_list).numpy()
print(f"[✓] Inference done – {len(probs)} samples")


In [None]:
# Attach the predicted class-1 probabilities to the validation frame.
val_df['generated'] = probs

In [None]:
# Re-read the validation fold only for its `id` column (val_df was reduced to
# text/label when it was loaded) and attach it by row index.
fold_ids = pd.read_csv(FOLD_VAL, encoding="utf-8-sig")['id']
val_df['ID'] = fold_ids

# Keep just the columns that are written out.
val_df = val_df.loc[:, ['ID', 'generated', 'label']]

In [None]:
# Save the validation predictions for this fold.
val_df.to_csv('val_qwen_fold3.csv', index=False, encoding="utf-8-sig")