In [1]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, AutoModel, AutoTokenizer, BigBirdForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd
from types import SimpleNamespace

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
config = {
    "learning_rate": 1e-4, # changed 3e-5 to 3e-4 to 1e-4
    "epoch": 30,
    "batch_size": 64,
    "weight_decay": 0.01,
    "tokenizer_max_len": 256,
    # https://huggingface.co/docs/transformers/v4.20.1/en/model_doc/big_bird
    # https://huggingface.co/blog/big-bird
    "attention_type": "original_full", # original_full, block_sparse
}

CFG = SimpleNamespace(**config)

In [None]:
import os
root_path = "/root/dacon-2024-gbt-hackerton"

In [6]:
train_df = pd.read_csv(os.path.join(root_path, "./train.csv"))
test_df = pd.read_csv(os.path.join(root_path, "./test.csv")) 

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained("monologg/kobigbird-bert-base")  # BertTokenizer
model = BigBirdForSequenceClassification.from_pretrained(
    'monologg/kobigbird-bert-base', 
    num_labels=len(train_df['분류'].unique()), 
    attention_type=CFG.attention_type
).to(device)

Some weights of BigBirdForSequenceClassification were not initialized from the model checkpoint at monologg/kobigbird-bert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128, preprocess_datarows=False):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

        self.encodings = []
        self._preprocess_datarows = preprocess_datarows
        if preprocess_datarows:
            print(f"Using {preprocess_datarows=}")
            self.encodings = self.tokenizer(
                self.texts,
                add_special_tokens=True,
                max_length=self.max_len,
                truncation=True,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt'
            )

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        if self._preprocess_datarows:
            return {
                'input_ids': self.encodings['input_ids'][item],
                'attention_mask': self.encodings['attention_mask'][item],
                'labels': torch.tensor(self.labels[item], dtype=torch.long) if self.labels is not None else -1
            }
        
        text = str(self.texts[item])
        label = self.labels[item] if self.labels is not None else -1
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [9]:
# 데이터 준비
# train_df['제목_키워드'] = train_df['제목'] + ' ' + train_df['키워드']
# test_df['제목_키워드'] = test_df['제목'] + ' ' + test_df['키워드']
def remove_duplicates(text):
    return ' '.join(list(dict.fromkeys(text.split(","))))

train_df['제목_키워드'] = train_df['키워드'].apply(remove_duplicates)
test_df['제목_키워드'] = test_df['키워드'].apply(remove_duplicates)

# 레이블 인코딩
label_encoder = {label: i for i, label in enumerate(train_df['분류'].unique())}
train_df['label'] = train_df['분류'].map(label_encoder)

# 데이터 분할 (train -> train + validation)
train_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['분류'], random_state=42)

# 데이터셋 생성
train_dataset = TextDataset(train_df.제목_키워드.tolist(), train_df.label.tolist(), tokenizer, max_len=CFG.tokenizer_max_len, preprocess_datarows=True)
val_dataset = TextDataset(val_df.제목_키워드.tolist(), val_df.label.tolist(), tokenizer,  max_len=CFG.tokenizer_max_len, preprocess_datarows=True)
test_dataset = TextDataset(test_df.제목_키워드.tolist(), None, tokenizer)  # 라벨 없음

# 데이터 로더 생성
num_workers = 0  # 적절한 값으로 조정하세요
pin_memory = True
train_loader = DataLoader(train_dataset, batch_size=CFG.batch_size, shuffle=True, num_workers=num_workers, pin_memory=pin_memory)
val_loader = DataLoader(val_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)
test_loader = DataLoader(test_dataset, batch_size=CFG.batch_size, shuffle=False, num_workers=num_workers, pin_memory=pin_memory)

Using preprocess_datarows=True
Using preprocess_datarows=True


In [26]:
train_df[train_df['분류'] != '지역']

Unnamed: 0,ID,분류,제목,키워드,제목_키워드,label
35146,TRAIN_35146,경제:부동산,매물 쌓이고 거래량도 '뚝' 집값 안정화 '조짐',"매물,거래량,조짐,집값,안정,아시아투데이,박지숙,서울,포함,매물,수도,아파트,거래량...",매물 거래량 조짐 집값 안정 아시아투데이 박지숙 서울 포함 수도 아파트 매수세 추세...,10
7317,TRAIN_07317,스포츠:야구,"CJ슈퍼레이스 GT1 2위 개그맨 한민관, 10년차 레이서의 관록 보여줘","한민관,CJ,슈퍼,레이스,GT1,개그맨,10년,레이서,관록,개그맨,한민관,레이서,입...",한민관 CJ 슈퍼 레이스 GT1 개그맨 10년 레이서 관록 입지 소속 레이싱 비트알...,50
51465,TRAIN_51465,정치:청와대,"용인에서 2번 민생토론회 연 尹 ""특례시 지원 특별법 제정, 반도체 고속도로 신속 추진""","용인,민생,토론회,특례시,제정,지원,특별법,반도체,고속도,신속,추진,윤석열,대통령,...",용인 민생 토론회 특례시 제정 지원 특별법 반도체 고속도 신속 추진 윤석열 대통령 ...,43
44057,TRAIN_44057,경제:산업_기업,"故 조양호 회장 2주기 한진그룹, 난기류 뚫고 '비상'","조양호,회장,한진그룹,난기류,비상,아시아투데이,정석만,회장,조양호,한진그룹,세상,2...",조양호 회장 한진그룹 난기류 비상 아시아투데이 정석만 세상 2년 별세 경영 분쟁 감...,25
20498,TRAIN_20498,스포츠:골프,4승 고지 누가 먼저 오르나 ‘트로이카’ 박현경 박지영 이예원 격돌,"고지,트로이카,박현경,박지영,이예원,격돌,출격,개막,KG,레이디스,오픈,동반,대회,...",고지 트로이카 박현경 박지영 이예원 격돌 출격 개막 KG 레이디스 오픈 동반 대회 ...,26
...,...,...,...,...,...,...
47885,TRAIN_47885,경제:경제일반,강서구 보험회사 5명 등 서울 어제 37명 확진,"5명,강서구,보험,회사,서울,확진,서울,신종,코로나바이러스,감염증,코로나19,확진자...",5명 강서구 보험 회사 서울 확진 신종 코로나바이러스 감염증 코로나19 확진자 발생...,16
14003,TRAIN_14003,경제:반도체,"이상일 용인시장, 반도체 국가산단 적극협조...부지 내 이주대책 등 협조 필요","이상일,용인,시장,적극협조,반도체,국가산단,적극,협조,부지,이주대책,협조,회의서,산...",이상일 용인 시장 적극협조 반도체 국가산단 적극 협조 부지 이주대책 회의서 산업 주...,33
20857,TRAIN_20857,경제:부동산,"더퍼스트한양, '더챔버 라티파니' 내달 공급","더퍼스트한양,라티파니,내달,공급,아시아투데이,이민영,더퍼스트한양,동탄2,경기도,화성...",더퍼스트한양 라티파니 내달 공급 아시아투데이 이민영 동탄2 경기도 화성시 동탄 신도...,10
48642,TRAIN_48642,사회:사건_사고,남경필 전 지사 장남 또 필로폰 투약 구속영장 기각 닷새만,"남경필,지사,장남,필로폰,투약,구속영장,기각,닷새,필로폰,투약,혐의,체포,법원,구속...",남경필 지사 장남 필로폰 투약 구속영장 기각 닷새 혐의 체포 법원 구속 영장 경기도...,21


In [27]:
walker = pyg.walk(train_df[train_df['분류'] != '지역'])

Box(children=(HTML(value='\n<div id="ifr-pyg-00062303e9dc7117Smzp4KeUa50RDP1b" style="height: auto">\n    <hea…

In [18]:
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

# 옵티마이저 및 학습 파라미터 설정
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate, weight_decay=CFG.weight_decay)

# Calculate total number of training steps
total_steps = len(train_loader) * CFG.epoch

# Create the learning rate scheduler
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),  # 10% of total steps for warmup
    num_training_steps=total_steps
)

In [None]:
import wandb
import random

# start a new wandb run to track this script
w_run = wandb.init(
    project="dacon-gbt-2024-hackerton",
    config=CFG
)

# 학습
model.train()

# Early stopping parameters
patience = 3
best_val_f1 = 0
counter = 0

for epoch in range(CFG.epoch):
    total_loss = 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{CFG.epoch}'):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate
    
    avg_train_loss = total_loss / len(train_loader)

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc='Validating'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)
            val_predictions.extend(preds.cpu().tolist())
            val_true_labels.extend(labels.cpu().tolist())
    
    # 검증 결과 출력
    val_f1 = f1_score(val_true_labels, val_predictions, average='macro')

    # Log metrics to wandb
    wandb.log({
        "epoch": epoch + 1,
        "train_loss": avg_train_loss,
        "val_f1_score": val_f1,
        "learning_rate": scheduler.get_last_lr()[0],  # Log the current learning rate
        "best_val_f1": best_val_f1
    })

    print(f"Epoch {epoch + 1}/{CFG.epoch}")
    print(f"Average Training Loss: {avg_train_loss:.4f}")
    print(f"Validation F1 Score: {val_f1:.4f}")
    print(f"Best Validation F1 Score: {best_val_f1:.4f}")
    print(f"Current learning rate: {scheduler.get_last_lr()[0]}")
    print("-" * 50)

     # Early stopping check
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        counter = 0
        # Save the best model
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        counter += 1
    
    # Early stopping
    if counter >= patience:
        print(f"Early stopping triggered after {epoch + 1} epochs")
        break


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mryanbae[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 1/30:   0%|          | 0/683 [00:00<?, ?it/s]Attention type 'block_sparse' is not possible if sequence_length: 256 <= num global tokens: 2 * config.block_size + min. num sliding tokens: 3 * config.block_size + config.num_random_blocks * config.block_size + additional buffer: config.num_random_blocks * config.block_size = 704 with config.block_size = 64, config.num_random_blocks = 3. Changing attention type to 'original_full'...
Epoch 1/30: 100%|██████████| 683/683 [04:51<00:00,  2.34it/s]
Validating: 100%|██████████| 171/171 [00:24<00:00,  6.93it/s]


Epoch 1/30
Average Training Loss: 2.2011
Validation F1 Score: 0.2198
Best Validation F1 Score: 0.0000
Current learning rate: 3.3333333333333335e-05
--------------------------------------------------


Epoch 2/30: 100%|██████████| 683/683 [04:35<00:00,  2.47it/s]
Validating: 100%|██████████| 171/171 [00:24<00:00,  6.94it/s]


Epoch 2/30
Average Training Loss: 0.9357
Validation F1 Score: 0.4505
Best Validation F1 Score: 0.2198
Current learning rate: 6.666666666666667e-05
--------------------------------------------------


Epoch 3/30:  76%|███████▋  | 522/683 [03:30<01:05,  2.48it/s]

In [None]:
wandb.finish()

In [12]:
# 테스트 세트 추론
model.eval()
test_predictions = []
with torch.no_grad():
    for batch in tqdm(test_loader, desc='Testing'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        test_predictions.extend(preds.cpu().tolist())

# 라벨 디코딩
label_decoder = {i: label for label, i in label_encoder.items()}
decoded_predictions = [label_decoder[pred] for pred in test_predictions]

Testing: 100%|██████████| 366/366 [00:36<00:00,  9.99it/s]


In [15]:
w_run.name

'graceful-tree-15'

In [13]:
sample_submission = pd.read_csv(os.path.join(root_path, "./sample_submission.csv"))
sample_submission["분류"] = decoded_predictions
submission_filepath = f"./{w_run.name}.csv"
sample_submission.to_csv(submission_filepath, encoding='UTF-8-sig', index=False)

In [14]:
from dacon_submit_api import dacon_submit_api 

dacon_token = 'd5ea5ec7e519de6412291ab093463dc54315baa080104aeee57ae0ab51543149'
result = dacon_submit_api.post_submission_file(
submission_filepath, 
dacon_token, 
'236372', 
'김밥조아', 
'submission 메모 내용' )

{'isSubmitted': True, 'detail': 'Success'}
