In [1]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import Dataset
import numpy as np
import pandas as pd
import os

# Reproducibility: seed PyTorch's global RNG with a fixed value.
RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)

# Dataset locations: the three CSV splits live side by side under DATA_DIR_PATH.
DATA_DIR_PATH = 'data'
TRAIN_PATH, TEST_PATH, DEV_PATH = (
    os.path.join(DATA_DIR_PATH, csv_name)
    for csv_name in ('train.csv', 'test.csv', 'dev.csv')
)
print(TRAIN_PATH, TEST_PATH, DEV_PATH)

  from .autonotebook import tqdm as notebook_tqdm


data/train.csv data/test.csv data/dev.csv


In [2]:
# Use the CPU as the torch device for this notebook and echo the choice so it
# is visible in the cell output.
device = torch.device('cpu')
print(device)

cpu


In [3]:
# Read the three CSV splits into DataFrames, in the order train -> dev -> test.
train_df, dev_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_PATH, DEV_PATH, TEST_PATH)
)

In [4]:
def stack_sentence_pairs(pairs_df):
    """Turn a sentence-pair frame into a long frame with one sentence per row.

    Args:
        pairs_df: DataFrame with `sentence_1`, `sentence_2` and `source` columns.

    Returns:
        A new DataFrame with columns `sentence` and `source`: all `sentence_1`
        rows first, followed by all `sentence_2` rows, with a fresh RangeIndex.
    """
    first = pairs_df[['sentence_1', 'source']].rename(columns={'sentence_1': 'sentence'})
    second = pairs_df[['sentence_2', 'source']].rename(columns={'sentence_2': 'sentence'})
    return pd.concat([first, second], ignore_index=True)


# The train and dev splits are pooled into one training frame (train rows
# first, then dev rows); the test split is kept separate.
train = pd.concat(
    [stack_sentence_pairs(train_df), stack_sentence_pairs(dev_df)],
    ignore_index=True,
)
test = stack_sentence_pairs(test_df)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None None" line.
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19748 entries, 0 to 19747
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  19748 non-null  object
 1   source    19748 non-null  object
dtypes: object(2)
memory usage: 308.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  2200 non-null   object
 1   source    2200 non-null   object
dtypes: object(2)
memory usage: 34.5+ KB
None None


In [5]:
# Encode the string `source` labels as integer class ids.
label_encoder = LabelEncoder()
train['source_encoded'] = label_encoder.fit_transform(train['source'])
# Reuse the mapping learned from the training data. Calling fit_transform here
# would refit the encoder on the test labels, which can assign different ids to
# the same class and overwrites `label_encoder.classes_` (read later to size
# the classifier head). transform() raises ValueError on a label that was not
# seen during fitting, which surfaces a train/test mismatch instead of hiding it.
test['source_encoded'] = label_encoder.transform(test['source'])

In [6]:
# Load the pretrained DistilBERT tokenizer and a sequence-classification model
# whose output size matches the number of classes known to the label encoder.
pretrained_name = 'distilbert-base-uncased'
num_source_labels = len(label_encoder.classes_)

tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)
model = DistilBertForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=num_source_labels,
)
# Left as the cell's last expression so the module summary is still displayed.
model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [7]:
def tokenize_function(examples):
    """Tokenize the `sentence` field of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a `sentence` entry (a batch of sentences when
            used with `Dataset.map(..., batched=True)`).

    Returns:
        Whatever the tokenizer returns for those sentences, requested with
        truncation enabled and `padding='max_length'`.
    """
    sentences = examples['sentence']
    encoded = tokenizer(sentences, truncation=True, padding='max_length')
    return encoded

In [8]:
# K-fold setup: 5 shuffled folds with a fixed seed so the splits are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)


def init_fold_model():
    """Return a freshly loaded pretrained classifier for one fold.

    Passed to `Trainer(model_init=...)` so that every fold starts from the
    pretrained checkpoint instead of continuing from the previous fold's
    weights, which had already been trained on the current fold's validation
    rows.
    """
    return DistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=len(label_encoder.classes_),
    )


def compute_accuracy(eval_pred):
    """Compute classification accuracy for `Trainer` evaluation.

    Args:
        eval_pred: Object exposing `predictions` (logits, or a tuple whose
            first element is the logits) and `label_ids`.

    Returns:
        `{'accuracy': float}`; the Trainer reports it as `eval_accuracy`.
    """
    logits = eval_pred.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    predicted = np.argmax(logits, axis=-1)
    return {'accuracy': float(np.mean(predicted == eval_pred.label_ids))}


def build_fold_dataset(frame):
    """Convert a DataFrame slice into a tokenized, torch-formatted Dataset.

    Args:
        frame: DataFrame with `sentence` and `source_encoded` columns.

    Returns:
        A `datasets.Dataset` exposing `input_ids`, `attention_mask` and
        `labels` as torch tensors.
    """
    # Convert to the Hugging Face Datasets format and tokenize.
    dataset = Dataset.from_pandas(frame)
    dataset = dataset.map(tokenize_function, batched=True)
    # Expose the encoded source under the `labels` name and keep only the
    # columns needed for training.
    dataset = dataset.map(lambda examples: {'labels': examples['source_encoded']}, batched=True)
    dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    return dataset


# K-fold cross-validation
fold_accuracies = []
for fold, (train_idx, val_idx) in enumerate(kf.split(train)):
    # Split into train/validation data for this fold.
    train_dataset = build_fold_dataset(train.iloc[train_idx])
    val_dataset = build_fold_dataset(train.iloc[val_idx])

    # Training hyperparameters
    training_args = TrainingArguments(
        output_dir=f'./results_{fold}',  # results are stored per fold
        evaluation_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,  # number of epochs
        logging_dir=f'./logs_{fold}',
        report_to="none",  # no external reporting; output stays in the console
        seed=RANDOM_SEED,
    )

    # Trainer setup: `model_init` gives each fold its own fresh model, and
    # `compute_metrics` makes `evaluate()` report `eval_accuracy`.
    trainer = Trainer(
        model_init=init_fold_model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_accuracy,
    )

    # Train the model.
    print(f"Starting training for fold {fold+1}...")
    trainer.train()

    # Evaluate this fold and record its accuracy (previously eval_loss was
    # stored here and then reported as accuracy).
    metrics = trainer.evaluate()
    fold_accuracy = metrics['eval_accuracy']
    fold_accuracies.append(fold_accuracy)
    print(f"Fold {fold+1} evaluation: {metrics}")

    # Keep `model` pointing at the most recently trained fold model so code
    # that reads `model` after this cell still gets a trained network.
    model = trainer.model

# Mean accuracy across folds
average_accuracy = np.mean(fold_accuracies)
print(f'Average kFold Accuracy: {average_accuracy:.4f}')


Map: 100%|██████████| 15798/15798 [00:02<00:00, 6799.19 examples/s]
Map: 100%|██████████| 3950/3950 [00:00<00:00, 6838.28 examples/s]
Map: 100%|██████████| 15798/15798 [00:00<00:00, 26944.25 examples/s]
Map: 100%|██████████| 3950/3950 [00:00<00:00, 32937.57 examples/s]


Starting training for fold 1...


  9%|▉         | 266/2964 [02:29<27:35,  1.63it/s]