In [10]:
from transformers import ElectraTokenizer, ElectraForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
import torch
from datasets import Dataset
import numpy as np
import pandas as pd
import os
from sklearn.metrics import accuracy_score, f1_score


RANDOM_SEED = 42
torch.manual_seed(RANDOM_SEED)
DATA_DIR_PATH = 'data'
TRAIN_PATH = os.path.join(DATA_DIR_PATH, 'train.csv')
TEST_PATH = os.path.join(DATA_DIR_PATH, 'test.csv')
DEV_PATH = os.path.join(DATA_DIR_PATH, 'dev.csv')
print(TRAIN_PATH, TEST_PATH, DEV_PATH)

data/train.csv data/test.csv data/dev.csv


In [11]:
device = torch.device('cuda')
print(device)

cuda


In [12]:
train_df = pd.read_csv(TRAIN_PATH)
dev_df = pd.read_csv(DEV_PATH)
test_df = pd.read_csv(TEST_PATH)

In [13]:
ts1 = train_df[['sentence_1', 'source']].rename(columns={'sentence_1': 'sentence'})
ts2 = train_df[['sentence_2', 'source']].rename(columns={'sentence_2': 'sentence'})
ds1 = dev_df[['sentence_1', 'source']].rename(columns={'sentence_1': 'sentence'})
ds2 = dev_df[['sentence_2', 'source']].rename(columns={'sentence_2': 'sentence'})
train = pd.concat([ts1, ts2, ds1, ds2], ignore_index=True)

fs1 = test_df[['sentence_1', 'source']].rename(columns={'sentence_1': 'sentence'})
fs2 = test_df[['sentence_2', 'source']].rename(columns={'sentence_2': 'sentence'})
test = pd.concat([fs1, fs2], ignore_index=True)

In [14]:
train['source'] = train['source'].str.replace('-rtt', '')
train['source'] = train['source'].str.replace('-sampled', '')

test['source'] = test['source'].str.replace('-rtt', '')
test['source'] = test['source'].str.replace('-sampled', '')

In [15]:
train

Unnamed: 0,sentence,source
0,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~,nsmc
1,앗 제가 접근권한이 없다고 뜹니다;;,slack
2,주택청약조건 변경해주세요.,petition
3,입사후 처음 대면으로 만나 반가웠습니다.,slack
4,뿌듯뿌듯 하네요!!,slack
...,...,...
19743,정말 가슴을 따뜻하게 한 좋은 드라마...,nsmc
19744,(비타민을 먹는 장면),slack
19745,무슨의미로 만들었는지 모르겠음..,nsmc
19746,"(예: 주말에는 개인캘린더만 보고, 업무시간에는 업무캘린더만 보기)",slack


In [16]:
test

Unnamed: 0,sentence,source
0,가상화폐거래소 폐쇄하지 말고,petition
1,뇌물적폐1호 640만달라 70억 뇌물받은 권양숙 구속하고 재산을 몰수하라,petition
2,기무사 영관급의 하극상 정말 이대로 방관하는게 민주주의 인지요,petition
3,화까지가 한계였다.,nsmc
4,왜 혼자 있지.. ㅠㅠ,slack
...,...,...
2195,오늘 슬랙의 채널 및 사용 권한에 대한 변경이 있을 예정입니다!,slack
2196,청소년보호법 폐지 청원서,petition
2197,"친애하는 문 대통령님, 디젤차량 단속이 매연을 내는 트럭으로 제한되기를 간절히 기도...",petition
2198,요즘 재미가 사라졌다...,nsmc


In [17]:
label_encoder = LabelEncoder()
train['source_encoded'] = label_encoder.fit_transform(train['source'])
test['source_encoded'] = label_encoder.transform(test['source'])

In [18]:
tokenizer = ElectraTokenizer.from_pretrained('monologg/koelectra-base-v3-discriminator')
model = ElectraForSequenceClassification.from_pretrained('monologg/koelectra-base-v3-discriminator', num_labels=len(label_encoder.classes_))
model.to(device)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(35000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [19]:
def tokenize_function(examples):
    return tokenizer(examples['sentence'], padding='max_length', truncation=True)

In [20]:
for param in model.parameters():
    if not param.is_contiguous():
        param.data = param.contiguous()

# kFold 설정
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# kFold 교차 검증
fold_accuracies = []
for fold, (train_idx, val_idx) in enumerate(kf.split(train)):
    # Train/Val 데이터를 분리
    train_data = train.iloc[train_idx]
    val_data = train.iloc[val_idx]

    # Hugging Face Datasets 형식으로 변환
    train_dataset = Dataset.from_pandas(train_data)
    val_dataset = Dataset.from_pandas(val_data)

    # 토큰화 적용
    train_dataset = train_dataset.map(tokenize_function, batched=True)
    val_dataset = val_dataset.map(tokenize_function, batched=True)

    # 필요한 열만 선택 및 라벨을 'labels'로 설정
    train_dataset = train_dataset.map(lambda examples: {'labels': examples['source_encoded']})
    val_dataset = val_dataset.map(lambda examples: {'labels': examples['source_encoded']})

    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

    # 학습 파라미터 설정
    training_args = TrainingArguments(
        output_dir=f'./results_{fold}',  # 각 폴드별로 결과 저장
        evaluation_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,  # 에폭 수
        logging_dir=f'./logs_{fold}',
        report_to="none",  # 콘솔에만 출력하도록 설정
        fp16=True,  # 혼합 정밀도(16-bit floating point) 사용 (CUDA에서 성능 향상)
        save_steps=1000,  # 체크포인트 저장 빈도
        save_total_limit=2  # 저장할 체크포인트의 최대 수
    )

    # 트레이너 설정
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer
    )

    # 모델 학습
    print(f"Starting training for fold {fold+1}...")
    trainer.train()

    # 폴드별 평가 및 정확도 기록
    metrics = trainer.evaluate()
    fold_accuracy = metrics['eval_loss']  # eval_loss를 대신 사용하거나 eval_accuracy가 있으면 그 값을 사용
    fold_accuracies.append(fold_accuracy)
    print(f"Fold {fold+1} evaluation: {metrics}")

# 각 폴드의 평균 정확도 계산
average_accuracy = np.mean(fold_accuracies)
print(f'Average kFold Accuracy: {average_accuracy:.4f}')


Map: 100%|██████████| 15798/15798 [00:05<00:00, 2716.62 examples/s]
Map: 100%|██████████| 3950/3950 [00:01<00:00, 2817.94 examples/s]
Map: 100%|██████████| 15798/15798 [00:01<00:00, 10181.05 examples/s]
Map: 100%|██████████| 3950/3950 [00:00<00:00, 11134.25 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting training for fold 1...


Epoch,Training Loss,Validation Loss
1,0.3698,0.216726
2,0.1219,0.173212
3,0.0478,0.186228


Fold 1 evaluation: {'eval_loss': 0.18622753024101257, 'eval_runtime': 15.585, 'eval_samples_per_second': 253.449, 'eval_steps_per_second': 15.849, 'epoch': 3.0}


Map: 100%|██████████| 15798/15798 [00:05<00:00, 2699.81 examples/s]
Map: 100%|██████████| 3950/3950 [00:01<00:00, 2770.86 examples/s]
Map: 100%|██████████| 15798/15798 [00:01<00:00, 10236.76 examples/s]
Map: 100%|██████████| 3950/3950 [00:00<00:00, 11222.40 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting training for fold 2...


Epoch,Training Loss,Validation Loss
1,0.1272,0.065006
2,0.0549,0.052049
3,0.0178,0.054223


Fold 2 evaluation: {'eval_loss': 0.05422252416610718, 'eval_runtime': 15.5869, 'eval_samples_per_second': 253.418, 'eval_steps_per_second': 15.847, 'epoch': 3.0}


Map: 100%|██████████| 15798/15798 [00:05<00:00, 2787.65 examples/s]
Map: 100%|██████████| 3950/3950 [00:01<00:00, 2767.64 examples/s]
Map: 100%|██████████| 15798/15798 [00:01<00:00, 8760.87 examples/s] 
Map: 100%|██████████| 3950/3950 [00:00<00:00, 11005.35 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting training for fold 3...


Epoch,Training Loss,Validation Loss
1,0.0681,0.034038
2,0.0298,0.042841
3,0.0075,0.035648


Fold 3 evaluation: {'eval_loss': 0.03564784303307533, 'eval_runtime': 15.5986, 'eval_samples_per_second': 253.227, 'eval_steps_per_second': 15.835, 'epoch': 3.0}


Map: 100%|██████████| 15799/15799 [00:05<00:00, 2786.83 examples/s]
Map: 100%|██████████| 3949/3949 [00:01<00:00, 2782.45 examples/s]
Map: 100%|██████████| 15799/15799 [00:01<00:00, 8936.74 examples/s] 
Map: 100%|██████████| 3949/3949 [00:00<00:00, 11266.61 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting training for fold 4...


Epoch,Training Loss,Validation Loss
1,0.0533,0.013665
2,0.0208,0.005606
3,0.0097,0.006182


Fold 4 evaluation: {'eval_loss': 0.006181574426591396, 'eval_runtime': 15.5963, 'eval_samples_per_second': 253.202, 'eval_steps_per_second': 15.837, 'epoch': 3.0}


Map: 100%|██████████| 15799/15799 [00:05<00:00, 2759.86 examples/s]
Map: 100%|██████████| 3949/3949 [00:01<00:00, 2767.11 examples/s]
Map: 100%|██████████| 15799/15799 [00:01<00:00, 9015.72 examples/s] 
Map: 100%|██████████| 3949/3949 [00:00<00:00, 11078.23 examples/s]
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Starting training for fold 5...


Epoch,Training Loss,Validation Loss
1,0.0494,0.01415
2,0.0103,0.007316
3,0.0072,0.005027


Fold 5 evaluation: {'eval_loss': 0.005026908591389656, 'eval_runtime': 15.6005, 'eval_samples_per_second': 253.132, 'eval_steps_per_second': 15.833, 'epoch': 3.0}
Average kFold Accuracy: 0.0575


In [21]:
# 최종 테스트 데이터 평가
test_sentence_tokenized = tokenizer(list(test['sentence']), padding=True, truncation=True, return_tensors='pt').to(device)

with torch.no_grad():
    test_outputs = model(input_ids=test_sentence_tokenized['input_ids'], attention_mask=test_sentence_tokenized['attention_mask'])
test_predictions = torch.argmax(test_outputs.logits, dim=-1)

# 실제 라벨과 예측 라벨로 Accuracy 및 F1 Score 계산
true_labels = test['source_encoded'].values
predictions = test_predictions.cpu().numpy()

# 최종 테스트 데이터에 대한 성능 평가
test_accuracy = accuracy_score(true_labels, predictions)
test_f1 = f1_score(true_labels, predictions, average='weighted')
print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test F1 Score: {test_f1:.4f}")

Test Accuracy: 0.9559
Test F1 Score: 0.9559


In [22]:
model_save_path = "saved/text_classifier_experiment_2.pt"
torch.save(model.state_dict(), model_save_path)

In [23]:
example = "제발 좀 돌아가라"
token = tokenizer(example, padding=True, truncation=True, return_tensors='pt').to(device)

with torch.no_grad():
    example_predict_vector = model(input_ids=token['input_ids'], attention_mask=token['attention_mask'])
example_predict = torch.argmax(example_predict_vector.logits, dim=-1)
result = label_encoder.inverse_transform(example_predict.cpu())
print(result)

['petition']
