In [None]:
import nltk
# Download the WordNet corpus. In the visible part of this notebook, `wordnet`
# is only referenced by the commented-out synonym-replacement cell.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# ### 1. KcELECTRA model code using data augmentation
#
# NOTE(review): this whole cell is commented out. As written, the rename on the
# `train_df_augmented` line leaves two columns named 'comments' (the original
# one plus the renamed 'comments_augmented'), so selecting [['comments', 'label']]
# yields duplicate column labels -- the likely cause of an InvalidIndexError
# in the pd.concat call. Select ['comments_augmented', 'label'] first and
# rename afterwards if this cell is ever re-enabled.
# NOTE(review): wordnet.synsets() presumably looks up English lemmas by default,
# so Korean tokens would pass through unchanged -- confirm before relying on it.

# import pandas as pd
# import random
# from nltk.corpus import wordnet
# from sklearn.model_selection import train_test_split
# from transformers import ElectraForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
# import torch

# # Define the text augmentation function
# def synonym_replacement(sentence):
#     words = sentence.split()
#     new_sentence = []
#     for word in words:
#         synonyms = wordnet.synsets(word)
#         if synonyms:
#             new_word = synonyms[0].lemmas()[0].name()
#             new_sentence.append(new_word if new_word != word else word)
#         else:
#             new_sentence.append(word)
#     return ' '.join(new_sentence)

# # Augment the training data
# train_df = pd.read_csv('train.hate.csv')
# dev_df = pd.read_csv('dev.hate.csv')

# # Augment the training data, then reset the index
# train_df['comments_augmented'] = train_df['comments'].apply(synonym_replacement)
# train_df_augmented = train_df.rename(columns={'comments_augmented': 'comments'})[['comments', 'label']].reset_index(drop=True)

# # Reset the index of the original dataframe and drop redundant columns
# train_df = train_df[['comments', 'label']].reset_index(drop=True)

# # Make sure there are no duplicated columns when concatenating the dataframes
# train_df = pd.concat([train_df, train_df_augmented], axis=0, ignore_index=True)




InvalidIndexError: Reindexing only valid with uniquely valued Index objects

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np

# Load the training, validation, and (unlabeled) test splits.
train_df = pd.read_csv('train.hate.csv')  # training data
dev_df = pd.read_csv('dev.hate.csv')      # validation data
test_df = pd.read_csv('test.hate.no_label.csv')    # test data (no labels)

# Encode the string labels as class ids: 'no' -> 0, 'offensive' -> 1, 'hate' -> 2.
# Any label that is not a key of the mapping becomes NaN here.
label_mapping = {'no': 0, 'offensive': 1, 'hate': 2}
train_df['label'] = train_df['label'].map(label_mapping)
dev_df['label'] = dev_df['label'].map(label_mapping)

# Drop rows with a missing comment or a missing/unmapped label, then cast the
# label column back to int: whenever .map() produced a NaN the column is
# float64, and float class ids are not valid classification targets.
# astype() is chained (instead of assigning the column afterwards) so the
# filtered frame is never mutated in place.
train_df = train_df.dropna(subset=['label', 'comments']).astype({'label': int})
dev_df = dev_df.dropna(subset=['label', 'comments']).astype({'label': int})

# Fail fast if filtering left nothing to train or validate on.
if train_df.empty:
    raise ValueError("학습 데이터셋이 비어 있습니다. 데이터 파일의 내용을 확인하세요.")
if dev_df.empty:
    raise ValueError("검증 데이터셋이 비어 있습니다. 데이터 파일의 내용을 확인하세요.")

In [None]:
from transformers import ElectraForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, get_linear_schedule_with_warmup
import torch

# Training configuration with warmup and weight decay.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    eval_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the eval
    # strategy; leaving the save strategy at its step-based default raises
    # a ValueError when the arguments are constructed.
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True
)

# Stand-alone optimizer and linear warmup scheduler, built from the same
# values as training_args so the two cannot drift apart.
# NOTE(review): `model` must already exist when this cell runs.
# NOTE(review): none of the Trainer(...) calls visible in this notebook pass
# these objects via `optimizers=`, so they appear to be unused -- confirm.
# NOTE(review): num_training_steps=1000 is hard-coded; it presumably should be
# derived from the dataset size, batch size and epoch count -- confirm.
optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=training_args.warmup_steps,
    num_training_steps=1000,
)

ValueError: --load_best_model_at_end requires the save and eval strategy to match, but found
- Evaluation strategy: epoch
- Save strategy: steps

In [None]:
from transformers import TrainerCallback

# Early-stopping callback
class EarlyStoppingCallback(TrainerCallback):
    """Stop training once `eval_loss` has not improved for `patience` evaluations.

    NOTE(review): transformers ships its own callback with this same name;
    this local definition shadows it if both are brought into scope.

    Args:
        patience: number of consecutive non-improving evaluations tolerated
            before `control.should_training_stop` is set.
    """

    def __init__(self, patience=3):
        self.patience = patience
        self.best_score = None   # lowest eval_loss seen so far
        self.early_stop = False  # set to True once the stop has been requested
        self.counter = 0         # consecutive evaluations without improvement

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Update the patience counter after an evaluation.

        The loss is read from the `metrics` argument when it is provided and
        otherwise from the most recent entry of `state.log_history`. If neither
        contains `eval_loss`, the evaluation is skipped instead of raising
        KeyError/IndexError.
        """
        eval_metric = (metrics or {}).get('eval_loss')
        if eval_metric is None and state.log_history:
            eval_metric = state.log_history[-1].get('eval_loss')
        if eval_metric is None:
            return

        if self.best_score is None or eval_metric < self.best_score:
            # New best loss: remember it and restart the patience window.
            self.best_score = eval_metric
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                control.should_training_stop = True

# Build a Trainer for the current model with the early-stopping callback
# (patience of 3 evaluations) attached.
stop_callbacks = [EarlyStoppingCallback(patience=3)]
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=stop_callbacks,
)

In [None]:
from transformers import ElectraForSequenceClassification
from tqdm import tqdm
import torch

# Train several models, then combine their predictions.
models = [ElectraForSequenceClassification.from_pretrained("beomi/KcELECTRA-base", num_labels=3) for _ in range(3)]
trainers = []

# Wrap the list itself (not enumerate(...)) so tqdm knows the total and can
# render a real progress bar.
# NOTE: the loop variable rebinds the notebook-level name `model`.
for i, model in enumerate(tqdm(models, desc="ensemble members")):
    training_args = TrainingArguments(
        output_dir=f'./results_{i}',
        num_train_epochs=3,
        per_device_train_batch_size=16,
        learning_rate=5e-5,
        eval_strategy="epoch",
        # load_best_model_at_end requires the save strategy to match the eval
        # strategy; the step-based default raises a ValueError.
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        # Give every member its own seed: with a shared seed all members would
        # be trained with the same RNG state, which reduces ensemble diversity.
        seed=42 + i,
    )
    trainers.append(Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    ))
    trainers[-1].train()

# Combine the members by averaging their raw prediction scores, then take the
# arg-max class per example.
# NOTE(review): `test_dataset` must already be defined when this cell runs; the
# only assignment visible in this notebook is in a later cell -- confirm order.
final_predictions = sum(trainer.predict(test_dataset).predictions for trainer in trainers) / len(trainers)
final_preds = torch.argmax(torch.tensor(final_predictions), axis=1)

In [None]:
from transformers import ElectraConfig, ElectraForSequenceClassification

# Customize the model configuration.
# num_labels is set on the config itself: when an explicit `config=` object is
# handed to from_pretrained, extra keyword arguments are forwarded to the model
# constructor rather than the config, so `num_labels=3` there is rejected.
config = ElectraConfig.from_pretrained("beomi/KcELECTRA-base", num_labels=3)
config.hidden_dropout_prob = 0.3  # raise the dropout rate
model = ElectraForSequenceClassification.from_pretrained("beomi/KcELECTRA-base", config=config)

In [None]:
# Training configuration for the final run (higher weight decay).
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.1,  # higher weight decay
    eval_strategy="epoch",
    # load_best_model_at_end requires the save strategy to match the eval
    # strategy; the step-based default raises a ValueError.
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,
    load_best_model_at_end=True
)

# Train the model.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

In [None]:
# 모델 훈련 후 평가 지표 계산 및 혼동 행렬 생성
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Predict on the validation set and take the arg-max class per example.
predictions = trainer.predict(val_dataset)
y_preds = torch.argmax(torch.tensor(predictions.predictions), axis=1)
y_true = val_labels.numpy()

# Print the per-class metrics. The label ids are pinned explicitly so the three
# target names always line up with ids 0/1/2, even if a class is absent from
# both y_true and y_preds (otherwise the name count would not match and the
# report would fail).
print(classification_report(
    y_true,
    y_preds,
    labels=[0, 1, 2],
    target_names=['no', 'offensive', 'hate'],
))

In [None]:
# Draw the confusion matrix.
class_names = ['no', 'offensive', 'hate']
# Pin the label ids so the matrix is always 3x3 and row/column k corresponds to
# class_names[k], even when a class is absent from y_true and y_preds.
conf_matrix = confusion_matrix(y_true, y_preds, labels=[0, 1, 2])

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Save the model and the tokenizer into the same directory (model first).
for artifact in (model, tokenizer):
    artifact.save_pretrained('./saved_model_v4')

In [None]:
# Predict on the test data and save the result as CSV.
test_encodings = encode_data(test_df)
test_dataset = CustomDataset(test_encodings)
test_predictions = trainer.predict(test_dataset)
test_logits = torch.tensor(test_predictions.predictions)
test_preds = test_logits.argmax(dim=1)

# Attach the predicted class ids to a copy of the test frame and write it out.
output_df = test_df.assign(predicted_label=test_preds.numpy())
output_df.to_csv('test.hate.predicted_KcELECTRA_v4.csv', index=False)
print("테스트 데이터의 예측 결과가 test.hate.predicted_KcELECTRA_v4.csv 파일에 저장되었습니다.")


In [None]:
# Colab에서 모델 압축 및 다운로드
import shutil
from google.colab import files

# Zip the saved-model directory, then hand the archive to the Colab download helper.
shutil.make_archive(
    base_name='saved_model_v4',
    format='zip',
    root_dir='./saved_model_v4',
)
files.download('saved_model_v4.zip')