In [1]:
from transformers import AutoTokenizer, \
    AutoModelForSequenceClassification, \
    Trainer, \
    TrainingArguments
from torchmetrics.functional import pearson_corrcoef
import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import AdamW, get_scheduler, EarlyStoppingCallback
from torch.optim import AdamW


TRAIN_PATH = 'data/train_after_hanspell (1).csv'
DEV_PATH = 'data/dev_after_hanspell.csv'
TEST_PATH = 'data/test_after_hanspell.csv'

RANDOM_SEED = 42
OUTPUT_DIR = 'saved/'
LEARNING_RATE = 2e-5
TRAIN_BATCH_SIZE = 16
EVAL_BATCH_SIZE = 16
EPOCH = 50
WEIGHT_DECAY = .01
DROPOUT = .4
EARLY_STOP_PATIENCE = 5

MODEL_NAME = "monologg/koelectra-base-v3-discriminator"
EXPERIMENT = "7"

NSMC = "model_nsmc_'"
PETITION = "model_petition_"
SLACK = "model_slack_"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
train_df = pd.read_csv(TRAIN_PATH)
dev = pd.read_csv(DEV_PATH)

train_nsmc = train_df[train_df['source'].str.contains('nsmc')][['sentence_1', 'sentence_2', 'label']].reset_index(drop=True)
dev_nsmc = dev[dev['source'].str.contains('nsmc')][['sentence_1', 'sentence_2', 'label']].reset_index(drop=True)

train_petition = train_df[train_df['source'].str.contains('petition')][['sentence_1', 'sentence_2', 'label']].reset_index(drop=True)
dev_petition = dev[dev['source'].str.contains('petition')][['sentence_1', 'sentence_2', 'label']].reset_index(drop=True)

train_slack = train_df[train_df['source'].str.contains('slack')][['sentence_1', 'sentence_2', 'label']].reset_index(drop=True)
dev_slack = dev[dev['source'].str.contains('slack')][['sentence_1', 'sentence_2', 'label']].reset_index(drop=True)

In [3]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

model_nsmc = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1).to('cuda')
model_petition = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1).to('cuda')
model_slack = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1).to('cuda')

  return self.fget.__get__(instance, owner)()
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classif

In [4]:
def preprocess_function(examples):
    return tokenizer(list(examples['sentence_1']), list(examples['sentence_2']), padding=True, truncation=True, return_tensors='pt')

train_nsmc_tokenized = preprocess_function(train_nsmc)
dev_nsmc_tokenized = preprocess_function(dev_nsmc)

train_petition_tokenized = preprocess_function(train_petition)
dev_petition_tokenized = preprocess_function(dev_petition)

train_slack_tokenized = preprocess_function(train_slack)
dev_slack_tokenized = preprocess_function(dev_slack)


In [5]:
class STSDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return item
    
    def __len__(self):
        if self.labels is not None:
            return len(self.labels)
        return len(self.encodings['input_ids'])

train_nsmc_dataset = STSDataset(train_nsmc_tokenized, train_nsmc['label'])
dev_nsmc_dataset = STSDataset(dev_nsmc_tokenized, dev_nsmc['label'])

train_petition_dataset = STSDataset(train_petition_tokenized, train_petition['label'])
dev_petition_dataset = STSDataset(dev_petition_tokenized, dev_petition['label'])

train_slack_dataset = STSDataset(train_slack_tokenized, train_slack['label'])
dev_slack_dataset = STSDataset(dev_slack_tokenized, dev_slack['label'])


In [6]:
# Pearson 상관계수 메트릭
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.flatten()
    return {
        'pearson': pearson_corrcoef(torch.tensor(predictions), torch.tensor(labels))
    }

In [7]:
def make_model_contiguous(model):
    for param in model.parameters():
        if not param.is_contiguous():
            param.data = param.data.contiguous()

make_model_contiguous(model_nsmc)
make_model_contiguous(model_petition)
make_model_contiguous(model_slack)

In [8]:
# 드롭아웃 설정
model_nsmc.config.hidden_dropout_prob = DROPOUT
model_petition.config.hidden_dropout_prob = DROPOUT
model_slack.config.hidden_dropout_prob = DROPOUT

# 학습률 스케줄러 설정
num_epochs = EPOCH  # 기본 50 에폭
num_training_steps = num_epochs * len(train_nsmc_dataset)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    learning_rate=2e-5,  # 작은 학습률
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="pearson",
    neftune_noise_alpha=.1
)

# Early Stopping 추가
callbacks = [EarlyStoppingCallback(early_stopping_patience=EARLY_STOP_PATIENCE)]

# 옵티마이저 설정
optimizer_grouped_parameters = [
    {'params': model_nsmc.electra.encoder.layer[:6].parameters(), 'lr': 5e-6},  # 하위 레이어
    {'params': model_nsmc.electra.encoder.layer[6:].parameters(), 'lr': 5e-5},  # 상위 레이어
    {'params': model_nsmc.classifier.parameters(), 'lr': 1e-4},  # 분류기
]

optimizer = AdamW(optimizer_grouped_parameters)

# 학습률 스케줄러
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# Trainer 설정
trainer_nsmc = Trainer(
    model=model_nsmc,
    args=training_args,
    train_dataset=train_nsmc_dataset,
    eval_dataset=dev_nsmc_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=callbacks
)

trainer_petition = Trainer(
    model=model_petition,
    args=training_args,
    train_dataset=train_petition_dataset,
    eval_dataset=dev_petition_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=callbacks
)

trainer_slack = Trainer(
    model=model_slack,
    args=training_args,
    train_dataset=train_slack_dataset,
    eval_dataset=dev_slack_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=callbacks
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [9]:
trainer_nsmc.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.459736,0.855473
2,No log,0.418716,0.885465
3,0.675600,0.437689,0.888974
4,0.675600,0.411852,0.891083
5,0.675600,0.512307,0.888553
6,0.139300,0.339333,0.903565
7,0.139300,0.358059,0.905617
8,0.083000,0.424063,0.907269
9,0.083000,0.374825,0.899885
10,0.083000,0.353909,0.908155


TrainOutput(global_step=3667, training_loss=0.14768052341699406, metrics={'train_runtime': 431.9092, 'train_samples_per_second': 357.019, 'train_steps_per_second': 22.343, 'total_flos': 3221936555230152.0, 'train_loss': 0.14768052341699406, 'epoch': 19.0})

In [10]:
trainer_petition.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.620749,0.892691
2,No log,0.595032,0.906887
3,0.751700,0.643494,0.902522
4,0.751700,0.484534,0.921084
5,0.751700,0.522161,0.914825
6,0.153300,0.481656,0.921665
7,0.153300,0.48943,0.919595
8,0.085600,0.467508,0.923625
9,0.085600,0.459163,0.926801
10,0.085600,0.517784,0.919899


TrainOutput(global_step=3383, training_loss=0.17340993641608923, metrics={'train_runtime': 364.0606, 'train_samples_per_second': 435.23, 'train_steps_per_second': 27.331, 'total_flos': 2685394722467046.0, 'train_loss': 0.17340993641608923, 'epoch': 17.0})

In [11]:
trainer_slack.train()

Epoch,Training Loss,Validation Loss,Pearson
1,No log,0.551725,0.858364
2,No log,0.510988,0.883483
3,0.662900,0.583866,0.876374
4,0.662900,0.43481,0.900843
5,0.662900,0.525682,0.896519
6,0.124500,0.588791,0.88692
7,0.124500,0.457265,0.895589
8,0.077600,0.503239,0.898003
9,0.077600,0.485601,0.901824
10,0.077600,0.451705,0.896654


TrainOutput(global_step=4608, training_loss=0.11611169804301527, metrics={'train_runtime': 489.5998, 'train_samples_per_second': 313.521, 'train_steps_per_second': 19.608, 'total_flos': 3521257637631840.0, 'train_loss': 0.11611169804301527, 'epoch': 24.0})

In [12]:
eval_result = trainer_nsmc.evaluate()
print(f"NSMC 데이터셋 평가 결과: {eval_result['eval_pearson']}")
trainer_nsmc.save_model(f"{OUTPUT_DIR}/nsmc_model_{EXPERIMENT}")

NSMC 데이터셋 평가 결과: 0.9107161164283752


In [13]:
eval_result = trainer_petition.evaluate()
print(f"NSMC 데이터셋 평가 결과: {eval_result['eval_pearson']}")
trainer_petition.save_model(f"{OUTPUT_DIR}/petition_model_{EXPERIMENT}")

NSMC 데이터셋 평가 결과: 0.9280089139938354


In [14]:
eval_result = trainer_slack.evaluate()
print(f"NSMC 데이터셋 평가 결과: {eval_result['eval_pearson']}")
trainer_slack.save_model(f"{OUTPUT_DIR}/slack_model_{EXPERIMENT}")

NSMC 데이터셋 평가 결과: 0.9079321026802063


In [15]:
test = pd.read_csv(TEST_PATH)
test_nsmc = test[test['source'].str.contains('nsmc')]
test_petition = test[test['source'].str.contains('petition')]
test_slack = test[test['source'].str.contains('slack')]
test.head()

Unnamed: 0,id,source,sentence_1,sentence_2
0,boostcamp-sts-v1-test-000,petition-sampled,가상화폐 거래소 폐쇄하지 말고,가상화폐 거래소 폐쇄 반대합니다
1,boostcamp-sts-v1-test-001,petition-sampled,뇌물적 폐 1호 640만 달라 70억 뇌물 받은 권양숙 구속하고 재산을 몰수하라,뇌물적 폐 원조 640만 달라 70억 뇌물 받은 권양숙 구속하고 재산을 몰수하세요
2,boostcamp-sts-v1-test-002,petition-rtt,기무사 영관급의 하극상 정말 이대로 방관하는 게 민주주의인지요,그냥 가만히 있는 게 진짜 민주주의인가요
3,boostcamp-sts-v1-test-003,nsmc-sampled,화까지가 한계였다,기대가 너무 컸다
4,boostcamp-sts-v1-test-004,slack-rtt,왜 혼자 있지,왜 혼자야


In [16]:
test_nsmc_encoded = preprocess_function(test_nsmc)
test_petition_encoded = preprocess_function(test_petition)
test_slack_encoded = preprocess_function(test_slack)

test_nsmc_encoded

{'input_ids': tensor([[    2,  3811,  4149,  ...,     0,     0,     0],
        [    2,  8308,  4070,  ...,     0,     0,     0],
        [    2, 20752, 22452,  ...,     0,     0,     0],
        ...,
        [    2, 15603,  4007,  ...,     0,     0,     0],
        [    2,  7226,  7504,  ...,     0,     0,     0],
        [    2, 10976,  4325,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [17]:
test_nsmc_dataset = STSDataset(test_nsmc_encoded)
test_petition_dataset = STSDataset(test_petition_encoded)
test_slack_dataset = STSDataset(test_slack_encoded)

nsmc_predictions = trainer_nsmc.predict(test_nsmc_dataset).predictions.flatten()
petition_predictions = trainer_petition.predict(test_petition_dataset).predictions.flatten()
slack_predictions = trainer_slack.predict(test_slack_dataset).predictions.flatten()

test_nsmc['target'] = nsmc_predictions
test_petition['target'] = petition_predictions
test_slack['target'] = slack_predictions

test_nsmc.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_nsmc['target'] = nsmc_predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_petition['target'] = petition_predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_slack['target'] = slack_predictions


Unnamed: 0,id,source,sentence_1,sentence_2,target
3,boostcamp-sts-v1-test-003,nsmc-sampled,화까지가 한계였다,기대가 너무 컸다,0.084887
5,boostcamp-sts-v1-test-005,nsmc-sampled,전개가 너무 부실하고 몰입력은 아예 기대할 수가 없다,전개가 황당해서 몰입할 수가 없었다,3.415882
7,boostcamp-sts-v1-test-007,nsmc-sampled,스튜어트 고든과 데니스 호퍼에게 정말 실망한 작품,무언가 힐링이든 감동이든 재미든 글 기대하며 봤지만 완벽히 실망한 영화,1.582404
8,boostcamp-sts-v1-test-008,nsmc-sampled,김래원 캐릭터는 전지전능,남주 여주 둘 다 캐릭터가 그다지 매력적이지가 않음,0.262238
10,boostcamp-sts-v1-test-010,nsmc-sampled,2008년에 봤지만 이제 개봉한다니 놀랍고 반갑네요,이게 더 씽이고 이제 연도에 개봉한 더 씨를 봐야겠다,1.707893


In [18]:
test['target'] = None
test.loc[test_nsmc.index, 'target'] = test_nsmc['target']
test.loc[test_petition.index, 'target'] = test_petition['target']
test.loc[test_slack.index, 'target'] = test_slack['target']
test['target'] = pd.to_numeric(test['target'], errors='coerce')
test['target'] = test['target'].round(1)

test

Unnamed: 0,id,source,sentence_1,sentence_2,target
0,boostcamp-sts-v1-test-000,petition-sampled,가상화폐 거래소 폐쇄하지 말고,가상화폐 거래소 폐쇄 반대합니다,3.4
1,boostcamp-sts-v1-test-001,petition-sampled,뇌물적 폐 1호 640만 달라 70억 뇌물 받은 권양숙 구속하고 재산을 몰수하라,뇌물적 폐 원조 640만 달라 70억 뇌물 받은 권양숙 구속하고 재산을 몰수하세요,4.5
2,boostcamp-sts-v1-test-002,petition-rtt,기무사 영관급의 하극상 정말 이대로 방관하는 게 민주주의인지요,그냥 가만히 있는 게 진짜 민주주의인가요,1.4
3,boostcamp-sts-v1-test-003,nsmc-sampled,화까지가 한계였다,기대가 너무 컸다,0.1
4,boostcamp-sts-v1-test-004,slack-rtt,왜 혼자 있지,왜 혼자야,3.5
...,...,...,...,...,...
1095,boostcamp-sts-v1-test-1095,slack-sampled,슬랙 채널 개편 현재 75개의 채널들이 있는데 사용되지 않는 채널들은 정리하고 넘버...,오늘 슬랙의 채널 및 사용 권한에 대한 변경이 있을 예정입니다,1.2
1096,boostcamp-sts-v1-test-1096,petition-rtt,청소년보호법 폐지 청원합니다,청소년보호법 폐지 청원서,4.3
1097,boostcamp-sts-v1-test-1097,petition-rtt,존경하는 문 대통령님 상서 경유 자동차 단속 대상은 매연이 많이 나오는 화물자동차에...,친애하는 문 대통령님 디젤 차량 단속이 매연을 내는 트럭으로 제한되기를 간절히 기도합니다,4.1
1098,boostcamp-sts-v1-test-1098,nsmc-rtt,요즘 재미가 훅 떨어짐,요즘 재미가 사라졌다,4.0


In [19]:
submission = pd.read_csv('data/sample_submission.csv')
submission['target'] = test['target']
submission.to_csv('output_model_by_corpus_3.csv', index=False)