In [1]:
import argparse
import random

import pandas as pd

from tqdm.auto import tqdm

import transformers
import torch
import torchmetrics
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning.callbacks import BackboneFinetuning

# seed 고정
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
random.seed(0)

In [2]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets=[]):
        self.inputs = inputs
        self.targets = targets

    # 학습 및 추론 과정에서 데이터를 1개씩 꺼내오는 곳
    def __getitem__(self, idx):
        # 정답이 있다면 else문을, 없다면 if문을 수행합니다
        if len(self.targets) == 0:
            return torch.tensor(self.inputs[idx])
        else:
            return torch.tensor(self.inputs[idx]), torch.tensor(self.targets[idx])

    # 입력하는 개수만큼 데이터를 사용합니다
    def __len__(self):
        return len(self.inputs)


class Dataloader(pl.LightningDataModule):
    def __init__(self, model_name, batch_size, shuffle, train_path, dev_path, test_path, predict_path):
        super().__init__()
        self.model_name = model_name
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.train_path = train_path
        self.dev_path = dev_path
        self.test_path = test_path
        self.predict_path = predict_path

        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
        self.predict_dataset = None

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, max_length=130)
        self.target_columns = ['label']
        self.delete_columns = ['id']
        self.text_columns = ['sentence_1', 'sentence_2']

    def tokenizing(self, dataframe):
        data = []
        for idx, item in tqdm(dataframe.iterrows(), desc='tokenizing', total=len(dataframe)):
            # 두 입력 문장을 [SEP] 토큰으로 이어붙여서 전처리합니다.
            text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
            outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True)
            data.append(outputs['input_ids'])
        return data

    def preprocessing(self, data):
        # 안쓰는 컬럼을 삭제합니다.
        data = data.drop(columns=self.delete_columns)

        # 타겟 데이터가 없으면 빈 배열을 리턴합니다.
        try:
            targets = data[self.target_columns].values.tolist()
        except:
            targets = []

        # 텍스트 데이터를 전처리합니다.
        inputs = self.tokenizing(data)

        return inputs, targets

    def setup(self, stage='fit'):
        if stage == 'fit':
            # 학습 데이터와 검증 데이터셋을 호출합니다
            train_data = pd.read_csv(self.train_path)
            val_data = pd.read_csv(self.dev_path)
            
            train_data['label'] = train_data['label']
            augmented_data = train_data.copy()
            non_zero_labels = augmented_data[augmented_data['label'] != 0]

            non_zero_labels[['sentence_1', 'sentence_2']] = non_zero_labels[['sentence_2', 'sentence_1']]
            train_data = pd.concat([train_data, non_zero_labels], ignore_index=True)
            # 학습데이터 준비
            train_inputs, train_targets = self.preprocessing(train_data)

            # 검증데이터 준비
            val_inputs, val_targets = self.preprocessing(val_data)

            # train 데이터만 shuffle을 적용해줍니다, 필요하다면 val, test 데이터에도 shuffle을 적용할 수 있습니다
            self.train_dataset = Dataset(train_inputs, train_targets)
            self.val_dataset = Dataset(val_inputs, val_targets)
        else:
            # 평가데이터 준비
            test_data = pd.read_csv(self.test_path)
            test_inputs, test_targets = self.preprocessing(test_data)
            self.test_dataset = Dataset(test_inputs, test_targets)

            predict_data = pd.read_csv(self.predict_path)
            predict_inputs, predict_targets = self.preprocessing(predict_data)
            self.predict_dataset = Dataset(predict_inputs, [])

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=args.shuffle)#, num_workers=5, persistent_workers=True)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size)

    def predict_dataloader(self):
        return torch.utils.data.DataLoader(self.predict_dataset, batch_size=self.batch_size)


class Model(pl.LightningModule):
    def __init__(self, model_name, lr):
        super().__init__()
        self.save_hyperparameters()

        self.model_name = model_name
        self.lr = lr

        # 사용할 모델을 호출합니다.
        self.plm = transformers.AutoModelForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_name, num_labels=1)
        # Loss 계산을 위해 사용될 L1Loss를 호출합니다.
        self.loss_func = torch.nn.MSELoss()

    def forward(self, x):
        x = self.plm(x)['logits']

        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_func(logits, y.float())
        self.log("train_loss", loss)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.loss_func(logits, y.float())
        val_pearson = torchmetrics.functional.pearson_corrcoef(logits.squeeze(), y.squeeze())
        self.log("val_loss", loss)

        self.log("val_pearson", val_pearson)
        #print(loss, val_pearson)
        return loss

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)

        self.log("test_pearson", torchmetrics.functional.pearson_corrcoef(logits.squeeze(), y.squeeze()))

    def predict_step(self, batch, batch_idx):
        x = batch
        logits = self(x)

        return logits.squeeze()

    def configure_optimizers(self):
        optimizer = torch.optim.AdamW(self.parameters(), lr=self.lr)
        return optimizer


class EpochPrintCallback(Callback):
    def on_train_epoch_end(self, trainer, pl_module):
        print(f"Epoch {trainer.current_epoch} ended")
        metrics = trainer.callback_metrics
        # 검증 손실 출력 (만약 검증을 수행했다면)
        if trainer.callback_metrics.get("val_loss"):
            print(f"Validation Loss: {trainer.callback_metrics['val_loss']:.4f}")
        # 학습 손실 출력
        if trainer.callback_metrics.get("train_loss"):
            print(f"Training Loss: {trainer.callback_metrics['train_loss']:.4f}")
        if "val_pearson" in metrics:
            print(f"Validation Pearson Correlation: {metrics['val_pearson']:.4f}")
        print("-" * 40)

In [3]:
checkpoint_callback = ModelCheckpoint(
    monitor='val_pearson',  # 모니터링할 메트릭
    mode='max',                   # 클수록 ㅁ좋은 것
    save_top_k=1,               
    #save_last=True,               # 마지막 모델도 저장
    filename='best-{model_name}-{epoch:02d}-}',
    verbose=True,
    dirpath='./checkpoints',      # 체크포인트를 저장할 디렉토리
)
early_stopping = EarlyStopping('val_loss',patience=10, verbose=True)

print('Train Starting...')
max_epoch = 50
batch_size = 16
shuffle = True


learning_rate = 1e-5  # 스캐쥴려로 자동
model_name = "monologg/koelectra-base-v3-discriminator"
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', default=model_name, type=str)
parser.add_argument('--learning_rate', default=learning_rate, type=float)


# 고정값 (상대위치x -> 현재폴더 기준으로(pathlib), )
parser.add_argument('--batch_size', default=batch_size, type=int)
parser.add_argument('--max_epoch', default=max_epoch, type=int)
parser.add_argument('--shuffle', default=shuffle)
parser.add_argument('--train_path', default='./data/train_after_hanspell (1).csv')
parser.add_argument('--dev_path', default='./data/dev_after_hanspell.csv')
parser.add_argument('--test_path', default='./data/dev_after_hanspell.csv')
parser.add_argument('--predict_path', default='./data/test_after_hanspell.csv')
args = parser.parse_args(args=[])

# dataloader와 model을 생성합니다.
dataloader = Dataloader(args.model_name, args.batch_size, args.shuffle, args.train_path, args.dev_path,
                        args.test_path, args.predict_path)
model = Model(args.model_name, args.learning_rate)



lr_monitor = LearningRateMonitor(logging_interval='step')
epoch_print_callback = EpochPrintCallback()
# gpu가 없으면 accelerator="cpu"로 변경해주세요, gpu가 여러개면 'devices=4'처럼 사용하실 gpu의 개수를 입력해주세요
trainer = pl.Trainer(accelerator="gpu", devices=1, max_epochs=args.max_epoch, log_every_n_steps=1, callbacks=[lr_monitor, epoch_print_callback, checkpoint_callback, early_stopping])

# Train part
trainer.fit(model=model, datamodule=dataloader)



tokenizing:   0%|          | 0/550 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                             | Params
---------------------------------------------------------------
0 | plm       | ElectraForSequenceClassification | 112 M 
1 | loss_func | MSELoss                          | 0     
---------------------------------------------------------------
112 M     Trainable params
0         Non-trainable params
112 M     Total params
451.688   Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\승범 pc\Desktop\공부\딥러닝\naver_boot\플젝\첫번째\venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
c:\Users\승범 pc\Desktop\공부\딥러닝\naver_boot\플젝\첫번째\venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.448
Epoch 0, global step 1033: 'val_pearson' reached 0.89515 (best 0.89515), saving model to './checkpoints\\best-model-epoch=00-val_pearson_epoch=0.0000.ckpt' as top 1


Epoch 0 ended
Validation Loss: 0.4477
Training Loss: 0.3797
Validation Pearson Correlation: 0.8951
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 1, global step 2066: 'val_pearson' reached 0.89780 (best 0.89780), saving model to './checkpoints\\best-model-epoch=01-val_pearson_epoch=0.0000.ckpt' as top 1


Epoch 1 ended
Validation Loss: 0.4684
Training Loss: 0.2113
Validation Pearson Correlation: 0.8978
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.022 >= min_delta = 0.0. New best score: 0.425
Epoch 2, global step 3099: 'val_pearson' reached 0.90395 (best 0.90395), saving model to './checkpoints\\best-model-epoch=02-val_pearson_epoch=0.0000.ckpt' as top 1


Epoch 2 ended
Validation Loss: 0.4254
Training Loss: 0.1531
Validation Pearson Correlation: 0.9039
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 3, global step 4132: 'val_pearson' was not in top 1


Epoch 3 ended
Validation Loss: 0.4425
Training Loss: 0.1740
Validation Pearson Correlation: 0.9022
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.016 >= min_delta = 0.0. New best score: 0.409
Epoch 4, global step 5165: 'val_pearson' reached 0.91082 (best 0.91082), saving model to './checkpoints\\best-model-epoch=04-val_pearson_epoch=0.0000.ckpt' as top 1


Epoch 4 ended
Validation Loss: 0.4095
Training Loss: 0.1786
Validation Pearson Correlation: 0.9108
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.056 >= min_delta = 0.0. New best score: 0.354
Epoch 5, global step 6198: 'val_pearson' reached 0.91491 (best 0.91491), saving model to './checkpoints\\best-model-epoch=05-val_pearson_epoch=0.0000.ckpt' as top 1


Epoch 5 ended
Validation Loss: 0.3539
Training Loss: 0.0837
Validation Pearson Correlation: 0.9149
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 6, global step 7231: 'val_pearson' reached 0.91512 (best 0.91512), saving model to './checkpoints\\best-model-epoch=06-val_pearson_epoch=0.0000.ckpt' as top 1


Epoch 6 ended
Validation Loss: 0.3782
Training Loss: 0.0882
Validation Pearson Correlation: 0.9151
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 7, global step 8264: 'val_pearson' was not in top 1


Epoch 7 ended
Validation Loss: 0.4115
Training Loss: 0.0349
Validation Pearson Correlation: 0.9128
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.009 >= min_delta = 0.0. New best score: 0.345
Epoch 8, global step 9297: 'val_pearson' reached 0.91957 (best 0.91957), saving model to './checkpoints\\best-model-epoch=08-val_pearson_epoch=0.0000.ckpt' as top 1


Epoch 8 ended
Validation Loss: 0.3447
Training Loss: 0.1779
Validation Pearson Correlation: 0.9196
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 9, global step 10330: 'val_pearson' was not in top 1


Epoch 9 ended
Validation Loss: 0.3822
Training Loss: 0.0810
Validation Pearson Correlation: 0.9164
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 10, global step 11363: 'val_pearson' reached 0.92227 (best 0.92227), saving model to './checkpoints\\best-model-epoch=10-val_pearson_epoch=0.0000.ckpt' as top 1


Epoch 10 ended
Validation Loss: 0.3521
Training Loss: 0.0275
Validation Pearson Correlation: 0.9223
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 11, global step 12396: 'val_pearson' was not in top 1


Epoch 11 ended
Validation Loss: 0.3475
Training Loss: 0.0272
Validation Pearson Correlation: 0.9195
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 12, global step 13429: 'val_pearson' was not in top 1


Epoch 12 ended
Validation Loss: 0.3605
Training Loss: 0.0629
Validation Pearson Correlation: 0.9178
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 13, global step 14462: 'val_pearson' was not in top 1


Epoch 13 ended
Validation Loss: 0.3665
Training Loss: 0.0329
Validation Pearson Correlation: 0.9171
----------------------------------------


Validation: |          | 0/? [00:00<?, ?it/s]

Epoch 14, global step 15495: 'val_pearson' was not in top 1


Epoch 14 ended
Validation Loss: 0.3573
Training Loss: 0.0521
Validation Pearson Correlation: 0.9185
----------------------------------------


c:\Users\승범 pc\Desktop\공부\딥러닝\naver_boot\플젝\첫번째\venv\Lib\site-packages\pytorch_lightning\trainer\call.py:54: Detected KeyboardInterrupt, attempting graceful shutdown...


In [6]:
best_model_path = checkpoint_callback.best_model_path
model = Model.load_from_checkpoint(best_model_path)
trainer.test(model=model, datamodule=dataloader)
# 학습이 완료된 모델을 저장합니다.
torch.save(model, f'model_{max_epoch}.pt')

c:\Users\승범 pc\Desktop\공부\딥러닝\naver_boot\플젝\첫번째\venv\Lib\site-packages\lightning_fabric\utilities\cloud_io.py:55: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
Some weights of ElectraForSequ

tokenizing:   0%|          | 0/550 [00:00<?, ?it/s]

tokenizing:   0%|          | 0/1100 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\승범 pc\Desktop\공부\딥러닝\naver_boot\플젝\첫번째\venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_pearson          0.9222691655158997
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [5]:
best_model_path = checkpoint_callback.best_model_path
model = Model.load_from_checkpoint(best_model_path)
predictions = trainer.predict(model=model, datamodule=dataloader)
# 예측된 결과를 형식에 맞게 반올림하여 준비합니다.
predictions = list(round(float(i), 1) for i in torch.cat(predictions))
# output 형식을 불러와서 예측된 결과로 바꿔주고, output.csv로 출력합니다.
output = pd.read_csv('./data/sample_submission.csv')
output['target'] = predictions

output.to_csv(f'output_base_{max_epoch}_electra_mse.csv', index=False)

c:\Users\승범 pc\Desktop\공부\딥러닝\naver_boot\플젝\첫번째\venv\Lib\site-packages\lightning_fabric\utilities\cloud_io.py:55: You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
Some weights of ElectraForSequ

tokenizing:   0%|          | 0/550 [00:00<?, ?it/s]

tokenizing:   0%|          | 0/1100 [00:00<?, ?it/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\승범 pc\Desktop\공부\딥러닝\naver_boot\플젝\첫번째\venv\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:441: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Predicting: |          | 0/? [00:00<?, ?it/s]

In [None]:
from pathlib import Path

def load_config(config_path):
    with open(config_path, 'r') as f:
        return yaml.safe_load(f)


# envs

# dev(개발)
# stg(QA)
# prd(운영) -> prd_config.yml

# argparser -> dev? stg? prd?

# 1가지만의 컨피그 파일을 관리 한다.
#  BASE = Path(__file__).resolve().parent # baseline_sb.ipynb /root/jhlee/level1-project/
# 
# git clone level1-project
# level1-project 
# - baseline_sb
# - conf
#   - config.yml
# COFNIG_FILE_PATH = Path(BASE, "conf", "config.yaml")  # posix(window, mac, linux) 위치 지정 가능 -> C:\ -> usr/ 

# datas
    # data.csv  # gitingore
    # train.csv # gitignore

# src
    # conf
        # config.yaml

    # settings  # 프로젝트에서 사용 할 수 있는 공통 상수? 공통적으로 써야 하는 것들
        # config.py

    # - tokenizer
        # - tokenizer.py

    # - preprocess
    #   - preprocessing.py

    # - callbacks.py
        # callback.py

    # - data
    #   - data_loader.py 

# baseline_sb.ipynb  # -> 사람들한테 공유 하는 목적

# main.py  # -> train, test  # 소스코드 동작 목적



def main():

    # Load config
    config = load_config(COFNIG_FILE_PATH)
    data_loder_config = config.get("data_loader")  # 상수 설정 값들
    data_loder_config["tonizer"] = Tokenizer(model_name)  # 가변 설정 값들

    # Create dataloader and model
    dataloader = Dataloader(**config.get("data_loader"))
    model = Model(config['model_name'], config['learning_rate'])