In [1]:
import argparse
import random
import pandas as pd
import os
import re
import torch
import transformers
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import torch.nn as nn
import wandb
import torchmetrics
from scipy.stats import pearsonr
import evaluate
import gc

from sklearn.model_selection import KFold
import numpy as np

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
random.seed(0)



  from .autonotebook import tqdm as notebook_tqdm
2024-09-24 01:57:28.078681: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-24 01:57:28.084570: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-24 01:57:28.098561: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-24 01:57:28.121823: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-24 01:57:28.129071: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:

In [2]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, tokenizer, max_length, mode = 'train'):
        self.inputs = inputs
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_columns = ['sentence_1', 'sentence_2']
        self.mode = mode
    def cleaning_text(self, text):
        cleaned_text = re.sub(r'[ㅋ-ㅎ]+', '', text)
        cleaned_text = re.sub(r"[^가-힣a-zA-Z0-9\s]", "", cleaned_text)
        return cleaned_text

    def __len__(self):
        return len(self.inputs) 
    
    def __getitem__(self, idx):
        t = self.inputs.iloc[idx] 

        text = '[SEP]'.join([t[col] for col in self.text_columns])
        text = self.cleaning_text(text)
        output = self.tokenizer(text,
                                padding='max_length',
                                max_length=self.max_length,
                                truncation=True)

        datas = torch.tensor(output['input_ids'], dtype = torch.long)
        attn = torch.tensor(output['attention_mask'], dtype = torch.long)
        type_ids = torch.tensor(output['token_type_ids'], dtype = torch.long)
        if self.mode == 'train':
            labels = t['label']
            output = {'input_ids' : datas,
                      'attention_mask' : attn,
                      'token_type_ids' : type_ids,
                      'labels' : labels}
            return output
        else:
            output = {'input_ids' : datas,
                      'attention_mask' : attn,
                      'token_type_ids' : type_ids}
            return output
    

    


In [3]:
def compute_metrics(model_preds):
    preds, labels = model_preds
    preds = torch.tensor(preds, dtype = torch.float32).squeeze(-1)
    labels = torch.tensor(labels, dtype = torch.float32).squeeze(-1)
    pear = torchmetrics.PearsonCorrCoef()
    pearson = pear(preds, labels)
    return {'pearson' : pearson.item()}

In [4]:

class MyModel(nn.Module):
    def __init__(self, model_name):
        super(MyModel, self).__init__()
        self.model = transformers.AutoModel.from_pretrained(
            model_name,
            trust_remote_code=True
        )
        
        # 첫 번째 Conv1D 레이어
        self.Conv1 = nn.Conv1d(
            in_channels=768,  # BERT의 출력 차원
            out_channels=256,
            kernel_size=3,
            padding=1
        )
        
        # 두 번째 Conv1D 레이어 (필요 시 추가)
        self.Conv2 = nn.Conv1d(
            in_channels=256,  # Conv1의 출력 차원
            out_channels=128,  # Conv2의 출력 차원
            kernel_size=3,
            padding=1
        )
        
        self.output_layer = nn.Linear(128, 1)  
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.avg_pool = nn.AdaptiveAvgPool1d(1)  
        self.batchnorm1 = nn.BatchNorm1d(256)  
        self.batchnorm2 = nn.BatchNorm1d(128)  
        self.maxpool = nn.MaxPool1d(kernel_size = 2)  
        # self.sigmoid = nn.Sigmoid()
        self.loss_fn = nn.MSELoss()

    def forward(self, input_ids, attention_mask, token_type_ids, labels = None):
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        
        output = output.last_hidden_state.permute(0, 2, 1)  

        cnn_output = self.Conv1(output)  # Shape: (B, 256, L)
        cnn_output = self.relu(cnn_output)
        cnn_output = self.batchnorm1(cnn_output)  
        cnn_output = self.maxpool(cnn_output) # (B, 128, L/2)

        cnn_output = self.Conv2(cnn_output)  # Shape: (B, 128, L/2)
        cnn_output = self.relu(cnn_output)
        cnn_output = self.batchnorm2(cnn_output) 
        cnn_output = self.avg_pool(cnn_output)  #(B, 128, 1)

        cnn_output = cnn_output.view(cnn_output.size(0), -1)  # Shape: (B, 128)
        output = self.output_layer(cnn_output).squeeze(-1)
        
        if labels is not None:
            loss = self.loss_fn(output, labels.float())
            return {'output' : output, 'loss' : loss}

        else:  
            return {'output' : output}



In [5]:

def maketrain(args,training_args):

    model_list = args.model_list
    max_length = args.max_length
    k = args.kf
    kf = KFold(n_splits = k, shuffle = True, random_state = 0)
    data_routes = args.data_routes
    preds = {}
    test = pd.read_csv('/data/ephemeral/home/data/test.csv')
    df = pd.DataFrame()
    for route in data_routes:
        df = pd.concat([df, pd.read_csv(route)])
    df.reset_index(drop = True)
    df = df[['sentence_1', 'sentence_2' ,'label']].dropna()



    for model_name in model_list:
        name = model_name.split('/')[-1]
        model = MyModel(model_name)
        training_args.output_dir = f"./results/{name}"
        training_args.run_name = f'{name}'
        wandb_run = wandb.init(project = "yongruka", name = f"{name}", reinit = True)
        tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code = True)
        test_dataset = Dataset(test, tokenizer, max_length, mode = 'test')
        data_collator = DataCollatorWithPadding(
                tokenizer = tokenizer,
                padding = True,
                return_tensors = 'pt'
            )
        

        for fold, (train_index, val_index) in enumerate(kf.split(df)):

            print(f'-Fold : {fold+1}-  /  Now_model : [{name}]')

            train_fold = df.iloc[train_index]
            val_fold = df.iloc[val_index]
            train_fold = Dataset(train_fold, tokenizer, max_length)
            val_fold = Dataset(val_fold, tokenizer, max_length)

            trainer = Trainer( 
                model = model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = train_fold,
                eval_dataset = val_fold,
                compute_metrics = compute_metrics,
                data_collator = data_collator,
                
            )

        
            trainer.train()
        trainer.save_model(f'results/best_model_{name}')

        pred = trainer.predict(test_dataset)
        preds[name] = pred
        gc.collect()
        
    return preds


In [6]:
!rm -rf /root/.cache/wandb
!rm -rf /root/.config/wandb
!rm -rf /root/.netrc
os.environ["WANDB_API_KEY"] = "ea26fff0d932bc74bbfad9fd507b292c67444c02"
wandb.init(project="yonruka")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mkimybjg2[0m ([33mkimybjg2-boostcampaitech[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
model_list = [#'klue/roberta-small',
            'klue/roberta-base',
            #'snunlp/KR-SBERT-Medium-extended-klueNLItriplet_PARpair_QApair-klueSTS',
            'snunlp/KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS',
            'klue/bert-base',
            #'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',
            #  'Alibaba-NLP/gte-multilingual-base',
            ]
data_routes = ['/data/ephemeral/home/data/aug_data40000.csv',]
            #    '/data/ephemeral/home/data/dev.csv']
            #    '/data/ephemeral/home/data/aug50000.csv']

parser = argparse.ArgumentParser()
parser.add_argument('--model_list', default = model_list, type = list)
parser.add_argument('--batch_size', default = 64, type = int)
parser.add_argument('--max_epoch', default = 5, type = int)
parser.add_argument('--max_length', default = 160, type = int)
parser.add_argument('--kf', default = 5, type = int)
parser.add_argument('--data_routes', default = data_routes, type = list)
args = parser.parse_args(args=[])

training_args = TrainingArguments(
    output_dir = f"./results/default",
    eval_strategy = "epoch",
    save_strategy = 'epoch',
    per_device_train_batch_size = args.batch_size,
    per_device_eval_batch_size = args.batch_size,
    num_train_epochs = args.max_epoch,
    weight_decay = 0.01,
    logging_dir = './logs',
    logging_steps = 30,
    report_to = "wandb",  
    run_name = "default",
    load_best_model_at_end = True,
    metric_for_best_model = 'pearson'

)



In [8]:
preds = maketrain(args, training_args)

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 1-  /  Now_model : [roberta-base]


Epoch,Training Loss,Validation Loss,Pearson
1,5.0797,4.550874,0.954294
2,3.2589,3.198948,0.975929
3,2.1646,1.976188,0.982551
4,1.6527,1.577736,0.986036
5,1.4812,1.297059,0.987742


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 2-  /  Now_model : [roberta-base]


Epoch,Training Loss,Validation Loss,Pearson
1,0.4804,0.432266,0.986991
2,0.1349,0.08822,0.989625
3,0.0931,0.038649,0.993942
4,0.0409,0.028191,0.996004
5,0.0381,0.039477,0.996616


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 3-  /  Now_model : [roberta-base]


Epoch,Training Loss,Validation Loss,Pearson
1,0.0639,0.034834,0.993401
2,0.073,0.031949,0.996327
3,0.0481,0.026679,0.997212
4,0.0421,0.024971,0.998152
5,0.0357,0.029893,0.99842


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 4-  /  Now_model : [roberta-base]


Epoch,Training Loss,Validation Loss,Pearson
1,0.0565,0.022902,0.996584
2,0.0546,0.033796,0.996909
3,0.0467,0.018009,0.998206
4,0.0458,0.023024,0.998565
5,0.0468,0.013652,0.998961


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 5-  /  Now_model : [roberta-base]


Epoch,Training Loss,Validation Loss,Pearson
1,0.043,0.037926,0.996942
2,0.0617,0.018105,0.998457
3,0.0427,0.010242,0.998799
4,0.0384,0.022104,0.998999
5,0.0354,0.012575,0.999225


0,1
eval/loss,█▆▄▃▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/pearson,▁▄▅▆▆▆▇▇▇█▇██████████████
eval/runtime,▃▃▂▄▁▄▃▄▃▄▆▅▄▄▃▄▅▅█▄▇▇▃▃▃
eval/samples_per_second,▆▆▇▅█▅▆▅▆▅▃▄▅▅▅▅▄▄▁▅▂▂▆▆▆
eval/steps_per_second,▆▆▇▅█▅▆▅▆▅▃▄▅▅▆▅▄▄▁▅▂▂▆▆▆
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▂▃▄▅▆▆▇▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█
train/global_step,▁▂▃▄▅▆▆▇▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█▁▂▃▄▅▆▇█

0,1
eval/loss,0.01257
eval/pearson,0.99922
eval/runtime,22.5335
eval/samples_per_second,354.938
eval/steps_per_second,5.547
test/runtime,3.0399
test/samples_per_second,361.859
test/steps_per_second,5.921
total_flos,0.0
train/epoch,5.0


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 1-  /  Now_model : [KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS]


Epoch,Training Loss,Validation Loss,Pearson
1,5.2451,4.093363,0.938557
2,3.4601,2.877481,0.965618
3,2.3538,1.909438,0.977623
4,1.8224,1.652497,0.98133
5,1.6439,1.371942,0.983069


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 2-  /  Now_model : [KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS]


Epoch,Training Loss,Validation Loss,Pearson
1,0.5761,0.45556,0.985783
2,0.1647,0.083068,0.989065
3,0.1012,0.030748,0.993247
4,0.0425,0.023502,0.995184
5,0.0377,0.027379,0.995906


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 3-  /  Now_model : [KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS]


Epoch,Training Loss,Validation Loss,Pearson
1,0.0653,0.065908,0.99363
2,0.0728,0.0175,0.995961
3,0.0513,0.018747,0.996164
4,0.0422,0.011201,0.998322
5,0.035,0.016406,0.998571


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 4-  /  Now_model : [KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS]


Epoch,Training Loss,Validation Loss,Pearson
1,0.0576,0.022001,0.996389
2,0.053,0.01899,0.997765


In [14]:
def make_csv(preds):
    test_id = pd.read_csv('/data/ephemeral/home/data/test.csv')['id']
    for name in preds:
        d = pd.DataFrame({'id' : test_id, 'target' : preds[name].round(2)})
        d.to_csv(f'{name}'.csv)
        print(f'{name}'.csv)
    print('done.')

['/data/ephemeral/home/data/aug50000.csv']

In [9]:
preds

NameError: name 'preds' is not defined

In [None]:
make_csv(preds)