In [1]:
import argparse
import random
import pandas as pd
import os
import re
import torch
import transformers
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import torch.nn as nn
import wandb
import torchmetrics
from scipy.stats import pearsonr
import evaluate
import gc
from transformers import ElectraModel, ElectraTokenizer
from pykospacing import Spacing
from sklearn.model_selection import KFold
import numpy as np

# Seed every random number generator this notebook can touch so that a
# "Restart & Run All" gives the same shuffles and weight initialisation.
# The NumPy global RNG is included because pandas/scikit-learn fall back to it
# whenever no explicit random_state is passed.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)



  from .autonotebook import tqdm as notebook_tqdm
2024-09-26 00:58:50.748740: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-26 00:58:50.755107: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-26 00:58:50.768933: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-26 00:58:50.792167: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-26 00:58:50.799098: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:

In [2]:
class Dataset(torch.utils.data.Dataset):
    """Sentence-pair dataset that tokenizes one DataFrame row per item.

    Args:
        inputs: pandas DataFrame with the columns ``sentence_1`` and
            ``sentence_2`` (plus ``label`` when ``mode == 'train'``).
        tokenizer: callable tokenizer returning ``input_ids`` and
            ``attention_mask`` for a single string.
        max_length: every example is padded/truncated to this many tokens.
        mode: ``'train'`` adds the row's ``label`` to each item under the key
            ``'labels'``; any other value yields inputs only.
    """

    def __init__(self, inputs, tokenizer, max_length, mode = 'train'):
        self.inputs = inputs
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_columns = ['sentence_1', 'sentence_2']
        self.mode = mode
        # Join the two sentences with the tokenizer's own separator. A literal
        # '[SEP]' is only a special token for BERT-style vocabularies; other
        # tokenizers would split it into ordinary word pieces. '[SEP]' stays
        # as the fallback when the tokenizer does not declare a separator.
        self.sep_token = getattr(tokenizer, 'sep_token', None) or '[SEP]'

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the tokenized pair (and label in train mode) for row ``idx``."""
        t = self.inputs.iloc[idx]

        text = self.sep_token.join([t[col] for col in self.text_columns])
        output = self.tokenizer(text,
                                padding='max_length',
                                max_length=self.max_length,
                                truncation=True)

        datas = torch.tensor(output['input_ids'], dtype = torch.long)
        attn = torch.tensor(output['attention_mask'], dtype = torch.long)

        item = {'input_ids' : datas,
                'attention_mask' : attn}
        if self.mode == 'train':
            # The label is passed through untouched; the model casts it to float.
            item['labels'] = t['label']
        return item
    

    


In [3]:
def compute_metrics(model_preds):
    """Return the Pearson correlation between predictions and labels.

    Args:
        model_preds: a ``(predictions, labels)`` pair of array-likes.

    Returns:
        dict with the single key ``'pearson'`` holding a Python float.
    """
    predictions, references = model_preds

    def _as_float_vector(values):
        # Drop a trailing singleton dimension so (N, 1) and (N,) line up.
        return torch.tensor(values, dtype = torch.float32).squeeze(-1)

    predictions = _as_float_vector(predictions)
    references = _as_float_vector(references)
    score = torchmetrics.PearsonCorrCoef()(predictions, references)
    return {'pearson' : score.item()}

_SPACING_MODEL = None  # shared PyKoSpacing instance, built on first use


def _get_spacing_model():
    """Return the shared ``Spacing`` instance, creating it the first time."""
    global _SPACING_MODEL
    if _SPACING_MODEL is None:
        _SPACING_MODEL = Spacing()
    return _SPACING_MODEL


def cleaning_text(text):
    """Normalise one sentence before tokenization.

    Steps, in order: lowercase; drop every character that is not a Hangul
    syllable, ``a-z``, a digit or whitespace; collapse whitespace runs to one
    space; collapse any character repeated three or more times to a single
    occurrence; re-space the text with PyKoSpacing; strip the ends.

    Args:
        text: the raw sentence (``str``).

    Returns:
        The cleaned sentence.
    """
    text = text.lower()
    cleaned_text = re.sub(r'[^가-힣a-z0-9\s]', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)
    # NOTE(review): this also collapses digit runs ("1000" -> "10"); confirm
    # that is intended.
    cleaned_text = re.sub(r'(.)\1{2,}', r'\1', cleaned_text)
    # Reuse one Spacing object: constructing it per call would repeat its
    # set-up cost for every row when this is applied over a DataFrame column.
    cleaned_text = _get_spacing_model()(cleaned_text)
    return cleaned_text.strip()


In [4]:

class MyModel(nn.Module):
    """Pretrained transformer encoder with a two-layer 1D-CNN regression head.

    The encoder's ``last_hidden_state`` is passed through Conv1d -> ReLU ->
    BatchNorm -> Dropout twice (max-pooling after the first stage, global
    average pooling after the second) and a final linear layer that emits one
    score per example.

    Args:
        model_name: Hugging Face model id of the encoder to load.
    """

    def __init__(self, model_name):
        super(MyModel, self).__init__()
        if model_name == "snunlp/KR-ELECTRA-discriminator":
            print('found KR-ELECTRA')
            self.model = ElectraModel.from_pretrained("snunlp/KR-ELECTRA-discriminator")
        else:
            # NOTE(review): trust_remote_code=True executes Python shipped in
            # the model repository; only use it with repositories you trust.
            self.model = transformers.AutoModel.from_pretrained(
                model_name,
                trust_remote_code=True
            )

        # Size the first convolution from the encoder itself instead of
        # assuming 768, so encoders with another hidden width also work.
        # 768 remains the fallback when the encoder exposes no config.
        encoder_config = getattr(self.model, 'config', None)
        hidden_size = getattr(encoder_config, 'hidden_size', 768)

        # First Conv1d layer: encoder hidden size -> 256 channels.
        self.Conv1 = nn.Conv1d(
            in_channels=hidden_size,
            out_channels=256,
            kernel_size=3,
            padding=1
        )

        # Second Conv1d layer: 256 -> 128 channels.
        self.Conv2 = nn.Conv1d(
            in_channels=256,
            out_channels=128,
            kernel_size=3,
            padding=1
        )

        self.output_layer = nn.Linear(128, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.avg_pool = nn.AdaptiveAvgPool1d(1)
        self.batchnorm1 = nn.BatchNorm1d(256)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.maxpool = nn.MaxPool1d(kernel_size = 2)
        self.loss_fn = nn.MSELoss()

    def forward(self, input_ids, attention_mask,
                  labels = None):
        """Score a batch of tokenized sentence pairs.

        Args:
            input_ids: token ids, forwarded to the encoder.
            attention_mask: attention mask, forwarded to the encoder.
            labels: optional targets; when given, the MSE loss is returned too.

        Returns:
            ``{'output': scores}`` or, with labels,
            ``{'output': scores, 'loss': mse}``.
        """
        output = self.model(input_ids=input_ids, attention_mask=attention_mask)

        # Conv1d expects channels first: (B, L, H) -> (B, H, L).
        output = output.last_hidden_state.permute(0, 2, 1)

        cnn_output = self.Conv1(output)  # (B, 256, L)
        cnn_output = self.relu(cnn_output)
        cnn_output = self.batchnorm1(cnn_output)
        cnn_output = self.dropout(cnn_output)
        cnn_output = self.maxpool(cnn_output)  # (B, 256, L/2)

        cnn_output = self.Conv2(cnn_output)  # (B, 128, L/2)
        cnn_output = self.relu(cnn_output)
        cnn_output = self.batchnorm2(cnn_output)
        cnn_output = self.dropout(cnn_output)
        # NOTE(review): the attention mask is not applied here, so padded
        # positions take part in the average.
        cnn_output = self.avg_pool(cnn_output)  # (B, 128, 1)

        cnn_output = cnn_output.view(cnn_output.size(0), -1)  # (B, 128)
        output = self.output_layer(cnn_output).squeeze(-1)

        if labels is not None:
            loss = self.loss_fn(output, labels.float())
            return {'output' : output, 'loss' : loss}

        else:
            return {'output' : output}



In [5]:

def maketrain(args,training_args):
    """Fine-tune every backbone in ``args.model_list`` and predict the test set.

    For each backbone the training data is split with K-fold, the model is
    trained on each fold in turn, the final state is saved under
    ``results/best_model_<name>`` and used to predict the test set.

    Args:
        args: namespace providing ``model_list``, ``max_length``, ``kf`` and
            ``data_routes``. An optional ``test_route`` attribute overrides
            the default test CSV path.
        training_args: ``TrainingArguments``; ``output_dir`` and ``run_name``
            are overwritten per backbone.

    Returns:
        dict mapping the short model name (text after the last ``/``) to the
        object returned by ``trainer.predict``.

    Raises:
        ValueError: if ``args.data_routes`` is empty.
    """
    model_list = args.model_list
    max_length = args.max_length
    k = args.kf
    kf = KFold(n_splits = k, shuffle = True, random_state = 0)
    data_routes = args.data_routes
    if not data_routes:
        raise ValueError('args.data_routes must contain at least one CSV path')
    preds = {}
    test_route = getattr(args, 'test_route', '/data/ephemeral/home/moruka/text_test.csv')
    test = pd.read_csv(test_route)

    # Read every file once and concatenate in a single call.
    df = pd.concat([pd.read_csv(route) for route in data_routes], ignore_index = True)
    df = df[['sentence_1', 'sentence_2' ,'label']].dropna().reset_index(drop = True)

    for model_name in model_list:
        name = model_name.split('/')[-1]
        model = MyModel(model_name)
        training_args.output_dir = f"./results/{name}"
        training_args.run_name = f'{name}'
        wandb_run = wandb.init(project = "yongruka", name = f"{name}", reinit = True)
        if model_name == "snunlp/KR-ELECTRA-discriminator":
            print('found KR-ELECTRA')
            tokenizer = ElectraTokenizer.from_pretrained("snunlp/KR-ELECTRA-discriminator")
        else:
            tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code = True)
        test_dataset = Dataset(test, tokenizer, max_length, mode = 'test')
        data_collator = DataCollatorWithPadding(
                tokenizer = tokenizer,
                padding = True,
                return_tensors = 'pt'
            )

        # NOTE(review): the same model instance keeps training across folds,
        # so from fold 2 onward the validation rows were already seen during
        # an earlier fold and the reported metrics are optimistic. Build a new
        # model per fold if unbiased cross-validation scores are needed.
        for fold, (train_index, val_index) in enumerate(kf.split(df)):

            print(f'-Fold : {fold+1}-  /  Now_model : [{name}]')

            train_fold = Dataset(df.iloc[train_index], tokenizer, max_length)
            val_fold = Dataset(df.iloc[val_index], tokenizer, max_length)

            trainer = Trainer(
                model = model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = train_fold,
                eval_dataset = val_fold,
                compute_metrics = compute_metrics,
                data_collator = data_collator,
            )
            # Force contiguous parameter storage before training/saving.
            for param in model.parameters():
                if not param.is_contiguous():
                    param.data = param.data.contiguous()

            trainer.train()
        trainer.save_model(f'results/best_model_{name}')

        pred = trainer.predict(test_dataset)
        preds[name] = pred

        # Close this backbone's W&B run and drop every reference to the model
        # and trainer (which holds the optimizer state) before the next
        # backbone is loaded; otherwise both generations sit on the GPU.
        wandb_run.finish()
        del trainer, model
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return preds


In [6]:
!rm -rf /root/.cache/wandb
!rm -rf /root/.config/wandb
!rm -rf /root/.netrc
os.environ["WANDB_API_KEY"] = "ea26fff0d932bc74bbfad9fd507b292c67444c02"
wandb.init(project="yonruka")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [14]:
# Backbones to fine-tune, in run order.
model_list = [
            # "klue/roberta-large",
            'monologg/koelectra-base-v3-discriminator',
            "snunlp/KR-ELECTRA-discriminator",
            'Alibaba-NLP/gte-multilingual-base',
            'klue/roberta-base',
            'snunlp/KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS',
            'klue/bert-base',

            ]
# Training CSVs; all listed files are concatenated by maketrain.
data_routes = ['/data/ephemeral/home/moruka/final_data.csv',]
            #    '/data/ephemeral/home/data/dev.csv']
            #    '/data/ephemeral/home/data/aug50000.csv']

parser = argparse.ArgumentParser()
# nargs='+' collects one or more strings into a list. `type=list` would have
# turned a single command-line value into a list of its characters.
parser.add_argument('--model_list', default = model_list, nargs = '+', type = str)
parser.add_argument('--batch_size', default = 32, type = int)
parser.add_argument('--max_epoch', default = 3, type = int)
parser.add_argument('--max_length', default = 160, type = int)
parser.add_argument('--kf', default = 4, type = int)
parser.add_argument('--data_routes', default = data_routes, nargs = '+', type = str)
# Inside a notebook there is no real command line, so parse an empty argv and
# rely on the defaults above.
args = parser.parse_args(args=[])

# Shared Trainer settings. maketrain overwrites `output_dir` and `run_name`
# for each backbone, so the values given here are only placeholders.
training_args = TrainingArguments(
    # Where artefacts and logs go.
    output_dir = "./results/default",
    run_name = "default",
    logging_dir = './logs',
    logging_steps = 30,
    report_to = "wandb",
    # Optimisation schedule.
    num_train_epochs = args.max_epoch,
    per_device_train_batch_size = args.batch_size,
    per_device_eval_batch_size = args.batch_size,
    weight_decay = 0.01,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best 'pearson' value.
    eval_strategy = "epoch",
    save_strategy = 'epoch',
    load_best_model_at_end = True,
    metric_for_best_model = 'pearson',
)



In [15]:
# Fine-tune every backbone listed in args.model_list; `preds` maps each short
# model name to the prediction output for the test set.
preds = maketrain(args, training_args)

0,1
eval/loss,█▆▅▄▃▂▁▁
eval/pearson,▁▃▆▆▇▇██
eval/runtime,▆▆█▁▆▁█▆
eval/samples_per_second,▃▃▁█▃█▁▃
eval/steps_per_second,▃▃▁█▃█▁▃
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▁▂▃▄▅▅▆▇█▁▂▃▄▄▅▅▆▇█▁▂▃▄▄▅▆▆▇█▁▂▃▄▄▅▆▇██
train/global_step,▁▁▂▃▄▅▅▆▇█▁▂▃▄▄▅▅▆▇█▁▂▃▄▄▅▆▆▇█▁▂▃▄▄▅▆▇██

0,1
eval/loss,0.22402
eval/pearson,0.98906
eval/runtime,17.1168
eval/samples_per_second,326.872
eval/steps_per_second,10.224
test/runtime,3.2841
test/samples_per_second,334.943
test/steps_per_second,10.657
total_flos,0.0
train/epoch,2.0


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 1-  /  Now_model : [koelectra-base-v3-discriminator]


Epoch,Training Loss,Validation Loss,Pearson
1,5.1871,4.458127,0.922068
2,3.7939,3.692455,0.941227
3,3.5857,3.349017,0.946605


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 2-  /  Now_model : [koelectra-base-v3-discriminator]


Epoch,Training Loss,Validation Loss,Pearson
1,2.3403,1.837132,0.973528
2,1.3258,1.162721,0.97736
3,1.2632,1.003826,0.981546


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 3-  /  Now_model : [koelectra-base-v3-discriminator]


Epoch,Training Loss,Validation Loss,Pearson
1,0.3611,0.263282,0.982966
2,0.1673,0.082568,0.987275
3,0.145,0.075055,0.989981


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 4-  /  Now_model : [koelectra-base-v3-discriminator]


Epoch,Training Loss,Validation Loss,Pearson
1,0.1616,0.071236,0.989083
2,0.1493,0.050582,0.991744
3,0.0934,0.049612,0.993277


found KR-ELECTRA


0,1
eval/loss,█▇▆▄▃▃▁▁▁▁▁▁
eval/pearson,▁▃▃▆▆▇▇▇████
eval/runtime,█▆▅▇▇▄▅█▁▆█▄
eval/samples_per_second,▁▃▄▂▂▅▄▁█▃▁▅
eval/steps_per_second,▁▃▄▂▂▅▄▁█▃▁▅
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▇▇█
train/global_step,▁▂▂▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▇▇█

0,1
eval/loss,0.04961
eval/pearson,0.99328
eval/runtime,17.0932
eval/samples_per_second,327.324
eval/steps_per_second,10.238
test/runtime,3.279
test/samples_per_second,335.466
test/steps_per_second,10.674
total_flos,0.0
train/epoch,3.0


found KR-ELECTRA


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 1-  /  Now_model : [KR-ELECTRA-discriminator]


Epoch,Training Loss,Validation Loss,Pearson
1,5.5984,4.996432,0.917399
2,4.1756,4.03358,0.940422
3,3.9286,3.68402,0.947394


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 2-  /  Now_model : [KR-ELECTRA-discriminator]


Epoch,Training Loss,Validation Loss,Pearson
1,2.7591,2.274101,0.966072
2,1.5673,1.716985,0.97509
3,1.4584,1.559862,0.980363


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 3-  /  Now_model : [KR-ELECTRA-discriminator]


Epoch,Training Loss,Validation Loss,Pearson
1,0.4305,0.45759,0.981937
2,0.1943,0.22365,0.985088
3,0.1533,0.148391,0.988884


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 4-  /  Now_model : [KR-ELECTRA-discriminator]


Epoch,Training Loss,Validation Loss,Pearson
1,0.1666,0.076858,0.986406
2,0.1524,0.052554,0.990051
3,0.0974,0.045889,0.993009


Some weights of the model checkpoint at Alibaba-NLP/gte-multilingual-base were not used when initializing NewModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0,1
eval/loss,█▇▆▄▃▃▂▁▁▁▁▁
eval/pearson,▁▃▄▆▆▇▇▇█▇██
eval/runtime,▁▁▁▁▁▁▄▁▁█▁▁
eval/samples_per_second,█▇████▅██▁██
eval/steps_per_second,█▇████▅██▁██
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▇▇█
train/global_step,▁▂▂▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▇▇█

0,1
eval/loss,0.04589
eval/pearson,0.99301
eval/runtime,17.8812
eval/samples_per_second,312.898
eval/steps_per_second,9.787
test/runtime,3.4388
test/samples_per_second,319.875
test/steps_per_second,10.178
total_flos,0.0
train/epoch,3.0


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 1-  /  Now_model : [gte-multilingual-base]


Epoch,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 734.00 MiB. GPU 0 has a total capacty of 31.74 GiB of which 218.38 MiB is free. Process 538227 has 1.07 GiB memory in use. Process 630379 has 1.07 GiB memory in use. Process 631223 has 1.07 GiB memory in use. Process 636519 has 1.07 GiB memory in use. Process 638847 has 384.00 MiB memory in use. Process 652174 has 1.07 GiB memory in use. Process 655151 has 1.07 GiB memory in use. Process 713755 has 1.13 GiB memory in use. Process 717260 has 1.48 GiB memory in use. Process 1194325 has 1.05 GiB memory in use. Process 2839881 has 1.64 GiB memory in use. Process 2860480 has 6.84 GiB memory in use. Process 2866205 has 9.58 GiB memory in use. Process 2933754 has 3.03 GiB memory in use. Of the allocated memory 4.64 GiB is allocated by PyTorch, and 4.55 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [11]:
def make_csv(preds, test_path = '/data/ephemeral/home/data/test.csv', out_dir = None, index = True):
    """Write one ``<model name>.csv`` prediction file per entry in ``preds``.

    Args:
        preds: mapping of model name to an object whose ``predictions``
            attribute holds that model's scores.
        test_path: CSV providing the ``id`` column for the output files.
        out_dir: directory to write into (created if missing). ``None`` keeps
            the original behaviour of writing to the working directory.
        index: forwarded to ``DataFrame.to_csv``. The default ``True`` keeps
            the original output format; pass ``False`` for files with only
            the ``id`` and ``target`` columns.
    """
    test_id = pd.read_csv(test_path)['id']
    if out_dir is not None:
        os.makedirs(out_dir, exist_ok = True)
    for name in preds:
        # Flatten first so both (N,) and (N, 1) prediction arrays fit into a
        # single DataFrame column.
        target = np.asarray(preds[name].predictions).reshape(-1).round(2)
        d = pd.DataFrame({'id' : test_id, 'target' : target})
        path = f'{name}.csv' if out_dir is None else os.path.join(out_dir, f'{name}.csv')
        d.to_csv(path, index = index)
        print(path)
    print('done.')

In [13]:
# Write one "<model name>.csv" file per entry in `preds`.
make_csv(preds)

koelectra-base-v3-discriminator.csv
KR-ELECTRA-discriminator.csv
gte-multilingual-base.csv
roberta-base.csv
KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS.csv
bert-base.csv
done.
