In [38]:
import argparse
import random
import pandas as pd
import os
import re
import torch
import transformers
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
import torch.nn as nn
import wandb
import torchmetrics
from scipy.stats import pearsonr
import evaluate
import gc
from transformers import ElectraModel, ElectraTokenizer

from sklearn.model_selection import KFold
import numpy as np

torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
random.seed(0)



In [39]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, tokenizer, max_length, mode = 'train'):
        self.inputs = inputs
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_columns = ['sentence_1', 'sentence_2']
        self.mode = mode
    def cleaning_text(self, text):
        cleaned_text = re.sub(r'[ㅋ-ㅎ]+', '', text)
        cleaned_text = re.sub(r"[^가-힣a-zA-Z0-9\s]", "", cleaned_text)
        return cleaned_text

    def __len__(self):
        return len(self.inputs) 
    
    def __getitem__(self, idx):
        t = self.inputs.iloc[idx] 

        text = '[SEP]'.join([t[col] for col in self.text_columns])
        text = self.cleaning_text(text)
        output = self.tokenizer(text,
                                padding='max_length',
                                max_length=self.max_length,
                                truncation=True)

        datas = torch.tensor(output['input_ids'], dtype = torch.long)
        attn = torch.tensor(output['attention_mask'], dtype = torch.long)
        # type_ids = torch.tensor(output['token_type_ids'], dtype = torch.long)
        if self.mode == 'train':
            labels = t['label']
            output = {'input_ids' : datas,
                      'attention_mask' : attn,
                    #   'token_type_ids' : type_ids,
                      'labels' : labels}
            return output
        else:
            output = {'input_ids' : datas,
                      'attention_mask' : attn,
                    #   'token_type_ids' : type_ids
                    }
            return output
    

    


In [40]:
def compute_metrics(model_preds):
    preds, labels = model_preds
    preds = torch.tensor(preds, dtype = torch.float32).squeeze(-1)
    labels = torch.tensor(labels, dtype = torch.float32).squeeze(-1)
    pear = torchmetrics.PearsonCorrCoef()
    pearson = pear(preds, labels)
    return {'pearson' : pearson.item()}

In [41]:

class MyModel(nn.Module):
    def __init__(self, model_name):
        super(MyModel, self).__init__()
        if model_name == "snunlp/KR-ELECTRA-discriminator":
            print('found KR-ELECTRA')
            self.model = ElectraModel.from_pretrained("snunlp/KR-ELECTRA-discriminator")
        else:
            self.model = transformers.AutoModel.from_pretrained(
                model_name,
                trust_remote_code=True
            )
            
        # 첫 번째 Conv1D 레이어
        self.Conv1 = nn.Conv1d(
            in_channels=768,  # BERT의 출력 차원
            out_channels=256,
            kernel_size=3,
            padding=1
        )
        
        # 두 번째 Conv1D 레이어 (필요 시 추가)
        self.Conv2 = nn.Conv1d(
            in_channels=256,  # Conv1의 출력 차원
            out_channels=128,  # Conv2의 출력 차원
            kernel_size=3,
            padding=1
        )
        
        self.output_layer = nn.Linear(128, 1)  
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)
        self.avg_pool = nn.AdaptiveAvgPool1d(1)  
        self.batchnorm1 = nn.BatchNorm1d(256)  
        self.batchnorm2 = nn.BatchNorm1d(128)  
        self.maxpool = nn.MaxPool1d(kernel_size = 2)  
        # self.sigmoid = nn.Sigmoid()
        self.loss_fn = nn.MSELoss()

    def forward(self, input_ids, attention_mask, 
                # token_type_ids,
                  labels = None):
        output = self.model(input_ids=input_ids, attention_mask=attention_mask, 
                            # token_type_ids=token_type_ids
                            )
        
        output = output.last_hidden_state.permute(0, 2, 1)  

        cnn_output = self.Conv1(output)  # Shape: (B, 256, L)
        cnn_output = self.relu(cnn_output)
        cnn_output = self.batchnorm1(cnn_output)  
        cnn_output = self.dropout(cnn_output)
        cnn_output = self.maxpool(cnn_output) # (B, 128, L/2)

        cnn_output = self.Conv2(cnn_output)  # Shape: (B, 128, L/2)
        cnn_output = self.relu(cnn_output)
        cnn_output = self.batchnorm2(cnn_output) 
        cnn_output = self.dropout(cnn_output)
        cnn_output = self.avg_pool(cnn_output)  #(B, 128, 1)

        cnn_output = cnn_output.view(cnn_output.size(0), -1)  # Shape: (B, 128)
        output = self.output_layer(cnn_output).squeeze(-1)
        
        if labels is not None:
            loss = self.loss_fn(output, labels.float())
            return {'output' : output, 'loss' : loss}

        else:  
            return {'output' : output}



In [42]:

def maketrain(args,training_args):

    model_list = args.model_list
    max_length = args.max_length
    k = args.kf
    kf = KFold(n_splits = k, shuffle = True, random_state = 0)
    data_routes = args.data_routes
    preds = {}
    test = pd.read_csv('/data/ephemeral/home/data/test.csv')
    df = pd.DataFrame()
    for route in data_routes:
        df = pd.concat([df, pd.read_csv(route)])
    df.reset_index(drop = True)
    df = df[['sentence_1', 'sentence_2' ,'label']].dropna().reset_index(drop = True)



    for model_name in model_list:
        name = model_name.split('/')[-1]
        model = MyModel(model_name)
        training_args.output_dir = f"./results/{name}"
        training_args.run_name = f'{name}'
        wandb_run = wandb.init(project = "yongruka", name = f"{name}", reinit = True)
        if model_name == "snunlp/KR-ELECTRA-discriminator":
            print('found KR-ELECTRA')
            tokenizer = ElectraTokenizer.from_pretrained("snunlp/KR-ELECTRA-discriminator")
        else:
            tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code = True)
        test_dataset = Dataset(test, tokenizer, max_length, mode = 'test')
        data_collator = DataCollatorWithPadding(
                tokenizer = tokenizer,
                padding = True,
                return_tensors = 'pt'
            )
        

        for fold, (train_index, val_index) in enumerate(kf.split(df)):

            print(f'-Fold : {fold+1}-  /  Now_model : [{name}]')

            train_fold = df.iloc[train_index]
            val_fold = df.iloc[val_index]
            train_fold = Dataset(train_fold, tokenizer, max_length)
            val_fold = Dataset(val_fold, tokenizer, max_length)

            trainer = Trainer( 
                model = model,
                tokenizer = tokenizer,
                args = training_args,
                train_dataset = train_fold,
                eval_dataset = val_fold,
                compute_metrics = compute_metrics,
                data_collator = data_collator,
                
            )
            for param in model.parameters():
                if not param.is_contiguous():
                    param.data = param.data.contiguous()

        
            trainer.train()
        trainer.save_model(f'results/best_model_{name}')

        pred = trainer.predict(test_dataset)
        preds[name] = pred
        gc.collect()
        
    return preds


In [43]:
!rm -rf /root/.cache/wandb
!rm -rf /root/.config/wandb
!rm -rf /root/.netrc
os.environ["WANDB_API_KEY"] = "ea26fff0d932bc74bbfad9fd507b292c67444c02"
wandb.init(project="yonruka")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


0,1
eval/loss,█▆▄▂▁▁▁▁▁▁▁▁
eval/pearson,▁▃▄▆▇▇▇▇█▇██
eval/runtime,▃▄▄▄█▁▇▆▃▇▅▃
eval/samples_per_second,▆▅▅▅▁█▂▃▆▂▄▆
eval/steps_per_second,▆▅▅▅▁█▂▃▆▂▄▆
test/runtime,▁
test/samples_per_second,▁
test/steps_per_second,▁
train/epoch,▁▂▂▃▄▅▅▆▇▇▁▂▃▃▄▅▅▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█
train/global_step,▁▂▂▃▄▅▅▆▇▇▁▂▃▃▄▅▅▆▇█▁▂▃▃▄▅▆▆▇█▁▂▃▃▄▅▆▆▇█

0,1
eval/loss,0.0745
eval/pearson,0.99617
eval/runtime,25.9355
eval/samples_per_second,326.656
eval/steps_per_second,10.218
test/runtime,3.3077
test/samples_per_second,332.557
test/steps_per_second,10.581
total_flos,0.0
train/epoch,3.0


In [46]:
model_list = [#'klue/roberta-small',
            "snunlp/KR-ELECTRA-discriminator",
            'Alibaba-NLP/gte-multilingual-base',
            'klue/roberta-base',
            #'snunlp/KR-SBERT-Medium-extended-klueNLItriplet_PARpair_QApair-klueSTS',
            'snunlp/KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS',
            'klue/bert-base',
            #'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2',

            ]
data_routes = ['/data/ephemeral/home/data/toast_processed_train.csv',]
            #    '/data/ephemeral/home/data/dev.csv']
            #    '/data/ephemeral/home/data/aug50000.csv']

parser = argparse.ArgumentParser()
parser.add_argument('--model_list', default = model_list, type = list)
parser.add_argument('--batch_size', default = 32, type = int)
parser.add_argument('--max_epoch', default = 3, type = int)
parser.add_argument('--max_length', default = 160, type = int)
parser.add_argument('--kf', default = 4, type = int)
parser.add_argument('--data_routes', default = data_routes, type = list)
args = parser.parse_args(args=[])

training_args = TrainingArguments(
    output_dir = f"./results/default",
    eval_strategy = "epoch",
    save_strategy = 'epoch',
    per_device_train_batch_size = args.batch_size,
    per_device_eval_batch_size = args.batch_size,
    num_train_epochs = args.max_epoch,
    weight_decay = 0.01,
    logging_dir = './logs',
    logging_steps = 30,
    report_to = "wandb",  
    run_name = "default",
    load_best_model_at_end = True,
    metric_for_best_model = 'pearson'

)



In [47]:
preds = maketrain(args, training_args)

found KR-ELECTRA


found KR-ELECTRA


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


-Fold : 1-  /  Now_model : [KR-ELECTRA-discriminator]


Epoch,Training Loss,Validation Loss


In [36]:
def make_csv(preds):
    test_id = pd.read_csv('/data/ephemeral/home/data/test.csv')['id']
    for name in preds:
        d = pd.DataFrame({'id' : test_id, 'target' : preds[name].predictions.round(2)})
        d.to_csv(f'{name}.csv')
        print(f'{name}.csv')
    print('done.')

In [37]:
make_csv(preds)

KR-ELECTRA-discriminator.csv
gte-multilingual-base.csv
roberta-base.csv
KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS.csv
bert-base.csv
done.


In [28]:
for name in preds:
    print(name)

KR-ELECTRA-discriminator
gte-multilingual-base
roberta-base
KR-SBERT-Medium-klueNLItriplet_PARpair-klueSTS
bert-base
