In [2]:
import argparse
import random

import pandas as pd

from tqdm.auto import tqdm

import transformers
import torch
import torchmetrics
import pytorch_lightning as pl


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
df = pd.read_csv('../../data/train.csv')
df.head()

Unnamed: 0,id,source,sentence_1,sentence_2,label,binary-label
0,boostcamp-sts-v1-train-000,nsmc-sampled,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~,"반전도 있고,사랑도 있고재미도있네요.",2.2,0.0
1,boostcamp-sts-v1-train-001,slack-rtt,앗 제가 접근권한이 없다고 뜹니다;;,"오, 액세스 권한이 없다고 합니다.",4.2,1.0
2,boostcamp-sts-v1-train-002,petition-sampled,주택청약조건 변경해주세요.,주택청약 무주택기준 변경해주세요.,2.4,0.0
3,boostcamp-sts-v1-train-003,slack-sampled,입사후 처음 대면으로 만나 반가웠습니다.,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다.,3.0,1.0
4,boostcamp-sts-v1-train-004,slack-sampled,뿌듯뿌듯 하네요!!,꼬옥 실제로 한번 뵈어요 뿌뿌뿌~!~!,0.0,0.0


In [4]:
model_name = "snunlp/KR-ELECTRA-discriminator"
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, max_length=160)
model = transformers.AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name, num_labels=1)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at snunlp/KR-ELECTRA-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
tokenizer

ElectraTokenizerFast(name_or_path='snunlp/KR-ELECTRA-discriminator', vocab_size=30000, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [6]:
model

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [7]:
testText = df['sentence_1'][0]
testText

'스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~'

In [8]:
tokenizer.tokenize(testText)

['스릴',
 '##도',
 '##있',
 '##고',
 '반전',
 '##도',
 '있',
 '##고',
 '여느',
 '한국',
 '##영화',
 '쓰레기',
 '##들',
 '##하고',
 '##는',
 '차원',
 '##이',
 '다르',
 '##네요',
 '~']

In [9]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, inputs, targets=[]):
        self.inputs = inputs
        self.targets = targets

    # 학습 및 추론 과정에서 데이터를 1개씩 꺼내오는 곳
    def __getitem__(self, idx):
        # 정답이 있다면 else문을, 없다면 if문을 수행합니다
        if len(self.targets) == 0:
            return torch.tensor(self.inputs[idx])
        else:
            return torch.tensor(self.inputs[idx]), torch.tensor(self.targets[idx])

    # 입력하는 개수만큼 데이터를 사용합니다
    def __len__(self):
        return len(self.inputs)

In [10]:
class Dataloader(pl.LightningDataModule):
    def __init__(self, model_name, batch_size, shuffle, train_path, dev_path, test_path, predict_path):
        super().__init__()
        self.model_name = model_name
        self.batch_size = batch_size
        self.shuffle = shuffle

        self.train_path = train_path
        self.dev_path = dev_path
        self.test_path = test_path
        self.predict_path = predict_path

        self.train_dataset = None
        self.val_dataset = None
        self.test_dataset = None
        self.predict_dataset = None

        self.tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, max_length=160)
        self.target_columns = ['label']
        self.delete_columns = ['id']
        self.text_columns = ['sentence_1', 'sentence_2']

    def tokenizing(self, dataframe):
        data = []
        for idx, item in tqdm(dataframe.iterrows(), desc='tokenizing', total=len(dataframe)):
            # 두 입력 문장을 [SEP] 토큰으로 이어붙여서 전처리합니다.
            text = '[SEP]'.join([item[text_column] for text_column in self.text_columns])
            outputs = self.tokenizer(text, add_special_tokens=True, padding='max_length', truncation=True)
            data.append(outputs['input_ids'])
            print(text, outputs,sep='\n')
        return data

    def preprocessing(self, data):
        # 안쓰는 컬럼을 삭제합니다.
        data = data.drop(columns=self.delete_columns)

        # 타겟 데이터가 없으면 빈 배열을 리턴합니다.
        try:
            targets = data[self.target_columns].values.tolist()
        except:
            targets = []
        # 텍스트 데이터를 전처리합니다.
        inputs = self.tokenizing(data)

        return inputs, targets

    def setup(self, stage='fit'):
        if stage == 'fit':
            # 학습 데이터와 검증 데이터셋을 호출합니다
            train_data = pd.read_csv(self.train_path)
            val_data = pd.read_csv(self.dev_path)

            # 학습데이터 준비
            train_inputs, train_targets = self.preprocessing(train_data)

            # 검증데이터 준비
            val_inputs, val_targets = self.preprocessing(val_data)

            # train 데이터만 shuffle을 적용해줍니다, 필요하다면 val, test 데이터에도 shuffle을 적용할 수 있습니다
            self.train_dataset = Dataset(train_inputs, train_targets)
            self.val_dataset = Dataset(val_inputs, val_targets)
        else:
            # 평가데이터 준비
            test_data = pd.read_csv(self.test_path)
            test_inputs, test_targets = self.preprocessing(test_data)
            self.test_dataset = Dataset(test_inputs, test_targets)

            predict_data = pd.read_csv(self.predict_path)
            predict_inputs, predict_targets = self.preprocessing(predict_data)
            self.predict_dataset = Dataset(predict_inputs, [])

    def train_dataloader(self):
        return torch.utils.data.DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=args.shuffle)

    def val_dataloader(self):
        return torch.utils.data.DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return torch.utils.data.DataLoader(self.test_dataset, batch_size=self.batch_size)

    def predict_dataloader(self):
        return torch.utils.data.DataLoader(self.predict_dataset, batch_size=self.batch_size)



In [11]:
dataloader = Dataloader("google-t5/t5-base", 16, True, None, None, None, None)



In [12]:
text = "[CLS] [UNK] [SEP] It is very rewarding. [SEP]"
token_ids = tokenizer.encode(text, add_special_tokens=False)

print(token_ids)

[2, 1, 3, 45, 5005, 17732, 90, 24703, 86, 15472, 17460, 10920, 18, 3]


In [13]:
dataloader.tokenizing(df[:10])

tokenizing:   0%|          | 0/10 [00:00<?, ?it/s]Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
tokenizing: 100%|██████████| 10/10 [00:00<00:00, 2095.79it/s]

스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~[SEP]반전도 있고,사랑도 있고재미도있네요.
{'input_ids': [3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 6306, 134, 8569, 908, 2, 3, 2, 6, 2, 3, 2, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
앗 제가 접근권한이 없다고 뜹니다;;[SEP]오, 액세스 권한이 없다고 합니다.
{'input_ids': [3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 117, 117, 6306, 134, 8569, 908, 2, 6, 3, 2, 3, 2, 3, 2, 3, 2, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
주택청약조건 변경해주세요.[SEP]주택청약 무주택기준 변경해주세요.
{'input_ids': [3, 2, 3, 2, 5, 6306, 134, 8569, 908, 2, 3, 2, 3, 2, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
입사후 처음 대면으로 만나 반가웠습니다.[SEP]화상으로만 보다가 리얼로 만나니 정말 반가웠습니다.
{'input_ids': [3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 5, 6306, 134, 8569, 908, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
뿌듯뿌




[[3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  6306,
  134,
  8569,
  908,
  2,
  3,
  2,
  6,
  2,
  3,
  2,
  5,
  1],
 [3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  117,
  117,
  6306,
  134,
  8569,
  908,
  2,
  6,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  5,
  1],
 [3, 2, 3, 2, 5, 6306, 134, 8569, 908, 2, 3, 2, 3, 2, 5, 1],
 [3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  5,
  6306,
  134,
  8569,
  908,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  5,
  1],
 [3,
  2,
  3,
  2,
  1603,
  6306,
  134,
  8569,
  908,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  55,
  2,
  55,
  1],
 [3, 2, 6306, 134, 8569, 908, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 1],
 [3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  5,
  5,
  3,
  2,
  6306,
  134,
  8569,
  908,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  5,
  5,
  3,
  2,
  1],
 [3,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  5,
  2,
  6306,
  134,
  8569,
  908,
  2,
  3,
  2,
  3,
  2,
  3,
  2,
  