In [1]:
import torch
import dill
import pytorch_lightning as pl
import transformers
import pandas as pd
from dataloader import Dataloader
import torchmetrics
from model import Model

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Pretrained checkpoint shared by the tokenizer and the model wrapper.
MODEL_NAME = "monologg/koelectra-base-v3-discriminator"

# NOTE(review): max_length is forwarded to from_pretrained exactly as before;
# confirm it has the intended effect on truncation/padding.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, max_length=160)

# Extend the vocabulary: the extra tokens are the header row (column names)
# of new_token.csv, and two marker tokens are registered as special tokens.
new_tokens = pd.read_csv("new_token.csv").columns.tolist()
special_tokens_dict = {"additional_special_tokens": ["[RTT]", "[ORG]"]}
tokenizer.add_special_tokens(special_tokens_dict)
tokenizer.add_tokens(new_tokens)

# The hyperparameters are passed positionally; their meaning is defined by
# Model.__init__ in model.py, which is not visible from this notebook.
model = Model(
    MODEL_NAME,
    2e-5,
    35,
    0.1,
    1,
    tokenizer,
)

# `gpus=` was deprecated in Lightning 1.7 and removed in 2.0; the
# accelerator/devices pair requests the same single-GPU setup.
trainer = pl.Trainer(accelerator="gpu", devices=1, log_every_n_steps=1)

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: 

In [3]:
len(tokenizer)

35085

In [4]:
# Build the data module. Every argument is positional and forwarded in the
# original order; the same dev.csv path is supplied three times
# (presumably the dev / test / predict splits -- confirm against the
# Dataloader signature in dataloader.py).
dev_csv = "../../data/dev.csv"
dataloader = Dataloader(
    "monologg/koelectra-base-v3-discriminator",
    32,
    False,
    "../../data/train.csv",
    dev_csv,
    dev_csv,
    dev_csv,
    4,
    tokenizer,
)

In [5]:
# SECURITY: this unpickles the file (via dill), which can execute arbitrary
# code -- only load checkpoints from a trusted source.
#
# The file holds a whole pickled object whose state_dict() is read in the
# next cell. Mapping it to the CPU avoids keeping a second copy of the
# weights on the GPU and lets the file load on hosts without CUDA.
checkpoint = torch.load(
    "model_10_28.pt",
    pickle_module=dill,
    map_location="cpu",
)

In [6]:
# Copy the weights of the unpickled checkpoint object into the freshly built
# model; the call's return value is displayed as the cell output.
model.load_state_dict(checkpoint.state_dict())

<All keys matched successfully>

In [7]:
# Run the Lightning test loop on the data module and keep whatever
# trainer.test returns; the metric table printed below reports test_pearson.
result = trainer.test(model=model, datamodule=dataloader)

tokenizing: 100%|██████████| 550/550 [00:00<00:00, 2453.24it/s]
tokenizing: 100%|██████████| 550/550 [00:00<00:00, 2486.20it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing DataLoader 0: 100%|██████████| 18/18 [00:13<00:00,  1.33it/s]
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      test_pearson          0.9079529643058777
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [8]:
# Run the Lightning predict loop; the result is a list holding one tensor of
# scores per batch (see the output displayed in the following cell).
predictions = trainer.predict(model=model, datamodule=dataloader)

tokenizing: 100%|██████████| 550/550 [00:00<00:00, 2672.43it/s]
tokenizing: 100%|██████████| 550/550 [00:00<00:00, 2635.29it/s]
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Predicting DataLoader 0: 100%|██████████| 18/18 [00:12<00:00,  1.40it/s]


In [9]:
# Inspect the raw predict output: a list of per-batch score tensors.
predictions

[tensor([1.7456, 3.1178, 4.3055, 3.6427, 2.1913, 3.2191, 2.1326, 1.6633, 3.8716,
         2.7186, 3.5678, 3.1476, 3.3361, 0.0060, 2.7364, 1.5919, 3.7550, 1.9116,
         1.0269, 1.0667, 4.2774, 3.6113, 1.0410, 1.0937, 1.3662, 2.1332, 4.3061,
         3.7815, 1.2882, 1.7056, 0.0343, 0.6909]),
 tensor([3.1222, 1.2518, 0.9726, 1.2240, 2.6844, 2.3059, 4.3773, 1.2623, 1.3673,
         3.8686, 2.2744, 4.1944, 1.9586, 4.4182, 4.1105, 1.3136, 2.7080, 3.2875,
         2.3856, 2.5903, 2.4081, 2.2835, 1.8196, 0.9330, 2.0743, 4.8792, 4.3232,
         0.3958, 2.3244, 0.4078, 3.3301, 3.9596]),
 tensor([ 4.2461,  3.3363,  4.4530,  0.1795,  2.7854,  4.2405,  1.4739, -0.0198,
          1.9889,  1.4241,  4.0203,  4.6171,  3.9165,  4.1365,  1.2104,  1.9799,
          4.0383,  3.1827,  3.3745,  3.2620,  4.1456,  3.5024,  2.4832,  0.2206,
          1.7998,  0.9154,  4.1206,  1.1714,  3.8107,  1.2164,  3.9101,  3.9535]),
 tensor([ 3.8422,  3.3260, -0.0077,  1.0659,  3.8614,  3.9157,  2.0920,  4.0113,
     

In [10]:
# Flatten the per-batch tensors into one list of Python floats rounded to a
# single decimal place.
# The guard makes the cell safe to re-run: after the first run `predictions`
# already holds floats, and calling torch.cat on them would raise.
if predictions and torch.is_tensor(predictions[0]):
    predictions = [round(score, 1) for score in torch.cat(predictions).tolist()]

In [11]:
# Inspect the flattened predictions, now plain floats rounded to one decimal.
predictions

[1.7,
 3.1,
 4.3,
 3.6,
 2.2,
 3.2,
 2.1,
 1.7,
 3.9,
 2.7,
 3.6,
 3.1,
 3.3,
 0.0,
 2.7,
 1.6,
 3.8,
 1.9,
 1.0,
 1.1,
 4.3,
 3.6,
 1.0,
 1.1,
 1.4,
 2.1,
 4.3,
 3.8,
 1.3,
 1.7,
 0.0,
 0.7,
 3.1,
 1.3,
 1.0,
 1.2,
 2.7,
 2.3,
 4.4,
 1.3,
 1.4,
 3.9,
 2.3,
 4.2,
 2.0,
 4.4,
 4.1,
 1.3,
 2.7,
 3.3,
 2.4,
 2.6,
 2.4,
 2.3,
 1.8,
 0.9,
 2.1,
 4.9,
 4.3,
 0.4,
 2.3,
 0.4,
 3.3,
 4.0,
 4.2,
 3.3,
 4.5,
 0.2,
 2.8,
 4.2,
 1.5,
 -0.0,
 2.0,
 1.4,
 4.0,
 4.6,
 3.9,
 4.1,
 1.2,
 2.0,
 4.0,
 3.2,
 3.4,
 3.3,
 4.1,
 3.5,
 2.5,
 0.2,
 1.8,
 0.9,
 4.1,
 1.2,
 3.8,
 1.2,
 3.9,
 4.0,
 3.8,
 3.3,
 -0.0,
 1.1,
 3.9,
 3.9,
 2.1,
 4.0,
 2.9,
 4.3,
 1.0,
 4.2,
 2.9,
 2.6,
 2.5,
 1.3,
 2.8,
 2.8,
 4.2,
 4.5,
 1.8,
 1.6,
 4.2,
 4.1,
 0.8,
 4.0,
 3.7,
 2.8,
 0.7,
 3.0,
 0.5,
 3.6,
 4.1,
 1.3,
 2.9,
 0.1,
 1.6,
 4.0,
 4.2,
 2.0,
 2.9,
 4.4,
 4.0,
 4.4,
 3.3,
 1.6,
 1.2,
 2.0,
 3.7,
 3.6,
 3.1,
 3.4,
 1.8,
 3.8,
 4.2,
 1.9,
 2.9,
 3.1,
 3.3,
 1.0,
 2.7,
 4.2,
 0.7,
 1.2,
 4.1,
 4.0,
 4.4,
 0.4,
 3.8,
 2.3,
 2

In [12]:
# Load the dev split as a DataFrame so predictions can be compared with the
# gold labels and the offending rows printed.
val_data = pd.read_csv("../../data/dev.csv")

In [13]:
# Gold similarity scores, as a plain Python list in row order.
labels = val_data["label"].tolist()

In [14]:
# Minimum absolute gap between prediction and gold label for an example to
# be reported as a large error.
ERROR_THRESHOLD = 1.5

# zip() stops at the shorter sequence, so a length mismatch would silently
# pair predictions with the wrong rows; fail loudly instead.
assert len(predictions) == len(labels), (
    f"{len(predictions)} predictions vs {len(labels)} labels"
)

# Print every dev example whose prediction misses the label by at least the
# threshold, and count them in `cnt`.
cnt = 0
for idx, (pred, y) in enumerate(zip(predictions, labels)):
    if abs(pred - y) >= ERROR_THRESHOLD:
        cnt += 1
        print(f"model pred: {pred}")
        print(val_data.iloc[idx])
        print('-------------------------------------')

model pred: 1.0
id              boostcamp-sts-v1-dev-018
source                         slack-rtt
sentence_1                   코파일럿은 사랑입니다
sentence_2                    조종사는 사랑입니다
label                                3.2
binary-label                         1.0
Name: 18, dtype: object
-------------------------------------
model pred: 2.0
id              boostcamp-sts-v1-dev-044
source                         slack-rtt
sentence_1                   ㅋㅋ두분 남매 바이브
sentence_2                   ㅎㅎ 두 남매 분위기
label                                3.6
binary-label                         1.0
Name: 44, dtype: object
-------------------------------------
model pred: 2.7
id              boostcamp-sts-v1-dev-156
source                  petition-sampled
sentence_1                문재인 대통님게 청원합니다
sentence_2               문제인 대통령님깨 청원합니다
label                                5.0
binary-label                         1.0
Name: 156, dtype: object
-------------------------------------
model pred: 3.6
id         

In [15]:
# Number of dev examples whose prediction is off by at least 1.5.
cnt

12

In [19]:
# Decode the first element of test item 4 back to text to check how the
# special tokens and padding were laid out.
tokenizer.decode(dataloader.test_dataset[4][0])

'[CLS] [ORG] 다음 밥스테이지가 기대됩니다 ~ ㅎ [SEP] 다음 후기도 기대됩니다 ~ ~ [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [17]:
# Sanity check: the added "[ORG]" marker should encode to a single id between
# the two wrapper ids, i.e. it is not split into sub-word pieces.
tokenizer(['[ORG]'])

{'input_ids': [[2, 35001, 3]], 'token_type_ids': [[0, 0, 0]], 'attention_mask': [[1, 1, 1]]}