In [1]:
import argparse
import os
import random

import numpy as np
import pandas as pd
import torch
from tokenizers import BertWordPieceTokenizer
from tqdm import trange
from transformers import AutoModelForSequenceClassification

In [20]:
def exec(text: str, tokenizer, max_length: int = 256) -> "tuple[int, torch.Tensor, torch.Tensor]":
    """Classify a single text with the notebook-level ``model``.

    NOTE: this function shadows Python's built-in ``exec``; the name is kept
    so existing callers keep working.

    Relies on two module-level names being defined before the call:
    ``model`` (called with ``input_ids``, ``token_type_ids`` and
    ``attention_mask`` keyword arguments, returning a mapping with a
    ``'logits'`` entry) and ``device`` (the torch device the inputs are
    moved to).

    Args:
        text: Raw input string to classify.
        tokenizer: Object whose ``encode(text)`` returns an encoding exposing
            ``pad``, ``truncate``, ``ids``, ``type_ids`` and
            ``attention_mask``.
        max_length: Length the encoding is padded and truncated to.
            Defaults to 256, the value that used to be hard-coded.

    Returns:
        A tuple ``(label, p0, p1)``: ``label`` is ``1`` when the arg-max of
        the logits is index 1 and ``0`` otherwise; ``p0`` and ``p1`` are the
        softmax scores at index 0 and index 1 as zero-dimensional tensors.
    """
    encoded = tokenizer.encode(text)
    # Pad short inputs up to max_length, then cut long ones down to it, so
    # the model is fed a fixed-length sequence.
    encoded.pad(max_length)
    encoded.truncate(max_length)

    # Wrap each field in a list to add the batch dimension (batch size 1).
    input_ids = torch.tensor([encoded.ids]).to(device)
    token_type_ids = torch.tensor([encoded.type_ids]).to(device)
    attention_mask = torch.tensor([encoded.attention_mask]).to(device)

    # Inference only: skip autograd bookkeeping so no graph is built and the
    # returned scores do not hold on to one.
    with torch.no_grad():
        output = model(
            input_ids=input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
        )
        logits = output['logits']
        probabilities = torch.softmax(logits, dim=1)

    label = 1 if logits.argmax(-1).item() == 1 else 0
    return label, probabilities[0][0], probabilities[0][1]

# Pick the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the sequence-classification model from the current directory, put it
# in evaluation mode and move it onto the selected device.
model = AutoModelForSequenceClassification.from_pretrained('./').eval().to(device)

# WordPiece tokenizer built from the vocabulary file in the current directory.
tokenizer_options = dict(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,
    lowercase=False,
    wordpieces_prefix="##",
)
tokenizer = BertWordPieceTokenizer("./vocab.txt", **tokenizer_options)

In [23]:
%%time
# Sample input used for the demo below.
test = "아버지가방에들어가신다"

# Show how the tokenizer segments the sample.
wordpieces = tokenizer.encode(test).tokens
print(wordpieces)

# Classify the sample; `none` and `curse` receive the scores at index 0 and 1.
label, none, curse = exec(test, tokenizer)
print(f"{'curse' if label else 'none'}, {none:f}, {curse:f}", end="\n\n")

['[CLS]', '아버지가', '##방에', '##들어가', '##신다', '[SEP]']
none, 0.971718, 0.028282

CPU times: user 20 ms, sys: 4 ms, total: 24 ms
Wall time: 24 ms


In [8]:
# Scramble the characters inside each whitespace-separated word of `test`
# (defined in an earlier cell) while keeping the word order.
# The result is random on every run; seed `random` first if a reproducible
# output is needed.
t = test.split()

# Build the string with a single join so `n_t` itself carries no leading
# space (previously the stripped value was only displayed, never stored).
n_t = ' '.join(''.join(random.sample(w, k=len(w))) for w in t)
n_t

'어죽'