In [1]:
# Colab-only setup: mount Google Drive at /content/drive. The default data,
# checkpoint and output paths defined in parse_args() all live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import os
import re

In [None]:
import torch
import argparse
import torch.nn as nn
from tqdm import trange, tqdm
from transformers import (
    AutoModel,
    AutoConfig,
    BertConfig,
    BertModel,
    XLMRobertaModel,
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    DataCollatorWithPadding)
from torch.utils.data import DataLoader, TensorDataset
from transformers import get_linear_schedule_with_warmup
from transformers import AdamW
from sklearn.metrics import f1_score

In [None]:
# Special-token ids. NOTE(review): these are not referenced anywhere else in the
# visible notebook; the values presumably mirror a RoBERTa-style vocabulary
# (<s>=0, <pad>=1, </s>=2) — confirm against the tokenizer actually used, or remove.
PADDING_TOKEN = 1
S_OPEN_TOKEN = 0
S_CLOSE_TOKEN = 2

In [None]:
# Module-level torch device shared by the train and demo functions below:
# CUDA when a GPU is visible to torch, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device: ', device)

device:  cuda


In [None]:
# Extra special tokens registered on the tokenizer (via add_special_tokens) in the
# train and demo functions, so each placeholder is kept as a single token.
# They look like anonymisation placeholders used in the corpus — TODO confirm.
# NOTE(review): the sample training data printed further down also contains
# '&location&', which is not listed here — verify whether it should be added.
special_tokens_dict = {
    'additional_special_tokens': ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']
}

### json 불러오기

In [None]:
def jsonload(fname, encoding="utf-8"):
    """Parse a single JSON document from disk.

    Args:
        fname: Path of the JSON file to read.
        encoding: Text encoding used to open the file.

    Returns:
        The decoded Python object (dict, list, ...).
    """
    with open(fname, encoding=encoding) as handle:
        return json.load(handle)

In [None]:
# Serialise a JSON-compatible object to the given file name.
def jsondump(j, fname):
    """Write ``j`` to ``fname`` as UTF-8 JSON.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) rather than
    as ``\\uXXXX`` escapes, so Korean text stays readable in the file.

    Args:
        j: JSON-serialisable object.
        fname: Destination path; an existing file is overwritten.
    """
    with open(fname, "w", encoding="UTF8") as sink:
        json.dump(j, sink, ensure_ascii=False)

In [None]:
# Read a JSON Lines file and collect its records into a list.
def jsonlload(fname, encoding="utf-8"):
    """Load a JSON Lines file (one JSON document per line).

    Blank or whitespace-only lines are skipped, so a trailing newline or an
    accidental empty line in the file no longer raises ``JSONDecodeError``.

    Args:
        fname: Path of the ``.jsonl`` file to read.
        encoding: Text encoding used to open the file.

    Returns:
        list: The decoded records, in file order. Empty for an empty file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    json_list = []
    with open(fname, encoding=encoding) as f:
        # Iterate the handle directly instead of materialising readlines().
        for line in f:
            if not line.strip():
                continue
            json_list.append(json.loads(line))
    return json_list

In [None]:
# Save a list of JSON-serialisable records in JSON Lines format.
def jsonldump(j_list, fname):
    """Write each element of ``j_list`` as one JSON document per line.

    The file is opened in a ``with`` block so it is flushed and closed
    deterministically, even if serialisation of a record fails part-way.

    Args:
        j_list: Iterable of JSON-serialisable records.
        fname: Destination path; an existing file is overwritten.
    """
    with open(fname, "w", encoding='utf-8') as f:
        for json_data in j_list:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            f.write(json.dumps(json_data, ensure_ascii=False) + '\n')

### 파싱하기

In [None]:
def parse_args():
    """Build the run configuration from command-line style arguments.

    ``parse_known_args`` is used so that unrecognised entries on ``sys.argv``
    (for example arguments injected by a notebook kernel launcher) are ignored
    instead of aborting the parse.

    Returns:
        argparse.Namespace: Data paths, optimisation hyper-parameters, model
        identifiers and the ``do_*`` switches.
    """
    parser = argparse.ArgumentParser(description="unethical expression classifier using pretrained model")

    # --- data files -------------------------------------------------------
    parser.add_argument(
        "--train_data", type=str, default="/content/drive/MyDrive/NLP/data/nikluge-au-2022-train.jsonl",
        help="train file"
    )
    parser.add_argument(
        "--test_data", type=str, default="/content/drive/MyDrive/NLP/data/nikluge-au-2022-test.jsonl",
        help="test file"
    )
    parser.add_argument(
        "--pred_data", type=str, default="/content/drive/MyDrive/NLP/output/hate_you_6.jsonl",
        help="pred file"
    )
    parser.add_argument(
        "--dev_data", type=str, default="/content/drive/MyDrive/NLP/data/nikluge-au-2022-dev.jsonl",
        help="dev file"
    )

    # --- optimisation -----------------------------------------------------
    parser.add_argument(
        "--batch_size", type=int, default=8
    )
    parser.add_argument(
        "--learning_rate", type=float, default=3e-6
    )
    parser.add_argument(
        "--eps", type=float, default=1e-8
    )
    parser.add_argument(
        "--do_train", action="store_true"
    )
    parser.add_argument(
        "--do_eval", action="store_true"
    )
    parser.add_argument(
        "--do_test", action="store_true"
    )
    parser.add_argument(
        "--num_train_epochs", type=int, default=6
    )

    # --- model ------------------------------------------------------------
    parser.add_argument(
        "--base_model", type=str, default="beomi/korean-hatespeech-classifier"
    )
    parser.add_argument(
        "--model_path", type=str, default="/content/drive/MyDrive/NLP/save_models/korean-hatespeech-classifier/saved_model_epoch_6.pt"
    )
    parser.add_argument(
        # SECURITY: never hardcode a Hugging Face access token in the notebook.
        # The token is read from the HF_TOKEN environment variable instead; when
        # it is unset the default is None, which is what public checkpoints need.
        # The token previously committed here must be considered leaked and revoked.
        "--access_token", type=str, default=os.environ.get("HF_TOKEN"),
        help="Hugging Face access token (defaults to $HF_TOKEN)"
    )
    parser.add_argument(
        "--output_dir", type=str, default="/content/drive/MyDrive/NLP/output/"
    )
    parser.add_argument(
        "--do_demo", action="store_true"
    )
    parser.add_argument(
        "--max_len", type=int, default=256
    )
    parser.add_argument(
        "--classifier_hidden_size", type=int, default=768
    )
    parser.add_argument(
        # A dropout probability is a float in [0, 1); with type=int any value
        # given on the command line (e.g. "0.2") was rejected by argparse.
        "--classifier_dropout_prob", type=float, default=0.1, help="dropout in classifier"
    )
    args, unknowns = parser.parse_known_args()
    return args

In [None]:
# Global run configuration consumed by the train / demo / test calls below.
args = parse_args()

### 데이터 확인하기

In [None]:
# Sanity check: load the training file and print the first ten records
# (each record is a dict with 'id', 'input' and 'output' keys, as shown in the output).
example = jsonlload(args.train_data)
print(example[:10])

[{'id': 'nikluge-au-2022-train-000001', 'input': '보여주면서 왜 엿보냐고 비난 하는것도 웃기지만. 훔쳐 보면서 왜 보여주냐고 하는 사람 역시 우습다..', 'output': 1}, {'id': 'nikluge-au-2022-train-000002', 'input': '왜 개인 사생활을 방송으로 보여주고 싶은지 이해도 안가지만 &location&식 프로포즈란 무슨 자로 잰 든 무릎 꿇고 반지 내밀고 나랑 결혼해줄래? 가 전부이다.', 'output': 1}, {'id': 'nikluge-au-2022-train-000003', 'input': '이런 쓰레기같은 새끼가 아무렇지 않게 멀쩡히 돌아다닐 생각을 하니까 진짜 너무 소름돋는다.', 'output': 1}, {'id': 'nikluge-au-2022-train-000004', 'input': '인간의 탈을 쓰고...', 'output': 1}, {'id': 'nikluge-au-2022-train-000005', 'input': '인기글에 짱깨뭐라하니까 댓글로 ㅂㄷㅂㄷ하네요...', 'output': 1}, {'id': 'nikluge-au-2022-train-000006', 'input': '계속 페미년 거리면서 왜 그렇게 생각하는지 뭐 그딴거 아무것고 없곸', 'output': 1}, {'id': 'nikluge-au-2022-train-000007', 'input': '가게에 한남왔어', 'output': 1}, {'id': 'nikluge-au-2022-train-000008', 'input': '그래도 한줘라 하면 줄듯', 'output': 1}, {'id': 'nikluge-au-2022-train-000009', 'input': '참고로 몇몇 캐릭터 더 있는데 다 허벌창같아서 소개는 안하겠음', 'output': 1}, {'id': 'nikluge-au-2022-train-000010', 'input': '그냥 ‘나쁜 인간’ 내지는 감정이 좀 상승이 되시면

### DataLoader

In [None]:
def tokenize_and_align_labels(tokenizer, form, label, max_len):
    """Tokenize one sentence and bundle it with its label.

    The sentence is padded / truncated to exactly ``max_len`` tokens and the
    tokenizer's special tokens are added.

    Args:
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask``.
        form: Raw input text.
        label: Label attached to ``form``.
        max_len: Fixed sequence length to pad / truncate to.

    Returns:
        dict: ``{'input_ids': [...], 'attention_mask': [...], 'label': [...]}``
        where every value is a one-element list wrapping this sample's entry.
    """
    encoded = tokenizer(
        form,
        padding='max_length',
        max_length=max_len,
        truncation=True,
        add_special_tokens=True,
    )

    # Each field is wrapped in a list so callers can simply extend() their buffers.
    return {
        'input_ids': [encoded['input_ids']],
        'attention_mask': [encoded['attention_mask']],
        'label': [label],
    }

In [None]:
def get_dataset(raw_data, tokenizer, max_len):
    """Convert raw records into a ``TensorDataset``.

    Args:
        raw_data: Iterable of records with ``'input'`` (text) and ``'output'``
            (label) keys.
        tokenizer: Tokenizer forwarded to ``tokenize_and_align_labels``.
        max_len: Fixed sequence length every sample is padded / truncated to.

    Returns:
        TensorDataset: ``(input_ids, attention_mask, labels)`` tensors, one row
        per record, in input order.
    """
    input_ids_list, attention_mask_list, token_labels_list = [], [], []

    for utterance in raw_data:
        encoded = tokenize_and_align_labels(tokenizer, utterance['input'], utterance['output'], max_len)
        input_ids_list += encoded['input_ids']
        attention_mask_list += encoded['attention_mask']
        token_labels_list += encoded['label']

    # Quick visual check of the first few encoded samples.
    print(input_ids_list[:5])
    print(attention_mask_list[:5])
    print(token_labels_list[:5])

    columns = (input_ids_list, attention_mask_list, token_labels_list)
    return TensorDataset(*[torch.tensor(column) for column in columns])

### Model

In [None]:
class SimpleClassifier(nn.Module):
    """Two-layer classification head applied to the first token's hidden state.

    Pipeline: dropout -> linear -> tanh -> dropout -> linear.
    """

    def __init__(self, args, num_label):
        """Create the head.

        Args:
            args: Namespace providing ``classifier_hidden_size`` and
                ``classifier_dropout_prob``.
            num_label: Number of output classes.
        """
        super().__init__()
        hidden_size = args.classifier_hidden_size
        self.dense = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(args.classifier_dropout_prob)
        self.output = nn.Linear(hidden_size, num_label)

    def forward(self, features):
        """Return logits computed from ``features[:, 0, :]``.

        Args:
            features: 3-D tensor indexed as (batch, position, hidden); only
                position 0 is used.

        Returns:
            Tensor of logits with ``num_label`` entries per batch row.
        """
        first_token = self.dropout(features[:, 0, :])
        hidden = self.dropout(torch.tanh(self.dense(first_token)))
        return self.output(hidden)

In [None]:
class UnethicalExpressionClassifier(nn.Module):
    """Pretrained encoder followed by a ``SimpleClassifier`` head."""

    def __init__(self, args, num_label, len_tokenizer):
        """Load the encoder and attach the classification head.

        Args:
            args: Namespace providing ``base_model``, ``access_token`` and the
                classifier hyper-parameters read by ``SimpleClassifier``.
            num_label: Number of target classes.
            len_tokenizer: Tokenizer vocabulary size after special tokens were
                added; the encoder's embedding matrix is resized to match.
        """
        super().__init__()

        self.num_label = num_label
        print(self.num_label)

        # Only the bare encoder is loaded here (AutoModel); the classification
        # head is the custom SimpleClassifier created below.
        self.pre_trained_model = AutoModel.from_pretrained(
            args.base_model,
            token=args.access_token,
            ignore_mismatched_sizes=True,
        )
        self.pre_trained_model.resize_token_embeddings(len_tokenizer)

        self.labels_classifier = SimpleClassifier(args, self.num_label)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the encoder and head, optionally computing cross-entropy loss.

        Args:
            input_ids: Token id tensor.
            attention_mask: Attention mask tensor matching ``input_ids``.
            labels: Optional class-index tensor; when given, the loss is computed.

        Returns:
            tuple: ``(loss, logits)`` where ``loss`` is ``None`` if no labels
            were supplied.
        """
        encoder_outputs = self.pre_trained_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        # The first element of the encoder output is fed to the head.
        logits = self.labels_classifier(encoder_outputs[0])

        if labels is None:
            return None, logits

        criterion = nn.CrossEntropyLoss()
        loss = criterion(logits.view(-1, self.num_label), labels.view(-1))
        return loss, logits

In [None]:
def evaluation(y_true, y_pred):
    """Print per-class and micro-averaged F1 for the given predictions.

    Both sequences are coerced element-wise to ``int`` before scoring, so
    scalar tensors or numeric strings are accepted.

    Args:
        y_true: Gold labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    y_true = [int(value) for value in y_true]
    y_pred = [int(value) for value in y_pred]

    # (printed prefix, averaging mode) pairs, reported in this order.
    reports = (('f1_score: ', None), ('f1_score_micro: ', 'micro'))
    for prefix, averaging in reports:
        print(prefix, f1_score(y_true, y_pred, average=averaging))

### 훈련하기

In [None]:
def _resolve_checkpoint_dir(model_path):
    """Return the directory that per-epoch checkpoints should be written to.

    ``--model_path`` is consumed in two ways in this notebook: training writes
    checkpoints under it, while the demo loads it as one checkpoint file. When
    the value looks like a checkpoint file (``.pt`` / ``.pth`` / ``.bin`` /
    ``.ckpt``) and is not an existing directory, its parent directory is used,
    so the files written here line up with what the demo loads.
    """
    _, ext = os.path.splitext(model_path)
    looks_like_file = ext.lower() in ('.pt', '.pth', '.bin', '.ckpt')
    if looks_like_file and not os.path.isdir(model_path):
        return os.path.dirname(model_path) or '.'
    return model_path


def train_unethical_expression_classifier(args=None):
    """Fine-tune the classifier and save one checkpoint per epoch.

    Args:
        args: Namespace produced by ``parse_args``. When ``None`` the arguments
            are parsed on the spot (previously ``None`` crashed immediately).

    Side effects:
        Writes ``saved_model_epoch_<n>.pt`` (a ``state_dict``) into the
        checkpoint directory after every epoch and prints the average training
        loss; when ``args.do_eval`` is set, F1 on the dev set is printed too.
    """
    if args is None:
        args = parse_args()

    save_dir = _resolve_checkpoint_dir(args.model_path)
    # exist_ok avoids the racy "check then create" and the duplicated check
    # that used to be repeated inside the epoch loop.
    os.makedirs(save_dir, exist_ok=True)

    print('train_unethical_expression_classifier')
    print('model would be saved at ', save_dir)

    print('loading train data')
    train_data = jsonlload(args.train_data)
    dev_data = jsonlload(args.dev_data)

    print('tokenizing train data')
    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
    print('We have added', num_added_toks, 'tokens')
    train_dataloader = DataLoader(get_dataset(train_data, tokenizer, args.max_len), shuffle=True,
                                  batch_size=args.batch_size)
    # Evaluation metrics do not depend on order, so the dev set is not shuffled.
    dev_dataloader = DataLoader(get_dataset(dev_data, tokenizer, args.max_len), shuffle=False,
                                batch_size=args.batch_size)

    print('loading model')
    model = UnethicalExpressionClassifier(args, 2, len(tokenizer))
    model.to(device)

    FULL_FINETUNING = True
    if FULL_FINETUNING:
        param_optimizer = list(model.named_parameters())
        # Biases and LayerNorm weights are excluded from weight decay.
        # 'LayerNorm.weight' is the name used by Hugging Face encoders; the
        # legacy 'gamma' / 'beta' names are kept for older checkpoints.
        no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
        # The per-group key must be 'weight_decay': the previous
        # 'weight_decay_rate' key is not read by AdamW, so no decay was applied.
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
    else:
        # Head-only training: the head attribute is `labels_classifier`
        # (there is no `model.classifier`).
        param_optimizer = list(model.labels_classifier.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=args.learning_rate,
        eps=args.eps
    )
    epochs = args.num_train_epochs
    max_grad_norm = 1.0
    total_steps = epochs * len(train_dataloader)

    # Linear decay of the learning rate to zero over all steps, no warm-up.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

    for epoch_step in trange(1, epochs + 1, desc="Epoch"):
        model.train()
        total_loss = 0

        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            model.zero_grad()

            loss, _ = model(b_input_ids, b_input_mask, b_labels)

            loss.backward()

            total_loss += loss.item()

            # Clip before stepping to keep updates bounded.
            torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
            optimizer.step()
            scheduler.step()

        # max(..., 1) guards against an empty training set.
        avg_train_loss = total_loss / max(len(train_dataloader), 1)
        print("Epoch: ", epoch_step)
        print("Average train loss: {}".format(avg_train_loss))

        if args.do_eval:
            model.eval()

            pred_list = []
            label_list = []

            for batch in dev_dataloader:
                batch = tuple(t.to(device) for t in batch)
                b_input_ids, b_input_mask, b_labels = batch

                with torch.no_grad():
                    _, logits = model(b_input_ids, b_input_mask)

                predictions = torch.argmax(logits, dim=-1)
                # .tolist() moves the whole batch to Python ints at once instead
                # of accumulating one device tensor per element.
                pred_list.extend(predictions.tolist())
                label_list.extend(b_labels.tolist())

            evaluation(label_list, pred_list)

        # os.path.join instead of string concatenation, so a model_path without
        # a trailing slash no longer produces a mangled file name.
        model_saved_path = os.path.join(save_dir, 'saved_model_epoch_' + str(epoch_step) + '.pt')
        torch.save(model.state_dict(), model_saved_path)

    print("training is done")

In [None]:
def test_unethical_expression_classifier(args):

    test_data = jsonlload(args.test_data)
    pred_data = jsonlload(args.pred_data)

    temp_ground_truth_dict = {}

    true_list = []
    pred_list = []

    # 데이터 list로 변경
    for data in test_data:
        if data['id'] in temp_ground_truth_dict:
            return {
                "error": "정답 데이터에 중복된 id를 가지는 경우 존재"
            }
        temp_ground_truth_dict[data['id']] = data['output']

    for data in pred_data:
        if data['id'] not in temp_ground_truth_dict:
            return {
                "error": "제출 파일과 정답 파일의 id가 일치하지 않음"
            }
        true_list.append(temp_ground_truth_dict[data['id']])
        pred_list.append(data['output'])

    evaluation(true_list, pred_list)

In [None]:
def separate_by_s_token(form):
    """Split a decoded string into its segments using ``<s>`` / ``</s>`` markers.

    Segments are delimited by ``</s></s>``. Anything up to and including the
    last ``<s>`` of the first segment is dropped, as is everything from the
    first ``</s>`` of the last segment onwards; each segment is then stripped
    of surrounding whitespace.

    Args:
        form: Text containing the special-token markers.

    Returns:
        list[str]: The cleaned segments, in order.
    """
    segments = form.split('</s></s>')
    # Keep only the text after the last opening marker ...
    segments[0] = segments[0].rpartition('<s>')[2]
    # ... and before the first closing marker.
    segments[-1] = segments[-1].partition('</s>')[0]
    return [segment.strip() for segment in segments]

In [None]:
def demo_unethical_expression_classifier(args):
    """Predict a label for every test record and write the results as JSONL.

    Args:
        args: Namespace providing ``output_dir``, ``base_model``, ``test_data``,
            ``model_path`` (checkpoint file holding a ``state_dict``) and
            ``max_len``.

    Side effects:
        Overwrites each record's ``'output'`` with the predicted class and
        writes all records to ``<output_dir>/hate_you_6.jsonl``.
    """
    os.makedirs(args.output_dir, exist_ok=True)

    # The tokenizer must get the same extra special tokens as during training
    # so the vocabulary size matches the saved embedding matrix.
    tokenizer = AutoTokenizer.from_pretrained(args.base_model)
    tokenizer.add_special_tokens(special_tokens_dict)

    test_data = jsonlload(args.test_data)

    model = UnethicalExpressionClassifier(args, 2, len(tokenizer))
    model.load_state_dict(torch.load(args.model_path, map_location=device))
    model.to(device)
    model.eval()

    for data in tqdm(test_data):
        tokenized_data = tokenizer(data['input'], padding='max_length', max_length=args.max_len, truncation=True)

        # Wrap in a list to add the batch dimension (batch size 1).
        input_ids = torch.tensor([tokenized_data['input_ids']]).to(device)
        attention_mask = torch.tensor([tokenized_data['attention_mask']]).to(device)

        with torch.no_grad():
            _, logits = model(input_ids, attention_mask)
        predictions = torch.argmax(logits, dim=-1)
        data['output'] = int(predictions[0])

    # os.path.join keeps the path valid when output_dir has no trailing slash.
    # NOTE(review): the file name is fixed here while the scoring step reads
    # args.pred_data; they only coincide with the default arguments.
    jsonldump(test_data, os.path.join(args.output_dir, 'hate_you_6.jsonl'))

In [None]:
# Fine-tune the classifier; a checkpoint is written after every epoch.
train_unethical_expression_classifier(args)

In [None]:
# Run inference on the test file with the checkpoint at args.model_path and
# write the predictions as JSONL into args.output_dir.
demo_unethical_expression_classifier(args)

2


100%|██████████| 2072/2072 [00:21<00:00, 97.41it/s]


In [None]:
# Score args.pred_data against the labels in args.test_data (prints F1 scores).
test_unethical_expression_classifier(args)