In [1]:
import numpy as np
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from torch.optim import Adam, lr_scheduler
import torch.nn.functional as F
import sys
sys.path.append('../')

import os
import argparse
import json
import torch.nn as nn

from util import *
from losses import LabelSmoothingCrossEntropy
from augment import *

from torch.utils.data.dataset import ConcatDataset
# from torch_model import SupConRobertaNet, SupConMultiRobertaNet
from torch.utils.data.sampler import RandomSampler

from torch_model import MLPRobertaNet, CNNRobertaNet, SIMRobertaNet, CNNInnerRobertaNet


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class PetDataset(Dataset):
    """Map-style dataset that serves ``(text, label)`` pairs out of a DataFrame.

    The text column is the first column whose name contains ``"SE"`` and the
    label column is the first one whose name contains ``"label_id"``. Both are
    resolved once, as positional indices, when the dataset is built.

    Attributes:
        df: The wrapped DataFrame (kept by reference, not copied).
        SE_index: Positional index of the text column.
        label_index: Positional index of the label column.
        Num_class: Number of distinct values found in the label column.
    """

    @staticmethod
    def _first_column_containing(columns, fragment):
        """Return the position of the first column name containing ``fragment``.

        Raises:
            IndexError: If no column name contains ``fragment``.
        """
        hits = [position for position, name in enumerate(columns) if fragment in name]
        return hits[0]

    def __init__(self, df):
        self.df = df
        self.SE_index = self._first_column_containing(df.columns, "SE")
        self.label_index = self._first_column_containing(df.columns, "label_id")
        label_column = df.columns[self.label_index]
        self.Num_class = len(df[label_column].value_counts())

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` cell values of row ``idx`` (positional)."""
        return self.df.iloc[idx, self.SE_index], self.df.iloc[idx, self.label_index]
    

In [3]:
# Maximum token length; handed to the model constructor as `max_seq_length`.
MAX_SEQ_LEN = 512
# Mini-batch size; the training loop passes it to `AverageMeter.update` as the sample count.
BATCH_SIZE = 8

In [4]:
# Read the train / test splits from CSV.
train_df = pd.read_csv('files/train3.csv')
test_df = pd.read_csv('files/test3.csv')

# Count of distinct (non-null) label ids in the training split; used as the
# number of output classes further down.
Num_Label = train_df['label_id'].nunique()

print(Num_Label)
print(train_df.shape)
print(test_df.shape)

18
(4674, 14)
(813, 14)


In [5]:
# Lookup from an integer class id (keys 0-17) to a Korean description string
# that names the condition followed by its disease category.
# NOTE(review): the keys presumably correspond to the `label_id` column of the
# CSV files — confirm against the data. No other cell visible in this notebook
# reads this dict.
# The entry order and the irregular whitespace inside the strings are left
# untouched on purpose, since the strings are runtime data.
label_info_dict = {16: ' 치주 질환 / 치주염 (젖니 유전자 좌로 유래하는 것 포함)치아 및 구강 질환 ',
 12: ' 세균성 장염  소화기 질환 ',
 15: ' 췌장염  간 · 담도계 및 췌장 질환',
 13: ' 유선 종양 / 유방 종괴  생식기 질환 ',
 14: ' 자궁 축농증  생식기 질환 ',
 11: ' 판막증 (의심 포함한 심장 잡음 + 심부전 증후 자) 순환기 질환 ',
 7: ' 소화관 이물 / 섭취  소화기 질환 ',
 10: ' 만성 신장 질환 (신부전 포함)  비뇨기과 질환 ',
 3: ' 구토 / 설사 / 혈변 (원인 미정)  소화기 질환 ',
 5: ' 방광염  비뇨기과 질환 ',
 9: '슬개골 (아) 탈구 근육 골격 질환 ',
 1: ' 경련 발작 (원인 미정)  신경 질환 ',
 2: ' 고양이 하부 요로 질환 FUS · FLUTD  비뇨기과 질환 ',
 17: ' 폐렴  호흡기 질환 ',
 0: ' 간 / 담도 / 췌장의 종양  간 · 담도계 및 췌장 질환',
 4: ' 당뇨병 내분비 질환 ',
 8: ' 수막염 / 수막 뇌염 / 뇌염  신경 질환 ',
 6: ' 빈혈 (면역 개입 용혈성) IMHA 혈액 및 조혈기의 질환 '}

In [6]:

# Use the GPU when one is present, otherwise fall back to the CPU so that
# `model.to(device)` does not fail on a machine without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Directory holding the pretrained tokenizer files and encoder weights.
pretrained_path = './pretrained_without_wiki/'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_path, do_lower_case=False)

# Classifier built on the pretrained encoder. `MLPRobertaNet` (imported at the
# top of the notebook) was the previously tried alternative head.
model = CNNInnerRobertaNet(path=pretrained_path,
                           embedding_dim=768,
                           max_seq_length=MAX_SEQ_LEN,
                           num_class=Num_Label)
model.to(device)

# Plain cross-entropy; `LabelSmoothingCrossEntropy` (imported from `losses`)
# is the alternative that was tried and disabled.
criterion = torch.nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [7]:
# def reset_parameters(model):
#     for p in model.parameters():
#         if p.dim() > 1:
#             nn.init.xavier_uniform_(p)
            
# reset_parameters(model)            

In [8]:
# for param in model.parameters() :
#     print(param)

In [9]:
# Random projection from the 768-dim encoder features to one score per class.
# It is a plain tensor (not part of `model.parameters()`), so the optimizer
# never updates it; it stays fixed for the whole run.
# The class dimension follows `Num_Label` instead of a hard-coded 18 so the
# shape keeps matching the data if the label set changes.
# Disabled alternative: build it from a precomputed array, i.e.
#   torch.from_numpy(label_array.reshape(768, -1))
label_tensor = torch.normal(0, 0.1, size=(768, Num_Label))
label_tensor = label_tensor.to(device)

In [10]:
# label_tensor = F.normalize(label_tensor, dim=0)

In [11]:
# Shuffled training batches. The batch size comes from BATCH_SIZE so it stays
# in sync with the value the training loop reports to its loss meter.
train_dataset = PetDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [12]:
def _inverse_epoch_decay(epoch):
    """Learning-rate multiplier for LambdaLR: 1 at epoch 0, 1/2 at epoch 4, 1/3 at epoch 8, ..."""
    return 1 / ((epoch / 4) + 1)


# Adam over every model parameter, base learning rate 8e-5, scaled once per
# `scheduler.step()` by the multiplier above.
optimizer = Adam(model.parameters(), lr=8e-5)
scheduler = lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=_inverse_epoch_decay)

In [13]:
def model_eval(test_df, label_tensor):
    """Evaluate the notebook-level ``model`` on ``test_df`` and report accuracy.

    Relies on the globals ``model``, ``tokenizer``, ``device``, ``BATCH_SIZE``
    and ``MAX_SEQ_LEN``. The model is switched to eval mode and left there, so
    callers must call ``model.train()`` again before resuming training.

    Args:
        test_df: DataFrame accepted by ``PetDataset`` (needs a text column whose
            name contains "SE" and a label column whose name contains "label_id").
        label_tensor: Projection matrix multiplied with the transposed model
            output to obtain per-class scores; must already be on ``device``.

    Returns:
        The accuracy as a float (``nan`` when ``test_df`` has no rows). The same
        value is printed as ``Test accuracy:  <value>``.
    """
    model.eval()

    test_dataset = PetDataset(test_df)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    total_len = 0
    total_correct = 0

    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for text, label in test_loader:
            encoded_list = [
                tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)
                for t in text
            ]
            # Right-pad every sequence to MAX_SEQ_LEN with id 0.
            # NOTE(review): 0 is kept to match the training loop; confirm it is
            # the intended pad id for this tokenizer (see tokenizer.pad_token_id).
            padded_list = [
                ids[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(ids[:MAX_SEQ_LEN]))
                for ids in encoded_list
            ]
            sample = torch.tensor(padded_list).to(device)
            label = label.to(device)

            outputs = model(sample=sample, isLabel=False)
            # Project features onto the label matrix, then max-pool the last axis
            # down to a single score per class.
            outputs = torch.transpose(outputs, 1, 2)
            outputs = torch.matmul(outputs, label_tensor)
            outputs = torch.transpose(outputs, 1, 2)
            outputs = nn.AdaptiveMaxPool1d(1)(outputs)
            # Drop only the pooled axis so a batch of one sample keeps its batch dim.
            logits = torch.squeeze(outputs, -1)

            # argmax over raw scores equals argmax over softmax probabilities.
            pred = torch.argmax(logits, dim=1)
            total_correct += pred.eq(label).sum().item()
            total_len += len(label)

    accuracy = total_correct / total_len if total_len else float("nan")
    print('Test accuracy: ', accuracy)
    return accuracy

In [14]:
# Fine-tune `model` for a fixed number of epochs, evaluating on the test split
# after every epoch. Uses the notebook-level tokenizer, model, criterion,
# optimizer, scheduler, train_loader and label_tensor.
epochs = 15

for epoch in range(epochs):
    losses = AverageMeter()
    total_loss = 0
    total_len = 0
    total_correct = 0
    total_count = 0
    # model_eval() leaves the model in eval mode, so re-enable training
    # behaviour at the start of every epoch.
    model.train()
    for text, label in train_loader:
        encoded_list = [
            tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)
            for t in text
        ]
        # Right-pad every sequence to MAX_SEQ_LEN with id 0.
        # NOTE(review): 0 is kept to match model_eval(); confirm it is the
        # intended pad id for this tokenizer (see tokenizer.pad_token_id).
        padded_list = [
            ids[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(ids[:MAX_SEQ_LEN]))
            for ids in encoded_list
        ]
        sample = torch.tensor(padded_list).to(device)
        label = label.to(device)

        outputs = model(sample=sample, isLabel=False)
        # Project features onto the label matrix, then max-pool the last axis
        # down to a single score per class.
        outputs = torch.transpose(outputs, 1, 2)
        outputs = torch.matmul(outputs, label_tensor)
        outputs = torch.transpose(outputs, 1, 2)
        outputs = nn.AdaptiveMaxPool1d(1)(outputs)
        # Drop only the pooled axis so a final batch of one sample keeps its batch dim.
        outputs = torch.squeeze(outputs, -1)

        # argmax over raw scores equals argmax over softmax probabilities.
        pred = torch.argmax(outputs, dim=1)
        correct = pred.eq(label)
        loss = criterion(outputs, label)
        # Weight the running average by the real batch size; the last batch of
        # an epoch can be smaller than BATCH_SIZE.
        losses.update(loss.item(), len(label))

        total_correct += correct.sum().item()
        total_len += len(label)
        total_loss += loss.item()
        total_count += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # total_count already includes the current batch, so report it as-is.
        if total_count % 100 == 0:
            print('Train: [{0}][{1}/{2}]\t'
                  'loss {loss.avg:.5f}'.format(
                   epoch, total_count, len(train_loader), loss=losses))

    scheduler.step()
    model_eval(test_df, label_tensor)
    print('***********************************')

# The loop above already evaluates after the final epoch. The former trailing
# `model_eval(test_df)` call omitted the required `label_tensor` argument
# (TypeError) and would only have repeated that last evaluation.



Train: [0][100/585]	loss 2.78191
Train: [0][200/585]	loss 2.76701
Train: [0][300/585]	loss 2.76493
Train: [0][400/585]	loss 2.67998
Train: [0][500/585]	loss 2.48970




Test accuracy:  0.5768757687576875
***********************************
Train: [1][100/585]	loss 1.29240
Train: [1][200/585]	loss 1.24323
Train: [1][300/585]	loss 1.23904
Train: [1][400/585]	loss 1.19886
Train: [1][500/585]	loss 1.17764
Test accuracy:  0.6236162361623616
***********************************
Train: [2][100/585]	loss 0.78204
Train: [2][200/585]	loss 0.83395
Train: [2][300/585]	loss 0.83467
Train: [2][400/585]	loss 0.83573
Train: [2][500/585]	loss 0.82734
Test accuracy:  0.6346863468634686
***********************************
Train: [3][100/585]	loss 0.58786
Train: [3][200/585]	loss 0.54488
Train: [3][300/585]	loss 0.56643
Train: [3][400/585]	loss 0.55107
Train: [3][500/585]	loss 0.55161
Test accuracy:  0.6666666666666666
***********************************
Train: [4][100/585]	loss 0.32494
Train: [4][200/585]	loss 0.32455
Train: [4][300/585]	loss 0.32234
Train: [4][400/585]	loss 0.34154
Train: [4][500/585]	loss 0.33797
Test accuracy:  0.6543665436654367
*********************

Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/usr/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, b

Traceback (most recent call last):
  File "/home/beomgon2/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3343, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-14-0a451bfad737>", line 35, in <module>
    losses.update(loss.item(), BATCH_SIZE)
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/beomgon2/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2044, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/beomgon2/.local/lib/python3.6/site-packages/IPython/core/ultratb.py", line 1169, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/home/beomgon2/.local/lib/p

TypeError: object of type 'NoneType' has no len()