In [22]:
import numpy as np
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from torch.optim import Adam, lr_scheduler
import torch.nn.functional as F
import sys
sys.path.append('../')

import os
import argparse
import json
import torch.nn as nn

from util import *
from losses import LabelSmoothingCrossEntropy
from augment import *

from torch.utils.data.dataset import ConcatDataset
# from torch_model import SupConRobertaNet, SupConMultiRobertaNet
from torch.utils.data.sampler import RandomSampler

from torch_model import MLPRobertaNet, CNNRobertaNet, SIMRobertaNet


In [23]:
class PetDataset(Dataset):
    """Map-style dataset yielding ``(text, label)`` pairs from a DataFrame.

    The text column is the first column whose name contains ``"SE"`` and the
    label column is the first column whose name contains ``"label_id"``.

    Attributes:
        df: The wrapped DataFrame.
        SE_index: Positional index of the text column.
        label_index: Positional index of the label column.
        Num_class: Number of distinct values found in the label column.
    """

    def __init__(self, df):
        self.df = df

        # Collect the positions of every matching column, then keep the first.
        se_positions = []
        label_positions = []
        for position, column_name in enumerate(df.columns):
            if "SE" in column_name:
                se_positions.append(position)
            if "label_id" in column_name:
                label_positions.append(position)
        self.SE_index = se_positions[0]
        self.label_index = label_positions[0]

        label_column_name = df.columns[self.label_index]
        self.Num_class = len(df[label_column_name].value_counts())

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at row position ``idx``."""
        return (
            self.df.iloc[idx, self.SE_index],
            self.df.iloc[idx, self.label_index],
        )
    

In [24]:
# Mini-batch size; the training loop passes it to the running loss meter.
BATCH_SIZE = 8
# Maximum token length handed to the model constructor as max_seq_length
# (the encoding/padding cells currently hard-code the same value, 512).
MAX_SEQ_LEN = 512

In [25]:
# Load the train/test splits and report the class count plus both frame shapes.
train_df = pd.read_csv('files/train3.csv')
test_df = pd.read_csv('files/test3.csv')

# Number of distinct label ids present in the training split.
Num_Label = len(train_df.label_id.value_counts())

for summary_value in (Num_Label, train_df.shape, test_df.shape):
    print(summary_value)

18
(4674, 14)
(813, 14)


In [26]:
# Maps each integer label id (0-17) to a Korean-language description string.
# The strings are tokenized and fed to the model (isLabel=True) in later cells,
# so their exact text, including surrounding spaces, is part of the input data.
label_info_dict = {16: ' 치주 질환 / 치주염 (젖니 유전자 좌로 유래하는 것 포함)치아 및 구강 질환 ',
 12: ' 세균성 장염  소화기 질환 ',
 15: ' 췌장염  간 · 담도계 및 췌장 질환',
 13: ' 유선 종양 / 유방 종괴  생식기 질환 ',
 14: ' 자궁 축농증  생식기 질환 ',
 11: ' 판막증 (의심 포함한 심장 잡음 + 심부전 증후 자) 순환기 질환 ',
 7: ' 소화관 이물 / 섭취  소화기 질환 ',
 10: ' 만성 신장 질환 (신부전 포함)  비뇨기과 질환 ',
 3: ' 구토 / 설사 / 혈변 (원인 미정)  소화기 질환 ',
 5: ' 방광염  비뇨기과 질환 ',
 9: '슬개골 (아) 탈구 근육 골격 질환 ',
 1: ' 경련 발작 (원인 미정)  신경 질환 ',
 2: ' 고양이 하부 요로 질환 FUS · FLUTD  비뇨기과 질환 ',
 17: ' 폐렴  호흡기 질환 ',
 0: ' 간 / 담도 / 췌장의 종양  간 · 담도계 및 췌장 질환',
 4: ' 당뇨병 내분비 질환 ',
 8: ' 수막염 / 수막 뇌염 / 뇌염  신경 질환 ',
 6: ' 빈혈 (면역 개입 용혈성) IMHA 혈액 및 조혈기의 질환 '}

In [27]:

# Build the tokenizer, the classifier and the loss on the CPU first; a later
# cell moves the model, the loss and the label tensor to the GPU.
device = torch.device("cpu")
# device = torch.device('cpu')
# pretrained_path = './pretrained_without_wiki'
# Directory the tokenizer is loaded from; the same path is passed to the model.
pretrained_path = './pretrained_without_wiki/'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_path, do_lower_case=False)
# donwstream_class_num = task_label_dict['diags_id']
# model = MLPRobertaNet(path=pretrained_path, 
# SIMRobertaNet is imported from the project-local torch_model module; its
# internals are not visible here. NOTE(review): presumably it wraps a RoBERTa
# encoder producing embedding_dim-sized vectors - confirm against torch_model.
model = SIMRobertaNet(path=pretrained_path,                       
                              embedding_dim=768,
                              max_seq_length=MAX_SEQ_LEN, 
                              num_class=Num_Label)
model.to(device)
# Plain cross-entropy over the logits; label smoothing is kept as a commented-out alternative.
criterion = torch.nn.CrossEntropyLoss()
# criterion = LabelSmoothingCrossEntropy()
criterion = criterion.to(device)

In [28]:
# def reset_parameters(model):
#     for p in model.parameters():
#         if p.dim() > 1:
#             nn.init.xavier_uniform_(p)
            
# reset_parameters(model)            

In [29]:
# for param in model.parameters() :
#     print(param)

In [30]:
# Embed every label description once with the CPU-resident model and keep the
# result as a NumPy array keyed by label id.
label_tensor_dict = {}
device = torch.device("cpu")

# Inference only: disable dropout so the embeddings are deterministic (the
# training cell switches back with model.train()) and skip building an
# autograd graph for these forward passes.
model.eval()
with torch.no_grad():
    for label, text in label_info_dict.items():
        encoded = tokenizer.encode(text, add_special_tokens=True,
                                   max_length=MAX_SEQ_LEN, truncation=True)
        # Right-pad to the fixed sequence length.
        # NOTE(review): pads with token id 0; confirm this matches
        # tokenizer.pad_token_id for this checkpoint.
        encoded = encoded[:MAX_SEQ_LEN]
        padded = encoded + [0] * (MAX_SEQ_LEN - len(encoded))
        sample = torch.tensor([padded]).to(device)  # batch of one sequence
        outputs = model(sample=sample, isLabel=True)
        label_tensor_dict[label] = outputs.detach().numpy()

In [31]:
def f1(x) :
    """Sort-key helper: return the first element of ``x`` (e.g. the key of a ``(key, value)`` pair)."""
    first_element = x[0]
    return first_element
# List of (label_id, embedding) pairs ordered by ascending label id.
sorted_label_tensor_dict = sorted(label_tensor_dict.items(), key=f1)

In [32]:
# sorted_label_tensor_dict[1]

In [33]:
# Stack the per-label embeddings along axis 0 in ascending label-id order.
# A single concatenate avoids seeding the result with an uninitialized
# np.empty row (which survived whenever label id 0 was absent) and avoids
# re-copying the growing array on every iteration.
label_array = np.concatenate(
    [narray for _, narray in sorted_label_tensor_dict], axis=0
)

In [34]:
# label_tensor = torch.from_numpy(label_array.reshape(768, -1))
# Random (768 x Num_Label) projection used to turn model outputs into logits.
# The column count follows Num_Label instead of a literal 18 so it cannot
# drift from the number of classes found in the training data.
# NOTE(review): this tensor is not registered with the optimizer in the
# visible cells, so it appears to stay fixed during training - confirm intent.
label_tensor = torch.normal(0, 0.1, size=(768, Num_Label))
label_tensor.size()

torch.Size([768, 18])

In [35]:
# label_tensor = F.normalize(label_tensor, dim=0)

In [36]:
def get_label_tensor() :
    """Embed every label description with the global ``model`` and stack the results.

    Reads the notebook globals ``model``, ``tokenizer``, ``label_info_dict``
    and ``MAX_SEQ_LEN``. The model is moved to the GPU when one is available
    (otherwise it stays on / moves to the CPU), matching the original side
    effect of relocating the model.

    Returns:
        A tensor with one column per label id, in ascending label-id order,
        located on the device the model was moved to. With 768-dimensional
        embeddings and 18 labels the shape is ``(768, 18)``.

    Notes:
        The previous implementation stacked the embeddings row-wise and then
        called ``reshape(768, -1)``. A reshape of a ``(num_labels, 768)``
        array does not transpose it, so columns no longer corresponded to
        labels. Embeddings are now stacked directly as columns.
        Assumes ``model(sample=..., isLabel=True)`` yields one embedding
        vector per input sequence (e.g. shape ``(1, 768)``) - TODO confirm
        against ``SIMRobertaNet``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Run in inference mode (no dropout, no autograd graph), then restore
    # whatever train/eval mode the caller had set.
    was_training = model.training
    model.eval()
    label_tensor_dict = {}
    try:
        with torch.no_grad():
            for label, text in label_info_dict.items():
                encoded = tokenizer.encode(text, add_special_tokens=True,
                                           max_length=MAX_SEQ_LEN, truncation=True)
                encoded = encoded[:MAX_SEQ_LEN]
                # NOTE(review): pads with token id 0; confirm this matches
                # tokenizer.pad_token_id for this checkpoint.
                padded = encoded + [0] * (MAX_SEQ_LEN - len(encoded))
                sample = torch.tensor([padded]).to(device)
                outputs = model(sample=sample, isLabel=True)
                label_tensor_dict[label] = outputs.detach()
    finally:
        model.train(was_training)

    # Column j holds the embedding of the j-th smallest label id.
    ordered_embeddings = [
        label_tensor_dict[label].reshape(-1) for label in sorted(label_tensor_dict)
    ]
    label_tensor = torch.stack(ordered_embeddings, dim=1)
    return label_tensor

In [37]:
# Move the model, the loss and the label projection to the training device.
# Falls back to the CPU when no CUDA device is present instead of raising.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = criterion.to(device)
label_tensor = label_tensor.to(device)

In [38]:
# Print every trainable parameter's name followed by its shape.
# named_parameters() pairs each name with its own tensor; zipping
# parameters() with state_dict() keys misaligns as soon as the state dict
# also contains buffers.
for state, param in model.named_parameters():
    print(state)
    print(param.size())

encoder.embeddings.word_embeddings.weight
torch.Size([40000, 768])
encoder.embeddings.position_embeddings.weight
torch.Size([514, 768])
encoder.embeddings.token_type_embeddings.weight
torch.Size([1, 768])
encoder.embeddings.LayerNorm.weight
torch.Size([768])
encoder.embeddings.LayerNorm.bias
torch.Size([768])
encoder.encoder.layer.0.attention.self.query.weight
torch.Size([768, 768])
encoder.encoder.layer.0.attention.self.query.bias
torch.Size([768])
encoder.encoder.layer.0.attention.self.key.weight
torch.Size([768, 768])
encoder.encoder.layer.0.attention.self.key.bias
torch.Size([768])
encoder.encoder.layer.0.attention.self.value.weight
torch.Size([768, 768])
encoder.encoder.layer.0.attention.self.value.bias
torch.Size([768])
encoder.encoder.layer.0.attention.output.dense.weight
torch.Size([768, 768])
encoder.encoder.layer.0.attention.output.dense.bias
torch.Size([768])
encoder.encoder.layer.0.attention.output.LayerNorm.weight
torch.Size([768])
encoder.encoder.layer.0.attention.output.

In [39]:
# Shuffled training batches. The batch size comes from BATCH_SIZE so the
# loader and the loss bookkeeping in the training loop cannot drift apart.
train_dataset = PetDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [40]:
def _lr_decay(epoch):
    """Multiplicative LR factor for a scheduler step: 1 / (epoch / 4 + 1)."""
    return 1 / ((epoch/4) + 1)


# Adam over all model parameters with a base learning rate of 8e-5; the
# scheduler rescales that rate with _lr_decay each time scheduler.step() runs.
optimizer = Adam(params=model.parameters(), lr=0.00008)
scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=_lr_decay)

In [41]:
def model_eval(test_df, label_tensor) :
    """Evaluate the global ``model`` on ``test_df`` and print the accuracy.

    Reads the notebook globals ``model``, ``tokenizer``, ``PetDataset``,
    ``BATCH_SIZE`` and ``MAX_SEQ_LEN``. Leaves the model in eval mode; the
    caller must call ``model.train()`` before training continues.

    Args:
        test_df: DataFrame accepted by ``PetDataset`` (a text column whose
            name contains "SE" and a label column whose name contains
            "label_id").
        label_tensor: Matrix multiplied onto the model output to obtain one
            logit per class (the notebook builds it as 768 x num_classes).

    Returns:
        The accuracy as a float in [0, 1], or NaN when ``test_df`` has no
        rows. The same value is printed.
    """
    model.eval()
    # Follow the model's actual device rather than a notebook-global variable
    # that other cells reassign.
    device = next(model.parameters()).device
    label_tensor = label_tensor.to(device)

    test_dataset = PetDataset(test_df)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    total_len = 0
    total_correct = 0

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for text, label in test_loader:
            encoded_list = [tokenizer.encode(t, max_length=MAX_SEQ_LEN, truncation=True) for t in text]
            # Right-pad every sequence to the fixed length.
            # NOTE(review): pads with token id 0; confirm this matches
            # tokenizer.pad_token_id for this checkpoint.
            padded_list = [e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN]))
                           for e in encoded_list]
            sample = torch.tensor(padded_list).to(device)
            label = label.to(device)  # already a tensor from the default collate

            outputs = model(sample=sample, isLabel=False)
            logits = torch.matmul(outputs, label_tensor)

            # argmax of the logits equals argmax of their softmax.
            pred = torch.argmax(logits, dim=1)
            total_correct += pred.eq(label).sum().item()
            total_len += len(label)

    accuracy = total_correct / total_len if total_len else float("nan")
    print('Test accuracy: ', accuracy)
    return accuracy

In [42]:
# Fine-tune the model for a fixed number of epochs, evaluating on the test
# split after every epoch.
epochs = 15

for epoch in range(epochs):
    losses = AverageMeter()  # running loss average for this epoch (from util)
    total_loss = 0
    total_len = 0
    total_correct = 0
    total_count = 0  # batches processed so far in this epoch

    # model_eval() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    for text, label in train_loader:
        encoded_list = [tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)
                        for t in text]
        # Right-pad every sequence to the fixed length.
        # NOTE(review): pads with token id 0; confirm this matches
        # tokenizer.pad_token_id for this checkpoint.
        padded_list = [e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN]))
                       for e in encoded_list]
        sample = torch.tensor(padded_list)
        # label is already a tensor from the default collate; no re-wrapping needed.
        sample, label = sample.to(device), label.to(device)

        outputs = model(sample=sample, isLabel=False)
        # Project the model output onto the label matrix to get class logits.
        outputs = torch.matmul(outputs, label_tensor)

        # argmax of the logits equals argmax of their softmax.
        pred = torch.argmax(outputs, dim=1)
        correct = pred.eq(label)
        loss = criterion(outputs, label)
        # Weight by the real batch size: the last batch of an epoch can be short.
        # assumes AverageMeter.update(value, n) - TODO confirm against util.
        losses.update(loss.item(), len(label))

        total_correct += correct.sum().item()
        total_len += len(label)
        total_loss += loss.item()
        total_count += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # total_count is already incremented, so report it as-is.
        if total_count % 100 == 0:
            print('Train: [{0}][{1}/{2}]\t'
                  'loss {loss.avg:.5f}'.format(
                   epoch, total_count, len(train_loader), loss=losses))

    scheduler.step()
    model_eval(test_df, label_tensor)
    # Alternative kept from the original: rebuild the label matrix from the
    # fine-tuned encoder before evaluating.
#     label_tensor = get_label_tensor()
#     model_eval(test_df, label_tensor)
    print('***********************************')

# Final evaluation; model_eval requires the label matrix as second argument.
model_eval(test_df, label_tensor)



Train: [0][100/585]	loss 2.75637
Train: [0][200/585]	loss 2.74534
Train: [0][300/585]	loss 2.72913
Train: [0][400/585]	loss 2.61841
Train: [0][500/585]	loss 2.44493




Test accuracy:  0.5990159901599016
***********************************
Train: [1][100/585]	loss 1.29158
Train: [1][200/585]	loss 1.27547
Train: [1][300/585]	loss 1.22379
Train: [1][400/585]	loss 1.20242
Train: [1][500/585]	loss 1.18251
Test accuracy:  0.6371463714637147
***********************************
Train: [2][100/585]	loss 0.83387
Train: [2][200/585]	loss 0.83700
Train: [2][300/585]	loss 0.82515
Train: [2][400/585]	loss 0.81222
Train: [2][500/585]	loss 0.80077
Test accuracy:  0.6555965559655597
***********************************
Train: [3][100/585]	loss 0.60680
Train: [3][200/585]	loss 0.57544
Train: [3][300/585]	loss 0.56977
Train: [3][400/585]	loss 0.56172
Train: [3][500/585]	loss 0.56131
Test accuracy:  0.6875768757687577
***********************************
Train: [4][100/585]	loss 0.37472
Train: [4][200/585]	loss 0.36471
Train: [4][300/585]	loss 0.36664
Train: [4][400/585]	loss 0.36864
Train: [4][500/585]	loss 0.36696
Test accuracy:  0.6642066420664207
*********************

KeyboardInterrupt: 