In [1]:
import numpy as np
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from torch.optim import Adam, lr_scheduler
import torch.nn.functional as F
import sys
sys.path.append('../')

import os
import argparse
import json
import torch.nn as nn

from util import *
from losses import LabelSmoothingCrossEntropy, SupConLoss
from augment import *

from torch.utils.data.dataset import ConcatDataset
# from torch_model import SupConRobertaNet, SupConMultiRobertaNet
from torch.utils.data.sampler import RandomSampler
from torch_model import MLPRobertaNet, CNNRobertaNet, SIMRobertaNet, ContraRobertaNet
from preprop import *
from losses import FocalLoss

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
class PetDataset(Dataset):
    """Serve ``(text, label)`` pairs directly from a pandas DataFrame.

    The text column is the first column whose name contains ``"SE"`` and the
    label column is the first one whose name contains ``"label_id"``.

    Attributes:
        df: The wrapped DataFrame (kept by reference, not copied).
        SE_index: Positional index of the text column.
        label_index: Positional index of the label column.
        Num_class: Number of distinct values found in the label column.
    """

    def __init__(self, df):
        column_names = list(df.columns)
        text_positions = [pos for pos, name in enumerate(column_names) if "SE" in name]
        label_positions = [pos for pos, name in enumerate(column_names) if "label_id" in name]

        self.df = df
        # Indexing with [0] raises IndexError when no column matches.
        self.SE_index = text_positions[0]
        self.label_index = label_positions[0]

        label_column = df.columns[self.label_index]
        self.Num_class = len(df[label_column].value_counts())

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at positional row ``idx``."""
        return (
            self.df.iloc[idx, self.SE_index],
            self.df.iloc[idx, self.label_index],
        )
    

In [3]:
# Mini-batch size used when building the training DataLoader.
BATCH_SIZE = 14
# Maximum token sequence length. NOTE(review): the visible cells hard-code 512
# instead of reading this constant - keep the two in sync.
MAX_SEQ_LEN = 512

In [4]:
# Load the train/test splits, normalise the free-text "SE" column with the
# project's preprocess() helper, and report basic dataset statistics.
train_df = pd.read_csv('files/train2.csv')
test_df = pd.read_csv('files/test2.csv')
# Alternative dataset used previously:
# train_df = pd.read_csv('data/files/disease_train.csv')
# test_df = pd.read_csv('data/files/disease_test.csv')

# str() guards against non-string cells (e.g. NaN) before preprocessing.
for split_df in (train_df, test_df):
    split_df.SE = split_df.SE.apply(lambda raw_text: preprocess(str(raw_text)))

# Number of distinct labels present in the training split.
Num_Label = len(train_df.label_id.value_counts())
for summary in (Num_Label, train_df.shape, test_df.shape):
    print(summary)

10
(4280, 5)
(476, 5)


In [5]:
# Lookup table from integer label id (0-17) to a Korean description string.
# Each string appears to hold a diagnosis name followed by its disease
# category - TODO confirm with the data owner.
# NOTE(review): the table has 18 entries while the loaded training data reports
# 10 distinct labels, and no other visible cell reads this dict - confirm
# whether the ids correspond to `label_id` in the CSV files.
label_info_dict = {16: ' 치주 질환 / 치주염 (젖니 유전자 좌로 유래하는 것 포함)치아 및 구강 질환 ',
 12: ' 세균성 장염  소화기 질환 ',
 15: ' 췌장염  간 · 담도계 및 췌장 질환',
 13: ' 유선 종양 / 유방 종괴  생식기 질환 ',
 14: ' 자궁 축농증  생식기 질환 ',
 11: ' 판막증 (의심 포함한 심장 잡음 + 심부전 증후 자) 순환기 질환 ',
 7: ' 소화관 이물 / 섭취  소화기 질환 ',
 10: ' 만성 신장 질환 (신부전 포함)  비뇨기과 질환 ',
 3: ' 구토 / 설사 / 혈변 (원인 미정)  소화기 질환 ',
 5: ' 방광염  비뇨기과 질환 ',
 9: '슬개골 (아) 탈구 근육 골격 질환 ',
 1: ' 경련 발작 (원인 미정)  신경 질환 ',
 2: ' 고양이 하부 요로 질환 FUS · FLUTD  비뇨기과 질환 ',
 17: ' 폐렴  호흡기 질환 ',
 0: ' 간 / 담도 / 췌장의 종양  간 · 담도계 및 췌장 질환',
 4: ' 당뇨병 내분비 질환 ',
 8: ' 수막염 / 수막 뇌염 / 뇌염  신경 질환 ',
 6: ' 빈혈 (면역 개입 용혈성) IMHA 혈액 및 조혈기의 질환 '}

In [6]:
# Prefer the GPU but fall back to the CPU so the notebook still runs on hosts
# without CUDA (previously the device was hard-coded to "cuda").
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Directory holding the pretrained RoBERTa weights and vocabulary.
# Alternative checkpoint tried earlier: './pretrained_without_wiki'
pretrained_path = './pretrained_0818/'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_path, do_lower_case=False)

# ContraRobertaNet comes from the project's torch_model module; later cells call
# it with iscontra=True (contrastive stage) or iscontra=False (classification).
# Alternative model tried earlier: MLPRobertaNet(path=pretrained_path, ...)
model = ContraRobertaNet(path=pretrained_path,
                         embedding_dim=768,
                         num_class=Num_Label)
model.to(device)
# criterion = torch.nn.CrossEntropyLoss()
# criterion = LabelSmoothingCrossEntropy()
# criterion = SupConLoss()
# criterion = criterion.to(device)

ContraRobertaNet(
  (encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-

In [7]:
# def reset_parameters(model):
#     for p in model.parameters():
#         if p.dim() > 1:
#             nn.init.xavier_uniform_(p)
            
# reset_parameters(model)            

In [8]:
# for param in model.parameters() :
#     print(param)

In [9]:
# for param, state in zip(model.parameters(), model.state_dict()) :
#     print(state)
#     print(param.size())

In [10]:
def model_eval(test_df, batch_size=8, max_seq_len=512):
    """Evaluate the notebook-level ``model`` on ``test_df`` and print its accuracy.

    Relies on the notebook globals ``model``, ``tokenizer`` and ``device``.
    The model is switched to eval mode and is left in eval mode on return,
    so callers must call ``model.train()`` before resuming training.

    Args:
        test_df: DataFrame accepted by ``PetDataset`` (needs a text column
            containing "SE" and a label column containing "label_id").
        batch_size: Evaluation batch size.
        max_seq_len: Length every token sequence is truncated/padded to.

    Returns:
        The fraction of correctly classified rows, or NaN when ``test_df``
        yields no samples.
    """
    model.eval()

    test_dataset = PetDataset(test_df)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    total_len = 0
    total_correct = 0

    # Inference only: disabling autograd avoids building a graph per batch.
    with torch.no_grad():
        for text, label in test_loader:
            encoded_list = [tokenizer.encode(t, max_length=max_seq_len, truncation=True) for t in text]
            # NOTE(review): sequences are padded with id 0 and no attention mask
            # is passed, while the model printout shows padding_idx=1. Kept
            # as-is so inputs match what the checkpoints were trained on.
            padded_list = [
                e[:max_seq_len] + [0] * (max_seq_len - len(e[:max_seq_len]))
                for e in encoded_list
            ]
            sample = torch.tensor(padded_list).to(device)
            # `label` is already a tensor; re-wrapping it in torch.tensor()
            # only made a copy and triggered a warning.
            label = label.to(device)
            logits = model(sample=sample, iscontra=False)

            # softmax is monotonic, so argmax over the raw logits is identical.
            pred = torch.argmax(logits, dim=1)
            total_correct += pred.eq(label).sum().item()
            total_len += len(label)

    accuracy = total_correct / total_len if total_len else float('nan')
    print('Test accuracy: ', accuracy)
    return accuracy

In [11]:
# for param, state in zip(model.parameters(), model.state_dict()) :
#     print(state)
#     print(param.size())
# for param, state in zip(model.parameters(), model.state_dict()) :
#     print(state)
#     print(param)

In [12]:
def print_weight(model) :
    """Print the layer-5 self-attention query weight of ``model`` (debug aid).

    Every parameter whose qualified name contains
    ``encoder.encoder.layer.5.attention.self.query.weight`` is printed.

    Names are taken from ``named_parameters()``. The previous implementation
    zipped ``model.parameters()`` with ``model.state_dict()``; the state dict
    also lists buffers, so names and tensors drifted apart as soon as the
    module had any buffer and the wrong tensor could be printed.

    Args:
        model: Any ``torch.nn.Module``.
    """
    target = 'encoder.encoder.layer.5.attention.self.query.weight'
    for name, param in model.named_parameters():
        if target in name:
            print(param)

In [13]:
# Wrap the training split and serve it in shuffled mini-batches of BATCH_SIZE,
# loaded by two worker processes.
train_dataset = PetDataset(df=train_df)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
)

In [14]:
# Optimiser and schedule for the contrastive stage.
# Learning rate tried earlier: 0.00001
Learning_rate = 0.000004
optimizer = Adam(params=model.parameters(), lr=Learning_rate)
# The base learning rate is multiplied by 1 / (epoch/4 + 1) after each
# scheduler.step(), i.e. it decays as 1, 0.8, 0.667, ...
scheduler = lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda epoch: 1 / ((epoch/4)+1),
)

In [15]:
# Resume from the saved contrastive checkpoint. map_location remaps the stored
# tensors onto the active device, so a checkpoint written on a GPU machine can
# also be loaded on a CPU-only host.
model.load_state_dict(torch.load('contrastive/tune3', map_location=device))

<All keys matched successfully>

In [None]:
# Stage 1: supervised-contrastive fine-tuning with SupConLoss.
criterion = SupConLoss(temperature=1)
# criterion = SupConLoss()
criterion = criterion.to(device)
epochs = 30
# Checkpoint threshold: the model is saved only when an epoch's mean loss drops
# below this value, after which the value is lowered to that loss.
avg_loss = 0.4

for epoch in range(epochs):
    losses = AverageMeter()
    total_loss = 0
    total_count = 0
    model.train()
    for text, label in train_loader:
        encoded_list = [
            tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)
            for t in text
        ]
        # NOTE(review): sequences are padded with id 0 and no attention mask is
        # passed, while the model printout shows padding_idx=1. Kept as-is so
        # inputs stay compatible with the existing checkpoints.
        padded_list = [
            e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN]))
            for e in encoded_list
        ]
        sample = torch.tensor(padded_list).to(device)
        # `label` is already a tensor; re-wrapping it in torch.tensor() only
        # made a copy and triggered a warning.
        label = label.to(device)
        outputs = model(sample=sample, iscontra=True)
        # Insert a middle axis of size 1 before handing the features to SupConLoss.
        outputs = torch.unsqueeze(outputs, dim=1)

        loss = criterion(outputs, label)
        # Weight by the real batch size: the final batch can be smaller than
        # BATCH_SIZE. Assumes AverageMeter.update(value, n) - TODO confirm in util.
        losses.update(loss.item(), len(label))
        total_loss += loss.item()
        total_count += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    scheduler.step()
    epoch_loss = total_loss / total_count
    print('[Epoch {}/{}] Train Loss: {:.4f}'.format(epoch + 1, epochs, epoch_loss))
    if epoch_loss < avg_loss :
        avg_loss = epoch_loss
        # Overwrites the checkpoint this notebook resumed from.
        torch.save(model.state_dict(), 'contrastive/tune3')
        print('model is saved')



[Epoch 1/30] Train Loss: 0.4270
[Epoch 2/30] Train Loss: 0.4419
[Epoch 3/30] Train Loss: 0.4223
[Epoch 4/30] Train Loss: 0.4260
[Epoch 5/30] Train Loss: 0.4281
[Epoch 6/30] Train Loss: 0.4397
[Epoch 7/30] Train Loss: 0.4047


In [None]:
# torch.save(model.state_dict(), 'contrastive/tune2')

In [None]:
# Reload the saved contrastive checkpoint before training the classifier.
# map_location remaps the stored tensors onto the active device, so a
# checkpoint written on a GPU machine can also be loaded on a CPU-only host.
model.load_state_dict(torch.load('contrastive/tune3', map_location=device))

In [None]:
# Freeze every parameter whose name does not contain 'fc.' (presumably the
# 'fc.' parameters are the classification head - confirm in torch_model).
# named_parameters() keeps each name paired with its own tensor; zipping
# parameters() with state_dict() misaligns them when the module has buffers,
# because state_dict() lists buffers as well, and could freeze the wrong tensors.
for name, param in model.named_parameters():
    if 'fc.' not in name:
        param.requires_grad = False


In [None]:
# Fresh optimiser and schedule for the classification stage.
optimizer = Adam(params=model.parameters(), lr=0.00008)
# The base learning rate is multiplied by 1 / (epoch/2 + 1) after each
# scheduler.step().
scheduler = lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda epoch: 1 / ((epoch/2) + 1),
)

In [None]:
# Stage 2: train with cross-entropy on the classification output
# (iscontra=False) and evaluate on the test split after every epoch.
# criterion = FocalLoss(alpha=0.97, reduce=True)
criterion = torch.nn.CrossEntropyLoss()
criterion.to(device)
epochs = 7

for epoch in range(epochs):
    losses = AverageMeter()
    total_loss = 0
    total_len = 0
    total_correct = 0
    total_count = 0
    # model_eval() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    for text, label in train_loader:
        encoded_list = [
            tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True)
            for t in text
        ]
        # NOTE(review): sequences are padded with id 0 and no attention mask is
        # passed, while the model printout shows padding_idx=1. Kept as-is so
        # inputs stay compatible with the existing checkpoints.
        padded_list = [
            e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN]))
            for e in encoded_list
        ]
        sample = torch.tensor(padded_list).to(device)
        # `label` is already a tensor; re-wrapping it in torch.tensor() only
        # made a copy and triggered a warning.
        label = label.to(device)
        outputs = model(sample=sample, iscontra=False)

        loss = criterion(outputs, label)
        # Weight by the real batch size: the final batch can be smaller than
        # BATCH_SIZE. Assumes AverageMeter.update(value, n) - TODO confirm in util.
        losses.update(loss.item(), len(label))
        # softmax is monotonic, so argmax over the raw logits gives the same
        # prediction without the implicit-dim softmax call.
        pred = torch.argmax(outputs, dim=1)
        total_correct += pred.eq(label).sum().item()
        total_len += len(label)
        total_loss += loss.item()
        total_count += 1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    scheduler.step()
    model_eval(test_df)
    print('[Epoch {}/{}] Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch + 1, epochs, total_loss / total_count, total_correct / total_len))
#     print(tloss)  

In [None]:
# for param, state in zip(model.parameters(), model.state_dict()) :
#     if 'fc.' not in state :
#         param.requires_grad = True
#     else  :
#         param.requires_grad = False        

In [None]:
# optimizer = Adam(model.parameters(), lr=0.00001)
# scheduler = lr_scheduler.LambdaLR(
#     optimizer=optimizer, lr_lambda=lambda epoch: 1 / ((epoch/2) + 1)
# )

In [None]:
# model.train()
# criterion = FocalLoss(alpha=0.97, reduce=True)
# # criterion = torch.nn.CrossEntropyLoss()
# criterion.to(device)
# epochs = 7

# for epoch in range(epochs):
#     losses = AverageMeter()
#     total_loss = 0
#     total_len = 0
#     total_correct = 0
#     total_count = 0
#     model.train()
#     for text, label in train_loader:
# #         print(label)
#         encoded_list = [tokenizer.encode(t, add_special_tokens=True, max_length=512, truncation=True) for t in text]
#         padded_list = [e[:512] + [0] * (512-len(e[:512])) for e in encoded_list]
#         sample = torch.tensor(padded_list)
#         sample, label = sample.to(device), label.to(device)
#         label = torch.tensor(label)
#         outputs = model(sample=sample, iscontra=False)

#         loss = criterion(outputs, label)
#         losses.update(loss.item(), BATCH_SIZE)
# #         print(loss)
        
# #         total_correct += correct.sum().item()
#         total_len += len(label)
#         total_loss += loss.item()
#         total_count += 1
        
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step() 

#         if (total_count + 1) % 200 == 0:
# #             print_weight(model)
#             print('Train: [{0}][{1}/{2}]\t'
#                   'loss {loss.avg:.5f}'.format(
#                    epoch, total_count + 1, len(train_loader), loss=losses))   

#     model.train()
#     scheduler.step()
#     model_eval(test_df)
#     print('***********************************')
# #     print(tloss)  