In [None]:
import numpy as np
import pandas as pd
import re
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import RobertaTokenizer, RobertaForSequenceClassification, RobertaModel
from torch.optim import Adam, lr_scheduler
import torch.nn.functional as F
import sys
sys.path.append('../')

import os
import argparse
import json
import torch.nn as nn

from util import *
from losses import LabelSmoothingCrossEntropy
from augment import *

from torch.utils.data.dataset import ConcatDataset
# from torch_model import SupConRobertaNet, SupConMultiRobertaNet
from torch.utils.data.sampler import RandomSampler

from torch_model import MLPRobertaNet, CNNRobertaNet


In [2]:
class PetDataset(Dataset):
    """Map-style dataset yielding ``(text, label)`` pairs from a DataFrame.

    The text column is the first column whose name contains ``"SE"`` and the
    label column is the first column whose name contains ``"label_id"``.
    Both lookups are case-sensitive substring matches; an ``IndexError`` is
    raised when no column matches.

    Attributes set on construction:
        df: the wrapped DataFrame (stored by reference, not copied).
        SE_index: positional index of the text column.
        label_index: positional index of the label column.
        Num_class: number of distinct non-null values in the label column.
    """

    def __init__(self, df):
        self.df = df

        text_positions = []
        label_positions = []
        for position, column_name in enumerate(df.columns):
            if "SE" in column_name:
                text_positions.append(position)
            if "label_id" in column_name:
                label_positions.append(position)

        # Only the first match of each kind is used.
        self.SE_index = text_positions[0]
        self.label_index = label_positions[0]

        label_column_name = df.columns[self.label_index]
        label_counts = df[label_column_name].value_counts()
        self.Num_class = len(label_counts)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at positional row ``idx``."""
        return (
            self.df.iloc[idx, self.SE_index],
            self.df.iloc[idx, self.label_index],
        )
    

In [3]:
# Number of samples per mini-batch.
BATCH_SIZE = 8
# Sequence length the classifier is built for (passed to it as max_seq_length).
MAX_SEQ_LEN = 512

In [4]:
# Load the train / test splits from CSV.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('files/train3.csv', 'files/test3.csv')
)

# Number of distinct labels observed in the training split; sizes the classifier head.
label_counts = train_df.label_id.value_counts()
Num_Label = len(label_counts)

# Quick sanity report: class count, then the shape of each split.
print(Num_Label)
for split_frame in (train_df, test_df):
    print(split_frame.shape)

18
(4674, 14)
(813, 15)


In [5]:
# Wrap the training frame and batch it with the shared BATCH_SIZE constant so
# the loader and the rest of the notebook cannot drift apart.
train_dataset = PetDataset(train_df)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [6]:
# Use the GPU when one is present, otherwise fall back to CPU so the notebook
# still runs (slowly) on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Directory holding the pretrained tokenizer / encoder weights.
pretrained_path = './pretrained_without_wiki/'
tokenizer = RobertaTokenizer.from_pretrained(pretrained_path, do_lower_case=False)

# Classifier built on the pretrained encoder; MLPRobertaNet is imported as an
# alternative head.
model = CNNRobertaNet(path=pretrained_path,
                      embedding_dim=768,
                      max_seq_length=MAX_SEQ_LEN,
                      num_class=Num_Label)
model.to(device)

# Plain cross-entropy on the raw model outputs.
# Alternative: criterion = LabelSmoothingCrossEntropy()
criterion = torch.nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [7]:
def _inverse_epoch_decay(epoch):
    """Learning-rate multiplier: 1 at epoch 0, 1/2 at epoch 4, 1/3 at epoch 8, ..."""
    return 1 / ((epoch / 4) + 1)


# Adam over every model parameter; the base rate is scaled once per
# scheduler.step() by the multiplier above.
optimizer = Adam(model.parameters(), lr=8e-5)
scheduler = lr_scheduler.LambdaLR(optimizer=optimizer, lr_lambda=_inverse_epoch_decay)

In [8]:
def model_eval(test_df):
    """Evaluate the global ``model`` on ``test_df`` and print its accuracy.

    Relies on the notebook-level ``model``, ``tokenizer``, ``device``,
    ``BATCH_SIZE`` and ``MAX_SEQ_LEN``. The model is switched to eval mode and
    is left in eval mode on return; callers that continue training must call
    ``model.train()`` again.

    Args:
        test_df: DataFrame accepted by ``PetDataset`` (a text column whose
            name contains "SE" and a label column whose name contains
            "label_id").

    Returns:
        float: fraction of correctly classified rows, or ``nan`` when
        ``test_df`` has no rows.
    """
    model.eval()

    test_dataset = PetDataset(test_df)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

    total_len = 0
    total_correct = 0

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for text, label in test_loader:
            encoded_list = [tokenizer.encode(t, max_length=MAX_SEQ_LEN, truncation=True) for t in text]
            # Right-pad every sequence to MAX_SEQ_LEN.
            # NOTE(review): pads with id 0, mirroring the training loop; confirm
            # this matches what the model treats as padding (tokenizer.pad_token_id).
            padded_list = [e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN])) for e in encoded_list]
            sample = torch.tensor(padded_list).to(device)
            # `label` is already a tensor from the DataLoader; just move it.
            label = label.to(device)

            logits = model(sample=sample)

            # argmax over the class dimension; softmax is monotonic, so it is
            # not needed to pick the predicted class.
            pred = torch.argmax(logits, dim=1)
            total_correct += pred.eq(label).sum().item()
            total_len += len(label)

    # Guard against an empty test set instead of raising ZeroDivisionError.
    accuracy = total_correct / total_len if total_len else float('nan')
    print('Test accuracy: ', accuracy)
    return accuracy

In [9]:
# Fine-tune for a fixed number of epochs, evaluating on the test split after
# each one. The learning rate is decayed once per epoch by `scheduler`.
epochs = 7
for epoch in range(epochs):
    losses = AverageMeter()
    # model_eval() leaves the model in eval mode, so re-enable training
    # behaviour at the start of every epoch.
    model.train()
    for step, (text, label) in enumerate(train_loader, start=1):
        encoded_list = [tokenizer.encode(t, add_special_tokens=True, max_length=MAX_SEQ_LEN, truncation=True) for t in text]
        # Right-pad every sequence to MAX_SEQ_LEN.
        # NOTE(review): pads with id 0; confirm this matches what the model
        # treats as padding (tokenizer.pad_token_id).
        padded_list = [e[:MAX_SEQ_LEN] + [0] * (MAX_SEQ_LEN - len(e[:MAX_SEQ_LEN])) for e in encoded_list]
        sample = torch.tensor(padded_list).to(device)
        # `label` is already a tensor from the DataLoader; just move it.
        label = label.to(device)

        outputs = model(sample=sample)
        loss = criterion(outputs, label)
        # Weight by the real batch size: the last batch can be smaller.
        losses.update(loss.item(), len(label))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the running average loss after every 100 completed batches.
        if step % 100 == 0:
            print('Train: [{0}][{1}/{2}]\t'
                  'loss {loss.avg:.5f}'.format(
                   epoch, step, len(train_loader), loss=losses))

    scheduler.step()
    model_eval(test_df)
    print('***********************************')

# Final report (same weights as the last per-epoch evaluation).
model_eval(test_df)

  from ipykernel import kernelapp as app


Train: [0][100/585]	loss 2.76273
Train: [0][200/585]	loss 2.75689
Train: [0][300/585]	loss 2.75086
Train: [0][400/585]	loss 2.75258
Train: [0][500/585]	loss 2.75721




Test accuracy:  0.1992619926199262
***********************************
Train: [1][100/585]	loss 2.55303
Train: [1][200/585]	loss 2.51337
Train: [1][300/585]	loss 2.48054
Train: [1][400/585]	loss 2.47218
Train: [1][500/585]	loss 2.45695
Test accuracy:  0.22755227552275523
***********************************
Train: [2][100/585]	loss 2.17318
Train: [2][200/585]	loss 2.20296
Train: [2][300/585]	loss 2.19177
Train: [2][400/585]	loss 2.15886
Train: [2][500/585]	loss 2.13957
Test accuracy:  0.42435424354243545
***********************************
Train: [3][100/585]	loss 1.83167
Train: [3][200/585]	loss 1.80540
Train: [3][300/585]	loss 1.79732
Train: [3][400/585]	loss 1.77206
Train: [3][500/585]	loss 1.77757
Test accuracy:  0.4489544895448955
***********************************
Train: [4][100/585]	loss 1.55736
Train: [4][200/585]	loss 1.54889
Train: [4][300/585]	loss 1.54460
Train: [4][400/585]	loss 1.56000
Train: [4][500/585]	loss 1.56283
Test accuracy:  0.45264452644526443
******************