In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] ='6'
# os.environ['CUDA_VISIBLE_DEVICES'] ='4'

In [2]:
# !echo $CUDA_VISIBLE_DEVICES

In [3]:
import torch
import json
import numpy as np
import transformers
import pandas as pd
import pickle as pkl
from torch import nn
from tqdm import tqdm
from os.path import join
from importlib import reload
import multiprocessing as mp
from collections import Counter
from data_pub import pubmedDataset
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (BertPreTrainedModel, BertModel, AdamW, get_linear_schedule_with_warmup, 
                          RobertaPreTrainedModel, RobertaModel,
                          AutoTokenizer, AutoModel, AutoConfig)
from transformers import (WEIGHTS_NAME,
                          BertConfig, BertForSequenceClassification, BertTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

In [4]:
config = AutoConfig.from_pretrained(
        "roberta-base",
        num_labels=3,
        finetuning_task='pubmedqa')

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

model = RobertaForSequenceClassification.from_pretrained("roberta-base", 
                                                         config = config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifi

In [5]:
def read_data(split, fold=1):
    if split == 'train':
        train_json = json.load(open('/mnt/nfs/work1/hongyu/brawat/pubmedqa/pubmedqa/data/pqal_fold%d/train_set.json' % fold, 
                                    'r'))
        dev_json = json.load(open('/mnt/nfs/work1/hongyu/brawat/pubmedqa/pubmedqa/data/pqal_fold%d/dev_set.json' % fold, 
                                  'r'))
        final_json = {**train_json, **dev_json}
    else:
        test_json = json.load(open('/mnt/nfs/work1/hongyu/brawat/pubmedqa/pubmedqa/data/test_set.json', 'r'))
        final_json = test_json
    list_data = []
    for key_, val_ in final_json.items():
        tmp_ = {'sentence1': val_['QUESTION'], 
                'sentence2': ' '.join(val_['CONTEXTS']), 
                'gold_label': val_['final_decision']}
        list_data.append(tmp_)
    return list_data

In [29]:
def get_class_wts(dict_cnt, alpha=15):
    tot_cnt = sum([dict_cnt[x] for x in dict_cnt])
    wt_ = {}
    for each_cat in dict_cnt:
        wt_[each_cat] = np.log(alpha * tot_cnt/dict_cnt[each_cat])
    return wt_

In [6]:
dict_data = {}
dict_data['train'] = read_data(split='train', 
                              fold=1)
dict_data['test'] = read_data(split='test')

label2id = {'yes':0, 'no': 1, 'maybe': 2}

In [22]:
dict_data['train'][0]

{'sentence1': "Is cytokeratin immunoreactivity useful in the diagnosis of short-segment Barrett's oesophagus in Korea?",
 'sentence2': "Cytokeratin 7/20 staining has been reported to be helpful in diagnosing Barrett's oesophagus and gastric intestinal metaplasia. However, this is still a matter of some controversy. To determine the diagnostic usefulness of cytokeratin 7/20 immunostaining for short-segment Barrett's oesophagus in Korea. In patients with Barrett's oesophagus, diagnosed endoscopically, at least two biopsy specimens were taken from just below the squamocolumnar junction. If goblet cells were found histologically with alcian blue staining, cytokeratin 7/20 immunohistochemical stains were performed. Intestinal metaplasia at the cardia was diagnosed whenever biopsy specimens taken from within 2 cm below the oesophagogastric junction revealed intestinal metaplasia. Barrett's cytokeratin 7/20 pattern was defined as cytokeratin 20 positivity in only the superficial gland, combin

In [28]:
print("=="*10)
print('Train')
print("=="*10)
print("Train: ", Counter([x['gold_label'] for x in dict_data['train']]))
print("Train: ", np.mean([x['sentence1'].__len__() for x in dict_data['train']]))
print("Train: ", np.mean([x['sentence2'].__len__() for x in dict_data['train']]))

print('\n')

print("=="*10)
print("Test")
print("=="*10)
print("Test: ", Counter([x['gold_label'] for x in dict_data['test']]))
print("Test: ", np.mean([x['sentence1'].__len__() for x in dict_data['test']]))
print("Test: ", np.mean([x['sentence2'].__len__() for x in dict_data['test']]))

Train
Train:  Counter({'yes': 276, 'no': 169, 'maybe': 55})
Train:  93.272
Train:  1330.376


Test
Test:  Counter({'yes': 276, 'no': 169, 'maybe': 55})
Test:  95.12
Test:  1352.152


In [33]:
class_wts = get_class_wts(dict_cnt={'yes': 276, 'no': 169, 'maybe': 55}, 
                          alpha=3)
print(class_wts)

{'yes': 1.6928195213731514, 'no': 2.183321672167228, 'maybe': 3.3058872018578307}


In [7]:
train_dataset = pubmedDataset(list_data=dict_data['train'], 
                             tokenizer=tokenizer, 
                             max_length=506, 
                             label2id=label2id)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True,
                          num_workers=1)

In [8]:
args = {'weight_decay':0.0,
        'learning_rate':2e-5,
        'epochs':3,
        'gradient_accumulation_steps':1,
        'adam_epsilon':1e-8}
args['t_total'] = len(train_loader) // args['gradient_accumulation_steps'] * args['epochs']
args['warmup_steps'] = int(0.20*args['t_total'])

In [9]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 
         'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'],
                                            num_training_steps=args['t_total'])

In [10]:
model = model.to("cuda")

In [11]:
# model = nn.DataParallel(model)

In [12]:
# loss_fct = CrossEntropyLoss(reduction='none')
loss_fct = CrossEntropyLoss()

In [13]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [14]:
def evaluate(model, data_loader):
    model.eval()
    dict_result = {'actual':[],
                   'preds':[]}
    with torch.no_grad():
        for batch in tqdm(data_loader):
            dict_result['actual'] += batch['label'].numpy().tolist()

            input_batch = {'input_ids':batch['input_ids'],
                       'attention_mask':batch['attention_mask']}
            input_batch = {k: v.to('cuda') for k, v in input_batch.items()}
            outputs = model(**input_batch)

            dict_result['preds'] += np.argmax(outputs[0].detach().cpu().numpy(), axis=1).tolist()

    dict_result['actual'] = [x[0] for x in dict_result['actual']]    
    return dict_result

In [15]:
def get_performance(actual_, preds_, dict_mapping):
    print(classification_report(actual_, preds_))
    print('--'*10)
    print('Confusion matrix')
    print(pd.DataFrame(confusion_matrix(actual_, preds_)))
    print('--'*10)
    print('Actual counter:', Counter(actual_))
    print('Prediction counter:', Counter(preds_))
    print('Mapping:', dict_mapping)

In [16]:
model.train()
for each_epoch in range(args['epochs']):
    model.train()
    for batch in tqdm(train_loader):
        model.zero_grad()
        input_batch = {'input_ids':batch['input_ids'],
                       'attention_mask':batch['attention_mask']}
        input_batch = {k: v.to('cuda') for k, v in input_batch.items()}
        
        outputs = model(**input_batch)
        ### Loss calculation
#         loss = loss_fct(outputs[0], batch['label'].view(-1).cuda()).sum()
        loss = loss_fct(outputs[0], batch['label'].view(-1).to('cuda'))
        
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        
    dict_train = evaluate(model=model, 
                          data_loader=train_loader)
    get_performance(actual_ = dict_train['actual'], 
                    preds_ = dict_train['preds'], 
                    dict_mapping = label2id)

100%|██████████| 63/63 [00:20<00:00,  3.14it/s]
100%|██████████| 63/63 [00:06<00:00, 10.18it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.55      1.00      0.71       276
           1       0.00      0.00      0.00       169
           2       0.00      0.00      0.00        55

    accuracy                           0.55       500
   macro avg       0.18      0.33      0.24       500
weighted avg       0.30      0.55      0.39       500

--------------------
Confusion matrix
     0  1  2
0  276  0  0
1  169  0  0
2   55  0  0
--------------------
Actual counter: Counter({0: 276, 1: 169, 2: 55})
Prediction counter: Counter({0: 500})
Mapping: {'yes': 0, 'no': 1, 'maybe': 2}


100%|██████████| 63/63 [00:19<00:00,  3.28it/s]
100%|██████████| 63/63 [00:06<00:00, 10.09it/s]
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.55      1.00      0.71       276
           1       0.00      0.00      0.00       169
           2       0.00      0.00      0.00        55

    accuracy                           0.55       500
   macro avg       0.18      0.33      0.24       500
weighted avg       0.30      0.55      0.39       500

--------------------
Confusion matrix
     0  1  2
0  276  0  0
1  169  0  0
2   55  0  0
--------------------
Actual counter: Counter({0: 276, 1: 169, 2: 55})
Prediction counter: Counter({0: 500})
Mapping: {'yes': 0, 'no': 1, 'maybe': 2}


100%|██████████| 63/63 [00:19<00:00,  3.17it/s]
100%|██████████| 63/63 [00:06<00:00,  9.94it/s]

              precision    recall  f1-score   support

           0       0.55      1.00      0.71       276
           1       0.00      0.00      0.00       169
           2       0.00      0.00      0.00        55

    accuracy                           0.55       500
   macro avg       0.18      0.33      0.24       500
weighted avg       0.30      0.55      0.39       500

--------------------
Confusion matrix
     0  1  2
0  276  0  0
1  169  0  0
2   55  0  0
--------------------
Actual counter: Counter({0: 276, 1: 169, 2: 55})
Prediction counter: Counter({0: 500})
Mapping: {'yes': 0, 'no': 1, 'maybe': 2}



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
test_dataset = pubmedDataset(list_data=dict_data['test'], 
                             tokenizer=tokenizer, 
                             max_length=400, 
                             label2id=label2id)

test_loader = DataLoader(test_dataset, 
                         batch_size=16, 
                         shuffle=False,
                         num_workers=1)

In [18]:
dict_test = evaluate(model=model, 
                     data_loader=test_loader)

100%|██████████| 32/32 [00:04<00:00,  6.57it/s]


In [19]:
get_performance(actual_ = dict_test['actual'], 
                preds_ = dict_test['preds'], 
                dict_mapping = label2id)

              precision    recall  f1-score   support

           0       0.55      1.00      0.71       276
           1       0.00      0.00      0.00       169
           2       0.00      0.00      0.00        55

    accuracy                           0.55       500
   macro avg       0.18      0.33      0.24       500
weighted avg       0.30      0.55      0.39       500

--------------------
Confusion matrix
     0  1  2
0  276  0  0
1  169  0  0
2   55  0  0
--------------------
Actual counter: Counter({0: 276, 1: 169, 2: 55})
Prediction counter: Counter({0: 500})
Mapping: {'yes': 0, 'no': 1, 'maybe': 2}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
dict_data['train'][:10]

[{'sentence1': "Is cytokeratin immunoreactivity useful in the diagnosis of short-segment Barrett's oesophagus in Korea?",
  'sentence2': "Cytokeratin 7/20 staining has been reported to be helpful in diagnosing Barrett's oesophagus and gastric intestinal metaplasia. However, this is still a matter of some controversy. To determine the diagnostic usefulness of cytokeratin 7/20 immunostaining for short-segment Barrett's oesophagus in Korea. In patients with Barrett's oesophagus, diagnosed endoscopically, at least two biopsy specimens were taken from just below the squamocolumnar junction. If goblet cells were found histologically with alcian blue staining, cytokeratin 7/20 immunohistochemical stains were performed. Intestinal metaplasia at the cardia was diagnosed whenever biopsy specimens taken from within 2 cm below the oesophagogastric junction revealed intestinal metaplasia. Barrett's cytokeratin 7/20 pattern was defined as cytokeratin 20 positivity in only the superficial gland, comb

In [21]:
dict_data['test'][:10]

[{'sentence1': 'Is anorectal endosonography valuable in dyschesia?',
  'sentence2': 'Dyschesia can be provoked by inappropriate defecation movements. The aim of this prospective study was to demonstrate dysfunction of the anal sphincter and/or the musculus (m.) puborectalis in patients with dyschesia using anorectal endosonography. Twenty consecutive patients with a medical history of dyschesia and a control group of 20 healthy subjects underwent linear anorectal endosonography (Toshiba models IUV 5060 and PVL-625 RT). In both groups, the dimensions of the anal sphincter and the m. puborectalis were measured at rest, and during voluntary squeezing and straining. Statistical analysis was performed within and between the two groups. The anal sphincter became paradoxically shorter and/or thicker during straining (versus the resting state) in 85% of patients but in only 35% of control subjects. Changes in sphincter length were statistically significantly different (p<0.01, chi(2) test) in 