In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] ='6'
# os.environ['CUDA_VISIBLE_DEVICES'] ='4'

In [2]:
# !echo $CUDA_VISIBLE_DEVICES

In [3]:
import torch
import json
import numpy as np
import transformers
import pandas as pd
import pickle as pkl
from torch import nn
from tqdm import tqdm
from os.path import join
from importlib import reload
import multiprocessing as mp
from collections import Counter
from data_pub import pubmedDataset
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from copy import deepcopy
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (BertPreTrainedModel, BertModel, AdamW, get_linear_schedule_with_warmup, 
                          RobertaPreTrainedModel, RobertaModel,
                          AutoTokenizer, AutoModel, AutoConfig)
from transformers import (WEIGHTS_NAME,
                          BertConfig, BertForSequenceClassification, BertTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

In [4]:
def read_data(split, fold=1):
    if split == 'train':
        train_json = json.load(open('/mnt/nfs/work1/hongyu/brawat/pubmedqa/pubmedqa/data/pqal_fold%d/train_set.json' % fold, 
                                    'r'))
        dev_json = json.load(open('/mnt/nfs/work1/hongyu/brawat/pubmedqa/pubmedqa/data/pqal_fold%d/dev_set.json' % fold, 
                                  'r'))
        final_json = {**train_json, **dev_json}
    else:
        test_json = json.load(open('/mnt/nfs/work1/hongyu/brawat/pubmedqa/pubmedqa/data/test_set.json', 'r'))
        final_json = test_json
    list_data = []
    for key_, val_ in final_json.items():
        tmp_ = {'sentence1': val_['QUESTION'], 
                'sentence2': ' '.join(val_['CONTEXTS']), 
                'gold_label': val_['final_decision']}
        list_data.append(tmp_)
    return list_data

def read_data_(dict_data_):
    
    list_data = []
    for idx in range(len(dict_data_['question'])):
        instance = {
            'sentence1': dict_data_['question'][idx],
            'sentence2': ''.join(dict_data_['context'][idx]['contexts']),
            'gold_label': dict_data_['final_decision'][idx]
        }
        list_data.append(instance)
    
    return list_data
    

In [5]:
def get_class_wts(dict_cnt, alpha=15):
    tot_cnt = sum([dict_cnt[x] for x in dict_cnt])
    wt_ = {}
    for each_cat in dict_cnt:
        wt_[each_cat] = np.log(alpha * tot_cnt/dict_cnt[each_cat])
    return wt_

In [6]:
#
import datasets
from sklearn.model_selection import train_test_split

pubmedqa = datasets.load_dataset('pubmed_qa', 'pqa_labeled')
pubmedqa_train, pubmedqa_test = train_test_split(pubmedqa['train'])

pubmedqa_train.keys()

Reusing dataset pubmed_qa (/Users/vijetadeshpande/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/2e65addecca4197502cd10ab8ef1919a47c28672f62d7abac7cc9afdcf24fb2d)


  0%|          | 0/1 [00:00<?, ?it/s]

dict_keys(['pubid', 'question', 'context', 'long_answer', 'final_decision'])

In [7]:

dict_data = {}
#dict_data['train'] = read_data(split='train', fold=1)
#dict_data['test'] = read_data(split='test')
dict_data['train'] = read_data_(pubmedqa_train)
dict_data['test'] = read_data_(pubmedqa_test)

label2id = {'yes':0, 'no': 1, 'maybe': 2}

In [8]:
dict_data['train'][0]

{'sentence1': "Follow-up of patients with new cardiovascular implantable electronic devices: are experts' recommendations implemented in routine clinical practice?",
 'sentence2': 'A 2008 expert consensus statement outlined the minimum frequency of follow-up of patients with cardiovascular implantable electronic devices (CIEDs).We studied 38 055 Medicare beneficiaries who received a new CIED between January 1, 2005, and June 30, 2009. The main outcome measure was variation of follow-up by patient factors and year of device implantation. We determined the number of patients who were eligible for and attended an in-person CIED follow-up visit within 2 to 12 weeks, 0 to 16 weeks, and 1 year after implantation. Among eligible patients, 42.4% had an initial in-person visit within 2 to 12 weeks. This visit was significantly more common among white patients than black patients and patients of other races (43.0% versus 36.8% versus 40.5%; P<0.001). Follow-up within 2 to 12 weeks improved from 

In [9]:
print("=="*10)
print('Train')
print("=="*10)
class_counts = Counter([x['gold_label'] for x in dict_data['train']])
print("Train: ", Counter([x['gold_label'] for x in dict_data['train']]))
print("Train: ", np.mean([x['sentence1'].__len__() for x in dict_data['train']]))
print("Train: ", np.mean([x['sentence2'].__len__() for x in dict_data['train']]))

print('\n')

print("=="*10)
print("Test")
print("=="*10)
print("Test: ", Counter([x['gold_label'] for x in dict_data['test']]))
print("Test: ", np.mean([x['sentence1'].__len__() for x in dict_data['test']]))
print("Test: ", np.mean([x['sentence2'].__len__() for x in dict_data['test']]))

Train
Train:  Counter({'yes': 405, 'no': 259, 'maybe': 86})
Train:  94.63333333333334
Train:  1344.0253333333333


Test
Test:  Counter({'yes': 147, 'no': 79, 'maybe': 24})
Test:  92.884
Test:  1323.548


In [10]:
#class_wts = get_class_wts(dict_cnt={'yes': 276, 'no': 169, 'maybe': 55}, 
#                          alpha=3)

class_wts = get_class_wts(
    dict_cnt={
        'yes': class_counts['yes'], 
        'no': class_counts['no'], 
        'maybe': class_counts['maybe'],
    }, 
    alpha=3
)
print(class_wts)

{'yes': 1.7147984280919266, 'no': 2.1618574334989282, 'maybe': 3.2643381989449582}


In [11]:
#train_dataset = pubmedDataset(list_data=dict_data['train'], 
#                             tokenizer=tokenizer, 
#                             max_length=506, 
#                             label2id=label2id)

#train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True,
#                          num_workers=1)

In [12]:
# auxilliary functions

def get_grouped_parameters(
    model_in, 
    no_decay_layers, 
    weight_decay
):
    
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model_in.named_parameters() if not any(nd in n for nd in no_decay_layers)],
         'weight_decay': weight_decay},
        {'params': [p for n, p in model_in.named_parameters() if any(nd in n for nd in no_decay_layers)], 
         'weight_decay': 0.0}
    ]
    
    return optimizer_grouped_parameters

def evaluate(model, data_loader):
    model.eval()
    dict_result = {'actual':[],
                   'preds':[]}
    
    print('\nStarting model evaluation:')
    with torch.no_grad():
        for batch in tqdm(data_loader):
            dict_result['actual'] += batch['label'].numpy().tolist()

            input_batch = {'input_ids':batch['input_ids'],
                       'attention_mask':batch['attention_mask']}
            input_batch = {k: v.to(device) for k, v in input_batch.items()}
            outputs = model(**input_batch)

            dict_result['preds'] += np.argmax(outputs[0].detach().cpu().numpy(), axis=1).tolist()

    dict_result['actual'] = [x[0] for x in dict_result['actual']]    
    return dict_result

def get_performance(
    actual_, 
    preds_,
    dict_mapping
):
    results = {}
    
    # accuracy, precision, recall, f1
    results['metrics'] = classification_report(
        actual_, 
        preds_,
        output_dict=True,
        zero_division=0,
    )
    
    # confusion matrix
    results['confusion_matrix'] = pd.DataFrame(
        confusion_matrix(
            actual_, 
            preds_
        )
    )
    
    # counter
    results['actual_counter'] = Counter(actual_)
    results['prediction_counter'] = Counter(preds_)
    
    return results

In [13]:
#model_name = 'roberta-base'
#tokenizer_name = 'roberta-base'

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
args = {
    'weight_decay': 0.0,
    'learning_rate': 2e-5,
    'epochs': 2,
    'gradient_accumulation_steps': 1,
    'adam_epsilon': 1e-8,
    'max_sequence_length': 503,
    'batch_size': 32,
}
no_decay = ['bias', 'LayerNorm.weight']


In [14]:
#
from PubMedQAData import QADataLoader
labe2id = {'yes': 0, 'no': 1, 'maybe': 3}


In [15]:
model_list = {
    1: {
        'model': 'roberta-base',
        'tokenizer': 'roberta-base',
    },
    2: {
        'model': 'allenai/biomed_roberta_base',
        'tokenizer': 'allenai/biomed_roberta_base',
    },
    
}

In [16]:

for model_idx in model_list:    
    
    
    # get dataloaders for training and testing
    dataloaders = QADataLoader(
        datasets_name='pubmed_qa',
        datasets_config='pqa_labeled',
        label2id=label2id,
        tokenizer_name=model_list[model_idx]['tokenizer'],
        max_sequence_length=args['max_sequence_length'],
        batch_size=2,
        debug=True
    )

    #
    train_loader = dataloaders.dataloader_train
    val_loader = dataloaders.dataloader_validation
    test_loader = dataloaders.dataloader_test
    
    # set total steps and warmp-up steps for sheduler
    args['t_total'] = len(train_loader) // args['gradient_accumulation_steps'] * args['epochs']
    args['warmup_steps'] = int(0.20*args['t_total'])

    # define model
    config = AutoConfig.from_pretrained(
        model_list[model_idx]['model'],
        num_labels=dataloaders.num_classes,
        finetuning_task='pubmedqa'
    )
    model = RobertaForSequenceClassification.from_pretrained(
        model_list[model_idx]['model'], 
        config=config,
    )
    model = model.to(device)

    # optimizer
    optimizer = AdamW(
        get_grouped_parameters(model, no_decay, args['weight_decay']), 
        lr=args['learning_rate'], 
        eps=args['adam_epsilon']
    )

    # scheduler for lr
    scheduler = get_linear_schedule_with_warmup(
        optimizer, 
        num_warmup_steps=args['warmup_steps'],
        num_training_steps=args['t_total']
    )

    # objective function
    loss_fct = CrossEntropyLoss()
    
    # train
    best_model = None
    best_f1_eval = -1
    best_test_results = None
    best_val_results = None
    model.train()
    for each_epoch in range(args['epochs']):
        model.train()
        print('\nStarting epoch number: %d'%(each_epoch+1))
        for batch in tqdm(train_loader):

            # clean gradients
            model.zero_grad()

            # unroll inputs and sent to device
            input_batch = {'input_ids':batch['input_ids'],
                           'attention_mask':batch['attention_mask']}
            input_batch = {k: v.to(device) for k, v in input_batch.items()}

            # forward pass
            outputs = model(**input_batch)

            # calculate loss
            loss = loss_fct(outputs[0], batch['label'].view(-1).to(device))

            # backpropagation
            loss.backward()

            # update parameters and lr
            optimizer.step()
            scheduler.step()  

        # evaluate model
        val_predictions = evaluate(
            model=model, 
            data_loader=val_loader
        )
        val_results = get_performance(
            actual_=val_predictions['actual'], 
            preds_=val_predictions['preds'], 
            dict_mapping=label2id
        )

        # update best model
        if best_f1_eval < val_results['metrics']['weighted avg']['f1-score']:
            best_model = deepcopy(model).to(device)
            best_val_results = deepcopy(val_results)
            best_f1_eval = val_results['metrics']['weighted avg']['f1-score']

    
    # test the model based on best_model
    test_predictions = evaluate(
        model=best_model, 
        data_loader=test_loader
    )
    best_test_results = get_performance(
        actual_=test_predictions['actual'], 
        preds_=test_predictions['preds'], 
        dict_mapping=label2id
    )
    
    # save the results and the model
    model_list[model_idx]['results'] = {
        'validation_results': deepcopy(best_val_results),
        'test_results': deepcopy(best_test_results),
        'trained_model': deepcopy(best_model),
    }
    
    #
    print('\n')
    print('='*5)
    print('Results for model\t : %s'%model_list[model_idx]['model'])
    print('='*5)
    print('Precision \t\t = %f'%model_list[model_idx]['results']['test_results']['metrics']['weighted avg']['precision'])
    print('Recall \t\t\t = %f'%model_list[model_idx]['results']['test_results']['metrics']['weighted avg']['recall'])
    print('f1-score \t\t = %f'%model_list[model_idx]['results']['test_results']['metrics']['weighted avg']['f1-score'])
    print('Accuracy \t\t = %f'%model_list[model_idx]['results']['test_results']['metrics']['accuracy'])
    print('='*5)


Reusing dataset pubmed_qa (/Users/vijetadeshpande/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/2e65addecca4197502cd10ab8ef1919a47c28672f62d7abac7cc9afdcf24fb2d)


  0%|          | 0/1 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.


Starting epoch number: 1


100%|█████████████████████████████████████████████| 4/4 [01:00<00:00, 15.11s/it]



Starting model evaluation:


100%|█████████████████████████████████████████████| 4/4 [00:19<00:00,  4.76s/it]



Starting epoch number: 2


100%|█████████████████████████████████████████████| 4/4 [01:01<00:00, 15.32s/it]



Starting model evaluation:


100%|█████████████████████████████████████████████| 4/4 [00:18<00:00,  4.74s/it]



Starting model evaluation:


100%|█████████████████████████████████████████████| 4/4 [00:19<00:00,  4.77s/it]


Results for model	 : roberta-base
Precision 		 = 0.250000
Recall 			 = 0.500000
f1-score 		 = 0.333333
Accuracy 		 = 0.500000


Reusing dataset pubmed_qa (/Users/vijetadeshpande/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/2e65addecca4197502cd10ab8ef1919a47c28672f62d7abac7cc9afdcf24fb2d)


  0%|          | 0/1 [00:00<?, ?it/s]

Downloading:   0%|          | 0.00/185 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/430 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/biomed_roberta_base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allenai/biomed_roberta_base and are newly initialized: ['classi


Starting epoch number: 1


100%|█████████████████████████████████████████████| 4/4 [00:59<00:00, 14.91s/it]



Starting model evaluation:


100%|█████████████████████████████████████████████| 4/4 [00:18<00:00,  4.72s/it]



Starting epoch number: 2


100%|█████████████████████████████████████████████| 4/4 [01:00<00:00, 15.06s/it]



Starting model evaluation:


100%|█████████████████████████████████████████████| 4/4 [00:19<00:00,  4.78s/it]



Starting model evaluation:


100%|█████████████████████████████████████████████| 4/4 [00:19<00:00,  4.78s/it]

Results for model	 : allenai/biomed_roberta_base
Precision 		 = 0.250000
Recall 			 = 0.500000
f1-score 		 = 0.333333
Accuracy 		 = 0.500000



