In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] ='6'
# os.environ['CUDA_VISIBLE_DEVICES'] ='4'

In [2]:
# !echo $CUDA_VISIBLE_DEVICES

In [3]:
import torch
import json
import numpy as np
import transformers
import pandas as pd
import pickle as pkl
from torch import nn
from tqdm import tqdm
from os.path import join
from importlib import reload
import multiprocessing as mp
from collections import Counter
from data_pub import pubmedDataset
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (BertPreTrainedModel, BertModel, AdamW, get_linear_schedule_with_warmup, 
                          RobertaPreTrainedModel, RobertaModel,
                          AutoTokenizer, AutoModel, AutoConfig)
from transformers import (WEIGHTS_NAME,
                          BertConfig, BertForSequenceClassification, BertTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

In [4]:
config = AutoConfig.from_pretrained(
        "roberta-base",
        num_labels=3,
        finetuning_task='pubmedqa')

tokenizer = AutoTokenizer.from_pretrained("roberta-base")

model = RobertaForSequenceClassification.from_pretrained("roberta-base", 
                                                         config = config)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [5]:
def read_data(split, fold=1):
    if split == 'train':
        train_json = json.load(open('/mnt/nfs/work1/hongyu/brawat/pubmedqa/pubmedqa/data/pqal_fold%d/train_set.json' % fold, 
                                    'r'))
        dev_json = json.load(open('/mnt/nfs/work1/hongyu/brawat/pubmedqa/pubmedqa/data/pqal_fold%d/dev_set.json' % fold, 
                                  'r'))
        final_json = {**train_json, **dev_json}
    else:
        test_json = json.load(open('/mnt/nfs/work1/hongyu/brawat/pubmedqa/pubmedqa/data/test_set.json', 'r'))
        final_json = test_json
    list_data = []
    for key_, val_ in final_json.items():
        tmp_ = {'sentence1': val_['QUESTION'], 
                'sentence2': ' '.join(val_['CONTEXTS']), 
                'gold_label': val_['final_decision']}
        list_data.append(tmp_)
    return list_data

def read_data_(dict_data_):
    
    list_data = []
    for idx in range(len(dict_data_['question'])):
        instance = {
            'sentence1': dict_data_['question'][idx],
            'sentence2': ''.join(dict_data_['context'][idx]['contexts']),
            'gold_label': dict_data_['final_decision'][idx]
        }
        list_data.append(instance)
    
    return list_data
    

In [6]:
def get_class_wts(dict_cnt, alpha=15):
    tot_cnt = sum([dict_cnt[x] for x in dict_cnt])
    wt_ = {}
    for each_cat in dict_cnt:
        wt_[each_cat] = np.log(alpha * tot_cnt/dict_cnt[each_cat])
    return wt_

In [7]:
#
import datasets
from sklearn.model_selection import train_test_split

pubmedqa = datasets.load_dataset('pubmed_qa', 'pqa_labeled')
pubmedqa_train, pubmedqa_test = train_test_split(pubmedqa['train'])

pubmedqa_train.keys()

Reusing dataset pubmed_qa (/Users/vijetadeshpande/.cache/huggingface/datasets/pubmed_qa/pqa_labeled/1.0.0/2e65addecca4197502cd10ab8ef1919a47c28672f62d7abac7cc9afdcf24fb2d)


  0%|          | 0/1 [00:00<?, ?it/s]

dict_keys(['pubid', 'question', 'context', 'long_answer', 'final_decision'])

In [8]:

dict_data = {}
#dict_data['train'] = read_data(split='train', fold=1)
#dict_data['test'] = read_data(split='test')
dict_data['train'] = read_data_(pubmedqa_train)
dict_data['test'] = read_data_(pubmedqa_test)

label2id = {'yes':0, 'no': 1, 'maybe': 2}

In [9]:
dict_data['train'][0]

{'sentence1': 'Do ventricular arrhythmias in athletes subside over time?',
 'sentence2': 'Sudden death in athletes can occur during sport activities and is presumably related to ventricular arrhythmias.To investigate the long-term follow-up ofathletes with ventricular arrhythmias during an exercise test.From a database of 56,462 athletes we identified 192 athletes (35 years old who had ventricular arrhythmias during an exercise test. Ninety athletes had>or =3 ventricular premature beats (VPB) (group A) and 102 athletes had ventricular couplets or non-sustained ventricular tachycardia during an exercise test (group B). A control group of 92 athletesfrom without ventricular arrhythmias was randomly seleclted from the database (group C). Of the 192 athletes 39 returnied for a repeat exercise test after a mean follow-up period of 70 +/- 25 months and they constitute the study population.Twelve athletes from group A, 21 fromgroup B and 6 from group C returned for a repeat exercise test. The

In [10]:
print("=="*10)
print('Train')
print("=="*10)
print("Train: ", Counter([x['gold_label'] for x in dict_data['train']]))
print("Train: ", np.mean([x['sentence1'].__len__() for x in dict_data['train']]))
print("Train: ", np.mean([x['sentence2'].__len__() for x in dict_data['train']]))

print('\n')

print("=="*10)
print("Test")
print("=="*10)
print("Test: ", Counter([x['gold_label'] for x in dict_data['test']]))
print("Test: ", np.mean([x['sentence1'].__len__() for x in dict_data['test']]))
print("Test: ", np.mean([x['sentence2'].__len__() for x in dict_data['test']]))

Train
Train:  Counter({'yes': 418, 'no': 254, 'maybe': 78})
Train:  93.428
Train:  1339.3746666666666


Test
Test:  Counter({'yes': 134, 'no': 84, 'maybe': 32})
Test:  96.5
Test:  1337.5


In [11]:
#class_wts = get_class_wts(dict_cnt={'yes': 276, 'no': 169, 'maybe': 55}, 
#                          alpha=3)

class_wts = get_class_wts(dict_cnt={'yes': 400, 'no': 264, 'maybe': 86}, 
                          alpha=3)
print(class_wts)

{'yes': 1.7272209480904839, 'no': 2.1427363920521496, 'maybe': 3.2643381989449582}


In [12]:
train_dataset = pubmedDataset(list_data=dict_data['train'], 
                             tokenizer=tokenizer, 
                             max_length=506, 
                             label2id=label2id)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True,
                          num_workers=1)

In [13]:
args = {'weight_decay':0.0,
        'learning_rate':2e-5,
        'epochs':1,
        'gradient_accumulation_steps':1,
        'adam_epsilon':1e-8}
args['t_total'] = len(train_loader) // args['gradient_accumulation_steps'] * args['epochs']
args['warmup_steps'] = int(0.20*args['t_total'])

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': args['weight_decay']},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 
         'weight_decay': 0.0}
    ]
optimizer = AdamW(optimizer_grouped_parameters, lr=args['learning_rate'], eps=args['adam_epsilon'])
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args['warmup_steps'],
                                            num_training_steps=args['t_total'])

In [15]:
model = model.to(device)

In [16]:
# model = nn.DataParallel(model)

In [17]:
# loss_fct = CrossEntropyLoss(reduction='none')
loss_fct = CrossEntropyLoss()

In [18]:
model

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [19]:
def evaluate(model, data_loader):
    model.eval()
    dict_result = {'actual':[],
                   'preds':[]}
    with torch.no_grad():
        for batch in tqdm(data_loader):
            dict_result['actual'] += batch['label'].numpy().tolist()

            input_batch = {'input_ids':batch['input_ids'],
                       'attention_mask':batch['attention_mask']}
            input_batch = {k: v.to(device) for k, v in input_batch.items()}
            outputs = model(**input_batch)

            dict_result['preds'] += np.argmax(outputs[0].detach().cpu().numpy(), axis=1).tolist()

    dict_result['actual'] = [x[0] for x in dict_result['actual']]    
    return dict_result

In [20]:
def get_performance(actual_, preds_, dict_mapping):
    print(classification_report(actual_, preds_))
    print('--'*10)
    print('Confusion matrix')
    print(pd.DataFrame(confusion_matrix(actual_, preds_)))
    print('--'*10)
    print('Actual counter:', Counter(actual_))
    print('Prediction counter:', Counter(preds_))
    print('Mapping:', dict_mapping)

In [None]:
model.train()
for each_epoch in range(args['epochs']):
    model.train()
    for batch in tqdm(train_loader):
        model.zero_grad()
        input_batch = {'input_ids':batch['input_ids'],
                       'attention_mask':batch['attention_mask']}
        input_batch = {k: v.to(device) for k, v in input_batch.items()}
        
        outputs = model(**input_batch)
        ### Loss calculation
#         loss = loss_fct(outputs[0], batch['label'].view(-1).cuda()).sum()
        loss = loss_fct(outputs[0], batch['label'].view(-1).to(device))
        
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        
    dict_train = evaluate(model=model, 
                          data_loader=train_loader)
    get_performance(actual_ = dict_train['actual'], 
                    preds_ = dict_train['preds'], 
                    dict_mapping = label2id)

In [None]:
test_dataset = pubmedDataset(list_data=dict_data['test'], 
                             tokenizer=tokenizer, 
                             max_length=400, 
                             label2id=label2id)

test_loader = DataLoader(test_dataset, 
                         batch_size=16, 
                         shuffle=False,
                         num_workers=1)

In [None]:
dict_test = evaluate(model=model, 
                     data_loader=test_loader)

In [None]:
get_performance(actual_ = dict_test['actual'], 
                preds_ = dict_test['preds'], 
                dict_mapping = label2id)

In [None]:
dict_data['train'][:10]

In [None]:
dict_data['test'][:10]