In [1]:
import torch
import json
import numpy as np
import transformers
import pandas as pd
import pickle as pkl
from torch import nn
from tqdm import tqdm
from os.path import join
from importlib import reload
import multiprocessing as mp
from collections import Counter
from data_pub import pubmedDataset
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from copy import deepcopy
from sklearn.metrics import classification_report, confusion_matrix
from transformers import (BertPreTrainedModel, BertModel, AdamW, get_linear_schedule_with_warmup, 
                          RobertaPreTrainedModel, RobertaModel,
                          AutoTokenizer, AutoModel, AutoConfig)
from transformers import (WEIGHTS_NAME,
                          AutoModelForSequenceClassification,
                          BertConfig, BertForSequenceClassification, BertTokenizer,
                          XLMConfig, XLMForSequenceClassification, XLMTokenizer,
                          DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)
from PubMedQAData import QADataLoader
import wandb
import os
os.environ['CUDA_VISIBLE_DEVICES'] ='3'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# model class
class QAModel(nn.Module):
    def __init__(
        self,
        model_name,
        num_classes,
    ):
        super(QAModel, self).__init__()

        config = AutoConfig.from_pretrained(
            model_name,
            num_labels=num_classes,
            finetuning_task='pubmedqa'
        )
        self.encoder = AutoModelForSequenceClassification.from_pretrained(
            model_name, 
            config=config,
        )

        self.classifier = nn.Linear(
            in_features=768,
            out_features=num_classes,
        )
    
        return

    def forward(
        self,
        batch_,
    ):
        outputs = self.encoder(**batch_)
        #pooled = torch.mean(outputs[0], dim=1).to(device)
        #logits_ = self.classifier(pooled)
        logits_ = outputs[0]
        
        return logits_

In [3]:
# function for collecting all predictions on the input dataset
def get_predictions(model_, loader_):
    model_.eval()
    
    #
    dict_results = {}
    all_preds = []
    for batch_idx, batch_ in tqdm(enumerate(loader_)):
        with torch.inference_mode():
            
            # unroll features
            input_batch = {
                'input_ids':batch_['input_ids'],
                'attention_mask':batch_['attention_mask']
            }
            input_batch = {k: v.to(device) for k, v in input_batch.items()}
            
            # forward pass
            logits = model(input_batch)
            
            # update
            preds = np.argmax(logits.detach().cpu().numpy(), axis=1).tolist()
            all_preds += preds
            ids_ = batch_['ids'].numpy().tolist()
            for id_idx, id_ in enumerate(ids_):
                dict_results[str(id_)] = {'custom_label': preds[id_idx]}
    
    # get distribution of predicted labels
    count = {}
    count['yes'] = (np.array(all_preds) == 0).sum()
    count['no'] = (np.array(all_preds) == 1).sum()
    count['maybe'] = (np.array(all_preds) == 2).sum()
    dist_class = {}
    for i in ['yes', 'no', 'maybe']:
        dist_class[i] = count[i]/len(all_preds)
    
    return dict_results, dist_class


In [4]:
# once we get the data with artificial label we will need to convert it back to the required format, following class does that

class CustomArtiDataloader():
    
    def __init__(
        self, 
        dict_data: dict,
        label2id: dict,
        batch_size: int = 16,
        debug: bool = False,
        debug_size: int = 8,
    ):
        data = self.to_list(dict_data)
        
        # define Dataset object
        self.dataset = CustomArtiDataset(data)
        
        # define dataloader object
        self.dataloader = Dataloader(
            self.dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=0,
            collate_fn=collation_f,            
        )
        
        return
    
    def to_list(self, data_in):
        
        data_out = []
        for idx_ in range(len(data_in['input_ids'])):
            instance = {k_: v_[idx_] for k_, v_ in data_in.items()}
            data_out.append(instance)
            
        return data_out
    
    def collation_f(self, batch):
        
        #
        input_ids_list = [ex["input_ids"] for ex in batch]
        attention_mask_list = [ex["attention_mask"] for ex in batch]
        decoder_input_ids_list = [ex["decoder_input_ids"] for ex in batch]
        decoder_attention_mask_list = [ex["decoder_attention_mask"] for ex in batch]
        decoder_labels_list = [ex["decoder_labels"] for ex in batch]
        encoder_label_list = [ex['encoder_labels_artificial'] for ex in batch]

        collated_batch = {
            "input_ids": torch.LongTensor(input_ids_list),
            "attention_mask": torch.LongTensor(attention_mask_list),
            "encoder_labels": torch.LongTensor(encoder_label_list),
            "decoder_input_ids": torch.LongTensor(decoder_input_ids_list),
            "decoder_attention_mask": torch.LongTensor(decoder_attention_mask_list),
            "decoder_labels": torch.LongTensor(decoder_labels_list),
        }

        return collated_batch
    
class CustomArtiDataset(Dataset):
    
    def __init__(self, list_data):
        self.data = list_data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return list_data[idx]

#
def inspect_dataloader(loaders):
    print('Inspecting dataloader...')
    
    #
    print(f"\nSize of the training set is {len(loaders.dataset_train)}")
    print(f"Size of the validation set is {len(loaders.dataset_validation)}")
    print(f"Size of the test set is {len(loaders.dataset_test)}")
    
    #
    check_first = loaders.dataset_validation[0]['input_ids'] == loaders.dataset_test[0]['input_ids']
    check_last = loaders.dataset_validation[-1]['input_ids'] == loaders.dataset_test[-1]['input_ids']
    print(f"\nFirst example in test and validation set is same: {check_first}")
    print(f"Last example in test and validation set is same: {check_last}")
    
    # check if train example exists in test or validation set
    with open('test_set.json', 'r') as f:
        test_ = json.load(f)
    with open('dev_set.json', 'r') as f:
        dev_ = json.load(f)
    check_pool = list(test_.keys()) + list(dev_.keys())
    
    
    # check distribution of all classes in train, test and valid
    id2label = {0: 'yes', 1: 'no', 2: 'maybe'}
    count_ = {'yes': 0, 'no': 0, 'maybe': 0}
    for idx in tqdm(range(len(loaders.dataset_train))):
        label_i = loaders.dataset_train[idx]['gold_label'][0]
        label_i = id2label[label_i]
        count_[label_i] += 1
    print("Distribution of classes in training set")
    for c_ in count_:
        print(f"Class: {c_}, Percentage: {count_[c_] / len(loaders.dataset_train)}")
        
    count_ = {'yes': 0, 'no': 0, 'maybe': 0}
    for idx in tqdm(range(len(loaders.dataset_validation))):
        label_i = loaders.dataset_validation[idx]['gold_label'][0]
        label_i = id2label[label_i]
        count_[label_i] += 1
    print("Distribution of classes in validation set")
    for c_ in count_:
        print(f"Class: {c_}, Percentage: {count_[c_] / len(loaders.dataset_validation)}")
    
    count_ = {'yes': 0, 'no': 0, 'maybe': 0}
    for idx in tqdm(range(len(loaders.dataset_test))):
        label_i = loaders.dataset_test[idx]['gold_label'][0]
        label_i = id2label[label_i]
        count_[label_i] += 1
    print("Distribution of classes in test set")
    for c_ in count_:
        print(f"Class: {c_}, Percentage: {count_[c_] / len(loaders.dataset_test)}")
        
        
    
    """
    print("\nChecking if training examples exists in test.dev set...")
    for idx in range(len(loaders.dataset_train)):
        train_i = loaders.dataset_train[idx]
        id_ = train_i['id'][0]
        assert id_ not in check_pool, "Training exampl exists in test/dev set, check dataloader"
    
    #
    print("\nPrinting three randomly sampled examples...")
    random_samples = np.random.randint(0, len(loaders.dataset_train), size=3)
    for sample_ in random_samples:
        tokenized_sample = loaders.dataset_train[sample_]
        tokenizer = loaders.source_tokenizer
        id2label = loaders.id2label
        
        #
        print('\nInput sequence to the model i.e. Question + Context, is as follows:')
        print(tokenizer.decode(tokenized_sample['input_ids']))
        print('Gold label is as follows:')
        print(id2label[tokenized_sample['gold_label'][0]])
    """
    
    return

In [5]:
# Phase 2:
# Step 1: get dataloader for unlabled and artifial dataset
# Step 2: instantiate biomed-roberta model and load previously trained model
# Step 3: use loaded model to predict artificial labels
# Step 4: convert the predictions into dataloader
# Step 5: train BioMedRoberta on artificial data
# Step 6: save the trained model


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
args = {
    'weight_decay': 10,
    'learning_rate': 6.2e-6,
    'epochs': 100,
    'eval_every_steps': 300,
    'gradient_accumulation_steps': 1,
    'adam_epsilon': 1e-8,
    'max_sequence_length': 512,
    'batch_size': 768,
    'output_dir': r'./local_biomed_roberta_base',
}
label2id = {
    'yes': 0,
    'no': 1,
    'maybe': 2,
}
no_decay = ['bias', 'LayerNorm.weight']

#
model_dict = {
    0: {
        'model': 'allenai/biomed_roberta_base',
        'tokenizer': 'allenai/biomed_roberta_base',
    },
}
"""
model_dict = {
    0: {
        'model': 'RoBERTa-large-PM-M3-hf',
        'tokenizer': 'roberta-large',
    },
}
"""

"\nmodel_dict = {\n    0: {\n        'model': 'RoBERTa-large-PM-M3-hf',\n        'tokenizer': 'roberta-large',\n    },\n}\n"

In [7]:
# Step 1: Dataloader

#
data_all = QADataLoader(
    datasets_name=None,#'pubmed_qa',
    datasets_config=None,#'pqa_artificial',
    label2id=label2id,
    tokenizer_name=model_dict[0]['tokenizer'],
    max_sequence_length=args['max_sequence_length'],
    batch_size=args['batch_size'],
    debug=False
)

Reading unlabeled and artificially labeled subsets of the data
length of artificially labled data: 211269
length of unlabeled data: 61249
length of combined data: 272518


272518it [00:00, 320960.01it/s]


In [8]:
#inspect_dataloader(data_all)

In [9]:
# Step 2: Model

#
model_name = model_dict[0]['model'].split('/')[-1]
model = QAModel(
    model_name= model_dict[0]['model'],
    num_classes=data_all.num_classes,
)
model.load_state_dict(torch.load(os.path.join(args['output_dir'],  model_name+'_phase1_.pt')))
model.to(device)

Some weights of the model checkpoint at allenai/biomed_roberta_base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at allenai/biomed_roberta_base and are newly initialized: ['classi

QAModel(
  (encoder): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(in_fe

In [10]:
# Step 3: Predict (get artificial labels)
predictions, dist_pred = get_predictions(model, data_all.dataloader_train)

355it [51:23,  8.69s/it]


In [11]:
print(dist_pred)

{'yes': 0.8312328968392441, 'no': 0.12620984774743338, 'maybe': 0.04255725541332256}


In [16]:
id2label = {
    0: 'yes',
    1: 'no',
    2: 'maybe',
}

model_labeled_data = {}

with open('ori_pqaa.json', 'r') as f:
    a_ = json.load(f)
with open('ori_pqau.json', 'r') as f:
    u_ = json.load(f)

for data_ in [a_, u_]:
    for id_idx, id_ in enumerate(data_):
        model_labeled_data[id_] = data_[id_]
        if id_ in predictions:
            model_labeled_data[id_]['custom_label'] = id2label[predictions[id_]['custom_label']]
        else:
            model_labeled_data[id_]['custom_label'] = ''

In [17]:
model_labeled_data['25429730']

{'QUESTION': 'Are group 2 innate lymphoid cells ( ILC2s ) increased in chronic rhinosinusitis with nasal polyps or eosinophilia?',
 'CONTEXTS': ['Chronic rhinosinusitis (CRS) is a heterogeneous disease with an uncertain pathogenesis. Group 2 innate lymphoid cells (ILC2s) represent a recently discovered cell population which has been implicated in driving Th2 inflammation in CRS; however, their relationship with clinical disease characteristics has yet to be investigated.',
  'The aim of this study was to identify ILC2s in sinus mucosa in patients with CRS and controls and compare ILC2s across characteristics of disease.',
  'A cross-sectional study of patients with CRS undergoing endoscopic sinus surgery was conducted. Sinus mucosal biopsies were obtained during surgery and control tissue from patients undergoing pituitary tumour resection through transphenoidal approach. ILC2s were identified as CD45(+) Lin(-) CD127(+) CD4(-) CD8(-) CRTH2(CD294)(+) CD161(+) cells in single cell suspen

In [18]:
with open('model_labeled_data.json', 'w') as f:
    json.dump(model_labeled_data, f, indent=4)

In [23]:
with open('model_labeled_data.json', 'r') as f:
    data = json.load(f)

In [24]:
count = {'yes': 0, 'no': 0, 'maybe': 0}
for i_ in data:
    if data[i_]['custom_label'] != '':
        count[data[i_]['custom_label']] += 1

dist_ = {'yes': 0, 'no': 0, 'maybe': 0}
for i_ in count:
    dist_[i_] = count[i_] / len(data)

In [25]:
dist_

{'yes': 0.8304001937486698,
 'no': 0.12608341467352616,
 'maybe': 0.04251462288729552}

In [22]:
with open('ori_pqaa_1st_attempt.json', 'r') as f:
    data = json.load(f)

count = {'yes': 0, 'no': 0, 'maybe': 0}
for i_ in data:
    if data[i_]['custom_label'] != '':
        count[data[i_]['custom_label']] += 1

dist_ = {'yes': 0, 'no': 0, 'maybe': 0}
for i_ in count:
    dist_[i_] = count[i_] / len(data)

print(dist_)

{'yes': 0.8268179430015762, 'no': 0.14271379142230994, 'maybe': 0.020466798252464866}
