Initialize Google Drive Configuration

In [None]:
# Mount Google Drive at /content/drive (Google Colab only).
# The dataset paths used later in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


# 1. Dataset BERT Transformer

**Created By:**  Jirarote Jirasirikul

**Monash University (Melbourne) Australia** 

This file contains code to transform the inputs of NLP tasks (HoC and PubMedQA) from the BLURB Leaderboard into BERT vector representations.
https://microsoft.github.io/BLURB/leaderboard.html 

This code has been modified from www.HuggingFace.co

## Import Library

All libraries and file paths are added here.

In [None]:
# On M3 : for shell script file
# import fire

In [None]:
# Standard Library
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.metrics import precision_recall_fscore_support,accuracy_score

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from collections import defaultdict

import matplotlib.pyplot as plt

from datetime import datetime
import json 
# pd.set_option('display.max_colwidth', -1)

In [None]:
# BERT Transformer Library
!pip install transformers
import transformers as ppb

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 639 kB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 56.7 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 52.5 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 5.7 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 62.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installati

## Check Available Device (CPU/GPU)

In [None]:
import torch
# Choose the device used for BERT inference: CUDA when PyTorch can see a GPU,
# otherwise fall back to the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    DEVICE_AVAILABLE = torch.device("cpu")
else:
    DEVICE_AVAILABLE = torch.device("cuda")
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


## Utilities Functions

In [None]:
# Custom logging helper; set ENABLE_LOGS to anything other than 1 to silence "Info" messages.
ENABLE_LOGS = 1
def print_log(*arg, log_type="Info"):
    """Print the positional arguments as one space-separated line tagged with ``[log_type]``.

    "Info" messages are printed only while the module-level ENABLE_LOGS equals 1;
    every other log_type is always printed.
    """
    is_muted_info = (log_type == "Info") and (ENABLE_LOGS != 1)
    if is_muted_info:
        return
    tag = "[" + log_type + "]"
    print(tag, " ".join(map(str, arg)))



---

##BERT Text Representation

Transform Language Model

When using BERT, we are technically transforming each sentence into a vector that represents it. This process relies on a language model, which produces a representation of each word.

BERT adds a [CLS] token in front of each sentence. The representation vector of this token can later be used for classification, as it contains the sentence representation.

## Define Classes

### Class : My BERT
This class is built to assist with running BERT and to store the resulting data.

In [None]:
# BERT weight Options 
# - 'distilbert-base-uncased'
# - 'bert-base-uncased'
# - 'dmis-lab/biobert-base-cased-v1.1'
# - 'dmis-lab/biobert-v1.1' : Data Mining and Information Systems Lab, Korea University
# - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'

In [None]:
class my_BERT:
    """Tokenize text with a BERT tokenizer and store the resulting [CLS] embeddings.

    The class keeps two frames:
      * ``self.df``      - the raw input frame (needs a 'text' column), or None
      * ``self.df_BERT`` - the transformed frame (tokens, masks and ``feature_*``
                           columns), or None until `bert_transform` has run

    A default 'bert-base-uncased' tokenizer/model pair is loaded once at class
    definition time and shared by every instance; call `load_pretrain_bert` on an
    instance to give that instance a different checkpoint.
    """
    ###### Load pretrain BERT Language Model transformer (Otherwise use 'load_pretrain_bert' to customize)
    model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

    # Load pretrained model/tokenizer
    # NOTE: the original code also did `bert_tokenizer.add_special_tokens = True`, which
    # replaced the tokenizer's `add_special_tokens` *method* with a bool. Special tokens
    # are requested per call via `encode(..., add_special_tokens=True)` instead.
    bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    bert_model = model_class.from_pretrained(pretrained_weights)

    # Short model names accepted by `load_pretrain_bert` -> Hugging Face checkpoint ids.
    # NOTE(review): 'distilbert-base-uncased' is still loaded through BertModel/BertTokenizer
    # by `load_pretrain_bert` - confirm that this is intended before relying on it.
    PRETRAIN_MAPPING = {'distilbert-base-uncased':'distilbert-base-uncased',
                        'bert-base-uncased':'bert-base-uncased',
                        'biobert-base-cased':'dmis-lab/biobert-base-cased-v1.1',
                        'biobert-base-uncased':'dmis-lab/biobert-v1.1',
                        'pubmedbert-base-uncased':'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'}

    def __init__(self, df_input,is_transform=False, ENABLE_LOGS = 1):
        """Wrap a DataFrame.

        Args:
            df_input: input frame. Expected columns:
                - 'text'  : required
                - 'label' : optional; default label column name used by `get_labels`
            is_transform: when True, `df_input` is treated as an already transformed
                frame and stored in ``self.df_BERT``; otherwise it is stored in ``self.df``.
            ENABLE_LOGS: 1 to print "Info" messages, anything else to silence them.
        """
        if(is_transform):
            self.df = None
            self.df_BERT = df_input
        else:
            self.df = df_input
            self.df_BERT = None
        self.ENABLE_LOGS = ENABLE_LOGS

    def print_log(self, *arg, log_type="Info"):
        """Print a ``[log_type]``-tagged line; "Info" lines only when ``self.ENABLE_LOGS == 1``."""
        if(self.ENABLE_LOGS==1 or log_type!="Info"):
            print("["+log_type+"]"," ".join(str(x) for x in arg))

    def bert_tokenize(self, token_length=128):
        """Return a copy of ``self.df`` with 'BERTTokens' and 'BERTMasks' columns added.

        Each text is encoded with special tokens, truncated to `token_length`,
        right-padded with 0 up to `token_length`, and given a 0/1 mask that is 1
        wherever the token id is non-zero.
        """
        df_output = self.df.copy()

        # First pass is only used to report token statistics before the token_length limit.
        # NOTE: truncation=True without max_length is kept from the original code, so the
        # "NO TRUNCATE" figures are still subject to whatever limit the tokenizer applies.
        df_output['BERTTokens'] = df_output["text"].apply((lambda x: self.bert_tokenizer.encode(x, add_special_tokens=True,truncation=True)))
        temp = df_output['BERTTokens'].apply(len)
        self.print_log("NO TRUNCATE","Token - Done","( mean/max no. of token:",round(temp.mean()),temp.max(),")")

        # BERT Tokenizer + truncate to token_length
        df_output['BERTTokens'] = df_output["text"].apply((lambda x: self.bert_tokenizer.encode(x, add_special_tokens=True,truncation=True,max_length = token_length)))
        temp = df_output['BERTTokens'].apply(len)
        self.print_log("Token - Done","( mean/max no. of token:",round(temp.mean()),temp.max(),")")

        # Padding tokens to token_length (0 is used as the padding id)
        df_output['BERTTokens'] = df_output['BERTTokens'].apply(lambda x: x + [0]*(token_length-len(x)))
        self.print_log("Pad - Done")

        # BERT Mask: 1 for real tokens, 0 for padding. Plain ints replace the original
        # per-token np.where call, which produced the same values far more slowly.
        df_output['BERTMasks'] = df_output['BERTTokens'].apply(lambda x: [int(i != 0) for i in x])
        self.print_log("Mask - Done")

        return df_output

    def run_bert_transform(self, dataloader, device_available = torch.device("cpu")):
        """Run the model over `dataloader` and return the stacked [CLS] vectors.

        Args:
            dataloader: yields batches whose first two items are input ids and attention masks.
            device_available: torch device the model and batches are moved to.

        Returns:
            A 2-D numpy array with one row per input sequence (batches stacked in order).
        """
        all_result = []

        self.bert_model.to(device_available)
        # Make sure dropout is disabled so the extracted features are deterministic.
        self.bert_model.eval()

        digit = len(str(len(dataloader)))-1 # Report progress roughly every 10**digit steps

        for step, batch in enumerate(dataloader):
            if(step == 0 or (step+1)%(10**digit) == 0 or step == len(dataloader)-1): self.print_log("Step:",step+1,"/",len(dataloader))

            b_input_ids = batch[0].to(device_available)
            b_input_mask = batch[1].to(device_available)

            with torch.no_grad():
                model_output = self.bert_model(b_input_ids, attention_mask=b_input_mask)

            # First model output, position 0 of every sequence = the [CLS] token vector.
            res_features = model_output[0][:,0,:].cpu().numpy()
            all_result.append(res_features)
        self.print_log("BERT transform - Done")

        return np.vstack(all_result)

    def bert_transform(self, device_available = torch.device("cpu"), batch_size = 32, token_length=128):
        """Tokenize ``self.df``, run BERT and store the result in ``self.df_BERT``.

        The stored frame is the tokenized frame plus one ``feature_<i>`` column per
        dimension of the [CLS] vector. Returns None.
        """
        df_output = self.bert_tokenize(token_length)

        # Convert to Tensor
        input_tokens = torch.tensor(np.stack(df_output['BERTTokens'].values))
        input_masks = torch.tensor(np.stack(df_output['BERTMasks'].values))

        # Create the DataLoader; a sequential sampler keeps rows in their original order.
        input_data = TensorDataset(input_tokens, input_masks)
        input_sampler = SequentialSampler(input_data)
        input_dataloader = DataLoader(input_data, sampler=input_sampler, batch_size=batch_size)

        self.print_log("Running BERT Transform on", str(device_available))
        if(str(device_available) == 'cpu'):
            self.print_log("Running BERT on CPU can take longer time...",log_type="WARNING")
        self.print_log("BERT token length:",token_length)
        self.print_log("Data size:",str(len(input_tokens)), "( Total batch", str(len(input_dataloader)),'* size',str(batch_size),")")

        output_features = self.run_bert_transform(input_dataloader,device_available)
        # Reuse the input index: concat(axis=1) aligns on index, so a fresh 0..n-1 index
        # would misalign rows (and create NaN rows) whenever the input index is not 0..n-1.
        df_features = pd.DataFrame(output_features.tolist(), index=df_output.index).add_prefix('feature_')
        df_output = pd.concat([df_output,df_features],axis=1)

        self.print_log("BERT transformed", log_type="Success")
        self.df_BERT = df_output

    def get_features(self):
        """Return the ``feature_*`` columns of ``self.df_BERT`` as a numpy array.

        Logs an error and returns None when no transformed frame is available.
        """
        if(isinstance(self.df_BERT, pd.DataFrame)):
            return np.array(self.df_BERT.filter(regex='feature_',axis=1).values)
        else:
            self.print_log("Please run function 'bert_transform' to generate text representation first!",log_type="Error")

    def get_labels(self, list_target = ['label']):
        """Return the `list_target` columns of ``self.df_BERT`` as a numpy array (default: 'label')."""
        # The default list is only read, never mutated.
        return np.array(self.df_BERT[list_target].values.tolist())

    def get_current_bert_model(self):
        """Return the name/path recorded in the current model's config."""
        return self.bert_model.config._name_or_path

    def load_pretrain_bert(self, model_name='bert-base-uncased'):
        """Load a tokenizer/model pair for this instance.

        Args:
            model_name: a key of PRETRAIN_MAPPING; any other value is passed to
                ``from_pretrained`` unchanged, so a full checkpoint id also works.
        """
        # Always uses the BERT model/tokenizer classes, whatever checkpoint is requested.
        checkpoint = self.PRETRAIN_MAPPING.get(model_name, model_name)
        self.model_class, self.tokenizer_class, self.pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, checkpoint)

        # Load pretrained model/tokenizer (stored on the instance, shadowing the class defaults)
        self.bert_tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)
        self.bert_model = self.model_class.from_pretrained(self.pretrained_weights)

    def get_features_df(self,additional_col=[]):
        """Return a frame with the ``feature_*`` columns followed by `additional_col` columns.

        Logs an error and returns None when no transformed frame is available.
        """
        if(isinstance(self.df_BERT, pd.DataFrame)):
            return pd.concat([self.df_BERT.filter(regex='feature_',axis=1),self.df_BERT[additional_col]], axis=1)
        else:
            self.print_log("Please run function 'bert_transform' to generate text representation first!",log_type="Error")

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## Data Transforming

### Hallmarks of Cancer (HoC)

#### Assisting function

In [None]:
## Sample code to load raw data

# DATAPATH = "/content/drive/MyDrive/MinorThesis/"
# DATASET = "HoC"
# TOKEN_SIZE = 128
# PRETRAIN_MODEL = 'biobert-base-uncased'

# temppath_train = os.path.join(DATAPATH,"datasets","raw",DATASET,"train.tsv")
#     # temppath_valid = os.path.join(DATAPATH,"datasets","raw",DATASET,"dev.tsv")
#     # temppath_test = os.path.join(DATAPATH,"datasets","raw",DATASET,"test.tsv")

# df_train = pd.read_csv(temppath_train, sep='\t')
#     # df_test = pd.read_csv(temppath_test, sep='\t')
#     # df_valid = pd.read_csv(temppath_valid, sep='\t')

#     # TO DO : Modify this if not HoC
# df_train.columns = ['label','text','filename_line']
#     # df_test.columns = ['label','text','filename_line']
#     # df_valid.columns = ['label','text','filename_line']

This function helps us build context by prepending the previous n sentences from the same file to each sentence.

In [None]:
def text_dependent(df_input, shift_level=0):
    """Prepend the previous `shift_level` sentences of the same file to each row's text.

    Rows are grouped by the part of 'filename_line' before the first underscore.
    Within each group, row i's new text is the texts of rows i-shift_level .. i
    joined with single spaces and stripped; rows near the start of a group simply
    get fewer preceding sentences.

    Args:
        df_input: frame with 'text' and 'filename_line' columns. Its 'text' column
            is overwritten in place when shift_level >= 1.
        shift_level: number of preceding sentences to prepend. None, 0 or a
            negative value leaves the frame untouched. Any positive integer is
            supported (the original code silently ignored values above 3).

    Returns:
        The same `df_input` object.
    """
    if shift_level is None or int(shift_level) <= 0:
        return df_input
    n_previous = int(shift_level)

    # Group key: file name = everything before the first "_" in 'filename_line'.
    file_key = df_input['filename_line'].str.split("_", n=1).str[0]
    current_text = df_input['text']
    # groupby(...).shift keeps the original index, so the result can be assigned
    # straight back. The previous groupby(...).apply(...) form returns a
    # (group key, index) MultiIndex on pandas >= 2.0 and breaks the assignment.
    text_by_file = current_text.groupby(file_key)

    combined = current_text
    for distance in range(1, n_previous + 1):
        # Rows without a sentence `distance` steps back contribute an empty string.
        combined = text_by_file.shift(distance).fillna('') + ' ' + combined

    df_input['text'] = combined.str.strip()
    return df_input

##### Test function
# text_dependent(df_train)

This function helps us transform a dataset with different parameters.

In [None]:
def transform_dataset(DATAPATH,DATASET,PRETRAIN_MODEL='bert-base-uncased',TOKEN_SIZE=128, SHIFT_LEVEL=None):
    """Transform the train/test/dev TSV files of a dataset into BERT features and save them as CSV.

    Reads ``<DATAPATH>/datasets/raw/<DATASET>/{train,test,dev}.tsv``, optionally adds
    previous-sentence context with `text_dependent`, runs `my_BERT.bert_transform`
    on the module-level DEVICE_AVAILABLE device, and writes ``train.csv``,
    ``test.csv`` and ``valid.csv`` under
    ``<DATAPATH>/datasets/transformed/<DATASET>/<PRETRAIN_MODEL>/token_length_<TOKEN_SIZE>[_shift_<SHIFT_LEVEL>]``.

    Args:
        DATAPATH: root folder that contains the ``datasets`` directory.
        DATASET: dataset folder name (the column renaming below assumes HoC's three columns).
        PRETRAIN_MODEL: model name handed to `my_BERT.load_pretrain_bert`.
        TOKEN_SIZE: token length used for truncation/padding.
        SHIFT_LEVEL: number of previous sentences to prepend; None skips `text_dependent`.

    Returns:
        Tuple ``(bert_train, bert_test, bert_valid)`` of `my_BERT` objects. All three
        share one tokenizer/model pair, which is loaded only once.
    """
    raw_dir = os.path.join(DATAPATH,"datasets","raw",DATASET)
    # (log label, raw file name, output file name), in processing order.
    splits = [("Train", "train.tsv", "train.csv"),
              ("Test", "test.tsv", "test.csv"),
              ("Valid", "dev.tsv", "valid.csv")]

    frames = []
    for _, raw_name, _ in splits:
        df_split = pd.read_csv(os.path.join(raw_dir, raw_name), sep='\t')
        # TO DO : Modify this if not HoC
        df_split.columns = ['label','text','filename_line']
        if SHIFT_LEVEL is not None:
            df_split = text_dependent(df_split,SHIFT_LEVEL)
        frames.append(df_split)

    berts = []
    for df_split in frames:
        bert = my_BERT(df_split)
        if not berts:
            bert.load_pretrain_bert(PRETRAIN_MODEL)
        else:
            # Reuse the weights loaded for the first split instead of loading the same
            # checkpoint again (the original loaded it once per split).
            loaded = berts[0]
            bert.model_class, bert.tokenizer_class, bert.pretrained_weights = (
                loaded.model_class, loaded.tokenizer_class, loaded.pretrained_weights)
            bert.bert_tokenizer = loaded.bert_tokenizer
            bert.bert_model = loaded.bert_model
        berts.append(bert)

    for (log_label, _, _), bert in zip(splits, berts):
        print_log("BERTTransform: " + log_label + " Data")
        bert.bert_transform(DEVICE_AVAILABLE, token_length=TOKEN_SIZE)

    # A shift of None or 0 means "no context", so those runs share the plain folder name.
    out_folder = "token_length_"+str(TOKEN_SIZE)
    if not (SHIFT_LEVEL is None or SHIFT_LEVEL == 0):
        out_folder = out_folder+"_shift_"+str(SHIFT_LEVEL)
    temp_path = os.path.join(DATAPATH,"datasets","transformed",DATASET,PRETRAIN_MODEL,out_folder)
    Path(temp_path).mkdir(parents=True, exist_ok=True)

    for (_, _, out_name), df_split, bert in zip(splits, frames, berts):
        features_df = bert.get_features_df(['filename_line','label'])
        features_df.to_csv(os.path.join(temp_path,out_name))
        # Row count of the input split vs. the saved feature frame.
        print_log(len(df_split),"/",len(features_df))

    bert_train, bert_test, bert_valid = berts
    return bert_train, bert_test, bert_valid

# res = transform_dataset(DATAPATH = "/content/drive/MyDrive/MinorThesis/",
#     DATASET = "HoC",
#     TOKEN_SIZE = 512,
#     PRETRAIN_MODEL = 'biobert-base-uncased')


#### Executing

In [None]:
# # For Execute in M3
# if __name__ == "__main__":
#     fire.Fire(transform_dataset)

In [None]:
# Sweep every (pretrained model, number of previous sentences) combination for HoC.
# Models vary in the outer position and shifts 0..3 in the inner one; `res` ends up
# holding the result of the last combination.
for bert_type, shift in [
    (model_key, n_previous)
    for model_key in ('bert-base-uncased', 'pubmedbert-base-uncased', 'biobert-base-cased')
    for n_previous in range(4)
]:
    res = transform_dataset(
        DATAPATH="/content/drive/MyDrive/MinorThesis/",
        DATASET="HoC",
        PRETRAIN_MODEL=bert_type,
        TOKEN_SIZE=512,
        SHIFT_LEVEL=shift,
    )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_re

[Info] BERTTransform: Train Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 40 230 )
[Info] Token - Done ( mean/max no. of token: 40 230 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 12119 ( Total batch 379 * size 32 )
[Info] Step: 1 / 379
[Info] Step: 100 / 379
[Info] Step: 200 / 379
[Info] Step: 300 / 379
[Info] Step: 379 / 379
[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 40 150 )
[Info] Token - Done ( mean/max no. of token: 40 150 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 3547 ( Total batch 111 * size 32 )
[Info] Step: 1 / 111
[Info] Step: 100 / 111
[Info] Step: 111 / 111
[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Valid Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of tok

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_re

[Info] BERTTransform: Train Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 74 284 )
[Info] Token - Done ( mean/max no. of token: 74 284 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 12119 ( Total batch 379 * size 32 )
[Info] Step: 1 / 379
[Info] Step: 100 / 379
[Info] Step: 200 / 379
[Info] Step: 300 / 379
[Info] Step: 379 / 379
[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 74 242 )
[Info] Token - Done ( mean/max no. of token: 74 242 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 3547 ( Total batch 111 * size 32 )
[Info] Step: 1 / 111
[Info] Step: 100 / 111
[Info] Step: 111 / 111
[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Valid Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of tok

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_re

[Info] BERTTransform: Train Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 104 367 )
[Info] Token - Done ( mean/max no. of token: 104 367 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 12119 ( Total batch 379 * size 32 )
[Info] Step: 1 / 379
[Info] Step: 100 / 379
[Info] Step: 200 / 379
[Info] Step: 300 / 379
[Info] Step: 379 / 379
[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 104 317 )
[Info] Token - Done ( mean/max no. of token: 104 317 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 3547 ( Total batch 111 * size 32 )
[Info] Step: 1 / 111
[Info] Step: 100 / 111
[Info] Step: 111 / 111
[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Valid Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_re

[Info] BERTTransform: Train Data


KeyboardInterrupt: ignored

In [None]:
## Select out of the results to Preview label count for each cancer types
# dev = res[2]
# for i in range(10):
#    print(dev.df['label'].str.split(',',expand = True)[i].value_counts())

### PubMedQA

In [None]:
def pubmedqa_transform_dataset(DATAPATH,DATASET,PRETRAIN_MODEL='bert-base-uncased',TOKEN_SIZE=128, REASONING=False):
    """Load the raw PubMedQA fold-0 train/dev JSON files and the test JSON file.

    Files are read from ``<DATAPATH>/datasets/raw/<DATASET>/`` and each loaded
    frame is transposed. PRETRAIN_MODEL, TOKEN_SIZE and REASONING are accepted
    but not used by this function.

    NOTE: a later cell defines a function with the same name, which replaces this
    one once that cell has been executed.

    Returns:
        ``(list_data_fold, df_test)``: a list holding one ``(train_df, dev_df)``
        tuple, and the test frame.
    """
    raw_root = os.path.join(DATAPATH,"datasets","raw",DATASET)

    def _load(*relative_parts):
        # Read one JSON file below the raw dataset folder and transpose it.
        return pd.read_json(os.path.join(raw_root, *relative_parts)).transpose()

    list_data_fold = []
    # Only fold 0 is read; train and dev are merged below just to report the total size.
    for fold_no in range(1):
        print("fold",fold_no)
        fold_dir = "pqal_fold"+str(fold_no)
        fold_train = _load(fold_dir, "train_set.json")
        fold_valid = _load(fold_dir, "dev_set.json")
        list_data_fold.append((fold_train, fold_valid))
        print(fold_train.shape,fold_valid.shape)

    # Test
    first_train, first_valid = list_data_fold[0]
    df_train = pd.concat([first_train, first_valid])
    df_test = _load("test_set.json")

    print("Final",df_train.shape,df_test.shape)
    return list_data_fold,df_test


# list_data_fold,df_test = pubmedqa_transform_dataset(DATAPATH = "/content/drive/MyDrive/MinorThesis/",
#             DATASET = "pubmedqa")

In [None]:
def _pubmedqa_build_text_frame(df_raw, REASONING, POSITION):
    """Turn one raw PubMedQA split into the id/label/preds/text frame fed to my_BERT."""
    # Reasoning-required pairs the question with the abstract sentences
    # (CONTEXTS, a list of strings per row); reasoning-free pairs it with
    # the LONG_ANSWER string.
    evidence_col = 'CONTEXTS' if REASONING else 'LONG_ANSWER'
    df_mod = df_raw[['QUESTION',evidence_col,'final_decision','reasoning_required_pred','reasoning_free_pred']].reset_index().copy()

    if REASONING:
        evidence = df_mod.CONTEXTS.apply(lambda x : (' ').join(x))
        question_first_sep = ". "
    else:
        evidence = df_mod.LONG_ANSWER
        question_first_sep = " "

    if POSITION == "QuesAbs":
        df_mod['text'] = df_mod.QUESTION + question_first_sep + evidence  # question before
    else:
        df_mod['text'] = evidence + " " + df_mod.QUESTION  # question after

    df_mod = df_mod.drop(columns=['QUESTION',evidence_col])
    # Positional rename: reset_index() put the former index first and 'text' was appended last.
    df_mod.columns = ['id','label','reasoning_required_pred','reasoning_free_pred','text']
    return df_mod


def pubmedqa_transform_dataset(DATAPATH,DATASET,PRETRAIN_MODEL='bert-base-uncased',TOKEN_SIZE=128, REASONING=False,POSITION="QuesAbs"):
    """Encode PubMedQA (fold-0 train+dev merged, plus test) with a BERT model and save the features.

    Parameters
    ----------
    DATAPATH : str
        Project root. Raw JSON is read from ``<DATAPATH>/datasets/raw/<DATASET>``
        and CSVs are written below ``<DATAPATH>/datasets/transformed/<DATASET>``.
    DATASET : str
        Name of the dataset sub-directory.
    PRETRAIN_MODEL : str
        Model key handed to ``my_BERT.load_pretrain_bert``; also used as an
        output directory name.
    TOKEN_SIZE : int
        Passed to ``my_BERT.bert_transform`` as ``token_length``; also part of
        the output directory name.
    REASONING : bool
        True  -> "reasoning_required": text is built from QUESTION and CONTEXTS.
        False -> "reasoning_free": text is built from QUESTION and LONG_ANSWER.
    POSITION : str
        ``"QuesAbs"`` puts the question first; any other value puts it last.
        The value is also used as an output directory name.

    Returns
    -------
    tuple
        ``(bert_train, bert_test)``, the two ``my_BERT`` objects after transform.

    Side effects
    ------------
    Writes ``train.csv`` and ``test.csv`` under
    ``<DATAPATH>/datasets/transformed/<DATASET>/<POSITION>/<reasoning_*>/<PRETRAIN_MODEL>/token_length_<TOKEN_SIZE>``.
    """
    # Train & Valid
    list_data_fold = []
    # Only fold 0 is read; its train and dev parts are merged into one training frame below.
    for i in range(1):
        temppath_train = os.path.join(DATAPATH,"datasets","raw",DATASET,"pqal_fold"+str(i),"train_set.json")
        temppath_valid = os.path.join(DATAPATH,"datasets","raw",DATASET,"pqal_fold"+str(i),"dev_set.json")

        df_temp_train = pd.read_json(temppath_train).transpose()
        df_temp_valid = pd.read_json(temppath_valid).transpose()
        list_data_fold.append((df_temp_train,df_temp_valid))
        print(df_temp_train.shape,df_temp_valid.shape)

    # Test
    temppath_test = os.path.join(DATAPATH,"datasets","raw",DATASET,"test_set.json")
    df_train = pd.concat([list_data_fold[0][0],list_data_fold[0][1]])
    df_test = pd.read_json(temppath_test).transpose()

    print("Final",df_train.shape,df_test.shape)

    # Both splits go through the same builder so the question/evidence order
    # in the training text always matches the order used for the test text.
    df_train_mod = _pubmedqa_build_text_frame(df_train, REASONING, POSITION)
    df_test_mod = _pubmedqa_build_text_frame(df_test, REASONING, POSITION)

    bert_train = my_BERT(df_train_mod)
    bert_test = my_BERT(df_test_mod)

    bert_train.load_pretrain_bert(PRETRAIN_MODEL)
    bert_test.load_pretrain_bert(PRETRAIN_MODEL)

    print_log("BERTTransform: Train Data")
    bert_train.bert_transform(DEVICE_AVAILABLE, token_length=TOKEN_SIZE)
    print_log("BERTTransform: Test Data")
    bert_test.bert_transform(DEVICE_AVAILABLE, token_length=TOKEN_SIZE)

    # POSITION is part of the path so a non-default ordering does not overwrite
    # the default "QuesAbs" output (the default path is unchanged).
    temp_path = os.path.join(DATAPATH,"datasets","transformed",DATASET,POSITION,"reasoning_required" if REASONING else "reasoning_free",PRETRAIN_MODEL,"token_length_"+str(TOKEN_SIZE))
    Path(temp_path).mkdir(parents=True, exist_ok=True)

    # NOTE(review): get_features_df presumably returns the BERT features plus
    # the listed metadata columns; confirm against the my_BERT class.
    temp_df = bert_train.get_features_df(['id','label','reasoning_required_pred','reasoning_free_pred'])
    temp_df.to_csv(os.path.join(temp_path,"train.csv"))
    print_log(len(df_train),"/",len(temp_df))
    temp_df = bert_test.get_features_df(['id','label','reasoning_required_pred','reasoning_free_pred'])
    temp_df.to_csv(os.path.join(temp_path,"test.csv"))
    print_log(len(df_test),"/",len(temp_df))

    return bert_train, bert_test

# df_train,df_test = pubmedqa_transform_dataset(DATAPATH = "/content/drive/MyDrive/MinorThesis/",
#             DATASET = "pubmedqa",
#             TOKEN_SIZE = 512,
#             PRETRAIN_MODEL = 'pubmedbert-base-uncased',
#             REASONING=True)

# Produce the transformed PubMedQA features for both reasoning settings
# (required first, then free) and for each of the three encoders.
# df_train / df_test are rebound on every pass, so afterwards they hold the
# pair returned by the last (REASONING=False, biobert) run.
for i in (True, False):
    for j in ('pubmedbert-base-uncased', 'bert-base-uncased', 'biobert-base-cased'):
        df_train, df_test = pubmedqa_transform_dataset(
            DATAPATH="/content/drive/MyDrive/MinorThesis/",
            DATASET="pubmedqa",
            PRETRAIN_MODEL=j,
            TOKEN_SIZE=512,
            REASONING=i,
        )

(450, 9) (50, 9)
Final (500, 9) (500, 9)


Downloading:   0%|          | 0.00/221k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-unc

[Info] BERTTransform: Train Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 293 611 )
[Info] Token - Done ( mean/max no. of token: 292 512 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 299 613 )
[Info] Token - Done ( mean/max no. of token: 299 512 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16
[Info] BERT transform - Done
[Success] BERT transformed
[Info] 500 / 500
[Info] 500 / 500
(450, 9) (50, 9)
Final (500, 9) (500, 9)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_re

[Info] BERTTransform: Train Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 325 512 )
[Info] Token - Done ( mean/max no. of token: 325 512 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16
[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 331 512 )
[Info] Token - Done ( mean/max no. of token: 331 512 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16
[Info] BERT transform - Done
[Success] BERT transformed
[Info] 500 / 500
[Info] 500 / 500
(450, 9) (50, 9)
Final (500, 9) (500, 9)


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing Ber

[Info] BERTTransform: Train Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 343 690 )
[Info] Token - Done ( mean/max no. of token: 340 512 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 349 683 )
[Info] Token - Done ( mean/max no. of token: 346 512 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16
[Info] BERT transform - Done
[Success] BERT transformed
[Info] 500 / 500
[Info] 500 / 500
(450, 9) (50, 9)
Final (500, 9) (500, 9)


Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-unc

[Info] BERTTransform: Train Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 66 150 )
[Info] Token - Done ( mean/max no. of token: 66 150 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 65 173 )
[Info] Token - Done ( mean/max no. of token: 65 173 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16
[Info] BERT transform - Done
[Success] BERT transformed
[Info] 500 / 500
[Info] 500 / 500
(450, 9) (50, 9)
Final (500, 9) (500, 9)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_re

[Info] BERTTransform: Train Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 76 173 )
[Info] Token - Done ( mean/max no. of token: 76 173 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16
[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 74 196 )
[Info] Token - Done ( mean/max no. of token: 74 196 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16
[Info] BERT transform - Done
[Success] BERT transformed
[Info] 500 / 500
[Info] 500 / 500
(450, 9) (50, 9)
Final (500, 9) (500, 9)


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing Ber

[Info] BERTTransform: Train Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 80 174 )
[Info] Token - Done ( mean/max no. of token: 80 174 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data
[Info] NO TRUNCATE Token - Done ( mean/max no. of token: 78 210 )
[Info] Token - Done ( mean/max no. of token: 78 210 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 512
[Info] Data size: 500 ( Total batch 16 * size 32 )
[Info] Step: 1 / 16
[Info] Step: 10 / 16
[Info] Step: 16 / 16
[Info] BERT transform - Done
[Success] BERT transformed
[Info] 500 / 500
[Info] 500 / 500


In [None]:
# # For test in Colab Only
# datapath = "/content/drive/MyDrive/MinorThesis/"
# dataset = "pubmedqa"
# token_size = 128
# pretrain_model = 'biobert-base-uncased'

# # Train & Valid
# list_data_fold = []
# for i in range(10):
#     temppath_train = os.path.join(datapath,"datasets","raw",dataset,"pqal_fold"+str(i),"train_set.json")
#     temppath_valid = os.path.join(datapath,"datasets","raw",dataset,"pqal_fold"+str(i),"dev_set.json")
    
#     df_temp_train = pd.read_json(temppath_train).transpose()
#     df_temp_valid = pd.read_json(temppath_valid).transpose()
#     list_data_fold.append((df_temp_train,df_temp_valid))
#     print(df_temp_train.shape,df_temp_valid.shape)

# # Test
# temppath_test = os.path.join(datapath,"datasets","raw",dataset,"test_set.json")
# df_test = pd.read_json(temppath_test).transpose()

# # Test Label
# temppath_test_gt = os.path.join(datapath,"datasets","raw",dataset,"test_ground_truth.json")
# df_test_label = pd.read_json(temppath_test_gt, typ='series')

# # # Original 1k data before split
# # temppath_ori = os.path.join(datapath,"datasets","raw",dataset,"ori_pqal.json")
# # df_ori = pd.read_json(temppath_ori).transpose()
# # df_ori.shape

# print("Final",list_data_fold[0][0].shape,list_data_fold[0][1].shape,df_test.shape)

In [None]:
# # Extract Fold ID (test,train)

# def get_pubmedqa_fold_id():
#     list_data_fold = []
#     for i in range(10):
#         temppath_train = os.path.join(datapath,"datasets","raw",dataset,"pqal_fold"+str(i),"train_set.json")
#         temppath_valid = os.path.join(datapath,"datasets","raw",dataset,"pqal_fold"+str(i),"dev_set.json")
        
#         df_temp_train = pd.read_json(temppath_train).transpose().reset_index()
#         df_temp_valid = pd.read_json(temppath_valid).transpose().reset_index()

#         list_data_fold.append((df_temp_train['index'].values,df_temp_valid['index'].values))
#         # list_data_fold.append((df_temp_train,df_temp_valid))
#         # print(df_temp_train.shape,df_temp_valid.shape)
#     return list_data_fold
    
# list_data_fold = get_pubmedqa_fold_id()

### BioASQ

In [None]:
def bioasq_transform_dataset(DATAPATH,DATASET,PRETRAIN_MODEL='bert-base-uncased',TOKEN_SIZE=128, REASONING=False):
    """Encode the BioASQ train/dev/test TSV files with a BERT model and save the features.

    Parameters
    ----------
    DATAPATH : str
        Project root. ``train.tsv``, ``dev.tsv`` and ``test.tsv`` are read from
        ``<DATAPATH>/datasets/raw/<DATASET>``.
    DATASET : str
        Name of the dataset sub-directory.
    PRETRAIN_MODEL : str
        Model key handed to ``my_BERT.load_pretrain_bert``; also used as an
        output directory name.
    TOKEN_SIZE : int
        Passed to ``my_BERT.bert_transform`` as ``token_length``; also part of
        the output directory name.
    REASONING : bool
        Accepted but not used by this function.

    Returns
    -------
    None
        The results are only written to disk as ``train.csv``, ``test.csv``
        and ``valid.csv`` under
        ``<DATAPATH>/datasets/transformed/<DATASET>/<PRETRAIN_MODEL>/token_length_<TOKEN_SIZE>``.
    """
    raw_dir = os.path.join(DATAPATH,"datasets","raw",DATASET)

    # Read the three header-less TSV files (train, dev, test) before touching any of them.
    frames = {}
    for split_key, tsv_name in (("train","train.tsv"),("valid","dev.tsv"),("test","test.tsv")):
        frames[split_key] = pd.read_csv(os.path.join(raw_dir,tsv_name),sep="\t",header=None)

    for frame in frames.values():
        frame.columns = ["id","question","answer","label"]

    print("Final",frames["train"].shape,frames["valid"].shape,frames["test"].shape)

    # Model input: the question followed by the answer text.
    for frame in frames.values():
        frame['text'] = frame.question +". "+ frame.answer

    # From here on the splits are handled in train -> test -> valid order,
    # one phase (wrap, load model, transform, export) at a time.
    schedule = (
        ("train","BERTTransform: Train Data","train.csv"),
        ("test","BERTTransform: Test Data","test.csv"),
        ("valid","BERTTransform: Valid Data","valid.csv"),
    )

    encoders = {}
    for split_key, _, _ in schedule:
        encoders[split_key] = my_BERT(frames[split_key])

    for split_key, _, _ in schedule:
        encoders[split_key].load_pretrain_bert(PRETRAIN_MODEL)

    for split_key, banner, _ in schedule:
        print_log(banner)
        encoders[split_key].bert_transform(DEVICE_AVAILABLE, token_length=TOKEN_SIZE)

    temp_path = os.path.join(DATAPATH,"datasets","transformed",DATASET,PRETRAIN_MODEL,"token_length_"+str(TOKEN_SIZE))
    Path(temp_path).mkdir(parents=True, exist_ok=True)

    # Export each split and log "<input rows> / <exported rows>".
    for split_key, _, csv_name in schedule:
        temp_df = encoders[split_key].get_features_df(['id','label'])
        temp_df.to_csv(os.path.join(temp_path,csv_name))
        print_log(len(frames[split_key]),"/",len(temp_df))

    # Nothing is returned; callers read the CSV files written above.

# bioasq_transform_dataset(DATAPATH = "/content/drive/MyDrive/MinorThesis/",
#             DATASET = "BioASQ",
#             TOKEN_SIZE = 512,
#             PRETRAIN_MODEL = 'pubmedbert-base-uncased',
#             REASONING=True)

In [None]:
# # For test in Colab Only
# datapath = "/content/drive/MyDrive/MinorThesis/"
# dataset = "BioASQ"
# token_size = 512
# pretrain_model = 'biobert-base-uncased'

# # Train & Valid
# temppath_train = os.path.join(datapath,"datasets","raw",dataset,"train.tsv")
# temppath_valid = os.path.join(datapath,"datasets","raw",dataset,"dev.tsv")
# temppath_test = os.path.join(datapath,"datasets","raw",dataset,"test.tsv")
    
# df_train = pd.read_csv(temppath_train,sep="\t",header=None)
# df_valid = pd.read_csv(temppath_valid,sep="\t",header=None)
# df_test = pd.read_csv(temppath_test,sep="\t",header=None)

# df_train.columns = ["id","question","answer","label"]
# df_valid.columns = ["id","question","answer","label"]
# df_test.columns = ["id","question","answer","label"]

In [None]:
# df_train