Initialization Google Drive Configuration 

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


# 2. Downstream Model Generator

**Created By:**  Jirarote Jirasirikul

**Monash University (Melbourne) Australia** 

## Import Library

All Library and File Path will be added here

In [None]:
# Standard Library
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef
from sklearn.metrics import precision_recall_fscore_support,accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler    
from sklearn.model_selection import train_test_split

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from collections import defaultdict

import matplotlib.pyplot as plt

from datetime import datetime
import json 

import seaborn as sns
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler

In [None]:
# BERT Transformer Library
!pip install transformers
import transformers as ppb

Collecting transformers
  Downloading transformers-4.11.3-py3-none-any.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 59.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 40.5 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.4 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempti

## Check Available Device (CPU/GPU)

In [None]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    DEVICE_AVAILABLE = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    DEVICE_AVAILABLE = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


## Utilities Functions

In [None]:
# # MY GLOBAL FUNCTION - 

ENABLE_LOGS = 1
def print_log(*arg, log_type="Info"):
    global ENABLE_LOGS
    if(ENABLE_LOGS==1 or log_type!="Info"): 
        print("["+log_type+"]"," ".join(str(x) for x in arg))



---

##BERT Text Representation

Transform Language Model

When using BERT, technically we are transforming our sentence into a vector that represent each sentence. The process is call Language Model a representation of each word. 

BERT add [CLS] token infront of each sentence. This token representation vector could later be use for Classification as it contain the sentence representation.

### Class my_BERT

In [None]:
# BERT weight Options 
# - 'distilbert-base-uncased'
# - 'bert-base-uncased'
# - 'dmis-lab/biobert-base-cased-v1.1'
# - 'dmis-lab/biobert-v1.1' : Data Mining and Information Systems Lab, Korea University's picture Updated May 19 • 41k
# - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'

In [None]:
class my_BERT:
    ###### Load pretrain BERT Language Model transformer (Otherwise use 'set' to customize)
    model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

    # Load pretrained model/tokenizer
    bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    bert_tokenizer.add_special_tokens = True
    bert_model = model_class.from_pretrained(pretrained_weights)

    PRETRAIN_MAPPING = {'distilbert-base-uncased':'distilbert-base-uncased',
                        'bert-base-uncased':'bert-base-uncased',
                        'biobert-base-cased':'dmis-lab/biobert-base-cased-v1.1',
                        'biobert-base-uncased':'dmis-lab/biobert-v1.1',
                        'pubmedbert-base-uncased':'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'}

    def __init__(self, df_input,is_transform=False, ENABLE_LOGS = 1):
        ## INPUT STRUCTURE (COLUMNS): 
        ## - 'text' - Required
        ## - 'label' - Optional default name is 'label' otherwise need to specific when called

        if(is_transform):
            self.df = None
            self.df_BERT = df_input
        else:
            self.df = df_input
            self.df_BERT = None
        self.ENABLE_LOGS = ENABLE_LOGS
    
    def print_log(self, *arg, log_type="Info"):
        if(self.ENABLE_LOGS==1 or log_type!="Info"): 
            print("["+log_type+"]"," ".join(str(x) for x in arg))

    def bert_tokenize(self, token_length=128):
        df_output = self.df.copy()

        # BERT Tokenizer + truncate to BERT_MAX_LENGTH
        df_output['BERTTokens'] = df_output["text"].apply((lambda x: self.bert_tokenizer.encode(x, add_special_tokens=True,truncation=True,max_length = token_length)))
        # df_output['n_tokens0'] = df_output['BERTTokens'].apply(lambda x: len(x)) # Just for verification
        temp = df_output['BERTTokens'].apply(lambda x: len(x))
        self.print_log("Token - Done","( mean/max no. of token:",round(temp.mean()),temp.max(),")")

        # Padding tokens to BERT_MAX_LENGTH
        df_output['BERTTokens'] = df_output['BERTTokens'].apply(lambda x: x + [0]*(token_length-len(x)))
        # df_output['n_tokens'] = df_output['BERTTokens'].apply(lambda x: len(x)) # Just for verification
        self.print_log("Pad - Done")

        # BERT Mask
        df_output['BERTMasks'] = df_output['BERTTokens'].apply(lambda x: [np.where(i != 0, 1, 0) for i in x])
        # df_output['n_mask1'] = df_output['BERTMask'].apply(lambda x: sum(x)) # Just for verification
        self.print_log("Mask - Done")

        return df_output

    def run_bert_transform(self, dataloader, device_available = torch.device("cpu")):
        all_result = []

        self.bert_model.to(device_available)

        digit = len(str(len(dataloader)))-1 # Report progress

        for step, batch in enumerate(dataloader):
            if(step == 0 or (step+1)%(10**digit) == 0 or step == len(dataloader)-1): self.print_log("Step:",step+1,"/",len(dataloader))

            b_input_ids = batch[0].to(device_available)
            b_input_mask = batch[1].to(device_available)

            with torch.no_grad():
                last_hidden_states = self.bert_model(b_input_ids, attention_mask=b_input_mask)
        
            
            res_features = last_hidden_states[0][:,0,:].cpu().numpy()
            all_result.append(res_features)
        self.print_log("BERT transform - Done")

        return np.vstack(all_result)


    def bert_transform(self, device_available = torch.device("cpu"), batch_size = 32, token_length=128):
        df_output = self.bert_tokenize(token_length)

        # Convert to Tensor
        input_tokens = torch.tensor(np.stack(df_output['BERTTokens'].values))
        input_masks = torch.tensor(np.stack(df_output['BERTMasks'].values))
        # print(input_tokens,input_masks)

        # Create the DataLoader for our training set.
        input_data = TensorDataset(input_tokens, input_masks)
        input_sampler = SequentialSampler(input_data)
        input_dataloader = DataLoader(input_data, sampler=input_sampler, batch_size=batch_size)

        self.print_log("Running BERT Transform on", str(device_available))
        if(str(device_available) == 'cpu'):
            self.print_log("Running BERT on CPU can take longer time...",log_type="WARNING")
        self.print_log("BERT token length:",token_length)
        self.print_log("Data size:",str(len(input_tokens)), "( Total batch", str(len(input_dataloader)),'* size',str(batch_size),")")
        
        output_features = self.run_bert_transform(input_dataloader,device_available)
        df_output = pd.concat([df_output,pd.DataFrame(output_features.tolist()).add_prefix('feature_')],axis=1)
        
        self.print_log("BERT transformed", log_type="Success")
        self.df_BERT = df_output

    def get_features(self):
        if(isinstance(self.df_BERT, pd.DataFrame)):
            return np.array(self.df_BERT.filter(regex='feature_',axis=1).values)
            # return np.array([np.array(xi) for xi in self.df_BERT.BERT_Features.values])
        else:
            print_log("Please run function 'bert_transform' to generate text representation first!",log_type="Error")

    def get_labels(self, list_target = ['label']):
        return np.array(self.df_BERT[list_target].values.tolist())

    def get_current_bert_model(self):
        return self.bert_model.config._name_or_path

    def load_pretrain_bert(self, model_name='bert-base-uncased'):
        ## Want BERT instead of distilBERT? Uncomment the following line:
        self.model_class, self.tokenizer_class, self.pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, self.PRETRAIN_MAPPING[model_name])

        # Load pretrained model/tokenizer
        self.bert_tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)
        self.bert_model = self.model_class.from_pretrained(self.pretrained_weights)

    def get_features_df(self,additional_col=[]):
        if(isinstance(self.df_BERT, pd.DataFrame)):
            return pd.concat([self.df_BERT.filter(regex='feature_',axis=1),self.df_BERT[additional_col]], axis=1)
        else:
            print_log("Please run function 'bert_transform' to generate text representation first!",log_type="Error")

    ### BELOW is an EXTENSION

    def extract_hoc_label(self):
        try:
            LABEL_LIST = ['label_IM', 'label_ID', 'label_CE', 'label_RI', 'label_GS', 'label_GI', 'label_A', 'label_CD', 'label_PS', 'label_TPI']
            temp = self.df_BERT['label'].str.split(',').apply(lambda x: [int(i.split('_')[1]) for i in x])
            temp_df = pd.DataFrame(temp.tolist())
            temp_df.columns = LABEL_LIST
            self.df_BERT = pd.concat([self.df_BERT,temp_df], axis=1)
            print_log("Extract HoC label",log_type="Success")
            return self.df_BERT
        except:
            print_log("Something went wrong",log_type="Error")
        
    def class2idx_pubmedqa_label(self,col='label'):
        try:
            class2idx = {
                'no':0,
                'maybe':1,
                'yes':2
            }
            # idx2class = {v: k for k, v in class2idx.items()}
            self.df_BERT[col].replace(class2idx, inplace=True)
            print_log("class2idx_pubmedqa_label",log_type="Success")
            return self.df_BERT
        except:
            print_log("Something went wrong",log_type="Error")

    def class2idx_bioasq_label(self,col='label'):
        try:
            class2idx = {
                'no':0,
                'yes':1
            }
            # idx2class = {v: k for k, v in class2idx.items()}
            self.df_BERT[col].replace(class2idx, inplace=True)
            print_log("class2idx_bioasq_label",log_type="Success")
            return self.df_BERT
        except:
            print_log("Something went wrong",log_type="Error")
            

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




---


## Downstream Model

### Class: my_downstream

Dataset training hyperparameter in BLURB Paper
*   learning_rate = [1e-5, 3e-3, 5e-5]
*   batch_size = (16,32)
*   epoch_number = 2~60


In [None]:
class LinearLayer(torch.nn.Module):
    def __init__(self, D_in, D_out):
        super(LinearLayer, self).__init__()

        self.linear = torch.nn.Linear(D_in, D_out)
        self.dropout = torch.nn.Dropout(0.1)

    def forward(self, x):
        """
        In the forward function we accept a Tensor of input data and we must return
        a Tensor of output data. We can use Modules defined in the constructor as
        well as arbitrary operators on Tensors.
        """
        output = self.linear(x)
        # output = self.dropout(output)
        output = torch.sigmoid(output)
        return output

In [None]:
class MulticlassClassification(torch.nn.Module):
    def __init__(self, num_feature, num_class):
        super(MulticlassClassification, self).__init__()
        
        self.layer_1 = nn.Linear(num_feature, 512)
        self.layer_2 = nn.Linear(512, 128)
        self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, num_class) 
        
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)
        
    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        
        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        x = self.dropout(x)
        
        x = self.layer_out(x)
        
        return x

In [None]:
class my_downstream:
    def __init__(self, train_x, train_y, test_x = None, test_y = None, valid_x = None, valid_y = None, ENABLE_LOGS = 1):
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x
        self.test_y = test_y
        self.valid_x = valid_x
        self.valid_y = valid_y
        self.predict_x = None
        self.predict_y = None

        self.model = None
        self.best_model = None
        self.best_iter = None

        self.ENABLE_LOGS = ENABLE_LOGS


        self.accuracy_stats = {
                                  'train': [],
                                  "val": []
                              }
        self.loss_stats = {
                              'train': [],
                              "val": []
                          }

    def print_log(self, *arg, log_type="Info"):
        if(self.ENABLE_LOGS==1 or log_type!="Info"): 
            print("["+log_type+"]"," ".join(str(x) for x in arg))

    def train_logistic(self, n_iter=500, random_state=0):
        self.model = LogisticRegression(random_state=random_state,max_iter=n_iter)
        self.print_log("Train", self.model.__class__.__name__)
        self.print_log("iteration:",n_iter)
        self.print_log("random_state:",random_state)
        self.model.fit(self.train_x, self.train_y)
        self.print_log("Train Logistic Regression", log_type="Success")

    def train_multiclass_nn(self, D_in, n_classes, EPOCHS=500, learning_rate = 1e-5, batch_size = 32, CONT = 0):
        class_count = [i for i in get_class_distribution(self.train_y).values()]
        class_weights = 1./torch.tensor(class_count, dtype=torch.float) 
        # print(class_weights)
        
        
        self.model = MulticlassClassification(num_feature = D_in, num_class=n_classes).to(DEVICE_AVAILABLE)

        criterion = nn.CrossEntropyLoss(weight=class_weights.to(DEVICE_AVAILABLE))
        optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        # print(self.model)

        # Convert to Tensor
        input_x = torch.tensor(self.train_x).float()
        input_y = torch.tensor(self.train_y).long()
        valid_x = torch.tensor(self.valid_x).float()
        valid_y = torch.tensor(self.valid_y).long()

        # Create the DataLoader for our training set.
        input_data = TensorDataset(input_x, input_y)
        input_sampler = SequentialSampler(input_data)
        input_dataloader = DataLoader(input_data, sampler=input_sampler, batch_size=batch_size)
        self.print_log("Data size:",str(len(self.train_x)), "( Total batch", str(len(input_dataloader)),'* size',str(batch_size),")")
        valid_data = TensorDataset(valid_x, valid_y)
        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=batch_size)
        self.print_log("Data size:",str(len(self.valid_x)), "( Total batch", str(len(valid_dataloader)),'* size',str(batch_size),")")

        min_loss = 999999

        self.print_log("Begin training.")
        for e in tqdm(range(1, EPOCHS+1)):
            # TRAINING
            train_epoch_loss = 0
            train_epoch_acc = 0

            self.model.train()
            for X_train_batch, y_train_batch in input_dataloader:
                X_train_batch, y_train_batch = X_train_batch.to(DEVICE_AVAILABLE), y_train_batch.to(DEVICE_AVAILABLE)

                optimizer.zero_grad()
                
                y_train_pred = self.model(X_train_batch)
                train_loss = criterion(y_train_pred, y_train_batch.squeeze_())
                train_acc = my_evaluator.multi_acc(y_train_pred, y_train_batch)
                
                train_loss.backward()
                optimizer.step()
                
                train_epoch_loss += train_loss.item()
                train_epoch_acc += train_acc.item()

            # VALIDATION    
            with torch.no_grad():
        
                val_epoch_loss = 0
                val_epoch_acc = 0
                
                self.model.eval()
                for X_val_batch, y_val_batch in valid_dataloader:
                    X_val_batch, y_val_batch = X_val_batch.to(DEVICE_AVAILABLE), y_val_batch.to(DEVICE_AVAILABLE)
                    
                    y_val_pred = self.model(X_val_batch)
                                
                    val_loss = criterion(y_val_pred, y_val_batch.squeeze_())
                    val_acc = my_evaluator.multi_acc(y_val_pred, y_val_batch)
                    
                    val_epoch_loss += val_loss.item()
                    val_epoch_acc += val_acc.item()
            self.loss_stats['train'].append(train_epoch_loss/len(input_dataloader))
            self.loss_stats['val'].append(val_epoch_loss/len(valid_dataloader))
            self.accuracy_stats['train'].append(train_epoch_acc/len(input_dataloader))
            self.accuracy_stats['val'].append(val_epoch_acc/len(valid_dataloader))     
    
            if(min_loss > self.loss_stats['val'][-1]):
                self.print_log(f'Epoch {e+0:03}: SAVE')
                self.best_model = self.model
                self.best_iter = e+1
                min_loss = self.loss_stats['val'][-1]

            # print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(input_dataloader):.5f} | Val Loss: {val_epoch_loss/len(valid_dataloader):.5f} | Train Acc: {train_epoch_acc/len(input_dataloader):.3f}| Val Acc: {val_epoch_acc/len(valid_dataloader):.3f}')

    def train_linear_nn(self, D_in, n_classes, n_iter=500, learning_rate = 1e-5, batch_size = 32, CONT = 0):
        list_train_loss = []
        list_valid_loss = []
      
        # Convert to Tensor
        input_x = torch.tensor(self.train_x).float()
        input_y = torch.tensor(self.train_y).float()

        # Create the DataLoader for our training set.
        input_data = TensorDataset(input_x, input_y)
        input_sampler = SequentialSampler(input_data)
        input_dataloader = DataLoader(input_data, sampler=input_sampler, batch_size=batch_size)
        self.print_log("Data size:",str(len(self.train_x)), "( Total batch", str(len(input_dataloader)),'* size',str(batch_size),")")
        
        if(CONT == 0):
            self.model = LinearLayer(D_in, n_classes).to(DEVICE_AVAILABLE)
        self.print_log("Train", self.model.__class__.__name__)
        self.print_log("iteration:",n_iter)

        criterion = torch.nn.BCELoss()
        optimizer = torch.optim.Adam(self.model.parameters(), lr=learning_rate)

        min_loss = 999999

        digit_iter = len(str(n_iter))-1 # Report progress
        for i in range(n_iter):
            digit = len(str(len(input_dataloader)))-1 # Report progress
            for step, batch in enumerate(input_dataloader):
                
                x_input = batch[0].to(DEVICE_AVAILABLE)
                y_input = batch[1].to(DEVICE_AVAILABLE)
                y_pred = self.model(x_input).squeeze()

                loss = criterion(y_pred,y_input)

                # if(step == 0 or (step+1)%(10**digit) == 0 or step == len(input_dataloader)-1): 
                #     self.print_log("Step:",step+1,"/",len(input_dataloader),":",loss.item())

                optimizer.zero_grad()
                loss.backward()
                optimizer.step() 

            # Train
            x_train = torch.tensor(self.train_x).float().to(DEVICE_AVAILABLE)
            y_train = torch.tensor(self.train_y).float().to(DEVICE_AVAILABLE)

            with torch.no_grad():
                y_vpred_train = self.model(x_train).squeeze()

            iter_loss_train = criterion(y_vpred_train, y_train)
            list_train_loss.append(iter_loss_train.item())

            # Valid
            x_valid = torch.tensor(self.valid_x).float().to(DEVICE_AVAILABLE)
            y_valid = torch.tensor(self.valid_y).float().to(DEVICE_AVAILABLE)

            with torch.no_grad():
                y_vpred_valid = self.model(x_valid).squeeze()

            iter_loss_valid = criterion(y_vpred_valid, y_valid)
            list_valid_loss.append(iter_loss_valid.item())

            if(min_loss > iter_loss_valid.item()):
                self.print_log("Iteration:",i+1,"/",n_iter,":",'train',iter_loss_train.item(),'valid',iter_loss_valid.item())
                self.best_model = self.model
                self.best_iter = i+1
                min_loss = iter_loss_valid.item()
            # else:
            #     break
        
        self.print_log("Train Linear NN", log_type="Success")   
        return list_train_loss, list_valid_loss   

    def get_model_name(self):
        if(self.model is None):
            return "None"
        else:
            return self.model.__class__.__name__

    def predict(self, test_x = None):
        # Train Data
        self.predict_x = self.test_x if test_x is None else test_x
        if(self.predict_x is None): return print_log("No test data", log_type="Error")

        # Model
        if(self.model is None): 
            print_log("Predict","NULL MODEL", log_type="WARNING")
            self.predict_y = [0]*len(self.predict_x)
        elif(self.get_model_name() == 'LogisticRegression'):
            print_log("Predict",self.get_model_name())
            self.predict_y = self.model.predict(self.predict_x)
        elif(self.get_model_name() == 'LinearLayer'):
            print_log("Predict",self.get_model_name())
            input = torch.tensor(self.predict_x).float().to(DEVICE_AVAILABLE)

            with torch.no_grad():
                self.predict_y = self.best_model(input).squeeze().round().cpu().detach().numpy()
        elif(self.get_model_name() == 'MulticlassClassification'):
            print_log("Predict",self.get_model_name())
            # Convert to Tensor
            input_x = torch.tensor(self.predict_x).float()
            input_y = torch.tensor(self.test_y).long()

            # Create the DataLoader for our training set.
            input_data = TensorDataset(input_x, input_y)
            input_sampler = SequentialSampler(input_data)
            input_dataloader = DataLoader(input_data, sampler=input_sampler, batch_size=1)
            self.print_log("Data size:",str(len(self.train_x)), "( Total batch", str(len(input_dataloader)),'* size',str(1),")")

            y_pred_list = []
            with torch.no_grad():
                self.best_model.eval()
                for X_batch, _ in input_dataloader:
                    X_batch = X_batch.to(DEVICE_AVAILABLE)
                    y_test_pred = self.best_model(X_batch)
                    _, y_pred_tags = torch.max(y_test_pred, dim = 1)
                    y_pred_list.append(y_pred_tags.cpu().numpy())
            self.predict_y = [a.squeeze().tolist() for a in y_pred_list]
        else:
            print_log("Predict",self.get_model_name())
            print_log("something wrong with prediction function", log_type="ERROR")

        return self.predict_x,self.predict_y

    def __str__(self):
        return self.get_model_name()

    def load_linear_nn(self, D_in, n_classes, model_path):
        self.model = LinearLayer(D_in, n_classes).to(DEVICE_AVAILABLE)
        self.model.load_state_dict(torch.load(model_path, map_location=DEVICE_AVAILABLE))
        self.best_model = self.model

    def load_multiclass_nn(self, D_in, n_classes, model_path):
        self.model = MulticlassClassification(num_feature = D_in, num_class=n_classes).to(DEVICE_AVAILABLE)
        self.model.load_state_dict(torch.load(model_path, map_location=DEVICE_AVAILABLE))
        self.best_model = self.model


In [None]:
# labels = ['label_IM', 'label_ID', 'label_CE', 'label_RI', 'label_GS', 'label_GI', 'label_A', 'label_CD', 'label_PS', 'label_TPI']
# learning_rate = 1e-5
# batch_size = 32

# print_log("Training Downstream Model")
# model = my_downstream(bert_train.get_features(),bert_train.get_labels(labels),
#                           bert_test.get_features(),bert_test.get_labels(labels),
#                           bert_valid.get_features(),bert_valid.get_labels(labels))

# train_loss, valid_loss = model.train_linear_nn(D_in = 768, 
#                           n_classes = 1 if type(labels) == str else len(labels), 
#                           n_iter=100, 
#                           learning_rate = learning_rate, 
#                           batch_size = batch_size)
  
# _,predict_y = model.predict()

### Class: my_evaluator

In [None]:
class my_evaluator:
    LABELS_HOC_FULL = ['activating invasion and metastasis', 'avoiding immune destruction',
                  'cellular energetics', 'enabling replicative immortality', 'evading growth suppressors',
                  'genomic instability and mutation', 'inducing angiogenesis', 'resisting cell death',
                  'sustaining proliferative signaling', 'tumor promoting inflammation']
    LABELS_HOC_SHORT = ['label_IM', 'label_ID', 
                       'label_CE', 'label_RI', 'label_GS', 
                       'label_GI', 'label_A', 'label_CD', 
                       'label_PS', 'label_TPI']
    @classmethod
    def divide(self, x, y):
        return np.true_divide(x, y, out=np.zeros_like(x, dtype=np.float), where=y != 0)

    @classmethod
    def get_p_r_f_arrary(self, test_predict_label, test_true_label):
        num, cat = test_predict_label.shape
        # print(num,cat)
        acc_list = []
        prc_list = []
        rec_list = []
        f_score_list = []
        for i in range(num):
            # print(test_predict_label[i])
            # print(test_true_label[i])

            acc = accuracy_score(test_true_label[i], test_predict_label[i])
            prc,rec,f_score,_ = precision_recall_fscore_support(test_true_label[i], test_predict_label[i], average='macro')

            if prc == 0 and rec == 0:
                f_score = 0
            else:
                f_score = 2 * prc * rec / (prc + rec)

            acc_list.append(acc)
            prc_list.append(prc)
            rec_list.append(rec)
            f_score_list.append(f_score)

        # print(prc_list)
        # print(rec_list)

        mean_prc = np.mean(prc_list)
        mean_rec = np.mean(rec_list)
        f_score = self.divide(2 * mean_prc * mean_rec, (mean_prc + mean_rec))
        return mean_prc, mean_rec, f_score

    @classmethod
    def hoc_sentence2doc(self, input_df):
        # Output variables
        data = {}
        input_labels_count = dict(zip(self.LABELS_HOC_FULL, [0]*len(self.LABELS_HOC_FULL))) # sentence

        # Group sentence back into documents      
        for i in range(len(input_df)):
            input_row = input_df.iloc[i]
            
            key = input_row['index'][:input_row['index'].find('_')]

            if key not in data:
                data[key] = set()

            if not pd.isna(input_row['labels']):
                for l in input_row['labels'].split(','):
                    ind,val = l.split('_')
                    if(val != '0'):
                        data[key].add(self.LABELS_HOC_FULL[int(ind)])
                        input_labels_count[self.LABELS_HOC_FULL[int(ind)]] += 1

        return data, input_labels_count

    @classmethod
    def hoc_labels2np(self, data):
        labels_list = dict(zip(self.LABELS_HOC_FULL, [[],[],[],[],[],[],[],[],[],[]]))

        y_np = []

        for k, v in data.items():
            # print(k)
            # print(true,pred)
            t = [0] * len(self.LABELS_HOC_FULL)
            for i in v:
                t[self.LABELS_HOC_FULL.index(i)] = 1

            y_np.append(t)

            for lab in self.LABELS_HOC_FULL:
                if(lab in v):
                    labels_list[lab].append(1)
                else:
                    labels_list[lab].append(0)

        return np.array(y_np),labels_list

    @classmethod
    def analysis_hoc(self, input_df):
        # Reformat to Paper evaluator format
        input_df = input_df[['filename_line','label']].copy()
        input_df.columns = ['index','labels']

        data, labels_counts_sen = self.hoc_sentence2doc(input_df)

        print_log('HoC Dataset Details')
        print_log('No. of Documents:',len(data))
        print_log('No. of Sentences:',len(input_df))
        # print(labels_counts_sen)

        y_np, labels_counts_doc = self.hoc_labels2np(data)
        # print(y_np)
        # print(labels_counts_doc)
        res = pd.DataFrame()
        for lab in self.LABELS_HOC_FULL:
            print_log(lab,sum(labels_counts_doc[lab]),len(labels_counts_doc[lab]))
            temp_df = pd.DataFrame([[sum(labels_counts_doc[lab]),len(labels_counts_doc[lab])]], columns=['sum','len'], index=[lab])
            res = res.append(temp_df)

        return res

    @classmethod
    def eval_hoc(self, input_df):   
        print_log("eval hoc",log_type="Function")
    
        # Reformat to Paper evaluator format
        ## Label need to be in a format of list of 10 cancers in fixed order
        true_df = input_df[['filename_line','label']].copy()
        pred_df = input_df[['filename_line','prediction']].copy()
        true_df.columns = ['index','labels']
        pred_df.columns = ['index','labels']

        # Group sentence back into documents 
        data_true, true_labels_count_sen = self.hoc_sentence2doc(true_df)
        data_pred, pred_labels_count_sen = self.hoc_sentence2doc(pred_df)

        # merge data_true/pred into format of {'key':(set(true),set(pred))}
        assert data_true.keys() == data_pred.keys(), 'Key mismatch'
        all_keys = set(data_true.keys()).union(data_pred.keys()) 

        data = {}
        for k in all_keys:
            data[k] = (data_true[k],data_pred[k]) 
        # print(data)
        assert len(data) == 371, 'There are 371 documents in the test set: %d' % len(data)
        
        print_log('HoC Dataset Details')
        print_log('No. of Documents:',len(data))
        print_log('No. of Sentences:',len(true_df),'/',len(pred_df))
        print(true_labels_count_sen,pred_labels_count_sen)

        # Write into dataframe
        res_count_sen = pd.DataFrame()
        for lab in self.LABELS_HOC_FULL:
            temp_df = pd.DataFrame([[true_labels_count_sen[lab],pred_labels_count_sen[lab],len(true_df)]], columns=['sentence_count_label','sentence_count_pred','sentence_count_total'], index=[lab])
            res_count_sen = res_count_sen.append(temp_df)
        # print(res_count)

        y_test, true_labels_count_doc = self.hoc_labels2np(data_true)
        y_pred, pred_labels_count_doc = self.hoc_labels2np(data_pred)
        
        res_count_doc = pd.DataFrame()
        for lab in self.LABELS_HOC_FULL:
            temp_df = pd.DataFrame([[sum(true_labels_count_doc[lab]),sum(pred_labels_count_doc[lab]),len(true_labels_count_doc[lab])]], columns=['doc_count_label','doc_count_pred','doc_count_total'], index=[lab])
            res_count_doc = res_count_doc.append(temp_df)
        res_confmat = pd.DataFrame(columns=['tn','fp','fn','tp'])

        # print(true_labels_list,pred_labels_list)
        for lab in self.LABELS_HOC_FULL:
            # print(lab)
            df2 = pd.DataFrame([list(confusion_matrix(true_labels_count_doc[lab],pred_labels_count_doc[lab]).ravel())], columns=['tn','fp','fn','tp'], index=[lab])
            res_confmat = res_confmat.append(df2)
        # print(res_confmat)
        df_res = pd.concat([res_confmat,res_count_sen,res_count_doc], axis=1)
        print(df_res)

        r, p, f1 = self.get_p_r_f_arrary(y_pred, y_test)
        print('Precision: {:.6f}'.format(p))
        print('Recall   : {:.6f}'.format(r))
        print('F1       : {:.6f}'.format(f1))
        return float(r), float(p), float(f1) , df_res

    @classmethod
    def multi_acc(self, y_pred, y_test):
        y_pred_softmax = torch.log_softmax(y_pred, dim = 1)
        _, y_pred_tags = torch.max(y_pred_softmax, dim = 1)    
        
        correct_pred = (y_pred_tags == y_test).float()
        acc = correct_pred.sum() / len(correct_pred)
        
        acc = torch.round(acc * 100)
        
        return acc

    @classmethod
    def eval_pubmedqa(self, input_df):
        class2idx = {
                'no':0,
                'maybe':1,
                'yes':2
        }
        idx2class = {v: k for k, v in class2idx.items()}
        input_df['label'].replace(idx2class, inplace=True)
        input_df['prediction'].replace(idx2class, inplace=True)

        confusion_matrix_df = pd.DataFrame(confusion_matrix(input_df.label.values, input_df.prediction.values))
        class_report = classification_report(input_df.label.values, input_df.prediction.values, digits=4,output_dict = True)
        print(class_report)
        # sns.heatmap(confusion_matrix_df, annot=True)
        return class_report , confusion_matrix_df

    @classmethod
    def eval_bioasq(self, input_df):
        class2idx = {
                'no':0,
                'yes':1
        }
        idx2class = {v: k for k, v in class2idx.items()}
        input_df['label'].replace(idx2class, inplace=True)
        input_df['prediction'].replace(idx2class, inplace=True)

        confusion_matrix_df = pd.DataFrame(confusion_matrix(input_df.label.values, input_df.prediction.values))
        class_report = classification_report(input_df.label.values, input_df.prediction.values, digits = 4,output_dict = True)
        print(class_report)
        sns.heatmap(confusion_matrix_df, annot=True)


        acc = accuracy_score(input_df.label.values, input_df.prediction.values)
        prc,rec,f_score,_ = precision_recall_fscore_support(input_df.label.values, input_df.prediction.values, average='macro')

        print("acc",acc)
        print("prc",prc)
        print("rec",rec)
        print("f_score",f_score)

        return class_report , confusion_matrix_df

# eval_hoc(temp_df)

In [None]:
# temp_df = bert_test.df_BERT.copy()
# temp_df['prediction'] = pd.Series(map(lambda x: [str(i)+"_"+str(int(x[i])) for i in range(len(x))], predict_y))
# temp_df['prediction'] = temp_df['prediction'].apply(lambda x: ','.join(x))
# r, p, f1 = my_evaluator.eval_hoc(temp_df)



---



## Model Generator

### Fine-tuned Hall-of-Cancer (HoC)

#### Assisting function

In [None]:
def text_dependent(df_input, shift_level=0):
    temp_df = df_input.copy()
    new = temp_df['filename_line'].str.split("_", n = 1, expand = True)
    # making separate first name column from new data frame
    temp_df["filename"]= new[0]
    # making separate last name column from new data frame
    temp_df["sentence"]= new[1]
    if(shift_level==1):
        df_input['text'] = temp_df.groupby('filename').text.apply(lambda x: x.shift(1).fillna('')+' '+ x).str.strip()
    elif(shift_level==2):
        df_input['text'] = temp_df.groupby('filename').text.apply(lambda x: x.shift(2).fillna('')+' '+ x.shift(1).fillna('')+' '+ x).str.strip()
    elif(shift_level==3):
        df_input['text'] = temp_df.groupby('filename').text.apply(lambda x: x.shift(3).fillna('')+' '+ x.shift(2).fillna('')+' '+ x.shift(1).fillna('')+' '+ x).str.strip()
    return df_input

In [None]:
def transform_dataset(DATAPATH,DATASET,PRETRAIN_MODEL='bert-base-uncased',TOKEN_SIZE=128, SHIFT_LEVEL=None, FORCE=False):
    print_log("Try loading data from cache")
    temp_path = os.path.join(DATAPATH,"datasets","transformed",DATASET,PRETRAIN_MODEL)
    if(SHIFT_LEVEL == None or SHIFT_LEVEL == 0):
        temp_path = os.path.join(temp_path,"token_length_"+str(TOKEN_SIZE))
    else:
        temp_path = os.path.join(temp_path,"token_length_"+str(TOKEN_SIZE)+"_shift_"+str(SHIFT_LEVEL))

    temppath_train = os.path.join(temp_path,"train.csv")
    temppath_valid = os.path.join(temp_path,"valid.csv")
    temppath_test = os.path.join(temp_path,"test.csv")
    
    print_log("Train file exist:",os.path.isfile(temppath_train),"(",temppath_train,")")
    print_log("Valid file exist:",os.path.isfile(temppath_valid),"(",temppath_valid,")")
    print_log("Test file exist:",os.path.isfile(temppath_test),"(",temppath_test,")")
    
    if((os.path.isfile(temppath_train) and os.path.isfile(temppath_valid) and os.path.isfile(temppath_valid)) and not FORCE):
        df_train = pd.read_csv(temppath_train).iloc[:, 1:]
        df_valid = pd.read_csv(temppath_valid).iloc[:, 1:]
        df_test = pd.read_csv(temppath_test).iloc[:, 1:]
        bert_train = my_BERT(df_train, is_transform=True)
        bert_test = my_BERT(df_test, is_transform=True)
        bert_valid = my_BERT(df_valid, is_transform=True)
        print_log("Load Data From Existing",log_type="Success")
        return bert_train, bert_test, bert_valid

    print_log("Transform Data","FORCE" if FORCE else "")
    temp_path = os.path.join(DATAPATH,"datasets","raw",DATASET)
    temppath_train = os.path.join(temp_path,"train.tsv")
    temppath_valid = os.path.join(temp_path,"dev.tsv")
    temppath_test = os.path.join(temp_path,"test.tsv")

    df_train = pd.read_csv(temppath_train, sep='\t')
    df_test = pd.read_csv(temppath_test, sep='\t')
    df_valid = pd.read_csv(temppath_valid, sep='\t')

    # TO DO : Modify this if not HoC
    df_train.columns = ['label','text','filename_line']
    df_test.columns = ['label','text','filename_line']
    df_valid.columns = ['label','text','filename_line']

    if(SHIFT_LEVEL != None):
        df_train = text_dependent(df_train,SHIFT_LEVEL)
        df_test = text_dependent(df_test,SHIFT_LEVEL)
        df_valid = text_dependent(df_valid,SHIFT_LEVEL)

    bert_train = my_BERT(df_train)
    bert_test = my_BERT(df_test)
    bert_valid = my_BERT(df_valid)

    bert_train.load_pretrain_bert(PRETRAIN_MODEL)
    bert_test.load_pretrain_bert(PRETRAIN_MODEL)
    bert_valid.load_pretrain_bert(PRETRAIN_MODEL)

    print_log("BERTTransform: Train Data")
    bert_train.bert_transform(DEVICE_AVAILABLE, token_length=TOKEN_SIZE)
    print_log("BERTTransform: Test Data")
    bert_test.bert_transform(DEVICE_AVAILABLE, token_length=TOKEN_SIZE)
    print_log("BERTTransform: Valid Data")
    bert_valid.bert_transform(DEVICE_AVAILABLE, token_length=TOKEN_SIZE)

    if(SHIFT_LEVEL == None):
        temp_path = os.path.join(DATAPATH,"datasets","transformed",DATASET,PRETRAIN_MODEL,"token_length_"+str(TOKEN_SIZE))
    else:
        temp_path = os.path.join(DATAPATH,"datasets","transformed",DATASET,PRETRAIN_MODEL,"token_length_"+str(TOKEN_SIZE)+"_shift_"+str(SHIFT_LEVEL))
    Path(os.path.join(temp_path)).mkdir(parents=True, exist_ok=True)

    temp_df = bert_train.get_features_df(['filename_line','label'])
    temp_df.to_csv(os.path.join(temp_path,"train.csv"))
    print_log(len(df_train),"/",len(temp_df))
    temp_df = bert_test.get_features_df(['filename_line','label'])
    temp_df.to_csv(os.path.join(temp_path,"test.csv"))
    print_log(len(df_test),"/",len(temp_df))
    temp_df = bert_valid.get_features_df(['filename_line','label'])
    temp_df.to_csv(os.path.join(temp_path,"valid.csv"))
    print_log(len(df_valid),"/",len(temp_df))

    return bert_train, bert_test, bert_valid

# transform_dataset(DATAPATH = "/content/drive/MyDrive/MinorThesis/",
#     DATASET = "HoC",
#     TOKEN_SIZE = 128,
#     PRETRAIN_MODEL = 'biobert-base-uncased',
#     SHIFT_LEVEL = None,
#     FORCE = True)

In [None]:
def train_linear_model(BERT_TRAIN, BERT_TEST, BERT_VALID, LABELS=['label'], LEARNING_RATE = 1e-5, BATCH_SIZE = 32, N_ITER=3000):
    print_log("learning_rate:",LEARNING_RATE)
    print_log("batch_size",BATCH_SIZE)
    print_log("N_ITER",N_ITER)
    print_log("Training Downstream Model")

    model = my_downstream(BERT_TRAIN.get_features(),BERT_TRAIN.get_labels(LABELS),
                          BERT_TEST.get_features(),BERT_TEST.get_labels(LABELS),
                          BERT_VALID.get_features(),BERT_VALID.get_labels(LABELS))
    train_loss, valid_loss = model.train_linear_nn(D_in = 768, 
                          n_classes = 1 if type(LABELS) == str else len(LABELS), 
                          n_iter=N_ITER, 
                          learning_rate = LEARNING_RATE, 
                          batch_size = BATCH_SIZE)
    return model, train_loss, valid_loss

#### Execution

In [None]:
# HoC
def run_dsgenerator(DATAPATH, DATASET, LABELS=['label'], PRETRAIN_MODEL = 'bert-base-uncased', TOKEN_SIZE = 128, SHIFT_LEVEL = None , LEARNING_RATE = 1e-5, BATCH_SIZE = 32, N_ITER=3000):
    print_log("Run Generator","Function")
    
    # UNIQUE IDENTIFIER USING
    sttime = datetime.now().strftime('%Y%m%d_%H-%M-%S')

    bert_train, bert_test, bert_valid = transform_dataset(DATAPATH = DATAPATH,
                                                          DATASET = DATASET,
                                                          PRETRAIN_MODEL = PRETRAIN_MODEL,
                                                          SHIFT_LEVEL = SHIFT_LEVEL,
                                                          TOKEN_SIZE = TOKEN_SIZE)

    bert_train.extract_hoc_label()
    bert_test.extract_hoc_label()
    bert_valid.extract_hoc_label()

    # TRAINING
    model, train_loss, valid_loss = train_linear_model(bert_train, bert_test, bert_valid,
                               LABELS, LEARNING_RATE, BATCH_SIZE, N_ITER)
    # print_log("learning_rate:",LEARNING_RATE)
    # print_log("batch_size",BATCH_SIZE)
    # print_log("Training Downstream Model")
    # model = my_downstream(bert_train.get_features(),bert_train.get_labels(LABELS),
    #                       bert_test.get_features(),bert_test.get_labels(LABELS),
    #                       bert_valid.get_features(),bert_valid.get_labels(LABELS))
    # train_loss, valid_loss = model.train_linear_nn(D_in = 768, 
    #                       n_classes = 1 if type(LABELS) == str else len(LABELS), 
    #                       n_iter=N_ITER, 
    #                       learning_rate = LEARNING_RATE, 
    #                       batch_size = BATCH_SIZE)
  
    _,predict_y = model.predict()

    temp_df = bert_test.df_BERT.copy()
    temp_df['prediction'] = pd.Series(map(lambda x: [str(i)+"_"+str(int(x[i])) for i in range(len(x))], predict_y))
    temp_df['prediction'] = temp_df['prediction'].apply(lambda x: ','.join(x))
    r, p, f1, _ = my_evaluator.eval_hoc(temp_df)

    result = {}
    result['dataset'] = DATASET
    result['labels'] = LABELS
    
    result['pretrain_model'] = PRETRAIN_MODEL + '-tks' + str(TOKEN_SIZE)
    result['downstream_model'] = model.get_model_name()
    result['downstream_model_savepoint'] = "model_"+sttime+".pt"
    
    result['recall'] = r
    result['precision'] = p
    result['f1score'] = f1

    result['best_iter'] = model.best_iter
    
    result['SHIFT_LEVEL'] = SHIFT_LEVEL
    result['LEARNING_RATE'] = LEARNING_RATE
    result['BATCH_SIZE'] = BATCH_SIZE
    
    # result['hyper_param'] = hp 
    # result['predict_y'] = predict_y

    # SAVING SECTION
    temp_path_model = os.path.join(DATAPATH,"models",DATASET,PRETRAIN_MODEL)
    temp_path_result = os.path.join(DATAPATH,"results",DATASET,PRETRAIN_MODEL)

    Path(temp_path_model).mkdir(parents=True, exist_ok=True)
    Path(temp_path_result).mkdir(parents=True, exist_ok=True)

    # SAVE MODEL
    torch.save(model.best_model.state_dict(), os.path.join(temp_path_model,"model_"+sttime+".pt"))

    res_df = pd.DataFrame(result.items(), columns=['key', 'result']).set_index('key')
    res_df.to_json(os.path.join(temp_path_result,"result_"+sttime+".json"))

    # SAVE PLOT
    plt.plot(train_loss, label="train")
    plt.plot(valid_loss, label="valid")
    plt.title(PRETRAIN_MODEL+"-"+DATASET+"-"+str(TOKEN_SIZE))
    plt.suptitle(str(BATCH_SIZE)+"-"+str(LEARNING_RATE))
    plt.ylabel('loss')
    plt.legend()
    plt.savefig(os.path.join(temp_path_result,"result_"+sttime+".png"))

    print_log("",log_type="----------")
    # return predict_y
    return result
    

In [None]:
# list_cancer = ['label_IM', 'label_ID', 'label_CE', 'label_RI', 'label_GS', 'label_GI', 'label_A', 'label_CD', 'label_PS', 'label_TPI']
    
# TARGET_LABEL = list_cancer
# TARGET_DATASET = "HoC" # "dat_hoc","dat_semi"
# TARGET_PATH = '/content/drive/MyDrive/MinorThesis/'
# PRETRAIN_MODEL = 'bert-base-uncased'
# N_ITER=1000

# BATCH_SIZE = 16
# LEARNING_RATE = 5e-5
# SHIFT_LEVEL=None

# # for i in range(10):
# res = run_dsgenerator(TARGET_PATH,TARGET_DATASET,TARGET_LABEL,
#                             TOKEN_SIZE=512, PRETRAIN_MODEL=PRETRAIN_MODEL, 
#                             LEARNING_RATE = LEARNING_RATE, BATCH_SIZE = BATCH_SIZE,
#                             N_ITER = N_ITER, SHIFT_LEVEL=SHIFT_LEVEL)
# res

In [None]:
# # # For test in Colab Only
# DATAPATH = "/content/drive/MyDrive/MinorThesis/"
# DATASET = "HoC"
# TOKEN_SIZE = 128
# PRETRAIN_MODEL = 'bert-base-uncased'
# SHIFT_LEVEL = None
# IS_TRANSFORM = True

# temp_path = os.path.join(DATAPATH,"datasets","transformed",DATASET,PRETRAIN_MODEL);
# if(SHIFT_LEVEL == None):
#     temp_path = os.path.join(temp_path,"token_length_"+str(TOKEN_SIZE))
# else:
#     temp_path = os.path.join(temp_path,"token_length_"+str(TOKEN_SIZE)+"_shift_"+str(SHIFT_LEVEL))

# temppath_train = os.path.join(temp_path,"train.csv")
# temppath_valid = os.path.join(temp_path,"valid.csv")
# temppath_test = os.path.join(temp_path,"test.csv")

# print_log("Train file exist:",os.path.isfile(temppath_train),"(",temppath_train,")")
# print_log("Valid file exist:",os.path.isfile(temppath_valid),"(",temppath_valid,")")
# print_log("Test file exist:",os.path.isfile(temppath_test),"(",temppath_test,")")

# df_train = pd.read_csv(temppath_train).iloc[:, 1:]
# df_valid = pd.read_csv(temppath_valid).iloc[:, 1:]
# df_test = pd.read_csv(temppath_test).iloc[:, 1:]

# bert_train = my_BERT(extract_hoc_label(df_train),is_transform=IS_TRANSFORM)
# bert_valid = my_BERT(extract_hoc_label(df_valid),is_transform=IS_TRANSFORM)
# bert_test = my_BERT(extract_hoc_label(df_test),is_transform=IS_TRANSFORM)

In [None]:
# BATCH_SIZE = 32
# LEARNING_RATE = 5e-5
# SHIFT_LEVEL=3

# res = run_dsgenerator(TARGET_PATH,TARGET_DATASET,TARGET_LABEL,
#                     TOKEN_SIZE=512, PRETRAIN_MODEL=PRETRAIN_MODEL, 
#                     LEARNING_RATE = LEARNING_RATE, BATCH_SIZE = BATCH_SIZE,
#                     N_ITER = N_ITER, SHIFT_LEVEL=SHIFT_LEVEL)
# res

### Fine-tuned PubMedQA

#### Assisting function

In [None]:
# Extract Fold ID (test,train)

def get_pubmedqa_fold_id(DATAPATH,DATASET):
    list_data_fold = []
    for i in range(10):
        temppath_train = os.path.join(DATAPATH,"datasets","raw",DATASET,"pqal_fold"+str(i),"train_set.json")
        temppath_valid = os.path.join(DATAPATH,"datasets","raw",DATASET,"pqal_fold"+str(i),"dev_set.json")
        
        df_temp_train = pd.read_json(temppath_train).transpose().reset_index()
        df_temp_valid = pd.read_json(temppath_valid).transpose().reset_index()

        list_data_fold.append((df_temp_train['index'].values,df_temp_valid['index'].values))
        # list_data_fold.append((df_temp_train,df_temp_valid))
        # print(df_temp_train.shape,df_temp_valid.shape)
    return list_data_fold

In [None]:
def get_class_distribution(obj):
    count_dict = {
        "rating_no": 0,
        "rating_maybe": 0,
        "rating_yes": 0,
    }
    
    for i in obj:
        if i == 0: 
            count_dict['rating_no'] += 1
        elif i == 1: 
            count_dict['rating_maybe'] += 1
        elif i == 2: 
            count_dict['rating_yes'] += 1             
        else:
            print("Check classes.")
            
    return count_dict

In [None]:
def pubmedqa_train_linear_model(BERT_TRAIN, BERT_TEST, FOLDS_IDX, FOLD_I=0, LABELS=['label'], LEARNING_RATE = 1e-5, BATCH_SIZE = 32, N_ITER=3000):
    print_log("learning_rate:",LEARNING_RATE)
    print_log("batch_size",BATCH_SIZE)
    print_log("N_ITER",N_ITER)
    print_log("Training Downstream Model")

    df_temp = BERT_TRAIN.df_BERT.copy()
    df_train = df_temp[df_temp.id.isin(FOLDS_IDX[FOLD_I][0])]
    df_valid = df_temp[df_temp.id.isin(FOLDS_IDX[FOLD_I][1])]

    bert_train = my_BERT(df_train, is_transform=True)
    bert_valid = my_BERT(df_valid, is_transform=True)

    # print(np.array(df_train['label']))
    # fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(25,7))
    # # Train
    # sns.barplot(data = pd.DataFrame.from_dict([get_class_distribution(np.array(df_train['label']))]).melt(), x = "variable", y="value", hue="variable",  ax=axes[0]).set_title('Class Distribution in Train Set')
    # # Validation
    # sns.barplot(data = pd.DataFrame.from_dict([get_class_distribution(np.array(df_valid['label']))]).melt(), x = "variable", y="value", hue="variable",  ax=axes[1]).set_title('Class Distribution in Val Set')
    # # # Test
    # # sns.barplot(data = pd.DataFrame.from_dict([get_class_distribution(y_test)]).melt(), x = "variable", y="value", hue="variable",  ax=axes[2]).set_title('Class Distribution in Test Set')

    model = my_downstream(bert_train.get_features(),bert_train.get_labels(LABELS),
                          BERT_TEST.get_features(),BERT_TEST.get_labels(LABELS),
                          bert_valid.get_features(),bert_valid.get_labels(LABELS))
    model.train_multiclass_nn(D_in = 768, 
                              n_classes = 3, 
                              EPOCHS=N_ITER, 
                              learning_rate = LEARNING_RATE, 
                              batch_size = BATCH_SIZE)

    # Create dataframes
    train_val_acc_df = pd.DataFrame.from_dict(model.accuracy_stats).reset_index().melt(id_vars=['index']).rename(columns={"index":"epochs"})
    train_val_loss_df = pd.DataFrame.from_dict(model.loss_stats).reset_index().melt(id_vars=['index']).rename(columns={"index":"epochs"})
    # Plot the dataframes
    fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,7))
    sns.lineplot(data=train_val_acc_df, x = "epochs", y="value", hue="variable",  ax=axes[0]).set_title('Train-Val Accuracy/Epoch')
    sns.lineplot(data=train_val_loss_df, x = "epochs", y="value", hue="variable", ax=axes[1]).set_title('Train-Val Loss/Epoch')

    return model

In [None]:
def pubmedqa_transform_dataset(DATAPATH,DATASET,PRETRAIN_MODEL='bert-base-uncased',TOKEN_SIZE=128, SHIFT_LEVEL=None, FORCE=False, REASONING=False):
    print_log("Try loading data from cache")
    temp_path = os.path.join(DATAPATH,"datasets","transformed",DATASET,"QuesAbs", "reasoning_required" if REASONING else "reasoning_free", PRETRAIN_MODEL)
    if(SHIFT_LEVEL == None):
        temp_path = os.path.join(temp_path,"token_length_"+str(TOKEN_SIZE))
    else:
        temp_path = os.path.join(temp_path,"token_length_"+str(TOKEN_SIZE)+"_shift_"+str(SHIFT_LEVEL))

    temppath_train = os.path.join(temp_path,"train.csv")
    temppath_test = os.path.join(temp_path,"test.csv")
    
    print_log("Train file exist:",os.path.isfile(temppath_train),"(",temppath_train,")")
    print_log("Test file exist:",os.path.isfile(temppath_test),"(",temppath_test,")")
    
    if((os.path.isfile(temppath_train) and os.path.isfile(temppath_test)) and not FORCE):
        df_train = pd.read_csv(temppath_train).iloc[:, 1:]
        df_test = pd.read_csv(temppath_test).iloc[:, 1:]
        bert_train = my_BERT(df_train, is_transform=True)
        bert_test = my_BERT(df_test, is_transform=True)
        print_log("Load Data From Existing",log_type="Success")
        return bert_train, bert_test

    print_log("Transform Data","FORCE" if FORCE else "")
    ### COPY FROM TRANSFOR DATASET FOR PUBMEDQA
    pass

In [None]:
# Pubmedqa
def run_pubmedqa_dsgenerator_allfold(DATAPATH, DATASET, LABELS=['label'], PRETRAIN_MODEL = 'bert-base-uncased', TOKEN_SIZE = 128, SHIFT_LEVEL = None , LEARNING_RATE = 1e-5, BATCH_SIZE = 32, N_ITER=3000, REASONING=False):
    print_log("Run Generator","Function")
    # UNIQUE IDENTIFIER USING
    sttime = datetime.now().strftime('%Y%m%d_%H-%M-%S')
    
    

    ####
    print_log("PRETRAIN_MODEL: ",PRETRAIN_MODEL)  
    print_log("REASONING: ",REASONING )
    bert_train, bert_test = pubmedqa_transform_dataset(DATAPATH = DATAPATH,
                                                          DATASET = DATASET,
                                                          PRETRAIN_MODEL = PRETRAIN_MODEL,
                                                          SHIFT_LEVEL = SHIFT_LEVEL,
                                                          TOKEN_SIZE = TOKEN_SIZE,
                                                       REASONING = REASONING)
    
    bert_train.class2idx_pubmedqa_label()
    bert_test.class2idx_pubmedqa_label()

    folds_idx = get_pubmedqa_fold_id(DATAPATH, DATASET)

    list_acc = []
    temp_df = bert_test.df_BERT.copy()

    for i in range(10):
        print_log("######## FOLD: ",i)
        # TRAINING
        model = pubmedqa_train_linear_model(bert_train, bert_test, folds_idx, i,
                                  LABELS, LEARNING_RATE, BATCH_SIZE, N_ITER)
        
        # TEST
        _,predict_y = model.predict()

        temp_df['prediction'] = pd.Series(predict_y)
        class_report, confusion_matrix_df = my_evaluator.eval_pubmedqa(temp_df)
        temp_df.rename(columns={'prediction': 'reason_'+str(REASONING)+'_fold'+str(i)},inplace=True)
        list_acc.append(class_report['accuracy'])

        # SAVING SECTION
        temp_path_model = os.path.join(DATAPATH,"models",DATASET,PRETRAIN_MODEL,'reason_'+str(REASONING),sttime)
        Path(temp_path_model).mkdir(parents=True, exist_ok=True)

        # SAVE MODEL
        torch.save(model.best_model.state_dict(), os.path.join(temp_path_model,"model_"+"fold_"+str(i)+".pt"))

    #### 
    print_log("PRETRAIN_MODEL: ",PRETRAIN_MODEL) 
    print_log("REASONING: ",not REASONING )
    bert_train, bert_test = pubmedqa_transform_dataset(DATAPATH = DATAPATH,
                                                          DATASET = DATASET,
                                                          PRETRAIN_MODEL = PRETRAIN_MODEL,
                                                          SHIFT_LEVEL = SHIFT_LEVEL,
                                                          TOKEN_SIZE = TOKEN_SIZE,
                                                       REASONING = not REASONING)
    
    # bert_train.class2idx_pubmedqa_label()
    # bert_test.class2idx_pubmedqa_label()

    # folds_idx = get_pubmedqa_fold_id(DATAPATH, DATASET)
    # for i in range(10):
    #     # TRAINING
    #     print(i)
    #     model = pubmedqa_train_linear_model(bert_train, bert_test, folds_idx, i,
    #                               LABELS, LEARNING_RATE, BATCH_SIZE, N_ITER)
    #     # TEST
    #     _,predict_y = model.predict()

    #     temp_df['prediction'] = pd.Series(predict_y)
    #     class_report, confusion_matrix_df = my_evaluator.eval_pubmedqa(temp_df)
    #     temp_df.rename(columns={'prediction': 'reason_'+str(not REASONING)+'_fold'+str(i)},inplace=True)
    #     list_acc.append(class_report['accuracy'])

    #     # SAVING SECTION
    #     temp_path_model = os.path.join(DATAPATH,"models",DATASET,PRETRAIN_MODEL,'reason_'+str(not REASONING),sttime)
    #     Path(temp_path_model).mkdir(parents=True, exist_ok=True)

    #     # SAVE MODEL
    #     torch.save(model.best_model.state_dict(), os.path.join(temp_path_model,"model_"+"fold_"+str(i)+".pt"))

    # #### MERGE
    # print_log("REASONING MERGE")
    # temp_df['prediction'] = temp_df.apply(lambda x : x['reason_True'] if x['reason_True']==x['reason_False'] else "maybe", axis=1)
    # class_report, confusion_matrix_df = my_evaluator.eval_pubmedqa(temp_df)

    # result = {}
    # result['dataset'] = DATASET
    # result['labels'] = LABELS
    
    # result['pretrain_model'] = PRETRAIN_MODEL + '-tks' + str(TOKEN_SIZE)
    # result['downstream_model'] = model.get_model_name()
    # result['downstream_model_savepoint'] = "model_"+sttime+".pt"
    
    # result['summary'] = class_report
    # # result['recall'] = r
    # # result['precision'] = p
    # # result['f1score'] = f1

    # result['best_iter'] = model.best_iter
    
    # result['LEARNING_RATE'] = LEARNING_RATE
    # result['BATCH_SIZE'] = BATCH_SIZE

    # # SAVING SECTION
    # temp_path_model = os.path.join(DATAPATH,"models",DATASET,"reasoning_required" if REASONING else "reasoning_free",PRETRAIN_MODEL)
    # temp_path_result = os.path.join(DATAPATH,"results",DATASET,"reasoning_required" if REASONING else "reasoning_free",PRETRAIN_MODEL)

    # Path(temp_path_model).mkdir(parents=True, exist_ok=True)
    # Path(temp_path_result).mkdir(parents=True, exist_ok=True)

    # # SAVE MODEL
    # torch.save(model.best_model.state_dict(), os.path.join(temp_path_model,"model_"+sttime+".pt"))

    # res_df = pd.DataFrame(result.items(), columns=['key', 'result']).set_index('key')
    # res_df.to_json(os.path.join(temp_path_result,"result_"+sttime+".json"))

    arr = np.array(list_acc)
    arr2 = arr.reshape((1,10))
    print_log("Results (REASONING_REQUIRED, REASONING_FREE):",np.average(arr2, axis=1),log_type="SUCCESS")

    return list_acc

TARGET_LABEL = ['label']
TARGET_DATASET = "pubmedqa"
TARGET_PATH = '/content/drive/MyDrive/MinorThesis/'
PRETRAIN_MODEL = 'biobert-base-cased'
N_ITER=300

BATCH_SIZE = 32
LEARNING_RATE = 5e-5
SHIFT_LEVEL=None

# for bs in [16, 32]:
#     for lr in [1e-5, 3e-3, 5e-5]:


# for bert_model in ['biobert-base-cased']:
#     for bs in [32]:
#         for lr in [3e-3]:
#             for i in range(45):
#                 res = run_pubmedqa_dsgenerator_allfold(TARGET_PATH,TARGET_DATASET,TARGET_LABEL,
#                                         TOKEN_SIZE=512, PRETRAIN_MODEL=bert_model, 
#                                         LEARNING_RATE = lr, BATCH_SIZE = bs,
#                                         N_ITER = N_ITER, SHIFT_LEVEL=SHIFT_LEVEL,
#                                         REASONING = True)
# res


# res = run_pubmedqa_dsgenerator_allfold(TARGET_PATH,TARGET_DATASET,TARGET_LABEL,
#                         TOKEN_SIZE=512, PRETRAIN_MODEL=PRETRAIN_MODEL, 
#                         LEARNING_RATE = LEARNING_RATE, BATCH_SIZE = BATCH_SIZE,
#                         N_ITER = N_ITER, SHIFT_LEVEL=SHIFT_LEVEL,
#                         REASONING = True)
# res

In [1]:
def run_pubmedqa_ensemble(DATAPATH, DATASET, MODEL_GROUP, LABELS=['label'], TOKEN_SIZE=128, SHIFT_LEVEL = None):
    print_log("Run Ensemble","Function")

    temp_path = os.path.join(DATAPATH,"models",DATASET,"ensemble",MODEL_GROUP)
    LIST_PRETRAIN_MODEL = os.listdir(temp_path)
    print_log("Ensemble pretrain model list:",LIST_PRETRAIN_MODEL)

    list_acc = []
    list_results = []
    list_results_f1 = []

    all_pred_df = pd.DataFrame()

    for pretrain_model in LIST_PRETRAIN_MODEL:
        print_log("model:",pretrain_model)

        print_log("PRETRAIN_MODEL: ",pretrain_model)
        REASONING = True  
        print_log("REASONING: ",REASONING)
        bert_train, bert_test = pubmedqa_transform_dataset(DATAPATH = DATAPATH,
                                                           DATASET = DATASET,
                                                           PRETRAIN_MODEL = pretrain_model,
                                                           SHIFT_LEVEL = SHIFT_LEVEL,
                                                           TOKEN_SIZE = TOKEN_SIZE,
                                                           REASONING = REASONING)
        
        bert_train.class2idx_pubmedqa_label()
        bert_test.class2idx_pubmedqa_label()

        folds_idx = get_pubmedqa_fold_id(DATAPATH, DATASET)

        model = my_downstream(bert_train.get_features(),bert_train.get_labels(LABELS), # Not going to be used
                              bert_test.get_features(),bert_test.get_labels(LABELS), # Not going to be used
                              bert_train.get_features(),bert_train.get_labels(LABELS))

        temp_path2 = os.path.join(temp_path,pretrain_model,"reason_True")
        LIST_MODEL = os.listdir(temp_path2)
        
        print()
        print()
        print_log("Ensemble pretrain LIST_MODEL:",LIST_MODEL)
        for group_model in LIST_MODEL:
            temp_df = bert_test.df_BERT.copy()
            temp_path3 = os.path.join(temp_path2,group_model)
            LIST_FOLDS = os.listdir(temp_path3)

            print()
            print()
            print_log("Ensemble pretrain LIST_FOLDS:",str(len(LIST_FOLDS)),LIST_FOLDS)
            for downsteam_model in LIST_FOLDS:
                print_log("FULL PATH:",temp_path3,downsteam_model)
                model.load_multiclass_nn(D_in = 768, 
                                          n_classes = 3, 
                                          model_path = os.path.join(temp_path3,downsteam_model))

                # print(model)
                # TEST
                _,predict_y = model.predict()
                list_results.append(predict_y)

                all_pred_df[group_model+"_"+downsteam_model] = pd.Series(predict_y)
                temp_df['prediction'] = pd.Series(predict_y)
                class_report, confusion_matrix_df = my_evaluator.eval_pubmedqa(temp_df)
                temp_df.rename(columns={'prediction': group_model+"_"+downsteam_model},inplace=True)
                list_acc.append(class_report['accuracy'])



    majority_df = all_pred_df.apply(pd.Series.value_counts, axis=1).fillna(0)
    # majority_df
    maxValuesObj = majority_df.idxmax(axis=1)
    maxValuesObj
    print()
    print()

    print_log("Final Results of",len(list_results),"models",log_type='Success')
    # ensemble_res = sum(list_results)
    # ensemble_res = ensemble_res/np.amax(ensemble_res) # Majority Vote
    # # ensemble_res[ensemble_res>=1] = 1 # Only 1 is positive

    temp_df = bert_test.df_BERT.copy()
    temp_df['prediction'] = maxValuesObj
    class_report, confusion_matrix_df = my_evaluator.eval_pubmedqa(temp_df)
    print(confusion_matrix_df)
    print(class_report)

    return list_acc,all_pred_df

#### Execution

In [None]:
TARGET_LABEL = ['label']
TARGET_DATASET = "pubmedqa" # "dat_hoc","dat_semi"
TARGET_PATH = '/content/drive/MyDrive/MinorThesis/'

MODEL_GROUP = "1_pubmedbert"

list_acc,temp_df = run_pubmedqa_ensemble(TARGET_PATH,TARGET_DATASET,
                             MODEL_GROUP = MODEL_GROUP,
                             LABELS = TARGET_LABEL,
                             TOKEN_SIZE=512)

In [None]:
temp_df

Unnamed: 0,20211008_13-01-09_model_fold_0.pt,20211008_13-01-09_model_fold_1.pt,20211008_13-01-09_model_fold_2.pt,20211008_13-01-09_model_fold_3.pt,20211008_13-01-09_model_fold_4.pt,20211008_13-01-09_model_fold_5.pt,20211008_13-01-09_model_fold_6.pt,20211008_13-01-09_model_fold_7.pt,20211008_13-01-09_model_fold_8.pt,20211008_13-01-09_model_fold_9.pt,20211008_13-21-15_model_fold_0.pt,20211008_13-21-15_model_fold_1.pt,20211008_13-21-15_model_fold_2.pt,20211008_13-21-15_model_fold_3.pt,20211008_13-21-15_model_fold_4.pt,20211008_13-21-15_model_fold_5.pt,20211008_13-21-15_model_fold_6.pt,20211008_13-21-15_model_fold_7.pt,20211008_13-21-15_model_fold_8.pt,20211008_13-21-15_model_fold_9.pt,20211008_12-33-17_model_fold_0.pt,20211008_12-33-17_model_fold_1.pt,20211008_12-33-17_model_fold_2.pt,20211008_12-33-17_model_fold_3.pt,20211008_12-33-17_model_fold_4.pt,20211008_12-33-17_model_fold_5.pt,20211008_12-33-17_model_fold_6.pt,20211008_12-33-17_model_fold_7.pt,20211008_12-33-17_model_fold_8.pt,20211008_12-33-17_model_fold_9.pt,20211008_12-10-17_model_fold_0.pt,20211008_12-10-17_model_fold_1.pt,20211008_12-10-17_model_fold_2.pt,20211008_12-10-17_model_fold_3.pt,20211008_12-10-17_model_fold_4.pt,20211008_12-10-17_model_fold_5.pt,20211008_12-10-17_model_fold_6.pt,20211008_12-10-17_model_fold_7.pt,20211008_12-10-17_model_fold_8.pt,20211008_12-10-17_model_fold_9.pt,...,20211008_13-25-16_model_fold_0.pt,20211008_13-25-16_model_fold_1.pt,20211008_13-25-16_model_fold_2.pt,20211008_13-25-16_model_fold_3.pt,20211008_13-25-16_model_fold_4.pt,20211008_13-25-16_model_fold_5.pt,20211008_13-25-16_model_fold_6.pt,20211008_13-25-16_model_fold_7.pt,20211008_13-25-16_model_fold_8.pt,20211008_13-25-16_model_fold_9.pt,20211008_12-17-58_model_fold_0.pt,20211008_12-17-58_model_fold_1.pt,20211008_12-17-58_model_fold_2.pt,20211008_12-17-58_model_fold_3.pt,20211008_12-17-58_model_fold_4.pt,20211008_12-17-58_model_fold_5.pt,20211008_12-17-58_model_fold_6.pt,20211008_12-17-58_model_fold_7.pt,20211008_12-17-58_model_fold_8.pt,20211008_12-17-58_model_fold_9.pt,20211008_12-57-07_model_fold_0.pt,20211008_12-57-07_model_fold_1.pt,20211008_12-57-07_model_fold_2.pt,20211008_12-57-07_model_fold_3.pt,20211008_12-57-07_model_fold_4.pt,20211008_12-57-07_model_fold_5.pt,20211008_12-57-07_model_fold_6.pt,20211008_12-57-07_model_fold_7.pt,20211008_12-57-07_model_fold_8.pt,20211008_12-57-07_model_fold_9.pt,20211008_13-33-19_model_fold_0.pt,20211008_13-33-19_model_fold_1.pt,20211008_13-33-19_model_fold_2.pt,20211008_13-33-19_model_fold_3.pt,20211008_13-33-19_model_fold_4.pt,20211008_13-33-19_model_fold_5.pt,20211008_13-33-19_model_fold_6.pt,20211008_13-33-19_model_fold_7.pt,20211008_13-33-19_model_fold_8.pt,20211008_13-33-19_model_fold_9.pt
0,2,1,0,0,0,1,2,0,2,2,2,1,0,0,1,2,1,1,0,1,2,0,1,0,0,2,0,2,0,2,2,0,2,0,0,2,2,2,0,2,...,2,0,0,0,0,2,1,1,1,1,2,0,2,0,0,2,0,2,0,1,2,0,0,0,2,1,2,2,1,1,2,0,2,0,0,2,0,1,2,2
1,2,2,2,2,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
2,0,0,1,0,0,1,2,0,0,1,1,1,0,0,0,2,1,1,0,1,0,0,0,2,0,1,1,0,0,0,0,2,0,0,1,0,2,1,0,0,...,0,0,0,0,0,1,2,0,1,1,0,0,0,0,0,1,1,1,0,1,0,0,0,1,0,1,1,0,0,0,0,2,0,1,0,1,0,1,0,0
3,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
4,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
496,2,2,2,2,2,2,2,0,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,0,2,2,2,2,2,2,2,2,2,2,2,2,2
497,1,0,0,2,0,2,0,2,2,0,2,0,0,0,0,0,1,1,2,1,0,2,0,0,0,1,1,1,2,2,1,0,1,0,0,0,0,2,0,1,...,1,0,0,0,0,0,1,2,0,1,2,0,0,0,0,0,0,2,1,2,2,0,0,0,0,0,0,2,0,2,2,0,0,0,0,0,0,1,2,1
498,2,2,0,0,1,1,1,0,0,0,2,2,0,0,0,2,1,1,2,0,2,2,2,0,0,1,1,0,0,0,2,2,2,0,0,2,2,0,0,2,...,2,2,0,0,0,2,0,2,2,0,2,2,0,0,0,2,0,0,0,2,0,2,0,0,0,0,0,0,0,0,0,1,0,0,0,2,1,2,0,1


### Fine-tuned BioASQ

#### Assisting function

In [None]:
# HoC
def run_bioasq_dsgenerator(DATAPATH, DATASET, LABELS=['label'], PRETRAIN_MODEL = 'bert-base-uncased', TOKEN_SIZE = 128, SHIFT_LEVEL = None , LEARNING_RATE = 1e-5, BATCH_SIZE = 32, N_ITER=3000):
    print_log("Run Generator","Function")
    
    # UNIQUE IDENTIFIER USING
    sttime = datetime.now().strftime('%Y%m%d_%H-%M-%S')

    bert_train, bert_test, bert_valid = transform_dataset(DATAPATH = DATAPATH,
                                                          DATASET = DATASET,
                                                          PRETRAIN_MODEL = PRETRAIN_MODEL,
                                                          SHIFT_LEVEL = SHIFT_LEVEL,
                                                          TOKEN_SIZE = TOKEN_SIZE)
    bert_train.class2idx_bioasq_label()
    bert_valid.class2idx_bioasq_label()
    bert_test.class2idx_bioasq_label()


    # # TRAINING
    model, train_loss, valid_loss = train_linear_model(bert_train, bert_test, bert_valid,
                               LABELS, LEARNING_RATE, BATCH_SIZE, N_ITER)
  
    _,predict_y = model.predict()

    temp_df = bert_test.df_BERT.copy()
    temp_df['prediction'] = pd.Series(predict_y)
    class_report, confusion_matrix_df = my_evaluator.eval_bioasq(temp_df)

    # result = {}
    # result['dataset'] = DATASET
    # result['labels'] = LABELS
    
    # result['pretrain_model'] = PRETRAIN_MODEL + '-tks' + str(TOKEN_SIZE)
    # result['downstream_model'] = model.get_model_name()
    # result['downstream_model_savepoint'] = "model_"+sttime+".pt"
    
    # result['recall'] = r
    # result['precision'] = p
    # result['f1score'] = f1

    # result['best_iter'] = model.best_iter
    
    # result['SHIFT_LEVEL'] = SHIFT_LEVEL
    # result['LEARNING_RATE'] = LEARNING_RATE
    # result['BATCH_SIZE'] = BATCH_SIZE
    
    # # result['hyper_param'] = hp 
    # # result['predict_y'] = predict_y

    # # SAVING SECTION
    # temp_path_model = os.path.join(DATAPATH,"models",DATASET,PRETRAIN_MODEL)
    # temp_path_result = os.path.join(DATAPATH,"results",DATASET,PRETRAIN_MODEL)

    # Path(temp_path_model).mkdir(parents=True, exist_ok=True)
    # Path(temp_path_result).mkdir(parents=True, exist_ok=True)

    # # SAVE MODEL
    # torch.save(model.best_model.state_dict(), os.path.join(temp_path_model,"model_"+sttime+".pt"))

    # res_df = pd.DataFrame(result.items(), columns=['key', 'result']).set_index('key')
    # res_df.to_json(os.path.join(temp_path_result,"result_"+sttime+".json"))

    # # SAVE PLOT
    # plt.plot(train_loss, label="train")
    # plt.plot(valid_loss, label="valid")
    # plt.title(PRETRAIN_MODEL+"-"+DATASET+"-"+str(TOKEN_SIZE))
    # plt.suptitle(str(BATCH_SIZE)+"-"+str(LEARNING_RATE))
    # plt.ylabel('loss')
    # plt.legend()
    # plt.savefig(os.path.join(temp_path_result,"result_"+sttime+".png"))

    # print_log("",log_type="----------")
    # # return predict_y
    # return result
    

#### Execution

In [None]:
TARGET_LABEL = 'label'
TARGET_DATASET = "BioASQ" # "dat_hoc","dat_semi"
TARGET_PATH = '/content/drive/MyDrive/MinorThesis/'
PRETRAIN_MODEL = 'biobert-base-cased'
N_ITER=1000

BATCH_SIZE = 32
LEARNING_RATE = 5e-5
SHIFT_LEVEL=None


res = run_bioasq_dsgenerator(TARGET_PATH,TARGET_DATASET,TARGET_LABEL,
                        TOKEN_SIZE=512, PRETRAIN_MODEL=PRETRAIN_MODEL, 
                        LEARNING_RATE = LEARNING_RATE, BATCH_SIZE = BATCH_SIZE,
                        N_ITER = N_ITER, SHIFT_LEVEL=SHIFT_LEVEL)
res

[Info] Run Generator Function
[Info] Try loading data from cache
[Info] Train file exist: False ( /content/drive/MyDrive/MinorThesis/datasets/transformed/BioASQ/biobert-base-cased/token_length_512/train.csv )
[Info] Valid file exist: False ( /content/drive/MyDrive/MinorThesis/datasets/transformed/BioASQ/biobert-base-cased/token_length_512/valid.csv )
[Info] Test file exist: False ( /content/drive/MyDrive/MinorThesis/datasets/transformed/BioASQ/biobert-base-cased/token_length_512/test.csv )
[Info] Transform Data 


ValueError: ignored

In [None]:
res.df_BERT

## Model Results Read 

In [None]:
import os
import pandas as pd
pd.set_option('display.max_colwidth', 0)

In [None]:
TARGET_PATH = '/content/drive/MyDrive/MinorThesis/'

In [None]:
# temp_list = []
# datasets = os.listdir(TARGET_PATH+"results")
# print(datasets)
# for ds in datasets:
#     files = os.listdir(TARGET_PATH+"results/"+ds)
#     # print(arr2)
#     for f in files:
#         print(TARGET_PATH+"results/"+ds+"/"+f)
#         try:
#             a = pd.read_json(TARGET_PATH+"results/"+ds+"/"+f)
#             a = a.transpose()
#             a.index = [f]
#             temp_list.append(a)
#         except:
#             print("error")
#             pass
# temp_res = pd.concat(temp_list, sort=False)
# # temp_res.drop(columns='predict_y',inplace=True) # This cause error when display result
# # temp_res.drop_duplicates(keep='last',inplace=True)

In [None]:
temp_res = temp_res[['dataset','pretrain_model','hyper_param','f1score','precision','recall']]

In [None]:

# temp_res.drop(columns='predict_y',inplace=True)
# temp_res.drop(columns='labels',inplace=True)
temp_res.drop_duplicates(keep='last',inplace=True)

In [None]:
# Table 1 : Show comparison between different type of BERT
# ONLY Original BERT IS WORST!!!
temp_res[(temp_res.dataset=='blurb_hoc')].sort_values(['f1score'], ascending = False) 