<a href="https://colab.research.google.com/github/blizrys/BERT-Classification-Tutorial/blob/master/BERT_Evaluator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Configuration initialization

In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive') 

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# # Locate path of data in 'google drive'
# DATA_PATH = '/content/drive/MyDrive/MinorThesis/'
# SEED_NUMBER = 19900506

# BERT Evaluator for Minor Thesis

**Student Name:**  Jirarote Jirasirikul

**Student ID:**    31334679

## Import Library

All Library and File Path will be added here

In [4]:
# # Installation (Uncomment if need to installation or update library)
# !pip install spacy #==2.0.11
# !pip install transformers

In [5]:
!pip install transformers
import transformers as ppb

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/fd/1a/41c644c963249fd7f3836d926afa1e3f1cc234a1c40d80c5f03ad8f6f1b2/transformers-4.8.2-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 7.3MB/s 
Collecting huggingface-hub==0.0.12
  Downloading https://files.pythonhosted.org/packages/2f/ee/97e253668fda9b17e968b3f97b2f8e53aa0127e8807d24a547687423fe0b/huggingface_hub-0.0.12-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 42.3MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     

In [6]:
import pandas as pd
import numpy as np
import glob
import os
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from collections import defaultdict

from datetime import datetime
import json 

In [7]:
# # MY GLOBAL FUNCTION - 

ENABLE_LOGS = 1
def print_log(*arg, log_type="Info"):
    global ENABLE_LOGS
    if(ENABLE_LOGS==1 or log_type!="Info"): 
        print("["+log_type+"]"," ".join(str(x) for x in arg))

## Check Available Device (CPU/GPU)

In [8]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    DEVICE_AVAILABLE = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    DEVICE_AVAILABLE = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4




---

##BERT Text Representation

Transform Language Model

When using BERT, technically we are transforming our sentence into a vector that represent each sentence. The process is call Language Model a representation of each word. 

BERT add [CLS] token infront of each sentence. This token representation vector could later be use for Classification as it contain the sentence representation.

### Class my_BERT

In [9]:
# BERT weight Options 
# - 'distilbert-base-uncased'
# - 'bert-base-uncased'
# - 'dmis-lab/biobert-base-cased-v1.1'
# - 'dmis-lab/biobert-v1.1' : Data Mining and Information Systems Lab, Korea University's picture Updated May 19 • 41k
# - 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'



In [10]:
class my_BERT:
    ###### Load pretrain BERT Language Model transformer (Otherwise use 'set' to customize)
    # # For DistilBERT:
    # # model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

    ## Want BERT instead of distilBERT? Uncomment the following line:
    model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')
    # model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'dmis-lab/biobert-base-cased-v1.1')

    # Load pretrained model/tokenizer
    bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    bert_model = model_class.from_pretrained(pretrained_weights)

    def __init__(self, df_input, ENABLE_LOGS = 1):
        ## INPUT STRUCTURE (COLUMNS): 
        ## - 'text' - Required
        ## - 'label' - Optional default name is 'label' otherwise need to specific when called

        self.df = df_input
        self.df_BERT = None
        self.ENABLE_LOGS = ENABLE_LOGS
    
    def print_log(self, *arg, log_type="Info"):
        if(self.ENABLE_LOGS==1 or log_type!="Info"): 
            print("["+log_type+"]"," ".join(str(x) for x in arg))

    def bert_tokenize(self, token_length=128):
        # return "Hellow"
        df_output = self.df.copy()

        # BERT Tokenizer + truncate to BERT_MAX_LENGTH
        df_output['BERTTokens'] = df_output["text"].apply((lambda x: self.bert_tokenizer.encode(x, add_special_tokens=True,truncation=True,max_length = token_length)))
        # df_output['n_tokens0'] = df_output['BERTTokens'].apply(lambda x: len(x)) # Just for verification
        temp = df_output['BERTTokens'].apply(lambda x: len(x))
        self.print_log("Token - Done","( mean/max no. of token:",round(temp.mean()),temp.max(),")")

        # Padding tokens to BERT_MAX_LENGTH
        df_output['BERTTokens'] = df_output['BERTTokens'].apply(lambda x: x + [0]*(token_length-len(x)))
        # df_output['n_tokens'] = df_output['BERTTokens'].apply(lambda x: len(x)) # Just for verification
        self.print_log("Pad - Done")

        # BERT Mask
        df_output['BERTMasks'] = df_output['BERTTokens'].apply(lambda x: [np.where(i != 0, 1, 0) for i in x])
        # df_output['n_mask1'] = df_output['BERTMask'].apply(lambda x: sum(x)) # Just for verification
        self.print_log("Mask - Done")

        return df_output

    def run_bert_transform(self, dataloader, device_available = torch.device("cpu")):
        all_result = []

        self.bert_model.to(device_available)

        digit = len(str(len(dataloader)))-1 # Report progress

        for step, batch in enumerate(dataloader):
            if(step == 0 or (step+1)%(10**digit) == 0 or step == len(dataloader)-1): self.print_log("Step:",step+1,"/",len(dataloader))

            b_input_ids = batch[0].to(device_available)
            b_input_mask = batch[1].to(device_available)

            with torch.no_grad():
                last_hidden_states = self.bert_model(b_input_ids, attention_mask=b_input_mask)
        
            
            res_features = last_hidden_states[0][:,0,:].cpu().numpy()
            all_result.append(res_features)
        self.print_log("BERT transform - Done")

        return np.vstack(all_result)


    def bert_transform(self, device_available = torch.device("cpu"), batch_size = 32, token_length=128):
        df_output = self.bert_tokenize(token_length)

        # Convert to Tensor
        input_tokens = torch.tensor(np.stack(df_output['BERTTokens'].values))
        input_masks = torch.tensor(np.stack(df_output['BERTMasks'].values))
        # print(input_tokens,input_masks)

        # Create the DataLoader for our training set.
        input_data = TensorDataset(input_tokens, input_masks)
        input_sampler = SequentialSampler(input_data)
        input_dataloader = DataLoader(input_data, sampler=input_sampler, batch_size=batch_size)

        self.print_log("Running BERT Transform on", str(device_available))
        if(str(device_available) == 'cpu'):
            self.print_log("Running BERT on CPU can take longer time...",log_type="WARNING")
        self.print_log("BERT token length:",token_length)
        self.print_log("Data size:",str(len(input_tokens)), "( Total batch", str(len(input_dataloader)),'* size',str(batch_size),")")
        
        output_features = self.run_bert_transform(input_dataloader,device_available)
        df_output['BERT_Features'] = pd.Series(output_features.tolist())
        
        self.print_log("BERT transformed", log_type="Success")
        self.df_BERT = df_output

    def get_features(self):
        if(isinstance(self.df_BERT, pd.DataFrame)):
            return np.array([np.array(xi) for xi in self.df_BERT.BERT_Features.values])
        else:
            print_log("Please run function 'bert_transform' to generate text representation first!",log_type="Error")

    def get_labels(self,col_target = 'label'):
        return np.array(self.df_BERT[col_target].tolist())

    def get_current_bert_model(self):
        return self.bert_model.config._name_or_path

    def load_pretrain_bert(self, model_name='bert-base-uncased'):
        ## Want BERT instead of distilBERT? Uncomment the following line:
        self.model_class, self.tokenizer_class, self.pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, model_name)

        # Load pretrained model/tokenizer
        self.bert_tokenizer = self.tokenizer_class.from_pretrained(self.pretrained_weights)
        self.bert_model = self.model_class.from_pretrained(self.pretrained_weights)



HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [11]:
# bert_HOC_train = my_BERT(pd.DataFrame())
# bert_HOC_train.get_current_bert_model()

In [12]:
# bert_HOC_train.bert_tokenizer

In [13]:
# bert_HOC_train.bert_model

In [14]:
# bert_HOC_train.load_pretrain_bert('dmis-lab/biobert-v1.1')

In [15]:
# bert_HOC_train.bert_tokenizer

In [16]:
# bert_HOC_train.get_current_bert_model()



---


## Downstream Model

### Class: my_downstream

In [18]:
class my_downstream:
    def __init__(self, train_x, train_y, test_x = None, test_y = None, ENABLE_LOGS = 1):
        self.train_x = train_x
        self.train_y = train_y
        self.test_x = test_x
        self.test_y = test_y
        self.predict_x = None
        self.predict_y = None

        self.model = None

        self.ENABLE_LOGS = ENABLE_LOGS

    def print_log(self, *arg, log_type="Info"):
        if(self.ENABLE_LOGS==1 or log_type!="Info"): 
            print("["+log_type+"]"," ".join(str(x) for x in arg))

    def train_logistic(self, n_iter=500, random_state=0):
        self.model = LogisticRegression(random_state=random_state,max_iter=n_iter)
        self.print_log("Train", self.model.__class__.__name__)
        self.print_log("iteration:",n_iter)
        self.print_log("random_state:",random_state)
        self.model.fit(self.train_x, self.train_y)
        self.print_log("Train Logistic Regression", log_type="Success")

    def get_model_name(self):
        if(self.model is None):
            return "None"
        else:
            return self.model.__class__.__name__

    def predict(self, test_x = None):
        # Train Data
        self.predict_x = self.test_x if test_x is None else test_x
        if(self.predict_x is None): return print_log("No test data", log_type="Error")

        # Model
        if(self.model is None): 
            print_log("NULL MODEL", log_type="WARNING")
            self.predict_y = [0]*len(self.predict_x)
        elif(self.get_model_name() == 'LogisticRegression'):
            self.predict_y = self.model.predict(self.predict_x)
        else:
            print_log("something wrong with prediction function", log_type="ERROR")

        return self.predict_x,self.predict_y

    def __str__(self):
        return self.get_model_name()

    def my_eval(self):
        if(self.test_y is None): return print_log("Evaluation requires ground truth", log_type="Error")
        if(self.predict_y is None): return print_log("Evaluation requires prediction", log_type="Error")
        
        tn, fp, fn, tp = confusion_matrix(self.test_y,self.predict_y).ravel()
        print_log('Confusion Matrix (tn, fp, fn, tp):',(tn, fp, fn, tp))
        recall = recall_score(self.test_y,self.predict_y,average='macro')
        precision = precision_score(self.test_y,self.predict_y,average='macro')
        f1score = f1_score(self.test_y,self.predict_y,average='macro')
        accuracy = accuracy_score(self.test_y,self.predict_y)
        # matthews = matthews_corrcoef(y_test,y_predict) 
        print_log('Accuracy: '+ str(accuracy))
        print_log('Macro Precision: '+ str(precision))
        print_log('Macro Recall: '+ str(recall))
        print_log('Macro F1 score:'+ str(f1score))
        # print('MCC:'+ str(matthews))

        return {'model_name':self.get_model_name(),
                # 'target':None,
                # 'algorithm':"BERT_Logistic",
                # 'text_preprocessing':None,
                # 'datasize':None,
                'TP':tp,
                'TN':tn,
                'FP':fp,
                'FN':fn,
                'accuracy':accuracy,
                'precision':precision,
                'recall':recall,
                'f1score':f1score}



---



## Run Evaluator

In [19]:
def run_evaluator(datapath,dataset,label='label',pretrain_model = 'bert-base-uncased',token_size=128, seed_number = 19900506):
    print_log("Run Evaluator","Function")
    print_log("pretrain_model:",pretrain_model)
    print_log("token_size:",token_size)
    print_log("seed_number",seed_number)

    temppath_train = datapath+"datasets/"+dataset+'/'+"train.json"
    temppath_test = datapath+"datasets/"+dataset+'/'+"test.json"

    print_log("Train file exist:",os.path.isfile(temppath_train),"(",temppath_train,")")
    print_log("Test file exist:",os.path.isfile(temppath_test),"(",temppath_test,")")

    df_train = pd.read_json(temppath_train)
    df_test = pd.read_json(temppath_test)
    print_log(df_train.shape)
    print_log(df_test.shape)
    # print_log(df_train.columns)

    print_log("BERTTransform: Train Data")
    bert_train = my_BERT(df_train)
    bert_train.load_pretrain_bert(pretrain_model)
    bert_train.bert_transform(DEVICE_AVAILABLE, token_length=token_size)

    print_log("BERTTransform: Test Data")
    bert_test = my_BERT(df_test)
    bert_test.load_pretrain_bert(pretrain_model)
    bert_test.bert_transform(DEVICE_AVAILABLE, token_length=token_size)

    print_log("Training Downstream Model")
    model = my_downstream(bert_train.get_features(),bert_train.get_labels(label),
                              bert_test.get_features(),bert_test.get_labels(label))
    
    model.train_logistic(n_iter=500,random_state = seed_number)
    _,predict_y = model.predict()

    result = model.my_eval()
    result['predict_y'] = predict_y
    result['hyper_param'] = "berttoken_"+str(token_size)
    result['dataset'] = dataset
    result['label'] = label
    result['pretrain_model'] = bert_train.get_current_bert_model()
    res2 = pd.DataFrame(result.items(), columns=['key', 'result']).set_index('key')
    
    Path(datapath+"results/"+dataset).mkdir(parents=True, exist_ok=True)
    sttime = datetime.now().strftime('%Y%m%d_%H-%M-%S')
    res2.to_json(datapath+"results/"+dataset+"/result_"+sttime+".json")

    print_log("",log_type="----------")
    return result

In [None]:
TARGET_LABEL = 'label_A'
TARGET_DATASET = "blurb_hoc" # "dat_hoc","dat_semi"
TARGET_PATH = '/content/drive/MyDrive/MinorThesis/'
# print(TARGET_PATH+TARGET_DATASET+"train.json")
# print(TARGET_PATH+TARGET_DATASET+"train.json")

# [Info] df_HOC: (20383, 3)
# [Info] df_HOC_train: (14268, 3)
# [Info] df_HOC_valid: (5680, 3)

res = run_evaluator(TARGET_PATH,TARGET_DATASET,TARGET_LABEL,token_size=128)
# res = run_evaluator(TARGET_DATAPATH,TARGET_DATASET,TARGET_LABEL,token_size=128,pretrain_model= 'dmis-lab/biobert-v1.1')
# res = run_evaluator(TARGET_DATAPATH,TARGET_DATASET,TARGET_LABEL,token_size=128,pretrain_model= 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

# res = run_evaluator(TARGET_DATAPATH,TARGET_DATASET,TARGET_LABEL,token_size=256)
# res = run_evaluator(TARGET_DATAPATH,TARGET_DATASET,TARGET_LABEL,token_size=256,pretrain_model= 'dmis-lab/biobert-v1.1')
# res = run_evaluator(TARGET_DATAPATH,TARGET_DATASET,TARGET_LABEL,token_size=256,pretrain_model= 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

# res = run_evaluator(TARGET_DATAPATH,TARGET_DATASET,TARGET_LABEL,token_size=512)
# res = run_evaluator(TARGET_DATAPATH,TARGET_DATASET,TARGET_LABEL,token_size=512,pretrain_model= 'dmis-lab/biobert-v1.1')
# res = run_evaluator(TARGET_DATAPATH,TARGET_DATASET,TARGET_LABEL,token_size=512,pretrain_model= 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')


[Info] Run Evaluator Function
[Info] pretrain_model: bert-base-uncased
[Info] token_size: 128
[Info] seed_number 19900506
[Info] Train file exist: True ( /content/drive/MyDrive/MinorThesis/datasets/blurb_hoc/train.json )
[Info] Test file exist: True ( /content/drive/MyDrive/MinorThesis/datasets/blurb_hoc/test.json )
[Info] (12119, 14)
[Info] (3547, 14)
[Info] BERTTransform: Train Data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[Info] Token - Done ( mean/max no. of token: 40 128 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 128
[Info] Data size: 12119 ( Total batch 379 * size 32 )
[Info] Step: 1 / 379
[Info] Step: 100 / 379
[Info] Step: 200 / 379
[Info] Step: 300 / 379
[Info] Step: 379 / 379
[Info] BERT transform - Done
[Success] BERT transformed
[Info] BERTTransform: Test Data


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[Info] Token - Done ( mean/max no. of token: 40 128 )
[Info] Pad - Done
[Info] Mask - Done
[Info] Running BERT Transform on cuda
[Info] BERT token length: 128
[Info] Data size: 3547 ( Total batch 111 * size 32 )
[Info] Step: 1 / 111
[Info] Step: 100 / 111
[Info] Step: 111 / 111
[Info] BERT transform - Done
[Success] BERT transformed
[Info] Training Downstream Model
[Info] Train LogisticRegression
[Info] iteration: 500
[Info] random_state: 19900506
[Success] Train Logistic Regression
[Info] Confusion Matrix (tn, fp, fn, tp): (3473, 14, 56, 4)
[Info] Accuracy: 0.9802650126867776
[Info] Macro Precision: 0.6031768521142281
[Info] Macro Recall: 0.531325877067202
[Info] Macro F1 score:0.5462934537906031
[----------] 


## Read Results

In [None]:
import os
import pandas as pd

In [None]:
TARGET_PATH = '/content/drive/MyDrive/MinorThesis/'

In [None]:
temp_list = []
datasets = os.listdir(TARGET_PATH+"results")
print(datasets)
for ds in datasets:
    files = os.listdir(TARGET_PATH+"results/"+ds)
    # print(arr2)
    for f in files:
        print(TARGET_PATH+"results/"+ds+"/"+f)
        try:
            a = pd.read_json(TARGET_PATH+"results/"+ds+"/"+f)
            a = a.transpose()
            a.index = [f]
            temp_list.append(a)
        except:
            print("error")
            pass
temp_res = pd.concat(temp_list, sort=False)
temp_res.drop(columns='predict_y',inplace=True) # This cause error when display result
temp_res.drop_duplicates(keep='last',inplace=True)

['dat_hoc_label_A', 'blurb_hoc']
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210629_10:11:46.json
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210629_10:14:40.json
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210629_10:17:38.json
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210629_10:23:02.json
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210629_10:28:21.json
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210629_10:33:44.json
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210629_10:46:22.json
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210629_10:58:55.json
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210629_11:11:30.json
/content/drive/MyDrive/MinorThesis/results/dat_hoc_label_A/result_20210701_11:12:20.json
/content/drive/MyDrive/MinorThesis/results/blurb_hoc/result_20210703_03-49-58

In [None]:
# Table 1 : Show comparison between different type of BERT
# ONLY Original BERT IS WORST!!!
temp_res[(temp_res.dataset=='blurb_hoc') & (temp_res.hyper_param == 'berttoken_512')].sort_values(['label','f1score']) 

Unnamed: 0,FN,FP,TN,TP,accuracy,dataset,f1score,hyper_param,label,model_name,precision,pretrain_model,recall
result_20210703_16:24:21.json,56,15,3472,4,0.979983,blurb_hoc,0.545572,berttoken_512,label_A,LogisticRegression,0.597327,bert-base-uncased,0.531182
result_20210703_16:31:25.json,35,10,3477,25,0.987313,blurb_hoc,0.759943,berttoken_512,label_A,LogisticRegression,0.85216,dmis-lab/biobert-v1.1,0.706899
result_20210703_16:38:45.json,33,11,3476,27,0.987595,blurb_hoc,0.772366,berttoken_512,label_A,LogisticRegression,0.850561,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,0.723423
result_20210703_16:46:08.json,104,35,3354,54,0.960812,blurb_hoc,0.708473,berttoken_512,label_CD,LogisticRegression,0.788333,bert-base-uncased,0.665722
result_20210703_17:00:42.json,79,37,3352,79,0.967296,blurb_hoc,0.779817,berttoken_512,label_CD,LogisticRegression,0.829005,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,0.744541
result_20210703_16:53:21.json,74,42,3347,84,0.967296,blurb_hoc,0.787258,berttoken_512,label_CD,LogisticRegression,0.822518,dmis-lab/biobert-v1.1,0.759626
result_20210703_14:54:04.json,28,4,3510,5,0.990978,blurb_hoc,0.616779,berttoken_512,label_CE,LogisticRegression,0.773821,bert-base-uncased,0.575188
result_20210703_15:02:49.json,23,3,3511,10,0.99267,blurb_hoc,0.715547,berttoken_512,label_CE,LogisticRegression,0.881361,dmis-lab/biobert-v1.1,0.651088
result_20210703_15:10:48.json,19,3,3511,14,0.993798,blurb_hoc,0.778438,berttoken_512,label_CE,LogisticRegression,0.909073,microsoft/BiomedNLP-PubMedBERT-base-uncased-ab...,0.711694
result_20210703_16:02:25.json,99,50,3355,43,0.957993,blurb_hoc,0.672117,berttoken_512,label_GI,LogisticRegression,0.716852,bert-base-uncased,0.644066
