<a href="https://colab.research.google.com/github/blizrys/MinorThesisBERT/blob/main/BERT_Evaluator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Configuration initialization

In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive') 

Mounted at /content/drive


In [2]:
# Locate path of data in 'google drive'
DATA_PATH = '/content/drive/MyDrive/MinorThesis/dataset/'
SEED_NUMBER = 19900506

# BERT Evaluator for Minor Thesis

**Student Name:**  Jirarote Jirasirikul

**Student ID:**    31334679

## Import Library

All Library and File Path will be added here

In [3]:
# # Installation (Uncomment if need to installation or update library)
# !pip install spacy #==2.0.11
# !pip install transformers

In [4]:
!pip install transformers
import transformers as ppb

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 8.0MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 48.0MB/s 
[?25hCollecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25e86/huggingface_hub-0.0.8-py3-none-any.whl
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |█

In [5]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, matthews_corrcoef

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [6]:
# # MY GLOBAL FUNCTION - 

# ENABLE_LOGS = 1
# def print_log(*arg,log_type='Info'):
#     global ENABLE_LOGS
#     if(ENABLE_LOGS): 
#         print("["+log_type+"]"," ".join(arg))

# print_log("This","is")

## Check Available Device (CPU/GPU)

In [7]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    DEVICE_AVAILABLE = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    DEVICE_AVAILABLE = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


## Data Exploration

### Dataset : Hallmarks-of-Cancer
The Hallmarks of Cancer Corpus for text classification

The Hallmarks of Cancer (HOC) Corpus consists of 1852 PubMed publication abstracts manually annotated by experts according to a taxonomy. The taxonomy consists of 37 classes in a hierarchy. Zero or more class labels are assigned to each sentence in the corpus. The labels are found under the "labels" directory, while the tokenized text can be found under "text" directory. The filenames are the corresponding PubMed IDs (PMID).

https://github.com/sb895/Hallmarks-of-Cancer



In [8]:
# Clone dataset
!git clone https://github.com/sb895/Hallmarks-of-Cancer.git

Cloning into 'Hallmarks-of-Cancer'...
remote: Enumerating objects: 3433, done.[K
remote: Total 3433 (delta 0), reused 0 (delta 0), pack-reused 3433[K
Receiving objects: 100% (3433/3433), 1.82 MiB | 18.30 MiB/s, done.
Resolving deltas: 100% (256/256), done.


In [9]:
import glob
import os
import pandas as pd
from collections import defaultdict

df = pd.DataFrame()

file_list = os.listdir('/content/Hallmarks-of-Cancer/text/')

file_list
for filename in file_list:
    temp_dict = {}
    temp_dict['filename'] = filename
    with open('/content/Hallmarks-of-Cancer/text/'+filename) as f_input:
        temp_dict['text'] = f_input.read()
    with open('/content/Hallmarks-of-Cancer/labels/'+filename) as f_input:
        temp_dict['labels'] = f_input.read()
    # print(temp_dict)
    df = df.append(temp_dict, ignore_index=True)

df

Unnamed: 0,filename,labels,text
0,12572356.txt,< < < < Evading growth suppressors--By deregu...,The effects of tea polyphenols and tea pigment...
1,22575613.txt,< < < < < < < < < < <,Whole genome duplications ( WGDs ) are conside...
2,20398195.txt,< < < < < < <Resisting cell death--Necrosis< <...,Thyroid-like low-grade nasopharyngeal papillar...
3,22481237.txt,< < < <Sustaining proliferative signaling--Rec...,OBJECTIVE Expression of N-myc downstream-regul...
4,23272224.txt,< < < < <Enabling replicative immortality--Sen...,Cellular senescence is considered as a tumor s...
...,...,...,...
1847,22913657.txt,< < < < <Sustaining proliferative signaling--C...,"Hesperetin , a flavonoid from citrus fruits , ..."
1848,22919057.txt,< < < < < < < < < < < <,Mutations of the thyroid hormone receptor-β ge...
1849,23180767.txt,< < < < < Genomic instability and mutation--D...,The environmental arylamine mutagens are impli...
1850,23177934.txt,< < < < < <Tumor promoting inflammation--Infla...,Metabolic reprogramming of cancer cells provid...


In [10]:
df_a = df.assign(labels_line=df['labels'].str.split('<')).explode('labels_line')

In [11]:
df_b = df.assign(text_line=df['text'].str.split('\n')).explode('text_line')

In [12]:
new_df = pd.concat([df_a, df_b], axis=1)

In [13]:
new_df.head(3)

Unnamed: 0,filename,labels,text,labels_line,filename.1,labels.1,text.1,text_line
0,12572356.txt,< < < < Evading growth suppressors--By deregu...,The effects of tea polyphenols and tea pigment...,,12572356.txt,< < < < Evading growth suppressors--By deregu...,The effects of tea polyphenols and tea pigment...,The effects of tea polyphenols and tea pigment...
0,12572356.txt,< < < < Evading growth suppressors--By deregu...,The effects of tea polyphenols and tea pigment...,,12572356.txt,< < < < Evading growth suppressors--By deregu...,The effects of tea polyphenols and tea pigment...,HepG2 cells were incubated with 50 and 100 mg/...
0,12572356.txt,< < < < Evading growth suppressors--By deregu...,The effects of tea polyphenols and tea pigment...,,12572356.txt,< < < < Evading growth suppressors--By deregu...,The effects of tea polyphenols and tea pigment...,"Flow cytometry , Western blot and RT-PCR analy..."


In [14]:
new_df.labels_line = new_df.labels_line.str.strip()

In [15]:
new_df.labels_line.value_counts()

                                                                                                                                                                                                               14862
Resisting cell death--Apoptosis                                                                                                                                                                                  339
Activating invasion and metastasis--Metastasis                                                                                                                                                                   238
Sustaining proliferative signaling--Receptors                                                                                                                                                                    228
Activating invasion and metastasis--Invasion                                                                                                        

In [16]:
new_df = new_df.reset_index(drop=True).iloc[:, [0,3,7]]

In [17]:
new_df

Unnamed: 0,filename,labels_line,text_line
0,12572356.txt,,The effects of tea polyphenols and tea pigment...
1,12572356.txt,,HepG2 cells were incubated with 50 and 100 mg/...
2,12572356.txt,,"Flow cytometry , Western blot and RT-PCR analy..."
3,12572356.txt,,Flow cytometry analysis showed that tea polyph...
4,12572356.txt,Evading growth suppressors--By deregulating ce...,Western blot analysis showed tea polyphenols a...
...,...,...,...
19311,23013101.txt,,We found that Bortezomib inhibited the cellula...
19312,23013101.txt,,Bortezomib also prevented the migration of HUV...
19313,23013101.txt,,"In addition , bortezomib dose-dependently inhi..."
19314,23013101.txt,Inducing angiogenesis--By deregulating angioge...,"In conclusion , bortezomib prevented the angio..."


In [18]:
# USE THIS FINAL :) 
df_c = new_df.assign(labels_line_AND=new_df['labels_line'].str.split('AND')).explode('labels_line_AND')
df_c = df_c[['filename','text_line','labels_line_AND']]

In [19]:
df_c[df_c.filename == '22968518.txt']

Unnamed: 0,filename,text_line,labels_line_AND
2027,22968518.txt,OBJECTIVE To investigate the relationship betw...,
2028,22968518.txt,METHODS Eighty cases of invasive cervical squa...,
2029,22968518.txt,Double immunohistochemical staining with antib...,
2030,22968518.txt,The peritumoral lymphatic vessel density and i...,
2031,22968518.txt,The lymphatic vessels proliferation index was ...,
2032,22968518.txt,Then the correlation between lymphangiogenesis...,
2033,22968518.txt,RESULTS The LVD of cervical cancer ( 15.23 � 3...,
2034,22968518.txt,The peritumoral lymphatic vessel density of ce...,
2035,22968518.txt,Lymphatic PI ( LPI ) of cervical cancer ( 0.25...,Inducing angiogenesis--By deregulating angioge...
2036,22968518.txt,The peritumoral lymphatic vessel PI of cervica...,


In [20]:
df_c.labels_line_AND = df_c.labels_line_AND.str.strip()

In [21]:
df_c.labels_line_AND.value_counts()

                                                                                                     14862
Resisting cell death--Apoptosis                                                                        612
Sustaining proliferative signaling--Receptors                                                          345
Sustaining proliferative signaling--Cell cycle                                                         320
Activating invasion and metastasis--Metastasis                                                         317
Activating invasion and metastasis--Invasion                                                           282
Tumor promoting inflammation--Inflammation--Oxidative stress                                           241
Evading growth suppressors--By deregulating cell cycle checkpoints--Cell cycle                         238
Genomic instability and mutation--Mutation                                                             215
Tumor promoting inflammation--Inflamm

In [22]:
len(df_c.labels_line_AND.value_counts())

38

In [23]:
## Format to classification input
df_HOC = df_c[['text_line','labels_line_AND']].copy()
df_HOC.columns = ['text','label']
df_HOC['label_bool'] = df_HOC['label'].apply(lambda x: 0 if x == '' else 1)
df_HOC.head()

Unnamed: 0,text,label,label_bool
0,The effects of tea polyphenols and tea pigment...,,0
1,HepG2 cells were incubated with 50 and 100 mg/...,,0
2,"Flow cytometry , Western blot and RT-PCR analy...",,0
3,Flow cytometry analysis showed that tea polyph...,,0
4,Western blot analysis showed tea polyphenols a...,Evading growth suppressors--By deregulating ce...,1


In [24]:
df_HOC_train = df_HOC.sample(frac = 0.7,random_state=SEED_NUMBER)
df_HOC_test = df_HOC.drop(df_HOC_train.index)

df_HOC_train = df_HOC_train.reset_index(drop=True)
df_HOC_test = df_HOC_test.reset_index(drop=True)

print('df_HOC:',df_HOC.shape)
print('df_HOC_train:',df_HOC_train.shape)
print('df_HOC_valid:',df_HOC_test.shape)

df_HOC: (20383, 3)
df_HOC_train: (14268, 3)
df_HOC_valid: (5642, 3)


### Dataset : Semistructure assignment

In [54]:
df_train = pd.read_csv(DATA_PATH+'/semi_assignment/axcs_train_cleaned.csv')
df_valid = pd.read_csv(DATA_PATH+'/semi_assignment/axcs_valid_cleaned.csv')
df_test = pd.read_csv(DATA_PATH+'/semi_assignment/axcs_test_cleaned.csv')

In [55]:
df_semi_train = pd.concat([df_train,df_valid],axis=0)[['Abstract','InfoTheory','CompVis','Math']]
df_semi_train.columns = ['text','label_InfoTheory','label_CompVis','label_Math']
df_semi_train.head()

Unnamed: 0,text,label_InfoTheory,label_CompVis,label_Math
0,Decentralized Constraint Satisfaction We show...,0,0,0
1,Utility Constrained Energy Minimization In Al...,1,0,1
2,Deterministic modal Bayesian Logic: derive th...,0,0,1
3,Regret Bounds for Reinforcement Learning with...,0,0,0
4,Theoretical Limits on Time Delay Estimation f...,1,0,1


In [56]:
df_semi_test = df_test[['Abstract','InfoTheory','CompVis','Math']]
df_semi_test.columns = ['text','label_InfoTheory','label_CompVis','label_Math']
df_semi_test.head()

Unnamed: 0,text,label_InfoTheory,label_CompVis,label_Math
0,A Data Transparency Framework for Mobile Appl...,0.0,0.0,0.0
1,A reclaimer scheduling problem arising in coa...,0.0,0.0,0.0
2,Communication-Efficient Distributed Optimizat...,0.0,0.0,1.0
3,Consistent Classification Algorithms for Mult...,0.0,0.0,0.0
4,Managing key multicasting through orthogonal ...,0.0,0.0,0.0




---

##BERT Text Representation

Transform Language Model

When using BERT, technically we are transforming our sentence into a vector that represent each sentence. The process is call Language Model a representation of each word. 

BERT add [CLS] token infront of each sentence. This token representation vector could later be use for Classification as it contain the sentence representation.

### Class my_BERT

In [44]:
class my_BERT:
    ###### Load pretrain BERT Language Model transformer (Otherwise use 'set' to customize)
    # # For DistilBERT:
    # # model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')

    ## Want BERT instead of distilBERT? Uncomment the following line:
    model_class, tokenizer_class, pretrained_weights = (ppb.BertModel, ppb.BertTokenizer, 'bert-base-uncased')

    # Load pretrained model/tokenizer
    bert_tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
    bert_model = model_class.from_pretrained(pretrained_weights)

    def __init__(self, df_input, ENABLE_LOGS = 1):
        ## INPUT STRUCTURE (COLUMNS): 
        ## - 'text' - Required
        ## - 'label' - Optional default name is 'label' otherwise need to specific when called

        self.df = df_input
        self.df_BERT = None
        self.ENABLE_LOGS = ENABLE_LOGS
    
    def print_log(self, *arg, log_type="Info"):
        if(self.ENABLE_LOGS==1 or log_type!="Info"): 
            print("["+log_type+"]"," ".join(arg))

    def bert_tokenize(self, token_length=128):
        # return "Hellow"
        df_output = self.df.copy()

        # BERT Tokenizer + truncate to BERT_MAX_LENGTH
        df_output['BERTTokens'] = df_output["text"].apply((lambda x: self.bert_tokenizer.encode(x, add_special_tokens=True,truncation=True,max_length = token_length)))
        # df_output['n_tokens0'] = df_output['BERTTokens'].apply(lambda x: len(x)) # Just for verification
        self.print_log("Token - Done")

        # Padding tokens to BERT_MAX_LENGTH
        df_output['BERTTokens'] = df_output['BERTTokens'].apply(lambda x: x + [0]*(token_length-len(x)))
        # df_output['n_tokens'] = df_output['BERTTokens'].apply(lambda x: len(x)) # Just for verification
        self.print_log("Pad - Done")

        # BERT Mask
        df_output['BERTMasks'] = df_output['BERTTokens'].apply(lambda x: [np.where(i != 0, 1, 0) for i in x])
        # df_output['n_mask1'] = df_output['BERTMask'].apply(lambda x: sum(x)) # Just for verification
        self.print_log("Mask - Done")

        return df_output

    def run_bert_transform(self, dataloader, device_available = torch.device("cpu")):
        all_result = []

        self.bert_model.to(device_available)

        for step, batch in enumerate(dataloader):
            self.print_log("Step:",str(step),"/",str(len(batch)))

            b_input_ids = batch[0].to(device_available)
            b_input_mask = batch[1].to(device_available)

            with torch.no_grad():
                last_hidden_states = self.bert_model(b_input_ids, attention_mask=b_input_mask)
        
            res_features = last_hidden_states[0][:,0,:].cpu().numpy()
            all_result.append(res_features)
        self.print_log("BERT transform - Done")

        return np.vstack(all_result)


    def bert_transform(self, device_available = torch.device("cpu"), batch_size = 32, token_length=128):
        df_output = self.bert_tokenize(token_length)

        # Convert to Tensor
        input_tokens = torch.tensor(np.stack(df_output['BERTTokens'].values))
        input_masks = torch.tensor(np.stack(df_output['BERTMasks'].values))
        # print(input_tokens,input_masks)

        # Create the DataLoader for our training set.
        input_data = TensorDataset(input_tokens, input_masks)
        input_sampler = SequentialSampler(input_data)
        input_dataloader = DataLoader(input_data, sampler=input_sampler, batch_size=batch_size)

        self.print_log("Running BERT Transform on", str(device_available))
        if(str(device_available) == 'cpu'):
            self.print_log("Running BERT on CPU can take longer time...",log_type="WARNING")
        self.print_log("Data size:",str(len(input_tokens)), "( Total batch", str(len(input_dataloader)),'* size',str(batch_size),")")
        
        output_features = self.run_bert_transform(input_dataloader,device_available)
        # print(output_features)
        # print(pd.Series(output_features.tolist()))
        df_output['BERT_Features'] = pd.Series(output_features.tolist())
        
        self.print_log("BERT transformed", log_type="Success")
        self.df_BERT = df_output

    def get_features(self):
        if(isinstance(self.df_BERT, pd.DataFrame)):
            return np.array([np.array(xi) for xi in self.df_BERT.BERT_Features.values])
        else:
            print_log("Please run function 'bert_transform' to generate text representation first!",log_type="Error")

    def get_labels(self,col_target = 'label'):
        return np.array(self.df_BERT[col_target].tolist())

    def set_bert_tokenizer(self,tokenizer):
        self.bert_tokenizer = tokenizer

    def set_bert_model(self,model):
        self.bert_model = model


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Transform Data to Text Representation

#### HOC

In [57]:
df_HOC_train.head()

Unnamed: 0,text,label,label_bool
0,"A low rate of proliferation was observed , sim...",,0
1,"This glycolytic shift , called the Warburg eff...",,0
2,Several studies have indicated that the cell-s...,,0
3,The hypothesis of a selective effect on differ...,Avoiding immune destruction--Immunosuppression,1
4,The HER-2 gene encodes a cell-surface growth f...,,0


In [46]:
bert_HOC_train = my_BERT(df_HOC_train)
bert_HOC_train.bert_transform(DEVICE_AVAILABLE)

bert_HOC_test = my_BERT(df_HOC_test)
bert_HOC_test.bert_transform(DEVICE_AVAILABLE)

# bert_HOC_train.df_BERT.head()
# bert_HOC_train.get_features()

[Info] Token success
[Info] Pad success
[Info] Mask success
[Info] Running BERT Transform on cuda
[Info] Data size: 14268 ( Total batch 446 * size 32 )
[Info] BERT transform success
[Info] Token success
[Info] Pad success
[Info] Mask success
[Info] Running BERT Transform on cuda
[Info] Data size: 5642 ( Total batch 177 * size 32 )
[Info] BERT transform success


#### Semi

In [59]:
df_semi_train.head()

Unnamed: 0,text,label_InfoTheory,label_CompVis,label_Math
0,Decentralized Constraint Satisfaction We show...,0,0,0
1,Utility Constrained Energy Minimization In Al...,1,0,1
2,Deterministic modal Bayesian Logic: derive th...,0,0,1
3,Regret Bounds for Reinforcement Learning with...,0,0,0
4,Theoretical Limits on Time Delay Estimation f...,1,0,1


In [58]:
bert_semi_train = my_BERT(df_semi_train)
bert_semi_train.bert_transform(DEVICE_AVAILABLE)

[Info] Token success
[Info] Pad success
[Info] Mask success
[Info] Running BERT Transform on cuda
[Info] Data size: 54731 ( Total batch 1711 * size 32 )
[Info] BERT transform success


In [60]:
bert_semi_test = my_BERT(df_semi_test)
bert_semi_test.bert_transform(DEVICE_AVAILABLE)

[Info] Token success
[Info] Pad success
[Info] Mask success
[Info] Running BERT Transform on cuda
[Info] Data size: 19678 ( Total batch 615 * size 32 )
[Info] BERT transform success




---


## Downstream Model

In [26]:
df_result = pd.DataFrame()
all_models = pd.DataFrame()

### Train Model

In [47]:
def build_logistic_model_BERT(x_train,y_train,SEED_NUMBER = 19900506):
    model = LogisticRegression(random_state=SEED_NUMBER,max_iter=500)
    model_name = model.__class__.__name__
    model.fit(x_train, y_train)
    return model

# trainLabels = df_semi_train['label_Math'].tolist() #+ df_valid['Math'].tolist()
# m = build_logistic_model_BERT(train_features,np.asarray(trainLabels))
# all_models['Logistic_all_lemma_math_BERT'] = [m]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
## FOR HOC
m = build_logistic_model_BERT(bert_HOC_train.get_features(),bert_HOC_train.get_labels('label_bool'))
all_models['Logistic_all_HOC_BERT'] = [m]

## For Semi
# Train all 3 models of BERT_Statistic 
var_target = ['label_Math','label_CompVis','label_InfoTheory']

for tg in var_target:
    # 6 models of all data
    m = build_logistic_model_BERT(bert_semi_train.get_features(),bert_semi_train.get_labels(tg))
    all_models["Logistic_all_"+tg+"_BERT"] = [m]

print(all_models.keys())

### Test Model

In [51]:
def test_logistic_model_BERT(model,x_test,y_test,dummy = 0):
    # Do the prediction
    if(dummy==1):
        y_predict = [0]*len(x_test)
    else:
        y_predict=model.predict(x_test)

    # print(confusion_matrix(y_test,y_predict))
    tn, fp, fn, tp = confusion_matrix(y_test,y_predict).ravel()
    print((tn, fp, fn, tp))
    recall = recall_score(y_test,y_predict,average='macro')
    precision = precision_score(y_test,y_predict,average='macro')
    f1score = f1_score(y_test,y_predict,average='macro')
    accuracy = accuracy_score(y_test,y_predict)
    # matthews = matthews_corrcoef(y_test,y_predict) 
    print('Accuracy: '+ str(accuracy))
    print('Macro Precision: '+ str(precision))
    print('Macro Recall: '+ str(recall))
    print('Macro F1 score:'+ str(f1score))
    # print('MCC:'+ str(matthews))

    return({'model_name':None,
            'target':None,
            'algorithm':"BERT_Logistic",
            'text_preprocessing':None,
            'datasize':None,
            'TP':tp,
            'TN':tn,
            'FP':fp,
            'FN':fn,
            'accuracy':accuracy,
            'precision':precision,
            'recall':recall,
            'f1score':f1score})

(3752, 740, 666, 484)
Accuracy: 0.7507975895072669
Macro Precision: 0.6223389461413055
Macro Recall: 0.6280661272213404
Macro F1 score:0.6249752036890481
(4492, 0, 1150, 0)
Accuracy: 0.7961715703651188
Macro Precision: 0.3980857851825594
Macro Recall: 0.5
Macro F1 score:0.44326031182159065


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## FOR HOC
result = test_logistic_model_BERT(all_models["Logistic_all_HOC_BERT"][0], 
                                  bert_HOC_test.get_features(), 
                                  bert_HOC_test.get_labels('label_bool'))
result['model_name'] = 'Logistic_all_HOC_BERT'
result['target'] = "HOC"
df_result = df_result.append(result,ignore_index=True)

# For DUMMY
result = test_logistic_model_BERT(all_models["Logistic_all_HOC_BERT"][0],
                                  bert_HOC_test.get_features(), 
                                  bert_HOC_test.get_labels('label_bool'),1)
result['model_name'] = 'Logistic_all_HOC_BERT'
result['target'] = "HOC"
df_result = df_result.append(result,ignore_index=True)

# For Semi
# Test all 6 models of Statistic BERT (3 target * all * Lemma+BERT & BERT)
var_target = ['label_Math','label_CompVis','label_InfoTheory']

for tg in var_target:
    result = test_logistic_model_BERT(all_models["Logistic_all_"+tg+"_BERT"][0], 
                                  bert_semi_test.get_features(), 
                                  bert_semi_test.get_labels(tg))
    result['model_name'] = "Logistic_all_"+tg+"_BERT"
    result['target'] = tg
    df_result = df_result.append(result,ignore_index=True)

In [63]:
df_result

Unnamed: 0,FN,FP,TN,TP,accuracy,algorithm,datasize,f1score,model_name,precision,recall,target,text_preprocessing
0,666.0,740.0,3752.0,484.0,0.750798,BERT_Logistic,,0.624975,Logistic_all_HOC_BERT,0.622339,0.628066,HOC,
1,666.0,740.0,3752.0,484.0,0.750798,BERT_Logistic,,0.624975,Logistic_all_HOC_BERT,0.622339,0.628066,HOC,
2,1150.0,0.0,4492.0,0.0,0.796172,BERT_Logistic,,0.44326,Logistic_all_HOC_BERT,0.398086,0.5,HOC,
3,2165.0,1068.0,12680.0,3765.0,0.835705,BERT_Logistic,,0.793275,Logistic_all_label_Math_BERT,0.816589,0.778612,label_Math,
4,908.0,110.0,17416.0,1244.0,0.948267,BERT_Logistic,,0.840622,Logistic_all_label_CompVis_BERT,0.934603,0.785895,label_CompVis,
5,1206.0,318.0,15744.0,2410.0,0.922553,BERT_Logistic,,0.856804,Logistic_all_label_InfoTheory_BERT,0.90614,0.823342,label_InfoTheory,


In [64]:
df_result.to_csv(DATA_PATH+'my_csv.csv', mode='a+')



---

