##### Copyright 2020 The TensorFlow Hub Authors.


In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/text/tutorials/classify_text_with_bert"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/classify_text_with_bert.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/text/blob/master/docs/tutorials/classify_text_with_bert.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/text/docs/tutorials/classify_text_with_bert.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
  <td>
    <a href="https://tfhub.dev/google/collections/bert/1"><img src="https://www.tensorflow.org/images/hub_logo_32px.png" />See TF Hub model</a>
  </td>
</table>

## Setup


In [None]:
# A dependency of the preprocessing for BERT inputs
!pip install -q -U "tensorflow-text==2.11.*"

You will use the AdamW optimizer from [tensorflow/models](https://github.com/tensorflow/models).

In [None]:
!pip install -q tf-models-official==2.11.0

In [None]:
!pip install tensorflow_hub

In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import validation_curve
from sklearn.metrics import make_scorer
from sklearn import metrics as mt
from sklearn.metrics import f1_score as f1
import matplotlib.pyplot as plt

tf.get_logger().setLevel('ERROR')

In [None]:
print(tf.config.list_physical_devices('GPU'))

In [None]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Shared input-pipeline settings read by the dataset-loading code in this notebook.
AUTOTUNE = tf.data.AUTOTUNE  # passed as the prefetch buffer size
batch_size = 16  # batch size handed to text_dataset_from_directory
seed = 42  # shuffle seed for the training and validation datasets

## Dataset Generation

In [None]:
def remove_non_ascii(sentence):
    """Return `sentence` without the characters whose code point is 128 or higher."""
    ascii_only = [symbol for symbol in sentence if ord(symbol) <= 127]
    return ''.join(ascii_only)
    
def html_Filter(sentence):
    """Parse `sentence` with BeautifulSoup's "lxml" parser and return the parsed document's text."""
    parsed_markup = BeautifulSoup(sentence, "lxml")
    return parsed_markup.text

In [None]:
def delete_files_in_directory(directory_path):
    """Remove every regular file directly inside `directory_path`.

    Sub-directories are left untouched. Any OSError raised while listing or
    deleting is caught and reported with a printed message instead of being
    propagated; a success message is printed otherwise.
    """
    try:
        for entry_name in os.listdir(directory_path):
            candidate = os.path.join(directory_path, entry_name)
            if not os.path.isfile(candidate):
                continue  # only plain files are removed
            os.remove(candidate)
        print("All files deleted successfully.")
    except OSError:
        print("Error occurred while deleting files.")

In [None]:
def create_folder(class_list,file_path,df_data,col = "y_ETD"):
    """Write one text file per row of `df_data` into per-class sub-directories.

    Args:
        class_list: class names (strings); one sub-directory of `file_path`
            is prepared for each of them.
        file_path: directory prefix the class folders are appended to by plain
            string concatenation, so it is expected to end with a separator.
        df_data: table providing the "issue_clean" text column and the label
            column `col`.
        col: name of the label column; each row is written to the folder
            named after `str(label)`.

    Every text is passed through `html_Filter` and then `remove_non_ascii`
    before being saved as `<row position>.txt`.
    """
    # Prepare the class folders: create missing ones, empty the existing ones.
    for class_name in class_list:
        class_dir = file_path+class_name+'/'
        if os.path.exists(class_dir):
            delete_files_in_directory(class_dir)
        else:
            os.makedirs(class_dir)

    texts = df_data["issue_clean"]
    labels = df_data[col]
    for row_idx, (raw_text, label) in enumerate(zip(texts, labels)):
        cleaned = remove_non_ascii(html_Filter(raw_text))
        target = file_path+str(label)+'/'+str(row_idx)+".txt"
        with open(target,"w",encoding="UTF-8") as out_file:
            out_file.write(cleaned)

## Dataset generation for 10-fold cross-validation

In [None]:
# One list per label set (ETD, PS, TS) and per split (train, val, test).
train_ds_etd_list, val_ds_etd_list, test_ds_etd_list = [], [], []
train_ds_ps_list, val_ds_ps_list, test_ds_ps_list = [], [], []
train_ds_ts_list, val_ds_ts_list, test_ds_ts_list = [], [], []

In [None]:
path = "../data/cross_validate_10/"
# Keep only real fold directories: stray files and hidden folders (for example
# .ipynb_checkpoints) would otherwise be treated as folds and fail on read_csv.
dir_list = sorted(
    entry for entry in os.listdir(path)
    if os.path.isdir(os.path.join(path, entry)) and not entry.startswith(".")
)

# Empty the per-fold lists first so that re-running this cell does not append
# every fold a second time.
for fold_datasets in (train_ds_etd_list, val_ds_etd_list, test_ds_etd_list,
                      train_ds_ps_list, val_ds_ps_list, test_ds_ps_list):
    fold_datasets.clear()


def _load_text_dataset(directory, shuffle=True):
    """Load a class-per-folder text dataset with one-hot labels, cached and prefetched.

    Args:
        directory: folder containing one sub-directory per class.
        shuffle: True for training/validation data (shuffled with the
            notebook-level `seed`); False for test data, whose file order must
            stay fixed so predictions can be matched back to file names.
    """
    dataset = tf.keras.utils.text_dataset_from_directory(
        directory,
        labels='inferred',
        label_mode='categorical',
        batch_size=batch_size,
        shuffle=shuffle,
        seed=seed if shuffle else None)
    return dataset.cache().prefetch(buffer_size=AUTOTUNE)


for folder in dir_list:
    fold_root = path+folder

    train_etd_path = fold_root+"/train/train_bert_etd/"
    val_etd_path = fold_root+"/val/val_bert_etd/"
    test_etd_path = fold_root+"/test/test_bert_etd/"

    train_ps_path = fold_root+"/train/train_bert_ps/"
    val_ps_path = fold_root+"/val/val_bert_ps/"
    test_ps_path = fold_root+"/test/test_bert_ps/"

    # The TS paths are defined but no TS files or datasets are created in this cell.
    train_ts_path = fold_root+"/train/train_bert_ts/"
    val_ts_path = fold_root+"/val/val_bert_ts/"
    test_ts_path = fold_root+"/test/test_bert_ts/"

    df_train = pd.read_csv(fold_root+"/train.csv")
    df_val = pd.read_csv(fold_root+"/val.csv")
    df_test = pd.read_csv(fold_root+"/test.csv")

    # Class folder names come from the y_ETD labels of the training split and
    # are reused for the PS folders as well.
    class_list = sorted(str(label) for label in set(df_train["y_ETD"]))

    # Materialise every split as <class>/<row>.txt files for both label columns.
    for target_dir, frame, label_col in (
            (train_etd_path, df_train, "y_ETD"),
            (val_etd_path, df_val, "y_ETD"),
            (test_etd_path, df_test, "y_ETD"),
            (train_ps_path, df_train, "y_PS"),
            (val_ps_path, df_val, "y_PS"),
            (test_ps_path, df_test, "y_PS")):
        create_folder(class_list, target_dir, frame, col = label_col)

    train_ds_etd = _load_text_dataset(train_etd_path)
    val_ds_etd = _load_text_dataset(val_etd_path)
    test_ds_etd = _load_text_dataset(test_etd_path, shuffle=False)

    train_ds_ps = _load_text_dataset(train_ps_path)
    val_ds_ps = _load_text_dataset(val_ps_path)
    test_ds_ps = _load_text_dataset(test_ps_path, shuffle=False)

    train_ds_etd_list.append(train_ds_etd)
    val_ds_etd_list.append(val_ds_etd)
    test_ds_etd_list.append(test_ds_etd)
    train_ds_ps_list.append(train_ds_ps)
    val_ds_ps_list.append(val_ds_ps)
    test_ds_ps_list.append(test_ds_ps)

## Loading models from TensorFlow Hub

Here you can choose which BERT model you will load from TensorFlow Hub and fine-tune. There are multiple BERT models available.

  - [BERT-Base, Uncased](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3) and [seven more models](https://tfhub.dev/google/collections/bert/1) with trained weights released by the original BERT authors.
  - [Small BERTs](https://tfhub.dev/google/collections/bert/1) have the same general architecture but fewer and/or smaller Transformer blocks, which lets you explore tradeoffs between speed, size and quality.
  - [ALBERT](https://tfhub.dev/google/collections/albert/1): four different sizes of "A Lite BERT" that reduces model size (but not computation time) by sharing parameters between layers.
  - [BERT Experts](https://tfhub.dev/google/collections/experts/bert/1): eight models that all have the BERT-base architecture but offer a choice between different pre-training domains, to align more closely with the target task.
  - [Electra](https://tfhub.dev/google/collections/electra/1) has the same architecture as BERT (in three different sizes), but gets pre-trained as a discriminator in a set-up that resembles a Generative Adversarial Network (GAN).
  - BERT with Talking-Heads Attention and Gated GELU [[base](https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1), [large](https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1)] has two improvements to the core of the Transformer architecture.

The model documentation on TensorFlow Hub has more details and references to the
research literature. Follow the links above, or click on the [`tfhub.dev`](http://tfhub.dev) URL
printed after the next cell execution.

We suggest starting with a Small BERT (fewer parameters), since these models are faster to fine-tune. If you want a small model with higher accuracy, ALBERT might be your next option. If you want even better accuracy, choose
one of the classic BERT sizes or their recent refinements like Electra, Talking Heads, or a BERT Expert.

Aside from the models available below, there are [multiple versions](https://tfhub.dev/google/collections/transformer_encoders_text/1) of the models that are larger and can yield even better accuracy, but they are too big to be fine-tuned on a single GPU. You will be able to do that on the [Solve GLUE tasks using BERT on a TPU colab](https://www.tensorflow.org/text/tutorials/bert_glue).

You'll see in the code below that switching the tfhub.dev URL is enough to try any of these models, because all the differences between them are encapsulated in the SavedModels from TF Hub.

In [None]:
#@title Choose a BERT model to fine-tune

bert_model_name = 'small_bert/bert_en_uncased_L-12_H-768_A-12'  #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

# Encoder name -> TF Hub handle of the encoder SavedModel.
map_name_to_handle = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
    # The 24 Small BERT variants follow one naming scheme: every combination
    # of layer count and hidden size, with hidden_size // 64 attention heads.
    **{
        f'small_bert/{variant}':
            f'https://tfhub.dev/tensorflow/small_bert/{variant}/1'
        for variant in (
            f'bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}'
            for layers in (2, 4, 6, 8, 10, 12)
            for hidden in (128, 256, 512, 768)
        )
    },
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_base/2',
    'electra_small':
        'https://tfhub.dev/google/electra_small/2',
    'electra_base':
        'https://tfhub.dev/google/electra_base/2',
    'electra_large':
        'https://tfhub.dev/google/electra_large/2',
    'experts_pubmed':
        'https://tfhub.dev/google/experts/bert/pubmed/2',
    'experts_wiki_books':
        'https://tfhub.dev/google/experts/bert/wiki_books/2',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
    'roberta_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/jeongukjae/roberta_en_cased_L-12_H-768_A-12/1',
}

# Encoder name -> TF Hub handle of the preprocessing model that tokenizes text
# for that encoder. The preprocessing model must use the encoder's own
# vocabulary, otherwise the encoder receives token IDs it was not trained on.
map_model_to_preprocess = {
    'bert_en_uncased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'bert_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
    # All 24 Small BERT variants share the uncased English BERT preprocessing.
    **{
        f'small_bert/bert_en_uncased_L-{layers}_H-{hidden}_A-{hidden // 64}':
            'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
        for layers in (2, 4, 6, 8, 10, 12)
        for hidden in (128, 256, 512, 768)
    },
    'bert_multi_cased_L-12_H-768_A-12':
        'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
    'albert_en_base':
        'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
    'electra_small':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'electra_large':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_pubmed':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'experts_wiki_books':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    'talking-heads_base':
        'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
    # RoBERTa does not use BERT's WordPiece vocabulary, so it needs the
    # preprocessing model published alongside the encoder rather than
    # bert_en_cased_preprocess.
    'roberta_en_cased_L-12_H-768_A-12':
        'https://tfhub.dev/jeongukjae/roberta_en_cased_preprocess/1',
}

# Look up the TF Hub handles for the encoder named in `bert_model_name` and
# for its preprocessing model; both are read later when the classifier is built.
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

## Define custom model

In [None]:
def build_classifier_model(class_num=2, dropout = 0.1):
    """Build a Keras text classifier on top of the currently selected TF Hub encoder.

    The model takes raw strings, runs the preprocessing layer loaded from
    `tfhub_handle_preprocess`, feeds the result to the trainable encoder loaded
    from `tfhub_handle_encoder`, and applies dropout plus a softmax dense layer
    to the encoder's `pooled_output`.

    Args:
        class_num: number of units of the softmax output layer.
        dropout: rate of the dropout layer placed before the output layer.

    Returns:
        An uncompiled `tf.keras.Model` mapping a string tensor to class probabilities.
    """
    raw_text = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    tokenized = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')(raw_text)
    bert_outputs = hub.KerasLayer(
        tfhub_handle_encoder, trainable=True, name='BERT_encoder')(tokenized)
    pooled = tf.keras.layers.Dropout(dropout)(bert_outputs['pooled_output'])
    probabilities = tf.keras.layers.Dense(
        class_num, activation='softmax', name='classifier')(pooled)
    return tf.keras.Model(raw_text, probabilities)

## Model training and evaluation

In [None]:
import tensorflow_addons as tfa

# Cross-entropy over one-hot labels; the datasets are loaded with
# label_mode='categorical' and the classifier ends in a softmax layer.
loss = tf.keras.losses.CategoricalCrossentropy()
# F1 metric configured for 2 classes with a 0.5 threshold. This single metric
# object is passed to every model compiled in the cells below.
metrics = tfa.metrics.F1Score(num_classes =2, threshold=0.5)

In [None]:
def create_optimizer(ds, epochs, init_lr=3e-5, warmup_ratio=0.0):
    """Create an AdamW optimizer whose schedule spans one full training run on `ds`.

    Args:
        ds: batched training dataset; its cardinality is the number of
            optimizer steps per epoch.
        epochs: number of epochs the optimizer will be used for.
        init_lr: initial learning rate handed to the schedule.
        warmup_ratio: fraction of the total training steps used for warm-up.
            The default of 0.0 keeps the behaviour of earlier versions of this
            function, which always requested zero warm-up steps.

    Returns:
        The optimizer built by `optimization.create_optimizer`. Its schedule is
        sized for a single run of `epochs` epochs, so create a new optimizer
        for every model that is trained.

    Raises:
        ValueError: if the number of batches in `ds` is unknown or infinite.
    """
    steps_per_epoch = int(tf.data.experimental.cardinality(ds).numpy())
    if steps_per_epoch < 0:
        # Negative values are the sentinels for unknown/infinite cardinality;
        # multiplying them would give a meaningless negative step count.
        raise ValueError(
            "create_optimizer needs a dataset with a known, finite number of batches")

    num_train_steps = steps_per_epoch * epochs
    num_warmup_steps = int(warmup_ratio * num_train_steps)

    return optimization.create_optimizer(init_lr=init_lr,
                                         num_train_steps=num_train_steps,
                                         num_warmup_steps=num_warmup_steps,
                                         optimizer_type='adamw')

In [None]:
def evaluation_report(classifier_model, dataset, files, df_data, classes=None):
    """Predict `dataset` and return the predicted labels in original row order.

    Args:
        classifier_model: object with a `predict(dataset)` method returning one
            row of class scores per example.
        dataset: input passed to `predict`; its examples must come in the same
            order as `files`.
        files: file names of the form "<row position>.txt", one per example.
        df_data: unused; kept so existing callers keep working.
        classes: class names indexed by the model's output position. Defaults
            to the notebook-level `class_list`. Each name must be convertible
            to a number.

    Returns:
        A float array in which element k holds the predicted label of the
        example whose file is named "k.txt".

    Raises:
        ValueError: if the number of files differs from the number of predictions.
    """
    if classes is None:
        classes = class_list

    y_prob = np.asarray(classifier_model.predict(dataset))
    if y_prob.size:
        predicted_category_index = np.argmax(y_prob, axis=1)
    else:
        predicted_category_index = np.array([], dtype=int)

    if len(files) != len(predicted_category_index):
        # A mismatch means predictions cannot be attributed to files reliably.
        raise ValueError(
            f"got {len(predicted_category_index)} predictions for {len(files)} files")

    label_pred = np.zeros(len(predicted_category_index))
    for file, category_index in zip(files, predicted_category_index):
        # The file stem is the row position the text was written from.
        row_position = int(os.path.splitext(file)[0])
        label_pred[row_position] = float(classes[category_index])

    return label_pred

In [None]:
# Cross-validation on the y_ETD label: for every fold, sweep the dropout rate,
# keep the model with the best validation F1 and score it on the fold's test split.
precision_list = []
recall_list = []
f1_list = []
test_data = []
ylabels = []
yhatlabels = []
dropout_list = np.linspace(0.2, 0.6, 5)
epochs = 10

for i,(train_ds,val_ds,test_ds) in enumerate(zip(train_ds_etd_list, val_ds_etd_list, test_ds_etd_list)):
    max_clf = None
    max_score = -np.inf  # any finite score beats this, so a model is always selected
    best_dropout = None
    print(f'Training model with {tfhub_handle_encoder}')
    for rate in dropout_list:
        # A new optimizer per model: create_optimizer sizes the learning-rate
        # schedule for exactly one run of `epochs` epochs, so a reused
        # optimizer would continue an already finished schedule.
        optimizer = create_optimizer(train_ds,epochs)
        clf = build_classifier_model(dropout = rate)
        clf.compile(optimizer=optimizer,
                    loss=loss,
                    metrics=metrics)

        history = clf.fit(x=train_ds,
                          validation_data=val_ds,
                          verbose=0,
                          epochs=epochs)

        # Find the validation-F1 entry in the recorded history instead of
        # hard-coding a key that depends on layer and metric naming.
        val_f1_key = next((key for key in history.history
                           if key.startswith("val_") and key.endswith("f1_score")), None)
        if val_f1_key is None:
            raise KeyError(f"no validation F1 score in history: {sorted(history.history)}")

        curr_score = np.average(history.history[val_f1_key])
        if curr_score > max_score:
            max_score = curr_score
            max_clf = clf
            best_dropout = rate

    if max_clf is None:
        raise RuntimeError(f"fold {i+1}: no model produced a comparable validation F1 score")

    print("fold",(i+1),": best dropout rate is",best_dropout)
    # The datasets were built by iterating dir_list, so index it for the folder name.
    fold_name = dir_list[i]
    test_etd_path = path+fold_name+"/test/test_bert_etd/"
    df_test = pd.read_csv(path+fold_name+"/test.csv")
    test_files = []
    for test_folder in class_list:
        file_path = test_etd_path+test_folder+"/"
        test_files.extend(sorted(os.listdir(file_path)))

    label_pred = evaluation_report(max_clf, test_ds,test_files,df_test)

    y_test = df_test["y_ETD"].values
    precison = mt.precision_score(y_test, label_pred)
    recall = mt.recall_score(y_test, label_pred)
    score = mt.f1_score(y_test, label_pred)

    test_data.extend(list(df_test["issue_clean"].values))
    ylabels.extend(list(y_test))
    yhatlabels.extend(list(label_pred))

    precision_list.append(precison)
    recall_list.append(recall)
    f1_list.append(score)

print("precision:",round(np.average(precision_list),3),"recall:",round(np.average(recall_list),3),"F1:",round(np.average(f1_list),3))

In [None]:
# Gather the test texts, true labels and predictions of all folds into one table.
df_final = pd.DataFrame({
    "issue": test_data,
    "y": ylabels,
    "y_pred": yhatlabels,
})
df_final

In [None]:
# Write the result table of the ETD run to CSV.
df_final.to_csv("../data/pretrained/BERT_ETD_result.csv",index =None)

In [None]:
# Cross-validation on the y_PS label: for every fold, sweep the dropout rate,
# keep the model with the best validation F1 and score it on the fold's test split.
precision_list = []
recall_list = []
f1_list = []
test_data = []
ylabels = []
yhatlabels = []
dropout_list = np.linspace(0.2, 0.6, 5)
epochs = 10

for i,(train_ds,val_ds,test_ds) in enumerate(zip(train_ds_ps_list, val_ds_ps_list, test_ds_ps_list)):
    max_clf = None
    max_score = -np.inf  # any finite score beats this, so a model is always selected
    best_dropout = None
    print(f'Training model with {tfhub_handle_encoder}')
    for rate in dropout_list:
        # A new optimizer per model: create_optimizer sizes the learning-rate
        # schedule for exactly one run of `epochs` epochs, so a reused
        # optimizer would continue an already finished schedule.
        optimizer = create_optimizer(train_ds,epochs)
        clf = build_classifier_model(dropout = rate)
        clf.compile(optimizer=optimizer,
                    loss=loss,
                    metrics=metrics)

        history = clf.fit(x=train_ds,
                          validation_data=val_ds,
                          verbose=0,
                          epochs=epochs)

        # Find the validation-F1 entry in the recorded history instead of
        # hard-coding a key that depends on layer and metric naming.
        val_f1_key = next((key for key in history.history
                           if key.startswith("val_") and key.endswith("f1_score")), None)
        if val_f1_key is None:
            raise KeyError(f"no validation F1 score in history: {sorted(history.history)}")

        curr_score = np.average(history.history[val_f1_key])
        if curr_score > max_score:
            max_score = curr_score
            max_clf = clf
            best_dropout = rate

    if max_clf is None:
        raise RuntimeError(f"fold {i+1}: no model produced a comparable validation F1 score")

    print("fold",(i+1),": best dropout rate is",best_dropout)
    # The datasets were built by iterating dir_list, so index it for the folder name.
    fold_name = dir_list[i]
    test_etd_path = path+fold_name+"/test/test_bert_ps/"
    df_test = pd.read_csv(path+fold_name+"/test.csv")
    test_files = []
    for test_folder in class_list:
        file_path = test_etd_path+test_folder+"/"
        test_files.extend(sorted(os.listdir(file_path)))

    label_pred = evaluation_report(max_clf, test_ds,test_files,df_test)

    y_test = df_test["y_PS"].values
    precison = mt.precision_score(y_test, label_pred)
    recall = mt.recall_score(y_test, label_pred)
    score = mt.f1_score(y_test, label_pred)

    test_data.extend(list(df_test["issue_clean"].values))
    ylabels.extend(list(y_test))
    yhatlabels.extend(list(label_pred))

    precision_list.append(precison)
    recall_list.append(recall)
    f1_list.append(score)

print("precision:",round(np.average(precision_list),3),"recall:",round(np.average(recall_list),3),"F1:",round(np.average(f1_list),3))

In [None]:
# Gather the test texts, true labels and predictions of all folds into one
# table, save it, and show the first rows.
df_final = pd.DataFrame({
    "issue": test_data,
    "y": ylabels,
    "y_pred": yhatlabels,
})
# Make sure the output folder exists so the export cannot fail after training.
os.makedirs("../data/pretrained", exist_ok=True)
df_final.to_csv("../data/pretrained/BERT_PS_result.csv",index =None)
# Only the last expression of a cell is displayed, so the preview goes last.
df_final.head()

In [None]:
# Switch the encoder to RoBERTa; the handles set here are read by
# build_classifier_model when the following cells build their models.
bert_model_name = 'roberta_en_cased_L-12_H-768_A-12'
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(f'BERT model selected           : {tfhub_handle_encoder}')
print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

In [None]:
# Cross-validation on the y_ETD label with the currently selected encoder: for
# every fold, sweep the dropout rate, keep the model with the best validation
# F1 and score it on the fold's test split.
precision_list = []
recall_list = []
f1_list = []
test_data = []
ylabels = []
yhatlabels = []
dropout_list = np.linspace(0.2, 0.6, 5)
epochs = 10

for i,(train_ds,val_ds,test_ds) in enumerate(zip(train_ds_etd_list, val_ds_etd_list, test_ds_etd_list)):
    max_clf = None
    max_score = -np.inf  # any finite score beats this, so a model is always selected
    best_dropout = None
    print(f'Training model with {tfhub_handle_encoder}')
    for rate in dropout_list:
        # A new optimizer per model: create_optimizer sizes the learning-rate
        # schedule for exactly one run of `epochs` epochs, so a reused
        # optimizer would continue an already finished schedule.
        optimizer = create_optimizer(train_ds,epochs)
        clf = build_classifier_model(dropout = rate)
        clf.compile(optimizer=optimizer,
                    loss=loss,
                    metrics=metrics)

        history = clf.fit(x=train_ds,
                          validation_data=val_ds,
                          verbose=0,
                          epochs=epochs)

        # Find the validation-F1 entry in the recorded history instead of
        # hard-coding a key that depends on layer and metric naming.
        val_f1_key = next((key for key in history.history
                           if key.startswith("val_") and key.endswith("f1_score")), None)
        if val_f1_key is None:
            raise KeyError(f"no validation F1 score in history: {sorted(history.history)}")

        curr_score = np.average(history.history[val_f1_key])
        if curr_score > max_score:
            max_score = curr_score
            max_clf = clf
            best_dropout = rate

    if max_clf is None:
        raise RuntimeError(f"fold {i+1}: no model produced a comparable validation F1 score")

    print("fold",(i+1),": best dropout rate is",best_dropout)
    # The datasets were built by iterating dir_list, so index it for the folder name.
    fold_name = dir_list[i]
    test_etd_path = path+fold_name+"/test/test_bert_etd/"
    df_test = pd.read_csv(path+fold_name+"/test.csv")
    test_files = []
    for test_folder in class_list:
        file_path = test_etd_path+test_folder+"/"
        test_files.extend(sorted(os.listdir(file_path)))

    label_pred = evaluation_report(max_clf, test_ds,test_files,df_test)

    y_test = df_test["y_ETD"].values
    precison = mt.precision_score(y_test, label_pred)
    recall = mt.recall_score(y_test, label_pred)
    score = mt.f1_score(y_test, label_pred)

    test_data.extend(list(df_test["issue_clean"].values))
    ylabels.extend(list(y_test))
    yhatlabels.extend(list(label_pred))

    precision_list.append(precison)
    recall_list.append(recall)
    f1_list.append(score)

print("precision:",round(np.average(precision_list),3),"recall:",round(np.average(recall_list),3),"F1:",round(np.average(f1_list),3))

In [None]:
# Gather the test texts, true labels and predictions of all folds into one
# table, save it, and show the first rows.
df_final = pd.DataFrame({
    "issue": test_data,
    "y": ylabels,
    "y_pred": yhatlabels,
})
# Make sure the output folder exists so the export cannot fail after training.
os.makedirs("../data/pretrained", exist_ok=True)
df_final.to_csv("../data/pretrained/Roberta_ETD_result.csv",index =None)
# Only the last expression of a cell is displayed, so the preview goes last.
df_final.head()

In [None]:
# Cross-validation on the y_PS label with the currently selected encoder: for
# every fold, sweep the dropout rate, keep the model with the best validation
# F1 and score it on the fold's test split.
precision_list = []
recall_list = []
f1_list = []
test_data = []
ylabels = []
yhatlabels = []
dropout_list = np.linspace(0.2, 0.6, 5)
epochs = 10

for i,(train_ds,val_ds,test_ds) in enumerate(zip(train_ds_ps_list, val_ds_ps_list, test_ds_ps_list)):
    max_clf = None
    max_score = -np.inf  # any finite score beats this, so a model is always selected
    best_dropout = None
    print(f'Training model with {tfhub_handle_encoder}')
    for rate in dropout_list:
        # A new optimizer per model: create_optimizer sizes the learning-rate
        # schedule for exactly one run of `epochs` epochs, so a reused
        # optimizer would continue an already finished schedule.
        optimizer = create_optimizer(train_ds,epochs)
        clf = build_classifier_model(dropout = rate)
        clf.compile(optimizer=optimizer,
                    loss=loss,
                    metrics=metrics)

        history = clf.fit(x=train_ds,
                          validation_data=val_ds,
                          verbose=0,
                          epochs=epochs)

        # Find the validation-F1 entry in the recorded history instead of
        # hard-coding a key that depends on layer and metric naming.
        val_f1_key = next((key for key in history.history
                           if key.startswith("val_") and key.endswith("f1_score")), None)
        if val_f1_key is None:
            raise KeyError(f"no validation F1 score in history: {sorted(history.history)}")

        curr_score = np.average(history.history[val_f1_key])
        if curr_score > max_score:
            max_score = curr_score
            max_clf = clf
            best_dropout = rate

    if max_clf is None:
        raise RuntimeError(f"fold {i+1}: no model produced a comparable validation F1 score")

    print("fold",(i+1),": best dropout rate is",best_dropout)
    # The datasets were built by iterating dir_list, so index it for the folder name.
    fold_name = dir_list[i]
    test_etd_path = path+fold_name+"/test/test_bert_ps/"
    df_test = pd.read_csv(path+fold_name+"/test.csv")
    test_files = []
    for test_folder in class_list:
        file_path = test_etd_path+test_folder+"/"
        test_files.extend(sorted(os.listdir(file_path)))

    label_pred = evaluation_report(max_clf, test_ds,test_files,df_test)

    y_test = df_test["y_PS"].values
    precison = mt.precision_score(y_test, label_pred)
    recall = mt.recall_score(y_test, label_pred)
    score = mt.f1_score(y_test, label_pred)

    test_data.extend(list(df_test["issue_clean"].values))
    ylabels.extend(list(y_test))
    yhatlabels.extend(list(label_pred))

    precision_list.append(precison)
    recall_list.append(recall)
    f1_list.append(score)

print("precision:",round(np.average(precision_list),3),"recall:",round(np.average(recall_list),3),"F1:",round(np.average(f1_list),3))

In [None]:
# Pool the per-fold test texts, gold labels and predictions into one table,
# write it to disk, and show a preview as the cell output.
df_final = pd.DataFrame({
    "issue": test_data,
    "y": ylabels,
    "y_pred": yhatlabels,
})

result_path = "../data/pretrained/Roberta_PS_result.csv"
# Create the target directory if needed so the export cannot fail after a
# long training run just because the folder is missing.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
df_final.to_csv(result_path, index=False)

# Only the last expression of a cell is displayed, so the preview goes here.
df_final.head()

In [None]:
# Select the encoder used by the cells that follow.
bert_model_name = 'electra_base'

# Look up the encoder handle and its matching preprocessing handle by name.
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

print(
    f'BERT model selected           : {tfhub_handle_encoder}',
    f'Preprocess model auto-selected: {tfhub_handle_preprocess}',
    sep='\n',
)

In [None]:
# Cross-validated evaluation on the ETD task for the encoder currently held in
# `tfhub_handle_encoder`. Per-fold metrics and the pooled test texts, gold
# labels and predictions are accumulated in the lists below.
precision_list = []
recall_list = []
f1_list = []
test_data = []
ylabels = []
yhatlabels = []
dropout_list = np.linspace(0.2, 0.6, 5)  # five candidate rates from 0.2 to 0.6
epochs = 10

for i, (train_ds, val_ds, test_ds) in enumerate(
        zip(train_ds_etd_list, val_ds_etd_list, test_ds_etd_list)):
    # --- Model selection over the dropout rate -----------------------------
    # The candidate with the highest validation F1 (averaged over all epochs)
    # is kept for the test evaluation of this fold.
    max_clf = None
    max_score = -np.inf
    best_dropout = None
    print(f'Training model with {tfhub_handle_encoder}')
    for rate in dropout_list:
        clf = build_classifier_model(dropout=rate)
        # A fresh optimizer per candidate: an optimizer carries training state
        # (step counter, moment estimates), so sharing one instance across
        # independently trained models lets earlier runs influence later ones.
        # NOTE(review): if create_optimizer builds a step-based warm-up/decay
        # schedule, a shared instance would also hand later candidates an
        # already-decayed learning rate — confirm against its definition.
        clf.compile(optimizer=create_optimizer(train_ds, epochs),
                    loss=loss,
                    metrics=metrics)

        history = clf.fit(x=train_ds,
                          validation_data=val_ds,
                          verbose=0,
                          epochs=epochs)

        curr_score = np.average(history.history["val_classifier1_f1_score"])
        # `max_clf is None` guarantees that a model is always selected, even
        # when every candidate scores 0 on the validation set.
        if max_clf is None or curr_score > max_score:
            max_score = curr_score
            max_clf = clf
            best_dropout = rate

    print("fold", (i + 1), ": best dropout rate is", best_dropout)

    # --- Test evaluation for this fold -------------------------------------
    test_etd_path = path + str(i) + "/test/test_bert_etd/"
    df_test = pd.read_csv(path + str(i) + "/test.csv")
    # Collect the test file names class by class, sorted within each class.
    test_files = []
    for test_folder in class_list:
        file_path = test_etd_path + test_folder + "/"
        test_files.extend(sorted(os.listdir(file_path)))

    # NOTE(review): evaluation_report is assumed to return predictions in the
    # row order of df_test — confirm against its definition.
    label_pred = evaluation_report(max_clf, test_ds, test_files, df_test)

    y_test = df_test["y_ETD"].values
    fold_precision = mt.precision_score(y_test, label_pred)
    fold_recall = mt.recall_score(y_test, label_pred)
    fold_f1 = mt.f1_score(y_test, label_pred)

    # Pool texts, gold labels and predictions across folds for later export.
    test_data.extend(list(df_test["issue_clean"].values))
    ylabels.extend(list(y_test))
    yhatlabels.extend(list(label_pred))

    precision_list.append(fold_precision)
    recall_list.append(fold_recall)
    f1_list.append(fold_f1)

# Report the unweighted mean of the per-fold metrics.
print("precision:",round(np.average(precision_list),3),"recall:",round(np.average(recall_list),3),"F1:",round(np.average(f1_list),3))

In [None]:
# Pool the per-fold test texts, gold labels and predictions into one table,
# write it to disk, and show a preview as the cell output.
df_final = pd.DataFrame({
    "issue": test_data,
    "y": ylabels,
    "y_pred": yhatlabels,
})

result_path = "../data/pretrained/Electra_ETD_result.csv"
# Create the target directory if needed so the export cannot fail after a
# long training run just because the folder is missing.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
df_final.to_csv(result_path, index=False)

# Only the last expression of a cell is displayed, so the preview goes here.
df_final.head()

In [None]:
# Cross-validated evaluation on the PS task for the encoder currently held in
# `tfhub_handle_encoder`. Per-fold metrics and the pooled test texts, gold
# labels and predictions are accumulated in the lists below.
precision_list = []
recall_list = []
f1_list = []
test_data = []
ylabels = []
yhatlabels = []
dropout_list = np.linspace(0.2, 0.6, 5)  # five candidate rates from 0.2 to 0.6
epochs = 10

for i, (train_ds, val_ds, test_ds) in enumerate(
        zip(train_ds_ps_list, val_ds_ps_list, test_ds_ps_list)):
    # --- Model selection over the dropout rate -----------------------------
    # The candidate with the highest validation F1 (averaged over all epochs)
    # is kept for the test evaluation of this fold.
    max_clf = None
    max_score = -np.inf
    best_dropout = None
    print(f'Training model with {tfhub_handle_encoder}')
    for rate in dropout_list:
        clf = build_classifier_model(dropout=rate)
        # A fresh optimizer per candidate: an optimizer carries training state
        # (step counter, moment estimates), so sharing one instance across
        # independently trained models lets earlier runs influence later ones.
        # NOTE(review): if create_optimizer builds a step-based warm-up/decay
        # schedule, a shared instance would also hand later candidates an
        # already-decayed learning rate — confirm against its definition.
        clf.compile(optimizer=create_optimizer(train_ds, epochs),
                    loss=loss,
                    metrics=metrics)

        history = clf.fit(x=train_ds,
                          validation_data=val_ds,
                          verbose=0,
                          epochs=epochs)

        curr_score = np.average(history.history["val_classifier1_f1_score"])
        # `max_clf is None` guarantees that a model is always selected, even
        # when every candidate scores 0 on the validation set.
        if max_clf is None or curr_score > max_score:
            max_score = curr_score
            max_clf = clf
            best_dropout = rate

    print("fold", (i + 1), ": best dropout rate is", best_dropout)

    # --- Test evaluation for this fold -------------------------------------
    test_etd_path = path + str(i) + "/test/test_bert_ps/"
    df_test = pd.read_csv(path + str(i) + "/test.csv")
    # Collect the test file names class by class, sorted within each class.
    test_files = []
    for test_folder in class_list:
        file_path = test_etd_path + test_folder + "/"
        test_files.extend(sorted(os.listdir(file_path)))

    # NOTE(review): evaluation_report is assumed to return predictions in the
    # row order of df_test — confirm against its definition.
    label_pred = evaluation_report(max_clf, test_ds, test_files, df_test)

    y_test = df_test["y_PS"].values
    fold_precision = mt.precision_score(y_test, label_pred)
    fold_recall = mt.recall_score(y_test, label_pred)
    fold_f1 = mt.f1_score(y_test, label_pred)

    # Pool texts, gold labels and predictions across folds for later export.
    test_data.extend(list(df_test["issue_clean"].values))
    ylabels.extend(list(y_test))
    yhatlabels.extend(list(label_pred))

    precision_list.append(fold_precision)
    recall_list.append(fold_recall)
    f1_list.append(fold_f1)

# Report the unweighted mean of the per-fold metrics.
print("precision:",round(np.average(precision_list),3),"recall:",round(np.average(recall_list),3),"F1:",round(np.average(f1_list),3))

In [None]:
# Pool the per-fold test texts, gold labels and predictions into one table,
# write it to disk, and show a preview as the cell output.
df_final = pd.DataFrame({
    "issue": test_data,
    "y": ylabels,
    "y_pred": yhatlabels,
})

result_path = "../data/pretrained/Electra_PS_result.csv"
# Create the target directory if needed so the export cannot fail after a
# long training run just because the folder is missing.
os.makedirs(os.path.dirname(result_path), exist_ok=True)
df_final.to_csv(result_path, index=False)

# Only the last expression of a cell is displayed, so the preview goes here.
df_final.head()