##### Copyright 2020 The TensorFlow Hub Authors.


In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://www.tensorflow.org/text/tutorials/classify_text_with_bert"><img src="https://www.tensorflow.org/images/tf_logo_32px.png" />View on TensorFlow.org</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/tensorflow/text/blob/master/docs/tutorials/classify_text_with_bert.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/tensorflow/text/blob/master/docs/tutorials/classify_text_with_bert.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View on GitHub</a>
  </td>
  <td>
    <a href="https://storage.googleapis.com/tensorflow_docs/text/docs/tutorials/classify_text_with_bert.ipynb"><img src="https://www.tensorflow.org/images/download_logo_32px.png" />Download notebook</a>
  </td>
  <td>
    <a href="https://tfhub.dev/google/collections/bert/1"><img src="https://www.tensorflow.org/images/hub_logo_32px.png" />See TF Hub model</a>
  </td>
</table>

# Classify Bulk Modulus Sentences



## Setup


In [None]:
# A dependency of the preprocessing for BERT inputs
!pip install -q -U tensorflow-text

You will use the AdamW optimizer from [tensorflow/models](https://github.com/tensorflow/models).

In [None]:
!pip install -q tf-models-official

In [None]:
import os
import shutil

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer

import matplotlib.pyplot as plt
import pandas as pd

tf.get_logger().setLevel('ERROR')

## Sentiment analysis

This notebook fine-tunes a BERT-style encoder (Sci-BERT) to classify individual sentences as *entailment* or *contradiction*, based on the text of the sentence.

You'll use labelled sentences stored as CSV files on Google Drive (for example `dataset/bert_aug_pos_full.csv` for training and `dataset/test.csv` for testing); each row has a `sentence` column and a `true_label` column.

### Download the dataset

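Let's mount Google Drive, switch to the project folder, and write every labelled sentence to its own text file (one sub-folder per label) so that Keras can load the data from a directory tree.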


In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Use the absolute path under the Drive mount point (/content/gdrive) so this
# cell can be re-run: a relative chdir fails the second time because the
# working directory has already moved.
os.chdir('/content/gdrive/MyDrive/NLP')
os.getcwd()

'/content/gdrive/MyDrive/NLP'

In [None]:
def _write_label_files(frame, label, folder):
  """Write the sentences of one label to `<folder>/<label>/<i>.txt`, one per file.

  Nothing is written when the label directory already exists, so an earlier
  export is never overwritten.
  """
  label_dir = folder+"/"+label
  if os.path.exists(label_dir):
    return
  os.makedirs(label_dir)
  # Skip rows without a sentence: writing a NaN would raise midway and leave a
  # partial directory that later runs would treat as a finished export.
  sentences = frame.loc[frame["true_label"] == label, "sentence"].dropna()
  for i, sentence in enumerate(sentences):
    with open(f"{label_dir}/{i}.txt", "w", encoding="utf-8") as f:
      f.write(str(sentence))


def dataset(train_path, test_path, train_folder, test_folder):
  """Export labelled sentences from CSV files into per-label text-file folders.

  Each CSV must have a `sentence` and a `true_label` column. For every output
  folder that is given, rows labelled "entailment" and "contradiction" are
  written to `<folder>/entailment/<i>.txt` and `<folder>/contradiction/<i>.txt`,
  the one-sub-folder-per-class layout used for directory-based loading.

  Args:
    train_path: Path of the training CSV, or None.
    test_path: Path of the test CSV, or None.
    train_folder: Output folder for the training split, or None to skip it.
    test_folder: Output folder for the test split, or None to skip it.

  Raises:
    ValueError: If an output folder is given without the matching CSV path.
  """
  exports = [(train_path, train_folder, "train"), (test_path, test_folder, "test")]

  # Validate before touching the filesystem so a bad call leaves nothing behind.
  for csv_path, out_folder, split in exports:
    if out_folder is not None and csv_path is None:
      raise ValueError(f"{split}_folder was given without a {split}_path to read from")

  for csv_path, out_folder, _ in exports:
    if out_folder is None:
      continue
    frame = pd.read_csv(csv_path)
    for label in ("entailment", "contradiction"):
      _write_label_files(frame, label, out_folder)


In [None]:
# One (source CSV, output folder) pair per augmentation strategy; only the
# training split is exported here, so the test arguments stay None.
augmented_train_sets = [
    ("dataset/syn_aug_pos_full.csv", "dataset/train_syn"),
    ("dataset/BT_aug_pos_full.csv", "dataset/train_BT"),
    ("dataset/bert_aug_pos_full.csv", "dataset/train_bert"),
]
for source_csv, output_folder in augmented_train_sets:
  dataset(source_csv, None, output_folder, None)

In [None]:
test_csv = pd.read_csv("dataset/test.csv")

In [None]:
os.getcwd()

'/content/gdrive/MyDrive/NLP'

Next, you will use the `text_dataset_from_directory` utility to create a labeled `tf.data.Dataset`.

The training data does not come with a separate validation set, so let's create one using an 80:20 split of the training data by using the `validation_split` argument below. The test set is loaded from its own `dataset/test` directory.

Note:  When using the `validation_split` and `subset` arguments, make sure to either specify a random seed, or to pass `shuffle=False`, so that the validation and training splits have no overlap.

In [None]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42

train_folder = "dataset/train_bert"

# Both subsets must share the same split fraction and seed so that the
# training and validation samples do not overlap.
split_options = dict(batch_size=batch_size, validation_split=0.2, seed=seed)

# Training subset (80% of the files under train_folder).
raw_train_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_folder, subset='training', **split_options)
class_names = raw_train_ds.class_names
train_ds = raw_train_ds.cache().prefetch(buffer_size=AUTOTUNE)

# Validation subset (the remaining 20%).
raw_val_ds = tf.keras.preprocessing.text_dataset_from_directory(
    train_folder, subset='validation', **split_options)
val_ds = raw_val_ds.cache().prefetch(buffer_size=AUTOTUNE)

# The test set lives in its own directory and is not split.
raw_test_ds = tf.keras.preprocessing.text_dataset_from_directory(
    'dataset/test', batch_size=batch_size)
test_ds = raw_test_ds.cache().prefetch(buffer_size=AUTOTUNE)

Found 930 files belonging to 2 classes.
Using 744 files for training.
Found 930 files belonging to 2 classes.
Using 186 files for validation.
Found 2677 files belonging to 2 classes.


## Loading models from TensorFlow Hub

Here you can choose which BERT model you will load from TensorFlow Hub and fine-tune. There are multiple BERT models available.

  - [BERT-Base](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3), [Uncased](https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3) and [seven more models](https://tfhub.dev/google/collections/bert/1) with trained weights released by the original BERT authors.
  - [Small BERTs](https://tfhub.dev/google/collections/bert/1) have the same general architecture but fewer and/or smaller Transformer blocks, which lets you explore tradeoffs between speed, size and quality.
  - [ALBERT](https://tfhub.dev/google/collections/albert/1): four different sizes of "A Lite BERT" that reduces model size (but not computation time) by sharing parameters between layers.
  - [BERT Experts](https://tfhub.dev/google/collections/experts/bert/1): eight models that all have the BERT-base architecture but offer a choice between different pre-training domains, to align more closely with the target task.
  - [Electra](https://tfhub.dev/google/collections/electra/1) has the same architecture as BERT (in three different sizes), but gets pre-trained as a discriminator in a set-up that resembles a Generative Adversarial Network (GAN).
  - BERT with Talking-Heads Attention and Gated GELU [[base](https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1), [large](https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_large/1)] has two improvements to the core of the Transformer architecture.

The model documentation on TensorFlow Hub has more details and references to the
research literature. Follow the links above, or click on the [`tfhub.dev`](http://tfhub.dev) URL
printed after the next cell execution.

The suggestion is to start with a Small BERT (with fewer parameters) since they are faster to fine-tune. If you like a small model but with higher accuracy, ALBERT might be your next option. If you want even better accuracy, choose
one of the classic BERT sizes or their recent refinements like Electra, Talking Heads, or a BERT Expert.

Aside from the models available below, there are [multiple versions](https://tfhub.dev/google/collections/transformer_encoders_text/1) of the models that are larger and can yield even better accuracy, but they are too big to be fine-tuned on a single GPU. You will be able to do that on the [Solve GLUE tasks using BERT on a TPU colab](https://www.tensorflow.org/text/tutorials/bert_glue).

You'll see in the code below that switching the tfhub.dev URL is enough to try any of these models, because all the differences between them are encapsulated in the SavedModels from TF Hub.

In [None]:
# #@title Choose a BERT model to fine-tune

# bert_model_name = 'small_bert/bert_en_uncased_L-8_H-768_A-12'  #@param ["bert_en_uncased_L-12_H-768_A-12", "bert_en_cased_L-12_H-768_A-12", "bert_multi_cased_L-12_H-768_A-12", "small_bert/bert_en_uncased_L-2_H-128_A-2", "small_bert/bert_en_uncased_L-2_H-256_A-4", "small_bert/bert_en_uncased_L-2_H-512_A-8", "small_bert/bert_en_uncased_L-2_H-768_A-12", "small_bert/bert_en_uncased_L-4_H-128_A-2", "small_bert/bert_en_uncased_L-4_H-256_A-4", "small_bert/bert_en_uncased_L-4_H-512_A-8", "small_bert/bert_en_uncased_L-4_H-768_A-12", "small_bert/bert_en_uncased_L-6_H-128_A-2", "small_bert/bert_en_uncased_L-6_H-256_A-4", "small_bert/bert_en_uncased_L-6_H-512_A-8", "small_bert/bert_en_uncased_L-6_H-768_A-12", "small_bert/bert_en_uncased_L-8_H-128_A-2", "small_bert/bert_en_uncased_L-8_H-256_A-4", "small_bert/bert_en_uncased_L-8_H-512_A-8", "small_bert/bert_en_uncased_L-8_H-768_A-12", "small_bert/bert_en_uncased_L-10_H-128_A-2", "small_bert/bert_en_uncased_L-10_H-256_A-4", "small_bert/bert_en_uncased_L-10_H-512_A-8", "small_bert/bert_en_uncased_L-10_H-768_A-12", "small_bert/bert_en_uncased_L-12_H-128_A-2", "small_bert/bert_en_uncased_L-12_H-256_A-4", "small_bert/bert_en_uncased_L-12_H-512_A-8", "small_bert/bert_en_uncased_L-12_H-768_A-12", "albert_en_base", "electra_small", "electra_base", "experts_pubmed", "experts_wiki_books", "talking-heads_base"]

# map_name_to_handle = {
#     'bert_en_uncased_L-12_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
#     'bert_en_cased_L-12_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
#     'bert_multi_cased_L-12_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
#     'small_bert/bert_en_uncased_L-2_H-128_A-2':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
#     'small_bert/bert_en_uncased_L-2_H-256_A-4':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
#     'small_bert/bert_en_uncased_L-2_H-512_A-8':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
#     'small_bert/bert_en_uncased_L-2_H-768_A-12':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
#     'small_bert/bert_en_uncased_L-4_H-128_A-2':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
#     'small_bert/bert_en_uncased_L-4_H-256_A-4':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
#     'small_bert/bert_en_uncased_L-4_H-512_A-8':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
#     'small_bert/bert_en_uncased_L-4_H-768_A-12':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
#     'small_bert/bert_en_uncased_L-6_H-128_A-2':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
#     'small_bert/bert_en_uncased_L-6_H-256_A-4':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
#     'small_bert/bert_en_uncased_L-6_H-512_A-8':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
#     'small_bert/bert_en_uncased_L-6_H-768_A-12':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
#     'small_bert/bert_en_uncased_L-8_H-128_A-2':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
#     'small_bert/bert_en_uncased_L-8_H-256_A-4':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
#     'small_bert/bert_en_uncased_L-8_H-512_A-8':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
#     'small_bert/bert_en_uncased_L-8_H-768_A-12':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
#     'small_bert/bert_en_uncased_L-10_H-128_A-2':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
#     'small_bert/bert_en_uncased_L-10_H-256_A-4':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
#     'small_bert/bert_en_uncased_L-10_H-512_A-8':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
#     'small_bert/bert_en_uncased_L-10_H-768_A-12':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
#     'small_bert/bert_en_uncased_L-12_H-128_A-2':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
#     'small_bert/bert_en_uncased_L-12_H-256_A-4':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
#     'small_bert/bert_en_uncased_L-12_H-512_A-8':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
#     'small_bert/bert_en_uncased_L-12_H-768_A-12':
#         'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
#     'albert_en_base':
#         'https://tfhub.dev/tensorflow/albert_en_base/2',
#     'electra_small':
#         'https://tfhub.dev/google/electra_small/2',
#     'electra_base':
#         'https://tfhub.dev/google/electra_base/2',
#     'experts_pubmed':
#         'https://tfhub.dev/google/experts/bert/pubmed/2',
#     'experts_wiki_books':
#         'https://tfhub.dev/google/experts/bert/wiki_books/2',
#     'talking-heads_base':
#         'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
# }

# map_model_to_preprocess = {
#     'bert_en_uncased_L-12_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'bert_en_cased_L-12_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
#     'small_bert/bert_en_uncased_L-2_H-128_A-2':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-2_H-256_A-4':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-2_H-512_A-8':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-2_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-4_H-128_A-2':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-4_H-256_A-4':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-4_H-512_A-8':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-4_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-6_H-128_A-2':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-6_H-256_A-4':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-6_H-512_A-8':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-6_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-8_H-128_A-2':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-8_H-256_A-4':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-8_H-512_A-8':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-8_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-10_H-128_A-2':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-10_H-256_A-4':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-10_H-512_A-8':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-10_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-12_H-128_A-2':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-12_H-256_A-4':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-12_H-512_A-8':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'small_bert/bert_en_uncased_L-12_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'bert_multi_cased_L-12_H-768_A-12':
#         'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
#     'albert_en_base':
#         'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
#     'electra_small':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'electra_base':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'experts_pubmed':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'experts_wiki_books':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
#     'talking-heads_base':
#         'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
# }

# tfhub_handle_encoder = map_name_to_handle[bert_model_name]
# tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]

# print(f'BERT model selected           : {tfhub_handle_encoder}')
# print(f'Preprocess model auto-selected: {tfhub_handle_preprocess}')

## Load Sci-BERT

In [None]:
# import tarfile
# tar = tarfile.open("scibert_encoder.tar.gz")
# tar.extractall()
# tar.close()

In [None]:
# tar2 = tarfile.open("scibert_preprocess.tar.gz")
# tar2.extractall()
# tar2.close()

## Using the Sci-BERT model

In [None]:
# Load the exported SavedModels from paths relative to the working directory.
# Despite the `tfhub_handle_*` names these are loaded model objects, not
# tfhub.dev URLs; build_classifier_model() wraps them with hub.KerasLayer.
tfhub_handle_encoder = tf.saved_model.load("encoder_export")
tfhub_handle_preprocess = tf.saved_model.load("bert_preprocessing")

The BERT models return a map with 3 important keys: `pooled_output`, `sequence_output`, `encoder_outputs`:

- `pooled_output` represents each input sequence as a whole. The shape is `[batch_size, H]`. You can think of this as an embedding for the entire sentence.
- `sequence_output` represents each input token in the context. The shape is `[batch_size, seq_length, H]`. You can think of this as a contextual embedding for every token in the sentence.
- `encoder_outputs` are the intermediate activations of the `L` Transformer blocks. `outputs["encoder_outputs"][i]` is a Tensor of shape `[batch_size, seq_length, H]` with the outputs of the i-th Transformer block, for `0 <= i < L`. The last value of the list is equal to `sequence_output`.

For the fine-tuning you are going to use the `pooled_output` array.

## Define your model

You will create a very simple fine-tuned model, with the preprocessing model, the selected BERT model, one Dense and a Dropout layer.

Note: for more information about the base model's input and output you can follow the model's URL for documentation. Here specifically, you don't need to worry about it because the preprocessing model will take care of that for you.


In [None]:
def build_classifier_model():
  """Build the fine-tuning model: a batch of raw strings in, one logit per string out.

  The graph is: string input -> preprocessing SavedModel -> trainable encoder
  -> `pooled_output` -> Dropout(0.05) -> Dense(1) with no activation. Both
  SavedModels are read from the globals `tfhub_handle_preprocess` and
  `tfhub_handle_encoder`.

  Returns:
    An uncompiled `tf.keras.Model`.
  """
  text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

  encoder_inputs = hub.KerasLayer(
      tfhub_handle_preprocess, name='preprocessing')(text_input)
  encoder_outputs = hub.KerasLayer(
      tfhub_handle_encoder, trainable=True, name='BERT_encoder')(encoder_inputs)

  pooled = tf.keras.layers.Dropout(0.05)(encoder_outputs['pooled_output'])

  # Optional hidden layers tried in some experiments (kept disabled):
  # pooled = tf.keras.layers.Dense(128, activation=None, name='classifier1')(pooled)# hidden layer 1
  # pooled = tf.keras.layers.Dense(128, activation=None, name='classifier2')(pooled)# hidden layer 2
  # pooled = tf.keras.layers.Dense(64, activation=None, name='classifier3')(pooled) # hidden layer 3

  logit = tf.keras.layers.Dense(1, activation=None, name='classifier')(pooled)
  return tf.keras.Model(text_input, logit)

Let's check that the model runs with the output of the preprocessing model.

In [None]:
# Instantiate the (still untrained) classifier used by the training and
# evaluation cells below.
classifier_model = build_classifier_model()
# Disabled smoke test: `text_test` is not defined in the visible cells.
# bert_raw_result = classifier_model(tf.constant(text_test))
# print(tf.sigmoid(bert_raw_result))

In [None]:
#classifier_model.summary()

The output is meaningless, of course, because the model has not been trained yet.

Let's take a look at the model's structure.

In [None]:
# tf.keras.utils.plot_model(classifier_model)

## Model training

You now have all the pieces to train a model, including the preprocessing module, BERT encoder, data, and classifier.

### Loss function

Since this is a binary classification problem and the model outputs a single logit (a single-unit layer with no activation), you'll use the `losses.BinaryCrossentropy` loss function with `from_logits=True`.


In [None]:
# The classifier's final Dense layer has no activation, so the model emits raw
# logits; the loss must be told so via from_logits=True.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# For the same reason the accuracy metric must threshold at 0.0 (a logit of 0
# is a probability of 0.5). The default threshold of 0.5 would count only
# predictions with probability above ~0.62 as positive.
# NOTE(review): some newer Keras releases may reject a threshold outside
# (0, 1); if so, apply a sigmoid to the outputs instead — confirm for your version.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

### Optimizer

For fine-tuning, let's use the same optimizer that BERT was originally trained with: the "Adaptive Moments" (Adam). This optimizer minimizes the prediction loss and does regularization by weight decay (not using moments), which is also known as [AdamW](https://arxiv.org/abs/1711.05101).

For the learning rate (`init_lr`), you will use the same schedule as BERT pre-training: linear decay of a notional initial learning rate, prefixed with a linear warm-up phase over the first 10% of training steps (`num_warmup_steps`). In line with the BERT paper, the initial learning rate is smaller for fine-tuning (best of 5e-5, 3e-5, 2e-5).

In [None]:
epochs = 6
# Number of batches in one pass over train_ds (train_ds is already batched).
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
num_train_steps = steps_per_epoch * epochs
# Warm the learning rate up over the first 10% of all training steps.
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
# AdamW optimizer built by the tf-models-official helper imported at the top
# of the notebook (`official.nlp.optimization`).
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')

### Loading the BERT model and training

Using the `classifier_model` you created earlier, you can compile the model with the loss, metric and optimizer.

In [None]:
classifier_model.compile(optimizer=optimizer,
                         loss=loss,
                         metrics=metrics)

In [None]:
  # NOTE: tfhub_handle_encoder holds a loaded SavedModel object, so this line
  # prints the object's repr rather than a model name or URL.
  print(f'Training model with {tfhub_handle_encoder}')
  # Fine-tune on train_ds and report loss/accuracy on val_ds after every epoch.
  history = classifier_model.fit(x=train_ds,
                               validation_data=val_ds,
                               epochs=epochs)

Training model with <tensorflow.python.saved_model.load.Loader._recreate_base_user_object.<locals>._UserObject object at 0x7f99fb105c90>
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


Note: training time will vary depending on the complexity of the BERT model you have selected.

## Testing

Now evaluate the fine-tuned model on the held-out test sentences.

In [None]:
#test_csv = pd.read_csv("dataset/test.csv")
# Drop rows with any missing value so every remaining row has both a sentence
# and a label. This rebinds `test_csv`, which was loaded in an earlier cell.
test_csv = test_csv.dropna()

In [None]:
test_csv

Unnamed: 0,sentence,true_label
0,The effect of grain boundaries on the effectiv...,contradiction
1,At the macro-level the continuum mechanics des...,contradiction
2,In view of the proposed geometrical idealizati...,contradiction
3,Such variation of elastic moduli with a grain ...,contradiction
4,Two single Kelvin moduli h 1 and h 2 are two s...,contradiction
...,...,...
2672,The elastic modulus for the bulk glass Zr52.5N...,entailment
2673,"After 703 K annealed, the glass phase almost f...",entailment
2674,The bulk glass structure has a lowest elastic ...,entailment
2675,"The calculated bulk modulus, using both LDA an...",entailment


The cell below runs the fine-tuned model on every sentence in `test_csv` and tallies true/false positives and negatives, treating *entailment* as the positive class.

In [None]:
# Confusion-matrix tallies; "entailment" is the positive class (label 1) and
# "contradiction" the negative class (label 0).
tp = tn = fp = fn = 0
test_result = {"prediction":[], "probability":[]}

for sentence_text, true_label in zip(test_csv["sentence"], test_csv["true_label"]):
  # Score one sentence at a time: the model returns a logit, sigmoid turns it
  # into the probability of the positive class.
  logit = classifier_model(tf.constant([sentence_text]))
  probability = float(tf.sigmoid(logit))
  predicted = int(round(probability))
  test_result["probability"].append(probability)

  # Rows whose true label is neither class are not counted.
  outcome = (predicted, true_label)
  if outcome == (0, "contradiction"):
    tn += 1
  elif outcome == (0, "entailment"):
    fn += 1
  elif outcome == (1, "contradiction"):
    fp += 1
  elif outcome == (1, "entailment"):
    tp += 1

# Disabled export of the per-sentence results:
#test_result_df = pd.DataFrame.from_dict(test_result)
#test_result_df.to_csv("test_model_result.csv", index=False)

In [None]:
# Recall, precision and F1 for the positive ("entailment") class.
# Each ratio is guarded: with no positive labels, or no positive predictions,
# the denominator is zero and the metric is reported as 0.0 instead of raising
# ZeroDivisionError.
recall = tp/(tp+fn) if (tp+fn) else 0.0
precision = tp/(tp+fp) if (tp+fp) else 0.0
f1 = (2*recall*precision) / (recall + precision) if (recall + precision) else 0.0

print(tp,fn,tn,fp)
print(recall,precision,f1)

82 41 2515 39
0.6666666666666666 0.6776859504132231 0.6721311475409837


In [None]:
#bert_aug
recall_L = [0.7235772357723578,0.6991869918699187,0.6341463414634146,0.6666666666666666,0.6666666666666666]
prec_L = [0.6793893129770993,0.7049180327868853,0.6341463414634146,0.6666666666666666,0.6776859504132231]   
# countL = [(89 34 2512 42),(86 37 2518 36),(78 45 2509 45),(82 41 2513 41),(82 41 2515 39)]


In [None]:
# Mean recall and precision over the runs currently stored in recall_L/prec_L.
# Several cells rebind those two names, so the result depends on which of them
# was executed last.
avg_recall = sum(recall_L)/len(recall_L)
avg_pre = sum(prec_L)/len(prec_L)
avg_recall,avg_pre

(0.6780487804878048, 0.6725612608614577)

In [None]:
#BT_aug
recall_L = [0.8780487804878049,0.7886178861788617,0.8617886178861789,0.8373983739837398,0.8211382113821138]
prec_L = [0.5901639344262295,0.6423841059602649,0.6708860759493671,0.5885714285714285,0.5738636363636364]   
# countL = [(108 15 2479 75),(97 26 2500 54),(106 17 2502 52),(103 20 2482 72),(101 22 2479 75)]


In [None]:
#syn_aug
recall_L = [0.7479674796747967,0.6585365853658537,0.7398373983739838,0.6747967479674797,0.7642276422764228]
prec_L = [0.6764705882352942,0.7363636363636363,0.7054263565891473,0.7757009345794392,0.6861313868613139]
f1_L = [0.7104247104247104,0.6952789699570815,0.7222222222222223,0.7217391304347825,0.7230769230769232]     
# countL = [(92 31 2510 44),(81 42 2525 29),(91 32 2516 38),(83 40 2530 24),(94 29 2511 43)]


In [None]:
# Sci-BERT avg (smaller testing dataset)
recall_L = [0.8292682926829268,0.8536585365853658,0.8536585365853658,0.8130081300813008,0.8780487804878049]
prec_L = [0.7083333333333334,0.6818181818181818,0.6441717791411042,0.5988023952095808,0.6242774566473989]
f1_L = [0.7640449438202247,0.7581227436823105,0.7342657342657342,0.6896551724137931,0.7297297297297297]     
# countL = [(102 21 2512 42),(105 18 2505 49),(105 18 2496 58),(100 23 2487 67),(108 15 2489 65)]


In [None]:
# Sci-BERT with NN layers avg 
# recall_L = [0.8292682926829268,
#             0.7804878048780488,
#             0.8211382113821138,
#             0.8536585365853658,
#             0.7642276422764228
#             ]
# prec_L = [0.6181818181818182,
#           0.6906474820143885,
#           0.6158536585365854,
#           0.6521739130434783,
#           0.7286821705426356
#           ]
# f1_L = [0.7083333333333334,
#         0.7328244274809161,
#         0.7038327526132404,
#         0.7394366197183099,
#         0.746031746031746
#         ]
# countL = [(102 21 2491 63),
#           (96 27 2511 43),
#           (101 22 2491 63),
#           (105 18 2498 56),
#           (94 29 2519 35)]

In [None]:
# Sci-BERT avg (10k testing dataset)
# recall_L = [0.8307692307692308,
#             0.8,
#             0.7538461538461538,
#             0.7846153846153846,
#             0.7384615384615385]
# prec_L = [0.6585365853658537,
#           0.5977011494252874,
#           0.7205882352941176,
#           0.6219512195121951,
#           0.7111111111111111]
# f1_L = [0.7346938775510204,
#         0.6842105263157896,
#         0.7368421052631577,
#         0.6938775510204082,
#         0.7245283018867924]
# countL = [(108, 22, 9918, 56),
#           (104, 26, 9904, 70),
#           (98, 32, 9936, 38),
#           (102, 28, 9912, 62),
#           (96, 34, 9935, 39)]


In [None]:
# #experts_pubmed
# recall_L = [0.823076923076923
#             ]
# prec_L = [0.535
#          ]
# f1_L = [0.6484848484848486]
# countL = [(107, 23, 9881, 93)]

If you want to use your model on [TF Serving](https://www.tensorflow.org/tfx/guide/serving), remember that it will call your SavedModel through one of its named signatures. In Python, you can test them as follows:

In [None]:
# serving_results = reloaded_model \
#             .signatures['serving_default'](tf.constant(examples))

# serving_results = tf.sigmoid(serving_results['classifier'])

# print_my_examples(examples, serving_results)

## Model Prediction of Testing dataset


In [None]:
# Load the saved per-sentence predictions and drop rows with missing values.
# NOTE(review): the next cell reads the columns prediction1, prediction2,
# probability1, probability2 and true_label, which the (disabled) export in
# the evaluation cell does not produce — presumably this CSV was assembled
# from two separate runs; confirm.
pred_df = pd.read_csv("test_model_result.csv")
pred_df=pred_df.dropna()
pred_df

In [None]:
import numpy as np

# Combine the predictions of two models. Where both agree, the shared label is
# kept; where they disagree, the mean of the two probabilities is stored and
# turned into a label in the loop below.
labels_agree = pred_df['prediction1'] == pred_df['prediction2']
pred_df['2_pred_labels_diff'] = np.where(labels_agree, 0, 1)
pred_df['combined_pred_label'] = np.where(labels_agree, pred_df["prediction1"], (pred_df["probability1"] + pred_df["probability2"])/2)

tp = 0; tn = 0; fp = 0; fn = 0
label_for_score = {0: "contradiction", 1: "entailment"}

#combined_pred_label: turn number into label
for idx in pred_df.index:
  combined = pred_df.at[idx,"combined_pred_label"]
  true_label = pred_df.at[idx,"true_label"]

  if isinstance(combined, float):
    # Disagreement: round the averaged probability to a class and write the
    # label back. isinstance also accepts NumPy floats, unlike `type(x) == float`.
    pred_label = label_for_score.get(int(round(combined)))
    if pred_label is None:
      continue
    pred_df.at[idx,"combined_pred_label"] = pred_label
  elif isinstance(combined, str):
    pred_label = combined
  else:
    continue

  # "entailment" is the positive class; tally once for both cases above.
  if pred_label == "contradiction":
    if true_label == "contradiction":
      tn += 1
    else:
      fn += 1
  elif pred_label == "entailment":
    if true_label == "entailment":
      tp += 1
    else:
      fp += 1

#compare with the true label to count # of wrong predictions
pred_df['final_true_pred_diff'] = np.where(pred_df['true_label'] == pred_df['combined_pred_label'], 0, 1)

print(tp,fn,tn,fp)

# Guard the ratios so an input without positive labels or positive predictions
# reports 0.0 instead of raising ZeroDivisionError.
recall = tp/(tp+fn) if (tp+fn) else 0.0
precision = tp/(tp+fp) if (tp+fp) else 0.0
f1 = (2*recall*precision) / (recall + precision) if (recall + precision) else 0.0
print(recall,precision,f1)

In [None]:
#pred_df.to_csv("test_pred_final.csv", index=False)