###1.Import BERT and necessary packages

First, import pre-trained BERT

In [0]:
!pip install bert-tensorflow

Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████████████████████████████████| 71kB 2.8MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1


Next, import the necessary packages.

In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

W0819 07:29:56.055510 140604099778432 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



The code chunk below combines all necessary code to use the pre-trained BERT model, and also includes the code to compute class-level metrics, `class_level()`. Run the code chunk below.

In [0]:
def pretty_print(result):
    df = pd.DataFrame([result]).T
    df.columns = ["values"]
    return df
  
def create_tokenizer_from_hub_module(bert_model_hub):
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(bert_model_hub)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

def make_features(dataset, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN):
    input_example = dataset.apply(lambda x: bert.run_classifier.InputExample(guid=None, 
                                                                   text_a = x[DATA_COLUMN], 
                                                                   text_b = None, 
                                                                   label = x[LABEL_COLUMN]), axis = 1)
    features = bert.run_classifier.convert_examples_to_features(input_example, label_list, MAX_SEQ_LENGTH, tokenizer)
    return features

def create_model(bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model."""

  bert_module = hub.Module(
      bert_model_hub,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Create our own layer to tune for data.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting
    output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))
    # If we're predicting, we want predicted labels and the probabiltiies.
    if is_predicting:
      return (predicted_labels, log_probs)

    # If we're train/eval, compute loss between predicted and actual label
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)

def model_fn_builder(bert_model_hub, num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns `model_fn` closure for TPUEstimator."""
  def model_fn(features, labels, mode, params):  
    """The `model_fn` for TPUEstimator."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)
    
    # TRAIN and EVAL
    if not is_predicting:

      (loss, predicted_labels, log_probs) = create_model(
        bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)

      # Calculate evaluation metrics. 
      def metric_fn(label_ids, predicted_labels):
        accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
        f1_score = tf.contrib.metrics.f1_score(
            label_ids,
            predicted_labels)
        auc = tf.metrics.auc(
            label_ids,
            predicted_labels)
        recall = tf.metrics.recall(
            label_ids,
            predicted_labels)
        precision = tf.metrics.precision(
            label_ids,
            predicted_labels) 
        true_pos = tf.metrics.true_positives(
            label_ids,
            predicted_labels)
        true_neg = tf.metrics.true_negatives(
            label_ids,
            predicted_labels)   
        false_pos = tf.metrics.false_positives(
            label_ids,
            predicted_labels)  
        false_neg = tf.metrics.false_negatives(
            label_ids,
            predicted_labels)
        return {
            "eval_accuracy": accuracy,
            "f1_score": f1_score,
            "auc": auc,
            "precision": precision,
            "recall": recall,
            "true_positives": true_pos,
            "true_negatives": true_neg,
            "false_positives": false_pos,
            "false_negatives": false_neg
        }

      eval_metrics = metric_fn(label_ids, predicted_labels)

      if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode=mode,
          loss=loss,
          train_op=train_op)
      else:
          return tf.estimator.EstimatorSpec(mode=mode,
            loss=loss,
            eval_metric_ops=eval_metrics)
    else:
      (predicted_labels, log_probs) = create_model(
        bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  # Return the actual model function in the closure
  return model_fn

def estimator_builder(bert_model_hub, OUTPUT_DIR, SAVE_SUMMARY_STEPS, SAVE_CHECKPOINTS_STEPS, label_list, LEARNING_RATE, num_train_steps, num_warmup_steps, BATCH_SIZE):

    # Specify output directory and number of checkpoint steps to save
    run_config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        tf_random_seed=123,
        save_summary_steps=SAVE_SUMMARY_STEPS,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

    model_fn = model_fn_builder(
      bert_model_hub = bert_model_hub,
      num_labels=len(label_list),
      learning_rate=LEARNING_RATE,
      num_train_steps=num_train_steps,
      num_warmup_steps=num_warmup_steps)

    estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={"batch_size": BATCH_SIZE})
    return estimator, model_fn, run_config
  
def run_on_dfs(train, test, DATA_COLUMN, LABEL_COLUMN, 
               MAX_SEQ_LENGTH = 128,
              BATCH_SIZE = 32,
              LEARNING_RATE = 2e-5,
              NUM_TRAIN_EPOCHS = 3.0,
              WARMUP_PROPORTION = 0.1,
              SAVE_SUMMARY_STEPS = 100,
               SAVE_CHECKPOINTS_STEPS = 10000,
              bert_model_hub = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"):

    label_list = train[LABEL_COLUMN].unique().tolist()
    
    tokenizer = create_tokenizer_from_hub_module(bert_model_hub)

    train_features = make_features(train, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)
    test_features = make_features(test, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)

    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    estimator, model_fn, run_config = estimator_builder(
                                  bert_model_hub, 
                                  OUTPUT_DIR, 
                                  SAVE_SUMMARY_STEPS, 
                                  SAVE_CHECKPOINTS_STEPS, 
                                  label_list, 
                                  LEARNING_RATE, 
                                  num_train_steps, 
                                  num_warmup_steps, 
                                  BATCH_SIZE)

    train_input_fn = bert.run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)

    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None)
    return result_dict, estimator

def class_level(result):
  conf = np.array([[result.get('true_positives'), result.get('false_positives')], [result.get('false_negatives'), result.get('true_negatives')]])
  p_1 = conf[0][0]/sum(conf[0])
  p_0 = conf[1][1]/sum(conf[1])
  r_1 = conf[0][0]/sum(conf[:,0])
  r_0 = conf[1][1]/sum(conf[:,1])
  return{
      '1_precision': p_1,
      '1_recall': r_1,
      '1_f-measure': 2*((p_1*r_1)/(p_1 + r_1)),
      '0_precision': p_0,
      '0_recall': r_0,
      '0_f-measure': 2*((p_0*r_0)/(p_0 + r_0))  
  }

###2.Imprort data and Split 

Upload 'drugTweet.txt' file that has user_id, tweets, and label. Tweets in this file are not lowercased due to test on both BERT cased and uncased model.

In [0]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving drugTweets.txt to drugTweets.txt
User uploaded file "drugTweets.txt" with length 368650 bytes


Convert drugTweets file to a pandas dataframe and drop rows that include NaN cell.

In [0]:
drugTweets = pd.read_csv('drugTweets.txt', delimiter="\t",header= 0)
drugTweets = drugTweets.dropna()

Next, split data into training and test set 80/20, and takes out abuse labels as new variables

In [0]:
test = drugTweets.sample(frac=0.2, random_state=123)
train = drugTweets.drop(test.index)
y_train = train.abuse
y_test = test.abuse

We can see the data is imbalanced.

In [0]:
train.abuse.value_counts()

0    2045
1     344
Name: abuse, dtype: int64

###3.Oversampling data

To improve our model performance, let's oversample minority class.

We use Random Over Sampling function to oversample the training data, and this function takes X and y data individually.

We put only text data into the funcion as X parameter, and y, then oversample. After that, combine oversampled data to put in BERT model.

In [0]:
from imblearn.over_sampling import RandomOverSampler
train_text = pd.DataFrame(train.text_text)

ros = RandomOverSampler(random_state=123)
X_resampled, y_resampled = ros.fit_resample(train_text, y_train)
train = pd.DataFrame(X_resampled, columns = ['text_text'])
train['abuse'] = y_resampled



### 4.Running BERT

The code line below help us monitor how BERT model runs.

In [0]:
tf.logging.set_verbosity(tf.logging.INFO)

Set output directory to store BERT model log.

In [0]:
OUTPUT_DIR = 'output1'

Define your dictionary containing your parameter values as myparam, using default values except for the MAX_SEQ_LENGTH argument, which you can choose.

In [0]:
myparam1 = {
    "DATA_COLUMN": "text_text",
    "LABEL_COLUMN": "abuse",
    "BATCH_SIZE": 16,
    "NUM_TRAIN_EPOCHS": 3,
    "LEARNING_RATE": 1e-5,
    "MAX_SEQ_LENGTH": 32,
    "WARMUP_PROPORTION": 0.15
}

Use `run_on_dfs()` function, providing the training data, testing data and parameter dictionary as arguments and output the results as result1 and estimator1.

In [0]:
result1, estimator1 = run_on_dfs(train, test, **myparam1)

I0818 19:20:10.481830 140551282849664 saver.py:1499] Saver not created because there are no variables in the graph to restore
W0818 19:20:11.594707 140551282849664 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.

W0818 19:20:11.862549 140551282849664 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/run_classifier.py:774: The name tf.logging.info is deprecated. Please use tf.compat.v1.logging.info instead.

I0818 19:20:11.863768 140551282849664 run_classifier.py:774] Writing example 0 of 4128
I0818 19:20:11.869169 140551282849664 run_classifier.py:461] *** Example ***
I0818 19:20:11.872508 140551282849664 run_classifier.py:462] guid: None
I0818 19:20:11.875549 140551282849664 run_classifier.py:464] tokens: [CLS] i know for a fact that alcohol does not de ##ple ##te the ser ##o ##quel levels in your blood , yet here we are [SEP]
I0818 

View the model-level performance measures for the BERT model testing set.

In [0]:
pretty_print(result1)

Unnamed: 0,values
auc,0.651838
eval_accuracy,0.814262
f1_score,0.416667
false_negatives,57.0
false_positives,55.0
global_step,774.0
loss,0.757923
precision,0.421053
recall,0.412371
true_negatives,451.0


View the class-level performance measures. Note: the precision, recall and f1 returned in result is for class 'level 1', defined above as positives.

In [0]:
pretty_print(class_level(result1))

Unnamed: 0,values
0_f-measure,0.889546
0_precision,0.887795
0_recall,0.891304
1_f-measure,0.416667
1_precision,0.421053
1_recall,0.412371


###5.Hyperparameter Batch

To figure out the best hyperparameter for BERT, we made a function that modifies parameters and run BERT. 

In [0]:
tf.logging.set_verbosity(tf.logging.INFO)

In [0]:
OUTPUT_DIR = 'output1'

In [0]:
def batch_run(parameters):
  results = []

  for parameter in parameters:
    
    result, estimator = run_on_dfs(train, test, **parameter)
    results.append((parameter['BATCH_SIZE'], parameter['NUM_TRAIN_EPOCHS'], parameter['LEARNING_RATE'], result))

  return results

On the chunk below, we create hyper parameter combinations that we want to test.

You can change,

1.   Batch Size
2.   Number of epochs
3.   Learning Rate
4.   Cased / Uncased



In [0]:
batch_sizes = [8, 16, 32, 64,128, 256]
epochs = [3, 4, 5]
learningRates = [2e-5, 3e-5, 4e-5, 5e-5]

idx = 0
parameters = []
for batch_size in batch_sizes:
  for epoch in epochs:
    for learningRate in learningRates:
      
      parameter = {
          "DATA_COLUMN": "text_text",
          "LABEL_COLUMN": "abuse",
          "BATCH_SIZE": batch_size,
          "NUM_TRAIN_EPOCHS": epoch,
          "LEARNING_RATE": learningRate,
          "MAX_SEQ_LENGTH": 32,
          "WARMUP_PROPORTION": 0.1,
          "bert_model_hub": "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"
      }
      parameters.append(parameter)

print(parameters)


[{'DATA_COLUMN': 'text_text', 'LABEL_COLUMN': 'abuse', 'BATCH_SIZE': 8, 'NUM_TRAIN_EPOCHS': 3, 'LEARNING_RATE': 2e-05, 'MAX_SEQ_LENGTH': 32, 'WARMUP_PROPORTION': 0.1, 'bert_model_hub': 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'}, {'DATA_COLUMN': 'text_text', 'LABEL_COLUMN': 'abuse', 'BATCH_SIZE': 8, 'NUM_TRAIN_EPOCHS': 3, 'LEARNING_RATE': 3e-05, 'MAX_SEQ_LENGTH': 32, 'WARMUP_PROPORTION': 0.1, 'bert_model_hub': 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'}, {'DATA_COLUMN': 'text_text', 'LABEL_COLUMN': 'abuse', 'BATCH_SIZE': 8, 'NUM_TRAIN_EPOCHS': 3, 'LEARNING_RATE': 4e-05, 'MAX_SEQ_LENGTH': 32, 'WARMUP_PROPORTION': 0.1, 'bert_model_hub': 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'}, {'DATA_COLUMN': 'text_text', 'LABEL_COLUMN': 'abuse', 'BATCH_SIZE': 8, 'NUM_TRAIN_EPOCHS': 3, 'LEARNING_RATE': 5e-05, 'MAX_SEQ_LENGTH': 32, 'WARMUP_PROPORTION': 0.1, 'bert_model_hub': 'https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1'}, {'DATA_COLUMN': 'text_text', 'L

In [0]:
result_tuples = batch_run(parameters)

I0818 23:37:45.892725 139917112682368 saver.py:1499] Saver not created because there are no variables in the graph to restore
I0818 23:37:46.475522 139917112682368 run_classifier.py:774] Writing example 0 of 4090
I0818 23:37:46.476981 139917112682368 run_classifier.py:461] *** Example ***
I0818 23:37:46.477756 139917112682368 run_classifier.py:462] guid: None
I0818 23:37:46.484504 139917112682368 run_classifier.py:464] tokens: [CLS] i know for a FA ##CT that alcohol does not de ##ple ##te the se ##ro ##quel levels in your blood , Y ##ET H ##ER ##E W ##E AR [SEP]
I0818 23:37:46.486301 139917112682368 run_classifier.py:465] input_ids: 101 178 1221 1111 170 6820 16647 1115 6272 1674 1136 1260 7136 1566 1103 14516 2180 15966 3001 1107 1240 1892 117 162 11943 145 9637 2036 160 2036 22133 102
I0818 23:37:46.487191 139917112682368 run_classifier.py:466] input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
I0818 23:37:46.491561 139917112682368 run_classifier.py:467] segm

Iterate printing BERT result to find the best hyper parameters.

In [0]:
for tuples in result_tuples:
  print('{}'.format('_' * 50))
  print('Batch_size : {}  Epoch : {}  Learning Rate : {}'.format(tuples[0],tuples[1], tuples[2]))
  print(pd.DataFrame([tuples[3]]).T)

__________________________________________________
Batch_size : 8  Epoch : 3  Learning Rate : 2e-05
                           0
auc                 0.511044
eval_accuracy       0.184255
f1_score            0.289051
false_negatives     0.000000
false_positives   487.000000
global_step      1533.000000
loss                5.350457
precision           0.168942
recall              1.000000
true_negatives     11.000000
true_positives     99.000000
__________________________________________________
Batch_size : 8  Epoch : 3  Learning Rate : 3e-05
                           0
auc                 0.511044
eval_accuracy       0.184255
f1_score            0.289051
false_negatives     0.000000
false_positives   487.000000
global_step      1533.000000
loss                5.350457
precision           0.168942
recall              1.000000
true_negatives     11.000000
true_positives     99.000000
__________________________________________________
Batch_size : 8  Epoch : 3  Learning Rate : 4e-05
    