Library

In [0]:
# Colab magic: it has to run before TensorFlow is imported for the first time,
# otherwise the already-loaded version stays active and the 1.x APIs used below
# (tf.gfile, tf.Session, tf.get_variable) are unavailable.
%tensorflow_version 1.x

import re
from datetime import datetime

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import torch
from bs4 import BeautifulSoup
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [0]:
# Directory handed to the Estimator's RunConfig as model_dir further down.
OUTPUT_DIR = 'OUTPUT_MODEL'
# Make sure the directory exists before training starts (TF 1.x file API).
tf.gfile.MakeDirs(OUTPUT_DIR)

***** Model output directory: OUTPUT_TEST *****


Data Preparation

In [0]:
# Read the worksheet named 'Sheet1' from each workbook into its own DataFrame.
datanews = pd.read_excel(r'path_to_newscontent.xlsx', sheet_name='Sheet1')
datacomment = pd.read_excel(r'path_to_comments.xlsx', sheet_name='Sheet1')

In [0]:
def datapreparation(data,data_column):
  """Cast the text column to str and compute the train/test boundary.

  Args:
    data: DataFrame holding the examples; its `data_column` is overwritten
      in place with the string form of every value.
    data_column: name of the column that holds the text.

  Returns:
    A tuple (data, data_column, num_split) where `data` is the same object
    that was passed in and `num_split` is 80% of the row count, truncated
    to an int.
  """
  # str() is applied element by element, so non-string cells become text too.
  data[data_column] = list(map(str, data[data_column]))
  row_count = data.shape[0]
  num_split = int(row_count * 0.8)
  return data, data_column, num_split

In [0]:
# Column of `data` that holds the class label of each row.
LABEL_COLUMN = 'label'
# The class ids passed to convert_examples_to_features.
label_list = [0, 1, 2]
# Active corpus: the comments, with the "comment" column cast to str.
data, DATA_COLUMN, num_split= datapreparation(datacomment, "comment")
# Alternative corpus: the news articles, using their "all_lower" column.
#data, DATA_COLUMN, num_split= datapreparation(datanews, "all_lower")

In [0]:
def _row_to_input_example(row):
  """Wrap one DataFrame row as a single-sentence BERT InputExample."""
  return bert.run_classifier.InputExample(
      guid=None,
      text_a=row[DATA_COLUMN],
      text_b=None,
      label=row[LABEL_COLUMN])


# The first num_split rows (the leading 80%) form the training set.
train_InputExamples = data.iloc[:num_split].apply(_row_to_input_example, axis=1)

# NOTE(review): this slice starts at num_split + 1, so the row at position
# num_split ends up in neither split. The evaluation cell slices `data` the
# same way, so the two slices would have to be changed together.
test_InputExamples = data.iloc[(num_split+1):].apply(_row_to_input_example, axis=1)

In [0]:
# TF-Hub handle of the BERT module used for both tokenization and the model.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Build a FullTokenizer from the vocab file and casing flag of the hub module."""
  # Instantiate the module in its own graph rather than the default one.
  with tf.Graph().as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as session:
      vocab_path, lower_case = session.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)

tokenizer = create_tokenizer_from_hub_module()

In [0]:
# Sequence length passed to convert_examples_to_features and to the input functions.
MAX_SEQ_LENGTH = 128
# Turn the InputExamples into feature objects; their input_ids, input_mask,
# segment_ids and label_id attributes are read by input_fn_builder later on.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
test_features = bert.run_classifier.convert_examples_to_features(test_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

BERT Model

In [0]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Build the classification head on top of the BERT hub module.

  Args:
    is_predicting: True when the graph is built for inference only.
    input_ids: token id tensor fed to the module's "tokens" signature.
    input_mask: attention mask tensor fed to the module.
    segment_ids: segment id tensor fed to the module.
    labels: integer class ids; only used to compute the loss.
    num_labels: number of output classes.

  Returns:
    (predicted_labels, log_probs) when `is_predicting` is True, otherwise
    (loss, predicted_labels, log_probs).
  """
  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  output_layer = bert_outputs["pooled_output"]
  hidden_size = output_layer.shape[-1].value

  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):
    # Dropout is a training-time regulariser. Leaving it on while predicting
    # randomly zeroes activations and makes the predictions non-deterministic.
    if not is_predicting:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # argmax over the class axis already yields one label per example. No
    # squeeze here: it would collapse a batch of a single example to a scalar
    # and drop the batch dimension the Estimator iterates over.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)

    if is_predicting:
      return (predicted_labels, log_probs)

    # Cross-entropy between the one-hot targets and the log-softmax output.
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)


In [0]:
def model_fn(features, labels, mode, params):
  """Estimator model function wrapping `create_model`.

  Args:
    features: dict with "input_ids", "input_mask", "segment_ids" and
      "label_ids" tensors, as produced by the input functions.
    labels: unused; the label ids are read from features["label_ids"].
    mode: a tf.estimator.ModeKeys value.
    params: dict of hyper-parameters. "learning_rate", "num_train_steps" and
      "num_warmup_steps" are read in TRAIN mode. An optional "num_labels"
      overrides the number of classes (defaults to 3).

  Returns:
    A tf.estimator.EstimatorSpec for the requested mode.
  """
  input_ids = features["input_ids"]
  input_mask = features["input_mask"]
  segment_ids = features["segment_ids"]
  label_ids = features["label_ids"]
  num_labels = params.get("num_labels", 3)
  is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

  if is_predicting:
    (predicted_labels, log_probs) = create_model(
      is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

    # NOTE: the values stored under 'probabilities' are log-probabilities
    # (the log_softmax output); the key is kept for existing consumers.
    predictions = {
        'probabilities': log_probs,
        'labels': predicted_labels,
    }
    return tf.estimator.EstimatorSpec(mode, predictions=predictions)

  (loss, predicted_labels, log_probs) = create_model(
    is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

  if mode == tf.estimator.ModeKeys.TRAIN:
    # The optimizer is only needed (and only built) when training.
    train_op = bert.optimization.create_optimizer(
        loss, params["learning_rate"],
        params["num_train_steps"], params["num_warmup_steps"], use_tpu=False)
    return tf.estimator.EstimatorSpec(mode=mode,
      loss=loss,
      train_op=train_op)

  # EVAL: return a real spec instead of falling through and returning None,
  # so estimator.evaluate() works.
  eval_metric_ops = {
      "eval_accuracy": tf.metrics.accuracy(
          labels=label_ids, predictions=predicted_labels),
  }
  return tf.estimator.EstimatorSpec(mode=mode,
    loss=loss,
    eval_metric_ops=eval_metric_ops)


Train Model

In [0]:
# Examples per batch; reaches input_fn through params["batch_size"].
BATCH_SIZE = 16
# Learning rate handed to bert.optimization.create_optimizer.
LEARNING_RATE = 5e-5
# Number of passes over the training features; drives num_train_steps.
NUM_TRAIN_EPOCHS = 100
# Fraction of the total training steps used as warm-up steps.
WARMUP_PROPORTION = 0.1
# Checkpoint and summary cadence in steps, both passed to RunConfig.
SAVE_CHECKPOINTS_STEPS = 100
SAVE_SUMMARY_STEPS = 1

In [0]:
# Estimator run configuration: where to write model files and how often to
# save summaries / checkpoints and log the step rate.
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    log_step_count_steps=10)

In [0]:
# Total training steps = (training examples / batch size) * epochs, truncated to an int.
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
# Warm-up steps are a fixed fraction of the total, also truncated.
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

In [0]:
def input_fn_builder(features, seq_length, is_training, drop_remainder):
  """Create an Estimator `input_fn` closure over in-memory features.

  Args:
    features: sized iterable of objects exposing `input_ids`, `input_mask`,
      `segment_ids` and `label_id`.
    seq_length: second dimension given to the id / mask / segment tensors.
    is_training: when True the dataset is repeated and then shuffled.
    drop_remainder: forwarded to `Dataset.batch`.

  Returns:
    A function taking `params` (it reads params["batch_size"]) and returning
    a batched tf.data.Dataset of feature dicts.
  """
  # Gather every field in a single pass over `features`.
  collected = {
      "input_ids": [],
      "input_mask": [],
      "segment_ids": [],
      "label_ids": [],
  }
  for feat in features:
    collected["input_ids"].append(feat.input_ids)
    collected["input_mask"].append(feat.input_mask)
    collected["segment_ids"].append(feat.segment_ids)
    collected["label_ids"].append(feat.label_id)

  def input_fn(params):
    per_batch = params["batch_size"]
    total = len(features)

    # The three token-level fields share the [examples, seq_length] shape.
    tensors = {
        key: tf.constant(
            collected[key], shape=[total, seq_length], dtype=tf.int32)
        for key in ("input_ids", "input_mask", "segment_ids")
    }
    tensors["label_ids"] = tf.constant(
        collected["label_ids"], shape=[total], dtype=tf.int32)

    dataset = tf.data.Dataset.from_tensor_slices(tensors)

    if is_training:
      dataset = dataset.repeat().shuffle(buffer_size=100)

    return dataset.batch(batch_size=per_batch, drop_remainder=drop_remainder)

  return input_fn

In [0]:
# Training input: is_training=True makes the dataset repeat and shuffle.
train_input_fn = input_fn_builder(
  features=train_features,
  seq_length=MAX_SEQ_LENGTH,
  is_training=True,
  drop_remainder=False)
# Test input: no repeat/shuffle, and the final partial batch is kept
# (drop_remainder=False).
test_input_fn = input_fn_builder(
  features=test_features,
  seq_length=MAX_SEQ_LENGTH,
  is_training=False,
  drop_remainder=False)

In [0]:
print('Beginning Training!')
# Wall-clock start, used to report the training duration below.
current_time = datetime.now()
# `params` is forwarded to model_fn (learning rate, step counts) and to the
# input function (batch_size).
# NOTE(review): the "epoch" entry is not read by model_fn or input_fn in this notebook.
estimator = tf.estimator.Estimator(
      model_fn=model_fn,
      config=run_config,
      params={"batch_size": BATCH_SIZE,
              "learning_rate": LEARNING_RATE,
             "num_train_steps": num_train_steps,
             "num_warmup_steps": num_warmup_steps,
             "epoch":NUM_TRAIN_EPOCHS})
# The training dataset repeats indefinitely, so max_steps is what ends training.
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

Evaluation

In [0]:
# Run inference over the test features; predict() yields one dict per example.
predictions = estimator.predict(test_input_fn)
prelabel = [pre['labels'] for pre in predictions]

# Take the ground truth from the same feature objects that were fed to
# predict(), so both lists always cover the same examples in the same order.
# label_id is the position of the label in label_list, which is also what the
# model's argmax output represents.
truelabel = [feature.label_id for feature in test_features]

# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall are exchanged and support counts the predictions.
print(classification_report(truelabel, prelabel))