In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
import tensorflow_hub as hub
from tensorflow.keras.callbacks import EarlyStopping
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

W0802 18:01:35.555200 140215057229632 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



### Get Data

In [2]:
data = pd.read_csv('dataset/data/articles_dataset.csv')
data = data[~data['content'].isnull()]
data.head()

Unnamed: 0,title,content,link,source,class
0,Tibit Communications Raises $20M in Series B F...,"Tibit Communications, Inc., a Petaluma, CA-bas...",http://www.finsmes.com/2019/04/tibit-communica...,FinsmesUSA,Funding
1,Twitter blames human error after blocking a Ne...,"Over the holiday weekend, The New York Times f...",https://techcrunch.com/2017/11/27/twitter-blam...,techcrunch,Other
2,SimplyCook Raises £4.5M in Series A Funding\n,"SimplyCook, a London, UK-based recipe kit serv...",http://www.finsmes.com/2019/01/simplycook-rais...,FinsmesUK,Funding
3,Moogsoft Secures $40M in Series D Funding\n,"Moogsoft, a San Francisco, CA-based provider o...",http://www.finsmes.com/2018/03/moogsoft-secure...,FinsmesUSA,Funding
4,Zeta Global acquires commenting service†Disqus,A source close to the two companies tells us t...,https://techcrunch.com/2017/12/05/zeta-global-...,techcrunch,Other


In [3]:
data.shape

(44031, 5)

In [4]:
data['class'] = data['class'].map({'Funding':1, 'Other':0})
data.head()

Unnamed: 0,title,content,link,source,class
0,Tibit Communications Raises $20M in Series B F...,"Tibit Communications, Inc., a Petaluma, CA-bas...",http://www.finsmes.com/2019/04/tibit-communica...,FinsmesUSA,1
1,Twitter blames human error after blocking a Ne...,"Over the holiday weekend, The New York Times f...",https://techcrunch.com/2017/11/27/twitter-blam...,techcrunch,0
2,SimplyCook Raises £4.5M in Series A Funding\n,"SimplyCook, a London, UK-based recipe kit serv...",http://www.finsmes.com/2019/01/simplycook-rais...,FinsmesUK,1
3,Moogsoft Secures $40M in Series D Funding\n,"Moogsoft, a San Francisco, CA-based provider o...",http://www.finsmes.com/2018/03/moogsoft-secure...,FinsmesUSA,1
4,Zeta Global acquires commenting service†Disqus,A source close to the two companies tells us t...,https://techcrunch.com/2017/12/05/zeta-global-...,techcrunch,0


In [5]:
train_size = round(0.8 * data.shape[0])
dev_end = train_size + round(0.1 * data.shape[0])

train_df = data.iloc[:train_size, :]
dev_df = data.iloc[train_size : dev_end, :]
test_df = data.iloc[dev_end :]

train_df.head()

Unnamed: 0,title,content,link,source,class
0,Tibit Communications Raises $20M in Series B F...,"Tibit Communications, Inc., a Petaluma, CA-bas...",http://www.finsmes.com/2019/04/tibit-communica...,FinsmesUSA,1
1,Twitter blames human error after blocking a Ne...,"Over the holiday weekend, The New York Times f...",https://techcrunch.com/2017/11/27/twitter-blam...,techcrunch,0
2,SimplyCook Raises £4.5M in Series A Funding\n,"SimplyCook, a London, UK-based recipe kit serv...",http://www.finsmes.com/2019/01/simplycook-rais...,FinsmesUK,1
3,Moogsoft Secures $40M in Series D Funding\n,"Moogsoft, a San Francisco, CA-based provider o...",http://www.finsmes.com/2018/03/moogsoft-secure...,FinsmesUSA,1
4,Zeta Global acquires commenting service†Disqus,A source close to the two companies tells us t...,https://techcrunch.com/2017/12/05/zeta-global-...,techcrunch,0


#### Create tokenizer

In [6]:
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

W0802 18:02:30.992128 140215057229632 deprecation_wrapper.py:119] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



#### Create input examples

In [7]:
def create_input_examples(df):
  examples = df.apply(lambda row : run_classifier.InputExample(guid=None, text_a=row['content'], text_b=None, label= row['class']), axis=1)
  return examples

In [8]:
train_examples = create_input_examples(train_df)
dev_examples = create_input_examples(dev_df)
test_examples = create_input_examples(test_df)

#### Converting examples to features

In [9]:
def convert_example_to_features(example, max_len, tokenizer):
  
  example_tokens = tokenizer.tokenize(example.text_a)
  
  if len(example_tokens)> max_len -2:
    example_tokens = example_tokens[: (max_len -2)]
   
  tokens = ['[CLS]']
  tokens = tokens + example_tokens + ['[SEP]']
  tokens=  tokens + ['[PAD]'] * (max_len - len(tokens))
  
  segment_ids = [0 for token in tokens]
  input_ids = tokenizer.convert_tokens_to_ids(tokens)
  input_masks = [1] * (len(example_tokens) + 1) + [0] * (max_len - len(example_tokens) -1)
  
  return input_ids, input_masks, segment_ids, example.label


def convert_example_list_to_features(tokenizer, examples, max_len):
  input_ids = []
  input_masks = []
  segment_ids = []
  labels = []
  
  for example in examples:
    input_id, input_mask, segment_id, label = convert_example_to_features(example, max_len, tokenizer)
    input_ids.append(input_id)
    input_masks.append(input_mask)
    segment_ids.append(segment_id)
    labels.append(label)
  
  return np.array(input_ids), np.array(input_masks), np.array(segment_ids), np.array(labels)

In [10]:
(train_input_ids, train_input_masks, train_segment_ids, train_labels) = convert_example_list_to_features(tokenizer, train_examples.tolist(), max_len=256)
(dev_input_ids, dev_input_masks, dev_segment_ids, dev_labels) = convert_example_list_to_features(tokenizer, dev_examples.tolist(), max_len=256)
(test_input_ids, test_input_masks, test_segment_ids, test_labels) = convert_example_list_to_features(tokenizer, test_examples.tolist(), max_len=256)

### Create Model

#### First create BertLayer

In [20]:
class BertLayer(tf.keras.layers.Layer):
    def __init__(self, n_fine_tune_layers=10, **kwargs):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1",
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )
        trainable_vars = self.bert.variables
        
        # Remove unused layers
        trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
        
        # Select how many layers to fine tune
        trainable_vars = trainable_vars[-self.n_fine_tune_layers :]
        
        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)
        
        # Add non-trainable weights
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)
        
        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
            "pooled_output"
        ]
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [21]:
max_len=256

In [22]:
in_id = tf.keras.layers.Input(shape=(max_len,), name="input_ids")
in_mask = tf.keras.layers.Input(shape=(max_len,), name="input_masks")
in_segment = tf.keras.layers.Input(shape=(max_len,), name="segment_ids")
bert_inputs = [in_id, in_mask, in_segment]

# Instantiate the custom Bert Layer defined above
bert_output = BertLayer(n_fine_tune_layers=10)(bert_inputs)

# Build the rest of the classifier 
dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
pred = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

W0802 18:19:40.807928 140215057229632 deprecation.py:323] From /home/brianmusisi/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert_layer_2 (BertLayer)        (None, 768)          108931396   input_ids[0][0]                  
                                                                 input_masks[0][0]            

In [23]:
tf.test.is_gpu_available()

True

#### Start session and initialize variables

In [39]:
sess = tf.Session()

def initialize_session(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)
    
initialize_session(sess)

In [25]:
early_stop = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)

In [28]:
model.reset_states()
history = model.fit(
    [train_input_ids, train_input_masks, train_segment_ids], 
    train_labels,
    validation_data=([dev_input_ids, dev_input_masks, test_segment_ids], test_labels),
    epochs=20,
    batch_size=128,
    callbacks=[early_stop]
)

Train on 35225 samples, validate on 4403 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 00012: early stopping


#### Function to plot history

In [37]:
def plot_history(history):
    acc = history.history['acc']
    val_acc = history.history['val_acc']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()

#### Evaluate on the test_data

In [29]:
model.evaluate([test_input_ids, test_input_masks, test_segment_ids], test_labels)



[0.22133106133381725, 0.9218714]

In [30]:
print('Testing Accuracy is: {}'.format(0.9218714))

Testing Accuracy is: 0.9218714


In [32]:
model.save('models/Bert/BertModel_Dense256.h5')

#### Add more Dense Layers

In [43]:
in_id = tf.keras.layers.Input(shape=(max_len,), name="input_ids")
in_mask = tf.keras.layers.Input(shape=(max_len,), name="input_masks")
in_segment = tf.keras.layers.Input(shape=(max_len,), name="segment_ids")
bert_inputs = [in_id, in_mask, in_segment]

# Instantiate the custom Bert Layer defined above
bert_output = BertLayer(n_fine_tune_layers=10)(bert_inputs)

# Build the rest of the classifier 
dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
dense = tf.keras.layers.Dense(128, activation='relu')(dense)
pred = tf.keras.layers.Dense(1, activation='sigmoid')(dense)

model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 256)]        0                                            
__________________________________________________________________________________________________
bert_layer_4 (BertLayer)        (None, 768)          108931396   input_ids[0][0]                  
                                                                 input_masks[0][0]          

In [44]:
sess = tf.Session()

initialize_session(sess)

In [45]:
model.reset_states()
history = model.fit(
    [train_input_ids, train_input_masks, train_segment_ids], 
    train_labels,
    validation_data=([dev_input_ids, dev_input_masks, dev_segment_ids], dev_labels),
    epochs=20,
    batch_size=128,
    callbacks=[early_stop]
)

Train on 35225 samples, validate on 4403 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 00012: early stopping


In [46]:
loss, accuracy = model.evaluate([test_input_ids, test_input_masks, test_segment_ids], test_labels)
print('Testing Accuracy is: {}'.format(accuracy))

Testing Accuracy is: 0.9086986184120178


In [47]:
model.save('models/Bert/BertModel_Dense_256_128.h5')