In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import tensorflow as tf
import tensorflow_hub as tf_hub
import time
import numpy as np
import os
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K
import matplotlib.pyplot as plt
import text_normalizer as tn

#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
tf.logging.set_verbosity(tf.logging.INFO)
SEED = 42
np.random.seed(SEED)
tf.set_random_seed(SEED)

%matplotlib inline

W0709 08:08:28.935521 139808272750400 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
print(tf.__version__)
print(tf_hub.__version__)

1.12.0
0.4.0


In [3]:
print(tf.test.is_gpu_available())
print(tf.test.gpu_device_name())

True
/device:GPU:0


In [4]:
dataset = pd.read_csv('./data/GH_complete_labeled_issues_prs - preprocessed.csv', encoding='utf-8', 
                      na_filter=False)
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152151 entries, 0 to 152150
Data columns (total 2 columns):
description    152151 non-null object
label          152151 non-null int64
dtypes: int64(1), object(1)
memory usage: 2.3+ MB


In [5]:
dataset = dataset[dataset.label != 0]

In [6]:
from collections import Counter

texts = dataset['description'].tolist()
labels = dataset['label'].tolist()

print('Before:', Counter(labels))
labels = [0 if item == 1 else 1 for item in labels]
print('After:', Counter(labels))

Before: Counter({1: 22572, 2: 671})
After: Counter({0: 22572, 1: 671})


In [7]:
from sklearn.model_selection import train_test_split

train_text, test_text, train_labels, test_labels = train_test_split(texts, labels, 
                                                                    test_size=0.25, random_state=SEED)
len(train_text), len(test_text)

(17432, 5811)

In [8]:
class PaddingInputExample(object):
    """Fake example so the num input examples is a multiple of the batch size.
       When running eval/predict on the TPU, we need to pad the number of examples
       to be a multiple of the batch size, because the TPU requires a fixed batch
       size. The alternative is to drop the last batch, which is bad because it means
       the entire output data won't be generated.
       We use this class instead of `None` because treating `None` as padding
       batches could cause silent errors.
  """
    
    
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        
        """Constructs a InputExample.
    Args:
      guid: Unique id for the example.
      text_a: string. The untokenized text of the first sequence. For single
        sequence tasks, only this sequence must be specified.
      text_b: (Optional) string. The untokenized text of the second sequence.
        Only must be specified for sequence pair tasks.
      label: (Optional) string. The label of the example. This should be
        specified for train and dev examples, but not for test examples.
    """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

In [9]:
def create_tokenizer_from_hub_module(bert_path):
    """Get the vocab file and casing info from the Hub module."""
    bert_module =  tf_hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [
            tokenization_info["vocab_file"],
            tokenization_info["do_lower_case"],
        ]
    )

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [10]:
def convert_text_to_examples(texts, labels):
    """Create InputExamples"""
    InputExamples = []
    for text, label in zip(texts, labels):
        InputExamples.append(
            InputExample(guid=None, text_a=text, text_b=None, label=label)
        )
    return InputExamples

In [11]:
def convert_single_example(tokenizer, example, max_seq_length=256):
    """Converts a single `InputExample` into a single `InputFeatures`."""

    if isinstance(example, PaddingInputExample):
        input_ids = [0] * max_seq_length
        input_mask = [0] * max_seq_length
        segment_ids = [0] * max_seq_length
        label = 0
        return input_ids, input_mask, segment_ids, label

    tokens_a = tokenizer.tokenize(example.text_a)
    if len(tokens_a) > max_seq_length - 2:
        tokens_a = tokens_a[0 : (max_seq_length - 2)]

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a set of `InputExample`s to a list of `InputFeatures`."""

    input_ids, input_masks, segment_ids, labels = [], [], [], []
    for example in tqdm_notebook(examples, desc="Converting examples to features"):
        input_id, input_mask, segment_id, label = convert_single_example(
            tokenizer, example, max_seq_length
        )
        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        labels.append(label)
    return (
        np.array(input_ids),
        np.array(input_masks),
        np.array(segment_ids),
        np.array(labels).reshape(-1, 1),
    )

In [12]:
# Initialize session
sess = tf.Session()

# Params for bert model and tokenization
BERT_PATH = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"
MAX_SEQ_LENGTH = 512

In [13]:
# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module(bert_path=BERT_PATH)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0709 08:08:34.764133 139808272750400 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


In [14]:
train_text_lengths = np.array([len(doc.split(' ')) for doc in train_text])
test_text_lengths = np.array([len(doc.split(' ')) for doc in test_text])

In [15]:
train_text_idx = np.argwhere(train_text_lengths >= 5).ravel()
test_text_idx = np.argwhere(test_text_lengths >= 5).ravel()

train_text = [train_text[i] for i in train_text_idx]
train_labels = [train_labels[i] for i in train_text_idx]
test_text = [test_text[i] for i in test_text_idx]
test_labels = [test_labels[i] for i in test_text_idx]

len(train_text), len(train_labels), len(test_text), len(test_labels)

(17389, 17389, 5794, 5794)

In [16]:
# Convert data to InputExample format
train_examples = convert_text_to_examples(train_text, train_labels)
test_examples = convert_text_to_examples(test_text, test_labels)

In [18]:
#(train_input_ids, train_input_masks, 
# train_segment_ids, train_labels) =  convert_examples_to_features(tokenizer=tokenizer, 
#                                                                  examples=train_examples, 
#                                                                  max_seq_length=MAX_SEQ_LENGTH)

(test_input_ids, test_input_masks, 
 test_segment_ids, test_labels) =  convert_examples_to_features(tokenizer=tokenizer, 
                                                                examples=test_examples, 
                                                                max_seq_length=MAX_SEQ_LENGTH)

HBox(children=(IntProgress(value=0, description='Converting examples to features', max=5794, style=ProgressSty…




In [19]:
test_input_ids.shape

(5794, 512)

# Inference on CPU

In [20]:
# need to change from tf.layers to tf.keras
class BertLayer(tf.layers.Layer):
    
    def __init__(self, bert_path, n_fine_tune_encoders=10, **kwargs,):
        
        self.n_fine_tune_encoders = n_fine_tune_encoders
        self.trainable = True
        self.output_size = 768
        self.bert_path = bert_path
        super(BertLayer, self).__init__(**kwargs)

        
    def build(self, input_shape):
        self.bert = tf_hub.Module(self.bert_path,
                                  trainable=self.trainable, 
                                  name=f"{self.name}_module")

        # Remove unused layers
        trainable_vars = self.bert.variables
        trainable_vars = [var for var in trainable_vars 
                                  if not "/cls/" in var.name]
        trainable_layers = ["embeddings", "pooler/dense"]


        # Select how many layers to fine tune
        for i in range(self.n_fine_tune_encoders+1):
            trainable_layers.append(f"encoder/layer_{str(10 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [var for var in trainable_vars
                                  if any([l in var.name 
                                              for l in trainable_layers])]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:# and 'encoder/layer' not in var.name:
                self._non_trainable_weights.append(var)
        print('Trainable layers:', len(self._trainable_weights))
        print('Non Trainable layers:', len(self._non_trainable_weights))

        super(BertLayer, self).build(input_shape)

        
    def call(self, inputs):
        
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(input_ids=input_ids, 
                           input_mask=input_mask, 
                           segment_ids=segment_ids)
        
        pooled = self.bert(inputs=bert_inputs, 
                           signature="tokens", 
                           as_dict=True)["pooled_output"]

        return pooled

    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size) 

In [21]:
# Build model
def build_model(bert_path, max_seq_length, n_fine_tune_encoders=10): 
    
    inp_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    inp_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    inp_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [inp_id, inp_mask, inp_segment]
    
    bert_output = BertLayer(bert_path=bert_path, 
                            n_fine_tune_encoders=n_fine_tune_encoders)(bert_inputs)
    
    dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
    pred = tf.keras.layers.Dense(1, activation='sigmoid')(dense)
    
    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss='binary_crossentropy', 
                  optimizer=tf.keras.optimizers.Adam(lr=2e-5), 
                  metrics=['accuracy'])    
    return model

In [22]:
def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)

In [23]:
model = None

In [24]:
with tf.device('cpu:0'):
    model = build_model(bert_path=BERT_PATH, max_seq_length=MAX_SEQ_LENGTH, n_fine_tune_encoders=10)
    initialize_vars(sess)
    model.load_weights('./bert_cve_model_weights_seq512b15.h5')

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0709 08:10:23.604956 139808272750400 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


Trainable layers: 199
Non Trainable layers: 5
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0709 08:10:25.335416 139808272750400 tf_logging.py:115] Saver not created because there are no variables in the graph to restore


In [26]:
test_predictions = model.predict(x=[test_input_ids[:1000], 
                                    test_input_masks[:1000], 
                                    test_segment_ids[:1000]],
                                 batch_size=512,
                                 verbose=1)



In [27]:
test_predictions = model.predict(x=[test_input_ids[:1000], 
                                    test_input_masks[:1000], 
                                    test_segment_ids[:1000]],
                                 batch_size=50,
                                 verbose=1)



# BERT as a feature extractor - contextual embeddings

In [50]:
model.layers

[<tensorflow.python.keras.engine.input_layer.InputLayer at 0x7f26bda34e10>,
 <tensorflow.python.keras.engine.input_layer.InputLayer at 0x7f26bda346d8>,
 <tensorflow.python.keras.engine.input_layer.InputLayer at 0x7f26bda34748>,
 <__main__.BertLayer at 0x7f26bda34d68>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f27a07ace48>,
 <tensorflow.python.keras.layers.core.Dense at 0x7f230551be48>]

In [47]:
m = tf.keras.models.Model(inputs=model.inputs, 
                          outputs=model.layers[3].output)

In [49]:
r = m.predict(x=[test_input_ids[:1], 
             test_input_masks[:1], 
             test_segment_ids[:1]])
r, r.shape

(array([[ 0.47619382, -0.6549353 , -0.9999619 ,  0.9375201 ,  0.97283417,
         -0.78681475,  0.8963109 ,  0.52210534, -0.99991065,  0.9983653 ,
         -0.9894177 ,  0.9995566 , -0.9802466 ,  0.99943465, -0.9486656 ,
         -0.9768426 , -0.9933897 , -0.28235286,  0.5027091 , -0.88477266,
          0.12394645,  0.9999988 , -0.98896664,  0.4716109 ,  0.27310258,
          0.99990416, -0.8426162 , -0.9560043 , -0.9793524 , -0.88095266,
         -0.7740885 ,  0.61384785,  0.99644727, -0.16334432, -0.9998966 ,
          0.9947635 ,  0.72072214,  0.925317  , -0.74917775, -0.4458454 ,
          0.9307592 ,  0.5275633 , -0.97903585,  0.94861597, -0.08209433,
          0.03902055, -0.99998385,  0.7135445 ,  0.96231747,  0.99993336,
          0.99981123,  0.9997307 ,  0.5342699 ,  0.56266433,  0.48588434,
         -0.7462636 ,  0.46644196,  0.7054201 , -0.5018115 , -0.6332456 ,
         -0.2952568 ,  0.78966975, -0.9981369 ,  0.9777408 ,  0.9999815 ,
          0.9999764 , -0.89810616, -0.

In [24]:
model_arch_json = model.to_json()

In [25]:
model_arch_json

'{"class_name": "Model", "config": {"name": "model", "layers": [{"name": "input_ids", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 512], "dtype": "float32", "sparse": false, "name": "input_ids"}, "inbound_nodes": []}, {"name": "input_masks", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 512], "dtype": "float32", "sparse": false, "name": "input_masks"}, "inbound_nodes": []}, {"name": "segment_ids", "class_name": "InputLayer", "config": {"batch_input_shape": [null, 512], "dtype": "float32", "sparse": false, "name": "segment_ids"}, "inbound_nodes": []}, {"name": "bert_layer_1", "class_name": "BertLayer", "config": {"name": "bert_layer_1", "trainable": true, "dtype": "float32"}, "inbound_nodes": [[["input_ids", 0, 0, {}], ["input_masks", 0, 0, {}], ["segment_ids", 0, 0, {}]]]}, {"name": "dense", "class_name": "Dense", "config": {"name": "dense", "trainable": true, "dtype": "float32", "units": 256, "activation": "relu", "use_bias": true, "kernel_in