This notebook will cover the model building for Bluegene supercomputer log dataset. The model building will include the following sections.

1.) loading required packages

2.) loading dataset

3.) dataprepration 

4.) word embeddings using Word2vec for DAN( deep average network) and WAN(weighted average network) 

5.) Creating WAN and DAN models

6.) Loading BERT models, this will include 
    1.) BERT with CLS tokens 

    2.) BERT with CLS token with pretraining turned on for the BERT 

    3.) BERT with pooled tokens

    4.) BERT with a CNN layer

Every model will be evaluated using the test/Val. dataset and the key metrics which are used are 

1.) Precision 

2.) Recall 

3.) F1 score

In [1]:
#Loading the google drive
from google.colab import drive 
drive.mount('/content/gdrive/')

Mounted at /content/gdrive/


In [2]:
#loading the packages
!pip install pydot --quiet
!pip install gensim==3.8.3 --quiet
!pip install tensorflow-datasets --quiet
!pip install -U tensorflow-text==2.8.2 --quiet
!pip install transformers --quiet

[K     |████████████████████████████████| 24.2 MB 4.9 MB/s 
[K     |████████████████████████████████| 4.9 MB 4.7 MB/s 
[K     |████████████████████████████████| 498.0 MB 13 kB/s 
[K     |████████████████████████████████| 5.8 MB 66.5 MB/s 
[K     |████████████████████████████████| 1.4 MB 52.8 MB/s 
[K     |████████████████████████████████| 462 kB 84.9 MB/s 
[K     |████████████████████████████████| 5.8 MB 4.8 MB/s 
[K     |████████████████████████████████| 182 kB 92.9 MB/s 
[K     |████████████████████████████████| 7.6 MB 74.7 MB/s 
[?25h

In [3]:
#loading the packages
import pandas as pd
import numpy as np
import sys
import os
import re

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
import tensorflow_datasets as tfds
import tensorflow_text as tf_text

from transformers import BertTokenizer, TFBertModel


import sklearn as sk

import nltk
from nltk.data import find

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


#This continues to work with gensim 3.8.3.  It doesn't yet work with 4.x.  
#Make sure your pip install command specifies gensim==3.8.3
import gensim
from gensim.models import word2vec, FastText
from gensim.utils import tokenize

In [4]:
#changing the directory to where the data is saved
os.chdir('/content/gdrive/MyDrive/W266_Final_Project/Tbird parsed/')

In [5]:
#creating the dataframe from bgl dataset
df=pd.read_csv('clean_log.csv')

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,Content1,Label1,EventId
0,0,in.tftpd[] tftp client does not accept options,0,ba463c69
1,1,postfix postdrop[] warning unable to look up p...,0,85f57867
2,2,postfix postdrop[] warning unable to look up p...,0,85f57867
3,3,postfix postdrop[] warning unable to look up p...,0,85f57867
4,4,postfix postdrop[] warning unable to look up p...,0,85f57867


In [7]:
#loading the NLTK for the word2vec embeddings 
nltk.download('word2vec_sample')

word2vec_sample = str(find('models/word2vec_sample/pruned.word2vec.txt'))

model = gensim.models.KeyedVectors.load_word2vec_format(word2vec_sample, binary=False)

[nltk_data] Downloading package word2vec_sample to /root/nltk_data...
[nltk_data]   Unzipping models/word2vec_sample.zip.


In [8]:
#creating the word ensamble for the tokenization 
whole_data_emb = df["Content1"].values.tolist()
whole_data_label=df["Label1"].values.tolist()

In [9]:
len(whole_data_emb)

5000000

In [None]:
# from sklearn.model_selection import train_test_split
# X_train, X_val, y_train, y_val = train_test_split(whole_data_emb, whole_data_label, test_size=0.25, random_state=1) # 0.25 x 0.8 = 0.2

In [27]:
#splitting the dataset into train, test and validation set
train_ratio = 0.80
test_ratio = 0.10
validation_ratio = 0.10



X_train, X_test, y_train, y_test = train_test_split(whole_data_emb, whole_data_label, test_size=test_ratio, random_state=1)

X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=validation_ratio/(train_ratio+test_ratio))

print("Train Size:",len(X_train))
print("Test Size:",len(X_test))
print("Valid Size:",len(X_valid))

Train Size: 3999999
Test Size: 500000
Valid Size: 500001


In [11]:
#@title Embedding Matrix Creation

EMBEDDING_DIM = 300    # we know... it's 300

# initialize embedding matrix and word-to-id map:
embedding_matrix = np.zeros((len(model.vocab.keys()) + 1, EMBEDDING_DIM))       
vocab_dict = {}

# build the embedding matrix and the word-to-id map:
for i, word in enumerate(model.vocab.keys()):
    embedding_vector = model[word]
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        vocab_dict[word] = i

# we can use the last index at the end of the vocab for unknown tokens
vocab_dict['[UNK]'] = len(vocab_dict)

In [12]:
#setting the sequence length
SEQUENCE_LENGTH = 100

In [13]:
# creating the word to vocab ids from the dataset
def docs_to_vocab_ids(tokenized_texts_list):
    """
    converting a list of strings to a list of lists of word ids
    """
    texts_vocab_ids = []
    text_labels = []
    valid_example_list = []
    for i, token_list in enumerate(tokenized_texts_list):

        # Get the vocab id for each token in this doc ([UNK] if not in vocab)
        vocab_ids = []
        for token in list(token_list.numpy()):
            decoded = token.decode('utf-8', errors='ignore')
            if decoded in vocab_dict:
                vocab_ids.append(vocab_dict[decoded])
            else:
                vocab_ids.append(vocab_dict['[UNK]'])
            
        # Truncate text to max length, add padding up to max length
        vocab_ids = vocab_ids[:SEQUENCE_LENGTH]
        n_padding = (SEQUENCE_LENGTH - len(vocab_ids))
        # For simplicity in this model, we'll just pad with unknown tokens
        vocab_ids += [vocab_dict['[UNK]']] * n_padding
        valid_example_list.append(i)
        # Add this example to the list of converted docs
        texts_vocab_ids.append(vocab_ids)
            
        if i % 50000 == 0:
            print('Examples processed: ', i)

    print('Total examples: ', i)
    return (np.array(texts_vocab_ids), valid_example_list)

In [None]:
#sampling the dataset from the datafram, this is done due to the large dataset
#this is only done for the word2vec embedding as it takes considerable resources
tokenizer = tf_text.WhitespaceTokenizer()
train_tokens = tokenizer.tokenize(X_train[:500000])
test_tokens= tokenizer.tokenize(X_test[:150000])                          

In [14]:
ytrain=np.array(y_train[:500000])
ytest=np.array(y_train[:150000])

In [None]:
train_tokens[0:3]

<tf.RaggedTensor [[b'ib', b'sm.x[]', b'[ib', b'sm', b'sweep.c]', b'**********************',
  b'NEW', b'SWEEP', b'********************']                              ,
 [b'kernel', b'THH()', b'mnt', b'projects', b'sysapps', b'src', b'ib',
  b'topspin', b'topspin-src-..-', b'third', b'party', b'thca', b'linux',
  b'kernel', b'mlxhh', b'thh', b'obj', b'host', b'amd', b'custom', b'rhel',
  b'mod', b'thh', b'hob', b'comm.c[]', b'XHH', b'hob', b'process', b'local',
  b'mad', b'Device', b'in', b'FATAL', b'state']                             ,
 [b'haldaemon', b'haldaemon', b'shutdown', b'failed']]>

In [None]:
test_tokens[0:3]

<tf.RaggedTensor [[b'sshd(pam', b'unix)[]', b'session', b'opened', b'for', b'user', b'root',
  b'by', b'(uid)']                                                         ,
 [b'gmond', b'gmond', b'shutdown', b'failed'],
 [b'apps', b'x', b'system', b'ganglia-..', b'sbin', b'gmetad[]', b'data',
  b'thread()', b'got', b'not', b'answer', b'from', b'any', b'[Thunderbird',
  b'A]', b'datasource']                                                    ]>

In [None]:

#running the workd2vec embeddings on the log dataset
train_input_ids, train_valid_example_list = docs_to_vocab_ids(train_tokens)
test_input_ids, test_valid_example_list = docs_to_vocab_ids(test_tokens)

Examples processed:  0
Examples processed:  50000
Examples processed:  100000
Examples processed:  150000
Examples processed:  200000
Examples processed:  250000
Examples processed:  300000
Examples processed:  350000
Examples processed:  400000
Examples processed:  450000
Total examples:  499999
Examples processed:  0
Examples processed:  50000
Examples processed:  100000
Total examples:  149999


In [None]:
#checking the word embeddings
train_input_ids[:2]

array([[43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981],
       [17402, 43981, 43981, 34856, 43981, 43981, 43981, 43981, 43981,
         5629,  5256, 43981, 43981, 17402, 43981, 43981, 43981, 41049,
        43981, 34917, 43981, 43981, 43981, 33125, 43981, 4398

In [None]:
#checking the word embeddings
test_input_ids[:2]

array([[43981, 43981, 12690, 14624, 11511, 15580,  5592, 12929, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981],
       [43981, 43981, 18238, 12698, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981, 43981,
        43981, 43981, 43981, 43981, 43981, 43981, 43981, 4398

DAN (Deep Average Network) using the word2vec embedding

In [None]:
def create_dan_model(retrain_embeddings=False, 
                     max_sequence_length=SEQUENCE_LENGTH,
                     hidden_dim=100,
                     dropout=0.3,
                     embedding_initializer='word2vec', 
                     learning_rate=0.001):
  """
  Construct the DAN model including the compilation and return it. Parametrize it using the arguments.
  :param retrain_embeddings: boolean, indicating whether  the word embeddings are trainable
  :param hidden_dim: dimension of the hidden layer
  :param dropout: dropout applied to the hidden layer

  :returns: the compiled model
  """

  if embedding_initializer == 'word2vec':
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix)
  else:
    embeddings_initializer='uniform'

  dan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64')
  
  dan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                    embedding_matrix.shape[1],
                                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                    input_length=SEQUENCE_LENGTH,
                                    trainable=retrain_embeddings)
  
  dan_embeddings = dan_embedding_layer(dan_input_layer)
    
  dan_avg_embeddings = tf.keras.layers.Lambda(lambda x: K.mean(x, axis=1),name='averaging')(dan_embeddings)
  
  last_hidden_output = tf.keras.layers.Dense(100, activation='relu',
                                                   name='dan_hidden_%d' % i)(dan_avg_embeddings)

  last_hidden_output = tf.keras.layers.Dropout(dropout)(last_hidden_output)

  dan_classification = tf.keras.layers.Dense(1,
                                               activation='sigmoid',
                                               name='dan_classification')(last_hidden_output)

    
  dan_model1 = tf.keras.models.Model(inputs=dan_input_layer, outputs=dan_classification)
    
    
  dan_model1.compile(loss='binary_crossentropy',
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001,
                                                         beta_1=0.9,
                                                         beta_2=0.999,
                                                         epsilon=1e-07,
                                                         amsgrad=False,
                                                         name='Adam'),
                      metrics=['accuracy'])
  
  print(dan_model1.summary())
  return dan_model1


In [None]:
dan_model1 = create_dan_model()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 300)          13194600  
                                                                 
 averaging (Lambda)          (None, 300)               0         
                                                                 
 dan_hidden_43980 (Dense)    (None, 100)               30100     
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dan_classification (Dense)  (None, 1)                 101       
                                                                 
Total params: 13,224,801
Trainable params: 30,201
Non-trainab

In [None]:
#Exceuting the model
history_noshuffle = dan_model1.fit(train_input_ids,
                            ytrain,
                            validation_data=(test_input_ids,ytest),
                            batch_size=32,
                            epochs=5,
                            shuffle=True
                            )

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Creating the function for calculating the perfomance of the models

In [38]:
# function to calculate the metrics such as F1, Recall and Precision
from sklearn.metrics import confusion_matrix
def conf_matrix(model,xtest1,ytest1):
  y_pred = model.predict(xtest1)
  y_pred = np.argmax(y_pred, axis=1)
  conf_mat = confusion_matrix(ytest1, y_pred)
  TP=conf_mat[0][0]
  FP=conf_mat[0][1]
  FN=conf_mat[1][0]
  TN=conf_mat[1][1]
  precision=TP/(TP+FN)*100
  print('Precision:',precision)
  recall=(TP)/(TP+FN)*100
  print('Recall:',recall)
  F1_score=(2*TP)/(2*TP+FP+FN)*100
  print('F1 Score',F1_score)

  return precision,recall, F1_score, conf_mat


In [None]:
conf_matrix(dan_model1,test_input_ids,ytest)

Precision: 95.54533333333333
Recall: 95.54533333333333
F1 Score 97.72192637342407


(95.54533333333333,
 95.54533333333333,
 97.72192637342407,
 array([[143318,      0],
        [  6682,      0]]))

Creating the WAN(Weighted Average Network)

In [None]:
def create_wan_model(retrain_embeddings=False, 
                     max_sequence_length=SEQUENCE_LENGTH,
                     hidden_dim=100,
                     dropout=0.3,
                     learning_rate=0.001):
  """
  Construct the WAN model including the compilation and return it. Parametrize it using the arguments.
  :param retrain_embeddings: boolean, indicating whether the word embeddings are trainable
  :param hidden_dim: dimension of the hidden layer
  :param dropout: dropout applied to the hidden layer

  :returns: the compiled model
  """
  ### YOUR CODE HERE

  #if embedding_initializer == 'word2vec':
      #embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix)
  #else:
      #embeddings_initializer='uniform'

  wan_input_layer = tf.keras.layers.Input(shape=(max_sequence_length,), dtype='int64') #fake input layer
 

  wan_embedding_layer = Embedding(embedding_matrix.shape[0],
                                    embedding_matrix.shape[1],
                                    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                    input_length=SEQUENCE_LENGTH,
                                    trainable=retrain_embeddings)
  
  
  wan_embeddings = wan_embedding_layer(wan_input_layer)
  
  #query
  wan_query_layer = tf.keras.layers.Dense(embedding_matrix.shape[1]) #query
  
  wan_one_vector = tf.Variable(tf.ones((1, 1, 1)))
  
  wan_batch_of_ones = tf.tile(wan_one_vector, (tf.shape(wan_input_layer)[0], 1, 1)) 

  wan_query_vector = wan_query_layer(wan_batch_of_ones)

    
  wan_avg_embeddings = tf.keras.layers.Lambda(lambda x: K.mean(x, axis=1),name='averaging')(wan_embeddings)

  wan_attention_output,wan_attention_weights= tf.keras.layers.Attention()([wan_query_vector,wan_embeddings],return_attention_scores=True)

  
  wan_attention_output = tf.keras.layers.Reshape((wan_attention_output.shape[-1],))(wan_attention_output)


  
  last_hidden_output = tf.keras.layers.Dense(100, activation='relu',
                                                   name='wan_hidden_%d' % i)(wan_attention_output)

  last_hidden_output = tf.keras.layers.Dropout(dropout)(last_hidden_output)

  wan_classification = tf.keras.layers.Dense(1,
                                               activation='sigmoid',
                                               name='classification')(last_hidden_output)

    
  wan_model1 = tf.keras.models.Model(inputs=wan_input_layer, outputs=[wan_classification,wan_attention_weights])
    
    
  wan_model1.compile(loss=['binary_crossentropy'],
                      optimizer=tf.keras.optimizers.Adam(learning_rate=0.001,
                                                         beta_1=0.9,
                                                         beta_2=0.999,
                                                         epsilon=1e-07,
                                                         amsgrad=False,
                                                         name='Adam'),
                      metrics='accuracy')
  
  print(wan_model1.summary())
  
  return wan_model1


In [None]:

wan_model = create_wan_model()
history_wan = wan_model.fit(train_input_ids,
                            ytrain,
                            validation_data=(test_input_ids,ytest),
                            batch_size=32,
                            epochs=10,
                            shuffle=True
                            )
# use wan_history = ... below



Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 100)]        0           []                               
                                                                                                  
 tf.compat.v1.shape (TFOpLambda  (2,)                0           ['input_2[0][0]']                
 )                                                                                                
                                                                                                  
 tf.__operators__.getitem (Slic  ()                  0           ['tf.compat.v1.shape[0][0]']     
 ingOpLambda)                                                                                     
                                                                                            

Running the perfomance model

In [None]:
y_pred = wan_model.predict(test_input_ids)
y_pred=y_pred[0:150000][0]
y_pred=np.array(y_pred)
y_pred = np.argmax(y_pred, axis=1)
conf_mat = confusion_matrix(ytest, y_pred)
TP=conf_mat[0][0]
FP=conf_mat[0][1]
FN=conf_mat[1][0]
TN=conf_mat[1][1]
precision=TP/(TP+FN)*100
print('Precision:',precision)
recall=(TP)/(TP+FN)*100
print('Recall:',recall)
F1_score=(2*TP)/(2*TP+FP+FN)*100
print('F1 Score:',F1_score)
conf_mat

Precision: 95.54533333333333
Recall: 95.54533333333333
F1 Score: 97.72192637342407


array([[143318,      0],
       [  6682,      0]])

From this section on we will be using BERT based models, inorder to do that we will start with loading the BERT tokenizer and the BERT model to be used later on for experimentation

In [15]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [16]:
bert_model = TFBertModel.from_pretrained('bert-base-cased')

Downloading:   0%|          | 0.00/527M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [17]:
# num_train_examples = len(X_train)*.1
# num_test_examples = len(X_test)*.1
# num_val_examples=len(X_valid)*.1
max_length = 50

Running the tokenizer for BERT model

In [28]:
# num_train_examples = 2500000
num_train_examples = 500000
num_test_examples = 200000
num_val_examples=200000
max_length = 50


x_train = bert_tokenizer(X_train[:num_train_examples], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

x_test = bert_tokenizer(X_test[:num_test_examples], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')

x_val= bert_tokenizer(X_valid[:num_val_examples], 
              max_length=max_length,
              truncation=True,
              padding='max_length', 
              return_tensors='tf')


# x_train = bert_tokenizer(X_train, 
#               max_length=max_length,
#               truncation=True,
#               padding='max_length', 
#               return_tensors='tf')

# x_test = bert_tokenizer(X_test, 
#               max_length=max_length,
#               truncation=True,
#               padding='max_length', 
#               return_tensors='tf')

# x_val= bert_tokenizer(X_valid, 
#               max_length=max_length,
#               truncation=True,
#               padding='max_length', 
#               return_tensors='tf')




def select_min_length_examples(x_data, y_data):

  x_input_ids = []
  y_labels = []

  for ((input_ids, masks), label) in zip(zip(x_data['input_ids'], x_data['attention_mask']), y_data):
    if masks[-1] == 1:
      x_input_ids.append(input_ids)
      y_labels.append(label)

  return np.array(x_input_ids), np.array(y_labels) 


In [29]:
y_train = y_train[:num_train_examples]
y_test = y_test[:num_test_examples]
y_val = y_valid[:num_val_examples]

In [30]:
x_train

{'input_ids': <tf.Tensor: shape=(500000, 50), dtype=int32, numpy=
array([[  101, 18670,   157, ...,  1821,  1181,   102],
       [  101, 23875,   193, ...,     0,     0,     0],
       [  101,   178,  1830, ...,     0,     0,     0],
       ...,
       [  101,   188,  2737, ...,     0,     0,     0],
       [  101,  9366,  2895, ...,     0,     0,     0],
       [  101, 18670,   164, ...,   184,  1830,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(500000, 50), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(500000, 50), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]], dtype

In [31]:
x_val

{'input_ids': <tf.Tensor: shape=(200000, 50), dtype=int32, numpy=
array([[  101, 23875,   193, ...,     0,     0,     0],
       [  101,   193, 27411, ...,     0,     0,     0],
       [  101,   188,  2737, ...,     0,     0,     0],
       ...,
       [  101,   188,  2737, ...,     0,     0,     0],
       [  101, 23875,   193, ...,     0,     0,     0],
       [  101, 23875,   193, ...,   187, 15093,   102]], dtype=int32)>, 'token_type_ids': <tf.Tensor: shape=(200000, 50), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(200000, 50), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]], dtype

Running the BERT model on the input ids for creating the Embedding

In [32]:
bert_train_input_ids, bert_train_labels = select_min_length_examples(x_train, y_train)
bert_test_input_ids, bert_test_labels = select_min_length_examples(x_test, y_test)
bert_val_input_ids, bert_val_labels = select_min_length_examples(x_val, y_val)

In [33]:
SEQUENCE_LENGTH = 50

Creating the BERT CLS with the CLS token in the BERT model. For this we will use the BERT input ids and the pooled tokens for this model

In [34]:
def create_bert_cls_model(hidden_size = 100, 
                          dropout=0.3,
                          learning_rate=0.0005):
    """
    Build a simple classification model with BERT. Use the CLS Token output for classification purposes.
    """


    

    ### YOUR CODE HERE
    input_ids = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int64, name='input_ids_layer')
    #token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    #attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    
    bert_inputs = {'input_ids': input_ids}  

 

    bert_out = bert_model(bert_inputs)

    pooled_token = bert_out[1]
    #cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(pooled_token)


    hidden = tf.keras.layers.Dropout(dropout)(hidden)  

    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)
    
    classification_model = tf.keras.Model(inputs=[input_ids], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                                 metrics='accuracy')


    ### END YOUR CODE
    print(classification_model.summary())
    return classification_model

In [35]:
bert_classification_model = create_bert_cls_model()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids_layer (InputLayer  [(None, 50)]             0         
 )                                                               
                                                                 
 tf_bert_model (TFBertModel)  TFBaseModelOutputWithPoo  108310272
                             lingAndCrossAttentions(l            
                             ast_hidden_state=(None,             
                             50, 768),                           
                              pooler_output=(None, 76            
                             8),                                 
                              past_key_values=None, h            
                             idden_states=None, atten            
                             tions=None, cross_attent            
                             ions=None)                    

In [36]:
b_model = create_bert_cls_model()
history_wan = b_model.fit(bert_train_input_ids,
                            bert_train_labels,
                            validation_data=(bert_val_input_ids,bert_val_labels),
                            batch_size=8,
                            epochs=2
                            )

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids_layer (InputLayer  [(None, 50)]             0         
 )                                                               
                                                                 
 tf_bert_model (TFBertModel)  TFBaseModelOutputWithPoo  108310272
                             lingAndCrossAttentions(l            
                             ast_hidden_state=(None,             
                             50, 768),                           
                              pooler_output=(None, 76            
                             8),                                 
                              past_key_values=None, h            
                             idden_states=None, atten            
                             tions=None, cross_attent            
                             ions=None)                    

In [39]:
conf_matrix(b_model,bert_val_input_ids,bert_val_labels)

Precision: 80.43611793611794
Recall: 80.43611793611794
F1 Score 89.15744680851064


(80.43611793611794,
 80.43611793611794,
 89.15744680851064,
 array([[36666,     0],
        [ 8918,     0]]))

In [None]:
def create_bert_cls_model1(hidden_size = 100, 
                          dropout=0.3,
                          learning_rate=0.0005):
    """
    Build a simple classification model with BERT. Use the CLS Token output for classification purposes.
    """


    

    ### YOUR CODE HERE
    input_ids = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), dtype=tf.int64, name='input_ids_layer')
    #token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    #attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    
    bert_inputs = {'input_ids': input_ids}  

 

    bert_out = bert_model(bert_inputs)

    #pooled_token = bert_out[1]
    cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)


    hidden = tf.keras.layers.Dropout(dropout)(hidden)  

    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)
    
    classification_model = tf.keras.Model(inputs=[input_ids], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                                 metrics='accuracy')


    ### END YOUR CODE
    print(classification_model.summary())
    return classification_model

In [None]:
bert_classification_model1 = create_bert_cls_model1()

Model: "model_13"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids_layer (InputLayer  [(None, 50)]             0         
 )                                                               
                                                                 
 tf_bert_model (TFBertModel)  multiple                 108310272 
                                                                 
 tf.__operators__.getitem_5   (None, 768)              0         
 (SlicingOpLambda)                                               
                                                                 
 hidden_layer (Dense)        (None, 100)               76900     
                                                                 
 dropout_50 (Dropout)        (None, 100)               0         
                                                                 
 classification_layer (Dense  (None, 1)                101

In [None]:
b_model1 = create_bert_cls_model1()
history_wan1 = b_model1.fit(bert_train_input_ids,
                            bert_train_labels,
                            validation_data=(bert_val_input_ids,bert_val_labels),
                            batch_size=8,
                            epochs=2
                            )

Model: "model_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids_layer (InputLayer  [(None, 50)]             0         
 )                                                               
                                                                 
 tf_bert_model (TFBertModel)  multiple                 108310272 
                                                                 
 tf.__operators__.getitem_7   (None, 768)              0         
 (SlicingOpLambda)                                               
                                                                 
 hidden_layer (Dense)        (None, 100)               76900     
                                                                 
 dropout_52 (Dropout)        (None, 100)               0         
                                                                 
 classification_layer (Dense  (None, 1)                101



Epoch 2/2


In [None]:
conf_matrix(b_model1,bert_val_input_ids,bert_val_labels)

Recall: 95.66510837229069
F1 Score: 97.78453518679409


(95.66510837229069, 97.78453518679409)

Creating a BERT CLS model with the pretraining of the BERT model turned on

In [None]:
def create_bert_classification_model(bert_model,
                                     train_layers=-1,
                                     hidden_size = 200, 
                                     dropout=0.3,
                                     learning_rate=0.001):
    """
    Build a simple classification model with BERT. Use the Pooled Output for classification purposes
    """
    if train_layers == -1:
        # Freeze all layers of pre-trained BERT model
        bert_model.trainable = False

    else:
        # Restrict training to the train_layers outer transformer layers
        retrain_layers = []

        for retrain_layer_number in range(train_layers):

            layer_code = '_' + str(11 - retrain_layer_number)
            retrain_layers.append(layer_code)
          
        
        print('retrain layers: ', retrain_layers)

        for w in bert_model.weights:
            if not any([x in w.name for x in retrain_layers]):
                print('freezing: ', w)
                w._trainable = False

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}      

    bert_out = bert_model(bert_inputs)

    pooled_token = bert_out[1]
    #cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(pooled_token)


    hidden = tf.keras.layers.Dropout(dropout)(hidden)  


    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)
    
    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                                 metrics='accuracy')
    
    return classification_model

In [None]:
bert_classification_model = create_bert_classification_model(bert_model, train_layers=-1)

In [None]:
bert_classification_model.summary()

Model: "model_16"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask_layer (InputLay  [(None, 50)]        0           []                               
 er)                                                                                              
                                                                                                  
 input_ids_layer (InputLayer)   [(None, 50)]         0           []                               
                                                                                                  
 token_type_ids_layer (InputLay  [(None, 50)]        0           []                               
 er)                                                                                              
                                                                                           

In [None]:
xtrain_id=np.array(x_train.input_ids)
xtrain_token=np.array(x_train.token_type_ids)
xtrain_attn=np.array(x_train.attention_mask)
y_train=np.array(y_train)
xtest_id=np.array(x_test.input_ids)
xtest_token=np.array(x_test.token_type_ids)
xtest_attn=np.array(x_test.attention_mask)
y_test=np.array(y_test)

In [None]:
bert_classification_model_history = bert_classification_model.fit(
    [xtrain_id, xtrain_token, xtrain_attn],
    y_train[:50000],
    validation_data=([xtest_id, xtest_token, xtest_attn], y_test[:20000]),
    batch_size=32,
    epochs=2
)  

Epoch 1/2
Epoch 2/2


In [None]:
conf_matrix(bert_classification_model,[xtest_id, xtest_token, xtest_attn],y_test[:20000])

Recall: 95.585
F1 Score: 97.74266942761459


(95.585, 97.74266942761459)

In [None]:
def create_bert_train(bert_model,
                                     train_layers=1,
                                     hidden_size = 200, 
                                     dropout=0.3,
                                     learning_rate=0.001):
    """
    Build a simple classification model with BERT. Use the Pooled Output for classification purposes
    """
    if train_layers == -1:
        # Freeze all layers of pre-trained BERT model
        bert_model.trainable = False

    else:
        # Restrict training to the train_layers outer transformer layers
        retrain_layers = []

        for retrain_layer_number in range(train_layers):

            layer_code = '_' + str(11 - retrain_layer_number)
            retrain_layers.append(layer_code)
          
        
        print('retrain layers: ', retrain_layers)

        for w in bert_model.weights:
            if not any([x in w.name for x in retrain_layers]):
                print('freezing: ', w)
                w._trainable = False

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}      

    bert_out = bert_model(bert_inputs)

    pooled_token = bert_out[1]
    #cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(pooled_token)


    hidden = tf.keras.layers.Dropout(dropout)(hidden)  


    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)
    
    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                                 metrics='accuracy')
    
    return classification_model

In [None]:
bert_classification_model_1 = create_bert_train(bert_model, train_layers=1)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
       -4.70929965e-02,  2.04659086e-02,  2.65777670e-02, -6.90833628e-02,
       -2.86339247e-03,  3.58792543e-02,  5.09714521e-02, -2.22123563e-02,
        5.75837865e-02,  6.72026677e-03,  6.09670905e-03, -5.32138720e-02,
        3.30908857e-02,  4.07863855e-02, -1.12298056e-02, -4.83246632e-02,
       -5.72264008e-02,  1.66005548e-02, -2.49155685e-02,  2.57391017e-04,
       -9.65425670e-02,  5.47939055e-02, -2.25652978e-02, -3.11921537e-02,
        3.22624168e-04,  1.30029619e-02, -1.47148818e-01, -3.11547685e-02,
        3.28753889e-03, -8.21698755e-02, -2.69751232e-02,  1.11082476e-02,
        7.98455551e-02,  4.38706763e-02, -3.99685875e-02,  4.98550944e-02,
       -6.97232559e-02,  7.49441907e-02,  1.80163682e-02,  4.15917195e-04,
       -2.23934315e-02, -1.51800066e-02,  6.57838956e-02, -3.04725263e-02,
        6.04042262e-02,  1.47098145e-02,  4.14339751e-02,  4.95331828e-03,
       -6.65923432e-02,  7.61813894

In [None]:
bert_classification_model_history1 = bert_classification_model_1.fit(
    [xtrain_id, xtrain_token, xtrain_attn],
    y_train[:50000],
    validation_data=([xtest_id, xtest_token, xtest_attn], y_test[:20000]),
    batch_size=32,
    epochs=2
)  

Epoch 1/2
Epoch 2/2


In [None]:
conf_matrix(bert_classification_model_1,[xtest_id, xtest_token, xtest_attn],y_test[:20000])

Recall: 95.585
F1 Score: 97.74266942761459


(95.585, 97.74266942761459)

Building the BERT average model, here we will use the pooled token with the averaging them and then running through the transformer layers

In [None]:
def create_bert_avg_model(hidden_size = 100, 
                                dropout=0.3,
                                learning_rate=0.00005):
    """
    Build a simple classification model with BERT. Use the average of the BERT output tokens
    """

    ### YOUR CODE HERE
    ### YOUR CODE HERE
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    #token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    #attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    
    bert_inputs = {'input_ids': input_ids}  

 

    bert_out = bert_model(bert_inputs)


    pooled_token = bert_out[0][:, 1:-1, :]
    #cls_token = bert_out[0][:, 0, :]
    bert_average=tf.math.reduce_mean(pooled_token,axis=1)

    #bert_average= tf.keras.layers.Reshape((bert_average.shape[-1],))

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(bert_average)


    hidden = tf.keras.layers.Dropout(dropout)(hidden)  

    classification = tf.keras.layers.Dense(1, activation='sigmoid',name='classification_layer')(hidden)
    
    classification_model = tf.keras.Model(inputs=[input_ids], outputs=[classification])
    
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False), 
                                 metrics='accuracy')


    ### END YOUR CODE
    print(classification_model.summary())
    return classification_model


In [None]:
b_model1 = create_bert_avg_model()
history_wan = b_model1.fit(bert_train_input_ids,
                            bert_train_labels,
                            validation_data=(bert_test_input_ids,bert_test_labels),
                            batch_size=8,
                            epochs=2
                            )

Model: "model_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_ids_layer (InputLayer  [(None, 50)]             0         
 )                                                               
                                                                 
 tf_bert_model (TFBertModel)  multiple                 108310272 
                                                                 
 tf.__operators__.getitem_8   (None, 48, 768)          0         
 (SlicingOpLambda)                                               
                                                                 
 tf.math.reduce_mean (TFOpLa  (None, 768)              0         
 mbda)                                                           
                                                                 
 hidden_layer (Dense)        (None, 100)               76900     
                                                          

In [None]:
conf_matrix(b_model1,bert_test_input_ids,bert_test_labels)

Recall: 95.58823529411765
F1 Score: 97.74436090225564


(95.58823529411765, 97.74436090225564)

For the last model we will use a CNN layer on top of the BERT model, BERT will take the features created from the CNN, for the the tokesn we will use the input ids only

In [None]:
def create_bert_cnn_model(hidden_size = 100, 
                                learning_rate=0.00005,
                                num_filters = [100, 100, 50, 25],
                                kernel_sizes = [3, 5, 10, 20],
                                dropout = 0.3):
    """
    Build a  classification model with BERT, where you apply CNN layers  to the BERT output
    """

    ### YOUR CODE HERE

    input_ids = tf.keras.layers.Input(shape=(50,), dtype=tf.int64, name='input_ids_layer')
    #token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    #attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids}  

    bert_out = bert_model(bert_inputs)
 


    #pooled_token = bert_out[1]
    #pooled_token = bert_out[0]
    #cls_token = bert_out[0][:, 0, :]
  #bert_average=tf.math.reduce_mean(pooled_token,axis=1)


    conv_layers_for_all_kernel_sizes = []
    for kernel_size, filters in zip(kernel_sizes, num_filters):
        conv_layer = keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(bert_out[0])
        conv_layer = keras.layers.GlobalMaxPooling1D()(conv_layer)
        conv_layers_for_all_kernel_sizes.append(conv_layer)
    
    conv_output = keras.layers.concatenate(conv_layers_for_all_kernel_sizes, axis=1)
    #last_hidden_output = keras.layers.Dropout(rate=dropout)(conv_output)

    
    last_hidden_output = keras.layers.Dense(hidden_size, activation='relu')(conv_output)
    last_hidden_output = keras.layers.Dropout(dropout)(last_hidden_output)

    #flatt=keras.layers.Flatten()(last_hidden_output)
    #cnn_prediction = keras.layers.Dense(1, activation='sigmoid')(last_hidden_output)
    cnn_prediction = keras.layers.Dense(1, activation='sigmoid')(last_hidden_output)

    classification_model = keras.Model(inputs=input_ids, outputs=cnn_prediction)
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                      loss='binary_crossentropy',  # From information theory notebooks.
                      metrics=['accuracy'])
   
    ### END YOUR CODE
    
    print(classification_model.summary())
    return classification_model

In [None]:
cnn_bert_model=create_bert_cnn_model()
cnn_bert_model.summary

Model: "model_19"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids_layer (InputLayer)   [(None, 50)]         0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    multiple             108310272   ['input_ids_layer[0][0]']        
                                                                                                  
 conv1d (Conv1D)                (None, 48, 100)      230500      ['tf_bert_model[9][0]']          
                                                                                                  
 conv1d_1 (Conv1D)              (None, 46, 100)      384100      ['tf_bert_model[9][0]']          
                                                                                           

<bound method Model.summary of <keras.engine.functional.Functional object at 0x7f4303275490>>

In [None]:
cnn_bert_history = cnn_bert_model.fit(bert_train_input_ids,
                            bert_train_labels,
                            validation_data=(bert_test_input_ids,bert_test_labels),
                            batch_size=8,
                            epochs=2
                            )

Epoch 1/2
Epoch 2/2


In [None]:
conf_matrix(cnn_bert_model,bert_test_input_ids,bert_test_labels)

Recall: 95.58823529411765
F1 Score: 97.74436090225564


(95.58823529411765, 97.74436090225564)