# Setup
based on https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb#scrollTo=191zq3ZErihP

In [0]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.108.13.194:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 861201113876687645),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 7509863777384350548),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 10957324747785543829),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 16010609198301123736),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 15642925627654617544),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 202491004858483873),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 7185590080412384499),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 9450033752344641433),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 59370890480492

W0802 06:34:07.008509 139845769774976 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



# Data Ingestion


## Data Preparation

In [0]:
#This file is written to ingest data from i2b2
#below are the requried library
from xml.dom import minidom # need this to read xlm files
import xml.etree.ElementTree as ET
import os
import pandas as pd
import random
import nltk
import string
from sklearn.preprocessing import LabelEncoder
from pandas import DataFrame
import nltk.data
from nltk import sent_tokenize
nltk.download('punkt') #this package needs to be downloaded separately
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas import DataFrame
import re
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### GDrive Setup

We first mount the google drive containing the training and test fiels

In [0]:
from google.colab import drive # this sets the file path to your personal google drive. You will need to enter the authorization code each time. 
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


## Creating List of Files to be Ingested

We then created two list - each contains the list of the file names for training and testing.

1.   train_filelist = 790 EHR records
2.   test_filelist=514 EHR records



In [1]:
# data processing created with the help of teaching assistant Sudha Subramanian, who previously worked with the same dataset

train_filelist=[]

for file in os.listdir('/gdrive/My Drive/w266_NLP/training-PHI'):#set your file path here
  filename = os.fsdecode(os.fsencode('/gdrive/My Drive/w266_NLP/training-PHI/'+file))
  if filename.endswith( ('.xml') ): # select xml files
    train_filelist.append(filename)

print("There are {} training file".format(len(train_filelist))) #check that the number of training file is 790 records for 178 patients

In [2]:
test_filelist=[]

for file in os.listdir('/gdrive/My Drive/w266_NLP/test-PHI'):#set your file path here
  filename = os.fsdecode(os.fsencode('/gdrive/My Drive/w266_NLP/test-PHI/'+file))
  if filename.endswith( ('.xml') ): # select xml files
    test_filelist.append(filename)

print("There are {} test file".format(len(test_filelist))) #check that the number of test file is 514 records for 178 patients

# Process Data Annotation

The tag generator process the annotation into a dataframe.

In [0]:
def tag_generator(file):
  '''The function extract the tags from the EHR record and turn them into pd dataframe'''
  tree = ET.parse(file)
  root=tree.getroot()
  
  PHI_category=['NAME','PROFESSION','LOCATION','AGE','DATE','CONTACT','ID']# Here are the seven PHI category defined by i2b2
  #PHI_category=[category]
  tag_list=[]#An empty list to hold all dictionary items
  for category in PHI_category:
    for tag in root.iter():
      if tag.tag==category:#skip if a specific tag is not found
          tag.attrib['Category']=category #add a column on category
          tag.attrib['File']=file[len(file)-10:len(file)-4] # add a column to indicate file name
          tag_list.append(tag.attrib)
  temp_df=pd.DataFrame(tag_list)
      
  return temp_df

In [0]:
def note_generator(file):
  #'''This function breakdown inidividaul EHR text note into sentences using XML tags, divided by new line and period'''
    tree = ET.ElementTree(file=file)
    root = tree.getroot()
    all_notes = []

    text = root.find('TEXT').text
    sentences = [sent.split('\n') for sent in sent_tokenize(text) if sent!='\n']
    

    for text in sentences:#this part ignore empty lines
        for sub_item in text:
            if sub_item.replace(' ','') != '':
                all_notes.append(sub_item)    
    
    return all_notes

### Install Bert tokenization 

In [0]:
!pip install bert-tensorflow    # this replaces the bert github clone
!pip install keras
import tensorflow as tf
import tensorflow_hub as hub

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from keras import backend as K



W0802 06:34:59.126397 139845769774976 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

Using TensorFlow backend.


In [0]:
# We use the case model here
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

W0802 06:35:05.065534 139845769774976 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



In [0]:
def sentence_encoding(file):#this function is looped within the token_annotator function
  
  sentence_list=note_generator(file) #generate a list of sentences from tex
  
  df=tag_generator(file)
  text_list=df['text'].tolist() #generate a list of tag "TEXT"
  type_list=df['TYPE'].tolist() #generate a list of tag "type"
  category_list=df['Category'].tolist() #generate a list of tag "category"
  
  processed_sentence=[]
  processed_text=[]
  processed_type=[]
  processed_category=[]
  
  def findWholeWord(w):#this function finds a word within a string broken down by regular expression (case sensitive)
    return re.compile(r'\b({0})\b'.format(w)).search
  
  for sentence in sentence_list:
     for text in text_list:
        if findWholeWord(text)(sentence)!=None:
          processed_sentence.append(sentence)
          processed_text.append(text)
          processed_type.append(type_list[text_list.index(text)])
          processed_category.append(category_list[text_list.index(text)])

  
  temp_df=pd.DataFrame({'Sentence':processed_sentence, 'Word':processed_text, 'Type':processed_type, 'Category':processed_category})
  df = temp_df.drop_duplicates()
        
  return df
  #return sentence_list, text_list, type_list, category_list
  #return processed_sentence, processed_text, processed_type

# sentence_encoding(train_filelist[0])

In [0]:
def token_annotator(file):
  
  temp_df=sentence_encoding(file)#take the data frame and turn them into individual lists
  
  type_list=temp_df['Type'].tolist()
  temp_sentence_list=temp_df['Sentence'].tolist()
  word_list=temp_df['Word'].tolist()
  temp_unique_sentence_list=set(temp_sentence_list)
  sentence_list=list(temp_unique_sentence_list) #take out duplicate sentences
  
  tokenized_word=[] #separate individual text into words (e.g, Mia E. Tapia to "Mia","E.","Tapia")
  for phrase in word_list:
    tokenized_word.append(tokenizer.tokenize(phrase))
  
  tokenized_sentence=[]
  encoded_token=[]
  
  for i in range(len(sentence_list)): #tokenize the sentence and encode individual word
    token_list=tokenizer.tokenize(sentence_list[i])
    tokenized_sentence.append(token_list)
    temp_list=['O' for length in range(len(token_list))]
    for j in range(len(tokenized_word)):
      if all(elem in token_list for elem in tokenized_word[j])==True:
        #print(token_list, tokenized_word[j])
        for word in tokenized_word[j]:
          temp_list[token_list.index(word)]=(type_list[j])
          #print(temp_list)
    encoded_token.append(temp_list)
          
  return tokenized_sentence,encoded_token

#token_annotator(train_filelist[0])
  
 

In [0]:
def type_token_generator(file): 
  #this function convert all the text of a record into individual BERT tokenized list and generate type encoding list
  all_sentences=note_generator(file)
  tokenized_sentences=[]
  for sentence in all_sentences:
    tokenized_sentences.append(tokenizer.tokenize(sentence))
  
  type_token=[]
    
  sentence_list, encoded_token=token_annotator(file)
  
  for sentence in tokenized_sentences:
    if sentence in sentence_list:
      type_token.append(encoded_token[sentence_list.index(sentence)])
    else:
      type_token.append(['O'for i in range(len(sentence))])
  
  label_list=[]
  label_dict={"O":0, "DATE":1, "DOCTOR":2,"HOSPITAL":3,'PATIENT':4,'AGE':5,'MEDICALRECORD':6,'CITY':7,'STATE':8,'PHONE':9,'USERNAME':10,'IDNUM':11,'PROFESSION':12,'STREET':13,'ZIP':14,'ORGANIZATION':15,'COUNTRY':16,'FAX':17,'DEVICE':18,'EMAIL':19,'LOCATION-OTHER':20,'URL':21,'HEALTHPLAN':22,'BIOID':23}# ,'IPADDRESS':24,'ACCOUNT NUMBER':25}
  for type_list in type_token:# we convert the label to numerical for Bert training. We can add types here later. 
    label_list.append([label_dict.get(item,item)  for item in type_list])
    #label_list.append([0 if typetoken =='O' else 1 for typetoken in type_list])


  #return tokenized_sentences, type_token, label_list
  return tokenized_sentences, type_token, label_list #take a look at segment of the list to make sure the they are corect
# we were missing tokenized_sentences, type_token from the return, not sure why
                             


We use the cell below to double check that the encoding is correct. 

In [0]:
#DELETE - Check that the dictionary label functions properly
# unique_list=[]

# for i in range(789):#use 789 for train and 513 for test
#   label_list=type_token_generator(train_filelist[i])
#   for j in range(len(label_list)):
#     for item in label_list[j]:
#       if item not in unique_list:
#         unique_list.append(item)
    
# print(unique_list)

In [0]:
#unique_list.sort()
#print(unique_list)#we have 24 classes for training (0-23)
#test=[0, 1, 4, 6, 5, 3, 12, 2, 9, 11, 10, 13, 7, 8, 14, 18, 16, 15, 20, 19]
#train=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]

### Generating BERT array

In [0]:
def bert_array(file, max_seq_length):
  '''This function generates the 5 lists of array that is required to feed into the model'''
  
  token_sentence, type_token, label_list= type_token_generator(file)
  
  token_list=[]
  input_IDs=[]
  input_mask=[]#1 for non padding and 0 for padding
  segment_ID=[]
  label=[]
  
  for untrimmed_sentence in token_sentence:
    sentence=untrimmed_sentence[0:(max_seq_length)-2] #trim the list to allow space for CLS and SEP
    sentence.insert(0,'[CLS]')
    sentence.insert(len(sentence),'[SEP]')
    length_before_padding=len(sentence)
    temp_inputID=[1 for i in range(length_before_padding)]#insert 1 for [CLS] and [SEP] for mask
    sentence.extend(['[PAD]' for i in range(max_seq_length-len(sentence))])
    temp_inputID.extend([0 for i in range(max_seq_length-len(temp_inputID))])
    token_list.append(sentence)
    input_mask.append(temp_inputID)
    segment_ID.append([0 for i in range(max_seq_length)])
  
  for token in token_list:
    input_ids=tokenizer.convert_tokens_to_ids(token)
    input_IDs.append(input_ids)
  
  for untrimmed_item in label_list:
    item=untrimmed_item[0:(max_seq_length-2)]#trim the list to allow space for CLS and SEP
    item.insert(0,24)#class label 24 for CLS (Arnobio - you need to change 24 to 0 for binary)
    item.insert(len(item),25) #class label 25 for SEP (Arnobio - you need to change 25 to 0 for binary)
    item.extend([26 for i in range(max_seq_length-len(item))])  #class label 26 represents paddinging (ARnobio you need to change 26 to 0 for binary)
    label.append(item)
  
  
  return token_list, input_IDs, input_mask, segment_ID, label





# token='this'
# input_ids = tokenizer.convert_tokens_to_ids(['[CLS]'])


In [0]:
#check that the Bert Array function works properly
# max_seq_length = 20

# a,b,c,d,e=bert_array(train_filelist[78],max_seq_length)

# num=4
# print("Token list is:", a[num])
# print("ID list is:",b[num])      
# print("Input mask is:", c[num])
# print("Segment ID is:", d[num])
# print("Label is:", e[num])

# Generating data for BERT

In [0]:
# batch size
batch_size = 32

# create data
max_seq_length = 20

# num of files to retrieve
num_of_file_train = 200

In [0]:


def generate_train_data(num_of_file, max_seq_length):#Max number of file number is 789
  '''This function runs through a loop to append the tokens, input ids, input masks, segement id and labels to 5 individual np arrays '''
  temp_list0, temp_list1, temp_list2, temp_list3, temp_list4=[],[],[],[],[]
  for i in range(num_of_file):
    temp_data= bert_array(train_filelist[i],max_seq_length)
    
    for j in range(len(temp_data[0])):
      temp_list0.append(temp_data[0][j])
      temp_list1.append(temp_data[1][j])
      temp_list2.append(temp_data[2][j])
      temp_list3.append(temp_data[3][j])
      temp_list4.append(temp_data[4][j])
  
#   np_token_list=np.array(temp_list0)
#   np_input_ids=np.array(temp_list1)
#   np_input_masks=np.array(temp_list2)
#   np_segment_ids=np.array(temp_list3)
#   np_labels=np.array(temp_list4)
      
  #return np_token_list, np_input_ids, np_input_masks, np_segment_ids, np_labels
  return temp_list0, temp_list1, temp_list2, temp_list3, temp_list4

#change number of file here (MAX:789)


train_token_list, train_input_ids, train_input_masks, train_segment_ids, train_labels= generate_train_data(num_of_file_train,max_seq_length)

#print(train_token_list)
#check that the shape is correct
#print(train_input_ids.shape, train_input_masks.shape, train_token_list.shape, train_segment_ids.shape, train_labels.shape)

In [0]:
len(train_token_list)

17470

In [0]:
# x=np.array([train_token_list, train_input_ids, train_input_masks])

# print(x)

# y=[[123],[123],[456]]
# z=[['a,b,c'],['w','b','f'],['r,t,g']]
# w=[['sf'],['sdf'],['34,5,2']]

# a=np.array(y)
# b=np.array(z)
# c=np.array(w)
# d=[a,b,c]

# print(d)

In [0]:
num_of_file_test = 100

def generate_test_data(num_of_file, max_seq_length):#Max number of file is 513
  '''This function runs through a loop to append the tokens, input ids, input masks, segement id and labels to 5 individual np arrays '''
  temp_list0, temp_list1, temp_list2, temp_list3, temp_list4=[],[],[],[],[]
  for i in range(num_of_file):
    temp_data= bert_array(test_filelist[i],max_seq_length)
    
    for j in range(len(temp_data[0])):
      temp_list0.append(temp_data[0][j])
      temp_list1.append(temp_data[1][j])
      temp_list2.append(temp_data[2][j])
      temp_list3.append(temp_data[3][j])
      temp_list4.append(temp_data[4][j])
  
#   np_token_list=np.array(temp_list0)
#   np_input_ids=np.array(temp_list1)
#   np_input_masks=np.array(temp_list2)
#   np_segment_ids=np.array(temp_list3)
#   np_labels=np.array(temp_list4)
      
  #return np_token_list, np_input_ids, np_input_masks, np_segment_ids, np_labels
  return temp_list0, temp_list1, temp_list2, temp_list3, temp_list4

test_token_list, test_input_ids, test_input_masks, test_segment_ids, test_labels = generate_test_data(num_of_file_test,max_seq_length)
#print(test_input_ids.shape, test_input_masks.shape, test_token_list.shape, test_segment_ids.shape, test_labels.shape)

In [0]:
# # print(train_token_list[0])
# print(train_input_ids[0])
print(len(test_input_ids))
# # print(train_segment_ids[0])
# #print(type(test_labels))

8444


In [0]:
test_labels[0]

[24, 0, 0, 0, 1, 1, 1, 1, 0, 1, 25, 26, 26, 26, 26, 26, 26, 26, 26, 26]

### Save Data Arrays for Loading

In [0]:
# save tokens outside of notebook
%cd ..
np.save("train_token_list",train_token_list)
np.save("train_input_ids",train_token_list)
np.save("train_input_masks",train_token_list)
np.save("train_segment_ids",train_token_list)
np.save("train_labels",train_token_list)


np.save("test_token_list",test_token_list)
np.save("test_input_ids", test_input_ids)
np.save("test_input_masks",test_input_masks)
np.save("test_segment_ids",test_segment_ids)
np.save("test_labels",test_labels)
#%cd /gdrive

/


In [0]:
# download numpy arrays to local machine
from google.colab import files

# download train arrays
files.download('train_token_list.npy')
files.download('train_input_ids.npy')
files.download('train_input_masks.npy')
files.download('train_segment_ids.npy')
files.download('train_labels.npy')

# download test arrays
files.download('test_token_list.npy')
files.download('test_input_ids.npy')
files.download('test_input_masks.npy')
files.download('test_segment_ids.npy')
files.download('test_labels.npy')




# files manually uploaded to drive

### Load Data Arrays from Saved Files

In [0]:
# saved arrays are here https://drive.google.com/drive/u/1/folders/18uQWrQ5VO2tERtDg8VKcZUsiO0r6RhT2

#from google.colab import drive # this sets the file path to your personal google drive. You will need to enter the authorization code each time. 
#drive.mount('/gdrive')
cd /gdrive

E0731 05:37:06.803610 139950122833792 ultratb.py:152] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-92-2f74130f9697>", line 1, in <module>
    get_ipython().magic('cd /gdrive')
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2160, in magic
    return self.run_line_magic(magic_name, magic_arg_s)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2081, in run_line_magic
    result = fn(*args,**kwargs)
  File "</usr/local/lib/python3.6/dist-packages/decorator.py:decorator-gen-91>", line 2, in cd
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/magic.py", line 188, in <lambda>
    call = lambda f, *a, **k: f(*a, **k)
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/magics/osm.py", line 288, in cd
    oldcwd = py3compat.getcwd()
OSError: [Errno 107] Transport endpoint is not conne

OSError: ignored

In [0]:
from google.colab import files
uploaded = files.upload()

#Building the Bert Models

## Building out model A

In [0]:
# Partially based on and created with teh help with Joachim Rahmfeld and his work, as well as "BERT in Keras with Tensorflow hub" (https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b) 

#BERT_MODEL_HUB
class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=3,
        pooling="sequence_output",
        bert_path=BERT_MODEL_HUB,
        #bert_path="https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        #         if self.pooling == "first":
        #             trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
        #             trainable_layers = ["pooler/dense"]

        #         elif self.pooling == "sequence":
        trainable_vars = [
            var
            for var in trainable_vars
            if not "/cls/" in var.name and not "/pooler/" in var.name
        ]
        
        trainable_layers = []
        #         else:
        #             raise NameError(
        #                 f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
        #             )

                # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        #         if self.pooling == "first":
        #             pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
        #                 "pooled_output"
        #             ]
        #         elif self.pooling == "sequence":
        #             result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
        #                 "sequence_output"
        #             ]

        #             mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        #             masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
        #                     tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
        #             input_mask = tf.cast(input_mask, tf.float32)
        #             pooled = masked_reduce_mean(result, input_mask)
        #         else:
        #             raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")
                
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]
        
        mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [0]:

#num_labels = 25
#Here we build a custom loss function for our classes

def custom_loss(y_true, y_pred):
    """
    calculate loss function explicitly, filtering out 'extra inserted labels'
    
    y_true: Shape: (batch x (max_length + 1) )
    y_pred: predictions. Shape: (batch x x (max_length + 1) x num_distinct_ner_tokens ) 
    
    returns:  cost
    """

    #get labels and predictions
    
    y_label = tf.reshape(tf.layers.Flatten()(tf.cast(y_true, tf.int32)),[-1])
    print(y_label)
  
    
    mask = (y_label < 24)  #CLS=24, SEP=25, PAD=26 

    y_label_masked = tf.boolean_mask(y_label, mask)  # mask the labels
    
   
    y_flat_pred = tf.reshape(tf.layers.Flatten()(tf.cast(y_pred, tf.float32)),[-1, 27])
 
    
    y_flat_pred_masked = tf.boolean_mask(y_flat_pred, mask) # mask the predictions
    
    return tf.reduce_mean(sparse_categorical_crossentropy(y_label_masked, y_flat_pred_masked,from_logits=False ))

In [0]:
num_labels = 27

def custom_acc_orig_tokens(y_true, y_pred):
    """
    calculate loss dfunction filtering out also the newly inserted labels
    
    y_true: Shape: (batch x (max_length) )
    y_pred: predictions. Shape: (batch x x (max_length + 1) x num_distinct_ner_tokens ) 
    
    returns: accuracy
    """

    #get labels and predictions
    
    y_label = tf.reshape(tf.layers.Flatten()(tf.cast(y_true, tf.int64)),[-1])
    
    mask = (y_label < 24)
    y_label_masked = tf.boolean_mask(y_label, mask)
    
    y_predicted = tf.math.argmax(input = tf.reshape(tf.layers.Flatten()(tf.cast(y_pred, tf.float64)),\
                                                    [-1, num_labels]), axis=1)
    
    y_predicted_masked = tf.boolean_mask(y_predicted, mask)

    return tf.reduce_mean(tf.cast(tf.equal(y_predicted_masked,y_label_masked) , dtype=tf.float64))

In [0]:

# y_true = tf.constant([[27],[0]])
# print(y_true)
# y_label = tf.reshape(tf.layers.Flatten()(tf.cast(y_true, tf.int32)),[-1])
# print(y_label)
# mask = (y_label < 24)
# y_label_masked = tf.boolean_mask(y_label, mask)
# print(y_label_masked)
# y_flat_pred = tf.reshape(tf.layers.Flatten()(tf.cast(y_pred, tf.float32)),[-1, 27])
# print(y_flat_pred)
# y_flat_pred_masked = tf.boolean_mask(y_flat_pred, mask)

Tensor("Const_42:0", shape=(2, 1), dtype=int32)
Tensor("Reshape_42:0", shape=(2,), dtype=int32)
Tensor("boolean_mask_22/GatherV2:0", shape=(?,), dtype=int32)
Tensor("Reshape_43:0", shape=(2, 27), dtype=float32)


In [0]:
#Check that custom_loss works WHY DOESNT THIS WORK!!!????
sess.close()
y_true = tf.constant([[28],[0]]) #0.5108 should the correct answer
print(y_true)

y_pred = tf.constant([
    [0.0,0,0,0.6,0,0,0,0,0,0,0,0,0,0,0,0,0,0.4,0,0,0,0,0,.4,0,0.5,0],
    [0.6,0.4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,.4,0,0,0],
])
print(y_pred)
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

print(custom_loss(y_true, y_pred).eval())

Tensor("Const:0", shape=(2, 1), dtype=int32)
Tensor("Const_1:0", shape=(2, 27), dtype=float32)




Tensor("Reshape:0", shape=(2,), dtype=int32)
0.84729946


In [0]:
# import numpy as np
# -np.log(0.6)

0.5108256237659907

In [0]:
from tensorflow.keras.backend import sparse_categorical_crossentropy

# Build model
def build_model(max_seq_length): 
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    #print(in_id, in_mask, in_segment)
    bert_inputs = [in_id, in_mask, in_segment]
    
    bert_output = BertLayer(n_fine_tune_layers=3, pooling="sequence_output")(bert_inputs)
    
    #print(bert_output)
    
    dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
    
    dense = tf.keras.layers.Dropout(rate=0.1)(dense)#random drop out to prevent overfitting
    
    pred = tf.keras.layers.Dense(27, activation='sigmoid')(dense)#Arnobio: 2 for binary class (not sure why) need to change output shpae to reflect number of classes
    
    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    
    
#     losses = custom_loss#from Joachim's notebook
    
#     model.compile(loss=losses, optimizer='adam', metrics=['accuracy'])
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[custom_acc_orig_tokens])
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# sparse_categorical_crossentropy works, accuracy function was not working
    
    
    model.summary()
    
    return model

#build_model(32)
def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)

###Train the BERT model A

### Data Preparation
Set up data to mimic Joachim's data input

We first load the process numpy array to Colab

In [0]:
#Here we import the preprocessed dataset from Google Drive. Sync the google drive file with your window so you can select from the drop down menu.


train_token_list1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/train_token_list.npy',encoding='bytes')
train_token_list2=np.array(train_token_list1)
# test_token_list1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/test_token_list.npy')
# train_input_ids1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/train_input_ids.npy')
# test_input_ids1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/test_input_ids.npy')
# train_input_masks1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/train_input_masks.npy')
# test_input_masks1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/test_input_masks.npy')
# train_labels1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/train_labels.npy')
# test_labels1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/test_labels.npy')
# train_segment_ids1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/train_segment_ids.npy')
# test_segment_ids1 = np.load('/gdrive/My Drive/w266_NLP/arrays-multiclass/test_segment_ids.npy')


In [0]:
print(train_token_list1[0][4])
print(train_token_list[0][4])
print(type(train_token_list1[0][4]))
print(type(train_token_list[0][4]))
print(train_token_list1)
print(train_token_list)

210
210
<class 'numpy.str_'>
<class 'str'>


In [0]:
#train_token_list, train_input_ids, train_input_masks, train_segment_ids, train_la

X_train = np.array([train_input_ids,train_input_masks,train_segment_ids])
X_test = np.array([test_input_ids,test_input_masks,test_segment_ids])
train_label1=np.array(train_labels)
test_label1=np.array(test_labels)

print(X_train.shape)#confirm it's (3,73722,20) for training
print(X_test.shape) #confirm it's (3, 47683, 20) for test

(3, 17470, 20)
(3, 8444, 20)


In [0]:
num=101
print(X_test[1][num])
print(test_label1[num])

[1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0]
[24  0  0  0  0  0  0  0 25 26 26 26 26 26 26 26 26 26 26 26]


In [0]:
k_start = 0
k_end_train = 960
k_end_dev_start_test = 320
k_end_test = k_end_dev_start_test + 320

bert_inputs_train_k = [X_train[0][k_start:k_end_train], X_train[1][k_start:k_end_train], 
                       X_train[2][k_start:k_end_train]]
bert_inputs_dev_k = [X_test[0][k_start:k_end_dev_start_test], X_test[1][k_start:k_end_dev_start_test], 
                      X_test[2][k_start:k_end_dev_start_test]]
bert_inputs_test_k = [X_test[0][k_end_dev_start_test:k_end_test], X_test[1][k_end_dev_start_test:k_end_test], 
                      X_test[2][k_end_dev_start_test:k_end_test]]

bert_train_label = train_label1[k_start:k_end_train]
bert_dev_label = test_label1[k_start:k_end_dev_start_test]
bert_test_label = test_label1[k_end_dev_start_test:k_end_test]

print(len(bert_inputs_train_k))
print(bert_train_label.shape)
print(bert_dev_label.shape)
print(bert_test_label.shape)

3
(960, 20)
(320, 20)
(320, 20)


### Run Model

In [0]:
sess.close()

In [0]:
#keras.backend.get_session().run(tf.global_variables_initializer())

# # #https://stackoverflow.com/questions/34001922/failedpreconditionerror-attempting-to-use-uninitialized-in-tensorflow
# sess = tf.InteractiveSession()

# sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

model = build_model(max_seq_length)

# 

sess = tf.InteractiveSession()

# initialize_vars(sess)

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

# model.fit(
#     bert_inputs_train_k, 
#     {"ner": labels_train_k },
#     validation_data=(bert_inputs_test_k, {"ner": labels_test_k }),
#     epochs=8,
#     batch_size=32#,
#     #callbacks=[tensorboard]

model.fit(
    bert_inputs_train_k, 
    bert_train_label,
    validation_data=(bert_inputs_dev_k,bert_dev_label),
    epochs=1,
    batch_size=32#if we change this to input dimension then we solve the 32 problem. 
)



Model: "model_6"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 20)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 20)]         0                                            
__________________________________________________________________________________________________
bert_layer_6 (BertLayer)        (None, None, 768)    108931396   input_ids[0][0]                  
                                                                 input_masks[0][0]          



Train on 960 samples, validate on 320 samples


<tensorflow.python.keras.callbacks.History at 0x7f32c042d0f0>

### Save Model



In [0]:
# save model
# sess.close()

# sess = tf.InteractiveSession()

# # initialize_vars(sess)

# sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

# model.save('modelA-1.h5')    

# sess.close()

NameError: ignored

In [0]:
# save weights for the model
# sess.close()

# sess = tf.InteractiveSession()

# # initialize_vars(sess)

# sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

# model.save_weights('modelA_weights.h5')
# sess.close()
# reference for saving and downloading models https://stackoverflow.com/questions/48924165/google-colaboratory-weight-download-export-saved-models



In [0]:
# download model to local machine
from google.colab import files

# download train arrays
files.download('modelA.h5')
files.download('modelA_weights.h5')

# files manually uploaded to drive (models folder) 

###Error Analysis

In [0]:
# sess = tf.InteractiveSession()

# sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

result = model.predict(
    bert_inputs_test_k, 
    batch_size=32)

In [0]:
#Save result for future reference.
%cd ..
np.save('result_modelA1',result)

from google.colab import files
files.download('result_modelA1.npy')
np.save('test_label_modelA1',bert_test_label)
files.download('test_label_modelA1.npy')

/


In [0]:
num=77
print("The shape of result is:",result.shape)
print("The shape of test label is:", bert_test_label.shape)
#print(result[2][0])
pre_np=np.argmax(result, axis=2)
pre_np.shape
print("The predicted label is:", pre_np[num])
print("The actual label is:", bert_test_label[num])
print("The input mask is:",X_test[1][3200+num])

The shape of result is: (3200, 20, 27)
The shape of test label is: (3200, 20)
The predicted label is: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
The actual label is: [24  0  0  0  0  0  0  0  0  0  0  0 25 26 26 26 26 26 26 26]
The input mask is: [1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0]


In [0]:
# from sklearn.metrics import classification_report
# from sklearn.metrics import confusion_matrix
# pred_np=np.argmax(result, axis=2)
# print(pred_np.shape)
# print(pred_np.shape[0])
# test_np=bert_test_label
# print(len(test_np))

# pred_label=[]
# test_label=[]

# for i in range(pred_np.shape[0]):
#   pred_label.extend(pred_np[i])

# for i in range(len(test_np)):
#   test_label.extend(test_np[i])
  
  
# print(len(pred_label), len(test_label))

# print(classification_report(test_label, pred_label))

In [0]:
predictions_flat = [pred for preds in np.argmax(result, axis=2) for pred in preds]
labels_flat = [label for labels in bert_test_label for label in labels]

clean_preds = []
clean_labels = []

for pred, label in zip(predictions_flat, labels_flat):
    if label < 24:
        clean_preds.append(pred)
        clean_labels.append(label)

In [0]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(len(clean_preds))
print(len(clean_labels))
print(classification_report(clean_preds, clean_labels))

34510
34510
              precision    recall  f1-score   support

           0       1.00      0.93      0.97     34510
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00      0.00         0

    accuracy                           0.93     34510
   macro avg  

  'recall', 'true', average, warn_for)


In [0]:
# sess = tf.InteractiveSession()



In [0]:
cm = tf.math.confusion_matrix(
    clean_labels,
    clean_preds,
    num_classes=None,
    dtype=tf.dtypes.int32,
    name=None,
    weights=None
).eval()

In [0]:
np.sum(cm, axis=0)

array([   2,    0,   45,    9,   22, 4093, 1624, 8983,   12,    7,    4,
       2634,  156,  115, 5909, 1795,  140,  151,    0,  317, 4343,  168,
        518,  389,  389, 1067, 1618])

In [0]:
#print (cm)
print(cm)

[[32233     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]
 [ 1092     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]
 [  415     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]
 [  151     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]
 [  170     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]
 [   54     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]
 [  119     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]
 [   46     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]
 [   16     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0]
 [   23     0     0     0     0     0

In [0]:
cm_most = cm[[0,1,2,3,4,5,6,7,8,9,10],:] [:, [0,1,2,3,4,5,6,7,8,9,10]]
print(cm_most)

[[32233     0     0     0     0     0     0     0     0     0     0]
 [ 1092     0     0     0     0     0     0     0     0     0     0]
 [  415     0     0     0     0     0     0     0     0     0     0]
 [  151     0     0     0     0     0     0     0     0     0     0]
 [  170     0     0     0     0     0     0     0     0     0     0]
 [   54     0     0     0     0     0     0     0     0     0     0]
 [  119     0     0     0     0     0     0     0     0     0     0]
 [   46     0     0     0     0     0     0     0     0     0     0]
 [   16     0     0     0     0     0     0     0     0     0     0]
 [   23     0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0     0]]


In [0]:
con=confusion_matrix(test_label, pred_label)
print(con)
# for i in range(len(con)):#print it one by one so it's easier to see
#   print(con[i])

[[    0   756   972     1     4   746   339  1956   592    65     1     0
    127    15  2611     0   627  1295    33   128  1592    29  3561     6
     24 16752     1]
 [    0    23    37     0     0    38    32    86    45     8     0     0
      7     0    36     0    71    12     0     8    39     0   114     0
      6   530     0]
 [    0    23     2     0     0    28    11    66     2     0     0     0
      5     0    15     0     8    35     1     1    29     0    33     0
      0   156     0]
 [    0     4     9     0     0     5     1     8     1     0     0     0
      1     0    13     0     0     5     0     0     1     0    16     0
      0    87     0]
 [    0    19     1     0     0     9     1    42     1     0     0     0
      4     0     9     0     2     5     0     0    10     1    12     0
      0    54     0]
 [    0     5     0     0     0     0     1     2     5     0     0     0
      1     0     1     0     2     2     0     0     0     0    13     0
      0

In [0]:
con=confusion_matrix(test_label, pred_label)
for i in range(len(con)):#print it one by one so it's easier to see
  print(con[i])

## Building out model B

In [0]:
# changes to the model

# changed fine tune layers from 3 to 10
n_fine_tune_layers = 10

#BERT_MODEL_HUB
class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=n_fine_tune_layers,
        pooling="sequence_output",
        bert_path=BERT_MODEL_HUB,
        #bert_path="https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        #         if self.pooling == "first":
        #             trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
        #             trainable_layers = ["pooler/dense"]

        #         elif self.pooling == "sequence":
        trainable_vars = [
            var
            for var in trainable_vars
            if not "/cls/" in var.name and not "/pooler/" in var.name
        ]
        
        trainable_layers = []
        #         else:
        #             raise NameError(
        #                 f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
        #             )

                # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        #         if self.pooling == "first":
        #             pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
        #                 "pooled_output"
        #             ]
        #         elif self.pooling == "sequence":
        #             result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
        #                 "sequence_output"
        #             ]

        #             mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        #             masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
        #                     tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
        #             input_mask = tf.cast(input_mask, tf.float32)
        #             pooled = masked_reduce_mean(result, input_mask)
        #         else:
        #             raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")
                
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]
        
        mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

### Train the BERT model B

In [0]:
# no changes to custom loss or custom accuracy functions

# changed learning rate from 0.1 to 0.08
learning_rate = 0.08

from tensorflow.keras.backend import sparse_categorical_crossentropy

# Build model
def build_model(max_seq_length): 
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    #print(in_id, in_mask, in_segment)
    bert_inputs = [in_id, in_mask, in_segment]
    
    bert_output = BertLayer(n_fine_tune_layers=n_fine_tune_layers, pooling="sequence_output")(bert_inputs)
    
    #print(bert_output)
    
    dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
    
    dense = tf.keras.layers.Dropout(rate=learning_rate)(dense)#random drop out to prevent overfitting
    
    pred = tf.keras.layers.Dense(27, activation='sigmoid')(dense)#Arnobio: 2 for binary class (not sure why) need to change output shpae to reflect number of classes
    
    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    
    
#     losses = custom_loss#from Joachim's notebook
    
#     model.compile(loss=losses, optimizer='adam', metrics=['accuracy'])
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[custom_acc_orig_tokens])
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# sparse_categorical_crossentropy works, accuracy function was not working
    
    
    model.summary()
    
    return model

#build_model(32)
def initialize_vars(sess):
    sess.run(tf.local_variables_initializer())
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    K.set_session(sess)

### Data Preparation

In [0]:
#train_token_list, train_input_ids, train_input_masks, train_segment_ids, train_la

X_train = np.array([train_input_ids,train_input_masks,train_segment_ids])
X_test = np.array([test_input_ids,test_input_masks,test_segment_ids])
train_label1=np.array(train_labels)
test_label1=np.array(test_labels)

print(X_train.shape)

(3, 17470, 20)


In [0]:
# doubled the amount of data

k_start = 0
k_end_train = 9600
k_end_dev_start_test = 3200
k_end_test = k_end_dev_start_test + 3200

bert_inputs_train_k = [X_train[0][k_start:k_end_train], X_train[1][k_start:k_end_train], 
                       X_train[2][k_start:k_end_train]]

bert_inputs_dev_k = [X_test[0][k_start:k_end_dev_start_test], X_test[1][k_start:k_end_dev_start_test], 
                      X_test[2][k_start:k_end_dev_start_test]]

bert_inputs_test_k = [X_test[0][k_end_dev_start_test:k_end_test], X_test[1][k_end_dev_start_test:k_end_test], 
                      X_test[2][k_end_dev_start_test:k_end_test]]

bert_train_label = train_label1[k_start:k_end_train]
bert_dev_label = test_label1[k_start:k_end_dev_start_test]
bert_test_label = test_label1[k_end_dev_start_test:k_end_test]

print(len(bert_inputs_train_k))
print(bert_train_label.shape)
print(bert_test_label.shape)

3
(9600, 20)
(3200, 20)


### Run Model

In [0]:
sess.close()

NameError: ignored

In [0]:



model = build_model(max_seq_length)

# 

sess = tf.InteractiveSession()

# initialize_vars(sess)

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

# model.fit(
#     bert_inputs_train_k, 
#     {"ner": labels_train_k },
#     validation_data=(bert_inputs_test_k, {"ner": labels_test_k }),
#     epochs=8,
#     batch_size=32#,
#     #callbacks=[tensorboard]

epochs = 3

model.fit(
    bert_inputs_train_k, 
    bert_train_label,
    validation_data=(bert_inputs_test_k,bert_test_label),
    epochs=epochs,
    batch_size=32#if we change this to input dimension then we solve the 32 problem. 
)




W0802 06:43:51.063255 139845769774976 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_ops.py:1354: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 20)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 20)]         0                                            
__________________________________________________________________________________________________
bert_layer_2 (BertLayer)        (None, None, 768)    108931396   input_ids[0][0]                  
                                                                 input_masks[0][0]          

<tensorflow.python.keras.callbacks.History at 0x7f2ff8d40358>

### Save Model

In [0]:
# save model
sess.close()

sess = tf.InteractiveSession()

# initialize_vars(sess)

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

model.save('modelB.h5')    

sess.close()

In [0]:
# save weights for the model
sess.close()

sess = tf.InteractiveSession()

# initialize_vars(sess)

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

model.save_weights('modelB_weights.h5')
sess.close()
# reference for saving and downloading models https://stackoverflow.com/questions/48924165/google-colaboratory-weight-download-export-saved-models

In [0]:
# download model to local machine
from google.colab import files

# download train arrays
files.download('modelB.h5')
files.download('modelB_weights.h5')

# files manually uploaded to drive (models folder) 

## Building out model C (all data)

### Train BERT Model C

In [0]:
# changes to the model

# kept 10 fine tune layers as in model B
n_fine_tune_layers = 10

#BERT_MODEL_HUB
class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=n_fine_tune_layers,
        pooling="sequence_output",
        bert_path=BERT_MODEL_HUB,
        #bert_path="https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        #         if self.pooling == "first":
        #             trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
        #             trainable_layers = ["pooler/dense"]

        #         elif self.pooling == "sequence":
        trainable_vars = [
            var
            for var in trainable_vars
            if not "/cls/" in var.name and not "/pooler/" in var.name
        ]
        
        trainable_layers = []
        #         else:
        #             raise NameError(
        #                 f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
        #             )

                # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        #         if self.pooling == "first":
        #             pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
        #                 "pooled_output"
        #             ]
        #         elif self.pooling == "sequence":
        #             result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
        #                 "sequence_output"
        #             ]

        #             mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        #             masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
        #                     tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
        #             input_mask = tf.cast(input_mask, tf.float32)
        #             pooled = masked_reduce_mean(result, input_mask)
        #         else:
        #             raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")
                
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]
        
        mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

### Data Preparation

In [0]:
#train_token_list, train_input_ids, train_input_masks, train_segment_ids, train_la

X_train = np.array([train_input_ids,train_input_masks,train_segment_ids])
X_test = np.array([test_input_ids,test_input_masks,test_segment_ids])
train_label1=np.array(train_labels)
test_label1=np.array(test_labels)

print(X_train.shape)

In [0]:
# doubled the amount of data

k_start = 0
k_end_train = 6400
k_end_dev_start_test = 6400
k_end_test = k_end_dev_start_test + 6400

bert_inputs_train_k = [X_train[0][k_start:k_end_train], X_train[1][k_start:k_end_train], 
                       X_train[2][k_start:k_end_train]]

bert_inputs_dev_k = [X_test[0][k_start:k_end_dev_start_test], X_test[1][k_start:k_end_dev_start_test], 
                      X_test[2][k_start:k_end_dev_start_test]]

bert_inputs_test_k = [X_test[0][k_end_dev_start_test:k_end_test], X_test[1][k_end_dev_start_test:k_end_test], 
                      X_test[2][k_end_dev_start_test:k_end_test]]

bert_train_label = train_label1[k_start:k_end_train]
bert_dev_label = test_label1[k_start:k_end_dev_start_test]
bert_test_label = test_label1[k_end_dev_start_test:k_end_test]

print(len(bert_inputs_train_k))
print(bert_train_label.shape)
print(bert_test_label.shape)

### Run Model

In [0]:
sess.close()

In [0]:
#changed to 8 epochs

#keras.backend.get_session().run(tf.global_variables_initializer())

# # #https://stackoverflow.com/questions/34001922/failedpreconditionerror-attempting-to-use-uninitialized-in-tensorflow
# sess = tf.InteractiveSession()

# sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

sess.close()

model = build_model(max_seq_length)

# 

sess = tf.InteractiveSession()

# initialize_vars(sess)

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

# model.fit(
#     bert_inputs_train_k, 
#     {"ner": labels_train_k },
#     validation_data=(bert_inputs_test_k, {"ner": labels_test_k }),
#     epochs=8,
#     batch_size=32#,
#     #callbacks=[tensorboard]

epochs = 8

model.fit(
    bert_inputs_train_k, 
    bert_train_label,
    validation_data=(bert_inputs_test_k,bert_test_label),
    epochs=epochs,
    batch_size=32#if we change this to input dimension then we solve the 32 problem. 
)


sess.close()

### Save Model

In [0]:
# save model
sess.close()

sess = tf.InteractiveSession()

# initialize_vars(sess)

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

model.save('modelC.h5')    

sess.close()

In [0]:
# save weights for the model
sess.close()

sess = tf.InteractiveSession()

# initialize_vars(sess)

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

model.save_weights('modelC_weights.h5')
sess.close()
# reference for saving and downloading models https://stackoverflow.com/questions/48924165/google-colaboratory-weight-download-export-saved-models

In [0]:
# download model to local machine
from google.colab import files

# download train arrays
files.download('modelC.h5')
files.download('modelC_weights.h5')

# files manually uploaded to drive (models folder) 

#Error Analysis

##Model Predict and Result

In [0]:
#bert_inputs_infer = [X_test[0][0:64], X_test[1][0:64], X_test[2][0:64]]
#test_input_ids.shape, test_input_masks.shape, test_token_list.shape, test_segment_ids.shape, test_labels.shape)
#X_test = np.array([testSentence_ids,testMasks,testSequence_ids])

#print(test_input_ids[0:320])

#bert_inputs_infer=[test_input_ids[0:32], test_input_masks[0:32], test_segment_ids[0:32]]

#print(len(bert_inputs_infer[0]))
sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

result = model.predict(
    bert_inputs_test_k, 
    batch_size=32)

NameError: ignored

In [0]:
# print(type(result))
# print(result.shape)
# print(result)

In [0]:
result[0]

array([[0.49750134, 0.4809062 , 0.5489991 , 0.38408372, 0.6464709 ,
        0.603372  , 0.5634319 , 0.47409418, 0.59063023, 0.4131989 ,
        0.7505529 , 0.2428748 , 0.3318897 , 0.40037388, 0.4515211 ,
        0.7565875 , 0.56238157, 0.3636114 , 0.53489065, 0.45430994,
        0.68488646, 0.7278238 , 0.25792265, 0.24425074, 0.62498355,
        0.76362807, 0.7000843 ],
       [0.49572614, 0.38521197, 0.47859538, 0.64040685, 0.40236083,
        0.47550702, 0.70803577, 0.3633501 , 0.5116656 , 0.31707233,
        0.68250763, 0.26792964, 0.5650932 , 0.36306816, 0.5450336 ,
        0.6005913 , 0.39922518, 0.44142857, 0.49967673, 0.5896489 ,
        0.63725936, 0.735832  , 0.5137249 , 0.263304  , 0.74070704,
        0.71842873, 0.5230322 ],
       [0.51087207, 0.3080759 , 0.59912485, 0.51120436, 0.38382632,
        0.43908736, 0.72676015, 0.40901136, 0.38959575, 0.35390764,
        0.67280036, 0.18318978, 0.56863636, 0.53431016, 0.39484033,
        0.6990553 , 0.36938584, 0.5532697 , 0.4923

In [0]:
# #Convert all probabiliyt less than 0.5 to 1 else 0
# for i in range(32):
#   for j in range(32):
#     if result[i][j]<=0.5:
#       result[i][j]=1
#     else:
#       result[i][j]=0

# zero_count=0
# for i in range(32):
#   for j in range(32):
#     if test_label[i][j]==0:
#       zero_count+=1
      
# count=0
# for i in range(32):
#   for j in range(32):
#     if result[i][j]==test_label[i][j]:
#       count+=1
    
    
     
# #print (result[0][1]==test_label[0][1])
      
# print("total number of 0 in test label is:", zero_count)
# print("total number of token to be labelled is", total)
# print("accuracy if alwasy guess 0 is:", zero_count/total)

# print("total number of when preict and test is the same is:", count)
# total=32*32



# print("accuracy is", count/total)

total number of 0 in test label is: 994
total number of token to be labelled is 1024
accuracy if alwasy guess 0 is: 0.970703125
total number of when preict and test is the same is: 945
accuracy is 0.9228515625


In [0]:
print(len(result[0]))
print(result[2])#len is 27 represents the probability of each class

32
[0.525739   0.17723814 0.51883465 0.61479783 0.3393644  0.61199903
 0.6204618  0.61572665 0.7662282  0.42423853 0.4052503  0.44994062
 0.47312936 0.6477987  0.5720914  0.6492477  0.4279942  0.6344584
 0.28458297 0.52113175 0.5569579  0.48475057 0.5200761  0.29491848
 0.7383041  0.5899198  0.6237016 ]


In [0]:
#print(np.argmax(result, axis=1))
x=np.argmax(result, axis=2)
print(result[2][3])
print(len(x))

[0.33326754 0.30363172 0.3720487  0.6436627  0.3298968  0.44187316
 0.4364891  0.5344843  0.59936625 0.44336876 0.42019337 0.47927517
 0.5198763  0.6234932  0.61675876 0.6559623  0.40592694 0.48301476
 0.54483455 0.43970555 0.4094195  0.5696674  0.41309303 0.32302675
 0.6597335  0.37658548 0.5924104 ]
64


##Classification Report

https://stackoverflow.com/questions/1783653/computing-precision-and-recall-in-named-entity-recognition

In [0]:
from sklearn.metrics import classification_report
y_pred = model.predict(x_test, batch_size=64, verbose=1)
y_pred_bool = np.argmax(y_pred, axis=1)
print(classification_report(y_test, y_pred_bool))

NameError: ignored

In [0]:
print(classification_report(np.argmax(result, axis=2)[0],)

In [0]:
for i in result[0]:
  print(np.argmax(i))
#np.argmax(result[0][1])
#np.argmax(result[0][3])

25
24
6
15
24
10
24
24
24
6
15
24
6
24
21
24
24
21
24
24


In [0]:
np.argmax(result, axis=2)[0]

array([25, 24,  6, 15, 24, 10, 24, 24, 24,  6, 15, 24,  6, 24, 21, 24, 24,
       21, 24, 24])

In [0]:
result[0][3]

array([0.41188538, 0.28872663, 0.38126117, 0.47752753, 0.3996588 ,
       0.42883107, 0.69209856, 0.23030123, 0.2783481 , 0.3330383 ,
       0.7107179 , 0.21297577, 0.66454923, 0.4015806 , 0.31005138,
       0.77361715, 0.589921  , 0.5715536 , 0.34632716, 0.53967047,
       0.56555796, 0.3240528 , 0.46150875, 0.350986  , 0.76940316,
       0.73595166, 0.6687447 ], dtype=float32)

##Confusion Matrix

In [0]:
con=confusion_matrix(test_label, pred_label)
for i in range(len(con)):#print it one by one so it's easier to see
  print(con[i])

[  1   0  17 105   0  35  29  90  13   4  16   0   5  96  72 162   2  22
   1  11  15   1   3  45   6  67]
[0 0 1 3 0 4 0 0 0 0 2 1 0 0 1 4 0 0 0 0 0 0 1 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0]
[0 0 0 1 0 0 0 2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0]
[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 0 2]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 

In [0]:
#generate_encoded_data(train_filelist[5:6])

In [0]:
#test_df[0:50]