# Setup
based on https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb#scrollTo=191zq3ZErihP

In [0]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf

assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime; please see the first cell in this notebook for instructions!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.108.13.194:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 861201113876687645),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 7509863777384350548),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 10957324747785543829),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 16010609198301123736),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 15642925627654617544),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 202491004858483873),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 7185590080412384499),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 9450033752344641433),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 59370890480492

W0802 06:36:28.714955 140462451693440 lazy_loader.py:50] 
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



# Data Ingestion


## Data Preparation

In [0]:
#This file is written to ingest data from i2b2
#below are the requried library
from xml.dom import minidom # need this to read xlm files
import xml.etree.ElementTree as ET
import os
import pandas as pd
import random
import nltk
import string
from sklearn.preprocessing import LabelEncoder
from pandas import DataFrame
import nltk.data
from nltk import sent_tokenize
nltk.download('punkt') #this package needs to be downloaded separately
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas import DataFrame
import re
import numpy as np

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### GDrive Setup

We first mount the google drive containing the training and test fiels

In [0]:
from google.colab import drive # this sets the file path to your personal google drive. You will need to enter the authorization code each time. 
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


## Creating List of Files to be Ingested

We then created two list - each contains the list of the file names for training and testing.

1.   train_filelist = 790 EHR records
2.   test_filelist=514 EHR records



In [1]:
# data processing created with the help of teaching assistant Sudha Subramanian, who previously worked with the same dataset

train_filelist=[]

for file in os.listdir('/gdrive/My Drive/w266_NLP/training-PHI'):#set your file path here
  filename = os.fsdecode(os.fsencode('/gdrive/My Drive/w266_NLP/training-PHI/'+file))
  if filename.endswith( ('.xml') ): # select xml files
    train_filelist.append(filename)

print("There are {} training file".format(len(train_filelist))) #check that the number of training file is 790 records for 178 patients

In [2]:
test_filelist=[]

for file in os.listdir('/gdrive/My Drive/w266_NLP/test-PHI'):#set your file path here
  filename = os.fsdecode(os.fsencode('/gdrive/My Drive/w266_NLP/test-PHI/'+file))
  if filename.endswith( ('.xml') ): # select xml files
    test_filelist.append(filename)

print("There are {} test file".format(len(test_filelist))) #check that the number of test file is 514 records for 178 patients

# Process Data Annotation

The tag generator process the annotation into a dataframe.

In [0]:
def tag_generator(file):
  '''The function extract the tags from the EHR record and turn them into pd dataframe'''
  tree = ET.parse(file)
  root=tree.getroot()
  
  PHI_category=['NAME','PROFESSION','LOCATION','AGE','DATE','CONTACT','ID']# Here are the seven PHI category defined by i2b2
  #PHI_category=[category]
  tag_list=[]#An empty list to hold all dictionary items
  for category in PHI_category:
    for tag in root.iter():
      if tag.tag==category:#skip if a specific tag is not found
          tag.attrib['Category']=category #add a column on category
          tag.attrib['File']=file[len(file)-10:len(file)-4] # add a column to indicate file name
          tag_list.append(tag.attrib)
  temp_df=pd.DataFrame(tag_list)
      
  return temp_df

In [0]:
def note_generator(file):
  #'''This function breakdown inidividaul EHR text note into sentences, divided by new line and period'''
    tree = ET.ElementTree(file=file)
    root = tree.getroot()
    all_notes = []

    text = root.find('TEXT').text
    sentences = [sent.split('\n') for sent in sent_tokenize(text) if sent!='\n']
    

    for text in sentences:#this part ignore empty lines
        for sub_item in text:
            if sub_item.replace(' ','') != '':
                all_notes.append(sub_item)    
    
    return all_notes

### Install Bert tokenization 

In [0]:
!pip install bert-tensorflow    # this replaces the bert github clone
!pip install keras
import tensorflow as tf
import tensorflow_hub as hub

import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization
from keras import backend as K



W0802 06:38:35.420527 140462451693440 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/optimization.py:87: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

Using TensorFlow backend.


In [0]:
# This is a path to an uncased (all lowercase) version of BERT(PW:cased?)
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Get the vocab file and casing info from the Hub module."""
  with tf.Graph().as_default():
    bert_module = hub.Module(BERT_MODEL_HUB)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                            tokenization_info["do_lower_case"]])
      
  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file, do_lower_case=do_lower_case)

tokenizer = create_tokenizer_from_hub_module()

W0802 06:41:18.301390 140462451693440 deprecation_wrapper.py:119] From /usr/local/lib/python3.6/dist-packages/bert/tokenization.py:125: The name tf.gfile.GFile is deprecated. Please use tf.io.gfile.GFile instead.



In [0]:
def sentence_encoding(file):#this function is looped within the token_annotator function
  
  sentence_list=note_generator(file) #generate a list of sentences from tex
  
  df=tag_generator(file)
  text_list=df['text'].tolist() #generate a list of tag "TEXT"
  type_list=df['TYPE'].tolist() #generate a list of tag "type"
  category_list=df['Category'].tolist() #generate a list of tag "category"
  
  processed_sentence=[]
  processed_text=[]
  processed_type=[]
  processed_category=[]
  
  def findWholeWord(w):#this function finds a word within a string broken down by regular expression (case sensitive)
    return re.compile(r'\b({0})\b'.format(w)).search
  
  for sentence in sentence_list:
     for text in text_list:
        if findWholeWord(text)(sentence)!=None:
          processed_sentence.append(sentence)
          processed_text.append(text)
          processed_type.append(type_list[text_list.index(text)])
          processed_category.append(category_list[text_list.index(text)])

  
  temp_df=pd.DataFrame({'Sentence':processed_sentence, 'Word':processed_text, 'Type':processed_type, 'Category':processed_category})
  df = temp_df.drop_duplicates()
        
  return df
  #return sentence_list, text_list, type_list, category_list
  #return processed_sentence, processed_text, processed_type

# sentence_encoding(train_filelist[0])

In [0]:
def token_annotator(file):
  
  temp_df=sentence_encoding(file)#take the data frame and turn them into individual lists
  
  type_list=temp_df['Type'].tolist()
  temp_sentence_list=temp_df['Sentence'].tolist()
  word_list=temp_df['Word'].tolist()
  temp_unique_sentence_list=set(temp_sentence_list)
  sentence_list=list(temp_unique_sentence_list) #take out duplicate sentences
  
  tokenized_word=[] #separate individual text into words (e.g, Mia E. Tapia to "Mia","E.","Tapia")
  for phrase in word_list:
    tokenized_word.append(tokenizer.tokenize(phrase))
  
  tokenized_sentence=[]
  encoded_token=[]
  
  for i in range(len(sentence_list)): #tokenize the sentence and encode individual word
    token_list=tokenizer.tokenize(sentence_list[i])
    tokenized_sentence.append(token_list)
    temp_list=['O' for length in range(len(token_list))]
    for j in range(len(tokenized_word)):
      if all(elem in token_list for elem in tokenized_word[j])==True:
        #print(token_list, tokenized_word[j])
        for word in tokenized_word[j]:
          temp_list[token_list.index(word)]=(type_list[j])
          #print(temp_list)
    encoded_token.append(temp_list)
          
  return tokenized_sentence,encoded_token

#token_annotator(train_filelist[0])
  
 

In [0]:
def type_token_generator(file): 
  #this function convert all the text of a record into individual BERT tokenized list and generate type encoding list
  all_sentences=note_generator(file)
  tokenized_sentences=[]
  for sentence in all_sentences:
    tokenized_sentences.append(tokenizer.tokenize(sentence))
  
  type_token=[]
    
  sentence_list, encoded_token=token_annotator(file)
  
  for sentence in tokenized_sentences:
    if sentence in sentence_list:
      type_token.append(encoded_token[sentence_list.index(sentence)])
    else:
      type_token.append(['O'for i in range(len(sentence))])
  
  label_list=[]
  #label_dict={"O":0, "DATE":1, "DOCTOR":2,"HOSPITAL":3,'PATIENT':4,'AGE':5,'MEDICALRECORD':6,'CITY':7,'STATE':8,'PHONE':9,'USERNAME':10,'IDNUM':11,'PROFESSION':12,'STREET':13,'ZIP':14,'ORGANIZATION':15,'COUNTRY':16,'FAX':17,'DEVICE':18,'EMAIL':19,'LOCATION-OTHER':20,'URL':21,'HEALTHPLAN':22,'BIOID':23}# ,'IPADDRESS':24,'ACCOUNT NUMBER':25}
  for type_list in type_token:# we convert the label to numerical for Bert training. We can add types here later. 
    #label_list.append([label_dict.get(item,item)  for item in type_list])
    label_list.append([0 if typetoken =='O' else 1 for typetoken in type_list])


  #return tokenized_sentences, type_token, label_list
  return tokenized_sentences, type_token, label_list #take a look at segment of the list to make sure the they are corect
# we were missing tokenized_sentences, type_token from the return, not sure why
                             


### Generating BERT array

In [0]:
def bert_array(file, max_seq_length):
  '''This function generates the 5 lists of array that is required to feed into the model'''
  
  token_sentence, type_token, label_list= type_token_generator(file)
  
  token_list=[]
  input_IDs=[]
  input_mask=[]#1 for non padding and 0 for padding
  segment_ID=[]
  label=[]
  
  for untrimmed_sentence in token_sentence:
    sentence=untrimmed_sentence[0:(max_seq_length)-2] #trim the list to allow space for CLS and SEP
    sentence.insert(0,'[CLS]')
    sentence.insert(len(sentence),'[SEP]')
    length_before_padding=len(sentence)
    temp_inputID=[1 for i in range(length_before_padding)]#insert 1 for [CLS] and [SEP] for mask
    sentence.extend(['[PAD]' for i in range(max_seq_length-len(sentence))])
    temp_inputID.extend([0 for i in range(max_seq_length-len(temp_inputID))])
    token_list.append(sentence)
    input_mask.append(temp_inputID)
    segment_ID.append([0 for i in range(max_seq_length)])
  
  for token in token_list:
    input_ids=tokenizer.convert_tokens_to_ids(token)
    input_IDs.append(input_ids)
  
  for untrimmed_item in label_list:# we assign 0 to be non-PHI (including [CLS] and [SEP], 1 to be PHI and 2 to be padding
    item=untrimmed_item[0:(max_seq_length-2)]#trim the list to allow space for CLS and SEP
    item.insert(0,0)#class label 24 for CLS (Arnobio - you need to change 24 to 0 for binary)
    item.insert(len(item),0) #class label 25 for SEP (Arnobio - you need to change 25 to 0 for binary)
    item.extend([0 for i in range(max_seq_length-len(item))])  #class label 26 represents paddinging (ARnobio you need to change 26 to 0 for binary)
    label.append(item)
  
  
  return token_list, input_IDs, input_mask, segment_ID, label





# token='this'
# input_ids = tokenizer.convert_tokens_to_ids(['[CLS]'])


# Generating data for BERT

In [0]:
# batch size
batch_size = 32

# create data
max_seq_length = 20

# num of files to retrieve
num_of_file_train = 200


def generate_train_data(num_of_file, max_seq_length):#Max number of file number is 789
  '''This function runs through a loop to append the tokens, input ids, input masks, segement id and labels to 5 individual np arrays '''
  temp_list0, temp_list1, temp_list2, temp_list3, temp_list4=[],[],[],[],[]
  for i in range(num_of_file):
    temp_data= bert_array(train_filelist[i],max_seq_length)
    
    for j in range(len(temp_data[0])):
      temp_list0.append(temp_data[0][j])
      temp_list1.append(temp_data[1][j])
      temp_list2.append(temp_data[2][j])
      temp_list3.append(temp_data[3][j])
      temp_list4.append(temp_data[4][j])
  
#   np_token_list=np.array(temp_list0)
#   np_input_ids=np.array(temp_list1)
#   np_input_masks=np.array(temp_list2)
#   np_segment_ids=np.array(temp_list3)
#   np_labels=np.array(temp_list4)
      
  #return np_token_list, np_input_ids, np_input_masks, np_segment_ids, np_labels
  return temp_list0, temp_list1, temp_list2, temp_list3, temp_list4

#change number of file here (MAX:789)


train_token_list, train_input_ids, train_input_masks, train_segment_ids, train_labels= generate_train_data(num_of_file_train,max_seq_length)

# #print(train_token_list)
# #check that the shape is correct
# #print(train_input_ids.shape, train_input_masks.shape, train_token_list.shape, train_segment_ids.shape, train_labels.shape)

In [0]:
print(len(train_token_list))

17470


In [0]:
num_of_file_test = 150

def generate_test_data(num_of_file, max_seq_length):#Max number of file is 513
  '''This function runs through a loop to append the tokens, input ids, input masks, segement id and labels to 5 individual np arrays '''
  temp_list0, temp_list1, temp_list2, temp_list3, temp_list4=[],[],[],[],[]
  for i in range(num_of_file):
    temp_data= bert_array(test_filelist[i],max_seq_length)
    
    for j in range(len(temp_data[0])):
      temp_list0.append(temp_data[0][j])
      temp_list1.append(temp_data[1][j])
      temp_list2.append(temp_data[2][j])
      temp_list3.append(temp_data[3][j])
      temp_list4.append(temp_data[4][j])
  

  return temp_list0, temp_list1, temp_list2, temp_list3, temp_list4

test_token_list, test_input_ids, test_input_masks, test_segment_ids, test_labels = generate_test_data(num_of_file_test,max_seq_length)




In [0]:
# print(train_token_list[0])
 print(len(test_input_ids))
# print(train_segment_ids[0])
#print(type(test_labels))

12934


### Save Data Arrays for Loading

In [0]:
# save tokens outside o
#cd ..
np.save("train_token_list",train_token_list)
np.save("train_input_ids",train_token_list)
np.save("train_input_masks",train_token_list)
np.save("train_segment_ids",train_token_list)
np.save("train_labels",train_token_list)

#%cd /gdrive

In [0]:
cd gdrive

/gdrive


### Load Data Arrays from Saved Files

In [0]:
#from google.colab import drive # this sets the file path to your personal google drive. You will need to enter the authorization code each time. 
#drive.mount('/gdrive')
# cd /gdrive

#Buildling the Bert Model

In [0]:
# Partially based on and created with teh help with Joachim Rahmfeld and his work, as well as "BERT in Keras with Tensorflow hub" (https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b) 

#BERT_MODEL_HUB
class BertLayer(tf.keras.layers.Layer):
    def __init__(
        self,
        n_fine_tune_layers=1,
        pooling="sequence_output",
        bert_path=BERT_MODEL_HUB,
        #bert_path="https://tfhub.dev/google/bert_cased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers
        trainable_vars = self.bert.variables
        #         if self.pooling == "first":
        #             trainable_vars = [var for var in trainable_vars if not "/cls/" in var.name]
        #             trainable_layers = ["pooler/dense"]

        #         elif self.pooling == "sequence":
        trainable_vars = [
            var
            for var in trainable_vars
            if not "/cls/" in var.name and not "/pooler/" in var.name
        ]
        
        trainable_layers = []
        #         else:
        #             raise NameError(
        #                 f"Undefined pooling type (must be either first or mean, but is {self.pooling}"
        #             )

                # Select how many layers to fine tune
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{str(11 - i)}")

        # Update trainable vars to contain only the specified layers
        trainable_vars = [
            var
            for var in trainable_vars
            if any([l in var.name for l in trainable_layers])
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        #         if self.pooling == "first":
        #             pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
        #                 "pooled_output"
        #             ]
        #         elif self.pooling == "sequence":
        #             result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
        #                 "sequence_output"
        #             ]

        #             mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        #             masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
        #                     tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
        #             input_mask = tf.cast(input_mask, tf.float32)
        #             pooled = masked_reduce_mean(result, input_mask)
        #         else:
        #             raise NameError(f"Undefined pooling type (must be either first or mean, but is {self.pooling}")
                
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]
        
        mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
        
        return result

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_size)

In [0]:
num_labels = 2

def custom_acc_orig_tokens(y_true, y_pred):
    """
    calculate loss dfunction filtering out also the newly inserted labels
    
    y_true: Shape: (batch x (max_length) )
    y_pred: predictions. Shape: (batch x x (max_length + 1) x num_distinct_ner_tokens ) 
    
    returns: accuracy
    """

    #get labels and predictions
    
    y_label = tf.reshape(tf.layers.Flatten()(tf.cast(y_true, tf.int64)),[-1])
    
    mask = (y_label < 24)
    y_label_masked = tf.boolean_mask(y_label, mask)
    
    y_predicted = tf.math.argmax(input = tf.reshape(tf.layers.Flatten()(tf.cast(y_pred, tf.float64)),\
                                                    [-1, num_labels]), axis=1)
    
    y_predicted_masked = tf.boolean_mask(y_predicted, mask)

    return tf.reduce_mean(tf.cast(tf.equal(y_predicted_masked,y_label_masked) , dtype=tf.float64))

In [0]:
# Build model
def build_model(max_seq_length): 
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    #print(in_id, in_mask, in_segment)
    bert_inputs = [in_id, in_mask, in_segment]
    
    bert_output = BertLayer(n_fine_tune_layers=0, pooling="sequence_output")(bert_inputs)
    
    #print(bert_output)
    
    dense = tf.keras.layers.Dense(256, activation='relu')(bert_output)
    
    dense = tf.keras.layers.Dropout(rate=0.1)(dense)#random drop out to prevent overfitting
    
    pred = tf.keras.layers.Dense(2, activation='sigmoid')(dense)#Arnobio: 2 for binary class (not sure why) need to change output shpae to reflect number of classes
    
    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    

    #model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    #losses = custom_loss#added this copying the function from Joachim's notebook
    #model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    #model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    #model.compile(loss=losses, optimizer='adam', metrics=['accuracy'])
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=[custom_acc_orig_tokens])
    
    model.summary()
    
    return model

#build_model(32)
# def initialize_vars(sess):
#     sess.run(tf.local_variables_initializer())
#     sess.run(tf.global_variables_initializer())
#     sess.run(tf.tables_initializer())
#     K.set_session(sess)

##Train the BERT model

### Data Preparation
Set up data to mimic Joachim's data input

In [0]:
#train_token_list, train_input_ids, train_input_masks, train_segment_ids, train_la

X_train = np.array([train_input_ids,train_input_masks,train_segment_ids])
X_test = np.array([test_input_ids,test_input_masks,test_segment_ids])
train_label1=np.array(train_labels)
test_label1=np.array(test_labels)

print(X_train.shape)
print(X_test.shape)

(3, 17470, 20)
(3, 12934, 20)


In [0]:
k_start=0
k_end_train=9600
k_end_dev_start_test=3200
k_end_test=k_end_dev_start_test+3200

bert_inputs_train_k = [X_train[0][k_start:k_end_train], X_train[1][k_start:k_end_train], 
                       X_train[2][k_start:k_end_train]]

bert_inputs_dev_k = [X_test[0][k_start:k_end_dev_start_test], X_test[1][k_start:k_end_dev_start_test], 
                      X_test[2][k_start:k_end_dev_start_test]]

bert_inputs_test_k = [X_test[0][k_end_dev_start_test:k_end_test], X_test[1][k_end_dev_start_test:k_end_test], 
                      X_test[2][k_end_dev_start_test:k_end_test]]

bert_train_label=train_label1[k_start:k_end_train]
bert_dev_label=test_label1[k_start:k_end_dev_start_test]
bert_test_label=test_label1[k_end_dev_start_test:k_end_test]

print(len(bert_inputs_train_k))
print(bert_train_label.shape)
print(bert_dev_label.shape)
print(bert_test_label.shape)

3
(9600, 20)
(3200, 20)
(3200, 20)


In [0]:
count=0
for i in range(len(bert_test_label)):
  for j in range (20):
    if bert_test_label[i][j]==0:
      count+=1

print("the number of label 0 is", count)
print("the percent of zero is", count/64000)

the number of label 0 is 61723
the percent of zero is 0.964421875


In [0]:
#print(bert_inputs_train_k)

In [0]:
#keras.backend.get_session().run(tf.global_variables_initializer())

# # #https://stackoverflow.com/questions/34001922/failedpreconditionerror-attempting-to-use-uninitialized-in-tensorflow
# sess = tf.InteractiveSession()

# sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

model = build_model(max_seq_length)


sess = tf.InteractiveSession()

sess.run(tf.global_variables_initializer())
sess.run(tf.local_variables_initializer())

# num_train_examples = 64
# num_dev_examples = 32

# model.fit(
#     bert_inputs_train_k, 
#     {"ner": labels_train_k },
#     validation_data=(bert_inputs_test_k, {"ner": labels_test_k }),
#     epochs=8,
#     batch_size=32#,
#     #callbacks=[tensorboard]

model.fit(
    bert_inputs_train_k, 
    bert_train_label,
    validation_data=(bert_inputs_dev_k,bert_dev_label),
    epochs=5,
    batch_size=32#if we change this to input dimension then we solve the 32 problem. 
)


#sess.close()

W0802 06:53:20.039936 140462451693440 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0802 06:53:21.104645 140462451693440 deprecation.py:323] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/array_ops.py:1354: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        [(None, 20)]         0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 20)]         0                                            
__________________________________________________________________________________________________
bert_layer (BertLayer)          (None, None, 768)    108931396   input_ids[0][0]                  
                                                                 input_masks[0][0]            

<tensorflow.python.keras.callbacks.History at 0x7fbf948cc630>

#Error Analysis

##Precision/Recall/F1

https://stackoverflow.com/questions/1783653/computing-precision-and-recall-in-named-entity-recognition

In [0]:

# sess = tf.InteractiveSession()

# sess.run(tf.global_variables_initializer())
# sess.run(tf.local_variables_initializer())

result = model.predict(
    bert_inputs_test_k, 
    batch_size=32)


32




In [0]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred_np=np.argmax(result, axis=2)

test_np=bert_test_label


pred_label=[]
test_label=[]

for i in range(pred_np.shape[0]):
  pred_label.extend(pred_np[i])

for i in range(len(test_np)):
  test_label.extend(test_np[i])
  
  
print(len(pred_label), len(test_label))

print(classification_report(test_label, pred_label))

64000 64000
              precision    recall  f1-score   support

           0       0.93      0.19      0.31     61723
           1       0.03      0.61      0.05      2277

    accuracy                           0.20     64000
   macro avg       0.48      0.40      0.18     64000
weighted avg       0.90      0.20      0.30     64000



In [0]:
#save data to gdrive
from google.colab import files
%cd ..
# np.save('result_baseline', result)
# files.download('result_baseline.npy')

np.save('test_label_baseline',bert_test_label)
files.download('test_label_baseline.npy')


/


In [0]:
%cd ..
np.save('test_label_baseline',bert_test_label)
files.download('test_label_baseline.npy')

/


In [0]:
con=confusion_matrix(test_label, pred_label)
for i in range(len(con)):#print it one by one so it's easier to see
  print(con[i])


[11605 50118]
[ 888 1389]


In [0]:
#Convert all probabiliyt less than 0.5 to 1 else 0
for i in range(32):
  for j in range(32):
    if result[i][j]<=0.5:
      result[i][j]=1
    else:
      result[i][j]=0

zero_count=0
for i in range(32):
  for j in range(32):
    if test_label[i][j]==0:
      zero_count+=1
      
count=0
for i in range(32):
  for j in range(32):
    if result[i][j]==test_label[i][j]:
      count+=1
    
    
     
#print (result[0][1]==test_label[0][1])
      
print("total number of 0 in test label is:", zero_count)
print("total number of token to be labelled is", total)
print("accuracy if alwasy guess 0 is:", zero_count/total)

print("total number of when preict and test is the same is:", count)
total=32*32



print("accuracy is", count/total)

total number of 0 in test label is: 994
total number of token to be labelled is 1024
accuracy if alwasy guess 0 is: 0.970703125
total number of when preict and test is the same is: 945
accuracy is 0.9228515625


In [0]:
pred=[[0,0,0,1,2,3,4,5,5,0],[1,4,2,3,0,0,0,0,0,0]]
golden=[[1,1,1,5,5,6,2,0,0,0],[0,0,0,0,2,3,4,2,4,0]]

def precision(predict, golden):
  '''We use the exact match approach as explained in this post: https://stackoverflow.com/questions/1783653/computing-precision-and-recall-in-named-entity-recognition'''
  TP=[]
  
  