In [None]:
#!pip install -q transformers

Model building

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.utils import to_categorical
import transformers

from transformers import AutoTokenizer, TFBertModel

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

2023-02-01 13:31:22.819623: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-01 13:31:33.172676: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-01 13:31:33.172757: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-01 13:31:54.412072: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [10]:
class BERT_semantics:
    """Fine-tune a BERT classifier on labelled values and tag CSV columns with it.

    `load_train_and_test_data` trains the classifier and writes it to
    `MODEL_PATH`; `bert_tagging` reloads that file and assigns a semantic
    class to every value of every column in ``dataset/person_data.csv``.
    """

    # Semantic classes; a class's position in this list is its integer label id.
    CLASSES = ['StreetAddress', 'City', 'State', 'Date', 'DateTime', 'Email', 'Name', 'Gender',
               'Latitude', 'Longitude', 'Manufacturer', 'PhoneNumber', 'RoleTitle', 'ssn',
               'Zipcode', 'Boolean']
    # Class name -> label id, derived from CLASSES so the two can never drift apart.
    ENCODED_DICT = {name: idx for idx, name in enumerate(CLASSES)}

    PRETRAINED_NAME = 'bert-base-cased'  # checkpoint used for both tokenizer and encoder
    MAX_LEN = 20                         # token length every input is padded/truncated to
    MODEL_PATH = 'BERT_model.h5'         # where the fine-tuned model is saved / loaded
    THRESHOLD = 60                       # minimum % of rows for a tag to be reported

    def __init__(self) -> None:
        pass

    def _tokenize(self, tokenizer, texts):
        """Tokenize a list of strings into fixed-length TensorFlow tensors.

        Args:
            tokenizer: the tokenizer returned by `AutoTokenizer.from_pretrained`.
            texts: list of strings to encode.

        Returns:
            The tokenizer output holding `input_ids` and `attention_mask`.
        """
        return tokenizer(
            text=texts,
            add_special_tokens=True,
            max_length=self.MAX_LEN,
            truncation=True,
            padding='max_length',
            return_tensors='tf',
            # Must be the boolean False: the string 'False' is truthy and would
            # switch token_type_ids on instead of off.
            return_token_type_ids=False,
            return_attention_mask=True,
            verbose=True)

    def _encode_labels(self, labels, source):
        """Map class names to integer ids, rejecting unknown names.

        Args:
            labels: pandas Series of class names.
            source: description of where the labels came from (used in the error).

        Returns:
            pandas Series of integer label ids.

        Raises:
            ValueError: if any label is not one of `CLASSES` (including nulls).
        """
        ids = labels.map(self.ENCODED_DICT)
        missing = ids.isna()
        if missing.any():
            unknown = sorted(set(labels[missing].astype(str)))
            raise ValueError(f"Unknown semantic labels in {source}: {unknown}")
        return ids.astype(int)

    def _labels_from_scores(self, scores):
        """Return the highest-scoring class name for each row of a score matrix.

        Args:
            scores: 2-D array with one column per entry of `CLASSES`.

        Returns:
            List of class names, one per row; ties resolve to the first class.
        """
        return [self.CLASSES[k] for k in np.argmax(scores, axis=1)]

    def _label_percentages(self, labels):
        """Return, per class, the rounded percentage of `labels` equal to it.

        Args:
            labels: pandas Series of predicted class names.

        Returns:
            pandas Series indexed by `CLASSES` with percentages rounded to 0 decimals.
        """
        counts = labels.value_counts().reindex(self.CLASSES, fill_value=0)
        return (counts / len(labels) * 100).round(0)

    def _build_model(self, bert):
        """Build and compile the classifier head on top of a BERT encoder.

        Args:
            bert: the `TFBertModel` instance used as the encoder.

        Returns:
            A compiled `tf.keras.Model` taking `input_ids` and `attention_mask`.
        """
        input_ids = Input(shape=(self.MAX_LEN,), dtype=tf.int32, name="input_ids")
        input_mask = Input(shape=(self.MAX_LEN,), dtype=tf.int32, name="attention_mask")
        # Index 0 of the encoder output is the per-token hidden state sequence.
        embeddings = bert(input_ids, attention_mask=input_mask)[0]
        out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
        out = Dense(128, activation='relu')(out)
        out = tf.keras.layers.Dropout(0.1)(out)
        out = Dense(32, activation='relu')(out)
        # Softmax yields one probability distribution over the mutually
        # exclusive classes, matching the categorical cross-entropy below.
        y = Dense(len(self.CLASSES), activation='softmax')(out)
        model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
        # layers[2] is the layer that follows the two Input layers.
        model.layers[2].trainable = True

        optimizer = tf.keras.optimizers.legacy.Adam(
            learning_rate=5e-05,  # this learning rate is for bert model, taken from huggingface website
            epsilon=1e-08,
            decay=0.01,
            clipnorm=1.0)

        # The model emits probabilities, so the loss must not treat them as logits.
        loss = CategoricalCrossentropy(from_logits=False)
        # NOTE(review): this is plain categorical accuracy; the name
        # 'balanced_accuracy' is kept only so existing log/history keys stay stable.
        metrics = [CategoricalAccuracy('balanced_accuracy')]

        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)
        return model

    def load_train_and_test_data(self):
        """Fine-tune the classifier on the train/test CSVs and save it to `MODEL_PATH`.

        Reads ``dataset/train.csv`` and ``dataset/test.csv``, one-hot encodes the
        labels, trains for one epoch using the test split as validation data,
        and writes the resulting model to disk.

        Returns:
            None

        Raises:
            ValueError: if either CSV contains a label that is not in `CLASSES`.
        """
        # NOTE(review): train is read with ';' and test with ',' as separator —
        # confirm this matches the actual files.
        df_train = pd.read_csv('dataset/train.csv', header=0, sep=";", names=['input', 'semantics'], encoding='utf-8')
        df_test = pd.read_csv('dataset/test.csv', header=0, sep=",", names=['input', 'semantics'], encoding='utf-8')

        df_train['semantics'] = self._encode_labels(df_train['semantics'], 'dataset/train.csv')
        df_test['semantics'] = self._encode_labels(df_test['semantics'], 'dataset/test.csv')

        # Fix the one-hot width explicitly so both splits get the same number of
        # columns even when one of them lacks the highest class id.
        num_classes = len(self.CLASSES)
        y_train = to_categorical(df_train['semantics'], num_classes=num_classes)
        y_test = to_categorical(df_test['semantics'], num_classes=num_classes)

        tokenizer = AutoTokenizer.from_pretrained(self.PRETRAINED_NAME)
        bert = TFBertModel.from_pretrained(self.PRETRAINED_NAME)

        # Tokenise the input using the tokeniser from bert-base-cased
        x_train = self._tokenize(tokenizer, df_train['input'].tolist())
        x_test = self._tokenize(tokenizer, df_test['input'].tolist())

        model = self._build_model(bert)

        model.fit(
            x={'input_ids': x_train['input_ids'], 'attention_mask': x_train['attention_mask']},
            y=y_train,
            validation_data=(
                {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']},
                y_test,
            ),
            epochs=1,
            batch_size=100,
        )

        model.save(self.MODEL_PATH)

        #predicted_raw = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})

        #y_predicted = np.argmax(predicted_raw, axis = 1)
        #y_true = df_test.semantics

        #from sklearn.metrics import classification_report
        #print(classification_report(y_true, y_predicted))

        return None

    def bert_tagging(self):
        """Tag every column of ``dataset/person_data.csv`` with semantic classes.

        Loads the model saved at `MODEL_PATH` (run `load_train_and_test_data`
        first if it does not exist), predicts a class for each cell, and writes:

        * ``Results_summary.csv`` — the input data plus one ``<column>_label``
          column per input column;
        * ``Results_labelled.csv`` — per column, the percentage of rows assigned
          to each class.

        Finally prints every (column, class) pair whose percentage reaches
        `THRESHOLD`.

        Returns:
            None

        Raises:
            ValueError: if the input CSV has no rows.
        """
        #self.load_train_and_test_data()

        # Load the data as DataFrame
        df = pd.read_csv("dataset/person_data.csv", header=0, encoding='utf-8')
        if df.empty:
            raise ValueError("dataset/person_data.csv contains no rows to tag")

        # Back-fill: each null takes the next non-null value further down its
        # column. Nulls with nothing after them remain and become the string 'nan'.
        df = df.bfill()
        df = df.astype(str)

        model = tf.keras.models.load_model(
            self.MODEL_PATH,
            custom_objects={"TFBertModel": transformers.TFBertModel},
            compile=False)

        # Only the tokenizer is needed here; the encoder weights are already
        # part of the loaded model.
        tokenizer = AutoTokenizer.from_pretrained(self.PRETRAINED_NAME)

        data = df.copy()
        df_label = pd.DataFrame(index=pd.Index(self.CLASSES, name='Semantic_Tags'))

        # Classify one column at a time
        for col in df.columns:
            x_val = self._tokenize(tokenizer, df[col].values.tolist())
            scores = model.predict(
                {'input_ids': x_val['input_ids'], 'attention_mask': x_val['attention_mask']})

            # Store the winning class of every row in a new "<col>_label" column
            col_name = col + "_label"
            data[col_name] = self._labels_from_scores(scores)

            df_label[col] = self._label_percentages(data[col_name])

        df_label.to_csv('Results_labelled.csv')
        data.to_csv('Results_summary.csv')

        # Report each class that covers at least THRESHOLD percent of a column
        for col in df_label.columns:
            confident = df_label.loc[df_label[col] >= self.THRESHOLD, col]
            tags = confident.index.tolist()
            for pct in confident:
                print(f"{col} --- {tags} --- {pct}%")

        return None

In [11]:
# Tag every column of the person dataset with the previously saved model.
# This reads 'BERT_model.h5', so the training method must have been run at least once before.
BERT_semantics().bert_tagging()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


ID --- ['ssn'] --- 100.0%
FirstName --- ['Name'] --- 96.0%
LastName --- ['Name'] --- 97.0%
Title --- ['RoleTitle'] --- 87.0%
Gen --- ['Gender'] --- 96.0%
Snumber --- ['ssn'] --- 97.0%
Eaddress --- ['Email'] --- 100.0%
Tnumber --- ['PhoneNumber'] --- 99.0%
Mnumber --- ['PhoneNumber'] --- 100.0%
Salary --- ['Latitude'] --- 95.0%
Address1_Line1 --- ['StreetAddress'] --- 95.0%
Address1_City --- ['City'] --- 92.0%
Address1_StateOrProvince --- ['State'] --- 99.0%
Address1_PostalCode --- ['Zipcode'] --- 84.0%
Make --- ['Manufacturer'] --- 96.0%
Latitude --- ['Latitude'] --- 91.0%
Longitutde --- ['Longitude'] --- 100.0%
CREATED_DATE --- ['Date'] --- 100.0%
IS_CREATED --- ['Name'] --- 100.0%


In [15]:
# Reload the saved classifier, mapping the serialized "TFBertModel" layer back
# to its class, then print the layer-by-layer summary.
model = tf.keras.models.load_model(
    'BERT_model.h5',
    custom_objects={"TFBertModel": transformers.TFBertModel},
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 20)]         0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 20)]         0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 20,                                            

In [12]:
# Load the raw training set; the file's own header row is replaced by the
# two column names given here.
df_temp = pd.read_csv(
    'dataset/train.csv',
    header=0,
    sep=";",
    names=['input', 'semantics'],
    encoding='utf-8',
)

In [21]:
SEED = 42        # fixed seed so the resample is identical after Restart & Run All
size = 20        # sample size drawn from each 'semantics' group
replace = True   # with replacement, so groups smaller than `size` can still be sampled
rng = np.random.default_rng(SEED)  # re-created on every run of this cell -> idempotent


def fn(obj):
    """Return `size` randomly chosen rows of one group's DataFrame."""
    picked = rng.choice(obj.index, size, replace)
    return obj.loc[picked, :]


# Draw the same number of rows from every semantic class
df_new = df_temp.groupby('semantics', as_index=False).apply(fn)

In [20]:
# (rows, columns) of the per-class resample.
# NOTE(review): the saved output below carries execution count 20, i.e. it was produced
# before the sampling cell was last run (count 21) — re-run this cell to refresh it.
df_new.shape

(1600, 2)

In [18]:
# (rows, columns) of the full training set, for comparison with the resampled frame.
df_temp.shape

(5088, 2)

In [None]:
# define the optimization algorithm
# (SGD alternative kept for reference)
#opt = SGD(learning_rate=0.01, momentum=0.9)
optimizer = tf.keras.optimizers.legacy.Adam(
    learning_rate=5e-05,  # this learning rate is for bert model, taken from huggingface website
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0)

# NOTE(review): `old_model`, `X_old` and `y_old` are not defined in any cell shown
# in this notebook — they must be created before this cell can run.

# compile the model with the Adam optimizer built above
old_model.compile(optimizer=optimizer, loss='binary_crossentropy')
# fit the model on old data
old_model.fit(X_old, y_old, epochs=150, batch_size=32, verbose=0)